diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 8e28049888..bdf7c07c6a 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -19,8 +19,8 @@ on: description: 'debug or release' required: true type: string - pg-versions: - description: 'a json array of postgres versions to run regression tests on' + test-cfg: + description: 'a json object of postgres versions and lfc states to run regression tests on' required: true type: string @@ -276,14 +276,14 @@ jobs: options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 strategy: fail-fast: false - matrix: - pg_version: ${{ fromJson(inputs.pg-versions) }} + matrix: ${{ fromJSON(format('{{"include":{0}}}', inputs.test-cfg)) }} steps: - uses: actions/checkout@v4 with: submodules: true - name: Pytest regression tests + continue-on-error: ${{ matrix.lfc_state == 'with-lfc' }} uses: ./.github/actions/run-python-test-set timeout-minutes: 60 with: @@ -300,6 +300,7 @@ jobs: CHECK_ONDISK_DATA_COMPATIBILITY: nonempty BUILD_TAG: ${{ inputs.build-tag }} PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring + USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }} # Temporary disable this step until we figure out why it's so flaky # Ref https://github.com/neondatabase/neon/issues/4540 diff --git a/.github/workflows/_create-release-pr.yml b/.github/workflows/_create-release-pr.yml new file mode 100644 index 0000000000..cc6994397f --- /dev/null +++ b/.github/workflows/_create-release-pr.yml @@ -0,0 +1,79 @@ +name: Create Release PR + +on: + workflow_call: + inputs: + component-name: + description: 'Component name' + required: true + type: string + release-branch: + description: 'Release branch' + required: true + type: string + secrets: + ci-access-token: + description: 'CI access token' + required: true + +defaults: + run: + shell: bash -euo pipefail {0} + +jobs: + create-storage-release-branch: + runs-on: ubuntu-22.04 + + permissions: + contents: write # for `git push` + + steps: + - uses: actions/checkout@v4 + with: + ref: main + + - name: Set variables + id: vars + env: + COMPONENT_NAME: ${{ inputs.component-name }} + RELEASE_BRANCH: ${{ inputs.release-branch }} + run: | + today=$(date +'%Y-%m-%d') + echo "title=${COMPONENT_NAME} release ${today}" | tee -a ${GITHUB_OUTPUT} + echo "rc-branch=rc/${RELEASE_BRANCH}/${today}" | tee -a ${GITHUB_OUTPUT} + + - name: Configure git + run: | + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + + - name: Create RC branch + env: + RC_BRANCH: ${{ steps.vars.outputs.rc-branch }} + TITLE: ${{ steps.vars.outputs.title }} + run: | + git checkout -b "${RC_BRANCH}" + + # create an empty commit to distinguish workflow runs + # from other possible releases from the same commit + git commit --allow-empty -m "${TITLE}" + + git push origin "${RC_BRANCH}" + + - name: Create a PR into ${{ inputs.release-branch }} + env: + GH_TOKEN: ${{ secrets.ci-access-token }} + RC_BRANCH: ${{ steps.vars.outputs.rc-branch }} + RELEASE_BRANCH: ${{ inputs.release-branch }} + TITLE: ${{ steps.vars.outputs.title }} + run: | + cat << EOF > body.md + ## ${TITLE} + + **Please merge this Pull Request using 'Create a merge commit' button** + EOF + + gh pr create --title "${TITLE}" \ + --body-file "body.md" \ + --head "${RC_BRANCH}" \ + --base "${RELEASE_BRANCH}" diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 0289f552f9..ea8fee80c2 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -158,7 +158,7 @@ jobs: if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@v1 with: - channel-id: "C033QLM5P7D" # dev-staging-stream + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | Periodic perf testing: ${{ job.status }} <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> @@ -506,7 +506,7 @@ jobs: if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@v1 with: - channel-id: "C033QLM5P7D" # dev-staging-stream + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | Periodic perf testing on ${{ matrix.platform }}: ${{ job.status }} <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> @@ -541,7 +541,7 @@ jobs: runs-on: ${{ matrix.RUNNER }} container: - image: neondatabase/build-tools:pinned + image: neondatabase/build-tools:pinned-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -558,12 +558,12 @@ jobs: arch=$(uname -m | sed 's/x86_64/amd64/g' | sed 's/aarch64/arm64/g') cd /home/nonroot - wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.1-1.pgdg110+1_${arch}.deb" - wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.5-1.pgdg110+1_${arch}.deb" - wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.5-1.pgdg110+1_${arch}.deb" - dpkg -x libpq5_17.1-1.pgdg110+1_${arch}.deb pg - dpkg -x postgresql-16_16.5-1.pgdg110+1_${arch}.deb pg - dpkg -x postgresql-client-16_16.5-1.pgdg110+1_${arch}.deb pg + wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.2-1.pgdg120+1_${arch}.deb" + wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb" + wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.6-1.pgdg120+1_${arch}.deb" + dpkg -x libpq5_17.2-1.pgdg120+1_${arch}.deb pg + dpkg -x postgresql-16_16.6-1.pgdg120+1_${arch}.deb pg + dpkg -x postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb pg mkdir -p /tmp/neon/pg_install/v16/bin ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench @@ -643,7 +643,7 @@ jobs: if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@v1 with: - channel-id: "C033QLM5P7D" # dev-staging-stream + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | Periodic perf testing on ${{ env.PLATFORM }}: ${{ job.status }} <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> @@ -759,7 +759,7 @@ jobs: if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@v1 with: - channel-id: "C033QLM5P7D" # dev-staging-stream + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | Periodic OLAP perf testing on ${{ matrix.platform }}: ${{ job.status }} <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> @@ -874,7 +874,7 @@ jobs: if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@v1 with: - channel-id: "C033QLM5P7D" # dev-staging-stream + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | Periodic TPC-H perf testing on ${{ matrix.platform }}: ${{ job.status }} <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> @@ -974,7 +974,7 @@ jobs: if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@v1 with: - channel-id: "C033QLM5P7D" # dev-staging-stream + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | Periodic TPC-H perf testing on ${{ matrix.platform }}: ${{ job.status }} <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 9e7be76901..93da86a353 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -117,7 +117,7 @@ jobs: - name: Create multi-arch image env: - DEFAULT_DEBIAN_VERSION: bullseye + DEFAULT_DEBIAN_VERSION: bookworm IMAGE_TAG: ${{ needs.check-image.outputs.tag }} run: | for debian_version in bullseye bookworm; do diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 89fd2d0d17..9830c2a0c9 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -253,7 +253,14 @@ jobs: build-tag: ${{ needs.tag.outputs.build-tag }} build-type: ${{ matrix.build-type }} # Run tests on all Postgres versions in release builds and only on the latest version in debug builds - pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16", "v17"]' || '["v17"]' }} + # run without LFC on v17 release only + test-cfg: | + ${{ matrix.build-type == 'release' && '[{"pg_version":"v14", "lfc_state": "without-lfc"}, + {"pg_version":"v15", "lfc_state": "without-lfc"}, + {"pg_version":"v16", "lfc_state": "without-lfc"}, + {"pg_version":"v17", "lfc_state": "without-lfc"}, + {"pg_version":"v17", "lfc_state": "with-lfc"}]' + || '[{"pg_version":"v17", "lfc_state": "without-lfc"}]' }} secrets: inherit # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index e827539c80..092831adb9 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -38,7 +38,7 @@ jobs: contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || github.ref_name == 'main' timeout-minutes: 90 - runs-on: macos-14 + runs-on: macos-15 env: # Use release build only, to have less debug info around @@ -52,7 +52,7 @@ jobs: submodules: true - name: Install macOS postgres dependencies - run: brew install flex bison openssl protobuf icu4c pkg-config + run: brew install flex bison openssl protobuf icu4c - name: Set pg 14 revision for caching id: pg_v14_rev diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index 615937b5a1..6b98bc873f 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -29,7 +29,7 @@ jobs: trigger_bench_on_ec2_machine_in_eu_central_1: runs-on: [ self-hosted, small ] container: - image: neondatabase/build-tools:pinned + image: neondatabase/build-tools:pinned-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -72,7 +72,7 @@ jobs: echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV fi - - name: Start Bench with run_id + - name: Start Bench with run_id run: | curl -k -X 'POST' \ "${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \ @@ -116,7 +116,7 @@ jobs: -H 'accept: application/gzip' \ -H "Authorization: Bearer $API_KEY" \ --output "test_log_${GITHUB_RUN_ID}.gz" - + - name: Unzip Test Log and Print it into this job's log if: always() && steps.poll_step.outputs.too_many_runs != 'true' run: | @@ -134,13 +134,13 @@ jobs: if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@v1 with: - channel-id: "C033QLM5P7D" # dev-staging-stream + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} - name: Cleanup Test Resources - if: always() + if: always() run: | curl -k -X 'POST' \ "${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \ diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index c196d07d3e..5b43d97de6 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -94,7 +94,7 @@ jobs: - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR env: - DEFAULT_DEBIAN_VERSION: bullseye + DEFAULT_DEBIAN_VERSION: bookworm run: | for debian_version in bullseye bookworm; do tags=() diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 56ef6f4bbb..11f010b6d4 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -26,82 +26,26 @@ defaults: jobs: create-storage-release-branch: if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }} - runs-on: ubuntu-22.04 permissions: - contents: write # for `git push` + contents: write - steps: - - name: Check out code - uses: actions/checkout@v4 - with: - ref: main - - - name: Set environment variables - run: | - echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV - echo "RELEASE_BRANCH=rc/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV - - - name: Create release branch - run: git checkout -b $RELEASE_BRANCH - - - name: Push new branch - run: git push origin $RELEASE_BRANCH - - - name: Create pull request into release - env: - GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} - run: | - TITLE="Storage & Compute release ${RELEASE_DATE}" - - cat << EOF > body.md - ## ${TITLE} - - **Please merge this Pull Request using 'Create a merge commit' button** - EOF - - gh pr create --title "${TITLE}" \ - --body-file "body.md" \ - --head "${RELEASE_BRANCH}" \ - --base "release" + uses: ./.github/workflows/_create-release-pr.yml + with: + component-name: 'Storage & Compute' + release-branch: 'release' + secrets: + ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} create-proxy-release-branch: if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }} - runs-on: ubuntu-22.04 permissions: - contents: write # for `git push` + contents: write - steps: - - name: Check out code - uses: actions/checkout@v4 - with: - ref: main - - - name: Set environment variables - run: | - echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV - echo "RELEASE_BRANCH=rc/proxy/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV - - - name: Create release branch - run: git checkout -b $RELEASE_BRANCH - - - name: Push new branch - run: git push origin $RELEASE_BRANCH - - - name: Create pull request into release - env: - GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} - run: | - TITLE="Proxy release ${RELEASE_DATE}" - - cat << EOF > body.md - ## ${TITLE} - - **Please merge this Pull Request using 'Create a merge commit' button** - EOF - - gh pr create --title "${TITLE}" \ - --body-file "body.md" \ - --head "${RELEASE_BRANCH}" \ - --base "release-proxy" + uses: ./.github/workflows/_create-release-pr.yml + with: + component-name: 'Proxy' + release-branch: 'release-proxy' + secrets: + ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} diff --git a/.github/workflows/report-workflow-stats-batch.yml b/.github/workflows/report-workflow-stats-batch.yml index 98e394a3c2..2ed044b780 100644 --- a/.github/workflows/report-workflow-stats-batch.yml +++ b/.github/workflows/report-workflow-stats-batch.yml @@ -4,10 +4,12 @@ on: schedule: - cron: '*/15 * * * *' - cron: '25 0 * * *' + - cron: '25 1 * * 6' jobs: - gh-workflow-stats-batch: - name: GitHub Workflow Stats Batch + gh-workflow-stats-batch-2h: + name: GitHub Workflow Stats Batch 2 hours + if: github.event.schedule == '*/15 * * * *' runs-on: ubuntu-22.04 permissions: actions: read @@ -16,14 +18,36 @@ jobs: uses: neondatabase/gh-workflow-stats-action@v0.2.1 with: db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} - db_table: "gh_workflow_stats_batch_neon" + db_table: "gh_workflow_stats_neon" gh_token: ${{ secrets.GITHUB_TOKEN }} duration: '2h' - - name: Export Workflow Run for the past 24 hours - if: github.event.schedule == '25 0 * * *' + + gh-workflow-stats-batch-48h: + name: GitHub Workflow Stats Batch 48 hours + if: github.event.schedule == '25 0 * * *' + runs-on: ubuntu-22.04 + permissions: + actions: read + steps: + - name: Export Workflow Run for the past 48 hours uses: neondatabase/gh-workflow-stats-action@v0.2.1 with: db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} - db_table: "gh_workflow_stats_batch_neon" + db_table: "gh_workflow_stats_neon" gh_token: ${{ secrets.GITHUB_TOKEN }} - duration: '24h' + duration: '48h' + + gh-workflow-stats-batch-30d: + name: GitHub Workflow Stats Batch 30 days + if: github.event.schedule == '25 1 * * 6' + runs-on: ubuntu-22.04 + permissions: + actions: read + steps: + - name: Export Workflow Run for the past 30 days + uses: neondatabase/gh-workflow-stats-action@v0.2.1 + with: + db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} + db_table: "gh_workflow_stats_neon" + gh_token: ${{ secrets.GITHUB_TOKEN }} + duration: '720h' diff --git a/.github/workflows/report-workflow-stats.yml b/.github/workflows/report-workflow-stats.yml deleted file mode 100644 index 15e446bcd7..0000000000 --- a/.github/workflows/report-workflow-stats.yml +++ /dev/null @@ -1,41 +0,0 @@ -name: Report Workflow Stats - -on: - workflow_run: - workflows: - - Add `external` label to issues and PRs created by external users - - Benchmarking - - Build and Test - - Build and Test Locally - - Build build-tools image - - Check Permissions - - Check neon with extra platform builds - - Cloud Regression Test - - Create Release Branch - - Handle `approved-for-ci-run` label - - Lint GitHub Workflows - - Notify Slack channel about upcoming release - - Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region - - Pin build-tools image - - Prepare benchmarking databases by restoring dumps - - Push images to ACR - - Test Postgres client libraries - - Trigger E2E Tests - - cleanup caches by a branch - - Pre-merge checks - types: [completed] - -jobs: - gh-workflow-stats: - name: Github Workflow Stats - runs-on: ubuntu-22.04 - permissions: - actions: read - steps: - - name: Export GH Workflow Stats - uses: neondatabase/gh-workflow-stats-action@v0.1.4 - with: - DB_URI: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} - DB_TABLE: "gh_workflow_stats_neon" - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - GH_RUN_ID: ${{ github.event.workflow_run.id }} diff --git a/CODEOWNERS b/CODEOWNERS index f8ed4be816..21b0e7c51f 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,6 +1,5 @@ +/.github/ @neondatabase/developer-productivity /compute_tools/ @neondatabase/control-plane @neondatabase/compute -/storage_controller @neondatabase/storage -/storage_scrubber @neondatabase/storage /libs/pageserver_api/ @neondatabase/storage /libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage /libs/remote_storage/ @neondatabase/storage @@ -11,4 +10,6 @@ /pgxn/neon/ @neondatabase/compute @neondatabase/storage /proxy/ @neondatabase/proxy /safekeeper/ @neondatabase/storage +/storage_controller @neondatabase/storage +/storage_scrubber @neondatabase/storage /vendor/ @neondatabase/compute diff --git a/Cargo.lock b/Cargo.lock index c7af140f7d..c1a14210de 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -46,6 +46,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "aligned-vec" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e0966165eaf052580bd70eb1b32cb3d6245774c0104d1b2793e9650bf83b52a" +dependencies = [ + "equator", +] + [[package]] name = "allocator-api2" version = "0.2.16" @@ -146,6 +155,12 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + [[package]] name = "asn1-rs" version = "0.6.2" @@ -359,6 +374,28 @@ dependencies = [ "tracing", ] +[[package]] +name = "aws-sdk-kms" +version = "1.47.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "564a597a3c71a957d60a2e4c62c93d78ee5a0d636531e15b760acad983a5c18e" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "http 0.2.9", + "once_cell", + "regex-lite", + "tracing", +] + [[package]] name = "aws-sdk-s3" version = "1.52.0" @@ -575,9 +612,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.7.1" +version = "1.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1ce695746394772e7000b39fe073095db6d45a862d0767dd5ad0ac0d7f8eb87" +checksum = "a065c0fe6fdbdf9f11817eb68582b2ab4aff9e9c39e986ae48f7ec576c6322db" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -742,7 +779,7 @@ dependencies = [ "once_cell", "paste", "pin-project", - "quick-xml", + "quick-xml 0.31.0", "rand 0.8.5", "reqwest 0.11.19", "rustc_version", @@ -1220,6 +1257,10 @@ name = "compute_tools" version = "0.1.0" dependencies = [ "anyhow", + "aws-config", + "aws-sdk-kms", + "aws-sdk-s3", + "base64 0.13.1", "bytes", "camino", "cfg-if", @@ -1237,13 +1278,16 @@ dependencies = [ "opentelemetry", "opentelemetry_sdk", "postgres", + "postgres_initdb", "prometheus", "regex", "remote_storage", "reqwest 0.12.4", "rlimit", "rust-ini", + "serde", "serde_json", + "serde_with", "signal-hook", "tar", "thiserror", @@ -1381,6 +1425,15 @@ version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" +[[package]] +name = "cpp_demangle" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96e58d342ad113c2b878f16d5d034c03be492ae460cdbc02b7f0f2284d310c7d" +dependencies = [ + "cfg-if", +] + [[package]] name = "cpufeatures" version = "0.2.9" @@ -1904,6 +1957,26 @@ dependencies = [ "termcolor", ] +[[package]] +name = "equator" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c35da53b5a021d2484a7cc49b2ac7f2d840f8236a286f84202369bd338d761ea" +dependencies = [ + "equator-macro", +] + +[[package]] +name = "equator-macro" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bf679796c0322556351f287a51b49e48f7c4986e727b5dd78c972d30e2e16cc" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.52", +] + [[package]] name = "equivalent" version = "1.0.1" @@ -2011,6 +2084,18 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "findshlibs" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40b9e59cd0f7e0806cca4be089683ecb6434e602038df21fe6bf6711b2f07f64" +dependencies = [ + "cc", + "lazy_static", + "libc", + "winapi", +] + [[package]] name = "fixedbitset" version = "0.4.2" @@ -2089,9 +2174,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" dependencies = [ "futures-core", "futures-sink", @@ -2099,9 +2184,9 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" [[package]] name = "futures-executor" @@ -2116,9 +2201,9 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" [[package]] name = "futures-lite" @@ -2137,9 +2222,9 @@ dependencies = [ [[package]] name = "futures-macro" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", @@ -2148,15 +2233,15 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" [[package]] name = "futures-task" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" [[package]] name = "futures-timer" @@ -2166,9 +2251,9 @@ checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" [[package]] name = "futures-util" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" dependencies = [ "futures-channel", "futures-core", @@ -2714,6 +2799,24 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "64e9829a50b42bb782c1df523f78d332fe371b10c661e78b7a3c34b0198e9fac" +[[package]] +name = "inferno" +version = "0.11.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "232929e1d75fe899576a3d5c7416ad0d88dbfbb3c3d6aa00873a7408a50ddb88" +dependencies = [ + "ahash", + "indexmap 2.0.1", + "is-terminal", + "itoa", + "log", + "num-format", + "once_cell", + "quick-xml 0.26.0", + "rgb", + "str_stack", +] + [[package]] name = "inotify" version = "0.9.6" @@ -2764,9 +2867,9 @@ dependencies = [ [[package]] name = "ipnet" -version = "2.9.0" +version = "2.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" +checksum = "ddc24109865250148c2e0f3d25d4f0f479571723792d3802153c60922a4fb708" [[package]] name = "is-terminal" @@ -3053,6 +3156,15 @@ version = "2.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" +[[package]] +name = "memmap2" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45fd3a57831bf88bc63f8cebc0cf956116276e97fef3966103e96416209f7c92" +dependencies = [ + "libc", +] + [[package]] name = "memoffset" version = "0.7.1" @@ -3278,6 +3390,16 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" +[[package]] +name = "num-format" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a652d9771a63711fd3c3deb670acfbe5c30a4072e664d7a3bf5a9e1056ac72c3" +dependencies = [ + "arrayvec", + "itoa", +] + [[package]] name = "num-integer" version = "0.1.45" @@ -3619,6 +3741,7 @@ dependencies = [ "num_cpus", "once_cell", "pageserver_api", + "pageserver_client", "pageserver_compaction", "pin-project-lite", "postgres", @@ -3627,6 +3750,7 @@ dependencies = [ "postgres_backend", "postgres_connection", "postgres_ffi", + "postgres_initdb", "pq_proto", "procfs", "rand 0.8.5", @@ -4009,7 +4133,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#2a2a7c56930dd5ad60676ce6da92e1cbe6fb3ef5" dependencies = [ "bytes", "fallible-iterator", @@ -4022,7 +4146,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#2a2a7c56930dd5ad60676ce6da92e1cbe6fb3ef5" dependencies = [ "base64 0.20.0", "byteorder", @@ -4041,7 +4165,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#2a2a7c56930dd5ad60676ce6da92e1cbe6fb3ef5" dependencies = [ "bytes", "fallible-iterator", @@ -4058,7 +4182,7 @@ dependencies = [ "bytes", "once_cell", "pq_proto", - "rustls 0.23.16", + "rustls 0.23.18", "rustls-pemfile 2.1.1", "serde", "thiserror", @@ -4102,12 +4226,48 @@ dependencies = [ "utils", ] +[[package]] +name = "postgres_initdb" +version = "0.1.0" +dependencies = [ + "anyhow", + "camino", + "thiserror", + "tokio", + "workspace_hack", +] + [[package]] name = "powerfmt" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" +[[package]] +name = "pprof" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebbe2f8898beba44815fdc9e5a4ae9c929e21c5dc29b0c774a15555f7f58d6d0" +dependencies = [ + "aligned-vec", + "backtrace", + "cfg-if", + "criterion", + "findshlibs", + "inferno", + "libc", + "log", + "nix 0.26.4", + "once_cell", + "parking_lot 0.12.1", + "protobuf", + "protobuf-codegen-pure", + "smallvec", + "symbolic-demangle", + "tempfile", + "thiserror", +] + [[package]] name = "ppv-lite86" version = "0.2.17" @@ -4260,6 +4420,31 @@ dependencies = [ "prost", ] +[[package]] +name = "protobuf" +version = "2.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94" + +[[package]] +name = "protobuf-codegen" +version = "2.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "033460afb75cf755fcfc16dfaed20b86468082a2ea24e05ac35ab4a099a017d6" +dependencies = [ + "protobuf", +] + +[[package]] +name = "protobuf-codegen-pure" +version = "2.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95a29399fc94bcd3eeaa951c715f7bea69409b2445356b00519740bcd6ddd865" +dependencies = [ + "protobuf", + "protobuf-codegen", +] + [[package]] name = "proxy" version = "0.1.0" @@ -4333,7 +4518,7 @@ dependencies = [ "rsa", "rstest", "rustc-hash", - "rustls 0.23.16", + "rustls 0.23.18", "rustls-native-certs 0.8.0", "rustls-pemfile 2.1.1", "scopeguard", @@ -4371,6 +4556,15 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "quick-xml" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f50b1c63b38611e7d4d7f68b82d3ad0cc71a2ad2e7f61fc10f1328d917c93cd" +dependencies = [ + "memchr", +] + [[package]] name = "quick-xml" version = "0.31.0" @@ -4853,6 +5047,15 @@ dependencies = [ "subtle", ] +[[package]] +name = "rgb" +version = "0.8.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57397d16646700483b67d2dd6511d79318f9d057fdbd21a4066aeac8b41d310a" +dependencies = [ + "bytemuck", +] + [[package]] name = "ring" version = "0.17.6" @@ -5028,9 +5231,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.16" +version = "0.23.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eee87ff5d9b36712a58574e12e9f0ea80f915a5b0ac518d322b24a465617925e" +checksum = "9c9cc1d47e243d655ace55ed38201c19ae02c148ae56412ab8750e8f0166ab7f" dependencies = [ "log", "once_cell", @@ -5161,11 +5364,13 @@ dependencies = [ "itertools 0.10.5", "metrics", "once_cell", + "pageserver_api", "parking_lot 0.12.1", "postgres", "postgres-protocol", "postgres_backend", "postgres_ffi", + "pprof", "pq_proto", "rand 0.8.5", "regex", @@ -5191,6 +5396,7 @@ dependencies = [ "tracing-subscriber", "url", "utils", + "wal_decoder", "walproposer", "workspace_hack", ] @@ -5712,6 +5918,12 @@ dependencies = [ "der 0.7.8", ] +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + [[package]] name = "static_assertions" version = "1.1.0" @@ -5738,7 +5950,7 @@ dependencies = [ "once_cell", "parking_lot 0.12.1", "prost", - "rustls 0.23.16", + "rustls 0.23.18", "tokio", "tonic", "tonic-build", @@ -5821,7 +6033,7 @@ dependencies = [ "postgres_ffi", "remote_storage", "reqwest 0.12.4", - "rustls 0.23.16", + "rustls 0.23.18", "rustls-native-certs 0.8.0", "serde", "serde_json", @@ -5858,6 +6070,12 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "str_stack" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9091b6114800a5f2141aee1d1b9d6ca3592ac062dc5decb3764ec5895a47b4eb" + [[package]] name = "stringprep" version = "0.1.2" @@ -5905,6 +6123,29 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "20e16a0f46cf5fd675563ef54f26e83e20f2366bcf027bcb3cc3ed2b98aaf2ca" +[[package]] +name = "symbolic-common" +version = "12.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "366f1b4c6baf6cfefc234bbd4899535fca0b06c74443039a73f6dfb2fad88d77" +dependencies = [ + "debugid", + "memmap2", + "stable_deref_trait", + "uuid", +] + +[[package]] +name = "symbolic-demangle" +version = "12.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aba05ba5b9962ea5617baf556293720a8b2d0a282aa14ee4bf10e22efc7da8c8" +dependencies = [ + "cpp_demangle", + "rustc-demangle", + "symbolic-common", +] + [[package]] name = "syn" version = "1.0.109" @@ -6227,7 +6468,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#2a2a7c56930dd5ad60676ce6da92e1cbe6fb3ef5" dependencies = [ "async-trait", "byteorder", @@ -6254,7 +6495,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04fb792ccd6bbcd4bba408eb8a292f70fc4a3589e5d793626f45190e6454b6ab" dependencies = [ "ring", - "rustls 0.23.16", + "rustls 0.23.18", "tokio", "tokio-postgres", "tokio-rustls 0.26.0", @@ -6288,7 +6529,7 @@ version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" dependencies = [ - "rustls 0.23.16", + "rustls 0.23.18", "rustls-pki-types", "tokio", ] @@ -6697,7 +6938,7 @@ dependencies = [ "base64 0.22.1", "log", "once_cell", - "rustls 0.23.16", + "rustls 0.23.18", "rustls-pki-types", "url", "webpki-roots 0.26.1", @@ -6772,6 +7013,7 @@ dependencies = [ "once_cell", "pin-project-lite", "postgres_connection", + "pprof", "pq_proto", "rand 0.8.5", "regex", @@ -6781,6 +7023,7 @@ dependencies = [ "serde_assert", "serde_json", "serde_path_to_error", + "serde_with", "signal-hook", "strum", "strum_macros", @@ -7306,6 +7549,7 @@ dependencies = [ "anyhow", "axum", "axum-core", + "base64 0.13.1", "base64 0.21.1", "base64ct", "bytes", @@ -7340,6 +7584,7 @@ dependencies = [ "libc", "log", "memchr", + "nix 0.26.4", "nom", "num-bigint", "num-integer", @@ -7356,7 +7601,7 @@ dependencies = [ "regex-automata 0.4.3", "regex-syntax 0.8.2", "reqwest 0.12.4", - "rustls 0.23.16", + "rustls 0.23.18", "scopeguard", "serde", "serde_json", diff --git a/Cargo.toml b/Cargo.toml index dbda930535..e3dc5b97f8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,6 +34,7 @@ members = [ "libs/vm_monitor", "libs/walproposer", "libs/wal_decoder", + "libs/postgres_initdb", ] [workspace.package] @@ -57,6 +58,7 @@ async-trait = "0.1" aws-config = { version = "1.5", default-features = false, features=["rustls", "sso"] } aws-sdk-s3 = "1.52" aws-sdk-iam = "1.46.0" +aws-sdk-kms = "1.47.0" aws-smithy-async = { version = "1.2.1", default-features = false, features=["rt-tokio"] } aws-smithy-types = "1.2" aws-credential-types = "1.2.0" @@ -73,7 +75,7 @@ bytes = "1.0" camino = "1.1.6" cfg-if = "1.0.0" chrono = { version = "0.4", default-features = false, features = ["clock"] } -clap = { version = "4.0", features = ["derive"] } +clap = { version = "4.0", features = ["derive", "env"] } comfy-table = "7.1" const_format = "0.2" crc32c = "0.6" @@ -106,7 +108,7 @@ hyper-util = "0.1" tokio-tungstenite = "0.21.0" indexmap = "2" indoc = "2" -ipnet = "2.9.0" +ipnet = "2.10.0" itertools = "0.10" itoa = "1.0.11" jsonwebtoken = "9" @@ -130,6 +132,7 @@ parquet = { version = "53", default-features = false, features = ["zstd"] } parquet_derive = "53" pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } pin-project-lite = "0.2" +pprof = { version = "0.14", features = ["criterion", "flamegraph", "protobuf", "protobuf-codec"] } procfs = "0.16" prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency prost = "0.13" @@ -153,7 +156,7 @@ sentry = { version = "0.32", default-features = false, features = ["backtrace", serde = { version = "1.0", features = ["derive"] } serde_json = "1" serde_path_to_error = "0.1" -serde_with = "2.0" +serde_with = { version = "2.0", features = [ "base64" ] } serde_assert = "0.5.0" sha2 = "0.10.2" signal-hook = "0.3" @@ -212,12 +215,14 @@ tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", br compute_api = { version = "0.1", path = "./libs/compute_api/" } consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } metrics = { version = "0.1", path = "./libs/metrics/" } +pageserver = { path = "./pageserver" } pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" } pageserver_client = { path = "./pageserver/client" } pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" } postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" } postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" } postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" } +postgres_initdb = { path = "./libs/postgres_initdb" } pq_proto = { version = "0.1", path = "./libs/pq_proto/" } remote_storage = { version = "0.1", path = "./libs/remote_storage/" } safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" } diff --git a/Dockerfile b/Dockerfile index 785dd4598e..e888efbae2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,7 +7,7 @@ ARG IMAGE=build-tools ARG TAG=pinned ARG DEFAULT_PG_VERSION=17 ARG STABLE_PG_VERSION=16 -ARG DEBIAN_VERSION=bullseye +ARG DEBIAN_VERSION=bookworm ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim # Build Postgres diff --git a/Makefile b/Makefile index 8e3b755112..dc67b87239 100644 --- a/Makefile +++ b/Makefile @@ -38,6 +38,7 @@ ifeq ($(UNAME_S),Linux) # Seccomp BPF is only available for Linux PG_CONFIGURE_OPTS += --with-libseccomp else ifeq ($(UNAME_S),Darwin) + PG_CFLAGS += -DUSE_PREFETCH ifndef DISABLE_HOMEBREW # macOS with brew-installed openssl requires explicit paths # It can be configured with OPENSSL_PREFIX variable diff --git a/README.md b/README.md index e68ef70bdf..1417d6b9e7 100644 --- a/README.md +++ b/README.md @@ -132,7 +132,7 @@ make -j`sysctl -n hw.logicalcpu` -s To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively. To run the integration tests or Python scripts (not required to use the code), install -Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.8](https://python-poetry.org/)) in the project directory. +Python (3.11 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.8](https://python-poetry.org/)) in the project directory. #### Running neon database diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index c1190b13f4..4f491afec5 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -1,4 +1,4 @@ -ARG DEBIAN_VERSION=bullseye +ARG DEBIAN_VERSION=bookworm FROM debian:bookworm-slim AS pgcopydb_builder ARG DEBIAN_VERSION @@ -234,7 +234,7 @@ USER nonroot:nonroot WORKDIR /home/nonroot # Python -ENV PYTHON_VERSION=3.9.19 \ +ENV PYTHON_VERSION=3.11.10 \ PYENV_ROOT=/home/nonroot/.pyenv \ PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH RUN set -e \ diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 32405ece86..2fcd9985bc 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -3,7 +3,7 @@ ARG REPOSITORY=neondatabase ARG IMAGE=build-tools ARG TAG=pinned ARG BUILD_TAG -ARG DEBIAN_VERSION=bullseye +ARG DEBIAN_VERSION=bookworm ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim ######################################################################################### @@ -1243,7 +1243,7 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ ######################################################################################### # -# Compile and run the Neon-specific `compute_ctl` binary +# Compile and run the Neon-specific `compute_ctl` and `fast_import` binaries # ######################################################################################### FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools @@ -1264,6 +1264,7 @@ RUN cd compute_tools && mold -run cargo build --locked --profile release-line-de FROM debian:$DEBIAN_FLAVOR AS compute-tools-image COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl +COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/fast_import /usr/local/bin/fast_import ######################################################################################### # @@ -1458,6 +1459,7 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl +COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/fast_import /usr/local/bin/fast_import # pgbouncer and its config COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer @@ -1533,6 +1535,25 @@ RUN apt update && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 +# s5cmd 2.2.2 from https://github.com/peak/s5cmd/releases/tag/v2.2.2 +# used by fast_import +ARG TARGETARCH +ADD https://github.com/peak/s5cmd/releases/download/v2.2.2/s5cmd_2.2.2_linux_$TARGETARCH.deb /tmp/s5cmd.deb +RUN set -ex; \ + \ + # Determine the expected checksum based on TARGETARCH + if [ "${TARGETARCH}" = "amd64" ]; then \ + CHECKSUM="392c385320cd5ffa435759a95af77c215553d967e4b1c0fffe52e4f14c29cf85"; \ + elif [ "${TARGETARCH}" = "arm64" ]; then \ + CHECKSUM="939bee3cf4b5604ddb00e67f8c157b91d7c7a5b553d1fbb6890fad32894b7b46"; \ + else \ + echo "Unsupported architecture: ${TARGETARCH}"; exit 1; \ + fi; \ + \ + # Compute and validate the checksum + echo "${CHECKSUM} /tmp/s5cmd.deb" | sha256sum -c - +RUN dpkg -i /tmp/s5cmd.deb && rm /tmp/s5cmd.deb + ENV LANG=en_US.utf8 USER postgres ENTRYPOINT ["/usr/local/bin/compute_ctl"] diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 0bf4ed53d6..c0c390caef 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -10,6 +10,10 @@ default = [] testing = [] [dependencies] +base64.workspace = true +aws-config.workspace = true +aws-sdk-s3.workspace = true +aws-sdk-kms.workspace = true anyhow.workspace = true camino.workspace = true chrono.workspace = true @@ -27,6 +31,8 @@ opentelemetry.workspace = true opentelemetry_sdk.workspace = true postgres.workspace = true regex.workspace = true +serde.workspace = true +serde_with.workspace = true serde_json.workspace = true signal-hook.workspace = true tar.workspace = true @@ -43,6 +49,7 @@ thiserror.workspace = true url.workspace = true prometheus.workspace = true +postgres_initdb.workspace = true compute_api.workspace = true utils.workspace = true workspace_hack.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 284db005c8..4689cc2b83 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -105,6 +105,11 @@ fn main() -> Result<()> { fn init() -> Result<(String, clap::ArgMatches)> { init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; + opentelemetry::global::set_error_handler(|err| { + tracing::info!("OpenTelemetry error: {err}"); + }) + .expect("global error handler lock poisoned"); + let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; thread::spawn(move || { for sig in signals.forever() { diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs new file mode 100644 index 0000000000..3b0b990df2 --- /dev/null +++ b/compute_tools/src/bin/fast_import.rs @@ -0,0 +1,338 @@ +//! This program dumps a remote Postgres database into a local Postgres database +//! and uploads the resulting PGDATA into object storage for import into a Timeline. +//! +//! # Context, Architecture, Design +//! +//! See cloud.git Fast Imports RFC () +//! for the full picture. +//! The RFC describing the storage pieces of importing the PGDATA dump into a Timeline +//! is publicly accessible at . +//! +//! # This is a Prototype! +//! +//! This program is part of a prototype feature and not yet used in production. +//! +//! The cloud.git RFC contains lots of suggestions for improving e2e throughput +//! of this step of the timeline import process. +//! +//! # Local Testing +//! +//! - Comment out most of the pgxns in The Dockerfile.compute-tools to speed up the build. +//! - Build the image with the following command: +//! +//! ```bash +//! docker buildx build --build-arg DEBIAN_FLAVOR=bullseye-slim --build-arg GIT_VERSION=local --build-arg PG_VERSION=v14 --build-arg BUILD_TAG="$(date --iso-8601=s -u)" -t localhost:3030/localregistry/compute-node-v14:latest -f compute/Dockerfile.com +//! docker push localhost:3030/localregistry/compute-node-v14:latest +//! ``` + +use anyhow::Context; +use aws_config::BehaviorVersion; +use camino::{Utf8Path, Utf8PathBuf}; +use clap::Parser; +use nix::unistd::Pid; +use tracing::{info, info_span, warn, Instrument}; +use utils::fs_ext::is_directory_empty; + +#[path = "fast_import/child_stdio_to_log.rs"] +mod child_stdio_to_log; +#[path = "fast_import/s3_uri.rs"] +mod s3_uri; +#[path = "fast_import/s5cmd.rs"] +mod s5cmd; + +#[derive(clap::Parser)] +struct Args { + #[clap(long)] + working_directory: Utf8PathBuf, + #[clap(long, env = "NEON_IMPORTER_S3_PREFIX")] + s3_prefix: s3_uri::S3Uri, + #[clap(long)] + pg_bin_dir: Utf8PathBuf, + #[clap(long)] + pg_lib_dir: Utf8PathBuf, +} + +#[serde_with::serde_as] +#[derive(serde::Deserialize)] +struct Spec { + encryption_secret: EncryptionSecret, + #[serde_as(as = "serde_with::base64::Base64")] + source_connstring_ciphertext_base64: Vec, +} + +#[derive(serde::Deserialize)] +enum EncryptionSecret { + #[allow(clippy::upper_case_acronyms)] + KMS { key_id: String }, +} + +#[tokio::main] +pub(crate) async fn main() -> anyhow::Result<()> { + utils::logging::init( + utils::logging::LogFormat::Plain, + utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, + utils::logging::Output::Stdout, + )?; + + info!("starting"); + + let Args { + working_directory, + s3_prefix, + pg_bin_dir, + pg_lib_dir, + } = Args::parse(); + + let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; + + let spec: Spec = { + let spec_key = s3_prefix.append("/spec.json"); + let s3_client = aws_sdk_s3::Client::new(&aws_config); + let object = s3_client + .get_object() + .bucket(&spec_key.bucket) + .key(spec_key.key) + .send() + .await + .context("get spec from s3")? + .body + .collect() + .await + .context("download spec body")?; + serde_json::from_slice(&object.into_bytes()).context("parse spec as json")? + }; + + match tokio::fs::create_dir(&working_directory).await { + Ok(()) => {} + Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { + if !is_directory_empty(&working_directory) + .await + .context("check if working directory is empty")? + { + anyhow::bail!("working directory is not empty"); + } else { + // ok + } + } + Err(e) => return Err(anyhow::Error::new(e).context("create working directory")), + } + + let pgdata_dir = working_directory.join("pgdata"); + tokio::fs::create_dir(&pgdata_dir) + .await + .context("create pgdata directory")?; + + // + // Setup clients + // + let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; + let kms_client = aws_sdk_kms::Client::new(&aws_config); + + // + // Initialize pgdata + // + let superuser = "cloud_admin"; // XXX: this shouldn't be hard-coded + postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs { + superuser, + locale: "en_US.UTF-8", // XXX: this shouldn't be hard-coded, + pg_version: 140000, // XXX: this shouldn't be hard-coded but derived from which compute image we're running in + initdb_bin: pg_bin_dir.join("initdb").as_ref(), + library_search_path: &pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local. + pgdata: &pgdata_dir, + }) + .await + .context("initdb")?; + + let nproc = num_cpus::get(); + + // + // Launch postgres process + // + let mut postgres_proc = tokio::process::Command::new(pg_bin_dir.join("postgres")) + .arg("-D") + .arg(&pgdata_dir) + .args(["-c", "wal_level=minimal"]) + .args(["-c", "shared_buffers=10GB"]) + .args(["-c", "max_wal_senders=0"]) + .args(["-c", "fsync=off"]) + .args(["-c", "full_page_writes=off"]) + .args(["-c", "synchronous_commit=off"]) + .args(["-c", "maintenance_work_mem=8388608"]) + .args(["-c", &format!("max_parallel_maintenance_workers={nproc}")]) + .args(["-c", &format!("max_parallel_workers={nproc}")]) + .args(["-c", &format!("max_parallel_workers_per_gather={nproc}")]) + .args(["-c", &format!("max_worker_processes={nproc}")]) + .args(["-c", "effective_io_concurrency=100"]) + .env_clear() + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn() + .context("spawn postgres")?; + + info!("spawned postgres, waiting for it to become ready"); + tokio::spawn( + child_stdio_to_log::relay_process_output( + postgres_proc.stdout.take(), + postgres_proc.stderr.take(), + ) + .instrument(info_span!("postgres")), + ); + let restore_pg_connstring = + format!("host=localhost port=5432 user={superuser} dbname=postgres"); + loop { + let res = tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await; + if res.is_ok() { + info!("postgres is ready, could connect to it"); + break; + } + } + + // + // Decrypt connection string + // + let source_connection_string = { + match spec.encryption_secret { + EncryptionSecret::KMS { key_id } => { + let mut output = kms_client + .decrypt() + .key_id(key_id) + .ciphertext_blob(aws_sdk_s3::primitives::Blob::new( + spec.source_connstring_ciphertext_base64, + )) + .send() + .await + .context("decrypt source connection string")?; + let plaintext = output + .plaintext + .take() + .context("get plaintext source connection string")?; + String::from_utf8(plaintext.into_inner()) + .context("parse source connection string as utf8")? + } + } + }; + + // + // Start the work + // + + let dumpdir = working_directory.join("dumpdir"); + + let common_args = [ + // schema mapping (prob suffices to specify them on one side) + "--no-owner".to_string(), + "--no-privileges".to_string(), + "--no-publications".to_string(), + "--no-security-labels".to_string(), + "--no-subscriptions".to_string(), + "--no-tablespaces".to_string(), + // format + "--format".to_string(), + "directory".to_string(), + // concurrency + "--jobs".to_string(), + num_cpus::get().to_string(), + // progress updates + "--verbose".to_string(), + ]; + + info!("dump into the working directory"); + { + let mut pg_dump = tokio::process::Command::new(pg_bin_dir.join("pg_dump")) + .args(&common_args) + .arg("-f") + .arg(&dumpdir) + .arg("--no-sync") + // POSITIONAL args + // source db (db name included in connection string) + .arg(&source_connection_string) + // how we run it + .env_clear() + .kill_on_drop(true) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn() + .context("spawn pg_dump")?; + + info!(pid=%pg_dump.id().unwrap(), "spawned pg_dump"); + + tokio::spawn( + child_stdio_to_log::relay_process_output(pg_dump.stdout.take(), pg_dump.stderr.take()) + .instrument(info_span!("pg_dump")), + ); + + let st = pg_dump.wait().await.context("wait for pg_dump")?; + info!(status=?st, "pg_dump exited"); + if !st.success() { + warn!(status=%st, "pg_dump failed, restore will likely fail as well"); + } + } + + // TODO: do it in a streaming way, plenty of internal research done on this already + // TODO: do the unlogged table trick + + info!("restore from working directory into vanilla postgres"); + { + let mut pg_restore = tokio::process::Command::new(pg_bin_dir.join("pg_restore")) + .args(&common_args) + .arg("-d") + .arg(&restore_pg_connstring) + // POSITIONAL args + .arg(&dumpdir) + // how we run it + .env_clear() + .kill_on_drop(true) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn() + .context("spawn pg_restore")?; + + info!(pid=%pg_restore.id().unwrap(), "spawned pg_restore"); + tokio::spawn( + child_stdio_to_log::relay_process_output( + pg_restore.stdout.take(), + pg_restore.stderr.take(), + ) + .instrument(info_span!("pg_restore")), + ); + let st = pg_restore.wait().await.context("wait for pg_restore")?; + info!(status=?st, "pg_restore exited"); + if !st.success() { + warn!(status=%st, "pg_restore failed, restore will likely fail as well"); + } + } + + info!("shutdown postgres"); + { + nix::sys::signal::kill( + Pid::from_raw( + i32::try_from(postgres_proc.id().unwrap()).expect("convert child pid to i32"), + ), + nix::sys::signal::SIGTERM, + ) + .context("signal postgres to shut down")?; + postgres_proc + .wait() + .await + .context("wait for postgres to shut down")?; + } + + info!("upload pgdata"); + s5cmd::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/")) + .await + .context("sync dump directory to destination")?; + + info!("write status"); + { + let status_dir = working_directory.join("status"); + std::fs::create_dir(&status_dir).context("create status directory")?; + let status_file = status_dir.join("status"); + std::fs::write(&status_file, serde_json::json!({"done": true}).to_string()) + .context("write status file")?; + s5cmd::sync(&status_file, &s3_prefix.append("/status/pgdata")) + .await + .context("sync status directory to destination")?; + } + + Ok(()) +} diff --git a/compute_tools/src/bin/fast_import/child_stdio_to_log.rs b/compute_tools/src/bin/fast_import/child_stdio_to_log.rs new file mode 100644 index 0000000000..6724ef9bed --- /dev/null +++ b/compute_tools/src/bin/fast_import/child_stdio_to_log.rs @@ -0,0 +1,35 @@ +use tokio::io::{AsyncBufReadExt, BufReader}; +use tokio::process::{ChildStderr, ChildStdout}; +use tracing::info; + +/// Asynchronously relays the output from a child process's `stdout` and `stderr` to the tracing log. +/// Each line is read and logged individually, with lossy UTF-8 conversion. +/// +/// # Arguments +/// +/// * `stdout`: An `Option` from the child process. +/// * `stderr`: An `Option` from the child process. +/// +pub(crate) async fn relay_process_output(stdout: Option, stderr: Option) { + let stdout_fut = async { + if let Some(stdout) = stdout { + let reader = BufReader::new(stdout); + let mut lines = reader.lines(); + while let Ok(Some(line)) = lines.next_line().await { + info!(fd = "stdout", "{}", line); + } + } + }; + + let stderr_fut = async { + if let Some(stderr) = stderr { + let reader = BufReader::new(stderr); + let mut lines = reader.lines(); + while let Ok(Some(line)) = lines.next_line().await { + info!(fd = "stderr", "{}", line); + } + } + }; + + tokio::join!(stdout_fut, stderr_fut); +} diff --git a/compute_tools/src/bin/fast_import/s3_uri.rs b/compute_tools/src/bin/fast_import/s3_uri.rs new file mode 100644 index 0000000000..52bbef420f --- /dev/null +++ b/compute_tools/src/bin/fast_import/s3_uri.rs @@ -0,0 +1,75 @@ +use anyhow::Result; +use std::str::FromStr; + +/// Struct to hold parsed S3 components +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct S3Uri { + pub bucket: String, + pub key: String, +} + +impl FromStr for S3Uri { + type Err = anyhow::Error; + + /// Parse an S3 URI into a bucket and key + fn from_str(uri: &str) -> Result { + // Ensure the URI starts with "s3://" + if !uri.starts_with("s3://") { + return Err(anyhow::anyhow!("Invalid S3 URI scheme")); + } + + // Remove the "s3://" prefix + let stripped_uri = &uri[5..]; + + // Split the remaining string into bucket and key parts + if let Some((bucket, key)) = stripped_uri.split_once('/') { + Ok(S3Uri { + bucket: bucket.to_string(), + key: key.to_string(), + }) + } else { + Err(anyhow::anyhow!( + "Invalid S3 URI format, missing bucket or key" + )) + } + } +} + +impl S3Uri { + pub fn append(&self, suffix: &str) -> Self { + Self { + bucket: self.bucket.clone(), + key: format!("{}{}", self.key, suffix), + } + } +} + +impl std::fmt::Display for S3Uri { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "s3://{}/{}", self.bucket, self.key) + } +} + +impl clap::builder::TypedValueParser for S3Uri { + type Value = Self; + + fn parse_ref( + &self, + _cmd: &clap::Command, + _arg: Option<&clap::Arg>, + value: &std::ffi::OsStr, + ) -> Result { + let value_str = value.to_str().ok_or_else(|| { + clap::Error::raw( + clap::error::ErrorKind::InvalidUtf8, + "Invalid UTF-8 sequence", + ) + })?; + S3Uri::from_str(value_str).map_err(|e| { + clap::Error::raw( + clap::error::ErrorKind::InvalidValue, + format!("Failed to parse S3 URI: {}", e), + ) + }) + } +} diff --git a/compute_tools/src/bin/fast_import/s5cmd.rs b/compute_tools/src/bin/fast_import/s5cmd.rs new file mode 100644 index 0000000000..d2d9a79736 --- /dev/null +++ b/compute_tools/src/bin/fast_import/s5cmd.rs @@ -0,0 +1,27 @@ +use anyhow::Context; +use camino::Utf8Path; + +use super::s3_uri::S3Uri; + +pub(crate) async fn sync(local: &Utf8Path, remote: &S3Uri) -> anyhow::Result<()> { + let mut builder = tokio::process::Command::new("s5cmd"); + // s5cmd uses aws-sdk-go v1, hence doesn't support AWS_ENDPOINT_URL + if let Some(val) = std::env::var_os("AWS_ENDPOINT_URL") { + builder.arg("--endpoint-url").arg(val); + } + builder + .arg("sync") + .arg(local.as_str()) + .arg(remote.to_string()); + let st = builder + .spawn() + .context("spawn s5cmd")? + .wait() + .await + .context("wait for s5cmd")?; + if st.success() { + Ok(()) + } else { + Err(anyhow::anyhow!("s5cmd failed")) + } +} diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs index 4fefa831e0..2f6f82dd39 100644 --- a/compute_tools/src/catalog.rs +++ b/compute_tools/src/catalog.rs @@ -1,38 +1,40 @@ -use compute_api::{ - responses::CatalogObjects, - spec::{Database, Role}, -}; +use compute_api::responses::CatalogObjects; use futures::Stream; -use postgres::{Client, NoTls}; +use postgres::NoTls; use std::{path::Path, process::Stdio, result::Result, sync::Arc}; use tokio::{ io::{AsyncBufReadExt, BufReader}, process::Command, - task, + spawn, }; +use tokio_postgres::connect; use tokio_stream::{self as stream, StreamExt}; use tokio_util::codec::{BytesCodec, FramedRead}; use tracing::warn; -use crate::{ - compute::ComputeNode, - pg_helpers::{get_existing_dbs, get_existing_roles}, -}; +use crate::compute::ComputeNode; +use crate::pg_helpers::{get_existing_dbs_async, get_existing_roles_async}; pub async fn get_dbs_and_roles(compute: &Arc) -> anyhow::Result { let connstr = compute.connstr.clone(); - task::spawn_blocking(move || { - let mut client = Client::connect(connstr.as_str(), NoTls)?; - let roles: Vec; - { - let mut xact = client.transaction()?; - roles = get_existing_roles(&mut xact)?; - } - let databases: Vec = get_existing_dbs(&mut client)?.values().cloned().collect(); - Ok(CatalogObjects { roles, databases }) - }) - .await? + let (client, connection): (tokio_postgres::Client, _) = + connect(connstr.as_str(), NoTls).await?; + + spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + let roles = get_existing_roles_async(&client).await?; + + let databases = get_existing_dbs_async(&client) + .await? + .into_values() + .collect(); + + Ok(CatalogObjects { roles, databases }) } #[derive(Debug, thiserror::Error)] diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs index d76eaad0a0..cec2b1bed8 100644 --- a/compute_tools/src/checker.rs +++ b/compute_tools/src/checker.rs @@ -1,37 +1,9 @@ use anyhow::{anyhow, Ok, Result}; -use postgres::Client; use tokio_postgres::NoTls; use tracing::{error, instrument, warn}; use crate::compute::ComputeNode; -/// Create a special service table for availability checks -/// only if it does not exist already. -pub fn create_availability_check_data(client: &mut Client) -> Result<()> { - let query = " - DO $$ - BEGIN - IF NOT EXISTS( - SELECT 1 - FROM pg_catalog.pg_tables - WHERE tablename = 'health_check' - ) - THEN - CREATE TABLE health_check ( - id serial primary key, - updated_at timestamptz default now() - ); - INSERT INTO health_check VALUES (1, now()) - ON CONFLICT (id) DO UPDATE - SET updated_at = now(); - END IF; - END - $$;"; - client.execute(query, &[])?; - - Ok(()) -} - /// Update timestamp in a row in a special service table to check /// that we can actually write some data in this particular timeline. #[instrument(skip_all)] diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 0a8cb14058..4f67425ba8 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1,20 +1,21 @@ -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::env; use std::fs; +use std::iter::once; use std::os::unix::fs::{symlink, PermissionsExt}; use std::path::Path; use std::process::{Command, Stdio}; use std::str::FromStr; use std::sync::atomic::AtomicU32; use std::sync::atomic::Ordering; -use std::sync::{Condvar, Mutex, RwLock}; +use std::sync::{Arc, Condvar, Mutex, RwLock}; use std::thread; use std::time::Duration; use std::time::Instant; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; -use compute_api::spec::PgIdent; +use compute_api::spec::{PgIdent, Role}; use futures::future::join_all; use futures::stream::FuturesUnordered; use futures::StreamExt; @@ -31,15 +32,23 @@ use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec, ExtVersion}; use utils::measured_stream::MeasuredReader; use nix::sys::signal::{kill, Signal}; - use remote_storage::{DownloadError, RemotePath}; +use tokio::spawn; +use url::Url; -use crate::checker::create_availability_check_data; use crate::installed_extensions::get_installed_extensions_sync; use crate::local_proxy; -use crate::logger::inlinify; use crate::pg_helpers::*; use crate::spec::*; +use crate::spec_apply::ApplySpecPhase::{ + CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSuperUser, + DropInvalidDatabases, DropRoles, HandleNeonExtension, HandleOtherExtensions, + RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase, +}; +use crate::spec_apply::PerDatabasePhase::{ + ChangeSchemaPerms, DeleteDBRoleReferences, HandleAnonExtension, +}; +use crate::spec_apply::{apply_operations, MutableApplyContext, DB}; use crate::sync_sk::{check_if_synced, ping_safekeeper}; use crate::{config, extension_server}; @@ -224,10 +233,7 @@ fn maybe_cgexec(cmd: &str) -> Command { } } -/// Create special neon_superuser role, that's a slightly nerfed version of a real superuser -/// that we give to customers -#[instrument(skip_all)] -fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> { +pub(crate) fn construct_superuser_query(spec: &ComputeSpec) -> String { let roles = spec .cluster .roles @@ -296,11 +302,8 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> $$;"#, roles_decl, database_decl, ); - info!("Neon superuser created: {}", inlinify(&query)); - client - .simple_query(&query) - .map_err(|e| anyhow::anyhow!(e).context(query))?; - Ok(()) + + query } impl ComputeNode { @@ -813,21 +816,14 @@ impl ComputeNode { Ok(()) } - /// Do initial configuration of the already started Postgres. - #[instrument(skip_all)] - pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> { - // If connection fails, - // it may be the old node with `zenith_admin` superuser. - // - // In this case we need to connect with old `zenith_admin` name - // and create new user. We cannot simply rename connected user, - // but we can create a new one and grant it all privileges. - let mut connstr = self.connstr.clone(); + async fn get_maintenance_client(url: &Url) -> Result { + let mut connstr = url.clone(); + connstr .query_pairs_mut() .append_pair("application_name", "apply_config"); - let mut client = match Client::connect(connstr.as_str(), NoTls) { + let (client, conn) = match tokio_postgres::connect(connstr.as_str(), NoTls).await { Err(e) => match e.code() { Some(&SqlState::INVALID_PASSWORD) | Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION) => { @@ -845,8 +841,8 @@ impl ComputeNode { let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls) .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?; - // Disable forwarding so that users don't get a cloud_admin role + // Disable forwarding so that users don't get a cloud_admin role let mut func = || { client.simple_query("SET neon.forward_ddl = false")?; client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; @@ -858,49 +854,309 @@ impl ComputeNode { drop(client); // reconnect with connstring with expected name - Client::connect(connstr.as_str(), NoTls)? + tokio_postgres::connect(connstr.as_str(), NoTls).await? } _ => return Err(e.into()), }, - Ok(client) => client, + Ok((client, conn)) => (client, conn), }; - // Disable DDL forwarding because control plane already knows about these roles/databases. + spawn(async move { + if let Err(e) = conn.await { + error!("maintenance client connection error: {}", e); + } + }); + + // Disable DDL forwarding because control plane already knows about the roles/databases + // we're about to modify. client .simple_query("SET neon.forward_ddl = false") + .await .context("apply_config SET neon.forward_ddl = false")?; - // Proceed with post-startup configuration. Note, that order of operations is important. - let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec; - create_neon_superuser(spec, &mut client).context("apply_config create_neon_superuser")?; - cleanup_instance(&mut client).context("apply_config cleanup_instance")?; - handle_roles(spec, &mut client).context("apply_config handle_roles")?; - handle_databases(spec, &mut client).context("apply_config handle_databases")?; - handle_role_deletions(spec, connstr.as_str(), &mut client) - .context("apply_config handle_role_deletions")?; - handle_grants( - spec, - &mut client, - connstr.as_str(), - self.has_feature(ComputeFeature::AnonExtension), - ) - .context("apply_config handle_grants")?; - handle_extensions(spec, &mut client).context("apply_config handle_extensions")?; - handle_extension_neon(&mut client).context("apply_config handle_extension_neon")?; - create_availability_check_data(&mut client) - .context("apply_config create_availability_check_data")?; + Ok(client) + } - // 'Close' connection - drop(client); + /// Apply the spec to the running PostgreSQL instance. + /// The caller can decide to run with multiple clients in parallel, or + /// single mode. Either way, the commands executed will be the same, and + /// only commands run in different databases are parallelized. + #[instrument(skip_all)] + pub fn apply_spec_sql( + &self, + spec: Arc, + url: Arc, + concurrency: usize, + ) -> Result<()> { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build()?; - if let Some(ref local_proxy) = spec.local_proxy_config { + info!("Applying config with max {} concurrency", concurrency); + debug!("Config: {:?}", spec); + + rt.block_on(async { + // Proceed with post-startup configuration. Note, that order of operations is important. + let client = Self::get_maintenance_client(&url).await?; + let spec = spec.clone(); + + let databases = get_existing_dbs_async(&client).await?; + let roles = get_existing_roles_async(&client) + .await? + .into_iter() + .map(|role| (role.name.clone(), role)) + .collect::>(); + + let jwks_roles = Arc::new( + spec.as_ref() + .local_proxy_config + .iter() + .flat_map(|it| &it.jwks) + .flatten() + .flat_map(|setting| &setting.role_names) + .cloned() + .collect::>(), + ); + + let ctx = Arc::new(tokio::sync::RwLock::new(MutableApplyContext { + roles, + dbs: databases, + })); + + for phase in [ + CreateSuperUser, + DropInvalidDatabases, + RenameRoles, + CreateAndAlterRoles, + RenameAndDeleteDatabases, + CreateAndAlterDatabases, + ] { + debug!("Applying phase {:?}", &phase); + apply_operations( + spec.clone(), + ctx.clone(), + jwks_roles.clone(), + phase, + || async { Ok(&client) }, + ) + .await?; + } + + let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency)); + + let db_processes = spec + .cluster + .databases + .iter() + .map(|db| DB::new(db.clone())) + // include + .chain(once(DB::SystemDB)) + .map(|db| { + let spec = spec.clone(); + let ctx = ctx.clone(); + let jwks_roles = jwks_roles.clone(); + let mut url = url.as_ref().clone(); + let concurrency_token = concurrency_token.clone(); + let db = db.clone(); + + debug!("Applying per-database phases for Database {:?}", &db); + + match &db { + DB::SystemDB => {} + DB::UserDB(db) => { + url.set_path(db.name.as_str()); + } + } + + let url = Arc::new(url); + let fut = Self::apply_spec_sql_db( + spec.clone(), + url, + ctx.clone(), + jwks_roles.clone(), + concurrency_token.clone(), + db, + ); + + Ok(spawn(fut)) + }) + .collect::>>(); + + for process in db_processes.into_iter() { + let handle = process?; + handle.await??; + } + + for phase in vec![ + HandleOtherExtensions, + HandleNeonExtension, + CreateAvailabilityCheck, + DropRoles, + ] { + debug!("Applying phase {:?}", &phase); + apply_operations( + spec.clone(), + ctx.clone(), + jwks_roles.clone(), + phase, + || async { Ok(&client) }, + ) + .await?; + } + + Ok::<(), anyhow::Error>(()) + })?; + + Ok(()) + } + + /// Apply SQL migrations of the RunInEachDatabase phase. + /// + /// May opt to not connect to databases that don't have any scheduled + /// operations. The function is concurrency-controlled with the provided + /// semaphore. The caller has to make sure the semaphore isn't exhausted. + async fn apply_spec_sql_db( + spec: Arc, + url: Arc, + ctx: Arc>, + jwks_roles: Arc>, + concurrency_token: Arc, + db: DB, + ) -> Result<()> { + let _permit = concurrency_token.acquire().await?; + + let mut client_conn = None; + + for subphase in [ + DeleteDBRoleReferences, + ChangeSchemaPerms, + HandleAnonExtension, + ] { + apply_operations( + spec.clone(), + ctx.clone(), + jwks_roles.clone(), + RunInEachDatabase { + db: db.clone(), + subphase, + }, + // Only connect if apply_operation actually wants a connection. + // It's quite possible this database doesn't need any queries, + // so by not connecting we save time and effort connecting to + // that database. + || async { + if client_conn.is_none() { + let db_client = Self::get_maintenance_client(&url).await?; + client_conn.replace(db_client); + } + let client = client_conn.as_ref().unwrap(); + Ok(client) + }, + ) + .await?; + } + + drop(client_conn); + + Ok::<(), anyhow::Error>(()) + } + + /// Do initial configuration of the already started Postgres. + #[instrument(skip_all)] + pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> { + // If connection fails, + // it may be the old node with `zenith_admin` superuser. + // + // In this case we need to connect with old `zenith_admin` name + // and create new user. We cannot simply rename connected user, + // but we can create a new one and grant it all privileges. + let mut url = self.connstr.clone(); + url.query_pairs_mut() + .append_pair("application_name", "apply_config"); + + let url = Arc::new(url); + let spec = Arc::new( + compute_state + .pspec + .as_ref() + .expect("spec must be set") + .spec + .clone(), + ); + + // Choose how many concurrent connections to use for applying the spec changes. + // If the cluster is not currently Running we don't have to deal with user connections, + // and can thus use all `max_connections` connection slots. However, that's generally not + // very efficient, so we generally still limit it to a smaller number. + let max_concurrent_connections = if compute_state.status != ComputeStatus::Running { + // If the settings contain 'max_connections', use that as template + if let Some(config) = spec.cluster.settings.find("max_connections") { + config.parse::().ok() + } else { + // Otherwise, try to find the setting in the postgresql_conf string + spec.cluster + .postgresql_conf + .iter() + .flat_map(|conf| conf.split("\n")) + .filter_map(|line| { + if !line.contains("max_connections") { + return None; + } + + let (key, value) = line.split_once("=")?; + let key = key + .trim_start_matches(char::is_whitespace) + .trim_end_matches(char::is_whitespace); + + let value = value + .trim_start_matches(char::is_whitespace) + .trim_end_matches(char::is_whitespace); + + if key != "max_connections" { + return None; + } + + value.parse::().ok() + }) + .next() + } + // If max_connections is present, use at most 1/3rd of that. + // When max_connections is lower than 30, try to use at least 10 connections, but + // never more than max_connections. + .map(|limit| match limit { + 0..10 => limit, + 10..30 => 10, + 30.. => limit / 3, + }) + // If we didn't find max_connections, default to 10 concurrent connections. + .unwrap_or(10) + } else { + // state == Running + // Because the cluster is already in the Running state, we should assume users are + // already connected to the cluster, and high concurrency could negatively + // impact user connectivity. Therefore, we can limit concurrency to the number of + // reserved superuser connections, which users wouldn't be able to use anyway. + spec.cluster + .settings + .find("superuser_reserved_connections") + .iter() + .filter_map(|val| val.parse::().ok()) + .map(|val| if val > 1 { val - 1 } else { 1 }) + .last() + .unwrap_or(3) + }; + + // Merge-apply spec & changes to PostgreSQL state. + self.apply_spec_sql(spec.clone(), url.clone(), max_concurrent_connections)?; + + if let Some(ref local_proxy) = &spec.clone().local_proxy_config { info!("configuring local_proxy"); local_proxy::configure(local_proxy).context("apply_config local_proxy")?; } // Run migrations separately to not hold up cold starts thread::spawn(move || { - let mut connstr = connstr.clone(); + let mut connstr = url.as_ref().clone(); connstr .query_pairs_mut() .append_pair("application_name", "migrations"); @@ -908,7 +1164,8 @@ impl ComputeNode { let mut client = Client::connect(connstr.as_str(), NoTls)?; handle_migrations(&mut client).context("apply_config handle_migrations") }); - Ok(()) + + Ok::<(), anyhow::Error>(()) } // Wrapped this around `pg_ctl reload`, but right now we don't use @@ -971,32 +1228,16 @@ impl ComputeNode { config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || { self.pg_reload_conf()?; - let mut client = Client::connect(self.connstr.as_str(), NoTls)?; - - // Proceed with post-startup configuration. Note, that order of operations is important. - // Disable DDL forwarding because control plane already knows about these roles/databases. if spec.mode == ComputeMode::Primary { - client.simple_query("SET neon.forward_ddl = false")?; - cleanup_instance(&mut client)?; - handle_roles(&spec, &mut client)?; - handle_databases(&spec, &mut client)?; - handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?; - handle_grants( - &spec, - &mut client, - self.connstr.as_str(), - self.has_feature(ComputeFeature::AnonExtension), - )?; - handle_extensions(&spec, &mut client)?; - handle_extension_neon(&mut client)?; - // We can skip handle_migrations here because a new migration can only appear - // if we have a new version of the compute_ctl binary, which can only happen - // if compute got restarted, in which case we'll end up inside of apply_config - // instead of reconfigure. - } + let mut url = self.connstr.clone(); + url.query_pairs_mut() + .append_pair("application_name", "apply_config"); + let url = Arc::new(url); - // 'Close' connection - drop(client); + let spec = Arc::new(spec.clone()); + + self.apply_spec_sql(spec, url, 1)?; + } Ok(()) })?; diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index d4e413034e..d65fe73194 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -116,7 +116,7 @@ pub fn write_postgres_conf( vartype: "enum".to_owned(), }; - write!(file, "{}", opt.to_pg_setting())?; + writeln!(file, "{}", opt.to_pg_setting())?; } } diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 3677582c11..8a047634df 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -20,6 +20,7 @@ use anyhow::Result; use hyper::header::CONTENT_TYPE; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; +use metrics::proto::MetricFamily; use metrics::Encoder; use metrics::TextEncoder; use tokio::task; @@ -72,10 +73,22 @@ async fn routes(req: Request, compute: &Arc) -> Response { debug!("serving /metrics GET request"); - let mut buffer = vec![]; - let metrics = installed_extensions::collect(); + // When we call TextEncoder::encode() below, it will immediately + // return an error if a metric family has no metrics, so we need to + // preemptively filter out metric families with no metrics. + let metrics = installed_extensions::collect() + .into_iter() + .filter(|m| !m.get_metric().is_empty()) + .collect::>(); + let encoder = TextEncoder::new(); - encoder.encode(&metrics, &mut buffer).unwrap(); + let mut buffer = vec![]; + + if let Err(err) = encoder.encode(&metrics, &mut buffer) { + let msg = format!("error handling /metrics request: {err}"); + error!(msg); + return render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR); + } match Response::builder() .status(StatusCode::OK) diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 6dd55855db..79d8b2ca04 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -115,7 +115,7 @@ pub fn get_installed_extensions_sync(connstr: Url) -> Result<()> { static INSTALLED_EXTENSIONS: Lazy = Lazy::new(|| { register_uint_gauge_vec!( - "installed_extensions", + "compute_installed_extensions", "Number of databases where the version of extension is installed", &["extension_name", "version"] ) diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index d27ae58fa2..ee4cf2dfa5 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -23,5 +23,6 @@ pub mod monitor; pub mod params; pub mod pg_helpers; pub mod spec; +mod spec_apply; pub mod swap; pub mod sync_sk; diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index b2dc265864..4a1e5ee0e8 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -10,9 +10,9 @@ use std::thread::JoinHandle; use std::time::{Duration, Instant}; use anyhow::{bail, Result}; +use futures::StreamExt; use ini::Ini; use notify::{RecursiveMode, Watcher}; -use postgres::{Client, Transaction}; use tokio::io::AsyncBufReadExt; use tokio::time::timeout; use tokio_postgres::NoTls; @@ -197,27 +197,34 @@ impl Escaping for PgIdent { } /// Build a list of existing Postgres roles -pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result> { - let postgres_roles = xact - .query("SELECT rolname, rolpassword FROM pg_catalog.pg_authid", &[])? - .iter() +pub async fn get_existing_roles_async(client: &tokio_postgres::Client) -> Result> { + let postgres_roles = client + .query_raw::( + "SELECT rolname, rolpassword FROM pg_catalog.pg_authid", + &[], + ) + .await? + .filter_map(|row| async { row.ok() }) .map(|row| Role { name: row.get("rolname"), encrypted_password: row.get("rolpassword"), options: None, }) - .collect(); + .collect() + .await; Ok(postgres_roles) } /// Build a list of existing Postgres databases -pub fn get_existing_dbs(client: &mut Client) -> Result> { +pub async fn get_existing_dbs_async( + client: &tokio_postgres::Client, +) -> Result> { // `pg_database.datconnlimit = -2` means that the database is in the // invalid state. See: // https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9 - let postgres_dbs: Vec = client - .query( + let rowstream = client + .query_raw::( "SELECT datname AS name, datdba::regrole::text AS owner, @@ -226,8 +233,11 @@ pub fn get_existing_dbs(client: &mut Client) -> Result FROM pg_catalog.pg_database;", &[], - )? - .iter() + ) + .await?; + + let dbs_map = rowstream + .filter_map(|r| async { r.ok() }) .map(|row| Database { name: row.get("name"), owner: row.get("owner"), @@ -235,12 +245,9 @@ pub fn get_existing_dbs(client: &mut Client) -> Result invalid: row.get("invalid"), options: None, }) - .collect(); - - let dbs_map = postgres_dbs - .iter() .map(|db| (db.name.clone(), db.clone())) - .collect::>(); + .collect::>() + .await; Ok(dbs_map) } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 73f3d1006a..c7d2deb090 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,22 +1,17 @@ -use std::collections::HashSet; +use anyhow::{anyhow, bail, Result}; +use postgres::Client; +use reqwest::StatusCode; use std::fs::File; use std::path::Path; -use std::str::FromStr; - -use anyhow::{anyhow, bail, Context, Result}; -use postgres::config::Config; -use postgres::{Client, NoTls}; -use reqwest::StatusCode; -use tracing::{error, info, info_span, instrument, span_enabled, warn, Level}; +use tracing::{error, info, instrument, warn}; use crate::config; -use crate::logger::inlinify; use crate::migration::MigrationRunner; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse}; -use compute_api::spec::{ComputeSpec, PgIdent, Role}; +use compute_api::spec::ComputeSpec; // Do control plane request and return response if any. In case of error it // returns a bool flag indicating whether it makes sense to retry the request @@ -151,625 +146,6 @@ pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> { Ok(()) } -/// Compute could be unexpectedly shut down, for example, during the -/// database dropping. This leaves the database in the invalid state, -/// which prevents new db creation with the same name. This function -/// will clean it up before proceeding with catalog updates. All -/// possible future cleanup operations may go here too. -#[instrument(skip_all)] -pub fn cleanup_instance(client: &mut Client) -> Result<()> { - let existing_dbs = get_existing_dbs(client)?; - - for (_, db) in existing_dbs { - if db.invalid { - // After recent commit in Postgres, interrupted DROP DATABASE - // leaves the database in the invalid state. According to the - // commit message, the only option for user is to drop it again. - // See: - // https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9 - // - // Postgres Neon extension is done the way, that db is de-registered - // in the control plane metadata only after it is dropped. So there is - // a chance that it still thinks that db should exist. This means - // that it will be re-created by `handle_databases()`. Yet, it's fine - // as user can just repeat drop (in vanilla Postgres they would need - // to do the same, btw). - let query = format!("DROP DATABASE IF EXISTS {}", db.name.pg_quote()); - info!("dropping invalid database {}", db.name); - client.execute(query.as_str(), &[])?; - } - } - - Ok(()) -} - -/// Given a cluster spec json and open transaction it handles roles creation, -/// deletion and update. -#[instrument(skip_all)] -pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { - let mut xact = client.transaction()?; - let existing_roles: Vec = get_existing_roles(&mut xact)?; - - let mut jwks_roles = HashSet::new(); - if let Some(local_proxy) = &spec.local_proxy_config { - for jwks_setting in local_proxy.jwks.iter().flatten() { - for role_name in &jwks_setting.role_names { - jwks_roles.insert(role_name.clone()); - } - } - } - - // Print a list of existing Postgres roles (only in debug mode) - if span_enabled!(Level::INFO) { - let mut vec = Vec::new(); - for r in &existing_roles { - vec.push(format!( - "{}:{}", - r.name, - if r.encrypted_password.is_some() { - "[FILTERED]" - } else { - "(null)" - } - )); - } - - info!("postgres roles (total {}): {:?}", vec.len(), vec); - } - - // Process delta operations first - if let Some(ops) = &spec.delta_operations { - info!("processing role renames"); - for op in ops { - match op.action.as_ref() { - "delete_role" => { - // no-op now, roles will be deleted at the end of configuration - } - // Renaming role drops its password, since role name is - // used as a salt there. It is important that this role - // is recorded with a new `name` in the `roles` list. - // Follow up roles update will set the new password. - "rename_role" => { - let new_name = op.new_name.as_ref().unwrap(); - - // XXX: with a limited number of roles it is fine, but consider making it a HashMap - if existing_roles.iter().any(|r| r.name == op.name) { - let query: String = format!( - "ALTER ROLE {} RENAME TO {}", - op.name.pg_quote(), - new_name.pg_quote() - ); - - warn!("renaming role '{}' to '{}'", op.name, new_name); - xact.execute(query.as_str(), &[])?; - } - } - _ => {} - } - } - } - - // Refresh Postgres roles info to handle possible roles renaming - let existing_roles: Vec = get_existing_roles(&mut xact)?; - - info!( - "handling cluster spec roles (total {})", - spec.cluster.roles.len() - ); - for role in &spec.cluster.roles { - let name = &role.name; - // XXX: with a limited number of roles it is fine, but consider making it a HashMap - let pg_role = existing_roles.iter().find(|r| r.name == *name); - - enum RoleAction { - None, - Update, - Create, - } - let action = if let Some(r) = pg_role { - if (r.encrypted_password.is_none() && role.encrypted_password.is_some()) - || (r.encrypted_password.is_some() && role.encrypted_password.is_none()) - { - RoleAction::Update - } else if let Some(pg_pwd) = &r.encrypted_password { - // Check whether password changed or not (trim 'md5' prefix first if any) - // - // This is a backward compatibility hack, which comes from the times when we were using - // md5 for everyone and hashes were stored in the console db without md5 prefix. So when - // role comes from the control-plane (json spec) `Role.encrypted_password` doesn't have md5 prefix, - // but when role comes from Postgres (`get_existing_roles` / `existing_roles`) it has this prefix. - // Here is the only place so far where we compare hashes, so it seems to be the best candidate - // to place this compatibility layer. - let pg_pwd = if let Some(stripped) = pg_pwd.strip_prefix("md5") { - stripped - } else { - pg_pwd - }; - if pg_pwd != *role.encrypted_password.as_ref().unwrap() { - RoleAction::Update - } else { - RoleAction::None - } - } else { - RoleAction::None - } - } else { - RoleAction::Create - }; - - match action { - RoleAction::None => {} - RoleAction::Update => { - // This can be run on /every/ role! Not just ones created through the console. - // This means that if you add some funny ALTER here that adds a permission, - // this will get run even on user-created roles! This will result in different - // behavior before and after a spec gets reapplied. The below ALTER as it stands - // now only grants LOGIN and changes the password. Please do not allow this branch - // to do anything silly. - let mut query: String = format!("ALTER ROLE {} ", name.pg_quote()); - query.push_str(&role.to_pg_options()); - xact.execute(query.as_str(), &[])?; - } - RoleAction::Create => { - // This branch only runs when roles are created through the console, so it is - // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited - // from neon_superuser. - let mut query: String = format!( - "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser", - name.pg_quote() - ); - if jwks_roles.contains(name.as_str()) { - query = format!("CREATE ROLE {}", name.pg_quote()); - } - info!("running role create query: '{}'", &query); - query.push_str(&role.to_pg_options()); - xact.execute(query.as_str(), &[])?; - } - } - - if span_enabled!(Level::INFO) { - let pwd = if role.encrypted_password.is_some() { - "[FILTERED]" - } else { - "(null)" - }; - let action_str = match action { - RoleAction::None => "", - RoleAction::Create => " -> create", - RoleAction::Update => " -> update", - }; - info!(" - {}:{}{}", name, pwd, action_str); - } - } - - xact.commit()?; - - Ok(()) -} - -/// Reassign all dependent objects and delete requested roles. -#[instrument(skip_all)] -pub fn handle_role_deletions(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> { - if let Some(ops) = &spec.delta_operations { - // First, reassign all dependent objects to db owners. - info!("reassigning dependent objects of to-be-deleted roles"); - - // Fetch existing roles. We could've exported and used `existing_roles` from - // `handle_roles()`, but we only make this list there before creating new roles. - // Which is probably fine as we never create to-be-deleted roles, but that'd - // just look a bit untidy. Anyway, the entire `pg_roles` should be in shared - // buffers already, so this shouldn't be a big deal. - let mut xact = client.transaction()?; - let existing_roles: Vec = get_existing_roles(&mut xact)?; - xact.commit()?; - - for op in ops { - // Check that role is still present in Postgres, as this could be a - // restart with the same spec after role deletion. - if op.action == "delete_role" && existing_roles.iter().any(|r| r.name == op.name) { - reassign_owned_objects(spec, connstr, &op.name)?; - } - } - - // Second, proceed with role deletions. - info!("processing role deletions"); - let mut xact = client.transaction()?; - for op in ops { - // We do not check either role exists or not, - // Postgres will take care of it for us - if op.action == "delete_role" { - let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.pg_quote()); - - warn!("deleting role '{}'", &op.name); - xact.execute(query.as_str(), &[])?; - } - } - xact.commit()?; - } - - Ok(()) -} - -fn reassign_owned_objects_in_one_db( - conf: Config, - role_name: &PgIdent, - db_owner: &PgIdent, -) -> Result<()> { - let mut client = conf.connect(NoTls)?; - - // This will reassign all dependent objects to the db owner - let reassign_query = format!( - "REASSIGN OWNED BY {} TO {}", - role_name.pg_quote(), - db_owner.pg_quote() - ); - info!( - "reassigning objects owned by '{}' in db '{}' to '{}'", - role_name, - conf.get_dbname().unwrap_or(""), - db_owner - ); - client.simple_query(&reassign_query)?; - - // This now will only drop privileges of the role - let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote()); - client.simple_query(&drop_query)?; - Ok(()) -} - -// Reassign all owned objects in all databases to the owner of the database. -fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent) -> Result<()> { - for db in &spec.cluster.databases { - if db.owner != *role_name { - let mut conf = Config::from_str(connstr)?; - conf.dbname(&db.name); - reassign_owned_objects_in_one_db(conf, role_name, &db.owner)?; - } - } - - // Also handle case when there are no databases in the spec. - // In this case we need to reassign objects in the default database. - let conf = Config::from_str(connstr)?; - let db_owner = PgIdent::from_str("cloud_admin")?; - reassign_owned_objects_in_one_db(conf, role_name, &db_owner)?; - - Ok(()) -} - -/// It follows mostly the same logic as `handle_roles()` excepting that we -/// does not use an explicit transactions block, since major database operations -/// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level -/// atomicity should be enough here due to the order of operations and various checks, -/// which together provide us idempotency. -#[instrument(skip_all)] -pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { - let existing_dbs = get_existing_dbs(client)?; - - // Print a list of existing Postgres databases (only in debug mode) - if span_enabled!(Level::INFO) { - let mut vec = Vec::new(); - for (dbname, db) in &existing_dbs { - vec.push(format!("{}:{}", dbname, db.owner)); - } - info!("postgres databases (total {}): {:?}", vec.len(), vec); - } - - // Process delta operations first - if let Some(ops) = &spec.delta_operations { - info!("processing delta operations on databases"); - for op in ops { - match op.action.as_ref() { - // We do not check either DB exists or not, - // Postgres will take care of it for us - "delete_db" => { - // In Postgres we can't drop a database if it is a template. - // So we need to unset the template flag first, but it could - // be a retry, so we could've already dropped the database. - // Check that database exists first to make it idempotent. - let unset_template_query: String = format!( - " - DO $$ - BEGIN - IF EXISTS( - SELECT 1 - FROM pg_catalog.pg_database - WHERE datname = {} - ) - THEN - ALTER DATABASE {} is_template false; - END IF; - END - $$;", - escape_literal(&op.name), - &op.name.pg_quote() - ); - // Use FORCE to drop database even if there are active connections. - // We run this from `cloud_admin`, so it should have enough privileges. - // NB: there could be other db states, which prevent us from dropping - // the database. For example, if db is used by any active subscription - // or replication slot. - // TODO: deal with it once we allow logical replication. Proper fix should - // involve returning an error code to the control plane, so it could - // figure out that this is a non-retryable error, return it to the user - // and fail operation permanently. - let drop_db_query: String = format!( - "DROP DATABASE IF EXISTS {} WITH (FORCE)", - &op.name.pg_quote() - ); - - warn!("deleting database '{}'", &op.name); - client.execute(unset_template_query.as_str(), &[])?; - client.execute(drop_db_query.as_str(), &[])?; - } - "rename_db" => { - let new_name = op.new_name.as_ref().unwrap(); - - if existing_dbs.contains_key(&op.name) { - let query: String = format!( - "ALTER DATABASE {} RENAME TO {}", - op.name.pg_quote(), - new_name.pg_quote() - ); - - warn!("renaming database '{}' to '{}'", op.name, new_name); - client.execute(query.as_str(), &[])?; - } - } - _ => {} - } - } - } - - // Refresh Postgres databases info to handle possible renames - let existing_dbs = get_existing_dbs(client)?; - - info!( - "handling cluster spec databases (total {})", - spec.cluster.databases.len() - ); - for db in &spec.cluster.databases { - let name = &db.name; - let pg_db = existing_dbs.get(name); - - enum DatabaseAction { - None, - Update, - Create, - } - let action = if let Some(r) = pg_db { - // XXX: db owner name is returned as quoted string from Postgres, - // when quoting is needed. - let new_owner = if r.owner.starts_with('"') { - db.owner.pg_quote() - } else { - db.owner.clone() - }; - - if new_owner != r.owner { - // Update the owner - DatabaseAction::Update - } else { - DatabaseAction::None - } - } else { - DatabaseAction::Create - }; - - match action { - DatabaseAction::None => {} - DatabaseAction::Update => { - let query: String = format!( - "ALTER DATABASE {} OWNER TO {}", - name.pg_quote(), - db.owner.pg_quote() - ); - let _guard = info_span!("executing", query).entered(); - client.execute(query.as_str(), &[])?; - } - DatabaseAction::Create => { - let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote()); - query.push_str(&db.to_pg_options()); - let _guard = info_span!("executing", query).entered(); - client.execute(query.as_str(), &[])?; - let grant_query: String = format!( - "GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser", - name.pg_quote() - ); - client.execute(grant_query.as_str(), &[])?; - } - }; - - if span_enabled!(Level::INFO) { - let action_str = match action { - DatabaseAction::None => "", - DatabaseAction::Create => " -> create", - DatabaseAction::Update => " -> update", - }; - info!(" - {}:{}{}", db.name, db.owner, action_str); - } - } - - Ok(()) -} - -/// Grant CREATE ON DATABASE to the database owner and do some other alters and grants -/// to allow users creating trusted extensions and re-creating `public` schema, for example. -#[instrument(skip_all)] -pub fn handle_grants( - spec: &ComputeSpec, - client: &mut Client, - connstr: &str, - enable_anon_extension: bool, -) -> Result<()> { - info!("modifying database permissions"); - let existing_dbs = get_existing_dbs(client)?; - - // Do some per-database access adjustments. We'd better do this at db creation time, - // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants - // atomically. - for db in &spec.cluster.databases { - match existing_dbs.get(&db.name) { - Some(pg_db) => { - if pg_db.restrict_conn || pg_db.invalid { - info!( - "skipping grants for db {} (invalid: {}, connections not allowed: {})", - db.name, pg_db.invalid, pg_db.restrict_conn - ); - continue; - } - } - None => { - bail!( - "database {} doesn't exist in Postgres after handle_databases()", - db.name - ); - } - } - - let mut conf = Config::from_str(connstr)?; - conf.dbname(&db.name); - - let mut db_client = conf.connect(NoTls)?; - - // This will only change ownership on the schema itself, not the objects - // inside it. Without it owner of the `public` schema will be `cloud_admin` - // and database owner cannot do anything with it. SQL procedure ensures - // that it won't error out if schema `public` doesn't exist. - let alter_query = format!( - "DO $$\n\ - DECLARE\n\ - schema_owner TEXT;\n\ - BEGIN\n\ - IF EXISTS(\n\ - SELECT nspname\n\ - FROM pg_catalog.pg_namespace\n\ - WHERE nspname = 'public'\n\ - )\n\ - THEN\n\ - SELECT nspowner::regrole::text\n\ - FROM pg_catalog.pg_namespace\n\ - WHERE nspname = 'public'\n\ - INTO schema_owner;\n\ - \n\ - IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin'\n\ - THEN\n\ - ALTER SCHEMA public OWNER TO {};\n\ - END IF;\n\ - END IF;\n\ - END\n\ - $$;", - db.owner.pg_quote() - ); - db_client.simple_query(&alter_query)?; - - // Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user. - // This is needed because since postgres 15 this privilege is removed by default. - // TODO: web_access isn't created for almost 1 year. It could be that we have - // active users of 1 year old projects, but hopefully not, so check it and - // remove this code if possible. The worst thing that could happen is that - // user won't be able to use public schema in NEW databases created in the - // very OLD project. - // - // Also, alter default permissions so that relations created by extensions can be - // used by neon_superuser without permission issues. - let grant_query = "DO $$\n\ - BEGIN\n\ - IF EXISTS(\n\ - SELECT nspname\n\ - FROM pg_catalog.pg_namespace\n\ - WHERE nspname = 'public'\n\ - ) AND\n\ - current_setting('server_version_num')::int/10000 >= 15\n\ - THEN\n\ - IF EXISTS(\n\ - SELECT rolname\n\ - FROM pg_catalog.pg_roles\n\ - WHERE rolname = 'web_access'\n\ - )\n\ - THEN\n\ - GRANT CREATE ON SCHEMA public TO web_access;\n\ - END IF;\n\ - END IF;\n\ - IF EXISTS(\n\ - SELECT nspname\n\ - FROM pg_catalog.pg_namespace\n\ - WHERE nspname = 'public'\n\ - )\n\ - THEN\n\ - ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;\n\ - ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;\n\ - END IF;\n\ - END\n\ - $$;" - .to_string(); - - info!( - "grant query for db {} : {}", - &db.name, - inlinify(&grant_query) - ); - db_client.simple_query(&grant_query)?; - - // it is important to run this after all grants - if enable_anon_extension { - handle_extension_anon(spec, &db.owner, &mut db_client, false) - .context("handle_grants handle_extension_anon")?; - } - } - - Ok(()) -} - -/// Create required system extensions -#[instrument(skip_all)] -pub fn handle_extensions(spec: &ComputeSpec, client: &mut Client) -> Result<()> { - if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") { - if libs.contains("pg_stat_statements") { - // Create extension only if this compute really needs it - let query = "CREATE EXTENSION IF NOT EXISTS pg_stat_statements"; - info!("creating system extensions with query: {}", query); - client.simple_query(query)?; - } - } - - Ok(()) -} - -/// Run CREATE and ALTER EXTENSION neon UPDATE for postgres database -#[instrument(skip_all)] -pub fn handle_extension_neon(client: &mut Client) -> Result<()> { - info!("handle extension neon"); - - let mut query = "CREATE SCHEMA IF NOT EXISTS neon"; - client.simple_query(query)?; - - query = "CREATE EXTENSION IF NOT EXISTS neon WITH SCHEMA neon"; - info!("create neon extension with query: {}", query); - client.simple_query(query)?; - - query = "UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'"; - client.simple_query(query)?; - - query = "ALTER EXTENSION neon SET SCHEMA neon"; - info!("alter neon extension schema with query: {}", query); - client.simple_query(query)?; - - // this will be a no-op if extension is already up to date, - // which may happen in two cases: - // - extension was just installed - // - extension was already installed and is up to date - let query = "ALTER EXTENSION neon UPDATE"; - info!("update neon extension version with query: {}", query); - if let Err(e) = client.simple_query(query) { - error!( - "failed to upgrade neon extension during `handle_extension_neon`: {}", - e - ); - } - - Ok(()) -} - #[instrument(skip_all)] pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> { info!("handle neon extension upgrade"); diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs new file mode 100644 index 0000000000..7308d5d36e --- /dev/null +++ b/compute_tools/src/spec_apply.rs @@ -0,0 +1,680 @@ +use std::collections::{HashMap, HashSet}; +use std::fmt::{Debug, Formatter}; +use std::future::Future; +use std::iter::empty; +use std::iter::once; +use std::sync::Arc; + +use crate::compute::construct_superuser_query; +use crate::pg_helpers::{escape_literal, DatabaseExt, Escaping, GenericOptionsSearch, RoleExt}; +use anyhow::{bail, Result}; +use compute_api::spec::{ComputeFeature, ComputeSpec, Database, PgIdent, Role}; +use futures::future::join_all; +use tokio::sync::RwLock; +use tokio_postgres::Client; +use tracing::{debug, info_span, Instrument}; + +#[derive(Clone)] +pub enum DB { + SystemDB, + UserDB(Database), +} + +impl DB { + pub fn new(db: Database) -> DB { + Self::UserDB(db) + } + + pub fn is_owned_by(&self, role: &PgIdent) -> bool { + match self { + DB::SystemDB => false, + DB::UserDB(db) => &db.owner == role, + } + } +} + +impl Debug for DB { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + DB::SystemDB => f.debug_tuple("SystemDB").finish(), + DB::UserDB(db) => f.debug_tuple("UserDB").field(&db.name).finish(), + } + } +} + +#[derive(Copy, Clone, Debug)] +pub enum PerDatabasePhase { + DeleteDBRoleReferences, + ChangeSchemaPerms, + HandleAnonExtension, +} + +#[derive(Clone, Debug)] +pub enum ApplySpecPhase { + CreateSuperUser, + DropInvalidDatabases, + RenameRoles, + CreateAndAlterRoles, + RenameAndDeleteDatabases, + CreateAndAlterDatabases, + RunInEachDatabase { db: DB, subphase: PerDatabasePhase }, + HandleOtherExtensions, + HandleNeonExtension, + CreateAvailabilityCheck, + DropRoles, +} + +pub struct Operation { + pub query: String, + pub comment: Option, +} + +pub struct MutableApplyContext { + pub roles: HashMap, + pub dbs: HashMap, +} + +/// Appply the operations that belong to the given spec apply phase. +/// +/// Commands within a single phase are executed in order of Iterator yield. +/// Commands of ApplySpecPhase::RunInEachDatabase will execute in the database +/// indicated by its `db` field, and can share a single client for all changes +/// to that database. +/// +/// Notes: +/// - Commands are pipelined, and thus may cause incomplete apply if one +/// command of many fails. +/// - Failing commands will fail the phase's apply step once the return value +/// is processed. +/// - No timeouts have (yet) been implemented. +/// - The caller is responsible for limiting and/or applying concurrency. +pub async fn apply_operations<'a, Fut, F>( + spec: Arc, + ctx: Arc>, + jwks_roles: Arc>, + apply_spec_phase: ApplySpecPhase, + client: F, +) -> Result<()> +where + F: FnOnce() -> Fut, + Fut: Future>, +{ + debug!("Starting phase {:?}", &apply_spec_phase); + let span = info_span!("db_apply_changes", phase=?apply_spec_phase); + let span2 = span.clone(); + async move { + debug!("Processing phase {:?}", &apply_spec_phase); + let ctx = ctx; + + let mut ops = get_operations(&spec, &ctx, &jwks_roles, &apply_spec_phase) + .await? + .peekable(); + + // Return (and by doing so, skip requesting the PostgreSQL client) if + // we don't have any operations scheduled. + if ops.peek().is_none() { + return Ok(()); + } + + let client = client().await?; + + debug!("Applying phase {:?}", &apply_spec_phase); + + let active_queries = ops + .map(|op| { + let Operation { comment, query } = op; + let inspan = match comment { + None => span.clone(), + Some(comment) => info_span!("phase {}: {}", comment), + }; + + async { + let query = query; + let res = client.simple_query(&query).await; + debug!( + "{} {}", + if res.is_ok() { + "successfully executed" + } else { + "failed to execute" + }, + query + ); + res + } + .instrument(inspan) + }) + .collect::>(); + + drop(ctx); + + for it in join_all(active_queries).await { + drop(it?); + } + + debug!("Completed phase {:?}", &apply_spec_phase); + + Ok(()) + } + .instrument(span2) + .await +} + +/// Create a stream of operations to be executed for that phase of applying +/// changes. +/// +/// In the future we may generate a single stream of changes and then +/// sort/merge/batch execution, but for now this is a nice way to improve +/// batching behaviour of the commands. +async fn get_operations<'a>( + spec: &'a ComputeSpec, + ctx: &'a RwLock, + jwks_roles: &'a HashSet, + apply_spec_phase: &'a ApplySpecPhase, +) -> Result + 'a + Send>> { + match apply_spec_phase { + ApplySpecPhase::CreateSuperUser => { + let query = construct_superuser_query(spec); + + Ok(Box::new(once(Operation { + query, + comment: None, + }))) + } + ApplySpecPhase::DropInvalidDatabases => { + let mut ctx = ctx.write().await; + let databases = &mut ctx.dbs; + + let keys: Vec<_> = databases + .iter() + .filter(|(_, db)| db.invalid) + .map(|(dbname, _)| dbname.clone()) + .collect(); + + // After recent commit in Postgres, interrupted DROP DATABASE + // leaves the database in the invalid state. According to the + // commit message, the only option for user is to drop it again. + // See: + // https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9 + // + // Postgres Neon extension is done the way, that db is de-registered + // in the control plane metadata only after it is dropped. So there is + // a chance that it still thinks that the db should exist. This means + // that it will be re-created by the `CreateDatabases` phase. This + // is fine, as user can just drop the table again (in vanilla + // Postgres they would need to do the same). + let operations = keys + .into_iter() + .filter_map(move |dbname| ctx.dbs.remove(&dbname)) + .map(|db| Operation { + query: format!("DROP DATABASE IF EXISTS {}", db.name.pg_quote()), + comment: Some(format!("Dropping invalid database {}", db.name)), + }); + + Ok(Box::new(operations)) + } + ApplySpecPhase::RenameRoles => { + let mut ctx = ctx.write().await; + + let operations = spec + .delta_operations + .iter() + .flatten() + .filter(|op| op.action == "rename_role") + .filter_map(move |op| { + let roles = &mut ctx.roles; + + if roles.contains_key(op.name.as_str()) { + None + } else { + let new_name = op.new_name.as_ref().unwrap(); + let mut role = roles.remove(op.name.as_str()).unwrap(); + + role.name = new_name.clone(); + role.encrypted_password = None; + roles.insert(role.name.clone(), role); + + Some(Operation { + query: format!( + "ALTER ROLE {} RENAME TO {}", + op.name.pg_quote(), + new_name.pg_quote() + ), + comment: Some(format!("renaming role '{}' to '{}'", op.name, new_name)), + }) + } + }); + + Ok(Box::new(operations)) + } + ApplySpecPhase::CreateAndAlterRoles => { + let mut ctx = ctx.write().await; + + let operations = spec.cluster.roles + .iter() + .filter_map(move |role| { + let roles = &mut ctx.roles; + let db_role = roles.get(&role.name); + + match db_role { + Some(db_role) => { + if db_role.encrypted_password != role.encrypted_password { + // This can be run on /every/ role! Not just ones created through the console. + // This means that if you add some funny ALTER here that adds a permission, + // this will get run even on user-created roles! This will result in different + // behavior before and after a spec gets reapplied. The below ALTER as it stands + // now only grants LOGIN and changes the password. Please do not allow this branch + // to do anything silly. + Some(Operation { + query: format!( + "ALTER ROLE {} {}", + role.name.pg_quote(), + role.to_pg_options(), + ), + comment: None, + }) + } else { + None + } + } + None => { + let query = if !jwks_roles.contains(role.name.as_str()) { + format!( + "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser {}", + role.name.pg_quote(), + role.to_pg_options(), + ) + } else { + format!( + "CREATE ROLE {} {}", + role.name.pg_quote(), + role.to_pg_options(), + ) + }; + Some(Operation { + query, + comment: Some(format!("creating role {}", role.name)), + }) + } + } + }); + + Ok(Box::new(operations)) + } + ApplySpecPhase::RenameAndDeleteDatabases => { + let mut ctx = ctx.write().await; + + let operations = spec + .delta_operations + .iter() + .flatten() + .filter_map(move |op| { + let databases = &mut ctx.dbs; + match op.action.as_str() { + // We do not check whether the DB exists or not, + // Postgres will take care of it for us + "delete_db" => { + // In Postgres we can't drop a database if it is a template. + // So we need to unset the template flag first, but it could + // be a retry, so we could've already dropped the database. + // Check that database exists first to make it idempotent. + let unset_template_query: String = format!( + include_str!("sql/unset_template_for_drop_dbs.sql"), + datname_str = escape_literal(&op.name), + datname = &op.name.pg_quote() + ); + + // Use FORCE to drop database even if there are active connections. + // We run this from `cloud_admin`, so it should have enough privileges. + // NB: there could be other db states, which prevent us from dropping + // the database. For example, if db is used by any active subscription + // or replication slot. + // TODO: deal with it once we allow logical replication. Proper fix should + // involve returning an error code to the control plane, so it could + // figure out that this is a non-retryable error, return it to the user + // and fail operation permanently. + let drop_db_query: String = format!( + "DROP DATABASE IF EXISTS {} WITH (FORCE)", + &op.name.pg_quote() + ); + + databases.remove(&op.name); + + Some(vec![ + Operation { + query: unset_template_query, + comment: Some(format!( + "optionally clearing template flags for DB {}", + op.name, + )), + }, + Operation { + query: drop_db_query, + comment: Some(format!("deleting database {}", op.name,)), + }, + ]) + } + "rename_db" => { + if let Some(mut db) = databases.remove(&op.name) { + // update state of known databases + let new_name = op.new_name.as_ref().unwrap(); + db.name = new_name.clone(); + databases.insert(db.name.clone(), db); + + Some(vec![Operation { + query: format!( + "ALTER DATABASE {} RENAME TO {}", + op.name.pg_quote(), + new_name.pg_quote(), + ), + comment: Some(format!( + "renaming database '{}' to '{}'", + op.name, new_name + )), + }]) + } else { + None + } + } + _ => None, + } + }) + .flatten(); + + Ok(Box::new(operations)) + } + ApplySpecPhase::CreateAndAlterDatabases => { + let mut ctx = ctx.write().await; + + let operations = spec + .cluster + .databases + .iter() + .filter_map(move |db| { + let databases = &mut ctx.dbs; + if let Some(edb) = databases.get_mut(&db.name) { + let change_owner = if edb.owner.starts_with('"') { + db.owner.pg_quote() != edb.owner + } else { + db.owner != edb.owner + }; + + edb.owner = db.owner.clone(); + + if change_owner { + Some(vec![Operation { + query: format!( + "ALTER DATABASE {} OWNER TO {}", + db.name.pg_quote(), + db.owner.pg_quote() + ), + comment: Some(format!( + "changing database owner of database {} to {}", + db.name, db.owner + )), + }]) + } else { + None + } + } else { + databases.insert(db.name.clone(), db.clone()); + + Some(vec![ + Operation { + query: format!( + "CREATE DATABASE {} {}", + db.name.pg_quote(), + db.to_pg_options(), + ), + comment: None, + }, + Operation { + query: format!( + "GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser", + db.name.pg_quote() + ), + comment: None, + }, + ]) + } + }) + .flatten(); + + Ok(Box::new(operations)) + } + ApplySpecPhase::RunInEachDatabase { db, subphase } => { + match subphase { + PerDatabasePhase::DeleteDBRoleReferences => { + let ctx = ctx.read().await; + + let operations = + spec.delta_operations + .iter() + .flatten() + .filter(|op| op.action == "delete_role") + .filter_map(move |op| { + if db.is_owned_by(&op.name) { + return None; + } + if !ctx.roles.contains_key(&op.name) { + return None; + } + let quoted = op.name.pg_quote(); + let new_owner = match &db { + DB::SystemDB => PgIdent::from("cloud_admin").pg_quote(), + DB::UserDB(db) => db.owner.pg_quote(), + }; + + Some(vec![ + // This will reassign all dependent objects to the db owner + Operation { + query: format!( + "REASSIGN OWNED BY {} TO {}", + quoted, new_owner, + ), + comment: None, + }, + // This now will only drop privileges of the role + Operation { + query: format!("DROP OWNED BY {}", quoted), + comment: None, + }, + ]) + }) + .flatten(); + + Ok(Box::new(operations)) + } + PerDatabasePhase::ChangeSchemaPerms => { + let ctx = ctx.read().await; + let databases = &ctx.dbs; + + let db = match &db { + // ignore schema permissions on the system database + DB::SystemDB => return Ok(Box::new(empty())), + DB::UserDB(db) => db, + }; + + if databases.get(&db.name).is_none() { + bail!("database {} doesn't exist in PostgreSQL", db.name); + } + + let edb = databases.get(&db.name).unwrap(); + + if edb.restrict_conn || edb.invalid { + return Ok(Box::new(empty())); + } + + let operations = vec![ + Operation { + query: format!( + include_str!("sql/set_public_schema_owner.sql"), + db_owner = db.owner.pg_quote() + ), + comment: None, + }, + Operation { + query: String::from(include_str!("sql/default_grants.sql")), + comment: None, + }, + ] + .into_iter(); + + Ok(Box::new(operations)) + } + PerDatabasePhase::HandleAnonExtension => { + // Only install Anon into user databases + let db = match &db { + DB::SystemDB => return Ok(Box::new(empty())), + DB::UserDB(db) => db, + }; + // Never install Anon when it's not enabled as feature + if !spec.features.contains(&ComputeFeature::AnonExtension) { + return Ok(Box::new(empty())); + } + + // Only install Anon when it's added in preload libraries + let opt_libs = spec.cluster.settings.find("shared_preload_libraries"); + + let libs = match opt_libs { + Some(libs) => libs, + None => return Ok(Box::new(empty())), + }; + + if !libs.contains("anon") { + return Ok(Box::new(empty())); + } + + let db_owner = db.owner.pg_quote(); + + let operations = vec![ + // Create anon extension if this compute needs it + // Users cannot create it themselves, because superuser is required. + Operation { + query: String::from("CREATE EXTENSION IF NOT EXISTS anon CASCADE"), + comment: Some(String::from("creating anon extension")), + }, + // Initialize anon extension + // This also requires superuser privileges, so users cannot do it themselves. + Operation { + query: String::from("SELECT anon.init()"), + comment: Some(String::from("initializing anon extension data")), + }, + Operation { + query: format!("GRANT ALL ON SCHEMA anon TO {}", db_owner), + comment: Some(String::from( + "granting anon extension schema permissions", + )), + }, + Operation { + query: format!( + "GRANT ALL ON ALL FUNCTIONS IN SCHEMA anon TO {}", + db_owner + ), + comment: Some(String::from( + "granting anon extension schema functions permissions", + )), + }, + // We need this, because some functions are defined as SECURITY DEFINER. + // In Postgres SECURITY DEFINER functions are executed with the privileges + // of the owner. + // In anon extension this it is needed to access some GUCs, which are only accessible to + // superuser. But we've patched postgres to allow db_owner to access them as well. + // So we need to change owner of these functions to db_owner. + Operation { + query: format!( + include_str!("sql/anon_ext_fn_reassign.sql"), + db_owner = db_owner, + ), + comment: Some(String::from( + "change anon extension functions owner to database_owner", + )), + }, + Operation { + query: format!( + "GRANT ALL ON ALL TABLES IN SCHEMA anon TO {}", + db_owner, + ), + comment: Some(String::from( + "granting anon extension tables permissions", + )), + }, + Operation { + query: format!( + "GRANT ALL ON ALL SEQUENCES IN SCHEMA anon TO {}", + db_owner, + ), + comment: Some(String::from( + "granting anon extension sequences permissions", + )), + }, + ] + .into_iter(); + + Ok(Box::new(operations)) + } + } + } + // Interestingly, we only install p_s_s in the main database, even when + // it's preloaded. + ApplySpecPhase::HandleOtherExtensions => { + if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") { + if libs.contains("pg_stat_statements") { + return Ok(Box::new(once(Operation { + query: String::from("CREATE EXTENSION IF NOT EXISTS pg_stat_statements"), + comment: Some(String::from("create system extensions")), + }))); + } + } + Ok(Box::new(empty())) + } + ApplySpecPhase::HandleNeonExtension => { + let operations = vec![ + Operation { + query: String::from("CREATE SCHEMA IF NOT EXISTS neon"), + comment: Some(String::from("init: add schema for extension")), + }, + Operation { + query: String::from("CREATE EXTENSION IF NOT EXISTS neon WITH SCHEMA neon"), + comment: Some(String::from( + "init: install the extension if not already installed", + )), + }, + Operation { + query: String::from( + "UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'", + ), + comment: Some(String::from("compat/fix: make neon relocatable")), + }, + Operation { + query: String::from("ALTER EXTENSION neon SET SCHEMA neon"), + comment: Some(String::from("compat/fix: alter neon extension schema")), + }, + Operation { + query: String::from("ALTER EXTENSION neon UPDATE"), + comment: Some(String::from("compat/update: update neon extension version")), + }, + ] + .into_iter(); + + Ok(Box::new(operations)) + } + ApplySpecPhase::CreateAvailabilityCheck => Ok(Box::new(once(Operation { + query: String::from(include_str!("sql/add_availabilitycheck_tables.sql")), + comment: None, + }))), + ApplySpecPhase::DropRoles => { + let operations = spec + .delta_operations + .iter() + .flatten() + .filter(|op| op.action == "delete_role") + .map(|op| Operation { + query: format!("DROP ROLE IF EXISTS {}", op.name.pg_quote()), + comment: None, + }); + + Ok(Box::new(operations)) + } + } +} diff --git a/compute_tools/src/sql/add_availabilitycheck_tables.sql b/compute_tools/src/sql/add_availabilitycheck_tables.sql new file mode 100644 index 0000000000..7c60690c78 --- /dev/null +++ b/compute_tools/src/sql/add_availabilitycheck_tables.sql @@ -0,0 +1,18 @@ +DO $$ +BEGIN + IF NOT EXISTS( + SELECT 1 + FROM pg_catalog.pg_tables + WHERE tablename = 'health_check' + ) + THEN + CREATE TABLE health_check ( + id serial primary key, + updated_at timestamptz default now() + ); + INSERT INTO health_check VALUES (1, now()) + ON CONFLICT (id) DO UPDATE + SET updated_at = now(); + END IF; +END +$$ \ No newline at end of file diff --git a/compute_tools/src/sql/anon_ext_fn_reassign.sql b/compute_tools/src/sql/anon_ext_fn_reassign.sql new file mode 100644 index 0000000000..3d7b15c590 --- /dev/null +++ b/compute_tools/src/sql/anon_ext_fn_reassign.sql @@ -0,0 +1,12 @@ +DO $$ +DECLARE + query varchar; +BEGIN + FOR query IN SELECT 'ALTER FUNCTION '||nsp.nspname||'.'||p.proname||'('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {db_owner};' + FROM pg_proc p + JOIN pg_namespace nsp ON p.pronamespace = nsp.oid + WHERE nsp.nspname = 'anon' LOOP + EXECUTE query; + END LOOP; +END +$$; diff --git a/compute_tools/src/sql/default_grants.sql b/compute_tools/src/sql/default_grants.sql new file mode 100644 index 0000000000..58ebb0690b --- /dev/null +++ b/compute_tools/src/sql/default_grants.sql @@ -0,0 +1,30 @@ +DO +$$ + BEGIN + IF EXISTS( + SELECT nspname + FROM pg_catalog.pg_namespace + WHERE nspname = 'public' + ) AND + current_setting('server_version_num')::int / 10000 >= 15 + THEN + IF EXISTS( + SELECT rolname + FROM pg_catalog.pg_roles + WHERE rolname = 'web_access' + ) + THEN + GRANT CREATE ON SCHEMA public TO web_access; + END IF; + END IF; + IF EXISTS( + SELECT nspname + FROM pg_catalog.pg_namespace + WHERE nspname = 'public' + ) + THEN + ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION; + ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION; + END IF; + END +$$; \ No newline at end of file diff --git a/compute_tools/src/sql/set_public_schema_owner.sql b/compute_tools/src/sql/set_public_schema_owner.sql new file mode 100644 index 0000000000..fd061a713e --- /dev/null +++ b/compute_tools/src/sql/set_public_schema_owner.sql @@ -0,0 +1,23 @@ +DO +$$ + DECLARE + schema_owner TEXT; + BEGIN + IF EXISTS( + SELECT nspname + FROM pg_catalog.pg_namespace + WHERE nspname = 'public' + ) + THEN + SELECT nspowner::regrole::text + FROM pg_catalog.pg_namespace + WHERE nspname = 'public' + INTO schema_owner; + + IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin' + THEN + ALTER SCHEMA public OWNER TO {db_owner}; + END IF; + END IF; + END +$$; \ No newline at end of file diff --git a/compute_tools/src/sql/unset_template_for_drop_dbs.sql b/compute_tools/src/sql/unset_template_for_drop_dbs.sql new file mode 100644 index 0000000000..6c4343a589 --- /dev/null +++ b/compute_tools/src/sql/unset_template_for_drop_dbs.sql @@ -0,0 +1,12 @@ +DO $$ + BEGIN + IF EXISTS( + SELECT 1 + FROM pg_catalog.pg_database + WHERE datname = {datname_str} + ) + THEN + ALTER DATABASE {datname} is_template false; + END IF; + END +$$; \ No newline at end of file diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index c4063bbd1a..1ea443b026 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -1153,6 +1153,7 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re timeline_info.timeline_id ); } + // TODO: rename to import-basebackup-plus-wal TimelineCmd::Import(args) => { let tenant_id = get_tenant_id(args.tenant_id, env)?; let timeline_id = args.timeline_id; diff --git a/deny.toml b/deny.toml index 8bf643f4ba..7a1eecac99 100644 --- a/deny.toml +++ b/deny.toml @@ -33,7 +33,6 @@ reason = "the marvin attack only affects private key decryption, not public key [licenses] allow = [ "Apache-2.0", - "Artistic-2.0", "BSD-2-Clause", "BSD-3-Clause", "CC0-1.0", @@ -67,7 +66,7 @@ registries = [] # More documentation about the 'bans' section can be found here: # https://embarkstudios.github.io/cargo-deny/checks/bans/cfg.html [bans] -multiple-versions = "warn" +multiple-versions = "allow" wildcards = "allow" highlight = "all" workspace-default-features = "allow" diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 3732bfdab2..1f7e913c07 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -113,21 +113,21 @@ so manual installation of dependencies is not recommended. A single virtual environment with all dependencies is described in the single `Pipfile`. ### Prerequisites -- Install Python 3.9 (the minimal supported version) or greater. +- Install Python 3.11 (the minimal supported version) or greater. - Our setup with poetry should work with newer python versions too. So feel free to open an issue with a `c/test-runner` label if something doesn't work as expected. - - If you have some trouble with other version you can resolve it by installing Python 3.9 separately, via [pyenv](https://github.com/pyenv/pyenv) or via system package manager e.g.: + - If you have some trouble with other version you can resolve it by installing Python 3.11 separately, via [pyenv](https://github.com/pyenv/pyenv) or via system package manager e.g.: ```bash # In Ubuntu sudo add-apt-repository ppa:deadsnakes/ppa sudo apt update - sudo apt install python3.9 + sudo apt install python3.11 ``` - Install `poetry` - Exact version of `poetry` is not important, see installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation). - Install dependencies via `./scripts/pysync`. - Note that CI uses specific Python version (look for `PYTHON_VERSION` [here](https://github.com/neondatabase/docker-images/blob/main/rust/Dockerfile)) so if you have different version some linting tools can yield different result locally vs in the CI. - - You can explicitly specify which Python to use by running `poetry env use /path/to/python`, e.g. `poetry env use python3.9`. + - You can explicitly specify which Python to use by running `poetry env use /path/to/python`, e.g. `poetry env use python3.11`. This may also disable the `The currently activated Python version X.Y.Z is not supported by the project` warning. Run `poetry shell` to activate the virtual environment. diff --git a/libs/metrics/src/more_process_metrics.rs b/libs/metrics/src/more_process_metrics.rs index 920724fdec..13a745e031 100644 --- a/libs/metrics/src/more_process_metrics.rs +++ b/libs/metrics/src/more_process_metrics.rs @@ -2,14 +2,28 @@ // This module has heavy inspiration from the prometheus crate's `process_collector.rs`. +use once_cell::sync::Lazy; +use prometheus::Gauge; + use crate::UIntGauge; pub struct Collector { descs: Vec, vmlck: crate::UIntGauge, + cpu_seconds_highres: Gauge, } -const NMETRICS: usize = 1; +const NMETRICS: usize = 2; + +static CLK_TCK_F64: Lazy = Lazy::new(|| { + let long = unsafe { libc::sysconf(libc::_SC_CLK_TCK) }; + if long == -1 { + panic!("sysconf(_SC_CLK_TCK) failed"); + } + let convertible_to_f64: i32 = + i32::try_from(long).expect("sysconf(_SC_CLK_TCK) is larger than i32"); + convertible_to_f64 as f64 +}); impl prometheus::core::Collector for Collector { fn desc(&self) -> Vec<&prometheus::core::Desc> { @@ -27,6 +41,12 @@ impl prometheus::core::Collector for Collector { mfs.extend(self.vmlck.collect()) } } + if let Ok(stat) = myself.stat() { + let cpu_seconds = stat.utime + stat.stime; + self.cpu_seconds_highres + .set(cpu_seconds as f64 / *CLK_TCK_F64); + mfs.extend(self.cpu_seconds_highres.collect()); + } mfs } } @@ -43,7 +63,23 @@ impl Collector { .cloned(), ); - Self { descs, vmlck } + let cpu_seconds_highres = Gauge::new( + "libmetrics_process_cpu_seconds_highres", + "Total user and system CPU time spent in seconds.\ + Sub-second resolution, hence better than `process_cpu_seconds_total`.", + ) + .unwrap(); + descs.extend( + prometheus::core::Collector::desc(&cpu_seconds_highres) + .into_iter() + .cloned(), + ); + + Self { + descs, + vmlck, + cpu_seconds_highres, + } } } diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 8710904cec..79da05da6c 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -33,6 +33,7 @@ remote_storage.workspace = true postgres_backend.workspace = true nix = {workspace = true, optional = true} reqwest.workspace = true +rand.workspace = true [dev-dependencies] bincode.workspace = true diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index ee20613d6d..0abca5cdc2 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -18,7 +18,7 @@ use std::{ str::FromStr, time::Duration, }; -use utils::logging::LogFormat; +use utils::{logging::LogFormat, postgres_client::PostgresClientProtocol}; use crate::models::ImageCompressionAlgorithm; use crate::models::LsnLease; @@ -97,6 +97,15 @@ pub struct ConfigToml { pub control_plane_api: Option, pub control_plane_api_token: Option, pub control_plane_emergency_mode: bool, + /// Unstable feature: subject to change or removal without notice. + /// See . + pub import_pgdata_upcall_api: Option, + /// Unstable feature: subject to change or removal without notice. + /// See . + pub import_pgdata_upcall_api_token: Option, + /// Unstable feature: subject to change or removal without notice. + /// See . + pub import_pgdata_aws_endpoint_url: Option, pub heatmap_upload_concurrency: usize, pub secondary_download_concurrency: usize, pub virtual_file_io_engine: Option, @@ -111,6 +120,7 @@ pub struct ConfigToml { pub no_sync: Option, #[serde(with = "humantime_serde")] pub server_side_batch_timeout: Option, + pub wal_receiver_protocol: PostgresClientProtocol, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -321,6 +331,9 @@ pub mod defaults { pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512; pub const DEFAULT_SERVER_SIDE_BATCH_TIMEOUT: Option<&str> = None; + + pub const DEFAULT_WAL_RECEIVER_PROTOCOL: utils::postgres_client::PostgresClientProtocol = + utils::postgres_client::PostgresClientProtocol::Vanilla; } impl Default for ConfigToml { @@ -386,6 +399,10 @@ impl Default for ConfigToml { control_plane_api_token: (None), control_plane_emergency_mode: (false), + import_pgdata_upcall_api: (None), + import_pgdata_upcall_api_token: (None), + import_pgdata_aws_endpoint_url: (None), + heatmap_upload_concurrency: (DEFAULT_HEATMAP_UPLOAD_CONCURRENCY), secondary_download_concurrency: (DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY), @@ -405,6 +422,7 @@ impl Default for ConfigToml { .map(|duration| humantime::parse_duration(duration).unwrap()), tenant_config: TenantConfigToml::default(), no_sync: None, + wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL, } } } diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 401887d362..c55b9e9484 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -48,7 +48,7 @@ pub struct ShardedRange<'a> { // Calculate the size of a range within the blocks of the same relation, or spanning only the // top page in the previous relation's space. -fn contiguous_range_len(range: &Range) -> u32 { +pub fn contiguous_range_len(range: &Range) -> u32 { debug_assert!(is_contiguous_range(range)); if range.start.field6 == 0xffffffff { range.end.field6 + 1 @@ -67,7 +67,7 @@ fn contiguous_range_len(range: &Range) -> u32 { /// This matters, because: /// - Within such ranges, keys are used contiguously. Outside such ranges it is sparse. /// - Within such ranges, we may calculate distances using simple subtraction of field6. -fn is_contiguous_range(range: &Range) -> bool { +pub fn is_contiguous_range(range: &Range) -> bool { range.start.field1 == range.end.field1 && range.start.field2 == range.end.field2 && range.start.field3 == range.end.field3 diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 0dfa1ba817..1b86bfd91a 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -2,6 +2,8 @@ pub mod detach_ancestor; pub mod partitioning; pub mod utilization; +#[cfg(feature = "testing")] +use camino::Utf8PathBuf; pub use utilization::PageserverUtilization; use std::{ @@ -227,6 +229,9 @@ pub enum TimelineCreateRequestMode { // we continue to accept it by having it here. pg_version: Option, }, + ImportPgdata { + import_pgdata: TimelineCreateRequestModeImportPgdata, + }, // NB: Bootstrap is all-optional, and thus the serde(untagged) will cause serde to stop at Bootstrap. // (serde picks the first matching enum variant, in declaration order). Bootstrap { @@ -236,6 +241,42 @@ pub enum TimelineCreateRequestMode { }, } +#[derive(Serialize, Deserialize, Clone)] +pub struct TimelineCreateRequestModeImportPgdata { + pub location: ImportPgdataLocation, + pub idempotency_key: ImportPgdataIdempotencyKey, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +pub enum ImportPgdataLocation { + #[cfg(feature = "testing")] + LocalFs { path: Utf8PathBuf }, + AwsS3 { + region: String, + bucket: String, + /// A better name for this would be `prefix`; changing requires coordination with cplane. + /// See . + key: String, + }, +} + +#[derive(Serialize, Deserialize, Clone)] +#[serde(transparent)] +pub struct ImportPgdataIdempotencyKey(pub String); + +impl ImportPgdataIdempotencyKey { + pub fn random() -> Self { + use rand::{distributions::Alphanumeric, Rng}; + Self( + rand::thread_rng() + .sample_iter(&Alphanumeric) + .take(20) + .map(char::from) + .collect(), + ) + } +} + #[derive(Serialize, Deserialize, Clone)] pub struct LsnLeaseRequest { pub lsn: Lsn, diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 9075a019b4..8c024375c1 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -834,7 +834,7 @@ impl PostgresBackend { use CopyStreamHandlerEnd::*; let expected_end = match &end { - ServerInitiated(_) | CopyDone | CopyFail | Terminate | EOF => true, + ServerInitiated(_) | CopyDone | CopyFail | Terminate | EOF | Cancelled => true, CopyStreamHandlerEnd::Disconnected(ConnectionError::Io(io_error)) if is_expected_io_error(io_error) => { @@ -874,6 +874,9 @@ impl PostgresBackend { // message from server' when it receives ErrorResponse (anything but // CopyData/CopyDone) back. CopyFail => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)), + + // When cancelled, send no response: we must not risk blocking on sending that response + Cancelled => None, _ => None, }; if let Some((err, errcode)) = err_to_send_and_errcode { @@ -1051,6 +1054,8 @@ pub enum CopyStreamHandlerEnd { /// The connection was lost #[error("connection error: {0}")] Disconnected(#[from] ConnectionError), + #[error("Shutdown")] + Cancelled, /// Some other error #[error(transparent)] Other(#[from] anyhow::Error), diff --git a/libs/postgres_initdb/Cargo.toml b/libs/postgres_initdb/Cargo.toml new file mode 100644 index 0000000000..1605279bce --- /dev/null +++ b/libs/postgres_initdb/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "postgres_initdb" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +anyhow.workspace = true +tokio.workspace = true +camino.workspace = true +thiserror.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/postgres_initdb/src/lib.rs b/libs/postgres_initdb/src/lib.rs new file mode 100644 index 0000000000..2f072354fb --- /dev/null +++ b/libs/postgres_initdb/src/lib.rs @@ -0,0 +1,103 @@ +//! The canonical way we run `initdb` in Neon. +//! +//! initdb has implicit defaults that are dependent on the environment, e.g., locales & collations. +//! +//! This module's job is to eliminate the environment-dependence as much as possible. + +use std::fmt; + +use camino::Utf8Path; + +pub struct RunInitdbArgs<'a> { + pub superuser: &'a str, + pub locale: &'a str, + pub initdb_bin: &'a Utf8Path, + pub pg_version: u32, + pub library_search_path: &'a Utf8Path, + pub pgdata: &'a Utf8Path, +} + +#[derive(thiserror::Error, Debug)] +pub enum Error { + Spawn(std::io::Error), + Failed { + status: std::process::ExitStatus, + stderr: Vec, + }, + WaitOutput(std::io::Error), + Other(anyhow::Error), +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Error::Spawn(e) => write!(f, "Error spawning command: {:?}", e), + Error::Failed { status, stderr } => write!( + f, + "Command failed with status {:?}: {}", + status, + String::from_utf8_lossy(stderr) + ), + Error::WaitOutput(e) => write!(f, "Error waiting for command output: {:?}", e), + Error::Other(e) => write!(f, "Error: {:?}", e), + } + } +} + +pub async fn do_run_initdb(args: RunInitdbArgs<'_>) -> Result<(), Error> { + let RunInitdbArgs { + superuser, + locale, + initdb_bin: initdb_bin_path, + pg_version, + library_search_path, + pgdata, + } = args; + let mut initdb_command = tokio::process::Command::new(initdb_bin_path); + initdb_command + .args(["--pgdata", pgdata.as_ref()]) + .args(["--username", superuser]) + .args(["--encoding", "utf8"]) + .args(["--locale", locale]) + .arg("--no-instructions") + .arg("--no-sync") + .env_clear() + .env("LD_LIBRARY_PATH", library_search_path) + .env("DYLD_LIBRARY_PATH", library_search_path) + .stdin(std::process::Stdio::null()) + // stdout invocation produces the same output every time, we don't need it + .stdout(std::process::Stdio::null()) + // we would be interested in the stderr output, if there was any + .stderr(std::process::Stdio::piped()); + + // Before version 14, only the libc provide was available. + if pg_version > 14 { + // Version 17 brought with it a builtin locale provider which only provides + // C and C.UTF-8. While being safer for collation purposes since it is + // guaranteed to be consistent throughout a major release, it is also more + // performant. + let locale_provider = if pg_version >= 17 { "builtin" } else { "libc" }; + + initdb_command.args(["--locale-provider", locale_provider]); + } + + let initdb_proc = initdb_command.spawn().map_err(Error::Spawn)?; + + // Ideally we'd select here with the cancellation token, but the problem is that + // we can't safely terminate initdb: it launches processes of its own, and killing + // initdb doesn't kill them. After we return from this function, we want the target + // directory to be able to be cleaned up. + // See https://github.com/neondatabase/neon/issues/6385 + let initdb_output = initdb_proc + .wait_with_output() + .await + .map_err(Error::WaitOutput)?; + if !initdb_output.status.success() { + return Err(Error::Failed { + status: initdb_output.status, + stderr: initdb_output.stderr, + }); + } + + Ok(()) +} diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 9ffaaba584..b7871ab01f 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -185,7 +185,7 @@ pub struct CancelKeyData { impl fmt::Display for CancelKeyData { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let hi = (self.backend_pid as u64) << 32; - let lo = self.cancel_key as u64; + let lo = (self.cancel_key as u64) & 0xffffffff; let id = hi | lo; // This format is more compact and might work better for logs. @@ -562,6 +562,9 @@ pub enum BeMessage<'a> { options: &'a [&'a str], }, KeepAlive(WalSndKeepAlive), + /// Batch of interpreted, shard filtered WAL records, + /// ready for the pageserver to ingest + InterpretedWalRecords(InterpretedWalRecordsBody<'a>), } /// Common shorthands. @@ -672,6 +675,25 @@ pub struct WalSndKeepAlive { pub request_reply: bool, } +/// Batch of interpreted WAL records used in the interpreted +/// safekeeper to pageserver protocol. +/// +/// Note that the pageserver uses the RawInterpretedWalRecordsBody +/// counterpart of this from the neondatabase/rust-postgres repo. +/// If you're changing this struct, you likely need to change its +/// twin as well. +#[derive(Debug)] +pub struct InterpretedWalRecordsBody<'a> { + /// End of raw WAL in [`Self::data`] + pub streaming_lsn: u64, + /// Current end of WAL on the server + pub commit_lsn: u64, + /// Start LSN of the next record in PG WAL. + /// Is 0 if the portion of PG WAL did not contain any records. + pub next_record_lsn: u64, + pub data: &'a [u8], +} + pub static HELLO_WORLD_ROW: BeMessage = BeMessage::DataRow(&[Some(b"hello world")]); // single text column @@ -996,6 +1018,20 @@ impl BeMessage<'_> { Ok(()) })? } + + BeMessage::InterpretedWalRecords(rec) => { + // We use the COPY_DATA_TAG for our custom message + // since this tag is interpreted as raw bytes. + buf.put_u8(b'd'); + write_body(buf, |buf| { + buf.put_u8(b'0'); // matches INTERPRETED_WAL_RECORD_TAG in postgres-protocol + // dependency + buf.put_u64(rec.streaming_lsn); + buf.put_u64(rec.commit_lsn); + buf.put_u64(rec.next_record_lsn); + buf.put_slice(rec.data); + }); + } } Ok(()) } @@ -1046,4 +1082,13 @@ mod tests { let data = [0, 0, 0, 7, 0, 0, 0, 0]; FeStartupPacket::parse(&mut BytesMut::from_iter(data)).unwrap_err(); } + + #[test] + fn cancel_key_data() { + let key = CancelKeyData { + backend_pid: -1817212860, + cancel_key: -1183897012, + }; + assert_eq!(format!("{key}"), "CancelKeyData(93af8844b96f2a4c)"); + } } diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index f98d16789c..ae0a94295c 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -24,6 +24,7 @@ use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerCl use bytes::Bytes; use futures::future::Either; use futures::stream::Stream; +use futures::FutureExt; use futures_util::StreamExt; use futures_util::TryStreamExt; use http_types::{StatusCode, Url}; @@ -31,6 +32,7 @@ use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; use tracing::debug; use utils::backoff; +use utils::backoff::exponential_backoff_duration_seconds; use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind}; use crate::{ @@ -97,10 +99,7 @@ impl AzureBlobStorage { pub fn relative_path_to_name(&self, path: &RemotePath) -> String { assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR); - let path_string = path - .get_path() - .as_str() - .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR); + let path_string = path.get_path().as_str(); match &self.prefix_in_container { Some(prefix) => { if prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { @@ -277,19 +276,14 @@ impl RemoteStorage for AzureBlobStorage { cancel: &CancellationToken, ) -> impl Stream> { // get the passed prefix or if it is not set use prefix_in_bucket value - let list_prefix = prefix - .map(|p| self.relative_path_to_name(p)) - .or_else(|| self.prefix_in_container.clone()) - .map(|mut p| { - // required to end with a separator - // otherwise request will return only the entry of a prefix - if matches!(mode, ListingMode::WithDelimiter) - && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) - { - p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); + let list_prefix = prefix.map(|p| self.relative_path_to_name(p)).or_else(|| { + self.prefix_in_container.clone().map(|mut s| { + if !s.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { + s.push(REMOTE_STORAGE_PREFIX_SEPARATOR); } - p - }); + s + }) + }); async_stream::stream! { let _permit = self.permit(RequestKind::List, cancel).await?; @@ -310,40 +304,59 @@ impl RemoteStorage for AzureBlobStorage { let mut next_marker = None; + let mut timeout_try_cnt = 1; + 'outer: loop { let mut builder = builder.clone(); if let Some(marker) = next_marker.clone() { builder = builder.marker(marker); } - let response = builder.into_stream(); - let response = response.into_stream().map_err(to_download_error); - let response = tokio_stream::StreamExt::timeout(response, self.timeout); - let response = response.map(|res| match res { - Ok(res) => res, - Err(_elapsed) => Err(DownloadError::Timeout), + // Azure Blob Rust SDK does not expose the list blob API directly. Users have to use + // their pageable iterator wrapper that returns all keys as a stream. We want to have + // full control of paging, and therefore we only take the first item from the stream. + let mut response_stream = builder.into_stream(); + let response = response_stream.next(); + // Timeout mechanism: Azure client will sometimes stuck on a request, but retrying that request + // would immediately succeed. Therefore, we use exponential backoff timeout to retry the request. + // (Usually, exponential backoff is used to determine the sleep time between two retries.) We + // start with 10.0 second timeout, and double the timeout for each failure, up to 5 failures. + // timeout = min(5 * (1.0+1.0)^n, self.timeout). + let this_timeout = (5.0 * exponential_backoff_duration_seconds(timeout_try_cnt, 1.0, self.timeout.as_secs_f64())).min(self.timeout.as_secs_f64()); + let response = tokio::time::timeout(Duration::from_secs_f64(this_timeout), response); + let response = response.map(|res| { + match res { + Ok(Some(Ok(res))) => Ok(Some(res)), + Ok(Some(Err(e))) => Err(to_download_error(e)), + Ok(None) => Ok(None), + Err(_elasped) => Err(DownloadError::Timeout), + } }); - - let mut response = std::pin::pin!(response); - let mut max_keys = max_keys.map(|mk| mk.get()); let next_item = tokio::select! { - op = response.next() => Ok(op), + op = response => op, _ = cancel.cancelled() => Err(DownloadError::Cancelled), - }?; + }; + + if let Err(DownloadError::Timeout) = &next_item { + timeout_try_cnt += 1; + if timeout_try_cnt <= 5 { + continue; + } + } + + let next_item = next_item?; + + if timeout_try_cnt >= 2 { + tracing::warn!("Azure Blob Storage list timed out and succeeded after {} tries", timeout_try_cnt); + } + timeout_try_cnt = 1; + let Some(entry) = next_item else { // The list is complete, so yield it. break; }; let mut res = Listing::default(); - let entry = match entry { - Ok(entry) => entry, - Err(e) => { - // The error is potentially retryable, so we must rewind the loop after yielding. - yield Err(e); - continue; - } - }; next_marker = entry.continuation(); let prefix_iter = entry .blobs @@ -359,7 +372,7 @@ impl RemoteStorage for AzureBlobStorage { last_modified: k.properties.last_modified.into(), size: k.properties.content_length, } - ); + ); for key in blob_iter { res.keys.push(key); diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 553153826e..ee2fc9d6e2 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -360,7 +360,12 @@ impl RemoteStorage for LocalFs { let mut objects = Vec::with_capacity(keys.len()); for key in keys { let path = key.with_base(&self.storage_root); - let metadata = file_metadata(&path).await?; + let metadata = file_metadata(&path).await; + if let Err(DownloadError::NotFound) = metadata { + // Race: if the file is deleted between listing and metadata check, ignore it. + continue; + } + let metadata = metadata?; if metadata.is_dir() { continue; } diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 545317f958..f440b81d8f 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -29,9 +29,11 @@ jsonwebtoken.workspace = true nix.workspace = true once_cell.workspace = true pin-project-lite.workspace = true +pprof.workspace = true regex.workspace = true routerify.workspace = true serde.workspace = true +serde_with.workspace = true serde_json.workspace = true signal-hook.workspace = true thiserror.workspace = true diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 8ee5abd434..6a85f0ddeb 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -1,7 +1,8 @@ use crate::auth::{AuthError, Claims, SwappableJwtAuth}; use crate::http::error::{api_error_handler, route_error_handler, ApiError}; -use anyhow::Context; -use hyper::header::{HeaderName, AUTHORIZATION}; +use crate::http::request::{get_query_param, parse_query_param}; +use anyhow::{anyhow, Context}; +use hyper::header::{HeaderName, AUTHORIZATION, CONTENT_DISPOSITION}; use hyper::http::HeaderValue; use hyper::Method; use hyper::{header::CONTENT_TYPE, Body, Request, Response}; @@ -12,11 +13,13 @@ use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; use tracing::{debug, info, info_span, warn, Instrument}; use std::future::Future; +use std::io::Write as _; use std::str::FromStr; +use std::time::Duration; use bytes::{Bytes, BytesMut}; -use std::io::Write as _; -use tokio::sync::mpsc; +use pprof::protos::Message as _; +use tokio::sync::{mpsc, Mutex}; use tokio_stream::wrappers::ReceiverStream; static SERVE_METRICS_COUNT: Lazy = Lazy::new(|| { @@ -328,6 +331,82 @@ pub async fn prometheus_metrics_handler(_req: Request) -> Result) -> Result, ApiError> { + enum Format { + Pprof, + Svg, + } + + // Parameters. + let format = match get_query_param(&req, "format")?.as_deref() { + None => Format::Pprof, + Some("pprof") => Format::Pprof, + Some("svg") => Format::Svg, + Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))), + }; + let seconds = match parse_query_param(&req, "seconds")? { + None => 5, + Some(seconds @ 1..=30) => seconds, + Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-30 secs"))), + }; + let frequency_hz = match parse_query_param(&req, "frequency")? { + None => 99, + Some(1001..) => return Err(ApiError::BadRequest(anyhow!("frequency must be <=1000 Hz"))), + Some(frequency) => frequency, + }; + + // Only allow one profiler at a time. + static PROFILE_LOCK: Lazy> = Lazy::new(|| Mutex::new(())); + let _lock = PROFILE_LOCK + .try_lock() + .map_err(|_| ApiError::Conflict("profiler already running".into()))?; + + // Take the profile. + let report = tokio::task::spawn_blocking(move || { + let guard = pprof::ProfilerGuardBuilder::default() + .frequency(frequency_hz) + .blocklist(&["libc", "libgcc", "pthread", "vdso"]) + .build()?; + std::thread::sleep(Duration::from_secs(seconds)); + guard.report().build() + }) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? + .map_err(|pprof_err| ApiError::InternalServerError(pprof_err.into()))?; + + // Return the report in the requested format. + match format { + Format::Pprof => { + let mut body = Vec::new(); + report + .pprof() + .map_err(|err| ApiError::InternalServerError(err.into()))? + .write_to_vec(&mut body) + .map_err(|err| ApiError::InternalServerError(err.into()))?; + + Response::builder() + .status(200) + .header(CONTENT_TYPE, "application/octet-stream") + .header(CONTENT_DISPOSITION, "attachment; filename=\"profile.pb\"") + .body(Body::from(body)) + .map_err(|err| ApiError::InternalServerError(err.into())) + } + + Format::Svg => { + let mut body = Vec::new(); + report + .flamegraph(&mut body) + .map_err(|err| ApiError::InternalServerError(err.into()))?; + Response::builder() + .status(200) + .header(CONTENT_TYPE, "image/svg+xml") + .body(Body::from(body)) + .map_err(|err| ApiError::InternalServerError(err.into())) + } + } +} + pub fn add_request_id_middleware( ) -> Middleware { Middleware::pre(move |req| async move { diff --git a/libs/utils/src/http/request.rs b/libs/utils/src/http/request.rs index 8b8ed5a67f..7ea71685ec 100644 --- a/libs/utils/src/http/request.rs +++ b/libs/utils/src/http/request.rs @@ -30,7 +30,7 @@ pub fn parse_request_param( } } -fn get_query_param<'a>( +pub fn get_query_param<'a>( request: &'a Request, param_name: &str, ) -> Result>, ApiError> { diff --git a/libs/utils/src/postgres_client.rs b/libs/utils/src/postgres_client.rs index dba74f5b0b..3073bbde4c 100644 --- a/libs/utils/src/postgres_client.rs +++ b/libs/utils/src/postgres_client.rs @@ -7,29 +7,94 @@ use postgres_connection::{parse_host_port, PgConnectionConfig}; use crate::id::TenantTimelineId; +/// Postgres client protocol types +#[derive( + Copy, + Clone, + PartialEq, + Eq, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, + Debug, +)] +#[strum(serialize_all = "kebab-case")] +#[repr(u8)] +pub enum PostgresClientProtocol { + /// Usual Postgres replication protocol + Vanilla, + /// Custom shard-aware protocol that replicates interpreted records. + /// Used to send wal from safekeeper to pageserver. + Interpreted, +} + +impl TryFrom for PostgresClientProtocol { + type Error = u8; + + fn try_from(value: u8) -> Result { + Ok(match value { + v if v == (PostgresClientProtocol::Vanilla as u8) => PostgresClientProtocol::Vanilla, + v if v == (PostgresClientProtocol::Interpreted as u8) => { + PostgresClientProtocol::Interpreted + } + x => return Err(x), + }) + } +} + +pub struct ConnectionConfigArgs<'a> { + pub protocol: PostgresClientProtocol, + + pub ttid: TenantTimelineId, + pub shard_number: Option, + pub shard_count: Option, + pub shard_stripe_size: Option, + + pub listen_pg_addr_str: &'a str, + + pub auth_token: Option<&'a str>, + pub availability_zone: Option<&'a str>, +} + +impl<'a> ConnectionConfigArgs<'a> { + fn options(&'a self) -> Vec { + let mut options = vec![ + "-c".to_owned(), + format!("timeline_id={}", self.ttid.timeline_id), + format!("tenant_id={}", self.ttid.tenant_id), + format!("protocol={}", self.protocol as u8), + ]; + + if self.shard_number.is_some() { + assert!(self.shard_count.is_some()); + assert!(self.shard_stripe_size.is_some()); + + options.push(format!("shard_count={}", self.shard_count.unwrap())); + options.push(format!("shard_number={}", self.shard_number.unwrap())); + options.push(format!( + "shard_stripe_size={}", + self.shard_stripe_size.unwrap() + )); + } + + options + } +} + /// Create client config for fetching WAL from safekeeper on particular timeline. /// listen_pg_addr_str is in form host:\[port\]. pub fn wal_stream_connection_config( - TenantTimelineId { - tenant_id, - timeline_id, - }: TenantTimelineId, - listen_pg_addr_str: &str, - auth_token: Option<&str>, - availability_zone: Option<&str>, + args: ConnectionConfigArgs, ) -> anyhow::Result { let (host, port) = - parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?; + parse_host_port(args.listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?; let port = port.unwrap_or(5432); let mut connstr = PgConnectionConfig::new_host_port(host, port) - .extend_options([ - "-c".to_owned(), - format!("timeline_id={}", timeline_id), - format!("tenant_id={}", tenant_id), - ]) - .set_password(auth_token.map(|s| s.to_owned())); + .extend_options(args.options()) + .set_password(args.auth_token.map(|s| s.to_owned())); - if let Some(availability_zone) = availability_zone { + if let Some(availability_zone) = args.availability_zone { connstr = connstr.extend_options([format!("availability_zone={}", availability_zone)]); } diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index 375b227b99..d99dc25769 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -83,7 +83,9 @@ where } wake_these.push(self.heap.pop().unwrap().wake_channel); } - self.update_status(); + if !wake_these.is_empty() { + self.update_status(); + } wake_these } diff --git a/libs/vm_monitor/src/cgroup.rs b/libs/vm_monitor/src/cgroup.rs index 3223765016..1d70cedcf9 100644 --- a/libs/vm_monitor/src/cgroup.rs +++ b/libs/vm_monitor/src/cgroup.rs @@ -218,7 +218,7 @@ impl MemoryStatus { fn debug_slice(slice: &[Self]) -> impl '_ + Debug { struct DS<'a>(&'a [MemoryStatus]); - impl<'a> Debug for DS<'a> { + impl Debug for DS<'_> { fn fmt(&self, f: &mut Formatter) -> fmt::Result { f.debug_struct("[MemoryStatus]") .field( @@ -233,7 +233,7 @@ impl MemoryStatus { struct Fields<'a, F>(&'a [MemoryStatus], F); - impl<'a, F: Fn(&MemoryStatus) -> T, T: Debug> Debug for Fields<'a, F> { + impl T, T: Debug> Debug for Fields<'_, F> { fn fmt(&self, f: &mut Formatter) -> fmt::Result { f.debug_list().entries(self.0.iter().map(&self.1)).finish() } diff --git a/libs/wal_decoder/src/models.rs b/libs/wal_decoder/src/models.rs index c69f8c869a..7ac425cb5f 100644 --- a/libs/wal_decoder/src/models.rs +++ b/libs/wal_decoder/src/models.rs @@ -65,6 +65,18 @@ pub struct InterpretedWalRecord { pub xid: TransactionId, } +impl InterpretedWalRecord { + /// Checks if the WAL record is empty + /// + /// An empty interpreted WAL record has no data or metadata and does not have to be sent to the + /// pageserver. + pub fn is_empty(&self) -> bool { + self.batch.is_empty() + && self.metadata_record.is_none() + && matches!(self.flush_uncommitted, FlushUncommittedRecords::No) + } +} + /// The interpreted part of the Postgres WAL record which requires metadata /// writes to the underlying storage engine. #[derive(Serialize, Deserialize)] diff --git a/libs/wal_decoder/src/serialized_batch.rs b/libs/wal_decoder/src/serialized_batch.rs index 9c0708ebbe..41294da7a0 100644 --- a/libs/wal_decoder/src/serialized_batch.rs +++ b/libs/wal_decoder/src/serialized_batch.rs @@ -496,11 +496,16 @@ impl SerializedValueBatch { } } - /// Checks if the batch is empty - /// - /// A batch is empty when it contains no serialized values. - /// Note that it may still contain observed values. + /// Checks if the batch contains any serialized or observed values pub fn is_empty(&self) -> bool { + !self.has_data() && self.metadata.is_empty() + } + + /// Checks if the batch contains data + /// + /// Note that if this returns false, it may still contain observed values or + /// a metadata record. + pub fn has_data(&self) -> bool { let empty = self.raw.is_empty(); if cfg!(debug_assertions) && empty { @@ -510,7 +515,7 @@ impl SerializedValueBatch { .all(|meta| matches!(meta, ValueMeta::Observed(_)))); } - empty + !empty } /// Returns the number of values serialized in the batch diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 143d8236df..140b287ccc 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -43,6 +43,7 @@ postgres.workspace = true postgres_backend.workspace = true postgres-protocol.workspace = true postgres-types.workspace = true +postgres_initdb.workspace = true rand.workspace = true range-set-blaze = { version = "0.1.16", features = ["alloc"] } regex.workspace = true @@ -68,6 +69,7 @@ url.workspace = true walkdir.workspace = true metrics.workspace = true pageserver_api.workspace = true +pageserver_client.workspace = true # for ResponseErrorMessageExt TOOD refactor that pageserver_compaction.workspace = true postgres_connection.workspace = true postgres_ffi.workspace = true diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 033a9a4619..a8c2c2e992 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -126,6 +126,7 @@ fn main() -> anyhow::Result<()> { // after setting up logging, log the effective IO engine choice and read path implementations info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode"); + info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol"); // The tenants directory contains all the pageserver local disk state. // Create if not exists and make sure all the contents are durable before proceeding. diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index f7be6ecaab..2cf237e72b 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -14,6 +14,7 @@ use remote_storage::{RemotePath, RemoteStorageConfig}; use std::env; use storage_broker::Uri; use utils::logging::SecretString; +use utils::postgres_client::PostgresClientProtocol; use once_cell::sync::OnceCell; use reqwest::Url; @@ -144,6 +145,10 @@ pub struct PageServerConf { /// JWT token for use with the control plane API. pub control_plane_api_token: Option, + pub import_pgdata_upcall_api: Option, + pub import_pgdata_upcall_api_token: Option, + pub import_pgdata_aws_endpoint_url: Option, + /// If true, pageserver will make best-effort to operate without a control plane: only /// for use in major incidents. pub control_plane_emergency_mode: bool, @@ -186,6 +191,8 @@ pub struct PageServerConf { /// Maximum amount of time for which a get page request request /// might be held up for request merging. pub server_side_batch_timeout: Option, + + pub wal_receiver_protocol: PostgresClientProtocol, } /// Token for authentication to safekeepers @@ -328,6 +335,9 @@ impl PageServerConf { control_plane_api, control_plane_api_token, control_plane_emergency_mode, + import_pgdata_upcall_api, + import_pgdata_upcall_api_token, + import_pgdata_aws_endpoint_url, heatmap_upload_concurrency, secondary_download_concurrency, ingest_batch_size, @@ -343,6 +353,7 @@ impl PageServerConf { server_side_batch_timeout, tenant_config, no_sync, + wal_receiver_protocol, } = config_toml; let mut conf = PageServerConf { @@ -383,6 +394,10 @@ impl PageServerConf { timeline_offloading, ephemeral_bytes_per_memory_kb, server_side_batch_timeout, + import_pgdata_upcall_api, + import_pgdata_upcall_api_token: import_pgdata_upcall_api_token.map(SecretString::from), + import_pgdata_aws_endpoint_url, + wal_receiver_protocol, // ------------------------------------------------------------ // fields that require additional validation or custom handling diff --git a/pageserver/src/deletion_queue/deleter.rs b/pageserver/src/deletion_queue/deleter.rs index 1f04bc0410..3d02387c98 100644 --- a/pageserver/src/deletion_queue/deleter.rs +++ b/pageserver/src/deletion_queue/deleter.rs @@ -15,6 +15,7 @@ use tokio_util::sync::CancellationToken; use tracing::info; use tracing::warn; use utils::backoff; +use utils::pausable_failpoint; use crate::metrics; @@ -90,6 +91,7 @@ impl Deleter { /// Block until everything in accumulator has been executed async fn flush(&mut self) -> Result<(), DeletionQueueError> { while !self.accumulator.is_empty() && !self.cancel.is_cancelled() { + pausable_failpoint!("deletion-queue-before-execute-pause"); match self.remote_delete().await { Ok(()) => { // Note: we assume that the remote storage layer returns Ok(()) if some diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 2bc7f5ad39..7fb9247feb 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -623,6 +623,8 @@ paths: existing_initdb_timeline_id: type: string format: hex + import_pgdata: + $ref: "#/components/schemas/TimelineCreateRequestImportPgdata" responses: "201": description: Timeline was created, or already existed with matching parameters @@ -979,6 +981,34 @@ components: $ref: "#/components/schemas/TenantConfig" effective_config: $ref: "#/components/schemas/TenantConfig" + TimelineCreateRequestImportPgdata: + type: object + required: + - location + - idempotency_key + properties: + idempotency_key: + type: string + location: + $ref: "#/components/schemas/TimelineCreateRequestImportPgdataLocation" + TimelineCreateRequestImportPgdataLocation: + type: object + properties: + AwsS3: + $ref: "#/components/schemas/TimelineCreateRequestImportPgdataLocationAwsS3" + TimelineCreateRequestImportPgdataLocationAwsS3: + type: object + properties: + region: + type: string + bucket: + type: string + key: + type: string + required: + - region + - bucket + - key TimelineInfo: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 306b0f35ab..ceb1c3b012 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -40,6 +40,7 @@ use pageserver_api::models::TenantSorting; use pageserver_api::models::TenantState; use pageserver_api::models::TimelineArchivalConfigRequest; use pageserver_api::models::TimelineCreateRequestMode; +use pageserver_api::models::TimelineCreateRequestModeImportPgdata; use pageserver_api::models::TimelinesInfoAndOffloaded; use pageserver_api::models::TopTenantShardItem; use pageserver_api::models::TopTenantShardsRequest; @@ -55,6 +56,7 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::JwtAuth; use utils::failpoint_support::failpoints_handler; +use utils::http::endpoint::profile_cpu_handler; use utils::http::endpoint::prometheus_metrics_handler; use utils::http::endpoint::request_span; use utils::http::request::must_parse_query_param; @@ -80,6 +82,7 @@ use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::storage_layer::LayerName; +use crate::tenant::timeline::import_pgdata; use crate::tenant::timeline::offload::offload_timeline; use crate::tenant::timeline::offload::OffloadError; use crate::tenant::timeline::CompactFlags; @@ -125,7 +128,7 @@ pub struct State { conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, - allowlist_routes: Vec, + allowlist_routes: &'static [&'static str], remote_storage: GenericRemoteStorage, broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, @@ -146,10 +149,13 @@ impl State { deletion_queue_client: DeletionQueueClient, secondary_controller: SecondaryController, ) -> anyhow::Result { - let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml", "/metrics"] - .iter() - .map(|v| v.parse().unwrap()) - .collect::>(); + let allowlist_routes = &[ + "/v1/status", + "/v1/doc", + "/swagger.yml", + "/metrics", + "/profile/cpu", + ]; Ok(Self { conf, tenant_manager, @@ -576,6 +582,35 @@ async fn timeline_create_handler( ancestor_timeline_id, ancestor_start_lsn, }), + TimelineCreateRequestMode::ImportPgdata { + import_pgdata: + TimelineCreateRequestModeImportPgdata { + location, + idempotency_key, + }, + } => tenant::CreateTimelineParams::ImportPgdata(tenant::CreateTimelineParamsImportPgdata { + idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new( + idempotency_key.0, + ), + new_timeline_id, + location: { + use import_pgdata::index_part_format::Location; + use pageserver_api::models::ImportPgdataLocation; + match location { + #[cfg(feature = "testing")] + ImportPgdataLocation::LocalFs { path } => Location::LocalFs { path }, + ImportPgdataLocation::AwsS3 { + region, + bucket, + key, + } => Location::AwsS3 { + region, + bucket, + key, + }, + } + }, + }), }; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error); @@ -3148,7 +3183,7 @@ pub fn make_router( if auth.is_some() { router = router.middleware(auth_middleware(|request| { let state = get_state(request); - if state.allowlist_routes.contains(request.uri()) { + if state.allowlist_routes.contains(&request.uri().path()) { None } else { state.auth.as_deref() @@ -3167,6 +3202,7 @@ pub fn make_router( Ok(router .data(state) .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) + .get("/profile/cpu", |r| request_span(r, profile_cpu_handler)) .get("/v1/status", |r| api_handler(r, status_handler)) .put("/v1/failpoints", |r| { testing_api_handler("manage failpoints", r, failpoints_handler) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index a429dff1fd..5fd02d8749 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1068,21 +1068,26 @@ impl PageServerHandler { )); } - if request_lsn < **latest_gc_cutoff_lsn { + // Check explicitly for INVALID just to get a less scary error message if the request is obviously bogus + if request_lsn == Lsn::INVALID { + return Err(PageStreamError::BadRequest( + "invalid LSN(0) in request".into(), + )); + } + + // Clients should only read from recent LSNs on their timeline, or from locations holding an LSN lease. + // + // We may have older data available, but we make a best effort to detect this case and return an error, + // to distinguish a misbehaving client (asking for old LSN) from a storage issue (data missing at a legitimate LSN). + if request_lsn < **latest_gc_cutoff_lsn && !timeline.is_gc_blocked_by_lsn_lease_deadline() { let gc_info = &timeline.gc_info.read().unwrap(); if !gc_info.leases.contains_key(&request_lsn) { - // The requested LSN is below gc cutoff and is not guarded by a lease. - - // Check explicitly for INVALID just to get a less scary error message if the - // request is obviously bogus - return Err(if request_lsn == Lsn::INVALID { - PageStreamError::BadRequest("invalid LSN(0) in request".into()) - } else { + return Err( PageStreamError::BadRequest(format!( "tried to request a page version that was garbage collected. requested at {} gc cutoff {}", request_lsn, **latest_gc_cutoff_lsn ).into()) - }); + ); } } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 5995d1cc57..c491bfe650 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1229,10 +1229,9 @@ impl<'a> DatadirModification<'a> { } pub(crate) fn has_dirty_data(&self) -> bool { - !self - .pending_data_batch + self.pending_data_batch .as_ref() - .map_or(true, |b| b.is_empty()) + .map_or(false, |b| b.has_data()) } /// Set the current lsn @@ -1408,7 +1407,7 @@ impl<'a> DatadirModification<'a> { Some(pending_batch) => { pending_batch.extend(batch); } - None if !batch.is_empty() => { + None if batch.has_data() => { self.pending_data_batch = Some(batch); } None => { @@ -2276,9 +2275,9 @@ impl<'a> Version<'a> { //--- Metadata structs stored in key-value pairs in the repository. #[derive(Debug, Serialize, Deserialize)] -struct DbDirectory { +pub(crate) struct DbDirectory { // (spcnode, dbnode) -> (do relmapper and PG_VERSION files exist) - dbdirs: HashMap<(Oid, Oid), bool>, + pub(crate) dbdirs: HashMap<(Oid, Oid), bool>, } // The format of TwoPhaseDirectory changed in PostgreSQL v17, because the filenames of @@ -2287,8 +2286,8 @@ struct DbDirectory { // "pg_twophsae/0000000A000002E4". #[derive(Debug, Serialize, Deserialize)] -struct TwoPhaseDirectory { - xids: HashSet, +pub(crate) struct TwoPhaseDirectory { + pub(crate) xids: HashSet, } #[derive(Debug, Serialize, Deserialize)] @@ -2297,12 +2296,12 @@ struct TwoPhaseDirectoryV17 { } #[derive(Debug, Serialize, Deserialize, Default)] -struct RelDirectory { +pub(crate) struct RelDirectory { // Set of relations that exist. (relfilenode, forknum) // // TODO: Store it as a btree or radix tree or something else that spans multiple // key-value pairs, if you have a lot of relations - rels: HashSet<(Oid, u8)>, + pub(crate) rels: HashSet<(Oid, u8)>, } #[derive(Debug, Serialize, Deserialize)] @@ -2311,9 +2310,9 @@ struct RelSizeEntry { } #[derive(Debug, Serialize, Deserialize, Default)] -struct SlruSegmentDirectory { +pub(crate) struct SlruSegmentDirectory { // Set of SLRU segments that exist. - segments: HashSet, + pub(crate) segments: HashSet, } #[derive(Copy, Clone, PartialEq, Eq, Debug, enum_map::Enum)] diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 6a4e90dd55..622738022a 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -381,6 +381,8 @@ pub enum TaskKind { UnitTest, DetachAncestor, + + ImportPgdata, } #[derive(Default)] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 37bf83c984..0214ee68fa 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -43,7 +43,9 @@ use std::sync::atomic::AtomicBool; use std::sync::Weak; use std::time::SystemTime; use storage_broker::BrokerClientChannel; +use timeline::import_pgdata; use timeline::offload::offload_timeline; +use timeline::ShutdownMode; use tokio::io::BufReader; use tokio::sync::watch; use tokio::task::JoinSet; @@ -189,6 +191,7 @@ pub struct TenantSharedResources { /// A [`Tenant`] is really an _attached_ tenant. The configuration /// for an attached tenant is a subset of the [`LocationConf`], represented /// in this struct. +#[derive(Clone)] pub(super) struct AttachedTenantConf { tenant_conf: TenantConfOpt, location: AttachedLocationConfig, @@ -249,7 +252,8 @@ struct TimelinePreload { pub(crate) struct TenantPreload { tenant_manifest: TenantManifest, - timelines: HashMap, + /// Map from timeline ID to a possible timeline preload. It is None iff the timeline is offloaded according to the manifest. + timelines: HashMap>, } /// When we spawn a tenant, there is a special mode for tenant creation that @@ -371,7 +375,6 @@ pub struct Tenant { l0_flush_global_state: L0FlushGlobalState, } - impl std::fmt::Debug for Tenant { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{} ({})", self.tenant_shard_id, self.current_state()) @@ -858,6 +861,7 @@ impl Debug for SetStoppingError { pub(crate) enum CreateTimelineParams { Bootstrap(CreateTimelineParamsBootstrap), Branch(CreateTimelineParamsBranch), + ImportPgdata(CreateTimelineParamsImportPgdata), } #[derive(Debug)] @@ -875,7 +879,14 @@ pub(crate) struct CreateTimelineParamsBranch { pub(crate) ancestor_start_lsn: Option, } -/// What is used to determine idempotency of a [`Tenant::create_timeline`] call in [`Tenant::start_creating_timeline`]. +#[derive(Debug)] +pub(crate) struct CreateTimelineParamsImportPgdata { + pub(crate) new_timeline_id: TimelineId, + pub(crate) location: import_pgdata::index_part_format::Location, + pub(crate) idempotency_key: import_pgdata::index_part_format::IdempotencyKey, +} + +/// What is used to determine idempotency of a [`Tenant::create_timeline`] call in [`Tenant::start_creating_timeline`] in [`Tenant::start_creating_timeline`]. /// /// Each [`Timeline`] object holds [`Self`] as an immutable property in [`Timeline::create_idempotency`]. /// @@ -905,19 +916,50 @@ pub(crate) enum CreateTimelineIdempotency { ancestor_timeline_id: TimelineId, ancestor_start_lsn: Lsn, }, + ImportPgdata(CreatingTimelineIdempotencyImportPgdata), +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct CreatingTimelineIdempotencyImportPgdata { + idempotency_key: import_pgdata::index_part_format::IdempotencyKey, } /// What is returned by [`Tenant::start_creating_timeline`]. #[must_use] -enum StartCreatingTimelineResult<'t> { - CreateGuard(TimelineCreateGuard<'t>), +enum StartCreatingTimelineResult { + CreateGuard(TimelineCreateGuard), Idempotent(Arc), } +enum TimelineInitAndSyncResult { + ReadyToActivate(Arc), + NeedsSpawnImportPgdata(TimelineInitAndSyncNeedsSpawnImportPgdata), +} + +impl TimelineInitAndSyncResult { + fn ready_to_activate(self) -> Option> { + match self { + Self::ReadyToActivate(timeline) => Some(timeline), + _ => None, + } + } +} + +#[must_use] +struct TimelineInitAndSyncNeedsSpawnImportPgdata { + timeline: Arc, + import_pgdata: import_pgdata::index_part_format::Root, + guard: TimelineCreateGuard, +} + /// What is returned by [`Tenant::create_timeline`]. enum CreateTimelineResult { Created(Arc), Idempotent(Arc), + /// IMPORTANT: This [`Arc`] object is not in [`Tenant::timelines`] when + /// we return this result, nor will this concrete object ever be added there. + /// Cf method comment on [`Tenant::create_timeline_import_pgdata`]. + ImportSpawned(Arc), } impl CreateTimelineResult { @@ -925,18 +967,19 @@ impl CreateTimelineResult { match self { Self::Created(_) => "Created", Self::Idempotent(_) => "Idempotent", + Self::ImportSpawned(_) => "ImportSpawned", } } fn timeline(&self) -> &Arc { match self { - Self::Created(t) | Self::Idempotent(t) => t, + Self::Created(t) | Self::Idempotent(t) | Self::ImportSpawned(t) => t, } } /// Unit test timelines aren't activated, test has to do it if it needs to. #[cfg(test)] fn into_timeline_for_test(self) -> Arc { match self { - Self::Created(t) | Self::Idempotent(t) => t, + Self::Created(t) | Self::Idempotent(t) | Self::ImportSpawned(t) => t, } } } @@ -960,33 +1003,13 @@ pub enum CreateTimelineError { } #[derive(thiserror::Error, Debug)] -enum InitdbError { - Other(anyhow::Error), +pub enum InitdbError { + #[error("Operation was cancelled")] Cancelled, - Spawn(std::io::Result<()>), - Failed(std::process::ExitStatus, Vec), -} - -impl fmt::Display for InitdbError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - InitdbError::Cancelled => write!(f, "Operation was cancelled"), - InitdbError::Spawn(e) => write!(f, "Spawn error: {:?}", e), - InitdbError::Failed(status, stderr) => write!( - f, - "Command failed with status {:?}: {}", - status, - String::from_utf8_lossy(stderr) - ), - InitdbError::Other(e) => write!(f, "Error: {:?}", e), - } - } -} - -impl From for InitdbError { - fn from(error: std::io::Error) -> Self { - InitdbError::Spawn(Err(error)) - } + #[error(transparent)] + Other(anyhow::Error), + #[error(transparent)] + Inner(postgres_initdb::Error), } enum CreateTimelineCause { @@ -994,6 +1017,15 @@ enum CreateTimelineCause { Delete, } +enum LoadTimelineCause { + Attach, + Unoffload, + ImportPgdata { + create_guard: TimelineCreateGuard, + activate: ActivateTimelineArgs, + }, +} + #[derive(thiserror::Error, Debug)] pub(crate) enum GcError { // The tenant is shutting down @@ -1070,24 +1102,35 @@ impl Tenant { /// it is marked as Active. #[allow(clippy::too_many_arguments)] async fn timeline_init_and_sync( - &self, + self: &Arc, timeline_id: TimelineId, resources: TimelineResources, - index_part: IndexPart, + mut index_part: IndexPart, metadata: TimelineMetadata, ancestor: Option>, - _ctx: &RequestContext, - ) -> anyhow::Result<()> { + cause: LoadTimelineCause, + ctx: &RequestContext, + ) -> anyhow::Result { let tenant_id = self.tenant_shard_id; - let idempotency = if metadata.ancestor_timeline().is_none() { - CreateTimelineIdempotency::Bootstrap { - pg_version: metadata.pg_version(), + let import_pgdata = index_part.import_pgdata.take(); + let idempotency = match &import_pgdata { + Some(import_pgdata) => { + CreateTimelineIdempotency::ImportPgdata(CreatingTimelineIdempotencyImportPgdata { + idempotency_key: import_pgdata.idempotency_key().clone(), + }) } - } else { - CreateTimelineIdempotency::Branch { - ancestor_timeline_id: metadata.ancestor_timeline().unwrap(), - ancestor_start_lsn: metadata.ancestor_lsn(), + None => { + if metadata.ancestor_timeline().is_none() { + CreateTimelineIdempotency::Bootstrap { + pg_version: metadata.pg_version(), + } + } else { + CreateTimelineIdempotency::Branch { + ancestor_timeline_id: metadata.ancestor_timeline().unwrap(), + ancestor_start_lsn: metadata.ancestor_lsn(), + } + } } }; @@ -1119,39 +1162,91 @@ impl Tenant { format!("Failed to load layermap for timeline {tenant_id}/{timeline_id}") })?; - { - // avoiding holding it across awaits - let mut timelines_accessor = self.timelines.lock().unwrap(); - match timelines_accessor.entry(timeline_id) { - // We should never try and load the same timeline twice during startup - Entry::Occupied(_) => { - unreachable!( - "Timeline {tenant_id}/{timeline_id} already exists in the tenant map" - ); + match import_pgdata { + Some(import_pgdata) if !import_pgdata.is_done() => { + match cause { + LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (), + LoadTimelineCause::ImportPgdata { .. } => { + unreachable!("ImportPgdata should not be reloading timeline import is done and persisted as such in s3") + } } - Entry::Vacant(v) => { - v.insert(Arc::clone(&timeline)); - timeline.maybe_spawn_flush_loop(); + let mut guard = self.timelines_creating.lock().unwrap(); + if !guard.insert(timeline_id) { + // We should never try and load the same timeline twice during startup + unreachable!("Timeline {tenant_id}/{timeline_id} is already being created") } + let timeline_create_guard = TimelineCreateGuard { + _tenant_gate_guard: self.gate.enter()?, + owning_tenant: self.clone(), + timeline_id, + idempotency, + // The users of this specific return value don't need the timline_path in there. + timeline_path: timeline + .conf + .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id), + }; + Ok(TimelineInitAndSyncResult::NeedsSpawnImportPgdata( + TimelineInitAndSyncNeedsSpawnImportPgdata { + timeline, + import_pgdata, + guard: timeline_create_guard, + }, + )) } - }; + Some(_) | None => { + { + let mut timelines_accessor = self.timelines.lock().unwrap(); + match timelines_accessor.entry(timeline_id) { + // We should never try and load the same timeline twice during startup + Entry::Occupied(_) => { + unreachable!( + "Timeline {tenant_id}/{timeline_id} already exists in the tenant map" + ); + } + Entry::Vacant(v) => { + v.insert(Arc::clone(&timeline)); + timeline.maybe_spawn_flush_loop(); + } + } + } - // Sanity check: a timeline should have some content. - anyhow::ensure!( - ancestor.is_some() - || timeline - .layers - .read() - .await - .layer_map() - .expect("currently loading, layer manager cannot be shutdown already") - .iter_historic_layers() - .next() - .is_some(), - "Timeline has no ancestor and no layer files" - ); + // Sanity check: a timeline should have some content. + anyhow::ensure!( + ancestor.is_some() + || timeline + .layers + .read() + .await + .layer_map() + .expect("currently loading, layer manager cannot be shutdown already") + .iter_historic_layers() + .next() + .is_some(), + "Timeline has no ancestor and no layer files" + ); - Ok(()) + match cause { + LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (), + LoadTimelineCause::ImportPgdata { + create_guard, + activate, + } => { + // TODO: see the comment in the task code above how I'm not so certain + // it is safe to activate here because of concurrent shutdowns. + match activate { + ActivateTimelineArgs::Yes { broker_client } => { + info!("activating timeline after reload from pgdata import task"); + timeline.activate(self.clone(), broker_client, None, ctx); + } + ActivateTimelineArgs::No => (), + } + drop(create_guard); + } + } + + Ok(TimelineInitAndSyncResult::ReadyToActivate(timeline)) + } + } } /// Attach a tenant that's available in cloud storage. @@ -1397,7 +1492,7 @@ impl Tenant { // Get list of remote timelines // download index files for every tenant timeline info!("listing remote timelines"); - let (remote_timeline_ids, other_keys) = remote_timeline_client::list_remote_timelines( + let (mut remote_timeline_ids, other_keys) = remote_timeline_client::list_remote_timelines( remote_storage, self.tenant_shard_id, cancel.clone(), @@ -1431,11 +1526,27 @@ impl Tenant { warn!("Unexpected non timeline key {k}"); } + // Avoid downloading IndexPart of offloaded timelines. + let mut offloaded_with_prefix = HashSet::new(); + for offloaded in tenant_manifest.offloaded_timelines.iter() { + if remote_timeline_ids.remove(&offloaded.timeline_id) { + offloaded_with_prefix.insert(offloaded.timeline_id); + } else { + // We'll take care later of timelines in the manifest without a prefix + } + } + + let timelines = self + .load_timelines_metadata(remote_timeline_ids, remote_storage, cancel) + .await?; + Ok(TenantPreload { tenant_manifest, - timelines: self - .load_timelines_metadata(remote_timeline_ids, remote_storage, cancel) - .await?, + timelines: timelines + .into_iter() + .map(|(id, tl)| (id, Some(tl))) + .chain(offloaded_with_prefix.into_iter().map(|id| (id, None))) + .collect(), }) } @@ -1466,6 +1577,19 @@ impl Tenant { offloaded_timelines_list.push((timeline_id, Arc::new(offloaded_timeline))); offloaded_timeline_ids.insert(timeline_id); } + // Complete deletions for offloaded timeline id's from manifest. + // The manifest will be uploaded later in this function. + offloaded_timelines_list + .retain(|(offloaded_id, offloaded)| { + // Existence of a timeline is finally determined by the existence of an index-part.json in remote storage. + // If there is dangling references in another location, they need to be cleaned up. + let delete = !preload.timelines.contains_key(offloaded_id); + if delete { + tracing::info!("Removing offloaded timeline {offloaded_id} from manifest as no remote prefix was found"); + offloaded.defuse_for_tenant_drop(); + } + !delete + }); let mut timelines_to_resume_deletions = vec![]; @@ -1473,10 +1597,9 @@ impl Tenant { let mut timeline_ancestors = HashMap::new(); let mut existent_timelines = HashSet::new(); for (timeline_id, preload) in preload.timelines { - if offloaded_timeline_ids.remove(&timeline_id) { - // The timeline is offloaded, skip loading it. - continue; - } + let Some(preload) = preload else { continue }; + // This is an invariant of the `preload` function's API + assert!(!offloaded_timeline_ids.contains(&timeline_id)); let index_part = match preload.index_part { Ok(i) => { debug!("remote index part exists for timeline {timeline_id}"); @@ -1548,24 +1671,46 @@ impl Tenant { } // TODO again handle early failure - self.load_remote_timeline( - timeline_id, - index_part, - remote_metadata, - TimelineResources { - remote_client, - timeline_get_throttle: self.timeline_get_throttle.clone(), - l0_flush_global_state: self.l0_flush_global_state.clone(), - }, - ctx, - ) - .await - .with_context(|| { - format!( - "failed to load remote timeline {} for tenant {}", - timeline_id, self.tenant_shard_id + let effect = self + .load_remote_timeline( + timeline_id, + index_part, + remote_metadata, + TimelineResources { + remote_client, + timeline_get_throttle: self.timeline_get_throttle.clone(), + l0_flush_global_state: self.l0_flush_global_state.clone(), + }, + LoadTimelineCause::Attach, + ctx, ) - })?; + .await + .with_context(|| { + format!( + "failed to load remote timeline {} for tenant {}", + timeline_id, self.tenant_shard_id + ) + })?; + + match effect { + TimelineInitAndSyncResult::ReadyToActivate(_) => { + // activation happens later, on Tenant::activate + } + TimelineInitAndSyncResult::NeedsSpawnImportPgdata( + TimelineInitAndSyncNeedsSpawnImportPgdata { + timeline, + import_pgdata, + guard, + }, + ) => { + tokio::task::spawn(self.clone().create_timeline_import_pgdata_task( + timeline, + import_pgdata, + ActivateTimelineArgs::No, + guard, + )); + } + } } // Walk through deleted timelines, resume deletion @@ -1586,31 +1731,13 @@ impl Tenant { .context("resume_deletion") .map_err(LoadLocalTimelineError::ResumeDeletion)?; } - // Complete deletions for offloaded timeline id's. - offloaded_timelines_list - .retain(|(offloaded_id, offloaded)| { - // At this point, offloaded_timeline_ids has the list of all offloaded timelines - // without a prefix in S3, so they are inexistent. - // In the end, existence of a timeline is finally determined by the existence of an index-part.json in remote storage. - // If there is a dangling reference in another location, they need to be cleaned up. - let delete = offloaded_timeline_ids.contains(offloaded_id); - if delete { - tracing::info!("Removing offloaded timeline {offloaded_id} from manifest as no remote prefix was found"); - offloaded.defuse_for_tenant_drop(); - } - !delete - }); - if !offloaded_timelines_list.is_empty() { - tracing::info!( - "Tenant has {} offloaded timelines", - offloaded_timelines_list.len() - ); - } + let needs_manifest_upload = + offloaded_timelines_list.len() != preload.tenant_manifest.offloaded_timelines.len(); { let mut offloaded_timelines_accessor = self.timelines_offloaded.lock().unwrap(); offloaded_timelines_accessor.extend(offloaded_timelines_list.into_iter()); } - if !offloaded_timeline_ids.is_empty() { + if needs_manifest_upload { self.store_tenant_manifest().await?; } @@ -1707,13 +1834,14 @@ impl Tenant { #[instrument(skip_all, fields(timeline_id=%timeline_id))] async fn load_remote_timeline( - &self, + self: &Arc, timeline_id: TimelineId, index_part: IndexPart, remote_metadata: TimelineMetadata, resources: TimelineResources, + cause: LoadTimelineCause, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> anyhow::Result { span::debug_assert_current_span_has_tenant_id(); info!("downloading index file for timeline {}", timeline_id); @@ -1740,6 +1868,7 @@ impl Tenant { index_part, remote_metadata, ancestor, + cause, ctx, ) .await @@ -1796,6 +1925,7 @@ impl Tenant { self.tenant_shard_id, timeline_id, self.generation, + &self.tenant_conf.load().location, ) } @@ -1925,6 +2055,7 @@ impl Tenant { TimelineArchivalError::Other(anyhow::anyhow!("Timeline already exists")) } TimelineExclusionError::Other(e) => TimelineArchivalError::Other(e), + TimelineExclusionError::ShuttingDown => TimelineArchivalError::Cancelled, })?; let timeline_preload = self @@ -1963,6 +2094,7 @@ impl Tenant { index_part, remote_metadata, timeline_resources, + LoadTimelineCause::Unoffload, &ctx, ) .await @@ -2200,7 +2332,7 @@ impl Tenant { /// /// Tests should use `Tenant::create_test_timeline` to set up the minimum required metadata keys. pub(crate) async fn create_empty_timeline( - &self, + self: &Arc, new_timeline_id: TimelineId, initdb_lsn: Lsn, pg_version: u32, @@ -2250,7 +2382,7 @@ impl Tenant { // Our current tests don't need the background loops. #[cfg(test)] pub async fn create_test_timeline( - &self, + self: &Arc, new_timeline_id: TimelineId, initdb_lsn: Lsn, pg_version: u32, @@ -2289,7 +2421,7 @@ impl Tenant { #[cfg(test)] #[allow(clippy::too_many_arguments)] pub async fn create_test_timeline_with_layers( - &self, + self: &Arc, new_timeline_id: TimelineId, initdb_lsn: Lsn, pg_version: u32, @@ -2426,6 +2558,16 @@ impl Tenant { self.branch_timeline(&ancestor_timeline, new_timeline_id, ancestor_start_lsn, ctx) .await? } + CreateTimelineParams::ImportPgdata(params) => { + self.create_timeline_import_pgdata( + params, + ActivateTimelineArgs::Yes { + broker_client: broker_client.clone(), + }, + ctx, + ) + .await? + } }; // At this point we have dropped our guard on [`Self::timelines_creating`], and @@ -2468,11 +2610,202 @@ impl Tenant { ); timeline } + CreateTimelineResult::ImportSpawned(timeline) => { + info!("import task spawned, timeline will become visible and activated once the import is done"); + timeline + } }; Ok(activated_timeline) } + /// The returned [`Arc`] is NOT in the [`Tenant::timelines`] map until the import + /// completes in the background. A DIFFERENT [`Arc`] will be inserted into the + /// [`Tenant::timelines`] map when the import completes. + /// We only return an [`Arc`] here so the API handler can create a [`pageserver_api::models::TimelineInfo`] + /// for the response. + async fn create_timeline_import_pgdata( + self: &Arc, + params: CreateTimelineParamsImportPgdata, + activate: ActivateTimelineArgs, + ctx: &RequestContext, + ) -> Result { + let CreateTimelineParamsImportPgdata { + new_timeline_id, + location, + idempotency_key, + } = params; + + let started_at = chrono::Utc::now().naive_utc(); + + // + // There's probably a simpler way to upload an index part, but, remote_timeline_client + // is the canonical way we do it. + // - create an empty timeline in-memory + // - use its remote_timeline_client to do the upload + // - dispose of the uninit timeline + // - keep the creation guard alive + + let timeline_create_guard = match self + .start_creating_timeline( + new_timeline_id, + CreateTimelineIdempotency::ImportPgdata(CreatingTimelineIdempotencyImportPgdata { + idempotency_key: idempotency_key.clone(), + }), + ) + .await? + { + StartCreatingTimelineResult::CreateGuard(guard) => guard, + StartCreatingTimelineResult::Idempotent(timeline) => { + return Ok(CreateTimelineResult::Idempotent(timeline)) + } + }; + + let mut uninit_timeline = { + let this = &self; + let initdb_lsn = Lsn(0); + let _ctx = ctx; + async move { + let new_metadata = TimelineMetadata::new( + // Initialize disk_consistent LSN to 0, The caller must import some data to + // make it valid, before calling finish_creation() + Lsn(0), + None, + None, + Lsn(0), + initdb_lsn, + initdb_lsn, + 15, + ); + this.prepare_new_timeline( + new_timeline_id, + &new_metadata, + timeline_create_guard, + initdb_lsn, + None, + ) + .await + } + } + .await?; + + let in_progress = import_pgdata::index_part_format::InProgress { + idempotency_key, + location, + started_at, + }; + let index_part = import_pgdata::index_part_format::Root::V1( + import_pgdata::index_part_format::V1::InProgress(in_progress), + ); + uninit_timeline + .raw_timeline() + .unwrap() + .remote_client + .schedule_index_upload_for_import_pgdata_state_update(Some(index_part.clone()))?; + + // wait_completion happens in caller + + let (timeline, timeline_create_guard) = uninit_timeline.finish_creation_myself(); + + tokio::spawn(self.clone().create_timeline_import_pgdata_task( + timeline.clone(), + index_part, + activate, + timeline_create_guard, + )); + + // NB: the timeline doesn't exist in self.timelines at this point + Ok(CreateTimelineResult::ImportSpawned(timeline)) + } + + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))] + async fn create_timeline_import_pgdata_task( + self: Arc, + timeline: Arc, + index_part: import_pgdata::index_part_format::Root, + activate: ActivateTimelineArgs, + timeline_create_guard: TimelineCreateGuard, + ) { + debug_assert_current_span_has_tenant_and_timeline_id(); + info!("starting"); + scopeguard::defer! {info!("exiting")}; + + let res = self + .create_timeline_import_pgdata_task_impl( + timeline, + index_part, + activate, + timeline_create_guard, + ) + .await; + if let Err(err) = &res { + error!(?err, "task failed"); + // TODO sleep & retry, sensitive to tenant shutdown + // TODO: allow timeline deletion requests => should cancel the task + } + } + + async fn create_timeline_import_pgdata_task_impl( + self: Arc, + timeline: Arc, + index_part: import_pgdata::index_part_format::Root, + activate: ActivateTimelineArgs, + timeline_create_guard: TimelineCreateGuard, + ) -> Result<(), anyhow::Error> { + let ctx = RequestContext::new(TaskKind::ImportPgdata, DownloadBehavior::Warn); + + info!("importing pgdata"); + import_pgdata::doit(&timeline, index_part, &ctx, self.cancel.clone()) + .await + .context("import")?; + info!("import done"); + + // + // Reload timeline from remote. + // This proves that the remote state is attachable, and it reuses the code. + // + // TODO: think about whether this is safe to do with concurrent Tenant::shutdown. + // timeline_create_guard hols the tenant gate open, so, shutdown cannot _complete_ until we exit. + // But our activate() call might launch new background tasks after Tenant::shutdown + // already went past shutting down the Tenant::timelines, which this timeline here is no part of. + // I think the same problem exists with the bootstrap & branch mgmt API tasks (tenant shutting + // down while bootstrapping/branching + activating), but, the race condition is much more likely + // to manifest because of the long runtime of this import task. + + // in theory this shouldn't even .await anything except for coop yield + info!("shutting down timeline"); + timeline.shutdown(ShutdownMode::Hard).await; + info!("timeline shut down, reloading from remote"); + // TODO: we can't do the following check because create_timeline_import_pgdata must return an Arc + // let Some(timeline) = Arc::into_inner(timeline) else { + // anyhow::bail!("implementation error: timeline that we shut down was still referenced from somewhere"); + // }; + let timeline_id = timeline.timeline_id; + + // load from object storage like Tenant::attach does + let resources = self.build_timeline_resources(timeline_id); + let index_part = resources + .remote_client + .download_index_file(&self.cancel) + .await?; + let index_part = match index_part { + MaybeDeletedIndexPart::Deleted(_) => { + // likely concurrent delete call, cplane should prevent this + anyhow::bail!("index part says deleted but we are not done creating yet, this should not happen but") + } + MaybeDeletedIndexPart::IndexPart(p) => p, + }; + let metadata = index_part.metadata.clone(); + self + .load_remote_timeline(timeline_id, index_part, metadata, resources, LoadTimelineCause::ImportPgdata{ + create_guard: timeline_create_guard, activate, }, &ctx) + .await? + .ready_to_activate() + .context("implementation error: reloaded timeline still needs import after import reported success")?; + + anyhow::Ok(()) + } + pub(crate) async fn delete_timeline( self: Arc, timeline_id: TimelineId, @@ -2516,6 +2849,10 @@ impl Tenant { { let conf = self.tenant_conf.load(); + // If we may not delete layers, then simply skip GC. Even though a tenant + // in AttachedMulti state could do GC and just enqueue the blocked deletions, + // the only advantage to doing it is to perhaps shrink the LayerMap metadata + // a bit sooner than we would achieve by waiting for AttachedSingle status. if !conf.location.may_delete_layers_hint() { info!("Skipping GC in location state {:?}", conf.location); return Ok(GcResult::default()); @@ -2557,7 +2894,14 @@ impl Tenant { { let conf = self.tenant_conf.load(); - if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() { + + // Note that compaction usually requires deletions, but we don't respect + // may_delete_layers_hint here: that is because tenants in AttachedMulti + // should proceed with compaction even if they can't do deletion, to avoid + // accumulating dangerously deep stacks of L0 layers. Deletions will be + // enqueued inside RemoteTimelineClient, and executed layer if/when we transition + // to AttachedSingle state. + if !conf.location.may_upload_layers_hint() { info!("Skipping compaction in location state {:?}", conf.location); return Ok(false); } @@ -3313,6 +3657,13 @@ where Ok(result) } +enum ActivateTimelineArgs { + Yes { + broker_client: storage_broker::BrokerClientChannel, + }, + No, +} + impl Tenant { pub fn tenant_specific_overrides(&self) -> TenantConfOpt { self.tenant_conf.load().tenant_conf.clone() @@ -3435,6 +3786,7 @@ impl Tenant { // this race is not possible if both request types come from the storage // controller (as they should!) because an exclusive op lock is required // on the storage controller side. + self.tenant_conf.rcu(|inner| { Arc::new(AttachedTenantConf { tenant_conf: new_tenant_conf.clone(), @@ -3444,20 +3796,22 @@ impl Tenant { }) }); + let updated = self.tenant_conf.load().clone(); + self.tenant_conf_updated(&new_tenant_conf); // Don't hold self.timelines.lock() during the notifies. // There's no risk of deadlock right now, but there could be if we consolidate // mutexes in struct Timeline in the future. let timelines = self.list_timelines(); for timeline in timelines { - timeline.tenant_conf_updated(&new_tenant_conf); + timeline.tenant_conf_updated(&updated); } } pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) { let new_tenant_conf = new_conf.tenant_conf.clone(); - self.tenant_conf.store(Arc::new(new_conf)); + self.tenant_conf.store(Arc::new(new_conf.clone())); self.tenant_conf_updated(&new_tenant_conf); // Don't hold self.timelines.lock() during the notifies. @@ -3465,7 +3819,7 @@ impl Tenant { // mutexes in struct Timeline in the future. let timelines = self.list_timelines(); for timeline in timelines { - timeline.tenant_conf_updated(&new_tenant_conf); + timeline.tenant_conf_updated(&new_conf); } } @@ -3493,6 +3847,7 @@ impl Tenant { /// `validate_ancestor == false` is used when a timeline is created for deletion /// and we might not have the ancestor present anymore which is fine for to be /// deleted timelines. + #[allow(clippy::too_many_arguments)] fn create_timeline_struct( &self, new_timeline_id: TimelineId, @@ -4256,16 +4611,17 @@ impl Tenant { /// If the timeline was already created in the meantime, we check whether this /// request conflicts or is idempotent , based on `state`. async fn start_creating_timeline( - &self, + self: &Arc, new_timeline_id: TimelineId, idempotency: CreateTimelineIdempotency, - ) -> Result, CreateTimelineError> { + ) -> Result { let allow_offloaded = false; match self.create_timeline_create_guard(new_timeline_id, idempotency, allow_offloaded) { Ok(create_guard) => { pausable_failpoint!("timeline-creation-after-uninit"); Ok(StartCreatingTimelineResult::CreateGuard(create_guard)) } + Err(TimelineExclusionError::ShuttingDown) => Err(CreateTimelineError::ShuttingDown), Err(TimelineExclusionError::AlreadyCreating) => { // Creation is in progress, we cannot create it again, and we cannot // check if this request matches the existing one, so caller must try @@ -4533,6 +4889,7 @@ impl Tenant { self.tenant_shard_id, timeline_id, self.generation, + &self.tenant_conf.load().location, ) } @@ -4554,7 +4911,7 @@ impl Tenant { &'a self, new_timeline_id: TimelineId, new_metadata: &TimelineMetadata, - create_guard: TimelineCreateGuard<'a>, + create_guard: TimelineCreateGuard, start_lsn: Lsn, ancestor: Option>, ) -> anyhow::Result> { @@ -4614,7 +4971,7 @@ impl Tenant { /// The `allow_offloaded` parameter controls whether to tolerate the existence of /// offloaded timelines or not. fn create_timeline_create_guard( - &self, + self: &Arc, timeline_id: TimelineId, idempotency: CreateTimelineIdempotency, allow_offloaded: bool, @@ -4874,48 +5231,16 @@ async fn run_initdb( let _permit = INIT_DB_SEMAPHORE.acquire().await; - let mut initdb_command = tokio::process::Command::new(&initdb_bin_path); - initdb_command - .args(["--pgdata", initdb_target_dir.as_ref()]) - .args(["--username", &conf.superuser]) - .args(["--encoding", "utf8"]) - .args(["--locale", &conf.locale]) - .arg("--no-instructions") - .arg("--no-sync") - .env_clear() - .env("LD_LIBRARY_PATH", &initdb_lib_dir) - .env("DYLD_LIBRARY_PATH", &initdb_lib_dir) - .stdin(std::process::Stdio::null()) - // stdout invocation produces the same output every time, we don't need it - .stdout(std::process::Stdio::null()) - // we would be interested in the stderr output, if there was any - .stderr(std::process::Stdio::piped()); - - // Before version 14, only the libc provide was available. - if pg_version > 14 { - // Version 17 brought with it a builtin locale provider which only provides - // C and C.UTF-8. While being safer for collation purposes since it is - // guaranteed to be consistent throughout a major release, it is also more - // performant. - let locale_provider = if pg_version >= 17 { "builtin" } else { "libc" }; - - initdb_command.args(["--locale-provider", locale_provider]); - } - - let initdb_proc = initdb_command.spawn()?; - - // Ideally we'd select here with the cancellation token, but the problem is that - // we can't safely terminate initdb: it launches processes of its own, and killing - // initdb doesn't kill them. After we return from this function, we want the target - // directory to be able to be cleaned up. - // See https://github.com/neondatabase/neon/issues/6385 - let initdb_output = initdb_proc.wait_with_output().await?; - if !initdb_output.status.success() { - return Err(InitdbError::Failed( - initdb_output.status, - initdb_output.stderr, - )); - } + let res = postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs { + superuser: &conf.superuser, + locale: &conf.locale, + initdb_bin: &initdb_bin_path, + pg_version, + library_search_path: &initdb_lib_dir, + pgdata: initdb_target_dir, + }) + .await + .map_err(InitdbError::Inner); // This isn't true cancellation support, see above. Still return an error to // excercise the cancellation code path. @@ -4923,7 +5248,7 @@ async fn run_initdb( return Err(InitdbError::Cancelled); } - Ok(()) + res } /// Dump contents of a layer file to stdout. diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 4fc9d740c8..92b2200542 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1719,10 +1719,11 @@ impl TenantManager { parent_layers.push(relative_path.to_owned()); } } - debug_assert!( - !parent_layers.is_empty(), - "shutdown cannot empty the layermap" - ); + + if parent_layers.is_empty() { + tracing::info!("Ancestor shard has no resident layer to hard link"); + } + (parent_timelines, parent_layers) }; diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 94f42c7827..007bd3eef0 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -197,8 +197,9 @@ use utils::backoff::{ self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, }; use utils::pausable_failpoint; +use utils::shard::ShardNumber; -use std::collections::{HashMap, VecDeque}; +use std::collections::{HashMap, HashSet, VecDeque}; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex, OnceLock}; use std::time::Duration; @@ -222,7 +223,7 @@ use crate::task_mgr::shutdown_token; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::download::download_retry; use crate::tenant::storage_layer::AsLayerDesc; -use crate::tenant::upload_queue::{Delete, UploadQueueStoppedDeletable}; +use crate::tenant::upload_queue::{Delete, OpType, UploadQueueStoppedDeletable}; use crate::tenant::TIMELINES_SEGMENT_NAME; use crate::{ config::PageServerConf, @@ -240,8 +241,10 @@ use utils::id::{TenantId, TimelineId}; use self::index::IndexPart; +use super::config::AttachedLocationConfig; use super::metadata::MetadataUpdate; use super::storage_layer::{Layer, LayerName, ResidentLayer}; +use super::timeline::import_pgdata; use super::upload_queue::{NotInitialized, SetDeletedFlagProgress}; use super::{DeleteTimelineError, Generation}; @@ -301,6 +304,36 @@ pub enum WaitCompletionError { #[derive(Debug, thiserror::Error)] #[error("Upload queue either in unexpected state or hasn't downloaded manifest yet")] pub struct UploadQueueNotReadyError; +/// Behavioral modes that enable seamless live migration. +/// +/// See docs/rfcs/028-pageserver-migration.md to understand how these fit in. +struct RemoteTimelineClientConfig { + /// If this is false, then update to remote_consistent_lsn are dropped rather + /// than being submitted to DeletionQueue for validation. This behavior is + /// used when a tenant attachment is known to have a stale generation number, + /// such that validation attempts will always fail. This is not necessary + /// for correctness, but avoids spamming error statistics with failed validations + /// when doing migrations of tenants. + process_remote_consistent_lsn_updates: bool, + + /// If this is true, then object deletions are held in a buffer in RemoteTimelineClient + /// rather than being submitted to the DeletionQueue. This behavior is used when a tenant + /// is known to be multi-attached, in order to avoid disrupting other attached tenants + /// whose generations' metadata refers to the deleted objects. + block_deletions: bool, +} + +/// RemoteTimelineClientConfig's state is entirely driven by LocationConf, but we do +/// not carry the entire LocationConf structure: it's much more than we need. The From +/// impl extracts the subset of the LocationConf that is interesting to RemoteTimelineClient. +impl From<&AttachedLocationConfig> for RemoteTimelineClientConfig { + fn from(lc: &AttachedLocationConfig) -> Self { + Self { + block_deletions: !lc.may_delete_layers_hint(), + process_remote_consistent_lsn_updates: lc.may_upload_layers_hint(), + } + } +} /// A client for accessing a timeline's data in remote storage. /// @@ -321,7 +354,7 @@ pub struct UploadQueueNotReadyError; /// in the index part file, whenever timeline metadata is uploaded. /// /// Downloads are not queued, they are performed immediately. -pub struct RemoteTimelineClient { +pub(crate) struct RemoteTimelineClient { conf: &'static PageServerConf, runtime: tokio::runtime::Handle, @@ -338,6 +371,9 @@ pub struct RemoteTimelineClient { deletion_queue_client: DeletionQueueClient, + /// Subset of tenant configuration used to control upload behaviors during migrations + config: std::sync::RwLock, + cancel: CancellationToken, } @@ -348,13 +384,14 @@ impl RemoteTimelineClient { /// Note: the caller must initialize the upload queue before any uploads can be scheduled, /// by calling init_upload_queue. /// - pub fn new( + pub(crate) fn new( remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, conf: &'static PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, generation: Generation, + location_conf: &AttachedLocationConfig, ) -> RemoteTimelineClient { RemoteTimelineClient { conf, @@ -374,6 +411,7 @@ impl RemoteTimelineClient { &tenant_shard_id, &timeline_id, )), + config: std::sync::RwLock::new(RemoteTimelineClientConfig::from(location_conf)), cancel: CancellationToken::new(), } } @@ -429,6 +467,43 @@ impl RemoteTimelineClient { Ok(()) } + /// Notify this client of a change to its parent tenant's config, as this may cause us to + /// take action (unblocking deletions when transitioning from AttachedMulti to AttachedSingle) + pub(super) fn update_config(&self, location_conf: &AttachedLocationConfig) { + let new_conf = RemoteTimelineClientConfig::from(location_conf); + let unblocked = !new_conf.block_deletions; + + // Update config before draining deletions, so that we don't race with more being + // inserted. This can result in deletions happening our of order, but that does not + // violate any invariants: deletions only need to be ordered relative to upload of the index + // that dereferences the deleted objects, and we are not changing that order. + *self.config.write().unwrap() = new_conf; + + if unblocked { + // If we may now delete layers, drain any that were blocked in our old + // configuration state + let mut queue_locked = self.upload_queue.lock().unwrap(); + + if let Ok(queue) = queue_locked.initialized_mut() { + let blocked_deletions = std::mem::take(&mut queue.blocked_deletions); + for d in blocked_deletions { + if let Err(e) = self.deletion_queue_client.push_layers_sync( + self.tenant_shard_id, + self.timeline_id, + self.generation, + d.layers, + ) { + // This could happen if the pageserver is shut down while a tenant + // is transitioning from a deletion-blocked state: we will leak some + // S3 objects in this case. + warn!("Failed to drain blocked deletions: {}", e); + break; + } + } + } + } + } + /// Returns `None` if nothing is yet uplodaded, `Some(disk_consistent_lsn)` otherwise. pub fn remote_consistent_lsn_projected(&self) -> Option { match &mut *self.upload_queue.lock().unwrap() { @@ -739,6 +814,18 @@ impl RemoteTimelineClient { Ok(need_wait) } + /// Launch an index-file upload operation in the background, setting `import_pgdata` field. + pub(crate) fn schedule_index_upload_for_import_pgdata_state_update( + self: &Arc, + state: Option, + ) -> anyhow::Result<()> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + upload_queue.dirty.import_pgdata = state; + self.schedule_index_upload(upload_queue)?; + Ok(()) + } + /// /// Launch an index-file upload operation in the background, if necessary. /// @@ -1016,7 +1103,7 @@ impl RemoteTimelineClient { "scheduled layer file upload {layer}", ); - let op = UploadOp::UploadLayer(layer, metadata); + let op = UploadOp::UploadLayer(layer, metadata, None); self.metric_begin(&op); upload_queue.queued_operations.push_back(op); } @@ -1731,7 +1818,7 @@ impl RemoteTimelineClient { // have finished. upload_queue.inprogress_tasks.is_empty() } - UploadOp::Delete(_) => { + UploadOp::Delete(..) => { // Wait for preceding uploads to finish. Concurrent deletions are OK, though. upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len() } @@ -1759,19 +1846,32 @@ impl RemoteTimelineClient { } // We can launch this task. Remove it from the queue first. - let next_op = upload_queue.queued_operations.pop_front().unwrap(); + let mut next_op = upload_queue.queued_operations.pop_front().unwrap(); debug!("starting op: {}", next_op); - // Update the counters - match next_op { - UploadOp::UploadLayer(_, _) => { + // Update the counters and prepare + match &mut next_op { + UploadOp::UploadLayer(layer, meta, mode) => { + if upload_queue + .recently_deleted + .remove(&(layer.layer_desc().layer_name().clone(), meta.generation)) + { + *mode = Some(OpType::FlushDeletion); + } else { + *mode = Some(OpType::MayReorder) + } upload_queue.num_inprogress_layer_uploads += 1; } UploadOp::UploadMetadata { .. } => { upload_queue.num_inprogress_metadata_uploads += 1; } - UploadOp::Delete(_) => { + UploadOp::Delete(Delete { layers }) => { + for (name, meta) in layers { + upload_queue + .recently_deleted + .insert((name.clone(), meta.generation)); + } upload_queue.num_inprogress_deletions += 1; } UploadOp::Barrier(sender) => { @@ -1847,7 +1947,66 @@ impl RemoteTimelineClient { } let upload_result: anyhow::Result<()> = match &task.op { - UploadOp::UploadLayer(ref layer, ref layer_metadata) => { + UploadOp::UploadLayer(ref layer, ref layer_metadata, mode) => { + if let Some(OpType::FlushDeletion) = mode { + if self.config.read().unwrap().block_deletions { + // Of course, this is not efficient... but usually the queue should be empty. + let mut queue_locked = self.upload_queue.lock().unwrap(); + let mut detected = false; + if let Ok(queue) = queue_locked.initialized_mut() { + for list in queue.blocked_deletions.iter_mut() { + list.layers.retain(|(name, meta)| { + if name == &layer.layer_desc().layer_name() + && meta.generation == layer_metadata.generation + { + detected = true; + // remove the layer from deletion queue + false + } else { + // keep the layer + true + } + }); + } + } + if detected { + info!( + "cancelled blocked deletion of layer {} at gen {:?}", + layer.layer_desc().layer_name(), + layer_metadata.generation + ); + } + } else { + // TODO: we did not guarantee that upload task starts after deletion task, so there could be possibly race conditions + // that we still get the layer deleted. But this only happens if someone creates a layer immediately after it's deleted, + // which is not possible in the current system. + info!( + "waiting for deletion queue flush to complete before uploading layer {} at gen {:?}", + layer.layer_desc().layer_name(), + layer_metadata.generation + ); + { + // We are going to flush, we can clean up the recently deleted list. + let mut queue_locked = self.upload_queue.lock().unwrap(); + if let Ok(queue) = queue_locked.initialized_mut() { + queue.recently_deleted.clear(); + } + } + if let Err(e) = self.deletion_queue_client.flush_execute().await { + warn!( + "failed to flush the deletion queue before uploading layer {} at gen {:?}, still proceeding to upload: {e:#} ", + layer.layer_desc().layer_name(), + layer_metadata.generation + ); + } else { + info!( + "done flushing deletion queue before uploading layer {} at gen {:?}", + layer.layer_desc().layer_name(), + layer_metadata.generation + ); + } + } + } let local_path = layer.local_path(); // We should only be uploading layers created by this `Tenant`'s lifetime, so @@ -1912,16 +2071,24 @@ impl RemoteTimelineClient { res } UploadOp::Delete(delete) => { - pausable_failpoint!("before-delete-layer-pausable"); - self.deletion_queue_client - .push_layers( - self.tenant_shard_id, - self.timeline_id, - self.generation, - delete.layers.clone(), - ) - .await - .map_err(|e| anyhow::anyhow!(e)) + if self.config.read().unwrap().block_deletions { + let mut queue_locked = self.upload_queue.lock().unwrap(); + if let Ok(queue) = queue_locked.initialized_mut() { + queue.blocked_deletions.push(delete.clone()); + } + Ok(()) + } else { + pausable_failpoint!("before-delete-layer-pausable"); + self.deletion_queue_client + .push_layers( + self.tenant_shard_id, + self.timeline_id, + self.generation, + delete.layers.clone(), + ) + .await + .map_err(|e| anyhow::anyhow!(e)) + } } unexpected @ UploadOp::Barrier(_) | unexpected @ UploadOp::Shutdown => { // unreachable. Barrier operations are handled synchronously in @@ -2003,7 +2170,7 @@ impl RemoteTimelineClient { upload_queue.inprogress_tasks.remove(&task.task_id); let lsn_update = match task.op { - UploadOp::UploadLayer(_, _) => { + UploadOp::UploadLayer(_, _, _) => { upload_queue.num_inprogress_layer_uploads -= 1; None } @@ -2028,8 +2195,16 @@ impl RemoteTimelineClient { // Legacy mode: skip validating generation upload_queue.visible_remote_consistent_lsn.store(lsn); None - } else { + } else if self + .config + .read() + .unwrap() + .process_remote_consistent_lsn_updates + { Some((lsn, upload_queue.visible_remote_consistent_lsn.clone())) + } else { + // Our config disables remote_consistent_lsn updates: drop it. + None } } UploadOp::Delete(_) => { @@ -2072,7 +2247,7 @@ impl RemoteTimelineClient { )> { use RemoteTimelineClientMetricsCallTrackSize::DontTrackSize; let res = match op { - UploadOp::UploadLayer(_, m) => ( + UploadOp::UploadLayer(_, m, _) => ( RemoteOpFileKind::Layer, RemoteOpKind::Upload, RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size), @@ -2166,8 +2341,10 @@ impl RemoteTimelineClient { queued_operations: VecDeque::default(), #[cfg(feature = "testing")] dangling_files: HashMap::default(), + blocked_deletions: Vec::new(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), + recently_deleted: HashSet::new(), }; let upload_queue = std::mem::replace( @@ -2231,6 +2408,28 @@ impl RemoteTimelineClient { UploadQueue::Initialized(x) => x.no_pending_work(), } } + + /// 'foreign' in the sense that it does not belong to this tenant shard. This method + /// is used during GC for other shards to get the index of shard zero. + pub(crate) async fn download_foreign_index( + &self, + shard_number: ShardNumber, + cancel: &CancellationToken, + ) -> Result<(IndexPart, Generation, std::time::SystemTime), DownloadError> { + let foreign_shard_id = TenantShardId { + shard_number, + shard_count: self.tenant_shard_id.shard_count, + tenant_id: self.tenant_shard_id.tenant_id, + }; + download_index_part( + &self.storage_impl, + &foreign_shard_id, + &self.timeline_id, + Generation::MAX, + cancel, + ) + .await + } } pub(crate) struct UploadQueueAccessor<'a> { @@ -2379,6 +2578,7 @@ mod tests { use crate::{ context::RequestContext, tenant::{ + config::AttachmentMode, harness::{TenantHarness, TIMELINE_ID}, storage_layer::layer::local_layer_path, Tenant, Timeline, @@ -2464,6 +2664,10 @@ mod tests { /// Construct a RemoteTimelineClient in an arbitrary generation fn build_client(&self, generation: Generation) -> Arc { + let location_conf = AttachedLocationConfig { + generation, + attach_mode: AttachmentMode::Single, + }; Arc::new(RemoteTimelineClient { conf: self.harness.conf, runtime: tokio::runtime::Handle::current(), @@ -2477,6 +2681,7 @@ mod tests { &self.harness.tenant_shard_id, &TIMELINE_ID, )), + config: std::sync::RwLock::new(RemoteTimelineClientConfig::from(&location_conf)), cancel: CancellationToken::new(), }) } diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index b5c3a52583..41e32a1aa3 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -709,7 +709,7 @@ where .and_then(|x| x) } -async fn download_retry_forever( +pub(crate) async fn download_retry_forever( op: O, description: &str, cancel: &CancellationToken, diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index d8a881a2c4..506990fb2f 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -12,6 +12,7 @@ use utils::id::TimelineId; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::storage_layer::LayerName; +use crate::tenant::timeline::import_pgdata; use crate::tenant::Generation; use pageserver_api::shard::ShardIndex; @@ -37,6 +38,13 @@ pub struct IndexPart { #[serde(skip_serializing_if = "Option::is_none")] pub archived_at: Option, + /// This field supports import-from-pgdata ("fast imports" platform feature). + /// We don't currently use fast imports, so, this field is None for all production timelines. + /// See for more information. + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub import_pgdata: Option, + /// Per layer file name metadata, which can be present for a present or missing layer file. /// /// Older versions of `IndexPart` will not have this property or have only a part of metadata @@ -90,10 +98,11 @@ impl IndexPart { /// - 7: metadata_bytes is no longer written, but still read /// - 8: added `archived_at` /// - 9: +gc_blocking - const LATEST_VERSION: usize = 9; + /// - 10: +import_pgdata + const LATEST_VERSION: usize = 10; // Versions we may see when reading from a bucket. - pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; pub const FILE_NAME: &'static str = "index_part.json"; @@ -108,6 +117,7 @@ impl IndexPart { lineage: Default::default(), gc_blocking: None, last_aux_file_policy: None, + import_pgdata: None, } } @@ -381,6 +391,7 @@ mod tests { lineage: Lineage::default(), gc_blocking: None, last_aux_file_policy: None, + import_pgdata: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -425,6 +436,7 @@ mod tests { lineage: Lineage::default(), gc_blocking: None, last_aux_file_policy: None, + import_pgdata: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -470,6 +482,7 @@ mod tests { lineage: Lineage::default(), gc_blocking: None, last_aux_file_policy: None, + import_pgdata: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -518,6 +531,7 @@ mod tests { lineage: Lineage::default(), gc_blocking: None, last_aux_file_policy: None, + import_pgdata: None, }; let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap(); @@ -561,6 +575,7 @@ mod tests { lineage: Lineage::default(), gc_blocking: None, last_aux_file_policy: None, + import_pgdata: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -607,6 +622,7 @@ mod tests { }, gc_blocking: None, last_aux_file_policy: None, + import_pgdata: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -658,6 +674,7 @@ mod tests { }, gc_blocking: None, last_aux_file_policy: Some(AuxFilePolicy::V2), + import_pgdata: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -714,6 +731,7 @@ mod tests { lineage: Default::default(), gc_blocking: None, last_aux_file_policy: Default::default(), + import_pgdata: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -771,6 +789,7 @@ mod tests { lineage: Default::default(), gc_blocking: None, last_aux_file_policy: Default::default(), + import_pgdata: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -833,6 +852,83 @@ mod tests { }), last_aux_file_policy: Default::default(), archived_at: None, + import_pgdata: None, + }; + + let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + #[test] + fn v10_importpgdata_is_parsed() { + let example = r#"{ + "version": 10, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata": { + "disk_consistent_lsn": "0/16960E8", + "prev_record_lsn": "0/1696070", + "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/1696070", + "initdb_lsn": "0/1696070", + "pg_version": 14 + }, + "gc_blocking": { + "started_at": "2024-07-19T09:00:00.123", + "reasons": ["DetachAncestor"] + }, + "import_pgdata": { + "V1": { + "Done": { + "idempotency_key": "specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5", + "started_at": "2024-11-13T09:23:42.123", + "finished_at": "2024-11-13T09:42:23.123" + } + } + } + }"#; + + let expected = IndexPart { + version: 10, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { + file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::new( + Lsn::from_str("0/16960E8").unwrap(), + Some(Lsn::from_str("0/1696070").unwrap()), + Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), + Lsn::INVALID, + Lsn::from_str("0/1696070").unwrap(), + Lsn::from_str("0/1696070").unwrap(), + 14, + ).with_recalculated_checksum().unwrap(), + deleted_at: None, + lineage: Default::default(), + gc_blocking: Some(GcBlocking { + started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"), + reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]), + }), + last_aux_file_policy: Default::default(), + archived_at: None, + import_pgdata: Some(import_pgdata::index_part_format::Root::V1(import_pgdata::index_part_format::V1::Done(import_pgdata::index_part_format::Done{ + started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"), + finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"), + idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()), + }))) }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 1331c07d05..3df89a928c 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -111,15 +111,6 @@ pub(crate) struct SecondaryTenant { pub(super) heatmap_total_size_metric: UIntGauge, } -impl Drop for SecondaryTenant { - fn drop(&mut self) { - let tenant_id = self.tenant_shard_id.tenant_id.to_string(); - let shard_id = format!("{}", self.tenant_shard_id.shard_slug()); - let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]); - let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]); - } -} - impl SecondaryTenant { pub(crate) fn new( tenant_shard_id: TenantShardId, @@ -167,6 +158,13 @@ impl SecondaryTenant { // Wait for any secondary downloader work to complete self.gate.close().await; + + self.validate_metrics(); + + let tenant_id = self.tenant_shard_id.tenant_id.to_string(); + let shard_id = format!("{}", self.tenant_shard_id.shard_slug()); + let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]); + let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]); } pub(crate) fn set_config(&self, config: &SecondaryLocationConfig) { @@ -254,6 +252,20 @@ impl SecondaryTenant { .await .expect("secondary eviction should not have panicked"); } + + /// Exhaustive check that incrementally updated metrics match the actual state. + #[cfg(feature = "testing")] + fn validate_metrics(&self) { + let detail = self.detail.lock().unwrap(); + let resident_size = detail.total_resident_size(); + + assert_eq!(resident_size, self.resident_size_metric.get()); + } + + #[cfg(not(feature = "testing"))] + fn validate_metrics(&self) { + // No-op in non-testing builds + } } /// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads, diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 82c5702686..7443261a9c 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -242,6 +242,19 @@ impl SecondaryDetail { } } + #[cfg(feature = "testing")] + pub(crate) fn total_resident_size(&self) -> u64 { + self.timelines + .values() + .map(|tl| { + tl.on_disk_layers + .values() + .map(|v| v.metadata.file_size) + .sum::() + }) + .sum::() + } + pub(super) fn evict_layer( &mut self, name: LayerName, @@ -763,24 +776,7 @@ impl<'a> TenantDownloader<'a> { } // Metrics consistency check in testing builds - if cfg!(feature = "testing") { - let detail = self.secondary_state.detail.lock().unwrap(); - let resident_size = detail - .timelines - .values() - .map(|tl| { - tl.on_disk_layers - .values() - .map(|v| v.metadata.file_size) - .sum::() - }) - .sum::(); - assert_eq!( - resident_size, - self.secondary_state.resident_size_metric.get() - ); - } - + self.secondary_state.validate_metrics(); // Only update last_etag after a full successful download: this way will not skip // the next download, even if the heatmap's actual etag is unchanged. self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0eb3de21e9..f6a06e73a7 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4,6 +4,7 @@ pub mod delete; pub(crate) mod detach_ancestor; mod eviction_task; pub(crate) mod handle; +pub(crate) mod import_pgdata; mod init; pub mod layer_manager; pub(crate) mod logical_size; @@ -38,6 +39,7 @@ use pageserver_api::{ shard::{ShardIdentity, ShardNumber, TenantShardId}, }; use rand::Rng; +use remote_storage::DownloadError; use serde_with::serde_as; use storage_broker::BrokerClientChannel; use tokio::{ @@ -272,7 +274,7 @@ pub struct Timeline { /// Remote storage client. /// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details. - pub remote_client: Arc, + pub(crate) remote_client: Arc, // What page versions do we hold in the repository? If we get a // request > last_record_lsn, we need to wait until we receive all @@ -2084,6 +2086,11 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length_for_ts) } + pub(crate) fn is_gc_blocked_by_lsn_lease_deadline(&self) -> bool { + let tenant_conf = self.tenant_conf.load(); + tenant_conf.is_gc_blocked_by_lsn_lease_deadline() + } + pub(crate) fn get_lazy_slru_download(&self) -> bool { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2171,14 +2178,14 @@ impl Timeline { ) } - pub(super) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) { + pub(super) fn tenant_conf_updated(&self, new_conf: &AttachedTenantConf) { // NB: Most tenant conf options are read by background loops, so, // changes will automatically be picked up. // The threshold is embedded in the metric. So, we need to update it. { let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold( - new_conf, + &new_conf.tenant_conf, &self.conf.default_tenant_conf, ); @@ -2186,6 +2193,9 @@ impl Timeline { let shard_id_str = format!("{}", self.tenant_shard_id.shard_slug()); let timeline_id_str = self.timeline_id.to_string(); + + self.remote_client.update_config(&new_conf.location); + self.metrics .evictions_with_low_residence_duration .write() @@ -2460,6 +2470,7 @@ impl Timeline { *guard = Some(WalReceiver::start( Arc::clone(self), WalReceiverConf { + protocol: self.conf.wal_receiver_protocol, wal_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag, @@ -2643,6 +2654,7 @@ impl Timeline { // // NB: generation numbers naturally protect against this because they disambiguate // (1) and (4) + // TODO: this is basically a no-op now, should we remove it? self.remote_client.schedule_barrier()?; // Tenant::create_timeline will wait for these uploads to happen before returning, or // on retry. @@ -2698,20 +2710,23 @@ impl Timeline { { Some(cancel) => cancel.cancel(), None => { - let state = self.current_state(); - if matches!( - state, - TimelineState::Broken { .. } | TimelineState::Stopping - ) { - - // Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken). - // Don't make noise. - } else { - warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work"); - debug_assert!(false); + match self.current_state() { + TimelineState::Broken { .. } | TimelineState::Stopping => { + // Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken). + // Don't make noise. + } + TimelineState::Loading => { + // Import does not return an activated timeline. + info!("discarding priority boost for logical size calculation because timeline is not yet active"); + } + TimelineState::Active => { + // activation should be setting the once cell + warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work"); + debug_assert!(false); + } } } - }; + } } } @@ -4821,6 +4836,86 @@ impl Timeline { Ok(()) } + async fn find_gc_time_cutoff( + &self, + pitr: Duration, + cancel: &CancellationToken, + ctx: &RequestContext, + ) -> Result, PageReconstructError> { + debug_assert_current_span_has_tenant_and_timeline_id(); + if self.shard_identity.is_shard_zero() { + // Shard Zero has SLRU data and can calculate the PITR time -> LSN mapping itself + let now = SystemTime::now(); + let time_range = if pitr == Duration::ZERO { + humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid") + } else { + pitr + }; + + // If PITR is so large or `now` is so small that this underflows, we will retain no history (highly unexpected case) + let time_cutoff = now.checked_sub(time_range).unwrap_or(now); + let timestamp = to_pg_timestamp(time_cutoff); + + let time_cutoff = match self.find_lsn_for_timestamp(timestamp, cancel, ctx).await? { + LsnForTimestamp::Present(lsn) => Some(lsn), + LsnForTimestamp::Future(lsn) => { + // The timestamp is in the future. That sounds impossible, + // but what it really means is that there hasn't been + // any commits since the cutoff timestamp. + // + // In this case we should use the LSN of the most recent commit, + // which is implicitly the last LSN in the log. + debug!("future({})", lsn); + Some(self.get_last_record_lsn()) + } + LsnForTimestamp::Past(lsn) => { + debug!("past({})", lsn); + None + } + LsnForTimestamp::NoData(lsn) => { + debug!("nodata({})", lsn); + None + } + }; + Ok(time_cutoff) + } else { + // Shards other than shard zero cannot do timestamp->lsn lookups, and must instead learn their GC cutoff + // from shard zero's index. The index doesn't explicitly tell us the time cutoff, but we may assume that + // the point up to which shard zero's last_gc_cutoff has advanced will either be the time cutoff, or a + // space cutoff that we would also have respected ourselves. + match self + .remote_client + .download_foreign_index(ShardNumber(0), cancel) + .await + { + Ok((index_part, index_generation, _index_mtime)) => { + tracing::info!("GC loaded shard zero metadata (gen {index_generation:?}): latest_gc_cutoff_lsn: {}", + index_part.metadata.latest_gc_cutoff_lsn()); + Ok(Some(index_part.metadata.latest_gc_cutoff_lsn())) + } + Err(DownloadError::NotFound) => { + // This is unexpected, because during timeline creations shard zero persists to remote + // storage before other shards are called, and during timeline deletion non-zeroth shards are + // deleted before the zeroth one. However, it should be harmless: if we somehow end up in this + // state, then shard zero should _eventually_ write an index when it GCs. + tracing::warn!("GC couldn't find shard zero's index for timeline"); + Ok(None) + } + Err(e) => { + // TODO: this function should return a different error type than page reconstruct error + Err(PageReconstructError::Other(anyhow::anyhow!(e))) + } + } + + // TODO: after reading shard zero's GC cutoff, we should validate its generation with the storage + // controller. Otherwise, it is possible that we see the GC cutoff go backwards while shard zero + // is going through a migration if we read the old location's index and it has GC'd ahead of the + // new location. This is legal in principle, but problematic in practice because it might result + // in a timeline creation succeeding on shard zero ('s new location) but then failing on other shards + // because they have GC'd past the branch point. + } + } + /// Find the Lsns above which layer files need to be retained on /// garbage collection. /// @@ -4863,40 +4958,7 @@ impl Timeline { // - if PITR interval is set, then this is our cutoff. // - if PITR interval is not set, then we do a lookup // based on DEFAULT_PITR_INTERVAL, so that size-based retention does not result in keeping history around permanently on idle databases. - let time_cutoff = { - let now = SystemTime::now(); - let time_range = if pitr == Duration::ZERO { - humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid") - } else { - pitr - }; - - // If PITR is so large or `now` is so small that this underflows, we will retain no history (highly unexpected case) - let time_cutoff = now.checked_sub(time_range).unwrap_or(now); - let timestamp = to_pg_timestamp(time_cutoff); - - match self.find_lsn_for_timestamp(timestamp, cancel, ctx).await? { - LsnForTimestamp::Present(lsn) => Some(lsn), - LsnForTimestamp::Future(lsn) => { - // The timestamp is in the future. That sounds impossible, - // but what it really means is that there hasn't been - // any commits since the cutoff timestamp. - // - // In this case we should use the LSN of the most recent commit, - // which is implicitly the last LSN in the log. - debug!("future({})", lsn); - Some(self.get_last_record_lsn()) - } - LsnForTimestamp::Past(lsn) => { - debug!("past({})", lsn); - None - } - LsnForTimestamp::NoData(lsn) => { - debug!("nodata({})", lsn); - None - } - } - }; + let time_cutoff = self.find_gc_time_cutoff(pitr, cancel, ctx).await?; Ok(match (pitr, time_cutoff) { (Duration::ZERO, Some(time_cutoff)) => { @@ -5835,7 +5897,7 @@ impl<'a> TimelineWriter<'a> { batch: SerializedValueBatch, ctx: &RequestContext, ) -> anyhow::Result<()> { - if batch.is_empty() { + if !batch.has_data() { return Ok(()); } diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 13a8dfa51a..67fc710c44 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -283,7 +283,7 @@ impl DeleteTimelineFlow { /// Shortcut to create Timeline in stopping state and spawn deletion task. #[instrument(skip_all, fields(%timeline_id))] - pub async fn resume_deletion( + pub(crate) async fn resume_deletion( tenant: Arc, timeline_id: TimelineId, local_metadata: &TimelineMetadata, diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs new file mode 100644 index 0000000000..de56468580 --- /dev/null +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -0,0 +1,218 @@ +use std::sync::Arc; + +use anyhow::{bail, Context}; +use remote_storage::RemotePath; +use tokio_util::sync::CancellationToken; +use tracing::{info, info_span, Instrument}; +use utils::lsn::Lsn; + +use crate::{context::RequestContext, tenant::metadata::TimelineMetadata}; + +use super::Timeline; + +mod flow; +mod importbucket_client; +mod importbucket_format; +pub(crate) mod index_part_format; +pub(crate) mod upcall_api; + +pub async fn doit( + timeline: &Arc, + index_part: index_part_format::Root, + ctx: &RequestContext, + cancel: CancellationToken, +) -> anyhow::Result<()> { + let index_part_format::Root::V1(v1) = index_part; + let index_part_format::InProgress { + location, + idempotency_key, + started_at, + } = match v1 { + index_part_format::V1::Done(_) => return Ok(()), + index_part_format::V1::InProgress(in_progress) => in_progress, + }; + + let storage = importbucket_client::new(timeline.conf, &location, cancel.clone()).await?; + + info!("get spec early so we know we'll be able to upcall when done"); + let Some(spec) = storage.get_spec().await? else { + bail!("spec not found") + }; + + let upcall_client = + upcall_api::Client::new(timeline.conf, cancel.clone()).context("create upcall client")?; + + // + // send an early progress update to clean up k8s job early and generate potentially useful logs + // + info!("send early progress update"); + upcall_client + .send_progress_until_success(&spec) + .instrument(info_span!("early_progress_update")) + .await?; + + let status_prefix = RemotePath::from_string("status").unwrap(); + + // + // See if shard is done. + // TODO: incorporate generations into status key for split brain safety. Figure out together with checkpointing. + // + let shard_status_key = + status_prefix.join(format!("shard-{}", timeline.tenant_shard_id.shard_slug())); + let shard_status: Option = + storage.get_json(&shard_status_key).await?; + info!(?shard_status, "peeking shard status"); + if shard_status.map(|st| st.done).unwrap_or(false) { + info!("shard status indicates that the shard is done, skipping import"); + } else { + // TODO: checkpoint the progress into the IndexPart instead of restarting + // from the beginning. + + // + // Wipe the slate clean - the flow does not allow resuming. + // We can implement resuming in the future by checkpointing the progress into the IndexPart. + // + info!("wipe the slate clean"); + { + // TODO: do we need to hold GC lock for this? + let mut guard = timeline.layers.write().await; + assert!( + guard.layer_map()?.open_layer.is_none(), + "while importing, there should be no in-memory layer" // this just seems like a good place to assert it + ); + let all_layers_keys = guard.all_persistent_layers(); + let all_layers: Vec<_> = all_layers_keys + .iter() + .map(|key| guard.get_from_key(key)) + .collect(); + let open = guard.open_mut().context("open_mut")?; + + timeline.remote_client.schedule_gc_update(&all_layers)?; + open.finish_gc_timeline(&all_layers); + } + + // + // Wait for pgdata to finish uploading + // + info!("wait for pgdata to reach status 'done'"); + let pgdata_status_key = status_prefix.join("pgdata"); + loop { + let res = async { + let pgdata_status: Option = storage + .get_json(&pgdata_status_key) + .await + .context("get pgdata status")?; + info!(?pgdata_status, "peeking pgdata status"); + if pgdata_status.map(|st| st.done).unwrap_or(false) { + Ok(()) + } else { + Err(anyhow::anyhow!("pgdata not done yet")) + } + } + .await; + match res { + Ok(_) => break, + Err(err) => { + info!(?err, "indefintely waiting for pgdata to finish"); + if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled()) + .await + .is_ok() + { + bail!("cancelled while waiting for pgdata"); + } + } + } + } + + // + // Do the import + // + info!("do the import"); + let control_file = storage.get_control_file().await?; + let base_lsn = control_file.base_lsn(); + + info!("update TimelineMetadata based on LSNs from control file"); + { + let pg_version = control_file.pg_version(); + let _ctx: &RequestContext = ctx; + async move { + // FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the + // checkpoint record, and prev_record_lsn should point to its beginning. + // We should read the real end of the record from the WAL, but here we + // just fake it. + let disk_consistent_lsn = Lsn(base_lsn.0 + 8); + let prev_record_lsn = base_lsn; + let metadata = TimelineMetadata::new( + disk_consistent_lsn, + Some(prev_record_lsn), + None, // no ancestor + Lsn(0), // no ancestor lsn + base_lsn, // latest_gc_cutoff_lsn + base_lsn, // initdb_lsn + pg_version, + ); + + let _start_lsn = disk_consistent_lsn + 1; + + timeline + .remote_client + .schedule_index_upload_for_full_metadata_update(&metadata)?; + + timeline.remote_client.wait_completion().await?; + + anyhow::Ok(()) + } + } + .await?; + + flow::run( + timeline.clone(), + base_lsn, + control_file, + storage.clone(), + ctx, + ) + .await?; + + // + // Communicate that shard is done. + // + storage + .put_json( + &shard_status_key, + &importbucket_format::ShardStatus { done: true }, + ) + .await + .context("put shard status")?; + } + + // + // Ensure at-least-once deliver of the upcall to cplane + // before we mark the task as done and never come here again. + // + info!("send final progress update"); + upcall_client + .send_progress_until_success(&spec) + .instrument(info_span!("final_progress_update")) + .await?; + + // + // Mark as done in index_part. + // This makes subsequent timeline loads enter the normal load code path + // instead of spawning the import task and calling this here function. + // + info!("mark import as complete in index part"); + timeline + .remote_client + .schedule_index_upload_for_import_pgdata_state_update(Some(index_part_format::Root::V1( + index_part_format::V1::Done(index_part_format::Done { + idempotency_key, + started_at, + finished_at: chrono::Utc::now().naive_utc(), + }), + )))?; + + timeline.remote_client.wait_completion().await?; + + Ok(()) +} diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs new file mode 100644 index 0000000000..cbd4168c06 --- /dev/null +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -0,0 +1,798 @@ +//! Import a PGDATA directory into an empty root timeline. +//! +//! This module is adapted hackathon code by Heikki and Stas. +//! Other code in the parent module was written by Christian as part of a customer PoC. +//! +//! The hackathon code was producing image layer files as a free-standing program. +//! +//! It has been modified to +//! - run inside a running Pageserver, within the proper lifecycles of Timeline -> Tenant(Shard) +//! - => sharding-awareness: produce image layers with only the data relevant for this shard +//! - => S3 as the source for the PGDATA instead of local filesystem +//! +//! TODOs before productionization: +//! - ChunkProcessingJob size / ImportJob::total_size does not account for sharding. +//! => produced image layers likely too small. +//! - ChunkProcessingJob should cut up an ImportJob to hit exactly target image layer size. +//! - asserts / unwraps need to be replaced with errors +//! - don't trust remote objects will be small (=prevent OOMs in those cases) +//! - limit all in-memory buffers in size, or download to disk and read from there +//! - limit task concurrency +//! - generally play nice with other tenants in the system +//! - importbucket is different bucket than main pageserver storage, so, should be fine wrt S3 rate limits +//! - but concerns like network bandwidth, local disk write bandwidth, local disk capacity, etc +//! - integrate with layer eviction system +//! - audit for Tenant::cancel nor Timeline::cancel responsivity +//! - audit for Tenant/Timeline gate holding (we spawn tokio tasks during this flow!) +//! +//! An incomplete set of TODOs from the Hackathon: +//! - version-specific CheckPointData (=> pgv abstraction, already exists for regular walingest) + +use std::sync::Arc; + +use anyhow::{bail, ensure}; +use bytes::Bytes; + +use itertools::Itertools; +use pageserver_api::{ + key::{rel_block_to_key, rel_dir_to_key, rel_size_to_key, relmap_file_key, DBDIR_KEY}, + reltag::RelTag, + shard::ShardIdentity, +}; +use postgres_ffi::{pg_constants, relfile_utils::parse_relfilename, BLCKSZ}; +use tokio::task::JoinSet; +use tracing::{debug, info_span, instrument, Instrument}; + +use crate::{ + assert_u64_eq_usize::UsizeIsU64, + pgdatadir_mapping::{SlruSegmentDirectory, TwoPhaseDirectory}, +}; +use crate::{ + context::{DownloadBehavior, RequestContext}, + pgdatadir_mapping::{DbDirectory, RelDirectory}, + task_mgr::TaskKind, + tenant::storage_layer::{ImageLayerWriter, Layer}, +}; + +use pageserver_api::key::Key; +use pageserver_api::key::{ + slru_block_to_key, slru_dir_to_key, slru_segment_size_to_key, CHECKPOINT_KEY, CONTROLFILE_KEY, + TWOPHASEDIR_KEY, +}; +use pageserver_api::keyspace::singleton_range; +use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range}; +use pageserver_api::reltag::SlruKind; +use utils::bin_ser::BeSer; +use utils::lsn::Lsn; + +use std::collections::HashSet; +use std::ops::Range; + +use super::{ + importbucket_client::{ControlFile, RemoteStorageWrapper}, + Timeline, +}; + +use remote_storage::RemotePath; + +pub async fn run( + timeline: Arc, + pgdata_lsn: Lsn, + control_file: ControlFile, + storage: RemoteStorageWrapper, + ctx: &RequestContext, +) -> anyhow::Result<()> { + Flow { + timeline, + pgdata_lsn, + control_file, + tasks: Vec::new(), + storage, + } + .run(ctx) + .await +} + +struct Flow { + timeline: Arc, + pgdata_lsn: Lsn, + control_file: ControlFile, + tasks: Vec, + storage: RemoteStorageWrapper, +} + +impl Flow { + /// Perform the ingestion into [`Self::timeline`]. + /// Assumes the timeline is empty (= no layers). + pub async fn run(mut self, ctx: &RequestContext) -> anyhow::Result<()> { + let pgdata_lsn = Lsn(self.control_file.control_file_data().checkPoint).align(); + + self.pgdata_lsn = pgdata_lsn; + + let datadir = PgDataDir::new(&self.storage).await?; + + // Import dbdir (00:00:00 keyspace) + // This is just constructed here, but will be written to the image layer in the first call to import_db() + let dbdir_buf = Bytes::from(DbDirectory::ser(&DbDirectory { + dbdirs: datadir + .dbs + .iter() + .map(|db| ((db.spcnode, db.dboid), true)) + .collect(), + })?); + self.tasks + .push(ImportSingleKeyTask::new(DBDIR_KEY, dbdir_buf).into()); + + // Import databases (00:spcnode:dbnode keyspace for each db) + for db in datadir.dbs { + self.import_db(&db).await?; + } + + // Import SLRUs + + // pg_xact (01:00 keyspace) + self.import_slru(SlruKind::Clog, &self.storage.pgdata().join("pg_xact")) + .await?; + // pg_multixact/members (01:01 keyspace) + self.import_slru( + SlruKind::MultiXactMembers, + &self.storage.pgdata().join("pg_multixact/members"), + ) + .await?; + // pg_multixact/offsets (01:02 keyspace) + self.import_slru( + SlruKind::MultiXactOffsets, + &self.storage.pgdata().join("pg_multixact/offsets"), + ) + .await?; + + // Import pg_twophase. + // TODO: as empty + let twophasedir_buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory { + xids: HashSet::new(), + })?; + self.tasks + .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( + TWOPHASEDIR_KEY, + Bytes::from(twophasedir_buf), + ))); + + // Controlfile, checkpoint + self.tasks + .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( + CONTROLFILE_KEY, + self.control_file.control_file_buf().clone(), + ))); + + let checkpoint_buf = self + .control_file + .control_file_data() + .checkPointCopy + .encode()?; + self.tasks + .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( + CHECKPOINT_KEY, + checkpoint_buf, + ))); + + // Assigns parts of key space to later parallel jobs + let mut last_end_key = Key::MIN; + let mut current_chunk = Vec::new(); + let mut current_chunk_size: usize = 0; + let mut parallel_jobs = Vec::new(); + for task in std::mem::take(&mut self.tasks).into_iter() { + if current_chunk_size + task.total_size() > 1024 * 1024 * 1024 { + let key_range = last_end_key..task.key_range().start; + parallel_jobs.push(ChunkProcessingJob::new( + key_range.clone(), + std::mem::take(&mut current_chunk), + &self, + )); + last_end_key = key_range.end; + current_chunk_size = 0; + } + current_chunk_size += task.total_size(); + current_chunk.push(task); + } + parallel_jobs.push(ChunkProcessingJob::new( + last_end_key..Key::MAX, + current_chunk, + &self, + )); + + // Start all jobs simultaneosly + let mut work = JoinSet::new(); + // TODO: semaphore? + for job in parallel_jobs { + let ctx: RequestContext = + ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Error); + work.spawn(async move { job.run(&ctx).await }.instrument(info_span!("parallel_job"))); + } + let mut results = Vec::new(); + while let Some(result) = work.join_next().await { + match result { + Ok(res) => { + results.push(res); + } + Err(_joinset_err) => { + results.push(Err(anyhow::anyhow!( + "parallel job panicked or cancelled, check pageserver logs" + ))); + } + } + } + + if results.iter().all(|r| r.is_ok()) { + Ok(()) + } else { + let mut msg = String::new(); + for result in results { + if let Err(err) = result { + msg.push_str(&format!("{err:?}\n\n")); + } + } + bail!("Some parallel jobs failed:\n\n{msg}"); + } + } + + #[instrument(level = tracing::Level::DEBUG, skip_all, fields(dboid=%db.dboid, tablespace=%db.spcnode, path=%db.path))] + async fn import_db(&mut self, db: &PgDataDirDb) -> anyhow::Result<()> { + debug!("start"); + scopeguard::defer! { + debug!("return"); + } + + // Import relmap (00:spcnode:dbnode:00:*:00) + let relmap_key = relmap_file_key(db.spcnode, db.dboid); + debug!("Constructing relmap entry, key {relmap_key}"); + let relmap_path = db.path.join("pg_filenode.map"); + let relmap_buf = self.storage.get(&relmap_path).await?; + self.tasks + .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( + relmap_key, relmap_buf, + ))); + + // Import reldir (00:spcnode:dbnode:00:*:01) + let reldir_key = rel_dir_to_key(db.spcnode, db.dboid); + debug!("Constructing reldirs entry, key {reldir_key}"); + let reldir_buf = RelDirectory::ser(&RelDirectory { + rels: db + .files + .iter() + .map(|f| (f.rel_tag.relnode, f.rel_tag.forknum)) + .collect(), + })?; + self.tasks + .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( + reldir_key, + Bytes::from(reldir_buf), + ))); + + // Import data (00:spcnode:dbnode:reloid:fork:blk) and set sizes for each last + // segment in a given relation (00:spcnode:dbnode:reloid:fork:ff) + for file in &db.files { + debug!(%file.path, %file.filesize, "importing file"); + let len = file.filesize; + ensure!(len % 8192 == 0); + let start_blk: u32 = file.segno * (1024 * 1024 * 1024 / 8192); + let start_key = rel_block_to_key(file.rel_tag, start_blk); + let end_key = rel_block_to_key(file.rel_tag, start_blk + (len / 8192) as u32); + self.tasks + .push(AnyImportTask::RelBlocks(ImportRelBlocksTask::new( + *self.timeline.get_shard_identity(), + start_key..end_key, + &file.path, + self.storage.clone(), + ))); + + // Set relsize for the last segment (00:spcnode:dbnode:reloid:fork:ff) + if let Some(nblocks) = file.nblocks { + let size_key = rel_size_to_key(file.rel_tag); + //debug!("Setting relation size (path={path}, rel_tag={rel_tag}, segno={segno}) to {nblocks}, key {size_key}"); + let buf = nblocks.to_le_bytes(); + self.tasks + .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( + size_key, + Bytes::from(buf.to_vec()), + ))); + } + } + + Ok(()) + } + + async fn import_slru(&mut self, kind: SlruKind, path: &RemotePath) -> anyhow::Result<()> { + let segments = self.storage.listfilesindir(path).await?; + let segments: Vec<(String, u32, usize)> = segments + .into_iter() + .filter_map(|(path, size)| { + let filename = path.object_name()?; + let segno = u32::from_str_radix(filename, 16).ok()?; + Some((filename.to_string(), segno, size)) + }) + .collect(); + + // Write SlruDir + let slrudir_key = slru_dir_to_key(kind); + let segnos: HashSet = segments + .iter() + .map(|(_path, segno, _size)| *segno) + .collect(); + let slrudir = SlruSegmentDirectory { segments: segnos }; + let slrudir_buf = SlruSegmentDirectory::ser(&slrudir)?; + self.tasks + .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( + slrudir_key, + Bytes::from(slrudir_buf), + ))); + + for (segpath, segno, size) in segments { + // SlruSegBlocks for each segment + let p = path.join(&segpath); + let file_size = size; + ensure!(file_size % 8192 == 0); + let nblocks = u32::try_from(file_size / 8192)?; + let start_key = slru_block_to_key(kind, segno, 0); + let end_key = slru_block_to_key(kind, segno, nblocks); + debug!(%p, segno=%segno, %size, %start_key, %end_key, "scheduling SLRU segment"); + self.tasks + .push(AnyImportTask::SlruBlocks(ImportSlruBlocksTask::new( + *self.timeline.get_shard_identity(), + start_key..end_key, + &p, + self.storage.clone(), + ))); + + // Followed by SlruSegSize + let segsize_key = slru_segment_size_to_key(kind, segno); + let segsize_buf = nblocks.to_le_bytes(); + self.tasks + .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( + segsize_key, + Bytes::copy_from_slice(&segsize_buf), + ))); + } + Ok(()) + } +} + +// +// dbdir iteration tools +// + +struct PgDataDir { + pub dbs: Vec, // spcnode, dboid, path +} + +struct PgDataDirDb { + pub spcnode: u32, + pub dboid: u32, + pub path: RemotePath, + pub files: Vec, +} + +struct PgDataDirDbFile { + pub path: RemotePath, + pub rel_tag: RelTag, + pub segno: u32, + pub filesize: usize, + // Cummulative size of the given fork, set only for the last segment of that fork + pub nblocks: Option, +} + +impl PgDataDir { + async fn new(storage: &RemoteStorageWrapper) -> anyhow::Result { + let datadir_path = storage.pgdata(); + // Import ordinary databases, DEFAULTTABLESPACE_OID is smaller than GLOBALTABLESPACE_OID, so import them first + // Traverse database in increasing oid order + + let basedir = &datadir_path.join("base"); + let db_oids: Vec<_> = storage + .listdir(basedir) + .await? + .into_iter() + .filter_map(|path| path.object_name().and_then(|name| name.parse::().ok())) + .sorted() + .collect(); + debug!(?db_oids, "found databases"); + let mut databases = Vec::new(); + for dboid in db_oids { + databases.push( + PgDataDirDb::new( + storage, + &basedir.join(dboid.to_string()), + pg_constants::DEFAULTTABLESPACE_OID, + dboid, + &datadir_path, + ) + .await?, + ); + } + + // special case for global catalogs + databases.push( + PgDataDirDb::new( + storage, + &datadir_path.join("global"), + postgres_ffi::pg_constants::GLOBALTABLESPACE_OID, + 0, + &datadir_path, + ) + .await?, + ); + + databases.sort_by_key(|db| (db.spcnode, db.dboid)); + + Ok(Self { dbs: databases }) + } +} + +impl PgDataDirDb { + #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%dboid, %db_path))] + async fn new( + storage: &RemoteStorageWrapper, + db_path: &RemotePath, + spcnode: u32, + dboid: u32, + datadir_path: &RemotePath, + ) -> anyhow::Result { + let mut files: Vec = storage + .listfilesindir(db_path) + .await? + .into_iter() + .filter_map(|(path, size)| { + debug!(%path, %size, "found file in dbdir"); + path.object_name().and_then(|name| { + // returns (relnode, forknum, segno) + parse_relfilename(name).ok().map(|x| (size, x)) + }) + }) + .sorted_by_key(|(_, relfilename)| *relfilename) + .map(|(filesize, (relnode, forknum, segno))| { + let rel_tag = RelTag { + spcnode, + dbnode: dboid, + relnode, + forknum, + }; + + let path = datadir_path.join(rel_tag.to_segfile_name(segno)); + assert!(filesize % BLCKSZ as usize == 0); // TODO: this should result in an error + let nblocks = filesize / BLCKSZ as usize; + + PgDataDirDbFile { + path, + filesize, + rel_tag, + segno, + nblocks: Some(nblocks), // first non-cummulative sizes + } + }) + .collect(); + + // Set cummulative sizes. Do all of that math here, so that later we could easier + // parallelize over segments and know with which segments we need to write relsize + // entry. + let mut cumulative_nblocks: usize = 0; + let mut prev_rel_tag: Option = None; + for i in 0..files.len() { + if prev_rel_tag == Some(files[i].rel_tag) { + cumulative_nblocks += files[i].nblocks.unwrap(); + } else { + cumulative_nblocks = files[i].nblocks.unwrap(); + } + + files[i].nblocks = if i == files.len() - 1 || files[i + 1].rel_tag != files[i].rel_tag { + Some(cumulative_nblocks) + } else { + None + }; + + prev_rel_tag = Some(files[i].rel_tag); + } + + Ok(PgDataDirDb { + files, + path: db_path.clone(), + spcnode, + dboid, + }) + } +} + +trait ImportTask { + fn key_range(&self) -> Range; + + fn total_size(&self) -> usize { + // TODO: revisit this + if is_contiguous_range(&self.key_range()) { + contiguous_range_len(&self.key_range()) as usize * 8192 + } else { + u32::MAX as usize + } + } + + async fn doit( + self, + layer_writer: &mut ImageLayerWriter, + ctx: &RequestContext, + ) -> anyhow::Result; +} + +struct ImportSingleKeyTask { + key: Key, + buf: Bytes, +} + +impl ImportSingleKeyTask { + fn new(key: Key, buf: Bytes) -> Self { + ImportSingleKeyTask { key, buf } + } +} + +impl ImportTask for ImportSingleKeyTask { + fn key_range(&self) -> Range { + singleton_range(self.key) + } + + async fn doit( + self, + layer_writer: &mut ImageLayerWriter, + ctx: &RequestContext, + ) -> anyhow::Result { + layer_writer.put_image(self.key, self.buf, ctx).await?; + Ok(1) + } +} + +struct ImportRelBlocksTask { + shard_identity: ShardIdentity, + key_range: Range, + path: RemotePath, + storage: RemoteStorageWrapper, +} + +impl ImportRelBlocksTask { + fn new( + shard_identity: ShardIdentity, + key_range: Range, + path: &RemotePath, + storage: RemoteStorageWrapper, + ) -> Self { + ImportRelBlocksTask { + shard_identity, + key_range, + path: path.clone(), + storage, + } + } +} + +impl ImportTask for ImportRelBlocksTask { + fn key_range(&self) -> Range { + self.key_range.clone() + } + + #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%self.path))] + async fn doit( + self, + layer_writer: &mut ImageLayerWriter, + ctx: &RequestContext, + ) -> anyhow::Result { + debug!("Importing relation file"); + + let (rel_tag, start_blk) = self.key_range.start.to_rel_block()?; + let (rel_tag_end, end_blk) = self.key_range.end.to_rel_block()?; + assert_eq!(rel_tag, rel_tag_end); + + let ranges = (start_blk..end_blk) + .enumerate() + .filter_map(|(i, blknum)| { + let key = rel_block_to_key(rel_tag, blknum); + if self.shard_identity.is_key_disposable(&key) { + return None; + } + let file_offset = i.checked_mul(8192).unwrap(); + Some(( + vec![key], + file_offset, + file_offset.checked_add(8192).unwrap(), + )) + }) + .coalesce(|(mut acc, acc_start, acc_end), (mut key, start, end)| { + assert_eq!(key.len(), 1); + assert!(!acc.is_empty()); + assert!(acc_end > acc_start); + if acc_end == start /* TODO additional max range check here, to limit memory consumption per task to X */ { + acc.push(key.pop().unwrap()); + Ok((acc, acc_start, end)) + } else { + Err(((acc, acc_start, acc_end), (key, start, end))) + } + }); + + let mut nimages = 0; + for (keys, range_start, range_end) in ranges { + let range_buf = self + .storage + .get_range(&self.path, range_start.into_u64(), range_end.into_u64()) + .await?; + let mut buf = Bytes::from(range_buf); + // TODO: batched writes + for key in keys { + let image = buf.split_to(8192); + layer_writer.put_image(key, image, ctx).await?; + nimages += 1; + } + } + + Ok(nimages) + } +} + +struct ImportSlruBlocksTask { + shard_identity: ShardIdentity, + key_range: Range, + path: RemotePath, + storage: RemoteStorageWrapper, +} + +impl ImportSlruBlocksTask { + fn new( + shard_identity: ShardIdentity, + key_range: Range, + path: &RemotePath, + storage: RemoteStorageWrapper, + ) -> Self { + ImportSlruBlocksTask { + shard_identity, + key_range, + path: path.clone(), + storage, + } + } +} + +impl ImportTask for ImportSlruBlocksTask { + fn key_range(&self) -> Range { + self.key_range.clone() + } + + async fn doit( + self, + layer_writer: &mut ImageLayerWriter, + ctx: &RequestContext, + ) -> anyhow::Result { + debug!("Importing SLRU segment file {}", self.path); + let buf = self.storage.get(&self.path).await?; + + let (kind, segno, start_blk) = self.key_range.start.to_slru_block()?; + let (_kind, _segno, end_blk) = self.key_range.end.to_slru_block()?; + let mut blknum = start_blk; + let mut nimages = 0; + let mut file_offset = 0; + while blknum < end_blk { + let key = slru_block_to_key(kind, segno, blknum); + assert!( + !self.shard_identity.is_key_disposable(&key), + "SLRU keys need to go into every shard" + ); + let buf = &buf[file_offset..(file_offset + 8192)]; + file_offset += 8192; + layer_writer + .put_image(key, Bytes::copy_from_slice(buf), ctx) + .await?; + blknum += 1; + nimages += 1; + } + Ok(nimages) + } +} + +enum AnyImportTask { + SingleKey(ImportSingleKeyTask), + RelBlocks(ImportRelBlocksTask), + SlruBlocks(ImportSlruBlocksTask), +} + +impl ImportTask for AnyImportTask { + fn key_range(&self) -> Range { + match self { + Self::SingleKey(t) => t.key_range(), + Self::RelBlocks(t) => t.key_range(), + Self::SlruBlocks(t) => t.key_range(), + } + } + /// returns the number of images put into the `layer_writer` + async fn doit( + self, + layer_writer: &mut ImageLayerWriter, + ctx: &RequestContext, + ) -> anyhow::Result { + match self { + Self::SingleKey(t) => t.doit(layer_writer, ctx).await, + Self::RelBlocks(t) => t.doit(layer_writer, ctx).await, + Self::SlruBlocks(t) => t.doit(layer_writer, ctx).await, + } + } +} + +impl From for AnyImportTask { + fn from(t: ImportSingleKeyTask) -> Self { + Self::SingleKey(t) + } +} + +impl From for AnyImportTask { + fn from(t: ImportRelBlocksTask) -> Self { + Self::RelBlocks(t) + } +} + +impl From for AnyImportTask { + fn from(t: ImportSlruBlocksTask) -> Self { + Self::SlruBlocks(t) + } +} + +struct ChunkProcessingJob { + timeline: Arc, + range: Range, + tasks: Vec, + + pgdata_lsn: Lsn, +} + +impl ChunkProcessingJob { + fn new(range: Range, tasks: Vec, env: &Flow) -> Self { + assert!(env.pgdata_lsn.is_valid()); + Self { + timeline: env.timeline.clone(), + range, + tasks, + pgdata_lsn: env.pgdata_lsn, + } + } + + async fn run(self, ctx: &RequestContext) -> anyhow::Result<()> { + let mut writer = ImageLayerWriter::new( + self.timeline.conf, + self.timeline.timeline_id, + self.timeline.tenant_shard_id, + &self.range, + self.pgdata_lsn, + ctx, + ) + .await?; + + let mut nimages = 0; + for task in self.tasks { + nimages += task.doit(&mut writer, ctx).await?; + } + + let resident_layer = if nimages > 0 { + let (desc, path) = writer.finish(ctx).await?; + Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)? + } else { + // dropping the writer cleans up + return Ok(()); + }; + + // this is sharing the same code as create_image_layers + let mut guard = self.timeline.layers.write().await; + guard + .open_mut()? + .track_new_image_layers(&[resident_layer.clone()], &self.timeline.metrics); + crate::tenant::timeline::drop_wlock(guard); + + // Schedule the layer for upload but don't add barriers such as + // wait for completion or index upload, so we don't inhibit upload parallelism. + // TODO: limit upload parallelism somehow (e.g. by limiting concurrency of jobs?) + // TODO: or regulate parallelism by upload queue depth? Prob should happen at a higher level. + self.timeline + .remote_client + .schedule_layer_file_upload(resident_layer)?; + + Ok(()) + } +} diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs new file mode 100644 index 0000000000..8d5ab1780f --- /dev/null +++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs @@ -0,0 +1,315 @@ +use std::{ops::Bound, sync::Arc}; + +use anyhow::Context; +use bytes::Bytes; +use postgres_ffi::ControlFileData; +use remote_storage::{ + Download, DownloadError, DownloadOpts, GenericRemoteStorage, Listing, ListingObject, RemotePath, +}; +use serde::de::DeserializeOwned; +use tokio_util::sync::CancellationToken; +use tracing::{debug, info, instrument}; +use utils::lsn::Lsn; + +use crate::{assert_u64_eq_usize::U64IsUsize, config::PageServerConf}; + +use super::{importbucket_format, index_part_format}; + +pub async fn new( + conf: &'static PageServerConf, + location: &index_part_format::Location, + cancel: CancellationToken, +) -> Result { + // FIXME: we probably want some timeout, and we might be able to assume the max file + // size on S3 is 1GiB (postgres segment size). But the problem is that the individual + // downloaders don't know enough about concurrent downloads to make a guess on the + // expected bandwidth and resulting best timeout. + let timeout = std::time::Duration::from_secs(24 * 60 * 60); + let location_storage = match location { + #[cfg(feature = "testing")] + index_part_format::Location::LocalFs { path } => { + GenericRemoteStorage::LocalFs(remote_storage::LocalFs::new(path.clone(), timeout)?) + } + index_part_format::Location::AwsS3 { + region, + bucket, + key, + } => { + // TODO: think about security implications of letting the client specify the bucket & prefix. + // It's the most flexible right now, but, possibly we want to move bucket name into PS conf + // and force the timeline_id into the prefix? + GenericRemoteStorage::AwsS3(Arc::new( + remote_storage::S3Bucket::new( + &remote_storage::S3Config { + bucket_name: bucket.clone(), + prefix_in_bucket: Some(key.clone()), + bucket_region: region.clone(), + endpoint: conf + .import_pgdata_aws_endpoint_url + .clone() + .map(|url| url.to_string()), // by specifying None here, remote_storage/aws-sdk-rust will infer from env + concurrency_limit: 100.try_into().unwrap(), // TODO: think about this + max_keys_per_list_response: Some(1000), // TODO: think about this + upload_storage_class: None, // irrelevant + }, + timeout, + ) + .await + .context("setup s3 bucket")?, + )) + } + }; + let storage_wrapper = RemoteStorageWrapper::new(location_storage, cancel); + Ok(storage_wrapper) +} + +/// Wrap [`remote_storage`] APIs to make it look a bit more like a filesystem API +/// such as [`tokio::fs`], which was used in the original implementation of the import code. +#[derive(Clone)] +pub struct RemoteStorageWrapper { + storage: GenericRemoteStorage, + cancel: CancellationToken, +} + +impl RemoteStorageWrapper { + pub fn new(storage: GenericRemoteStorage, cancel: CancellationToken) -> Self { + Self { storage, cancel } + } + + #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] + pub async fn listfilesindir( + &self, + path: &RemotePath, + ) -> Result, DownloadError> { + assert!( + path.object_name().is_some(), + "must specify dirname, without trailing slash" + ); + let path = path.add_trailing_slash(); + + let res = crate::tenant::remote_timeline_client::download::download_retry_forever( + || async { + let Listing { keys, prefixes: _ } = self + .storage + .list( + Some(&path), + remote_storage::ListingMode::WithDelimiter, + None, + &self.cancel, + ) + .await?; + let res = keys + .into_iter() + .map(|ListingObject { key, size, .. }| (key, size.into_usize())) + .collect(); + Ok(res) + }, + &format!("listfilesindir {path:?}"), + &self.cancel, + ) + .await; + debug!(?res, "returning"); + res + } + + #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] + pub async fn listdir(&self, path: &RemotePath) -> Result, DownloadError> { + assert!( + path.object_name().is_some(), + "must specify dirname, without trailing slash" + ); + let path = path.add_trailing_slash(); + + let res = crate::tenant::remote_timeline_client::download::download_retry_forever( + || async { + let Listing { keys, prefixes } = self + .storage + .list( + Some(&path), + remote_storage::ListingMode::WithDelimiter, + None, + &self.cancel, + ) + .await?; + let res = keys + .into_iter() + .map(|ListingObject { key, .. }| key) + .chain(prefixes.into_iter()) + .collect(); + Ok(res) + }, + &format!("listdir {path:?}"), + &self.cancel, + ) + .await; + debug!(?res, "returning"); + res + } + + #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] + pub async fn get(&self, path: &RemotePath) -> Result { + let res = crate::tenant::remote_timeline_client::download::download_retry_forever( + || async { + let Download { + download_stream, .. + } = self + .storage + .download(path, &DownloadOpts::default(), &self.cancel) + .await?; + let mut reader = tokio_util::io::StreamReader::new(download_stream); + + // XXX optimize this, can we get the capacity hint from somewhere? + let mut buf = Vec::new(); + tokio::io::copy_buf(&mut reader, &mut buf).await?; + Ok(Bytes::from(buf)) + }, + &format!("download {path:?}"), + &self.cancel, + ) + .await; + debug!(len = res.as_ref().ok().map(|buf| buf.len()), "done"); + res + } + + pub async fn get_spec(&self) -> Result, anyhow::Error> { + self.get_json(&RemotePath::from_string("spec.json").unwrap()) + .await + .context("get spec") + } + + #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] + pub async fn get_json( + &self, + path: &RemotePath, + ) -> Result, DownloadError> { + let buf = match self.get(path).await { + Ok(buf) => buf, + Err(DownloadError::NotFound) => return Ok(None), + Err(err) => return Err(err), + }; + let res = serde_json::from_slice(&buf) + .context("serialize") + // TODO: own error type + .map_err(DownloadError::Other)?; + Ok(Some(res)) + } + + #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] + pub async fn put_json(&self, path: &RemotePath, value: &T) -> anyhow::Result<()> + where + T: serde::Serialize, + { + let buf = serde_json::to_vec(value)?; + let bytes = Bytes::from(buf); + utils::backoff::retry( + || async { + let size = bytes.len(); + let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone()))); + self.storage + .upload_storage_object(bytes, size, path, &self.cancel) + .await + }, + remote_storage::TimeoutOrCancel::caused_by_cancel, + 1, + u32::MAX, + &format!("put json {path}"), + &self.cancel, + ) + .await + .expect("practically infinite retries") + } + + #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] + pub async fn get_range( + &self, + path: &RemotePath, + start_inclusive: u64, + end_exclusive: u64, + ) -> Result, DownloadError> { + let len = end_exclusive + .checked_sub(start_inclusive) + .unwrap() + .into_usize(); + let res = crate::tenant::remote_timeline_client::download::download_retry_forever( + || async { + let Download { + download_stream, .. + } = self + .storage + .download( + path, + &DownloadOpts { + etag: None, + byte_start: Bound::Included(start_inclusive), + byte_end: Bound::Excluded(end_exclusive) + }, + &self.cancel) + .await?; + let mut reader = tokio_util::io::StreamReader::new(download_stream); + + let mut buf = Vec::with_capacity(len); + tokio::io::copy_buf(&mut reader, &mut buf).await?; + Ok(buf) + }, + &format!("download range len=0x{len:x} [0x{start_inclusive:x},0x{end_exclusive:x}) from {path:?}"), + &self.cancel, + ) + .await; + debug!(len = res.as_ref().ok().map(|buf| buf.len()), "done"); + res + } + + pub fn pgdata(&self) -> RemotePath { + RemotePath::from_string("pgdata").unwrap() + } + + pub async fn get_control_file(&self) -> Result { + let control_file_path = self.pgdata().join("global/pg_control"); + info!("get control file from {control_file_path}"); + let control_file_buf = self.get(&control_file_path).await?; + ControlFile::new(control_file_buf) + } +} + +pub struct ControlFile { + control_file_data: ControlFileData, + control_file_buf: Bytes, +} + +impl ControlFile { + pub(crate) fn new(control_file_buf: Bytes) -> Result { + // XXX ControlFileData is version-specific, we're always using v14 here. v17 had changes. + let control_file_data = ControlFileData::decode(&control_file_buf)?; + let control_file = ControlFile { + control_file_data, + control_file_buf, + }; + control_file.try_pg_version()?; // so that we can offer infallible pg_version() + Ok(control_file) + } + pub(crate) fn base_lsn(&self) -> Lsn { + Lsn(self.control_file_data.checkPoint).align() + } + pub(crate) fn pg_version(&self) -> u32 { + self.try_pg_version() + .expect("prepare() checks that try_pg_version doesn't error") + } + pub(crate) fn control_file_data(&self) -> &ControlFileData { + &self.control_file_data + } + pub(crate) fn control_file_buf(&self) -> &Bytes { + &self.control_file_buf + } + fn try_pg_version(&self) -> anyhow::Result { + Ok(match self.control_file_data.catalog_version_no { + // thesea are from catversion.h + 202107181 => 14, + 202209061 => 15, + 202307071 => 16, + /* XXX pg17 */ + catversion => { + anyhow::bail!("unrecognized catalog version {catversion}") + } + }) + } +} diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs new file mode 100644 index 0000000000..04ba3c6f1f --- /dev/null +++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs @@ -0,0 +1,20 @@ +use serde::{Deserialize, Serialize}; + +#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)] +pub struct PgdataStatus { + pub done: bool, + // TODO: remaining fields +} + +#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)] +pub struct ShardStatus { + pub done: bool, + // TODO: remaining fields +} + +// TODO: dedupe with fast_import code +#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)] +pub struct Spec { + pub project_id: String, + pub branch_id: String, +} diff --git a/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs new file mode 100644 index 0000000000..310d97a6a9 --- /dev/null +++ b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs @@ -0,0 +1,68 @@ +use serde::{Deserialize, Serialize}; + +#[cfg(feature = "testing")] +use camino::Utf8PathBuf; + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +pub enum Root { + V1(V1), +} +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +pub enum V1 { + InProgress(InProgress), + Done(Done), +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +#[serde(transparent)] +pub struct IdempotencyKey(String); + +impl IdempotencyKey { + pub fn new(s: String) -> Self { + Self(s) + } +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +pub struct InProgress { + pub idempotency_key: IdempotencyKey, + pub location: Location, + pub started_at: chrono::NaiveDateTime, +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +pub struct Done { + pub idempotency_key: IdempotencyKey, + pub started_at: chrono::NaiveDateTime, + pub finished_at: chrono::NaiveDateTime, +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +pub enum Location { + #[cfg(feature = "testing")] + LocalFs { path: Utf8PathBuf }, + AwsS3 { + region: String, + bucket: String, + key: String, + }, +} + +impl Root { + pub fn is_done(&self) -> bool { + match self { + Root::V1(v1) => match v1 { + V1::Done(_) => true, + V1::InProgress(_) => false, + }, + } + } + pub fn idempotency_key(&self) -> &IdempotencyKey { + match self { + Root::V1(v1) => match v1 { + V1::InProgress(in_progress) => &in_progress.idempotency_key, + V1::Done(done) => &done.idempotency_key, + }, + } + } +} diff --git a/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs new file mode 100644 index 0000000000..c5210f9a30 --- /dev/null +++ b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs @@ -0,0 +1,119 @@ +//! FIXME: most of this is copy-paste from mgmt_api.rs ; dedupe into a `reqwest_utils::Client` crate. +use pageserver_client::mgmt_api::{Error, ResponseErrorMessageExt}; +use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; +use tracing::error; + +use crate::config::PageServerConf; +use reqwest::Method; + +use super::importbucket_format::Spec; + +pub struct Client { + base_url: String, + authorization_header: Option, + client: reqwest::Client, + cancel: CancellationToken, +} + +pub type Result = std::result::Result; + +#[derive(Serialize, Deserialize, Debug)] +struct ImportProgressRequest { + // no fields yet, not sure if there every will be any +} + +#[derive(Serialize, Deserialize, Debug)] +struct ImportProgressResponse { + // we don't care +} + +impl Client { + pub fn new(conf: &PageServerConf, cancel: CancellationToken) -> anyhow::Result { + let Some(ref base_url) = conf.import_pgdata_upcall_api else { + anyhow::bail!("import_pgdata_upcall_api is not configured") + }; + Ok(Self { + base_url: base_url.to_string(), + client: reqwest::Client::new(), + cancel, + authorization_header: conf + .import_pgdata_upcall_api_token + .as_ref() + .map(|secret_string| secret_string.get_contents()) + .map(|jwt| format!("Bearer {jwt}")), + }) + } + + fn start_request( + &self, + method: Method, + uri: U, + ) -> reqwest::RequestBuilder { + let req = self.client.request(method, uri); + if let Some(value) = &self.authorization_header { + req.header(reqwest::header::AUTHORIZATION, value) + } else { + req + } + } + + async fn request_noerror( + &self, + method: Method, + uri: U, + body: B, + ) -> Result { + self.start_request(method, uri) + .json(&body) + .send() + .await + .map_err(Error::ReceiveBody) + } + + async fn request( + &self, + method: Method, + uri: U, + body: B, + ) -> Result { + let res = self.request_noerror(method, uri, body).await?; + let response = res.error_from_body().await?; + Ok(response) + } + + pub async fn send_progress_once(&self, spec: &Spec) -> Result<()> { + let url = format!( + "{}/projects/{}/branches/{}/import_progress", + self.base_url, spec.project_id, spec.branch_id + ); + let ImportProgressResponse {} = self + .request(Method::POST, url, &ImportProgressRequest {}) + .await? + .json() + .await + .map_err(Error::ReceiveBody)?; + Ok(()) + } + + pub async fn send_progress_until_success(&self, spec: &Spec) -> anyhow::Result<()> { + loop { + match self.send_progress_once(spec).await { + Ok(()) => return Ok(()), + Err(Error::Cancelled) => return Err(anyhow::anyhow!("cancelled")), + Err(err) => { + error!(?err, "error sending progress, retrying"); + if tokio::time::timeout( + std::time::Duration::from_secs(10), + self.cancel.cancelled(), + ) + .await + .is_ok() + { + anyhow::bail!("cancelled while sending early progress update"); + } + } + } + } + } +} diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index a93bdde3f8..80a09b4840 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -3,7 +3,7 @@ use std::{collections::hash_map::Entry, fs, sync::Arc}; use anyhow::Context; use camino::Utf8PathBuf; use tracing::{error, info, info_span}; -use utils::{fs_ext, id::TimelineId, lsn::Lsn}; +use utils::{fs_ext, id::TimelineId, lsn::Lsn, sync::gate::GateGuard}; use crate::{ context::RequestContext, @@ -23,14 +23,14 @@ use super::Timeline; pub struct UninitializedTimeline<'t> { pub(crate) owning_tenant: &'t Tenant, timeline_id: TimelineId, - raw_timeline: Option<(Arc, TimelineCreateGuard<'t>)>, + raw_timeline: Option<(Arc, TimelineCreateGuard)>, } impl<'t> UninitializedTimeline<'t> { pub(crate) fn new( owning_tenant: &'t Tenant, timeline_id: TimelineId, - raw_timeline: Option<(Arc, TimelineCreateGuard<'t>)>, + raw_timeline: Option<(Arc, TimelineCreateGuard)>, ) -> Self { Self { owning_tenant, @@ -87,6 +87,10 @@ impl<'t> UninitializedTimeline<'t> { } } + pub(crate) fn finish_creation_myself(&mut self) -> (Arc, TimelineCreateGuard) { + self.raw_timeline.take().expect("already checked") + } + /// Prepares timeline data by loading it from the basebackup archive. pub(crate) async fn import_basebackup_from_tar( self, @@ -167,9 +171,10 @@ pub(crate) fn cleanup_timeline_directory(create_guard: TimelineCreateGuard) { /// A guard for timeline creations in process: as long as this object exists, the timeline ID /// is kept in `[Tenant::timelines_creating]` to exclude concurrent attempts to create the same timeline. #[must_use] -pub(crate) struct TimelineCreateGuard<'t> { - owning_tenant: &'t Tenant, - timeline_id: TimelineId, +pub(crate) struct TimelineCreateGuard { + pub(crate) _tenant_gate_guard: GateGuard, + pub(crate) owning_tenant: Arc, + pub(crate) timeline_id: TimelineId, pub(crate) timeline_path: Utf8PathBuf, pub(crate) idempotency: CreateTimelineIdempotency, } @@ -184,20 +189,27 @@ pub(crate) enum TimelineExclusionError { }, #[error("Already creating")] AlreadyCreating, + #[error("Shutting down")] + ShuttingDown, // e.g. I/O errors, or some failure deep in postgres initdb #[error(transparent)] Other(#[from] anyhow::Error), } -impl<'t> TimelineCreateGuard<'t> { +impl TimelineCreateGuard { pub(crate) fn new( - owning_tenant: &'t Tenant, + owning_tenant: &Arc, timeline_id: TimelineId, timeline_path: Utf8PathBuf, idempotency: CreateTimelineIdempotency, allow_offloaded: bool, ) -> Result { + let _tenant_gate_guard = owning_tenant + .gate + .enter() + .map_err(|_| TimelineExclusionError::ShuttingDown)?; + // Lock order: this is the only place we take both locks. During drop() we only // lock creating_timelines let timelines = owning_tenant.timelines.lock().unwrap(); @@ -225,8 +237,12 @@ impl<'t> TimelineCreateGuard<'t> { return Err(TimelineExclusionError::AlreadyCreating); } creating_timelines.insert(timeline_id); + drop(creating_timelines); + drop(timelines_offloaded); + drop(timelines); Ok(Self { - owning_tenant, + _tenant_gate_guard, + owning_tenant: Arc::clone(owning_tenant), timeline_id, timeline_path, idempotency, @@ -234,7 +250,7 @@ impl<'t> TimelineCreateGuard<'t> { } } -impl Drop for TimelineCreateGuard<'_> { +impl Drop for TimelineCreateGuard { fn drop(&mut self) { self.owning_tenant .timelines_creating diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 4a3a5c621b..f831f5e48a 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -38,6 +38,7 @@ use storage_broker::BrokerClientChannel; use tokio::sync::watch; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::postgres_client::PostgresClientProtocol; use self::connection_manager::ConnectionManagerStatus; @@ -45,6 +46,7 @@ use super::Timeline; #[derive(Clone)] pub struct WalReceiverConf { + pub protocol: PostgresClientProtocol, /// The timeout on the connection to safekeeper for WAL streaming. pub wal_connect_timeout: Duration, /// The timeout to use to determine when the current connection is "stale" and reconnect to the other one. diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index de50f217d8..7a64703a30 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -36,7 +36,9 @@ use postgres_connection::PgConnectionConfig; use utils::backoff::{ exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, }; -use utils::postgres_client::wal_stream_connection_config; +use utils::postgres_client::{ + wal_stream_connection_config, ConnectionConfigArgs, PostgresClientProtocol, +}; use utils::{ id::{NodeId, TenantTimelineId}, lsn::Lsn, @@ -984,15 +986,33 @@ impl ConnectionManagerState { if info.safekeeper_connstr.is_empty() { return None; // no connection string, ignore sk } - match wal_stream_connection_config( - self.id, - info.safekeeper_connstr.as_ref(), - match &self.conf.auth_token { - None => None, - Some(x) => Some(x), + + let (shard_number, shard_count, shard_stripe_size) = match self.conf.protocol { + PostgresClientProtocol::Vanilla => { + (None, None, None) }, - self.conf.availability_zone.as_deref(), - ) { + PostgresClientProtocol::Interpreted => { + let shard_identity = self.timeline.get_shard_identity(); + ( + Some(shard_identity.number.0), + Some(shard_identity.count.0), + Some(shard_identity.stripe_size.0), + ) + } + }; + + let connection_conf_args = ConnectionConfigArgs { + protocol: self.conf.protocol, + ttid: self.id, + shard_number, + shard_count, + shard_stripe_size, + listen_pg_addr_str: info.safekeeper_connstr.as_ref(), + auth_token: self.conf.auth_token.as_ref().map(|t| t.as_str()), + availability_zone: self.conf.availability_zone.as_deref() + }; + + match wal_stream_connection_config(connection_conf_args) { Ok(connstr) => Some((*sk_id, info, connstr)), Err(e) => { error!("Failed to create wal receiver connection string from broker data of safekeeper node {}: {e:#}", sk_id); @@ -1096,6 +1116,7 @@ impl ReconnectReason { mod tests { use super::*; use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; + use pageserver_api::config::defaults::DEFAULT_WAL_RECEIVER_PROTOCOL; use url::Host; fn dummy_broker_sk_timeline( @@ -1532,6 +1553,7 @@ mod tests { timeline, cancel: CancellationToken::new(), conf: WalReceiverConf { + protocol: DEFAULT_WAL_RECEIVER_PROTOCOL, wal_connect_timeout: Duration::from_secs(1), lagging_wal_timeout: Duration::from_secs(1), max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(), diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 6ac6920d47..1a0e66ceb3 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -36,7 +36,7 @@ use crate::{ use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; use postgres_ffi::waldecoder::WalStreamDecoder; -use utils::{id::NodeId, lsn::Lsn}; +use utils::{bin_ser::BeSer, id::NodeId, lsn::Lsn}; use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError}; /// Status of the connection. @@ -291,6 +291,15 @@ pub(super) async fn handle_walreceiver_connection( connection_status.latest_connection_update = now; connection_status.commit_lsn = Some(Lsn::from(keepalive.wal_end())); } + ReplicationMessage::RawInterpretedWalRecords(raw) => { + connection_status.latest_connection_update = now; + if !raw.data().is_empty() { + connection_status.latest_wal_update = now; + } + + connection_status.commit_lsn = Some(Lsn::from(raw.commit_lsn())); + connection_status.streaming_lsn = Some(Lsn::from(raw.streaming_lsn())); + } &_ => {} }; if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { @@ -298,7 +307,130 @@ pub(super) async fn handle_walreceiver_connection( return Ok(()); } + async fn commit( + modification: &mut DatadirModification<'_>, + uncommitted: &mut u64, + filtered: &mut u64, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + WAL_INGEST + .records_committed + .inc_by(*uncommitted - *filtered); + modification.commit(ctx).await?; + *uncommitted = 0; + *filtered = 0; + Ok(()) + } + let status_update = match replication_message { + ReplicationMessage::RawInterpretedWalRecords(raw) => { + WAL_INGEST.bytes_received.inc_by(raw.data().len() as u64); + + let mut uncommitted_records = 0; + let mut filtered_records = 0; + + // This is the end LSN of the raw WAL from which the records + // were interpreted. + let streaming_lsn = Lsn::from(raw.streaming_lsn()); + tracing::debug!( + "Received WAL up to {streaming_lsn} with next_record_lsn={}", + Lsn(raw.next_record_lsn().unwrap_or(0)) + ); + + let records = Vec::::des(raw.data()).with_context(|| { + anyhow::anyhow!( + "Failed to deserialize interpreted records ending at LSN {streaming_lsn}" + ) + })?; + + // We start the modification at 0 because each interpreted record + // advances it to its end LSN. 0 is just an initialization placeholder. + let mut modification = timeline.begin_modification(Lsn(0)); + + for interpreted in records { + if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) + && uncommitted_records > 0 + { + commit( + &mut modification, + &mut uncommitted_records, + &mut filtered_records, + &ctx, + ) + .await?; + } + + let next_record_lsn = interpreted.next_record_lsn; + let ingested = walingest + .ingest_record(interpreted, &mut modification, &ctx) + .await + .with_context(|| format!("could not ingest record at {next_record_lsn}"))?; + + if !ingested { + tracing::debug!("ingest: filtered out record @ LSN {next_record_lsn}"); + WAL_INGEST.records_filtered.inc(); + filtered_records += 1; + } + + uncommitted_records += 1; + + // FIXME: this cannot be made pausable_failpoint without fixing the + // failpoint library; in tests, the added amount of debugging will cause us + // to timeout the tests. + fail_point!("walreceiver-after-ingest"); + + // Commit every ingest_batch_size records. Even if we filtered out + // all records, we still need to call commit to advance the LSN. + if uncommitted_records >= ingest_batch_size + || modification.approx_pending_bytes() + > DatadirModification::MAX_PENDING_BYTES + { + commit( + &mut modification, + &mut uncommitted_records, + &mut filtered_records, + &ctx, + ) + .await?; + } + } + + // Records might have been filtered out on the safekeeper side, but we still + // need to advance last record LSN on all shards. If we've not ingested the latest + // record, then set the LSN of the modification past it. This way all shards + // advance their last record LSN at the same time. + let needs_last_record_lsn_advance = match raw.next_record_lsn().map(Lsn::from) { + Some(lsn) if lsn > modification.get_lsn() => { + modification.set_lsn(lsn).unwrap(); + true + } + _ => false, + }; + + if uncommitted_records > 0 || needs_last_record_lsn_advance { + // Commit any uncommitted records + commit( + &mut modification, + &mut uncommitted_records, + &mut filtered_records, + &ctx, + ) + .await?; + } + + if !caught_up && streaming_lsn >= end_of_wal { + info!("caught up at LSN {streaming_lsn}"); + caught_up = true; + } + + tracing::debug!( + "Ingested WAL up to {streaming_lsn}. Last record LSN is {}", + timeline.get_last_record_lsn() + ); + + Some(streaming_lsn) + } + ReplicationMessage::XLogData(xlog_data) => { // Pass the WAL data to the decoder, and see if we can decode // more records as a result. @@ -316,21 +448,6 @@ pub(super) async fn handle_walreceiver_connection( let mut uncommitted_records = 0; let mut filtered_records = 0; - async fn commit( - modification: &mut DatadirModification<'_>, - uncommitted: &mut u64, - filtered: &mut u64, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - WAL_INGEST - .records_committed - .inc_by(*uncommitted - *filtered); - modification.commit(ctx).await?; - *uncommitted = 0; - *filtered = 0; - Ok(()) - } - while let Some((next_record_lsn, recdata)) = waldecoder.poll_decode()? { // It is important to deal with the aligned records as lsn in getPage@LSN is // aligned and can be several bytes bigger. Without this alignment we are diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 592f41cb21..ef3aa759f3 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -3,6 +3,7 @@ use super::storage_layer::ResidentLayer; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; +use std::collections::HashSet; use std::collections::{HashMap, VecDeque}; use std::fmt::Debug; @@ -14,7 +15,6 @@ use utils::lsn::AtomicLsn; use std::sync::atomic::AtomicU32; use utils::lsn::Lsn; -#[cfg(feature = "testing")] use utils::generation::Generation; // clippy warns that Uninitialized is much smaller than Initialized, which wastes @@ -38,6 +38,12 @@ impl UploadQueue { } } +#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] +pub(crate) enum OpType { + MayReorder, + FlushDeletion, +} + /// This keeps track of queued and in-progress tasks. pub(crate) struct UploadQueueInitialized { /// Counter to assign task IDs @@ -88,6 +94,12 @@ pub(crate) struct UploadQueueInitialized { #[cfg(feature = "testing")] pub(crate) dangling_files: HashMap, + /// Ensure we order file operations correctly. + pub(crate) recently_deleted: HashSet<(LayerName, Generation)>, + + /// Deletions that are blocked by the tenant configuration + pub(crate) blocked_deletions: Vec, + /// Set to true when we have inserted the `UploadOp::Shutdown` into the `inprogress_tasks`. pub(crate) shutting_down: bool, @@ -180,6 +192,8 @@ impl UploadQueue { queued_operations: VecDeque::new(), #[cfg(feature = "testing")] dangling_files: HashMap::new(), + recently_deleted: HashSet::new(), + blocked_deletions: Vec::new(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), }; @@ -220,6 +234,8 @@ impl UploadQueue { queued_operations: VecDeque::new(), #[cfg(feature = "testing")] dangling_files: HashMap::new(), + recently_deleted: HashSet::new(), + blocked_deletions: Vec::new(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), }; @@ -270,15 +286,15 @@ pub(crate) struct UploadTask { /// A deletion of some layers within the lifetime of a timeline. This is not used /// for timeline deletion, which skips this queue and goes directly to DeletionQueue. -#[derive(Debug)] +#[derive(Debug, Clone)] pub(crate) struct Delete { pub(crate) layers: Vec<(LayerName, LayerFileMetadata)>, } #[derive(Debug)] pub(crate) enum UploadOp { - /// Upload a layer file - UploadLayer(ResidentLayer, LayerFileMetadata), + /// Upload a layer file. The last field indicates the last operation for thie file. + UploadLayer(ResidentLayer, LayerFileMetadata, Option), /// Upload a index_part.json file UploadMetadata { @@ -300,11 +316,11 @@ pub(crate) enum UploadOp { impl std::fmt::Display for UploadOp { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { match self { - UploadOp::UploadLayer(layer, metadata) => { + UploadOp::UploadLayer(layer, metadata, mode) => { write!( f, - "UploadLayer({}, size={:?}, gen={:?})", - layer, metadata.file_size, metadata.generation + "UploadLayer({}, size={:?}, gen={:?}, mode={:?})", + layer, metadata.file_size, metadata.generation, mode ) } UploadOp::UploadMetadata { uploaded, .. } => { diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs index 6cecf34c1c..1952b82578 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs @@ -19,7 +19,7 @@ impl<'a, const N: usize, const A: usize> AlignedSlice<'a, N, ConstAlign> { } } -impl<'a, const N: usize, A: Alignment> Deref for AlignedSlice<'a, N, A> { +impl Deref for AlignedSlice<'_, N, A> { type Target = [u8; N]; fn deref(&self) -> &Self::Target { @@ -27,13 +27,13 @@ impl<'a, const N: usize, A: Alignment> Deref for AlignedSlice<'a, N, A> { } } -impl<'a, const N: usize, A: Alignment> DerefMut for AlignedSlice<'a, N, A> { +impl DerefMut for AlignedSlice<'_, N, A> { fn deref_mut(&mut self) -> &mut Self::Target { self.buf } } -impl<'a, const N: usize, A: Alignment> AsRef<[u8; N]> for AlignedSlice<'a, N, A> { +impl AsRef<[u8; N]> for AlignedSlice<'_, N, A> { fn as_ref(&self) -> &[u8; N] { self.buf } diff --git a/poetry.lock b/poetry.lock index 6171f92391..e2fca7be47 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -114,7 +114,6 @@ files = [ [package.dependencies] aiohappyeyeballs = ">=2.3.0" aiosignal = ">=1.1.2" -async-timeout = {version = ">=4.0,<6.0", markers = "python_version < \"3.11\""} attrs = ">=17.3.0" frozenlist = ">=1.1.1" multidict = ">=4.5,<7.0" @@ -219,10 +218,8 @@ files = [ ] [package.dependencies] -exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""} idna = ">=2.8" sniffio = ">=1.1" -typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""} [package.extras] doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] @@ -737,10 +734,7 @@ files = [ [package.dependencies] jmespath = ">=0.7.1,<2.0.0" python-dateutil = ">=2.1,<3.0.0" -urllib3 = [ - {version = ">=1.25.4,<1.27", markers = "python_version < \"3.10\""}, - {version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""}, -] +urllib3 = {version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""} [package.extras] crt = ["awscrt (==0.19.19)"] @@ -1069,20 +1063,6 @@ docs = ["myst-parser (==0.18.0)", "sphinx (==5.1.1)"] ssh = ["paramiko (>=2.4.3)"] websockets = ["websocket-client (>=1.3.0)"] -[[package]] -name = "exceptiongroup" -version = "1.1.1" -description = "Backport of PEP 654 (exception groups)" -optional = false -python-versions = ">=3.7" -files = [ - {file = "exceptiongroup-1.1.1-py3-none-any.whl", hash = "sha256:232c37c63e4f682982c8b6459f33a8981039e5fb8756b2074364e5055c498c9e"}, - {file = "exceptiongroup-1.1.1.tar.gz", hash = "sha256:d484c3090ba2889ae2928419117447a14daf3c1231d5e30d0aae34f354f01785"}, -] - -[package.extras] -test = ["pytest (>=6)"] - [[package]] name = "execnet" version = "1.9.0" @@ -1110,7 +1090,6 @@ files = [ [package.dependencies] click = ">=8.0" -importlib-metadata = {version = ">=3.6.0", markers = "python_version < \"3.10\""} itsdangerous = ">=2.0" Jinja2 = ">=3.0" Werkzeug = ">=2.2.2" @@ -1319,25 +1298,6 @@ files = [ {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, ] -[[package]] -name = "importlib-metadata" -version = "4.12.0" -description = "Read metadata from Python packages" -optional = false -python-versions = ">=3.7" -files = [ - {file = "importlib_metadata-4.12.0-py3-none-any.whl", hash = "sha256:7401a975809ea1fdc658c3aa4f78cc2195a0e019c5cbc4c06122884e9ae80c23"}, - {file = "importlib_metadata-4.12.0.tar.gz", hash = "sha256:637245b8bab2b6502fcbc752cc4b7a6f6243bb02b31c5c26156ad103d3d45670"}, -] - -[package.dependencies] -zipp = ">=0.5" - -[package.extras] -docs = ["jaraco.packaging (>=9)", "rst.linker (>=1.9)", "sphinx"] -perf = ["ipython"] -testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)"] - [[package]] name = "iniconfig" version = "1.1.1" @@ -1898,48 +1858,54 @@ files = [ [[package]] name = "mypy" -version = "1.3.0" +version = "1.13.0" description = "Optional static typing for Python" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "mypy-1.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c1eb485cea53f4f5284e5baf92902cd0088b24984f4209e25981cc359d64448d"}, - {file = "mypy-1.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4c99c3ecf223cf2952638da9cd82793d8f3c0c5fa8b6ae2b2d9ed1e1ff51ba85"}, - {file = "mypy-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:550a8b3a19bb6589679a7c3c31f64312e7ff482a816c96e0cecec9ad3a7564dd"}, - {file = "mypy-1.3.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cbc07246253b9e3d7d74c9ff948cd0fd7a71afcc2b77c7f0a59c26e9395cb152"}, - {file = "mypy-1.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:a22435632710a4fcf8acf86cbd0d69f68ac389a3892cb23fbad176d1cddaf228"}, - {file = "mypy-1.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6e33bb8b2613614a33dff70565f4c803f889ebd2f859466e42b46e1df76018dd"}, - {file = "mypy-1.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7d23370d2a6b7a71dc65d1266f9a34e4cde9e8e21511322415db4b26f46f6b8c"}, - {file = "mypy-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:658fe7b674769a0770d4b26cb4d6f005e88a442fe82446f020be8e5f5efb2fae"}, - {file = "mypy-1.3.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6e42d29e324cdda61daaec2336c42512e59c7c375340bd202efa1fe0f7b8f8ca"}, - {file = "mypy-1.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:d0b6c62206e04061e27009481cb0ec966f7d6172b5b936f3ead3d74f29fe3dcf"}, - {file = "mypy-1.3.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:76ec771e2342f1b558c36d49900dfe81d140361dd0d2df6cd71b3db1be155409"}, - {file = "mypy-1.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebc95f8386314272bbc817026f8ce8f4f0d2ef7ae44f947c4664efac9adec929"}, - {file = "mypy-1.3.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:faff86aa10c1aa4a10e1a301de160f3d8fc8703b88c7e98de46b531ff1276a9a"}, - {file = "mypy-1.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:8c5979d0deb27e0f4479bee18ea0f83732a893e81b78e62e2dda3e7e518c92ee"}, - {file = "mypy-1.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c5d2cc54175bab47011b09688b418db71403aefad07cbcd62d44010543fc143f"}, - {file = "mypy-1.3.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:87df44954c31d86df96c8bd6e80dfcd773473e877ac6176a8e29898bfb3501cb"}, - {file = "mypy-1.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:473117e310febe632ddf10e745a355714e771ffe534f06db40702775056614c4"}, - {file = "mypy-1.3.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:74bc9b6e0e79808bf8678d7678b2ae3736ea72d56eede3820bd3849823e7f305"}, - {file = "mypy-1.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:44797d031a41516fcf5cbfa652265bb994e53e51994c1bd649ffcd0c3a7eccbf"}, - {file = "mypy-1.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ddae0f39ca146972ff6bb4399f3b2943884a774b8771ea0a8f50e971f5ea5ba8"}, - {file = "mypy-1.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1c4c42c60a8103ead4c1c060ac3cdd3ff01e18fddce6f1016e08939647a0e703"}, - {file = "mypy-1.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e86c2c6852f62f8f2b24cb7a613ebe8e0c7dc1402c61d36a609174f63e0ff017"}, - {file = "mypy-1.3.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f9dca1e257d4cc129517779226753dbefb4f2266c4eaad610fc15c6a7e14283e"}, - {file = "mypy-1.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:95d8d31a7713510685b05fbb18d6ac287a56c8f6554d88c19e73f724a445448a"}, - {file = "mypy-1.3.0-py3-none-any.whl", hash = "sha256:a8763e72d5d9574d45ce5881962bc8e9046bf7b375b0abf031f3e6811732a897"}, - {file = "mypy-1.3.0.tar.gz", hash = "sha256:e1f4d16e296f5135624b34e8fb741eb0eadedca90862405b1f1fde2040b9bd11"}, + {file = "mypy-1.13.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6607e0f1dd1fb7f0aca14d936d13fd19eba5e17e1cd2a14f808fa5f8f6d8f60a"}, + {file = "mypy-1.13.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8a21be69bd26fa81b1f80a61ee7ab05b076c674d9b18fb56239d72e21d9f4c80"}, + {file = "mypy-1.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7b2353a44d2179846a096e25691d54d59904559f4232519d420d64da6828a3a7"}, + {file = "mypy-1.13.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0730d1c6a2739d4511dc4253f8274cdd140c55c32dfb0a4cf8b7a43f40abfa6f"}, + {file = "mypy-1.13.0-cp310-cp310-win_amd64.whl", hash = "sha256:c5fc54dbb712ff5e5a0fca797e6e0aa25726c7e72c6a5850cfd2adbc1eb0a372"}, + {file = "mypy-1.13.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:581665e6f3a8a9078f28d5502f4c334c0c8d802ef55ea0e7276a6e409bc0d82d"}, + {file = "mypy-1.13.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3ddb5b9bf82e05cc9a627e84707b528e5c7caaa1c55c69e175abb15a761cec2d"}, + {file = "mypy-1.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:20c7ee0bc0d5a9595c46f38beb04201f2620065a93755704e141fcac9f59db2b"}, + {file = "mypy-1.13.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3790ded76f0b34bc9c8ba4def8f919dd6a46db0f5a6610fb994fe8efdd447f73"}, + {file = "mypy-1.13.0-cp311-cp311-win_amd64.whl", hash = "sha256:51f869f4b6b538229c1d1bcc1dd7d119817206e2bc54e8e374b3dfa202defcca"}, + {file = "mypy-1.13.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5c7051a3461ae84dfb5dd15eff5094640c61c5f22257c8b766794e6dd85e72d5"}, + {file = "mypy-1.13.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:39bb21c69a5d6342f4ce526e4584bc5c197fd20a60d14a8624d8743fffb9472e"}, + {file = "mypy-1.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:164f28cb9d6367439031f4c81e84d3ccaa1e19232d9d05d37cb0bd880d3f93c2"}, + {file = "mypy-1.13.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a4c1bfcdbce96ff5d96fc9b08e3831acb30dc44ab02671eca5953eadad07d6d0"}, + {file = "mypy-1.13.0-cp312-cp312-win_amd64.whl", hash = "sha256:a0affb3a79a256b4183ba09811e3577c5163ed06685e4d4b46429a271ba174d2"}, + {file = "mypy-1.13.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a7b44178c9760ce1a43f544e595d35ed61ac2c3de306599fa59b38a6048e1aa7"}, + {file = "mypy-1.13.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5d5092efb8516d08440e36626f0153b5006d4088c1d663d88bf79625af3d1d62"}, + {file = "mypy-1.13.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:de2904956dac40ced10931ac967ae63c5089bd498542194b436eb097a9f77bc8"}, + {file = "mypy-1.13.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:7bfd8836970d33c2105562650656b6846149374dc8ed77d98424b40b09340ba7"}, + {file = "mypy-1.13.0-cp313-cp313-win_amd64.whl", hash = "sha256:9f73dba9ec77acb86457a8fc04b5239822df0c14a082564737833d2963677dbc"}, + {file = "mypy-1.13.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:100fac22ce82925f676a734af0db922ecfea991e1d7ec0ceb1e115ebe501301a"}, + {file = "mypy-1.13.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7bcb0bb7f42a978bb323a7c88f1081d1b5dee77ca86f4100735a6f541299d8fb"}, + {file = "mypy-1.13.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bde31fc887c213e223bbfc34328070996061b0833b0a4cfec53745ed61f3519b"}, + {file = "mypy-1.13.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:07de989f89786f62b937851295ed62e51774722e5444a27cecca993fc3f9cd74"}, + {file = "mypy-1.13.0-cp38-cp38-win_amd64.whl", hash = "sha256:4bde84334fbe19bad704b3f5b78c4abd35ff1026f8ba72b29de70dda0916beb6"}, + {file = "mypy-1.13.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0246bcb1b5de7f08f2826451abd947bf656945209b140d16ed317f65a17dc7dc"}, + {file = "mypy-1.13.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7f5b7deae912cf8b77e990b9280f170381fdfbddf61b4ef80927edd813163732"}, + {file = "mypy-1.13.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7029881ec6ffb8bc233a4fa364736789582c738217b133f1b55967115288a2bc"}, + {file = "mypy-1.13.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:3e38b980e5681f28f033f3be86b099a247b13c491f14bb8b1e1e134d23bb599d"}, + {file = "mypy-1.13.0-cp39-cp39-win_amd64.whl", hash = "sha256:a6789be98a2017c912ae6ccb77ea553bbaf13d27605d2ca20a76dfbced631b24"}, + {file = "mypy-1.13.0-py3-none-any.whl", hash = "sha256:9c250883f9fd81d212e0952c92dbfcc96fc237f4b7c92f56ac81fd48460b3e5a"}, + {file = "mypy-1.13.0.tar.gz", hash = "sha256:0291a61b6fbf3e6673e3405cfcc0e7650bebc7939659fdca2702958038bd835e"}, ] [package.dependencies] mypy-extensions = ">=1.0.0" -tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} -typing-extensions = ">=3.10" +typing-extensions = ">=4.6.0" [package.extras] dmypy = ["psutil (>=4.0)"] +faster-cache = ["orjson"] install-types = ["pip"] -python2 = ["typed-ast (>=1.4.0,<2)"] +mypyc = ["setuptools (>=50)"] reports = ["lxml"] [[package]] @@ -2514,11 +2480,9 @@ files = [ [package.dependencies] colorama = {version = "*", markers = "sys_platform == \"win32\""} -exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} iniconfig = "*" packaging = "*" pluggy = ">=0.12,<2.0" -tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] @@ -2581,10 +2545,7 @@ files = [ ] [package.dependencies] -pytest = [ - {version = ">=5.0", markers = "python_version < \"3.10\""}, - {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, -] +pytest = {version = ">=6.2.4", markers = "python_version >= \"3.10\""} [[package]] name = "pytest-repeat" @@ -3092,17 +3053,6 @@ files = [ {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, ] -[[package]] -name = "tomli" -version = "2.0.1" -description = "A lil' TOML parser" -optional = false -python-versions = ">=3.7" -files = [ - {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, - {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, -] - [[package]] name = "types-jwcrypto" version = "1.5.0.20240925" @@ -3359,16 +3309,6 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, - {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, - {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -3523,21 +3463,6 @@ idna = ">=2.0" multidict = ">=4.0" propcache = ">=0.2.0" -[[package]] -name = "zipp" -version = "3.19.1" -description = "Backport of pathlib-compatible object wrapper for zip files" -optional = false -python-versions = ">=3.8" -files = [ - {file = "zipp-3.19.1-py3-none-any.whl", hash = "sha256:2828e64edb5386ea6a52e7ba7cdb17bb30a73a858f5eb6eb93d8d36f5ea26091"}, - {file = "zipp-3.19.1.tar.gz", hash = "sha256:35427f6d5594f4acf82d25541438348c26736fa9b3afa2754bcd63cdb99d8e8f"}, -] - -[package.extras] -doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] - [[package]] name = "zstandard" version = "0.21.0" @@ -3598,5 +3523,5 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" -python-versions = "^3.9" -content-hash = "8cb9c38d83eec441391c0528ac2fbefde18c734373b2399e07c69382044e8ced" +python-versions = "^3.11" +content-hash = "21debe1116843e5d14bdf37d6e265c68c63a98a64ba04ec8b8a02af2e8d9f486" diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index 6d26c99832..491b272ac4 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -1,17 +1,17 @@ use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{info, warn}; +use tracing::{debug, info, warn}; use super::{ComputeCredentials, ComputeUserInfo}; use crate::auth::backend::ComputeCredentialKeys; use crate::auth::{self, AuthFlow}; use crate::config::AuthenticationConfig; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::AuthSecret; use crate::stream::{PqStream, Stream}; use crate::{compute, sasl}; pub(super) async fn authenticate( - ctx: &RequestMonitoring, + ctx: &RequestContext, creds: ComputeUserInfo, client: &mut PqStream>, config: &'static AuthenticationConfig, @@ -21,11 +21,11 @@ pub(super) async fn authenticate( let scram_keys = match secret { #[cfg(any(test, feature = "testing"))] AuthSecret::Md5(_) => { - info!("auth endpoint chooses MD5"); + debug!("auth endpoint chooses MD5"); return Err(auth::AuthError::bad_auth_method("MD5")); } AuthSecret::Scram(secret) => { - info!("auth endpoint chooses SCRAM"); + debug!("auth endpoint chooses SCRAM"); let scram = auth::Scram(&secret, ctx); let auth_outcome = tokio::time::timeout( @@ -50,6 +50,8 @@ pub(super) async fn authenticate( let client_key = match auth_outcome { sasl::Outcome::Success(key) => key, sasl::Outcome::Failure(reason) => { + // TODO: warnings? + // TODO: should we get rid of this because double logging? info!("auth backend failed with an error: {reason}"); return Err(auth::AuthError::password_failed(&*creds.user)); } diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index e25dc3d45e..bf7a1cb070 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -6,9 +6,10 @@ use tokio_postgres::config::SslMode; use tracing::{info, info_span}; use super::ComputeCredentialKeys; +use crate::auth::IpPattern; use crate::cache::Cached; use crate::config::AuthenticationConfig; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; use crate::error::{ReportableError, UserFacingError}; use crate::proxy::connect_compute::ComputeConnectBackend; @@ -71,13 +72,13 @@ impl ConsoleRedirectBackend { pub(crate) async fn authenticate( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, auth_config: &'static AuthenticationConfig, client: &mut PqStream, - ) -> auth::Result { + ) -> auth::Result<(ConsoleRedirectNodeInfo, Option>)> { authenticate(ctx, auth_config, &self.console_uri, client) .await - .map(ConsoleRedirectNodeInfo) + .map(|(node_info, ip_allowlist)| (ConsoleRedirectNodeInfo(node_info), ip_allowlist)) } } @@ -87,7 +88,7 @@ pub struct ConsoleRedirectNodeInfo(pub(super) NodeInfo); impl ComputeConnectBackend for ConsoleRedirectNodeInfo { async fn wake_compute( &self, - _ctx: &RequestMonitoring, + _ctx: &RequestContext, ) -> Result { Ok(Cached::new_uncached(self.0.clone())) } @@ -98,11 +99,11 @@ impl ComputeConnectBackend for ConsoleRedirectNodeInfo { } async fn authenticate( - ctx: &RequestMonitoring, + ctx: &RequestContext, auth_config: &'static AuthenticationConfig, link_uri: &reqwest::Url, client: &mut PqStream, -) -> auth::Result { +) -> auth::Result<(NodeInfo, Option>)> { ctx.set_auth_method(crate::context::AuthMethod::ConsoleRedirect); // registering waiter can fail if we get unlucky with rng. @@ -176,9 +177,12 @@ async fn authenticate( config.password(password.as_ref()); } - Ok(NodeInfo { - config, - aux: db_info.aux, - allow_self_signed_compute: false, // caller may override - }) + Ok(( + NodeInfo { + config, + aux: db_info.aux, + allow_self_signed_compute: false, // caller may override + }, + db_info.allowed_ips, + )) } diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index 1411d908a5..3316543022 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -4,7 +4,7 @@ use tracing::{debug, info}; use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint}; use crate::auth::{self, AuthFlow}; use crate::config::AuthenticationConfig; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::AuthSecret; use crate::intern::EndpointIdInt; use crate::sasl; @@ -15,7 +15,7 @@ use crate::stream::{self, Stream}; /// These properties are benefical for serverless JS workers, so we /// use this mechanism for websocket connections. pub(crate) async fn authenticate_cleartext( - ctx: &RequestMonitoring, + ctx: &RequestContext, info: ComputeUserInfo, client: &mut stream::PqStream>, secret: AuthSecret, @@ -57,7 +57,7 @@ pub(crate) async fn authenticate_cleartext( /// Similar to [`authenticate_cleartext`], but there's a specific password format, /// and passwords are not yet validated (we don't know how to validate them!) pub(crate) async fn password_hack_no_authentication( - ctx: &RequestMonitoring, + ctx: &RequestContext, info: ComputeUserInfoNoEndpoint, client: &mut stream::PqStream>, ) -> auth::Result<(ComputeUserInfo, Vec)> { @@ -73,7 +73,7 @@ pub(crate) async fn password_hack_no_authentication( .get_password() .await?; - info!(project = &*payload.endpoint, "received missing parameter"); + debug!(project = &*payload.endpoint, "received missing parameter"); // Report tentative success; compute node will check the password anyway. Ok(( diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index bfc674139b..517d4fd34b 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -17,7 +17,7 @@ use thiserror::Error; use tokio::time::Instant; use crate::auth::backend::ComputeCredentialKeys; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::errors::GetEndpointJwksError; use crate::http::read_body_with_limit; use crate::intern::RoleNameInt; @@ -39,7 +39,7 @@ const JWKS_FETCH_RETRIES: u32 = 3; pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static { fn fetch_auth_rules( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, endpoint: EndpointId, ) -> impl Future, FetchAuthRulesError>> + Send; } @@ -132,6 +132,93 @@ struct JwkSet<'a> { keys: Vec<&'a RawValue>, } +/// Given a jwks_url, fetch the JWKS and parse out all the signing JWKs. +/// Returns `None` and log a warning if there are any errors. +async fn fetch_jwks( + client: &reqwest_middleware::ClientWithMiddleware, + jwks_url: url::Url, +) -> Option { + let req = client.get(jwks_url.clone()); + // TODO(conrad): We need to filter out URLs that point to local resources. Public internet only. + let resp = req.send().await.and_then(|r| { + r.error_for_status() + .map_err(reqwest_middleware::Error::Reqwest) + }); + + let resp = match resp { + Ok(r) => r, + // TODO: should we re-insert JWKs if we want to keep this JWKs URL? + // I expect these failures would be quite sparse. + Err(e) => { + tracing::warn!(url=?jwks_url, error=?e, "could not fetch JWKs"); + return None; + } + }; + + let resp: http::Response = resp.into(); + + let bytes = match read_body_with_limit(resp.into_body(), MAX_JWK_BODY_SIZE).await { + Ok(bytes) => bytes, + Err(e) => { + tracing::warn!(url=?jwks_url, error=?e, "could not decode JWKs"); + return None; + } + }; + + let jwks = match serde_json::from_slice::(&bytes) { + Ok(jwks) => jwks, + Err(e) => { + tracing::warn!(url=?jwks_url, error=?e, "could not decode JWKs"); + return None; + } + }; + + // `jose_jwk::Jwk` is quite large (288 bytes). Let's not pre-allocate for what we don't need. + // + // Even though we limit our responses to 64KiB, we could still receive a payload like + // `{"keys":[` + repeat(`0`).take(30000).join(`,`) + `]}`. Parsing this as `RawValue` uses 468KiB. + // Pre-allocating the corresponding `Vec::::with_capacity(30000)` uses 8.2MiB. + let mut keys = vec![]; + + let mut failed = 0; + for key in jwks.keys { + let key = match serde_json::from_str::(key.get()) { + Ok(key) => key, + Err(e) => { + tracing::debug!(url=?jwks_url, failed=?e, "could not decode JWK"); + failed += 1; + continue; + } + }; + + // if `use` (called `cls` in rust) is specified to be something other than signing, + // we can skip storing it. + if key + .prm + .cls + .as_ref() + .is_some_and(|c| *c != jose_jwk::Class::Signing) + { + continue; + } + + keys.push(key); + } + + keys.shrink_to_fit(); + + if failed > 0 { + tracing::warn!(url=?jwks_url, failed, "could not decode JWKs"); + } + + if keys.is_empty() { + tracing::warn!(url=?jwks_url, "no valid JWKs found inside the response body"); + return None; + } + + Some(jose_jwk::JwkSet { keys }) +} + impl JwkCacheEntryLock { async fn acquire_permit<'a>(self: &'a Arc) -> JwkRenewalPermit<'a> { JwkRenewalPermit::acquire_permit(self).await @@ -144,7 +231,7 @@ impl JwkCacheEntryLock { async fn renew_jwks( &self, _permit: JwkRenewalPermit<'_>, - ctx: &RequestMonitoring, + ctx: &RequestContext, client: &reqwest_middleware::ClientWithMiddleware, endpoint: EndpointId, auth_rules: &F, @@ -166,87 +253,15 @@ impl JwkCacheEntryLock { // TODO(conrad): run concurrently // TODO(conrad): strip the JWKs urls (should be checked by cplane as well - cloud#16284) for rule in rules { - let req = client.get(rule.jwks_url.clone()); - // TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`. - // TODO(conrad): We need to filter out URLs that point to local resources. Public internet only. - match req.send().await.and_then(|r| { - r.error_for_status() - .map_err(reqwest_middleware::Error::Reqwest) - }) { - // todo: should we re-insert JWKs if we want to keep this JWKs URL? - // I expect these failures would be quite sparse. - Err(e) => tracing::warn!(url=?rule.jwks_url, error=?e, "could not fetch JWKs"), - Ok(r) => { - let resp: http::Response = r.into(); - - let bytes = match read_body_with_limit(resp.into_body(), MAX_JWK_BODY_SIZE) - .await - { - Ok(bytes) => bytes, - Err(e) => { - tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs"); - continue; - } - }; - - match serde_json::from_slice::(&bytes) { - Err(e) => { - tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs"); - } - Ok(jwks) => { - // size_of::<&RawValue>() == 16 - // size_of::() == 288 - // better to not pre-allocate this as it might be pretty large - especially if it has many - // keys we don't want or need. - // trivial 'attack': `{"keys":[` + repeat(`0`).take(30000).join(`,`) + `]}` - // this would consume 8MiB just like that! - let mut keys = vec![]; - let mut failed = 0; - for key in jwks.keys { - match serde_json::from_str::(key.get()) { - Ok(key) => { - // if `use` (called `cls` in rust) is specified to be something other than signing, - // we can skip storing it. - if key - .prm - .cls - .as_ref() - .is_some_and(|c| *c != jose_jwk::Class::Signing) - { - continue; - } - - keys.push(key); - } - Err(e) => { - tracing::debug!(url=?rule.jwks_url, failed=?e, "could not decode JWK"); - failed += 1; - } - } - } - keys.shrink_to_fit(); - - if failed > 0 { - tracing::warn!(url=?rule.jwks_url, failed, "could not decode JWKs"); - } - - if keys.is_empty() { - tracing::warn!(url=?rule.jwks_url, "no valid JWKs found inside the response body"); - continue; - } - - let jwks = jose_jwk::JwkSet { keys }; - key_sets.insert( - rule.id, - KeySet { - jwks, - audience: rule.audience, - role_names: rule.role_names, - }, - ); - } - }; - } + if let Some(jwks) = fetch_jwks(client, rule.jwks_url).await { + key_sets.insert( + rule.id, + KeySet { + jwks, + audience: rule.audience, + role_names: rule.role_names, + }, + ); } } @@ -261,7 +276,7 @@ impl JwkCacheEntryLock { async fn get_or_update_jwk_cache( self: &Arc, - ctx: &RequestMonitoring, + ctx: &RequestContext, client: &reqwest_middleware::ClientWithMiddleware, endpoint: EndpointId, fetch: &F, @@ -314,7 +329,7 @@ impl JwkCacheEntryLock { async fn check_jwt( self: &Arc, - ctx: &RequestMonitoring, + ctx: &RequestContext, jwt: &str, client: &reqwest_middleware::ClientWithMiddleware, endpoint: EndpointId, @@ -409,7 +424,7 @@ impl JwkCacheEntryLock { impl JwkCache { pub(crate) async fn check_jwt( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, endpoint: EndpointId, role_name: &RoleName, fetch: &F, @@ -941,7 +956,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL impl FetchAuthRules for Fetch { async fn fetch_auth_rules( &self, - _ctx: &RequestMonitoring, + _ctx: &RequestContext, _endpoint: EndpointId, ) -> Result, FetchAuthRulesError> { Ok(self.0.clone()) @@ -1039,7 +1054,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL for token in &tokens { jwk_cache .check_jwt( - &RequestMonitoring::test(), + &RequestContext::test(), endpoint.clone(), role, &fetch, @@ -1097,7 +1112,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL jwk_cache .check_jwt( - &RequestMonitoring::test(), + &RequestContext::test(), endpoint.clone(), &role_name, &fetch, @@ -1136,7 +1151,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL let ep = EndpointId::from("ep"); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let err = jwk_cache .check_jwt(&ctx, ep, &role, &fetch, &bad_jwt) .await @@ -1175,7 +1190,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL // this role_name is not accepted let bad_role_name = RoleName::from("cloud_admin"); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let err = jwk_cache .check_jwt(&ctx, ep, &bad_role_name, &fetch, &jwt) .await @@ -1268,7 +1283,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL let ep = EndpointId::from("ep"); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); for test in table { let jwt = new_custom_ec_jwt("1".into(), &key, test.body); @@ -1336,7 +1351,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL jwk_cache .check_jwt( - &RequestMonitoring::test(), + &RequestContext::test(), endpoint.clone(), &role_name, &fetch, diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index f9cb085daf..32e0f53615 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -7,7 +7,7 @@ use super::jwt::{AuthRule, FetchAuthRules}; use crate::auth::backend::jwt::FetchAuthRulesError; use crate::compute::ConnCfg; use crate::compute_ctl::ComputeCtlApi; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}; use crate::control_plane::NodeInfo; use crate::http; @@ -56,7 +56,7 @@ pub static JWKS_ROLE_MAP: ArcSwapOption = ArcSwapOption::c impl FetchAuthRules for StaticAuthRules { async fn fetch_auth_rules( &self, - _ctx: &RequestMonitoring, + _ctx: &RequestContext, _endpoint: EndpointId, ) -> Result, FetchAuthRulesError> { let mappings = JWKS_ROLE_MAP.load(); diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 242fe99de2..7e1b26a11a 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -6,7 +6,6 @@ pub mod local; use std::net::IpAddr; use std::sync::Arc; -use std::time::Duration; pub use console_redirect::ConsoleRedirectBackend; pub(crate) use console_redirect::ConsoleRedirectError; @@ -14,13 +13,13 @@ use ipnet::{Ipv4Net, Ipv6Net}; use local::LocalBackend; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_postgres::config::AuthKeys; -use tracing::{info, warn}; +use tracing::{debug, info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; use crate::auth::{self, validate_password_and_exchange, AuthError, ComputeUserInfoMaybeEndpoint}; use crate::cache::Cached; use crate::config::AuthenticationConfig; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::client::ControlPlaneClient; use crate::control_plane::errors::GetAuthInfoError; use crate::control_plane::{ @@ -30,7 +29,7 @@ use crate::intern::EndpointIdInt; use crate::metrics::Metrics; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; -use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter, RateBucketInfo}; +use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter}; use crate::stream::Stream; use crate::types::{EndpointCacheKey, EndpointId, RoleName}; use crate::{scram, stream}; @@ -192,25 +191,10 @@ impl MaskedIp { // This can't be just per IP because that would limit some PaaS that share IP addresses pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, MaskedIp)>; -impl RateBucketInfo { - /// All of these are per endpoint-maskedip pair. - /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus). - /// - /// First bucket: 1000mcpus total per endpoint-ip pair - /// * 4096000 requests per second with 1 hash rounds. - /// * 1000 requests per second with 4096 hash rounds. - /// * 6.8 requests per second with 600000 hash rounds. - pub const DEFAULT_AUTH_SET: [Self; 3] = [ - Self::new(1000 * 4096, Duration::from_secs(1)), - Self::new(600 * 4096, Duration::from_secs(60)), - Self::new(300 * 4096, Duration::from_secs(600)), - ]; -} - impl AuthenticationConfig { pub(crate) fn check_rate_limit( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, secret: AuthSecret, endpoint: &EndpointId, is_cleartext: bool, @@ -265,7 +249,7 @@ impl AuthenticationConfig { /// /// All authentication flows will emit an AuthenticationOk message if successful. async fn auth_quirks( - ctx: &RequestMonitoring, + ctx: &RequestContext, api: &impl control_plane::ControlPlaneApi, user_info: ComputeUserInfoMaybeEndpoint, client: &mut stream::PqStream>, @@ -286,7 +270,7 @@ async fn auth_quirks( Ok(info) => (info, None), }; - info!("fetching user's authentication info"); + debug!("fetching user's authentication info"); let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?; // check allowed list @@ -343,7 +327,7 @@ async fn auth_quirks( } async fn authenticate_with_secret( - ctx: &RequestMonitoring, + ctx: &RequestContext, secret: AuthSecret, info: ComputeUserInfo, client: &mut stream::PqStream>, @@ -396,7 +380,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)] pub(crate) async fn authenticate( self, - ctx: &RequestMonitoring, + ctx: &RequestContext, client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, @@ -404,7 +388,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { ) -> auth::Result> { let res = match self { Self::ControlPlane(api, user_info) => { - info!( + debug!( user = &*user_info.user, project = user_info.endpoint(), "performing authentication using the console" @@ -427,6 +411,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { } }; + // TODO: replace with some metric info!("user successfully authenticated"); Ok(res) } @@ -435,7 +420,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { impl Backend<'_, ComputeUserInfo> { pub(crate) async fn get_role_secret( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, ) -> Result { match self { Self::ControlPlane(api, user_info) => api.get_role_secret(ctx, user_info).await, @@ -445,7 +430,7 @@ impl Backend<'_, ComputeUserInfo> { pub(crate) async fn get_allowed_ips_and_secret( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { match self { Self::ControlPlane(api, user_info) => { @@ -460,7 +445,7 @@ impl Backend<'_, ComputeUserInfo> { impl ComputeConnectBackend for Backend<'_, ComputeCredentials> { async fn wake_compute( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, ) -> Result { match self { Self::ControlPlane(api, creds) => api.wake_compute(ctx, &creds.info).await, @@ -496,7 +481,7 @@ mod tests { use crate::auth::backend::MaskedIp; use crate::auth::{ComputeUserInfoMaybeEndpoint, IpPattern}; use crate::config::AuthenticationConfig; - use crate::context::RequestMonitoring; + use crate::context::RequestContext; use crate::control_plane::{self, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret}; use crate::proxy::NeonOptions; use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo}; @@ -512,7 +497,7 @@ mod tests { impl control_plane::ControlPlaneApi for Auth { async fn get_role_secret( &self, - _ctx: &RequestMonitoring, + _ctx: &RequestContext, _user_info: &super::ComputeUserInfo, ) -> Result { Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone()))) @@ -520,7 +505,7 @@ mod tests { async fn get_allowed_ips_and_secret( &self, - _ctx: &RequestMonitoring, + _ctx: &RequestContext, _user_info: &super::ComputeUserInfo, ) -> Result< (CachedAllowedIps, Option), @@ -534,7 +519,7 @@ mod tests { async fn get_endpoint_jwks( &self, - _ctx: &RequestMonitoring, + _ctx: &RequestContext, _endpoint: crate::types::EndpointId, ) -> Result, control_plane::errors::GetEndpointJwksError> { @@ -543,7 +528,7 @@ mod tests { async fn wake_compute( &self, - _ctx: &RequestMonitoring, + _ctx: &RequestContext, _user_info: &super::ComputeUserInfo, ) -> Result { unimplemented!() @@ -622,7 +607,7 @@ mod tests { let (mut client, server) = tokio::io::duplex(1024); let mut stream = PqStream::new(Stream::from_raw(server)); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let api = Auth { ips: vec![], secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), @@ -699,7 +684,7 @@ mod tests { let (mut client, server) = tokio::io::duplex(1024); let mut stream = PqStream::new(Stream::from_raw(server)); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let api = Auth { ips: vec![], secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), @@ -751,7 +736,7 @@ mod tests { let (mut client, server) = tokio::io::duplex(1024); let mut stream = PqStream::new(Stream::from_raw(server)); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let api = Auth { ips: vec![], secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index ddecae6af5..f6bce9f2d8 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -7,10 +7,10 @@ use std::str::FromStr; use itertools::Itertools; use pq_proto::StartupMessageParams; use thiserror::Error; -use tracing::{info, warn}; +use tracing::{debug, warn}; use crate::auth::password_hack::parse_endpoint_param; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::error::{ReportableError, UserFacingError}; use crate::metrics::{Metrics, SniKind}; use crate::proxy::NeonOptions; @@ -86,7 +86,7 @@ pub(crate) fn endpoint_sni( impl ComputeUserInfoMaybeEndpoint { pub(crate) fn parse( - ctx: &RequestMonitoring, + ctx: &RequestContext, params: &StartupMessageParams, sni: Option<&str>, common_names: Option<&HashSet>, @@ -147,22 +147,22 @@ impl ComputeUserInfoMaybeEndpoint { } let metrics = Metrics::get(); - info!(%user, "credentials"); + debug!(%user, "credentials"); if sni.is_some() { - info!("Connection with sni"); + debug!("Connection with sni"); metrics.proxy.accepted_connections_by_sni.inc(SniKind::Sni); } else if endpoint.is_some() { metrics .proxy .accepted_connections_by_sni .inc(SniKind::NoSni); - info!("Connection without sni"); + debug!("Connection without sni"); } else { metrics .proxy .accepted_connections_by_sni .inc(SniKind::PasswordHack); - info!("Connection with password hack"); + debug!("Connection with password hack"); } let options = NeonOptions::parse_params(params); @@ -260,7 +260,7 @@ mod tests { fn parse_bare_minimum() -> anyhow::Result<()> { // According to postgresql, only `user` should be required. let options = StartupMessageParams::new([("user", "john_doe")]); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id, None); @@ -275,7 +275,7 @@ mod tests { ("database", "world"), // should be ignored ("foo", "bar"), // should be ignored ]); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id, None); @@ -290,7 +290,7 @@ mod tests { let sni = Some("foo.localhost"); let common_names = Some(["localhost".into()].into()); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.user, "john_doe"); @@ -307,7 +307,7 @@ mod tests { ("options", "-ckey=1 project=bar -c geqo=off"), ]); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("bar")); @@ -322,7 +322,7 @@ mod tests { ("options", "-ckey=1 endpoint=bar -c geqo=off"), ]); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("bar")); @@ -340,7 +340,7 @@ mod tests { ), ]); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert!(user_info.endpoint_id.is_none()); @@ -355,7 +355,7 @@ mod tests { ("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"), ]); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert!(user_info.endpoint_id.is_none()); @@ -370,7 +370,7 @@ mod tests { let sni = Some("baz.localhost"); let common_names = Some(["localhost".into()].into()); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.user, "john_doe"); @@ -385,14 +385,14 @@ mod tests { let common_names = Some(["a.com".into(), "b.com".into()].into()); let sni = Some("p1.a.com"); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.endpoint_id.as_deref(), Some("p1")); let common_names = Some(["a.com".into(), "b.com".into()].into()); let sni = Some("p1.b.com"); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.endpoint_id.as_deref(), Some("p1")); @@ -408,7 +408,7 @@ mod tests { let sni = Some("second.localhost"); let common_names = Some(["localhost".into()].into()); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) .expect_err("should fail"); match err { @@ -427,7 +427,7 @@ mod tests { let sni = Some("project.localhost"); let common_names = Some(["example.com".into()].into()); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) .expect_err("should fail"); match err { @@ -447,7 +447,7 @@ mod tests { let sni = Some("project.localhost"); let common_names = Some(["localhost".into()].into()); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.endpoint_id.as_deref(), Some("project")); diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 6294549ff6..9c6ce151cb 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -11,7 +11,7 @@ use tracing::info; use super::backend::ComputeCredentialKeys; use super::{AuthError, PasswordHackPayload}; use crate::config::TlsServerEndPoint; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::AuthSecret; use crate::intern::EndpointIdInt; use crate::sasl; @@ -32,7 +32,7 @@ pub(crate) struct Begin; /// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`]. pub(crate) struct Scram<'a>( pub(crate) &'a scram::ServerSecret, - pub(crate) &'a RequestMonitoring, + pub(crate) &'a RequestContext, ); impl AuthMethod for Scram<'_> { @@ -178,6 +178,8 @@ impl AuthFlow<'_, S, Scram<'_>> { SCRAM_SHA_256_PLUS => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256Plus), _ => {} } + + // TODO: make this a metric instead info!("client chooses {}", sasl.method); let outcome = sasl::SaslStream::new(self.stream, sasl.message) diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index 41b0e11e85..968682cf0f 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -111,7 +111,7 @@ struct SqlOverHttpArgs { sql_over_http_cancel_set_shards: usize, #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB - sql_over_http_max_request_size_bytes: u64, + sql_over_http_max_request_size_bytes: usize, #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB sql_over_http_max_response_size_bytes: usize, @@ -125,6 +125,7 @@ async fn main() -> anyhow::Result<()> { Metrics::install(Arc::new(ThreadPoolMetrics::new(0))); + // TODO: refactor these to use labels debug!("Version: {GIT_VERSION}"); debug!("Build_tag: {BUILD_TAG}"); let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index ef5b5e8509..623a0fd3b2 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -11,7 +11,7 @@ use futures::future::Either; use futures::TryFutureExt; use itertools::Itertools; use proxy::config::TlsServerEndPoint; -use proxy::context::RequestMonitoring; +use proxy::context::RequestContext; use proxy::metrics::{Metrics, ThreadPoolMetrics}; use proxy::protocol2::ConnectionInfo; use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; @@ -177,7 +177,7 @@ async fn task_main( .context("failed to set socket option")?; info!(%peer_addr, "serving"); - let ctx = RequestMonitoring::new( + let ctx = RequestContext::new( session_id, ConnectionInfo { addr: peer_addr, @@ -208,7 +208,7 @@ async fn task_main( const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; async fn ssl_handshake( - ctx: &RequestMonitoring, + ctx: &RequestContext, raw_stream: S, tls_config: Arc, tls_server_end_point: TlsServerEndPoint, @@ -259,7 +259,7 @@ async fn ssl_handshake( } async fn handle_client( - ctx: RequestMonitoring, + ctx: RequestContext, dest_suffix: Arc, tls_config: Arc, tls_server_end_point: TlsServerEndPoint, diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index fda5b25961..a935378162 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -276,7 +276,7 @@ struct SqlOverHttpArgs { sql_over_http_cancel_set_shards: usize, #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB - sql_over_http_max_request_size_bytes: u64, + sql_over_http_max_request_size_bytes: usize, #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB sql_over_http_max_response_size_bytes: usize, @@ -288,6 +288,7 @@ async fn main() -> anyhow::Result<()> { let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + // TODO: refactor these to use labels info!("Version: {GIT_VERSION}"); info!("Build_tag: {BUILD_TAG}"); let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { @@ -427,8 +428,9 @@ async fn main() -> anyhow::Result<()> { )?))), None => None, }; + let cancellation_handler = Arc::new(CancellationHandler::< - Option>>, + Option>>, >::new( cancel_map.clone(), redis_publisher, diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 07769e053c..20db1fbb14 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -11,7 +11,7 @@ use tokio_util::sync::CancellationToken; use tracing::info; use crate::config::EndpointCacheConfig; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; use crate::rate_limiter::GlobalRateLimiter; @@ -75,7 +75,7 @@ impl EndpointsCache { } } - pub(crate) fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool { + pub(crate) fn is_valid(&self, ctx: &RequestContext, endpoint: &EndpointId) -> bool { if !self.ready.load(Ordering::Acquire) { // the endpoint cache is not yet fully initialised. return true; diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index db0970adcb..4b72a66e63 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -7,19 +7,26 @@ use thiserror::Error; use tokio::net::TcpStream; use tokio::sync::Mutex; use tokio_postgres::{CancelToken, NoTls}; -use tracing::info; +use tracing::{debug, info}; use uuid::Uuid; +use crate::auth::{check_peer_addr_is_in_list, IpPattern}; use crate::error::ReportableError; use crate::metrics::{CancellationRequest, CancellationSource, Metrics}; +use crate::rate_limiter::LeakyBucketRateLimiter; use crate::redis::cancellation_publisher::{ CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, }; +use std::net::IpAddr; + +use ipnet::{IpNet, Ipv4Net, Ipv6Net}; pub type CancelMap = Arc>>; pub type CancellationHandlerMain = CancellationHandler>>>; pub(crate) type CancellationHandlerMainInternal = Option>>; +type IpSubnetKey = IpNet; + /// Enables serving `CancelRequest`s. /// /// If `CancellationPublisher` is available, cancel request will be used to publish the cancellation key to other proxy instances. @@ -29,14 +36,23 @@ pub struct CancellationHandler

{ /// This field used for the monitoring purposes. /// Represents the source of the cancellation request. from: CancellationSource, + // rate limiter of cancellation requests + limiter: Arc>>, } #[derive(Debug, Error)] pub(crate) enum CancelError { #[error("{0}")] IO(#[from] std::io::Error), + #[error("{0}")] Postgres(#[from] tokio_postgres::Error), + + #[error("rate limit exceeded")] + RateLimit, + + #[error("IP is not allowed")] + IpNotAllowed, } impl ReportableError for CancelError { @@ -47,6 +63,8 @@ impl ReportableError for CancelError { crate::error::ErrorKind::Postgres } CancelError::Postgres(_) => crate::error::ErrorKind::Compute, + CancelError::RateLimit => crate::error::ErrorKind::RateLimit, + CancelError::IpNotAllowed => crate::error::ErrorKind::User, } } } @@ -73,19 +91,42 @@ impl CancellationHandler

{ break key; }; - info!("registered new query cancellation key {key}"); + debug!("registered new query cancellation key {key}"); Session { key, cancellation_handler: self, } } + /// Try to cancel a running query for the corresponding connection. /// If the cancellation key is not found, it will be published to Redis. + /// check_allowed - if true, check if the IP is allowed to cancel the query pub(crate) async fn cancel_session( &self, key: CancelKeyData, session_id: Uuid, + peer_addr: &IpAddr, + check_allowed: bool, ) -> Result<(), CancelError> { + // TODO: check for unspecified address is only for backward compatibility, should be removed + if !peer_addr.is_unspecified() { + let subnet_key = match *peer_addr { + IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use defaut mask here + IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()), + }; + if !self.limiter.lock().unwrap().check(subnet_key, 1) { + tracing::debug!("Rate limit exceeded. Skipping cancellation message"); + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::RateLimitExceeded, + }); + return Err(CancelError::RateLimit); + } + } + // NB: we should immediately release the lock after cloning the token. let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else { tracing::warn!("query cancellation key not found: {key}"); @@ -96,7 +137,13 @@ impl CancellationHandler

{ source: self.from, kind: crate::metrics::CancellationOutcome::NotFound, }); - match self.client.try_publish(key, session_id).await { + + if session_id == Uuid::nil() { + // was already published, do not publish it again + return Ok(()); + } + + match self.client.try_publish(key, session_id, *peer_addr).await { Ok(()) => {} // do nothing Err(e) => { return Err(CancelError::IO(std::io::Error::new( @@ -107,6 +154,13 @@ impl CancellationHandler

{ } return Ok(()); }; + + if check_allowed + && !check_peer_addr_is_in_list(peer_addr, cancel_closure.ip_allowlist.as_slice()) + { + return Err(CancelError::IpNotAllowed); + } + Metrics::get() .proxy .cancellation_requests_total @@ -135,13 +189,29 @@ impl CancellationHandler<()> { map, client: (), from, + limiter: Arc::new(std::sync::Mutex::new( + LeakyBucketRateLimiter::::new_with_shards( + LeakyBucketRateLimiter::::DEFAULT, + 64, + ), + )), } } } impl CancellationHandler>>> { pub fn new(map: CancelMap, client: Option>>, from: CancellationSource) -> Self { - Self { map, client, from } + Self { + map, + client, + from, + limiter: Arc::new(std::sync::Mutex::new( + LeakyBucketRateLimiter::::new_with_shards( + LeakyBucketRateLimiter::::DEFAULT, + 64, + ), + )), + } } } @@ -152,22 +222,31 @@ impl CancellationHandler>>> { pub struct CancelClosure { socket_addr: SocketAddr, cancel_token: CancelToken, + ip_allowlist: Vec, } impl CancelClosure { - pub(crate) fn new(socket_addr: SocketAddr, cancel_token: CancelToken) -> Self { + pub(crate) fn new( + socket_addr: SocketAddr, + cancel_token: CancelToken, + ip_allowlist: Vec, + ) -> Self { Self { socket_addr, cancel_token, + ip_allowlist, } } /// Cancels the query running on user's compute node. pub(crate) async fn try_cancel_query(self) -> Result<(), CancelError> { let socket = TcpStream::connect(self.socket_addr).await?; self.cancel_token.cancel_query_raw(socket, NoTls).await?; - info!("query was cancelled"); + debug!("query was cancelled"); Ok(()) } + pub(crate) fn set_ip_allowlist(&mut self, ip_allowlist: Vec) { + self.ip_allowlist = ip_allowlist; + } } /// Helper for registering query cancellation tokens. @@ -182,7 +261,7 @@ impl

Session

{ /// Store the cancel token for the given session. /// This enables query cancellation in `crate::proxy::prepare_client_connection`. pub(crate) fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData { - info!("enabling query cancellation for this session"); + debug!("enabling query cancellation for this session"); self.cancellation_handler .map .insert(self.key, Some(cancel_closure)); @@ -194,7 +273,7 @@ impl

Session

{ impl

Drop for Session

{ fn drop(&mut self) { self.cancellation_handler.map.remove(&self.key); - info!("dropped query cancellation key {}", &self.key); + debug!("dropped query cancellation key {}", &self.key); } } @@ -229,6 +308,8 @@ mod tests { cancel_key: 0, }, Uuid::new_v4(), + &("127.0.0.1".parse().unwrap()), + true, ) .await .unwrap(); diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index ca4a348ed8..8408d4720b 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -14,11 +14,11 @@ use thiserror::Error; use tokio::net::TcpStream; use tokio_postgres::tls::MakeTlsConnect; use tokio_postgres_rustls::MakeRustlsConnect; -use tracing::{error, info, warn}; +use tracing::{debug, error, info, warn}; use crate::auth::parse_endpoint_param; use crate::cancellation::CancelClosure; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::client::ApiLockError; use crate::control_plane::errors::WakeComputeError; use crate::control_plane::messages::MetricsAuxInfo; @@ -213,7 +213,7 @@ impl ConnCfg { }; let connect_once = |host, port| { - info!("trying to connect to compute node at {host}:{port}"); + debug!("trying to connect to compute node at {host}:{port}"); connect_with_timeout(host, port).and_then(|socket| async { let socket_addr = socket.peer_addr()?; // This prevents load balancer from severing the connection. @@ -286,7 +286,7 @@ impl ConnCfg { /// Connect to a corresponding compute node. pub(crate) async fn connect( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, allow_self_signed_compute: bool, aux: MetricsAuxInfo, timeout: Duration, @@ -328,6 +328,7 @@ impl ConnCfg { tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); let stream = connection.stream.into_inner(); + // TODO: lots of useful info but maybe we can move it elsewhere (eg traces?) info!( cold_start_info = ctx.cold_start_info().as_str(), "connected to compute node at {host} ({socket_addr}) sslmode={:?}", @@ -341,7 +342,7 @@ impl ConnCfg { // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw. // Yet another reason to rework the connection establishing code. - let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token()); + let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token(), vec![]); let connection = PostgresConnection { stream, diff --git a/proxy/src/config.rs b/proxy/src/config.rs index b048c9d389..8bc8e3f96f 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -64,7 +64,7 @@ pub struct HttpConfig { pub pool_options: GlobalConnPoolOptions, pub cancel_set: CancelSet, pub client_conn_threshold: u64, - pub max_request_size_bytes: u64, + pub max_request_size_bytes: usize, pub max_response_size_bytes: usize, } diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index cc456f3667..fbd0c8e5c5 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -8,7 +8,7 @@ use tracing::{debug, error, info, Instrument}; use crate::auth::backend::ConsoleRedirectBackend; use crate::cancellation::{CancellationHandlerMain, CancellationHandlerMainInternal}; use crate::config::{ProxyConfig, ProxyProtocolV2}; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::error::ReportableError; use crate::metrics::{Metrics, NumClientConnectionsGuard}; use crate::protocol2::{read_proxy_protocol, ConnectHeader, ConnectionInfo}; @@ -82,7 +82,7 @@ pub async fn task_main( } }; - let ctx = RequestMonitoring::new( + let ctx = RequestContext::new( session_id, peer_addr, crate::metrics::Protocol::Tcp, @@ -141,12 +141,12 @@ pub async fn task_main( pub(crate) async fn handle_client( config: &'static ProxyConfig, backend: &'static ConsoleRedirectBackend, - ctx: &RequestMonitoring, + ctx: &RequestContext, cancellation_handler: Arc, stream: S, conn_gauge: NumClientConnectionsGuard<'static>, ) -> Result>, ClientRequestError> { - info!( + debug!( protocol = %ctx.protocol(), "handling interactive connection from client" ); @@ -156,16 +156,21 @@ pub(crate) async fn handle_client( let request_gauge = metrics.connection_requests.guard(proto); let tls = config.tls_config.as_ref(); - let record_handshake_error = !ctx.has_private_peer_addr(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); let do_handshake = handshake(ctx, stream, tls, record_handshake_error); + let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(cancel_key_data) => { return Ok(cancellation_handler - .cancel_session(cancel_key_data, ctx.session_id()) + .cancel_session( + cancel_key_data, + ctx.session_id(), + &ctx.peer_addr(), + config.authentication_config.ip_allowlist_check_enabled, + ) .await .map(|()| None)?) } @@ -174,7 +179,7 @@ pub(crate) async fn handle_client( ctx.set_db_options(params.clone()); - let user_info = match backend + let (user_info, ip_allowlist) = match backend .authenticate(ctx, &config.authentication_config, &mut stream) .await { @@ -198,6 +203,8 @@ pub(crate) async fn handle_client( .or_else(|e| stream.throw_error(e)) .await?; + node.cancel_closure + .set_ip_allowlist(ip_allowlist.unwrap_or_default()); let session = cancellation_handler.get_session(); prepare_client_connection(&node, &session, &mut stream).await?; diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 6cf99c0c97..6d2d2d51ce 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -8,7 +8,7 @@ use pq_proto::StartupMessageParams; use smol_str::SmolStr; use tokio::sync::mpsc; use tracing::field::display; -use tracing::{debug, info, info_span, Span}; +use tracing::{debug, info_span, Span}; use try_lock::TryLock; use uuid::Uuid; @@ -32,15 +32,15 @@ pub(crate) static LOG_CHAN_DISCONNECT: OnceCell, + TryLock, ); -struct RequestMonitoringInner { +struct RequestContextInner { pub(crate) conn_info: ConnectionInfo, pub(crate) session_id: Uuid, pub(crate) protocol: Protocol, @@ -81,10 +81,10 @@ pub(crate) enum AuthMethod { Cleartext, } -impl Clone for RequestMonitoring { +impl Clone for RequestContext { fn clone(&self) -> Self { let inner = self.0.try_lock().expect("should not deadlock"); - let new = RequestMonitoringInner { + let new = RequestContextInner { conn_info: inner.conn_info.clone(), session_id: inner.session_id, protocol: inner.protocol, @@ -115,13 +115,14 @@ impl Clone for RequestMonitoring { } } -impl RequestMonitoring { +impl RequestContext { pub fn new( session_id: Uuid, conn_info: ConnectionInfo, protocol: Protocol, region: &'static str, ) -> Self { + // TODO: be careful with long lived spans let span = info_span!( "connect_request", %protocol, @@ -131,7 +132,7 @@ impl RequestMonitoring { role = tracing::field::Empty, ); - let inner = RequestMonitoringInner { + let inner = RequestContextInner { conn_info, session_id, protocol, @@ -167,7 +168,7 @@ impl RequestMonitoring { let ip = IpAddr::from([127, 0, 0, 1]); let addr = SocketAddr::new(ip, 5432); let conn_info = ConnectionInfo { addr, extra: None }; - RequestMonitoring::new(Uuid::now_v7(), conn_info, Protocol::Tcp, "test") + RequestContext::new(Uuid::now_v7(), conn_info, Protocol::Tcp, "test") } pub(crate) fn console_application_name(&self) -> String { @@ -324,7 +325,7 @@ impl RequestMonitoring { } pub(crate) struct LatencyTimerPause<'a> { - ctx: &'a RequestMonitoring, + ctx: &'a RequestContext, start: tokio::time::Instant, waiting_for: Waiting, } @@ -340,7 +341,7 @@ impl Drop for LatencyTimerPause<'_> { } } -impl RequestMonitoringInner { +impl RequestContextInner { fn set_cold_start_info(&mut self, info: ColdStartInfo) { self.cold_start_info = info; self.latency_timer.cold_start_info(info); @@ -384,6 +385,10 @@ impl RequestMonitoringInner { } else { ConnectOutcome::Failed }; + + // TODO: get rid of entirely/refactor + // check for false positives + // AND false negatives if let Some(rejected) = self.rejected { let ep = self .endpoint_id @@ -391,7 +396,7 @@ impl RequestMonitoringInner { .map(|x| x.as_str()) .unwrap_or_default(); // This makes sense only if cache is disabled - info!( + debug!( ?outcome, ?rejected, ?ep, @@ -425,7 +430,7 @@ impl RequestMonitoringInner { } } -impl Drop for RequestMonitoringInner { +impl Drop for RequestContextInner { fn drop(&mut self) { if self.sender.is_some() { self.log_connect(); diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 4112de646f..9bf3a275bb 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -20,7 +20,7 @@ use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; use utils::backoff; -use super::{RequestMonitoringInner, LOG_CHAN}; +use super::{RequestContextInner, LOG_CHAN}; use crate::config::remote_storage_from_toml; use crate::context::LOG_CHAN_DISCONNECT; @@ -117,8 +117,8 @@ impl serde::Serialize for Options<'_> { } } -impl From<&RequestMonitoringInner> for RequestData { - fn from(value: &RequestMonitoringInner) -> Self { +impl From<&RequestContextInner> for RequestData { + fn from(value: &RequestContextInner) -> Self { Self { session_id: value.session_id, peer_addr: value.conn_info.addr.ip().to_string(), diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index fd333d2aac..9537d717a1 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -13,7 +13,7 @@ use crate::auth::backend::jwt::AuthRule; use crate::auth::backend::ComputeUserInfo; use crate::auth::IpPattern; use crate::cache::Cached; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::client::{CachedAllowedIps, CachedRoleSecret}; use crate::control_plane::errors::{ ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError, @@ -114,7 +114,7 @@ impl MockControlPlane { Ok((secret, allowed_ips)) } - .map_err(crate::error::log_error::) + .inspect_err(|e: &GetAuthInfoError| tracing::error!("{e}")) .instrument(info_span!("postgres", url = self.endpoint.as_str())) .await?; Ok(AuthInfo { @@ -206,7 +206,7 @@ impl super::ControlPlaneApi for MockControlPlane { #[tracing::instrument(skip_all)] async fn get_role_secret( &self, - _ctx: &RequestMonitoring, + _ctx: &RequestContext, user_info: &ComputeUserInfo, ) -> Result { Ok(CachedRoleSecret::new_uncached( @@ -216,7 +216,7 @@ impl super::ControlPlaneApi for MockControlPlane { async fn get_allowed_ips_and_secret( &self, - _ctx: &RequestMonitoring, + _ctx: &RequestContext, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { Ok(( @@ -229,7 +229,7 @@ impl super::ControlPlaneApi for MockControlPlane { async fn get_endpoint_jwks( &self, - _ctx: &RequestMonitoring, + _ctx: &RequestContext, endpoint: EndpointId, ) -> Result, GetEndpointJwksError> { self.do_get_endpoint_jwks(endpoint).await @@ -238,7 +238,7 @@ impl super::ControlPlaneApi for MockControlPlane { #[tracing::instrument(skip_all)] async fn wake_compute( &self, - _ctx: &RequestMonitoring, + _ctx: &RequestContext, _user_info: &ComputeUserInfo, ) -> Result { self.do_wake_compute().map_ok(Cached::new_uncached).await diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs index e388d8a538..f8f74372f0 100644 --- a/proxy/src/control_plane/client/mod.rs +++ b/proxy/src/control_plane/client/mod.rs @@ -8,14 +8,14 @@ use std::time::Duration; use dashmap::DashMap; use tokio::time::Instant; -use tracing::info; +use tracing::{debug, info}; use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}; use crate::auth::backend::ComputeUserInfo; use crate::cache::endpoints::EndpointsCache; use crate::cache::project_info::ProjectInfoCacheImpl; use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::{ errors, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, NodeInfoCache, }; @@ -41,7 +41,7 @@ pub enum ControlPlaneClient { impl ControlPlaneApi for ControlPlaneClient { async fn get_role_secret( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, user_info: &ComputeUserInfo, ) -> Result { match self { @@ -57,7 +57,7 @@ impl ControlPlaneApi for ControlPlaneClient { async fn get_allowed_ips_and_secret( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError> { match self { @@ -71,7 +71,7 @@ impl ControlPlaneApi for ControlPlaneClient { async fn get_endpoint_jwks( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, endpoint: EndpointId, ) -> Result, errors::GetEndpointJwksError> { match self { @@ -85,7 +85,7 @@ impl ControlPlaneApi for ControlPlaneClient { async fn wake_compute( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, user_info: &ComputeUserInfo, ) -> Result { match self { @@ -214,7 +214,7 @@ impl ApiLocks { self.metrics .semaphore_acquire_seconds .observe(now.elapsed().as_secs_f64()); - info!("acquired permit {:?}", now.elapsed().as_secs_f64()); + debug!("acquired permit {:?}", now.elapsed().as_secs_f64()); Ok(WakeComputePermit { permit: permit? }) } @@ -271,7 +271,7 @@ impl WakeComputePermit { impl FetchAuthRules for ControlPlaneClient { async fn fetch_auth_rules( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, endpoint: EndpointId, ) -> Result, FetchAuthRulesError> { self.get_endpoint_jwks(ctx, endpoint) diff --git a/proxy/src/control_plane/client/neon.rs b/proxy/src/control_plane/client/neon.rs index 26ff4e1402..2cad981d01 100644 --- a/proxy/src/control_plane/client/neon.rs +++ b/proxy/src/control_plane/client/neon.rs @@ -14,7 +14,7 @@ use super::super::messages::{ControlPlaneErrorMessage, GetRoleSecret, WakeComput use crate::auth::backend::jwt::AuthRule; use crate::auth::backend::ComputeUserInfo; use crate::cache::Cached; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::caches::ApiCaches; use crate::control_plane::errors::{ ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError, @@ -65,7 +65,7 @@ impl NeonControlPlaneClient { async fn do_get_auth_info( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, user_info: &ComputeUserInfo, ) -> Result { if !self @@ -73,6 +73,8 @@ impl NeonControlPlaneClient { .endpoints_cache .is_valid(ctx, &user_info.endpoint.normalize()) { + // TODO: refactor this because it's weird + // this is a failure to authenticate but we return Ok. info!("endpoint is not valid, skipping the request"); return Ok(AuthInfo::default()); } @@ -92,7 +94,7 @@ impl NeonControlPlaneClient { ]) .build()?; - info!(url = request.url().as_str(), "sending http request"); + debug!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); let response = self.endpoint.execute(request).await?; @@ -104,10 +106,12 @@ impl NeonControlPlaneClient { // TODO(anna): retry Err(e) => { return if e.get_reason().is_not_found() { + // TODO: refactor this because it's weird + // this is a failure to authenticate but we return Ok. Ok(AuthInfo::default()) } else { Err(e.into()) - } + }; } }; @@ -130,14 +134,14 @@ impl NeonControlPlaneClient { project_id: body.project_id, }) } - .map_err(crate::error::log_error) - .instrument(info_span!("http", id = request_id)) + .inspect_err(|e| tracing::debug!(error = ?e)) + .instrument(info_span!("do_get_auth_info")) .await } async fn do_get_endpoint_jwks( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, endpoint: EndpointId, ) -> Result, GetEndpointJwksError> { if !self @@ -163,7 +167,7 @@ impl NeonControlPlaneClient { .build() .map_err(GetEndpointJwksError::RequestBuild)?; - info!(url = request.url().as_str(), "sending http request"); + debug!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); let response = self @@ -189,14 +193,14 @@ impl NeonControlPlaneClient { Ok(rules) } - .map_err(crate::error::log_error) - .instrument(info_span!("http", id = request_id)) + .inspect_err(|e| tracing::debug!(error = ?e)) + .instrument(info_span!("do_get_endpoint_jwks")) .await } async fn do_wake_compute( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, user_info: &ComputeUserInfo, ) -> Result { let request_id = ctx.session_id().to_string(); @@ -220,7 +224,7 @@ impl NeonControlPlaneClient { let request = request_builder.build()?; - info!(url = request.url().as_str(), "sending http request"); + debug!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); let response = self.endpoint.execute(request).await?; @@ -248,8 +252,8 @@ impl NeonControlPlaneClient { Ok(node) } - .map_err(crate::error::log_error) - .instrument(info_span!("http", id = request_id)) + .inspect_err(|e| tracing::debug!(error = ?e)) + .instrument(info_span!("do_wake_compute")) .await } } @@ -258,7 +262,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { #[tracing::instrument(skip_all)] async fn get_role_secret( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, user_info: &ComputeUserInfo, ) -> Result { let normalized_ep = &user_info.endpoint.normalize(); @@ -292,7 +296,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { async fn get_allowed_ips_and_secret( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { let normalized_ep = &user_info.endpoint.normalize(); @@ -334,7 +338,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { #[tracing::instrument(skip_all)] async fn get_endpoint_jwks( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, endpoint: EndpointId, ) -> Result, GetEndpointJwksError> { self.do_get_endpoint_jwks(ctx, endpoint).await @@ -343,7 +347,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { #[tracing::instrument(skip_all)] async fn wake_compute( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, user_info: &ComputeUserInfo, ) -> Result { let key = user_info.endpoint_cache_key(); @@ -375,6 +379,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { // after getting back a permit - it's possible the cache was filled // double check if permit.should_check_cache() { + // TODO: if there is something in the cache, mark the permit as success. check_cache!(); } diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index 70607ac0d0..41972e4e44 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -17,7 +17,7 @@ use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; use crate::auth::IpPattern; use crate::cache::project_info::ProjectInfoCacheImpl; use crate::cache::{Cached, TimedLru}; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::messages::{ControlPlaneErrorMessage, MetricsAuxInfo}; use crate::intern::ProjectIdInt; use crate::types::{EndpointCacheKey, EndpointId}; @@ -75,7 +75,7 @@ pub(crate) struct NodeInfo { impl NodeInfo { pub(crate) async fn connect( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, timeout: Duration, ) -> Result { self.config @@ -116,26 +116,26 @@ pub(crate) trait ControlPlaneApi { /// We still have to mock the scram to avoid leaking information that user doesn't exist. async fn get_role_secret( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, user_info: &ComputeUserInfo, ) -> Result; async fn get_allowed_ips_and_secret( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError>; async fn get_endpoint_jwks( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, endpoint: EndpointId, ) -> Result, errors::GetEndpointJwksError>; /// Wake up the compute node and return the corresponding connection info. async fn wake_compute( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, user_info: &ComputeUserInfo, ) -> Result; } diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 7b693a7418..2221aac407 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -10,12 +10,6 @@ pub(crate) fn io_error(e: impl Into>) -> io::Err io::Error::new(io::ErrorKind::Other, e) } -/// A small combinator for pluggable error logging. -pub(crate) fn log_error(e: E) -> E { - tracing::error!("{e}"); - e -} - /// Marks errors that may be safely shown to a client. /// This trait can be seen as a specialized version of [`ToString`]. /// diff --git a/proxy/src/http/mod.rs b/proxy/src/http/mod.rs index b1642cedb3..ed88c77256 100644 --- a/proxy/src/http/mod.rs +++ b/proxy/src/http/mod.rs @@ -122,18 +122,18 @@ impl Endpoint { } #[derive(Error, Debug)] -pub(crate) enum ReadBodyError { +pub(crate) enum ReadBodyError { #[error("Content length exceeds limit of {limit} bytes")] BodyTooLarge { limit: usize }, #[error(transparent)] - Read(#[from] reqwest::Error), + Read(#[from] E), } -pub(crate) async fn read_body_with_limit( - mut b: impl Body + Unpin, +pub(crate) async fn read_body_with_limit( + mut b: impl Body + Unpin, limit: usize, -) -> Result, ReadBodyError> { +) -> Result, ReadBodyError> { // We could use `b.limited().collect().await.to_bytes()` here // but this ends up being slightly more efficient as far as I can tell. diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index f91fcd4120..659c57c865 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -351,6 +351,7 @@ pub enum CancellationSource { pub enum CancellationOutcome { NotFound, Found, + RateLimitExceeded, } #[derive(LabelGroup)] diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 659b7afa68..2e759b0894 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -7,7 +7,7 @@ use super::retry::ShouldRetryWakeCompute; use crate::auth::backend::ComputeCredentialKeys; use crate::compute::{self, PostgresConnection, COULD_NOT_CONNECT}; use crate::config::RetryConfig; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::errors::WakeComputeError; use crate::control_plane::locks::ApiLocks; use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; @@ -47,7 +47,7 @@ pub(crate) trait ConnectMechanism { type Error: From; async fn connect_once( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, node_info: &control_plane::CachedNodeInfo, timeout: time::Duration, ) -> Result; @@ -59,7 +59,7 @@ pub(crate) trait ConnectMechanism { pub(crate) trait ComputeConnectBackend { async fn wake_compute( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, ) -> Result; fn get_keys(&self) -> &ComputeCredentialKeys; @@ -82,7 +82,7 @@ impl ConnectMechanism for TcpMechanism<'_> { #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] async fn connect_once( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, node_info: &control_plane::CachedNodeInfo, timeout: time::Duration, ) -> Result { @@ -99,7 +99,7 @@ impl ConnectMechanism for TcpMechanism<'_> { /// Try to connect to the compute node, retrying if necessary. #[tracing::instrument(skip_all)] pub(crate) async fn connect_to_compute( - ctx: &RequestMonitoring, + ctx: &RequestContext, mechanism: &M, user_info: &B, allow_self_signed_compute: bool, @@ -117,7 +117,6 @@ where node_info.set_keys(user_info.get_keys()); node_info.allow_self_signed_compute = allow_self_signed_compute; mechanism.update_connect_config(&mut node_info.config); - let retry_type = RetryType::ConnectToCompute; // try once let err = match mechanism @@ -129,7 +128,7 @@ where Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Success, - retry_type, + retry_type: RetryType::ConnectToCompute, }, num_retries.into(), ); @@ -147,7 +146,7 @@ where Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Failed, - retry_type, + retry_type: RetryType::ConnectToCompute, }, num_retries.into(), ); @@ -156,8 +155,9 @@ where node_info } else { // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node - info!("compute node's state has likely changed; requesting a wake-up"); + debug!("compute node's state has likely changed; requesting a wake-up"); let old_node_info = invalidate_cache(node_info); + // TODO: increment num_retries? let mut node_info = wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?; node_info.reuse_settings(old_node_info); @@ -169,7 +169,7 @@ where // now that we have a new node, try connect to it repeatedly. // this can error for a few reasons, for instance: // * DNS connection settings haven't quite propagated yet - info!("wake_compute success. attempting to connect"); + debug!("wake_compute success. attempting to connect"); num_retries = 1; loop { match mechanism @@ -181,10 +181,11 @@ where Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Success, - retry_type, + retry_type: RetryType::ConnectToCompute, }, num_retries.into(), ); + // TODO: is this necessary? We have a metric. info!(?num_retries, "connected to compute node after"); return Ok(res); } @@ -194,7 +195,7 @@ where Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Failed, - retry_type, + retry_type: RetryType::ConnectToCompute, }, num_retries.into(), ); diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 91a3ceff75..4e4af88634 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -87,6 +87,8 @@ where transfer_one_direction(cx, &mut compute_to_client, compute, client) .map_err(ErrorSource::from_compute)?; + // TODO: 1 info log, with a enum label for close direction. + // Early termination checks from compute to client. if let TransferState::Done(_) = compute_to_client { if let TransferState::Running(buf) = &client_to_compute { diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index a67f1b8112..e27c211932 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -5,11 +5,11 @@ use pq_proto::{ }; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{info, warn}; +use tracing::{debug, info, warn}; use crate::auth::endpoint_sni; use crate::config::{TlsConfig, PG_ALPN_PROTOCOL}; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::error::ReportableError; use crate::metrics::Metrics; use crate::proxy::ERR_INSECURE_CONNECTION; @@ -66,7 +66,7 @@ pub(crate) enum HandshakeData { /// we also take an extra care of propagating only the select handshake errors to client. #[tracing::instrument(skip_all)] pub(crate) async fn handshake( - ctx: &RequestMonitoring, + ctx: &RequestContext, stream: S, mut tls: Option<&TlsConfig>, record_handshake_error: bool, @@ -199,6 +199,8 @@ pub(crate) async fn handshake( .await?; } + // This log highlights the start of the connection. + // This contains useful information for debugging, not logged elsewhere, like role name and endpoint id. info!( ?version, ?params, @@ -211,7 +213,7 @@ pub(crate) async fn handshake( FeStartupPacket::StartupMessage { params, version } if version.major() == 3 && version > PG_PROTOCOL_LATEST => { - warn!(?version, "unsupported minor version"); + debug!(?version, "unsupported minor version"); // no protocol extensions are supported. // @@ -233,14 +235,16 @@ pub(crate) async fn handshake( info!( ?version, + ?params, session_type = "normal", "successful handshake; unsupported minor version requested" ); break Ok(HandshakeData::Startup(stream, params)); } - FeStartupPacket::StartupMessage { version, .. } => { + FeStartupPacket::StartupMessage { version, params } => { warn!( ?version, + ?params, session_type = "normal", "unsuccessful handshake; unsupported version" ); diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 17721c23d5..5d9468d89a 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -25,7 +25,7 @@ use self::connect_compute::{connect_to_compute, TcpMechanism}; use self::passthrough::ProxyPassthrough; use crate::cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal}; use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig}; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::error::ReportableError; use crate::metrics::{Metrics, NumClientConnectionsGuard}; use crate::protocol2::{read_proxy_protocol, ConnectHeader, ConnectionInfo}; @@ -117,7 +117,7 @@ pub async fn task_main( } }; - let ctx = RequestMonitoring::new( + let ctx = RequestContext::new( session_id, conn_info, crate::metrics::Protocol::Tcp, @@ -247,14 +247,14 @@ impl ReportableError for ClientRequestError { pub(crate) async fn handle_client( config: &'static ProxyConfig, auth_backend: &'static auth::Backend<'static, ()>, - ctx: &RequestMonitoring, + ctx: &RequestContext, cancellation_handler: Arc, stream: S, mode: ClientMode, endpoint_rate_limiter: Arc, conn_gauge: NumClientConnectionsGuard<'static>, ) -> Result>, ClientRequestError> { - info!( + debug!( protocol = %ctx.protocol(), "handling interactive connection from client" ); @@ -268,12 +268,18 @@ pub(crate) async fn handle_client( let record_handshake_error = !ctx.has_private_peer_addr(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); let do_handshake = handshake(ctx, stream, mode.handshake_tls(tls), record_handshake_error); + let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(cancel_key_data) => { return Ok(cancellation_handler - .cancel_session(cancel_key_data, ctx.session_id()) + .cancel_session( + cancel_key_data, + ctx.session_id(), + &ctx.peer_addr(), + config.authentication_config.ip_allowlist_check_enabled, + ) .await .map(|()| None)?) } diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index e3b4730982..5e07c8eeae 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -1,5 +1,5 @@ use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::info; +use tracing::debug; use utils::measured_stream::MeasuredStream; use super::copy_bidirectional::ErrorSource; @@ -45,7 +45,7 @@ pub(crate) async fn proxy_pass( ); // Starting from here we only proxy the client's traffic. - info!("performing the proxy pass..."); + debug!("performing the proxy pass..."); let _ = crate::proxy::copy_bidirectional::copy_bidirectional_client_compute( &mut client, &mut compute, diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index df9f79a7e3..fe211adfeb 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -36,7 +36,7 @@ async fn proxy_mitm( // begin handshake with end_server let end_server = connect_tls(server2, client_config2.make_tls_connect().unwrap()).await; let (end_client, startup) = match handshake( - &RequestMonitoring::test(), + &RequestContext::test(), client1, Some(&server_config1), false, diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index be821925b5..3de8ca8736 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -162,7 +162,7 @@ impl TestAuth for Scram { stream: &mut PqStream>, ) -> anyhow::Result<()> { let outcome = auth::AuthFlow::new(stream) - .begin(auth::Scram(&self.0, &RequestMonitoring::test())) + .begin(auth::Scram(&self.0, &RequestContext::test())) .await? .authenticate() .await?; @@ -182,11 +182,10 @@ async fn dummy_proxy( auth: impl TestAuth + Send, ) -> anyhow::Result<()> { let (client, _) = read_proxy_protocol(client).await?; - let mut stream = - match handshake(&RequestMonitoring::test(), client, tls.as_ref(), false).await? { - HandshakeData::Startup(stream, _) => stream, - HandshakeData::Cancel(_) => bail!("cancellation not supported"), - }; + let mut stream = match handshake(&RequestContext::test(), client, tls.as_ref(), false).await? { + HandshakeData::Startup(stream, _) => stream, + HandshakeData::Cancel(_) => bail!("cancellation not supported"), + }; auth.authenticate(&mut stream).await?; @@ -466,7 +465,7 @@ impl ConnectMechanism for TestConnectMechanism { async fn connect_once( &self, - _ctx: &RequestMonitoring, + _ctx: &RequestContext, _node_info: &control_plane::CachedNodeInfo, _timeout: std::time::Duration, ) -> Result { @@ -581,7 +580,7 @@ fn helper_create_connect_info( async fn connect_to_compute_success() { let _ = env_logger::try_init(); use ConnectAction::*; - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -599,7 +598,7 @@ async fn connect_to_compute_success() { async fn connect_to_compute_retry() { let _ = env_logger::try_init(); use ConnectAction::*; - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -618,7 +617,7 @@ async fn connect_to_compute_retry() { async fn connect_to_compute_non_retry_1() { let _ = env_logger::try_init(); use ConnectAction::*; - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -637,7 +636,7 @@ async fn connect_to_compute_non_retry_1() { async fn connect_to_compute_non_retry_2() { let _ = env_logger::try_init(); use ConnectAction::*; - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -657,7 +656,7 @@ async fn connect_to_compute_non_retry_3() { let _ = env_logger::try_init(); tokio::time::pause(); use ConnectAction::*; - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry]); let user_info = helper_create_connect_info(&mechanism); @@ -689,7 +688,7 @@ async fn connect_to_compute_non_retry_3() { async fn wake_retry() { let _ = env_logger::try_init(); use ConnectAction::*; - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -708,7 +707,7 @@ async fn wake_retry() { async fn wake_non_retry() { let _ = env_logger::try_init(); use ConnectAction::*; - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index f9f46bb66c..4e9206feff 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,9 +1,9 @@ -use tracing::{error, info, warn}; +use tracing::{error, info}; use super::connect_compute::ComputeConnectBackend; use crate::config::RetryConfig; -use crate::context::RequestMonitoring; -use crate::control_plane::errors::WakeComputeError; +use crate::context::RequestContext; +use crate::control_plane::errors::{ControlPlaneError, WakeComputeError}; use crate::control_plane::CachedNodeInfo; use crate::error::ReportableError; use crate::metrics::{ @@ -11,39 +11,52 @@ use crate::metrics::{ }; use crate::proxy::retry::{retry_after, should_retry}; +// Use macro to retain original callsite. +macro_rules! log_wake_compute_error { + (error = ?$error:expr, $num_retries:expr, retriable = $retriable:literal) => { + match $error { + WakeComputeError::ControlPlane(ControlPlaneError::Message(_)) => { + info!(error = ?$error, num_retries = $num_retries, retriable = $retriable, "couldn't wake compute node") + } + _ => error!(error = ?$error, num_retries = $num_retries, retriable = $retriable, "couldn't wake compute node"), + } + }; +} + pub(crate) async fn wake_compute( num_retries: &mut u32, - ctx: &RequestMonitoring, + ctx: &RequestContext, api: &B, config: RetryConfig, ) -> Result { - let retry_type = RetryType::WakeCompute; loop { match api.wake_compute(ctx).await { Err(e) if !should_retry(&e, *num_retries, config) => { - error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); + log_wake_compute_error!(error = ?e, num_retries, retriable = false); report_error(&e, false); Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Failed, - retry_type, + retry_type: RetryType::WakeCompute, }, (*num_retries).into(), ); return Err(e); } Err(e) => { - warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); + log_wake_compute_error!(error = ?e, num_retries, retriable = true); report_error(&e, true); } Ok(n) => { Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Success, - retry_type, + retry_type: RetryType::WakeCompute, }, (*num_retries).into(), ); + // TODO: is this necessary? We have a metric. + // TODO: this log line is misleading as "wake_compute" might return cached (and stale) info. info!(?num_retries, "compute node woken up after"); return Ok(n); } diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs index 16c398f303..b74a9ab17e 100644 --- a/proxy/src/rate_limiter/limit_algorithm.rs +++ b/proxy/src/rate_limiter/limit_algorithm.rs @@ -195,7 +195,11 @@ impl DynamicLimiter { /// /// Set the outcome to `None` to ignore the job. fn release_inner(&self, start: Instant, outcome: Option) { - tracing::info!("outcome is {:?}", outcome); + if outcome.is_none() { + tracing::warn!("outcome is {:?}", outcome); + } else { + tracing::debug!("outcome is {:?}", outcome); + } if self.config.initial_limit == 0 { return; } diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs index 5332a5184f..3000cc4c2a 100644 --- a/proxy/src/rate_limiter/limit_algorithm/aimd.rs +++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs @@ -31,26 +31,32 @@ impl LimitAlgorithm for Aimd { if utilisation > self.utilisation { let limit = old_limit + self.inc; - let increased_limit = limit.clamp(self.min, self.max); - if increased_limit > old_limit { - tracing::info!(increased_limit, "limit increased"); + let new_limit = limit.clamp(self.min, self.max); + if new_limit > old_limit { + tracing::info!(old_limit, new_limit, "limit increased"); + } else { + tracing::debug!(old_limit, new_limit, "limit clamped at max"); } - increased_limit + new_limit } else { old_limit } } Outcome::Overload => { - let limit = old_limit as f32 * self.dec; + let new_limit = old_limit as f32 * self.dec; // Floor instead of round, so the limit reduces even with small numbers. // E.g. round(2 * 0.9) = 2, but floor(2 * 0.9) = 1 - let limit = limit.floor() as usize; + let new_limit = new_limit.floor() as usize; - let limit = limit.clamp(self.min, self.max); - tracing::info!(limit, "limit decreased"); - limit + let new_limit = new_limit.clamp(self.min, self.max); + if new_limit < old_limit { + tracing::info!(old_limit, new_limit, "limit decreased"); + } else { + tracing::debug!(old_limit, new_limit, "limit clamped at min"); + } + new_limit } } } diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 4259fd04f4..a048721e77 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -14,13 +14,13 @@ use tracing::info; use crate::intern::EndpointIdInt; -pub(crate) struct GlobalRateLimiter { +pub struct GlobalRateLimiter { data: Vec, info: Vec, } impl GlobalRateLimiter { - pub(crate) fn new(info: Vec) -> Self { + pub fn new(info: Vec) -> Self { Self { data: vec![ RateBucket { @@ -34,7 +34,7 @@ impl GlobalRateLimiter { } /// Check that number of connections is below `max_rps` rps. - pub(crate) fn check(&mut self) -> bool { + pub fn check(&mut self) -> bool { let now = Instant::now(); let should_allow_request = self @@ -137,6 +137,19 @@ impl RateBucketInfo { Self::new(200, Duration::from_secs(600)), ]; + /// All of these are per endpoint-maskedip pair. + /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus). + /// + /// First bucket: 1000mcpus total per endpoint-ip pair + /// * 4096000 requests per second with 1 hash rounds. + /// * 1000 requests per second with 4096 hash rounds. + /// * 6.8 requests per second with 600000 hash rounds. + pub const DEFAULT_AUTH_SET: [Self; 3] = [ + Self::new(1000 * 4096, Duration::from_secs(1)), + Self::new(600 * 4096, Duration::from_secs(60)), + Self::new(300 * 4096, Duration::from_secs(600)), + ]; + pub fn rps(&self) -> f64 { (self.max_rpi as f64) / self.interval.as_secs_f64() } diff --git a/proxy/src/rate_limiter/mod.rs b/proxy/src/rate_limiter/mod.rs index 3ae2ecaf8f..5f90102da3 100644 --- a/proxy/src/rate_limiter/mod.rs +++ b/proxy/src/rate_limiter/mod.rs @@ -8,5 +8,4 @@ pub(crate) use limit_algorithm::aimd::Aimd; pub(crate) use limit_algorithm::{ DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token, }; -pub(crate) use limiter::GlobalRateLimiter; -pub use limiter::{BucketRateLimiter, RateBucketInfo, WakeComputeRateLimiter}; +pub use limiter::{BucketRateLimiter, GlobalRateLimiter, RateBucketInfo, WakeComputeRateLimiter}; diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 0000246971..633a2f1b81 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -1,5 +1,6 @@ use std::sync::Arc; +use core::net::IpAddr; use pq_proto::CancelKeyData; use redis::AsyncCommands; use tokio::sync::Mutex; @@ -15,6 +16,7 @@ pub trait CancellationPublisherMut: Send + Sync + 'static { &mut self, cancel_key_data: CancelKeyData, session_id: Uuid, + peer_addr: IpAddr, ) -> anyhow::Result<()>; } @@ -24,6 +26,7 @@ pub trait CancellationPublisher: Send + Sync + 'static { &self, cancel_key_data: CancelKeyData, session_id: Uuid, + peer_addr: IpAddr, ) -> anyhow::Result<()>; } @@ -32,6 +35,7 @@ impl CancellationPublisher for () { &self, _cancel_key_data: CancelKeyData, _session_id: Uuid, + _peer_addr: IpAddr, ) -> anyhow::Result<()> { Ok(()) } @@ -42,8 +46,10 @@ impl CancellationPublisherMut for P { &mut self, cancel_key_data: CancelKeyData, session_id: Uuid, + peer_addr: IpAddr, ) -> anyhow::Result<()> { -

::try_publish(self, cancel_key_data, session_id).await +

::try_publish(self, cancel_key_data, session_id, peer_addr) + .await } } @@ -52,9 +58,10 @@ impl CancellationPublisher for Option

{ &self, cancel_key_data: CancelKeyData, session_id: Uuid, + peer_addr: IpAddr, ) -> anyhow::Result<()> { if let Some(p) = self { - p.try_publish(cancel_key_data, session_id).await + p.try_publish(cancel_key_data, session_id, peer_addr).await } else { Ok(()) } @@ -66,10 +73,11 @@ impl CancellationPublisher for Arc> { &self, cancel_key_data: CancelKeyData, session_id: Uuid, + peer_addr: IpAddr, ) -> anyhow::Result<()> { self.lock() .await - .try_publish(cancel_key_data, session_id) + .try_publish(cancel_key_data, session_id, peer_addr) .await } } @@ -97,11 +105,13 @@ impl RedisPublisherClient { &mut self, cancel_key_data: CancelKeyData, session_id: Uuid, + peer_addr: IpAddr, ) -> anyhow::Result<()> { let payload = serde_json::to_string(&Notification::Cancel(CancelSession { region_id: Some(self.region_id.clone()), cancel_key_data, session_id, + peer_addr: Some(peer_addr), }))?; let _: () = self.client.publish(PROXY_CHANNEL_NAME, payload).await?; Ok(()) @@ -120,12 +130,14 @@ impl RedisPublisherClient { &mut self, cancel_key_data: CancelKeyData, session_id: Uuid, + peer_addr: IpAddr, ) -> anyhow::Result<()> { + // TODO: review redundant error duplication logs. if !self.limiter.check() { tracing::info!("Rate limit exceeded. Skipping cancellation message"); return Err(anyhow::anyhow!("Rate limit exceeded")); } - match self.publish(cancel_key_data, session_id).await { + match self.publish(cancel_key_data, session_id, peer_addr).await { Ok(()) => return Ok(()), Err(e) => { tracing::error!("failed to publish a message: {e}"); @@ -133,7 +145,7 @@ impl RedisPublisherClient { } tracing::info!("Publisher is disconnected. Reconnectiong..."); self.try_connect().await?; - self.publish(cancel_key_data, session_id).await + self.publish(cancel_key_data, session_id, peer_addr).await } } @@ -142,11 +154,15 @@ impl CancellationPublisherMut for RedisPublisherClient { &mut self, cancel_key_data: CancelKeyData, session_id: Uuid, + peer_addr: IpAddr, ) -> anyhow::Result<()> { tracing::info!("publishing cancellation key to Redis"); - match self.try_publish_internal(cancel_key_data, session_id).await { + match self + .try_publish_internal(cancel_key_data, session_id, peer_addr) + .await + { Ok(()) => { - tracing::info!("cancellation key successfuly published to Redis"); + tracing::debug!("cancellation key successfuly published to Redis"); Ok(()) } Err(e) => { diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 62e7b1b565..65008ae943 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -60,6 +60,7 @@ pub(crate) struct CancelSession { pub(crate) region_id: Option, pub(crate) cancel_key_data: CancelKeyData, pub(crate) session_id: Uuid, + pub(crate) peer_addr: Option, } fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result @@ -137,10 +138,20 @@ impl MessageHandler { return Ok(()); } } + + // TODO: Remove unspecified peer_addr after the complete migration to the new format + let peer_addr = cancel_session + .peer_addr + .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::UNSPECIFIED)); // This instance of cancellation_handler doesn't have a RedisPublisherClient so it can't publish the message. match self .cancellation_handler - .cancel_session(cancel_session.cancel_key_data, uuid::Uuid::nil()) + .cancel_session( + cancel_session.cancel_key_data, + uuid::Uuid::nil(), + &peer_addr, + cancel_session.peer_addr.is_some(), + ) .await { Ok(()) => {} @@ -335,6 +346,7 @@ mod tests { cancel_key_data, region_id: None, session_id: uuid, + peer_addr: None, }); let text = serde_json::to_string(&msg)?; let result: Notification = serde_json::from_str(&text)?; @@ -344,6 +356,7 @@ mod tests { cancel_key_data, region_id: Some("region".to_string()), session_id: uuid, + peer_addr: None, }); let text = serde_json::to_string(&msg)?; let result: Notification = serde_json::from_str(&text)?; diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 5e9fd151ae..3037e20888 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -12,8 +12,8 @@ use tracing::field::display; use tracing::{debug, info}; use super::conn_pool::poll_client; -use super::conn_pool_lib::{Client, ConnInfo, GlobalConnPool}; -use super::http_conn_pool::{self, poll_http2_client, Send}; +use super::conn_pool_lib::{Client, ConnInfo, EndpointConnPool, GlobalConnPool}; +use super::http_conn_pool::{self, poll_http2_client, HttpConnPool, Send}; use super::local_conn_pool::{self, LocalConnPool, EXT_NAME, EXT_SCHEMA, EXT_VERSION}; use crate::auth::backend::local::StaticAuthRules; use crate::auth::backend::{ComputeCredentials, ComputeUserInfo}; @@ -23,7 +23,7 @@ use crate::compute_ctl::{ ComputeCtlError, ExtensionInstallRequest, Privilege, SetRoleGrantsRequest, }; use crate::config::ProxyConfig; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::client::ApiLockError; use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError}; use crate::control_plane::locks::ApiLocks; @@ -36,9 +36,10 @@ use crate::rate_limiter::EndpointRateLimiter; use crate::types::{EndpointId, Host, LOCAL_PROXY_SUFFIX}; pub(crate) struct PoolingBackend { - pub(crate) http_conn_pool: Arc>, + pub(crate) http_conn_pool: Arc>>, pub(crate) local_pool: Arc>, - pub(crate) pool: Arc>, + pub(crate) pool: + Arc>>, pub(crate) config: &'static ProxyConfig, pub(crate) auth_backend: &'static crate::auth::Backend<'static, ()>, @@ -48,7 +49,7 @@ pub(crate) struct PoolingBackend { impl PoolingBackend { pub(crate) async fn authenticate_with_password( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, user_info: &ComputeUserInfo, password: &[u8], ) -> Result { @@ -110,7 +111,7 @@ impl PoolingBackend { pub(crate) async fn authenticate_with_jwt( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, user_info: &ComputeUserInfo, jwt: String, ) -> Result { @@ -161,16 +162,16 @@ impl PoolingBackend { #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] pub(crate) async fn connect_to_compute( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, conn_info: ConnInfo, keys: ComputeCredentials, force_new: bool, ) -> Result, HttpConnError> { let maybe_client = if force_new { - info!("pool: pool is disabled"); + debug!("pool: pool is disabled"); None } else { - info!("pool: looking for an existing connection"); + debug!("pool: looking for an existing connection"); self.pool.get(ctx, &conn_info)? }; @@ -201,17 +202,17 @@ impl PoolingBackend { #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] pub(crate) async fn connect_to_local_proxy( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, conn_info: ConnInfo, ) -> Result, HttpConnError> { - info!("pool: looking for an existing connection"); + debug!("pool: looking for an existing connection"); if let Ok(Some(client)) = self.http_conn_pool.get(ctx, &conn_info) { return Ok(client); } let conn_id = uuid::Uuid::new_v4(); tracing::Span::current().record("conn_id", display(conn_id)); - info!(%conn_id, "pool: opening a new connection '{conn_info}'"); + debug!(%conn_id, "pool: opening a new connection '{conn_info}'"); let backend = self.auth_backend.as_ref().map(|()| ComputeCredentials { info: ComputeUserInfo { user: conn_info.user_info.user.clone(), @@ -249,7 +250,7 @@ impl PoolingBackend { #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] pub(crate) async fn connect_to_local_postgres( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, conn_info: ConnInfo, ) -> Result, HttpConnError> { if let Some(client) = self.local_pool.get(ctx, &conn_info)? { @@ -474,7 +475,7 @@ impl ShouldRetryWakeCompute for LocalProxyConnError { } struct TokioMechanism { - pool: Arc>, + pool: Arc>>, conn_info: ConnInfo, conn_id: uuid::Uuid, @@ -490,7 +491,7 @@ impl ConnectMechanism for TokioMechanism { async fn connect_once( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, node_info: &CachedNodeInfo, timeout: Duration, ) -> Result { @@ -524,7 +525,7 @@ impl ConnectMechanism for TokioMechanism { } struct HyperMechanism { - pool: Arc>, + pool: Arc>>, conn_info: ConnInfo, conn_id: uuid::Uuid, @@ -540,7 +541,7 @@ impl ConnectMechanism for HyperMechanism { async fn connect_once( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, node_info: &CachedNodeInfo, timeout: Duration, ) -> Result { diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 1845603bf7..bd262f45ed 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -19,9 +19,10 @@ use { }; use super::conn_pool_lib::{ - Client, ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, GlobalConnPool, + Client, ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, EndpointConnPool, + GlobalConnPool, }; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::messages::MetricsAuxInfo; use crate::metrics::Metrics; @@ -52,8 +53,8 @@ impl fmt::Display for ConnInfo { } pub(crate) fn poll_client( - global_pool: Arc>, - ctx: &RequestMonitoring, + global_pool: Arc>>, + ctx: &RequestContext, conn_info: ConnInfo, client: C, mut connection: tokio_postgres::Connection, @@ -167,6 +168,7 @@ pub(crate) fn poll_client( Client::new(inner, conn_info, pool_clone) } +#[derive(Clone)] pub(crate) struct ClientDataRemote { session: tokio::sync::watch::Sender, cancel: CancellationToken, @@ -243,7 +245,7 @@ mod tests { }, cancel_set: CancelSet::new(0), client_conn_threshold: u64::MAX, - max_request_size_bytes: u64::MAX, + max_request_size_bytes: usize::MAX, max_response_size_bytes: usize::MAX, })); let pool = GlobalConnPool::new(config); diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index 61c39c32c9..fe1d2563bc 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -1,4 +1,5 @@ use std::collections::HashMap; +use std::marker::PhantomData; use std::ops::Deref; use std::sync::atomic::{self, AtomicUsize}; use std::sync::{Arc, Weak}; @@ -15,7 +16,7 @@ use super::conn_pool::ClientDataRemote; use super::http_conn_pool::ClientDataHttp; use super::local_conn_pool::ClientDataLocal; use crate::auth::backend::ComputeUserInfo; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::types::{DbName, EndpointCacheKey, RoleName}; @@ -43,13 +44,14 @@ impl ConnInfo { } } +#[derive(Clone)] pub(crate) enum ClientDataEnum { Remote(ClientDataRemote), Local(ClientDataLocal), - #[allow(dead_code)] Http(ClientDataHttp), } +#[derive(Clone)] pub(crate) struct ClientInnerCommon { pub(crate) inner: C, pub(crate) aux: MetricsAuxInfo, @@ -91,6 +93,7 @@ pub(crate) struct ConnPoolEntry { pub(crate) struct EndpointConnPool { pools: HashMap<(DbName, RoleName), DbUserConnPool>, total_conns: usize, + /// max # connections per endpoint max_conns: usize, _guard: HttpEndpointPoolsGuard<'static>, global_connections_count: Arc, @@ -232,7 +235,7 @@ impl EndpointConnPool { // do logging outside of the mutex if returned { - info!(%conn_id, "{pool_name}: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}"); + debug!(%conn_id, "{pool_name}: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}"); } else { info!(%conn_id, "{pool_name}: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}"); } @@ -317,24 +320,49 @@ impl DbUserConn for DbUserConnPool { } } -pub(crate) struct GlobalConnPool { +pub(crate) trait EndpointConnPoolExt { + fn clear_closed(&mut self) -> usize; + fn total_conns(&self) -> usize; +} + +impl EndpointConnPoolExt for EndpointConnPool { + fn clear_closed(&mut self) -> usize { + let mut clients_removed: usize = 0; + for db_pool in self.pools.values_mut() { + clients_removed += db_pool.clear_closed_clients(&mut self.total_conns); + } + clients_removed + } + + fn total_conns(&self) -> usize { + self.total_conns + } +} + +pub(crate) struct GlobalConnPool +where + C: ClientInnerExt, + P: EndpointConnPoolExt, +{ // endpoint -> per-endpoint connection pool // // That should be a fairly conteded map, so return reference to the per-endpoint // pool as early as possible and release the lock. - global_pool: DashMap>>>, + pub(crate) global_pool: DashMap>>, /// Number of endpoint-connection pools /// /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each. /// That seems like far too much effort, so we're using a relaxed increment counter instead. /// It's only used for diagnostics. - global_pool_size: AtomicUsize, + pub(crate) global_pool_size: AtomicUsize, /// Total number of connections in the pool - global_connections_count: Arc, + pub(crate) global_connections_count: Arc, - config: &'static crate::config::HttpConfig, + pub(crate) config: &'static crate::config::HttpConfig, + + _marker: PhantomData, } #[derive(Debug, Clone, Copy)] @@ -357,7 +385,11 @@ pub struct GlobalConnPoolOptions { pub max_total_conns: usize, } -impl GlobalConnPool { +impl GlobalConnPool +where + C: ClientInnerExt, + P: EndpointConnPoolExt, +{ pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { let shards = config.pool_options.pool_shards; Arc::new(Self { @@ -365,6 +397,7 @@ impl GlobalConnPool { global_pool_size: AtomicUsize::new(0), config, global_connections_count: Arc::new(AtomicUsize::new(0)), + _marker: PhantomData, }) } @@ -378,60 +411,6 @@ impl GlobalConnPool { self.config.pool_options.idle_timeout } - pub(crate) fn get( - self: &Arc, - ctx: &RequestMonitoring, - conn_info: &ConnInfo, - ) -> Result>, HttpConnError> { - let mut client: Option> = None; - let Some(endpoint) = conn_info.endpoint_cache_key() else { - return Ok(None); - }; - - let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); - if let Some(entry) = endpoint_pool - .write() - .get_conn_entry(conn_info.db_and_user()) - { - client = Some(entry.conn); - } - let endpoint_pool = Arc::downgrade(&endpoint_pool); - - // ok return cached connection if found and establish a new one otherwise - if let Some(mut client) = client { - if client.inner.is_closed() { - info!("pool: cached connection '{conn_info}' is closed, opening a new one"); - return Ok(None); - } - tracing::Span::current() - .record("conn_id", tracing::field::display(client.get_conn_id())); - tracing::Span::current().record( - "pid", - tracing::field::display(client.inner.get_process_id()), - ); - info!( - cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), - "pool: reusing connection '{conn_info}'" - ); - - match client.get_data() { - ClientDataEnum::Local(data) => { - data.session().send(ctx.session_id())?; - } - - ClientDataEnum::Remote(data) => { - data.session().send(ctx.session_id())?; - } - ClientDataEnum::Http(_) => (), - } - - ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); - ctx.success(); - return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); - } - Ok(None) - } - pub(crate) fn shutdown(&self) { // drops all strong references to endpoint-pools self.global_pool.clear(); @@ -464,17 +443,10 @@ impl GlobalConnPool { // if the current endpoint pool is unique (no other strong or weak references) // then it is currently not in use by any connections. if let Some(pool) = Arc::get_mut(x.get_mut()) { - let EndpointConnPool { - pools, total_conns, .. - } = pool.get_mut(); + let endpoints = pool.get_mut(); + clients_removed = endpoints.clear_closed(); - // ensure that closed clients are removed - for db_pool in pools.values_mut() { - clients_removed += db_pool.clear_closed_clients(total_conns); - } - - // we only remove this pool if it has no active connections - if *total_conns == 0 { + if endpoints.total_conns() == 0 { info!("pool: discarding pool for endpoint {endpoint}"); return false; } @@ -510,6 +482,62 @@ impl GlobalConnPool { info!("pool: performed global pool gc. size now {global_pool_size}"); } } +} + +impl GlobalConnPool> { + pub(crate) fn get( + self: &Arc, + ctx: &RequestContext, + conn_info: &ConnInfo, + ) -> Result>, HttpConnError> { + let mut client: Option> = None; + let Some(endpoint) = conn_info.endpoint_cache_key() else { + return Ok(None); + }; + + let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); + if let Some(entry) = endpoint_pool + .write() + .get_conn_entry(conn_info.db_and_user()) + { + client = Some(entry.conn); + } + let endpoint_pool = Arc::downgrade(&endpoint_pool); + + // ok return cached connection if found and establish a new one otherwise + if let Some(mut client) = client { + if client.inner.is_closed() { + info!("pool: cached connection '{conn_info}' is closed, opening a new one"); + return Ok(None); + } + tracing::Span::current() + .record("conn_id", tracing::field::display(client.get_conn_id())); + tracing::Span::current().record( + "pid", + tracing::field::display(client.inner.get_process_id()), + ); + debug!( + cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), + "pool: reusing connection '{conn_info}'" + ); + + match client.get_data() { + ClientDataEnum::Local(data) => { + data.session().send(ctx.session_id())?; + } + + ClientDataEnum::Remote(data) => { + data.session().send(ctx.session_id())?; + } + ClientDataEnum::Http(_) => (), + } + + ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); + ctx.success(); + return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); + } + Ok(None) + } pub(crate) fn get_or_create_endpoint_pool( self: &Arc, @@ -556,7 +584,6 @@ impl GlobalConnPool { pool } } - pub(crate) struct Client { span: Span, inner: Option>, diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index a1d4473b01..fde38d0de3 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -2,17 +2,18 @@ use std::collections::VecDeque; use std::sync::atomic::{self, AtomicUsize}; use std::sync::{Arc, Weak}; -use dashmap::DashMap; use hyper::client::conn::http2; use hyper_util::rt::{TokioExecutor, TokioIo}; use parking_lot::RwLock; -use rand::Rng; use tokio::net::TcpStream; use tracing::{debug, error, info, info_span, Instrument}; use super::backend::HttpConnError; -use super::conn_pool_lib::{ClientInnerExt, ConnInfo}; -use crate::context::RequestMonitoring; +use super::conn_pool_lib::{ + ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, ConnPoolEntry, + EndpointConnPoolExt, GlobalConnPool, +}; +use crate::context::RequestContext; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::types::EndpointCacheKey; @@ -23,17 +24,11 @@ pub(crate) type Connect = http2::Connection, hyper::body::Incoming, TokioExecutor>; #[derive(Clone)] -pub(crate) struct ConnPoolEntry { - conn: C, - conn_id: uuid::Uuid, - aux: MetricsAuxInfo, -} - pub(crate) struct ClientDataHttp(); // Per-endpoint connection pool // Number of open connections is limited by the `max_conns_per_endpoint`. -pub(crate) struct EndpointConnPool { +pub(crate) struct HttpConnPool { // TODO(conrad): // either we should open more connections depending on stream count // (not exposed by hyper, need our own counter) @@ -48,14 +43,19 @@ pub(crate) struct EndpointConnPool { global_connections_count: Arc, } -impl EndpointConnPool { +impl HttpConnPool { fn get_conn_entry(&mut self) -> Option> { let Self { conns, .. } = self; loop { let conn = conns.pop_front()?; - if !conn.conn.is_closed() { - conns.push_back(conn.clone()); + if !conn.conn.inner.is_closed() { + let new_conn = ConnPoolEntry { + conn: conn.conn.clone(), + _last_access: std::time::Instant::now(), + }; + + conns.push_back(new_conn); return Some(conn); } } @@ -69,7 +69,7 @@ impl EndpointConnPool { } = self; let old_len = conns.len(); - conns.retain(|conn| conn.conn_id != conn_id); + conns.retain(|entry| entry.conn.conn_id != conn_id); let new_len = conns.len(); let removed = old_len - new_len; if removed > 0 { @@ -84,7 +84,22 @@ impl EndpointConnPool { } } -impl Drop for EndpointConnPool { +impl EndpointConnPoolExt for HttpConnPool { + fn clear_closed(&mut self) -> usize { + let Self { conns, .. } = self; + let old_len = conns.len(); + conns.retain(|entry| !entry.conn.inner.is_closed()); + + let new_len = conns.len(); + old_len - new_len + } + + fn total_conns(&self) -> usize { + self.conns.len() + } +} + +impl Drop for HttpConnPool { fn drop(&mut self) { if !self.conns.is_empty() { self.global_connections_count @@ -98,121 +113,11 @@ impl Drop for EndpointConnPool { } } -pub(crate) struct GlobalConnPool { - // endpoint -> per-endpoint connection pool - // - // That should be a fairly conteded map, so return reference to the per-endpoint - // pool as early as possible and release the lock. - global_pool: DashMap>>>, - - /// Number of endpoint-connection pools - /// - /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each. - /// That seems like far too much effort, so we're using a relaxed increment counter instead. - /// It's only used for diagnostics. - global_pool_size: AtomicUsize, - - /// Total number of connections in the pool - global_connections_count: Arc, - - config: &'static crate::config::HttpConfig, -} - -impl GlobalConnPool { - pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { - let shards = config.pool_options.pool_shards; - Arc::new(Self { - global_pool: DashMap::with_shard_amount(shards), - global_pool_size: AtomicUsize::new(0), - config, - global_connections_count: Arc::new(AtomicUsize::new(0)), - }) - } - - pub(crate) fn shutdown(&self) { - // drops all strong references to endpoint-pools - self.global_pool.clear(); - } - - pub(crate) async fn gc_worker(&self, mut rng: impl Rng) { - let epoch = self.config.pool_options.gc_epoch; - let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32); - loop { - interval.tick().await; - - let shard = rng.gen_range(0..self.global_pool.shards().len()); - self.gc(shard); - } - } - - fn gc(&self, shard: usize) { - debug!(shard, "pool: performing epoch reclamation"); - - // acquire a random shard lock - let mut shard = self.global_pool.shards()[shard].write(); - - let timer = Metrics::get() - .proxy - .http_pool_reclaimation_lag_seconds - .start_timer(); - let current_len = shard.len(); - let mut clients_removed = 0; - shard.retain(|endpoint, x| { - // if the current endpoint pool is unique (no other strong or weak references) - // then it is currently not in use by any connections. - if let Some(pool) = Arc::get_mut(x.get_mut()) { - let EndpointConnPool { conns, .. } = pool.get_mut(); - - let old_len = conns.len(); - - conns.retain(|conn| !conn.conn.is_closed()); - - let new_len = conns.len(); - let removed = old_len - new_len; - clients_removed += removed; - - // we only remove this pool if it has no active connections - if conns.is_empty() { - info!("pool: discarding pool for endpoint {endpoint}"); - return false; - } - } - - true - }); - - let new_len = shard.len(); - drop(shard); - timer.observe(); - - // Do logging outside of the lock. - if clients_removed > 0 { - let size = self - .global_connections_count - .fetch_sub(clients_removed, atomic::Ordering::Relaxed) - - clients_removed; - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .dec_by(clients_removed as i64); - info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}"); - } - let removed = current_len - new_len; - - if removed > 0 { - let global_pool_size = self - .global_pool_size - .fetch_sub(removed, atomic::Ordering::Relaxed) - - removed; - info!("pool: performed global pool gc. size now {global_pool_size}"); - } - } - +impl GlobalConnPool> { #[expect(unused_results)] pub(crate) fn get( self: &Arc, - ctx: &RequestMonitoring, + ctx: &RequestContext, conn_info: &ConnInfo, ) -> Result>, HttpConnError> { let result: Result>, HttpConnError>; @@ -226,27 +131,28 @@ impl GlobalConnPool { return result; }; - tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id)); - info!( + tracing::Span::current().record("conn_id", tracing::field::display(client.conn.conn_id)); + debug!( cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), "pool: reusing connection '{conn_info}'" ); ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); ctx.success(); - Ok(Some(Client::new(client.conn, client.aux))) + + Ok(Some(Client::new(client.conn.clone()))) } fn get_or_create_endpoint_pool( self: &Arc, endpoint: &EndpointCacheKey, - ) -> Arc>> { + ) -> Arc>> { // fast path if let Some(pool) = self.global_pool.get(endpoint) { return pool.clone(); } // slow path - let new_pool = Arc::new(RwLock::new(EndpointConnPool { + let new_pool = Arc::new(RwLock::new(HttpConnPool { conns: VecDeque::new(), _guard: Metrics::get().proxy.http_endpoint_pools.guard(), global_connections_count: self.global_connections_count.clone(), @@ -279,8 +185,8 @@ impl GlobalConnPool { } pub(crate) fn poll_http2_client( - global_pool: Arc>, - ctx: &RequestMonitoring, + global_pool: Arc>>, + ctx: &RequestContext, conn_info: &ConnInfo, client: Send, connection: Connect, @@ -299,11 +205,15 @@ pub(crate) fn poll_http2_client( let pool = match conn_info.endpoint_cache_key() { Some(endpoint) => { let pool = global_pool.get_or_create_endpoint_pool(&endpoint); - - pool.write().conns.push_back(ConnPoolEntry { - conn: client.clone(), - conn_id, + let client = ClientInnerCommon { + inner: client.clone(), aux: aux.clone(), + conn_id, + data: ClientDataEnum::Http(ClientDataHttp()), + }; + pool.write().conns.push_back(ConnPoolEntry { + conn: client, + _last_access: std::time::Instant::now(), }); Metrics::get() .proxy @@ -335,23 +245,30 @@ pub(crate) fn poll_http2_client( .instrument(span), ); - Client::new(client, aux) + let client = ClientInnerCommon { + inner: client, + aux, + conn_id, + data: ClientDataEnum::Http(ClientDataHttp()), + }; + + Client::new(client) } pub(crate) struct Client { - pub(crate) inner: C, - aux: MetricsAuxInfo, + pub(crate) inner: ClientInnerCommon, } impl Client { - pub(self) fn new(inner: C, aux: MetricsAuxInfo) -> Self { - Self { inner, aux } + pub(self) fn new(inner: ClientInnerCommon) -> Self { + Self { inner } } pub(crate) fn metrics(&self) -> Arc { + let aux = &self.inner.aux; USAGE_METRICS.register(Ids { - endpoint_id: self.aux.endpoint_id, - branch_id: self.aux.branch_id, + endpoint_id: aux.endpoint_id, + branch_id: aux.branch_id, }) } } diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 99d4329f88..9abe35db08 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -29,14 +29,14 @@ use tokio_postgres::tls::NoTlsStream; use tokio_postgres::types::ToSql; use tokio_postgres::{AsyncMessage, Socket}; use tokio_util::sync::CancellationToken; -use tracing::{error, info, info_span, warn, Instrument}; +use tracing::{debug, error, info, info_span, warn, Instrument}; use super::backend::HttpConnError; use super::conn_pool_lib::{ Client, ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, DbUserConn, EndpointConnPool, }; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; @@ -44,6 +44,7 @@ pub(crate) const EXT_NAME: &str = "pg_session_jwt"; pub(crate) const EXT_VERSION: &str = "0.1.2"; pub(crate) const EXT_SCHEMA: &str = "auth"; +#[derive(Clone)] pub(crate) struct ClientDataLocal { session: tokio::sync::watch::Sender, cancel: CancellationToken, @@ -88,7 +89,7 @@ impl LocalConnPool { pub(crate) fn get( self: &Arc, - ctx: &RequestMonitoring, + ctx: &RequestContext, conn_info: &ConnInfo, ) -> Result>, HttpConnError> { let client = self @@ -110,7 +111,7 @@ impl LocalConnPool { "pid", tracing::field::display(client.inner.get_process_id()), ); - info!( + debug!( cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), "local_pool: reusing connection '{conn_info}'" ); @@ -159,7 +160,7 @@ impl LocalConnPool { #[allow(clippy::too_many_arguments)] pub(crate) fn poll_client( global_pool: Arc>, - ctx: &RequestMonitoring, + ctx: &RequestContext, conn_info: ConnInfo, client: C, mut connection: tokio_postgres::Connection, diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index cf758855fa..77025f419d 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -45,7 +45,7 @@ use utils::http::error::ApiError; use crate::cancellation::CancellationHandlerMain; use crate::config::{ProxyConfig, ProxyProtocolV2}; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::metrics::Metrics; use crate::protocol2::{read_proxy_protocol, ChainRW, ConnectHeader, ConnectionInfo}; use crate::proxy::run_until_cancelled; @@ -88,7 +88,7 @@ pub async fn task_main( } }); - let http_conn_pool = http_conn_pool::GlobalConnPool::new(&config.http_config); + let http_conn_pool = conn_pool_lib::GlobalConnPool::new(&config.http_config); { let http_conn_pool = Arc::clone(&http_conn_pool); tokio::spawn(async move { @@ -423,7 +423,7 @@ async fn request_handler( if config.http_config.accept_websockets && framed_websockets::upgrade::is_upgrade_request(&request) { - let ctx = RequestMonitoring::new( + let ctx = RequestContext::new( session_id, conn_info, crate::metrics::Protocol::Ws, @@ -458,7 +458,7 @@ async fn request_handler( // Return the response so the spawned future can continue. Ok(response.map(|b| b.map_err(|x| match x {}).boxed())) } else if request.uri().path() == "/sql" && *request.method() == Method::POST { - let ctx = RequestMonitoring::new( + let ctx = RequestContext::new( session_id, conn_info, crate::metrics::Protocol::Http, diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index f0975617d4..afd93d02f0 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -8,17 +8,17 @@ use http::header::AUTHORIZATION; use http::Method; use http_body_util::combinators::BoxBody; use http_body_util::{BodyExt, Full}; -use hyper::body::{Body, Incoming}; +use hyper::body::Incoming; use hyper::http::{HeaderName, HeaderValue}; use hyper::{header, HeaderMap, Request, Response, StatusCode}; use pq_proto::StartupMessageParamsBuilder; use serde::Serialize; use serde_json::Value; -use tokio::time; +use tokio::time::{self, Instant}; use tokio_postgres::error::{DbError, ErrorPosition, SqlState}; use tokio_postgres::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction}; use tokio_util::sync::CancellationToken; -use tracing::{error, info}; +use tracing::{debug, error, info}; use typed_json::json; use url::Url; use urlencoding; @@ -34,8 +34,9 @@ use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError}; use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; use crate::auth::{endpoint_sni, ComputeUserInfoParseError}; use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig}; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::error::{ErrorKind, ReportableError, UserFacingError}; +use crate::http::{read_body_with_limit, ReadBodyError}; use crate::metrics::{HttpDirection, Metrics}; use crate::proxy::{run_until_cancelled, NeonOptions}; use crate::serverless::backend::HttpConnError; @@ -47,6 +48,7 @@ use crate::usage_metrics::{MetricCounter, MetricCounterRecorder}; struct QueryData { query: String, #[serde(deserialize_with = "bytes_to_pg_text")] + #[serde(default)] params: Vec>, #[serde(default)] array_mode: Option, @@ -133,7 +135,7 @@ impl UserFacingError for ConnInfoError { fn get_conn_info( config: &'static AuthenticationConfig, - ctx: &RequestMonitoring, + ctx: &RequestContext, headers: &HeaderMap, tls: Option<&TlsConfig>, ) -> Result { @@ -240,7 +242,7 @@ fn get_conn_info( pub(crate) async fn handle( config: &'static ProxyConfig, - ctx: RequestMonitoring, + ctx: RequestContext, request: Request, backend: Arc, cancel: CancellationToken, @@ -357,8 +359,6 @@ pub(crate) enum SqlOverHttpError { ConnectCompute(#[from] HttpConnError), #[error("{0}")] ConnInfo(#[from] ConnInfoError), - #[error("request is too large (max is {0} bytes)")] - RequestTooLarge(u64), #[error("response is too large (max is {0} bytes)")] ResponseTooLarge(usize), #[error("invalid isolation level")] @@ -377,7 +377,6 @@ impl ReportableError for SqlOverHttpError { SqlOverHttpError::ReadPayload(e) => e.get_error_kind(), SqlOverHttpError::ConnectCompute(e) => e.get_error_kind(), SqlOverHttpError::ConnInfo(e) => e.get_error_kind(), - SqlOverHttpError::RequestTooLarge(_) => ErrorKind::User, SqlOverHttpError::ResponseTooLarge(_) => ErrorKind::User, SqlOverHttpError::InvalidIsolationLevel => ErrorKind::User, SqlOverHttpError::Postgres(p) => p.get_error_kind(), @@ -393,7 +392,6 @@ impl UserFacingError for SqlOverHttpError { SqlOverHttpError::ReadPayload(p) => p.to_string(), SqlOverHttpError::ConnectCompute(c) => c.to_string_client(), SqlOverHttpError::ConnInfo(c) => c.to_string_client(), - SqlOverHttpError::RequestTooLarge(_) => self.to_string(), SqlOverHttpError::ResponseTooLarge(_) => self.to_string(), SqlOverHttpError::InvalidIsolationLevel => self.to_string(), SqlOverHttpError::Postgres(p) => p.to_string(), @@ -406,13 +404,12 @@ impl UserFacingError for SqlOverHttpError { impl HttpCodeError for SqlOverHttpError { fn get_http_status_code(&self) -> StatusCode { match self { - SqlOverHttpError::ReadPayload(_) => StatusCode::BAD_REQUEST, + SqlOverHttpError::ReadPayload(e) => e.get_http_status_code(), SqlOverHttpError::ConnectCompute(h) => match h.get_error_kind() { ErrorKind::User => StatusCode::BAD_REQUEST, _ => StatusCode::INTERNAL_SERVER_ERROR, }, SqlOverHttpError::ConnInfo(_) => StatusCode::BAD_REQUEST, - SqlOverHttpError::RequestTooLarge(_) => StatusCode::PAYLOAD_TOO_LARGE, SqlOverHttpError::ResponseTooLarge(_) => StatusCode::INSUFFICIENT_STORAGE, SqlOverHttpError::InvalidIsolationLevel => StatusCode::BAD_REQUEST, SqlOverHttpError::Postgres(_) => StatusCode::BAD_REQUEST, @@ -426,19 +423,41 @@ impl HttpCodeError for SqlOverHttpError { pub(crate) enum ReadPayloadError { #[error("could not read the HTTP request body: {0}")] Read(#[from] hyper::Error), + #[error("request is too large (max is {limit} bytes)")] + BodyTooLarge { limit: usize }, #[error("could not parse the HTTP request body: {0}")] Parse(#[from] serde_json::Error), } +impl From> for ReadPayloadError { + fn from(value: ReadBodyError) -> Self { + match value { + ReadBodyError::BodyTooLarge { limit } => Self::BodyTooLarge { limit }, + ReadBodyError::Read(e) => Self::Read(e), + } + } +} + impl ReportableError for ReadPayloadError { fn get_error_kind(&self) -> ErrorKind { match self { ReadPayloadError::Read(_) => ErrorKind::ClientDisconnect, + ReadPayloadError::BodyTooLarge { .. } => ErrorKind::User, ReadPayloadError::Parse(_) => ErrorKind::User, } } } +impl HttpCodeError for ReadPayloadError { + fn get_http_status_code(&self) -> StatusCode { + match self { + ReadPayloadError::Read(_) => StatusCode::BAD_REQUEST, + ReadPayloadError::BodyTooLarge { .. } => StatusCode::PAYLOAD_TOO_LARGE, + ReadPayloadError::Parse(_) => StatusCode::BAD_REQUEST, + } + } +} + #[derive(Debug, thiserror::Error)] pub(crate) enum SqlOverHttpCancel { #[error("query was cancelled")] @@ -516,7 +535,7 @@ fn map_isolation_level_to_headers(level: IsolationLevel) -> Option async fn handle_inner( cancel: CancellationToken, config: &'static ProxyConfig, - ctx: &RequestMonitoring, + ctx: &RequestContext, request: Request, backend: Arc, ) -> Result>, SqlOverHttpError> { @@ -562,7 +581,7 @@ async fn handle_inner( async fn handle_db_inner( cancel: CancellationToken, config: &'static ProxyConfig, - ctx: &RequestMonitoring, + ctx: &RequestContext, request: Request, conn_info: ConnInfo, auth: AuthData, @@ -580,28 +599,20 @@ async fn handle_db_inner( let parsed_headers = HttpHeaders::try_parse(headers)?; - let request_content_length = match request.body().size_hint().upper() { - Some(v) => v, - None => config.http_config.max_request_size_bytes + 1, - }; - info!(request_content_length, "request size in bytes"); - Metrics::get() - .proxy - .http_conn_content_length_bytes - .observe(HttpDirection::Request, request_content_length as f64); - - // we don't have a streaming request support yet so this is to prevent OOM - // from a malicious user sending an extremely large request body - if request_content_length > config.http_config.max_request_size_bytes { - return Err(SqlOverHttpError::RequestTooLarge( - config.http_config.max_request_size_bytes, - )); - } - let fetch_and_process_request = Box::pin( async { - let body = request.into_body().collect().await?.to_bytes(); - info!(length = body.len(), "request payload read"); + let body = read_body_with_limit( + request.into_body(), + config.http_config.max_request_size_bytes, + ) + .await?; + + Metrics::get() + .proxy + .http_conn_content_length_bytes + .observe(HttpDirection::Request, body.len() as f64); + + debug!(length = body.len(), "request payload read"); let payload: Payload = serde_json::from_slice(&body)?; Ok::(payload) // Adjust error type accordingly } @@ -733,7 +744,7 @@ pub(crate) fn uuid_to_header_value(id: Uuid) -> HeaderValue { } async fn handle_auth_broker_inner( - ctx: &RequestMonitoring, + ctx: &RequestContext, request: Request, conn_info: ConnInfo, jwt: String, @@ -768,6 +779,7 @@ async fn handle_auth_broker_inner( let _metrics = client.metrics(); Ok(client + .inner .inner .send_request(req) .await @@ -968,10 +980,11 @@ async fn query_to_json( current_size: &mut usize, parsed_headers: HttpHeaders, ) -> Result<(ReadyForQueryStatus, impl Serialize), SqlOverHttpError> { - info!("executing query"); + let query_start = Instant::now(); + let query_params = data.params; let mut row_stream = std::pin::pin!(client.query_raw_txt(&data.query, query_params).await?); - info!("finished executing query"); + let query_acknowledged = Instant::now(); // Manually drain the stream into a vector to leave row_stream hanging // around to get a command tag. Also check that the response is not too @@ -990,6 +1003,7 @@ async fn query_to_json( } } + let query_resp_end = Instant::now(); let ready = row_stream.ready_status(); // grab the command tag and number of rows affected @@ -1009,7 +1023,9 @@ async fn query_to_json( rows = rows.len(), ?ready, command_tag, - "finished reading rows" + acknowledgement = ?(query_acknowledged - query_start), + response = ?(query_resp_end - query_start), + "finished executing query" ); let columns_len = row_stream.columns().len(); @@ -1095,3 +1111,63 @@ impl Discard<'_> { } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_payload() { + let payload = "{\"query\":\"SELECT * FROM users WHERE name = ?\",\"params\":[\"test\"],\"arrayMode\":true}"; + let deserialized_payload: Payload = serde_json::from_str(payload).unwrap(); + + match deserialized_payload { + Payload::Single(QueryData { + query, + params, + array_mode, + }) => { + assert_eq!(query, "SELECT * FROM users WHERE name = ?"); + assert_eq!(params, vec![Some(String::from("test"))]); + assert!(array_mode.unwrap()); + } + Payload::Batch(_) => { + panic!("deserialization failed: case with single query, one param, and array mode") + } + } + + let payload = "{\"queries\":[{\"query\":\"SELECT * FROM users0 WHERE name = ?\",\"params\":[\"test0\"], \"arrayMode\":false},{\"query\":\"SELECT * FROM users1 WHERE name = ?\",\"params\":[\"test1\"],\"arrayMode\":true}]}"; + let deserialized_payload: Payload = serde_json::from_str(payload).unwrap(); + + match deserialized_payload { + Payload::Batch(BatchQueryData { queries }) => { + assert_eq!(queries.len(), 2); + for (i, query) in queries.into_iter().enumerate() { + assert_eq!( + query.query, + format!("SELECT * FROM users{i} WHERE name = ?") + ); + assert_eq!(query.params, vec![Some(format!("test{i}"))]); + assert_eq!(query.array_mode.unwrap(), i > 0); + } + } + Payload::Single(_) => panic!("deserialization failed: case with multiple queries"), + } + + let payload = "{\"query\":\"SELECT 1\"}"; + let deserialized_payload: Payload = serde_json::from_str(payload).unwrap(); + + match deserialized_payload { + Payload::Single(QueryData { + query, + params, + array_mode, + }) => { + assert_eq!(query, "SELECT 1"); + assert_eq!(params, vec![]); + assert!(array_mode.is_none()); + } + Payload::Batch(_) => panic!("deserialization failed: case with only one query"), + } + } +} diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index ba36116c2c..4088fea835 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -14,7 +14,7 @@ use tracing::warn; use crate::cancellation::CancellationHandlerMain; use crate::config::ProxyConfig; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::error::{io_error, ReportableError}; use crate::metrics::Metrics; use crate::proxy::{handle_client, ClientMode, ErrorSource}; @@ -126,7 +126,7 @@ impl AsyncBufRead for WebSocketRw { pub(crate) async fn serve_websocket( config: &'static ProxyConfig, auth_backend: &'static crate::auth::Backend<'static, ()>, - ctx: RequestMonitoring, + ctx: RequestContext, websocket: OnUpgrade, cancellation_handler: Arc, endpoint_rate_limiter: Arc, diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 89df48c5d3..11f426819d 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -133,6 +133,7 @@ impl PqStream { msg: &'static str, error_kind: ErrorKind, ) -> Result { + // TODO: only log this for actually interesting errors tracing::info!( kind = error_kind.to_metric_label(), msg, diff --git a/pyproject.toml b/pyproject.toml index 197946fff8..ccd3ab1864 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ authors = [] package-mode = false [tool.poetry.dependencies] -python = "^3.9" +python = "^3.11" pytest = "^7.4.4" psycopg2-binary = "^2.9.10" typing-extensions = "^4.6.1" @@ -51,7 +51,7 @@ testcontainers = "^4.8.1" jsonnet = "^0.20.0" [tool.poetry.group.dev.dependencies] -mypy = "==1.3.0" +mypy = "==1.13.0" ruff = "^0.7.0" [build-system] @@ -89,7 +89,7 @@ module = [ ignore_missing_imports = true [tool.ruff] -target-version = "py39" +target-version = "py311" extend-exclude = [ "vendor/", "target/", @@ -108,6 +108,3 @@ select = [ "B", # bugbear "UP", # pyupgrade ] - -[tool.ruff.lint.pyupgrade] -keep-runtime-typing = true # Remove this stanza when we require Python 3.10 diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 85561e4aff..635a9222e1 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -28,8 +28,10 @@ hyper0.workspace = true futures.workspace = true once_cell.workspace = true parking_lot.workspace = true +pageserver_api.workspace = true postgres.workspace = true postgres-protocol.workspace = true +pprof.workspace = true rand.workspace = true regex.workspace = true scopeguard.workspace = true @@ -57,6 +59,7 @@ sd-notify.workspace = true storage_broker.workspace = true tokio-stream.workspace = true utils.workspace = true +wal_decoder.workspace = true workspace_hack.workspace = true diff --git a/safekeeper/benches/README.md b/safekeeper/benches/README.md index 4119cc8d6e..d73fbccf05 100644 --- a/safekeeper/benches/README.md +++ b/safekeeper/benches/README.md @@ -14,6 +14,10 @@ cargo bench --package safekeeper --bench receive_wal process_msg/fsync=false # List available benchmarks. cargo bench --package safekeeper --benches -- --list + +# Generate flamegraph profiles using pprof-rs, profiling for 10 seconds. +# Output in target/criterion/*/profile/flamegraph.svg. +cargo bench --package safekeeper --bench receive_wal process_msg/fsync=false --profile-time 10 ``` Additional charts and statistics are available in `target/criterion/report/index.html`. diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs index e32d7526ca..c637b4fb24 100644 --- a/safekeeper/benches/receive_wal.rs +++ b/safekeeper/benches/receive_wal.rs @@ -10,6 +10,7 @@ use camino_tempfile::tempfile; use criterion::{criterion_group, criterion_main, BatchSize, Bencher, Criterion}; use itertools::Itertools as _; use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator}; +use pprof::criterion::{Output, PProfProfiler}; use safekeeper::receive_wal::{self, WalAcceptor}; use safekeeper::safekeeper::{ AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, @@ -24,8 +25,9 @@ const GB: usize = 1024 * MB; // Register benchmarks with Criterion. criterion_group!( - benches, - bench_process_msg, + name = benches; + config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = bench_process_msg, bench_wal_acceptor, bench_wal_acceptor_throughput, bench_file_write diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 3f00b69cde..cec7c3c7ee 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -2,11 +2,15 @@ //! protocol commands. use anyhow::Context; +use pageserver_api::models::ShardParameters; +use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; use std::future::Future; use std::str::{self, FromStr}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{debug, info, info_span, Instrument}; +use utils::postgres_client::PostgresClientProtocol; +use utils::shard::{ShardCount, ShardNumber}; use crate::auth::check_permission; use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; @@ -35,6 +39,8 @@ pub struct SafekeeperPostgresHandler { pub tenant_id: Option, pub timeline_id: Option, pub ttid: TenantTimelineId, + pub shard: Option, + pub protocol: Option, /// Unique connection id is logged in spans for observability. pub conn_id: ConnectionId, /// Auth scope allowed on the connections and public key used to check auth tokens. None if auth is not configured. @@ -107,11 +113,28 @@ impl postgres_backend::Handler ) -> Result<(), QueryError> { if let FeStartupPacket::StartupMessage { params, .. } = sm { if let Some(options) = params.options_raw() { + let mut shard_count: Option = None; + let mut shard_number: Option = None; + let mut shard_stripe_size: Option = None; + for opt in options { // FIXME `ztenantid` and `ztimelineid` left for compatibility during deploy, // remove these after the PR gets deployed: // https://github.com/neondatabase/neon/pull/2433#discussion_r970005064 match opt.split_once('=') { + Some(("protocol", value)) => { + let raw_value = value + .parse::() + .with_context(|| format!("Failed to parse {value} as protocol"))?; + + self.protocol = Some( + PostgresClientProtocol::try_from(raw_value).map_err(|_| { + QueryError::Other(anyhow::anyhow!( + "Unexpected client protocol type: {raw_value}" + )) + })?, + ); + } Some(("ztenantid", value)) | Some(("tenant_id", value)) => { self.tenant_id = Some(value.parse().with_context(|| { format!("Failed to parse {value} as tenant id") @@ -127,9 +150,54 @@ impl postgres_backend::Handler metrics.set_client_az(client_az) } } + Some(("shard_count", value)) => { + shard_count = Some(value.parse::().with_context(|| { + format!("Failed to parse {value} as shard count") + })?); + } + Some(("shard_number", value)) => { + shard_number = Some(value.parse::().with_context(|| { + format!("Failed to parse {value} as shard number") + })?); + } + Some(("shard_stripe_size", value)) => { + shard_stripe_size = Some(value.parse::().with_context(|| { + format!("Failed to parse {value} as shard stripe size") + })?); + } _ => continue, } } + + match self.protocol() { + PostgresClientProtocol::Vanilla => { + if shard_count.is_some() + || shard_number.is_some() + || shard_stripe_size.is_some() + { + return Err(QueryError::Other(anyhow::anyhow!( + "Shard params specified for vanilla protocol" + ))); + } + } + PostgresClientProtocol::Interpreted => { + match (shard_count, shard_number, shard_stripe_size) { + (Some(count), Some(number), Some(stripe_size)) => { + let params = ShardParameters { + count: ShardCount(count), + stripe_size: ShardStripeSize(stripe_size), + }; + self.shard = + Some(ShardIdentity::from_params(ShardNumber(number), ¶ms)); + } + _ => { + return Err(QueryError::Other(anyhow::anyhow!( + "Shard params were not specified" + ))); + } + } + } + } } if let Some(app_name) = params.get("application_name") { @@ -150,6 +218,11 @@ impl postgres_backend::Handler tracing::field::debug(self.appname.clone()), ); + if let Some(shard) = self.shard.as_ref() { + tracing::Span::current() + .record("shard", tracing::field::display(shard.shard_slug())); + } + Ok(()) } else { Err(QueryError::Other(anyhow::anyhow!( @@ -258,6 +331,8 @@ impl SafekeeperPostgresHandler { tenant_id: None, timeline_id: None, ttid: TenantTimelineId::empty(), + shard: None, + protocol: None, conn_id, claims: None, auth, @@ -265,6 +340,10 @@ impl SafekeeperPostgresHandler { } } + pub fn protocol(&self) -> PostgresClientProtocol { + self.protocol.unwrap_or(PostgresClientProtocol::Vanilla) + } + // when accessing management api supply None as an argument // when using to authorize tenant pass corresponding tenant id fn check_permission(&self, tenant_id: Option) -> Result<(), QueryError> { diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index df68f8a68e..28294abdb9 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,7 +1,6 @@ -use hyper::{Body, Request, Response, StatusCode, Uri}; -use once_cell::sync::Lazy; +use hyper::{Body, Request, Response, StatusCode}; use serde::{Deserialize, Serialize}; -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::fmt; use std::io::Write as _; use std::str::FromStr; @@ -14,7 +13,9 @@ use tokio_stream::wrappers::ReceiverStream; use tokio_util::sync::CancellationToken; use tracing::{info_span, Instrument}; use utils::failpoint_support::failpoints_handler; -use utils::http::endpoint::{prometheus_metrics_handler, request_span, ChannelWriter}; +use utils::http::endpoint::{ + profile_cpu_handler, prometheus_metrics_handler, request_span, ChannelWriter, +}; use utils::http::request::parse_query_param; use postgres_ffi::WAL_SEGMENT_SIZE; @@ -572,14 +573,8 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder let mut router = endpoint::make_router(); if conf.http_auth.is_some() { router = router.middleware(auth_middleware(|request| { - #[allow(clippy::mutable_key_type)] - static ALLOWLIST_ROUTES: Lazy> = Lazy::new(|| { - ["/v1/status", "/metrics"] - .iter() - .map(|v| v.parse().unwrap()) - .collect() - }); - if ALLOWLIST_ROUTES.contains(request.uri()) { + const ALLOWLIST_ROUTES: &[&str] = &["/v1/status", "/metrics", "/profile/cpu"]; + if ALLOWLIST_ROUTES.contains(&request.uri().path()) { None } else { // Option> is always provided as data below, hence unwrap(). @@ -598,6 +593,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder .data(Arc::new(conf)) .data(auth) .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) + .get("/profile/cpu", |r| request_span(r, profile_cpu_handler)) .get("/v1/status", |r| request_span(r, status_handler)) .put("/v1/failpoints", |r| { request_span(r, move |r| async { diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 6d68b6b59b..abe6e00a66 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -29,6 +29,7 @@ pub mod receive_wal; pub mod recovery; pub mod remove_wal; pub mod safekeeper; +pub mod send_interpreted_wal; pub mod send_wal; pub mod state; pub mod timeline; @@ -38,6 +39,7 @@ pub mod timeline_manager; pub mod timelines_set; pub mod wal_backup; pub mod wal_backup_partial; +pub mod wal_reader_stream; pub mod wal_service; pub mod wal_storage; diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 2edcc4ef6f..bfa1764abf 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -239,6 +239,10 @@ impl SafekeeperPostgresHandler { pgb: &mut PostgresBackend, tli: &mut Option, ) -> Result<(), CopyStreamHandlerEnd> { + // The `tli` parameter is only used for passing _out_ a timeline, one should + // not have been passed in. + assert!(tli.is_none()); + // Notify the libpq client that it's allowed to send `CopyData` messages pgb.write_message(&BeMessage::CopyBothResponse).await?; @@ -256,6 +260,7 @@ impl SafekeeperPostgresHandler { // sends, so this avoids deadlocks. let mut pgb_reader = pgb.split().context("START_WAL_PUSH split")?; let peer_addr = *pgb.get_peer_addr(); + let mut network_reader = NetworkReader { ttid: self.ttid, conn_id: self.conn_id, @@ -275,10 +280,14 @@ impl SafekeeperPostgresHandler { .subscribe(); *tli = Some(timeline.wal_residence_guard().await?); + let timeline_cancel = timeline.cancel.clone(); tokio::select! { // todo: add read|write .context to these errors r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline, next_msg) => r, r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r, + _ = timeline_cancel.cancelled() => { + return Err(CopyStreamHandlerEnd::Cancelled); + } } } else { res.map(|_| ()) @@ -303,7 +312,7 @@ impl SafekeeperPostgresHandler { // Otherwise, WalAcceptor thread must have errored. match wal_acceptor_res { - Ok(Ok(_)) => Ok(()), // can't happen currently; would be if we add graceful termination + Ok(Ok(_)) => Ok(()), // Clean shutdown Ok(Err(e)) => Err(CopyStreamHandlerEnd::Other(e.context("WAL acceptor"))), Err(_) => Err(CopyStreamHandlerEnd::Other(anyhow!( "WalAcceptor task panicked", @@ -356,6 +365,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { Ok((tli, next_msg)) } + /// This function is cancellation-safe (only does network I/O and channel read/writes). async fn run( self, msg_tx: Sender, @@ -397,6 +407,7 @@ async fn read_network_loop( loop { let started = Instant::now(); let size = next_msg.size(); + match msg_tx.send_timeout(next_msg, SLOW_THRESHOLD).await { Ok(()) => {} // Slow send, log a message and keep trying. Log context has timeline ID. @@ -428,6 +439,8 @@ async fn read_network_loop( /// Read replies from WalAcceptor and pass them back to socket. Returns Ok(()) /// if reply_rx closed; it must mean WalAcceptor terminated, joining it should /// tell the error. +/// +/// This function is cancellation-safe (only does network I/O and channel read/writes). async fn network_write( pgb_writer: &mut PostgresBackend, mut reply_rx: Receiver, @@ -461,7 +474,7 @@ async fn network_write( Some(AcceptorProposerMessage::AppendResponse(append_response)) } _ => None, - } + }, }; let Some(msg) = msg else { @@ -527,6 +540,10 @@ impl WalAcceptor { /// The main loop. Returns Ok(()) if either msg_rx or reply_tx got closed; /// it must mean that network thread terminated. + /// + /// This function is *not* cancellation safe, it does local disk I/O: it should always + /// be allowed to run to completion. It respects Timeline::cancel and shuts down cleanly + /// when that gets triggered. async fn run(&mut self) -> anyhow::Result<()> { let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id); @@ -541,7 +558,7 @@ impl WalAcceptor { // Tracks whether we have unflushed appends. let mut dirty = false; - loop { + while !self.tli.is_cancelled() { let reply = tokio::select! { // Process inbound message. msg = self.msg_rx.recv() => { @@ -599,6 +616,10 @@ impl WalAcceptor { WAL_RECEIVER_QUEUE_DEPTH.observe(self.msg_rx.len() as f64); None // no reply } + + _ = self.tli.cancel.cancelled() => { + break; + } }; // Send reply, if any. @@ -610,7 +631,7 @@ impl WalAcceptor { } // Flush WAL on disconnect, see https://github.com/neondatabase/neon/issues/9259. - if dirty { + if dirty && !self.tli.cancel.is_cancelled() { self.tli .process_msg(&ProposerAcceptorMessage::FlushWAL) .await?; diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index 9c4149d8f1..7b87166aa0 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -17,6 +17,7 @@ use tokio::{ use tokio_postgres::replication::ReplicationStream; use tokio_postgres::types::PgLsn; use tracing::*; +use utils::postgres_client::{ConnectionConfigArgs, PostgresClientProtocol}; use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config}; use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE}; @@ -325,7 +326,17 @@ async fn recovery_stream( conf: &SafeKeeperConf, ) -> anyhow::Result { // TODO: pass auth token - let cfg = wal_stream_connection_config(tli.ttid, &donor.pg_connstr, None, None)?; + let connection_conf_args = ConnectionConfigArgs { + protocol: PostgresClientProtocol::Vanilla, + ttid: tli.ttid, + shard_number: None, + shard_count: None, + shard_stripe_size: None, + listen_pg_addr_str: &donor.pg_connstr, + auth_token: None, + availability_zone: None, + }; + let cfg = wal_stream_connection_config(connection_conf_args)?; let mut cfg = cfg.to_tokio_postgres_config(); // It will make safekeeper give out not committed WAL (up to flush_lsn). cfg.application_name(&format!("safekeeper_{}", conf.my_id)); diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs new file mode 100644 index 0000000000..cf0ee276e9 --- /dev/null +++ b/safekeeper/src/send_interpreted_wal.rs @@ -0,0 +1,121 @@ +use std::time::Duration; + +use anyhow::Context; +use futures::StreamExt; +use pageserver_api::shard::ShardIdentity; +use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend}; +use postgres_ffi::MAX_SEND_SIZE; +use postgres_ffi::{get_current_timestamp, waldecoder::WalStreamDecoder}; +use pq_proto::{BeMessage, InterpretedWalRecordsBody, WalSndKeepAlive}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::time::MissedTickBehavior; +use utils::bin_ser::BeSer; +use utils::lsn::Lsn; +use wal_decoder::models::InterpretedWalRecord; + +use crate::send_wal::EndWatchView; +use crate::wal_reader_stream::{WalBytes, WalReaderStreamBuilder}; + +/// Shard-aware interpreted record sender. +/// This is used for sending WAL to the pageserver. Said WAL +/// is pre-interpreted and filtered for the shard. +pub(crate) struct InterpretedWalSender<'a, IO> { + pub(crate) pgb: &'a mut PostgresBackend, + pub(crate) wal_stream_builder: WalReaderStreamBuilder, + pub(crate) end_watch_view: EndWatchView, + pub(crate) shard: ShardIdentity, + pub(crate) pg_version: u32, + pub(crate) appname: Option, +} + +impl InterpretedWalSender<'_, IO> { + /// Send interpreted WAL to a receiver. + /// Stops when an error occurs or the receiver is caught up and there's no active compute. + /// + /// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ? + /// convenience. + pub(crate) async fn run(self) -> Result<(), CopyStreamHandlerEnd> { + let mut wal_position = self.wal_stream_builder.start_pos(); + let mut wal_decoder = + WalStreamDecoder::new(self.wal_stream_builder.start_pos(), self.pg_version); + + let stream = self.wal_stream_builder.build(MAX_SEND_SIZE).await?; + let mut stream = std::pin::pin!(stream); + + let mut keepalive_ticker = tokio::time::interval(Duration::from_secs(1)); + keepalive_ticker.set_missed_tick_behavior(MissedTickBehavior::Skip); + keepalive_ticker.reset(); + + loop { + tokio::select! { + // Get some WAL from the stream and then: decode, interpret and send it + wal = stream.next() => { + let WalBytes { wal, wal_start_lsn: _, wal_end_lsn, available_wal_end_lsn } = match wal { + Some(some) => some?, + None => { break; } + }; + + wal_position = wal_end_lsn; + wal_decoder.feed_bytes(&wal); + + let mut records = Vec::new(); + let mut max_next_record_lsn = None; + while let Some((next_record_lsn, recdata)) = wal_decoder + .poll_decode() + .with_context(|| "Failed to decode WAL")? + { + assert!(next_record_lsn.is_aligned()); + max_next_record_lsn = Some(next_record_lsn); + + // Deserialize and interpret WAL record + let interpreted = InterpretedWalRecord::from_bytes_filtered( + recdata, + &self.shard, + next_record_lsn, + self.pg_version, + ) + .with_context(|| "Failed to interpret WAL")?; + + if !interpreted.is_empty() { + records.push(interpreted); + } + } + + let mut buf = Vec::new(); + records + .ser_into(&mut buf) + .with_context(|| "Failed to serialize interpreted WAL")?; + + // Reset the keep alive ticker since we are sending something + // over the wire now. + keepalive_ticker.reset(); + + self.pgb + .write_message(&BeMessage::InterpretedWalRecords(InterpretedWalRecordsBody { + streaming_lsn: wal_end_lsn.0, + commit_lsn: available_wal_end_lsn.0, + next_record_lsn: max_next_record_lsn.unwrap_or(Lsn::INVALID).0, + data: buf.as_slice(), + })).await?; + } + + // Send a periodic keep alive when the connection has been idle for a while. + _ = keepalive_ticker.tick() => { + self.pgb + .write_message(&BeMessage::KeepAlive(WalSndKeepAlive { + wal_end: self.end_watch_view.get().0, + timestamp: get_current_timestamp(), + request_reply: true, + })) + .await?; + } + } + } + + // The loop above ends when the receiver is caught up and there's no more WAL to send. + Err(CopyStreamHandlerEnd::ServerInitiated(format!( + "ending streaming to {:?} at {}, receiver is caughtup and there is no computes", + self.appname, wal_position, + ))) + } +} diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 6d94ff98b1..1acfcad418 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -5,12 +5,15 @@ use crate::handler::SafekeeperPostgresHandler; use crate::metrics::RECEIVED_PS_FEEDBACKS; use crate::receive_wal::WalReceivers; use crate::safekeeper::{Term, TermLsn}; +use crate::send_interpreted_wal::InterpretedWalSender; use crate::timeline::WalResidentTimeline; +use crate::wal_reader_stream::WalReaderStreamBuilder; use crate::wal_service::ConnectionId; use crate::wal_storage::WalReader; use crate::GlobalTimelines; use anyhow::{bail, Context as AnyhowContext}; use bytes::Bytes; +use futures::future::Either; use parking_lot::Mutex; use postgres_backend::PostgresBackend; use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError}; @@ -22,6 +25,7 @@ use tokio::io::{AsyncRead, AsyncWrite}; use utils::failpoint_support; use utils::id::TenantTimelineId; use utils::pageserver_feedback::PageserverFeedback; +use utils::postgres_client::PostgresClientProtocol; use std::cmp::{max, min}; use std::net::SocketAddr; @@ -226,7 +230,7 @@ impl WalSenders { /// Get remote_consistent_lsn reported by the pageserver. Returns None if /// client is not pageserver. - fn get_ws_remote_consistent_lsn(self: &Arc, id: WalSenderId) -> Option { + pub fn get_ws_remote_consistent_lsn(self: &Arc, id: WalSenderId) -> Option { let shared = self.mutex.lock(); let slot = shared.get_slot(id); match slot.feedback { @@ -370,6 +374,16 @@ pub struct WalSenderGuard { walsenders: Arc, } +impl WalSenderGuard { + pub fn id(&self) -> WalSenderId { + self.id + } + + pub fn walsenders(&self) -> &Arc { + &self.walsenders + } +} + impl Drop for WalSenderGuard { fn drop(&mut self) { self.walsenders.unregister(self.id); @@ -440,11 +454,12 @@ impl SafekeeperPostgresHandler { } info!( - "starting streaming from {:?}, available WAL ends at {}, recovery={}, appname={:?}", + "starting streaming from {:?}, available WAL ends at {}, recovery={}, appname={:?}, protocol={}", start_pos, end_pos, matches!(end_watch, EndWatch::Flush(_)), - appname + appname, + self.protocol(), ); // switch to copy @@ -456,19 +471,51 @@ impl SafekeeperPostgresHandler { // not synchronized with sends, so this avoids deadlocks. let reader = pgb.split().context("START_REPLICATION split")?; - let mut sender = WalSender { - pgb, - // should succeed since we're already holding another guard - tli: tli.wal_residence_guard().await?, - appname, - start_pos, - end_pos, - term, - end_watch, - ws_guard: ws_guard.clone(), - wal_reader, - send_buf: vec![0u8; MAX_SEND_SIZE], + let send_fut = match self.protocol() { + PostgresClientProtocol::Vanilla => { + let sender = WalSender { + pgb, + // should succeed since we're already holding another guard + tli: tli.wal_residence_guard().await?, + appname, + start_pos, + end_pos, + term, + end_watch, + ws_guard: ws_guard.clone(), + wal_reader, + send_buf: vec![0u8; MAX_SEND_SIZE], + }; + + Either::Left(sender.run()) + } + PostgresClientProtocol::Interpreted => { + let pg_version = tli.tli.get_state().await.1.server.pg_version / 10000; + let end_watch_view = end_watch.view(); + let wal_stream_builder = WalReaderStreamBuilder { + tli: tli.wal_residence_guard().await?, + start_pos, + end_pos, + term, + end_watch, + wal_sender_guard: ws_guard.clone(), + }; + + let sender = InterpretedWalSender { + pgb, + wal_stream_builder, + end_watch_view, + shard: self.shard.unwrap(), + pg_version, + appname, + }; + + Either::Right(sender.run()) + } }; + + let tli_cancel = tli.cancel.clone(); + let mut reply_reader = ReplyReader { reader, ws_guard: ws_guard.clone(), @@ -477,8 +524,11 @@ impl SafekeeperPostgresHandler { let res = tokio::select! { // todo: add read|write .context to these errors - r = sender.run() => r, + r = send_fut => r, r = reply_reader.run() => r, + _ = tli_cancel.cancelled() => { + return Err(CopyStreamHandlerEnd::Cancelled); + } }; let ws_state = ws_guard @@ -499,16 +549,22 @@ impl SafekeeperPostgresHandler { } } +/// TODO(vlad): maybe lift this instead /// Walsender streams either up to commit_lsn (normally) or flush_lsn in the /// given term (recovery by walproposer or peer safekeeper). -enum EndWatch { +#[derive(Clone)] +pub(crate) enum EndWatch { Commit(Receiver), Flush(Receiver), } impl EndWatch { + pub(crate) fn view(&self) -> EndWatchView { + EndWatchView(self.clone()) + } + /// Get current end of WAL. - fn get(&self) -> Lsn { + pub(crate) fn get(&self) -> Lsn { match self { EndWatch::Commit(r) => *r.borrow(), EndWatch::Flush(r) => r.borrow().lsn, @@ -516,15 +572,44 @@ impl EndWatch { } /// Wait for the update. - async fn changed(&mut self) -> anyhow::Result<()> { + pub(crate) async fn changed(&mut self) -> anyhow::Result<()> { match self { EndWatch::Commit(r) => r.changed().await?, EndWatch::Flush(r) => r.changed().await?, } Ok(()) } + + pub(crate) async fn wait_for_lsn( + &mut self, + lsn: Lsn, + client_term: Option, + ) -> anyhow::Result { + loop { + let end_pos = self.get(); + if end_pos > lsn { + return Ok(end_pos); + } + if let EndWatch::Flush(rx) = &self { + let curr_term = rx.borrow().term; + if let Some(client_term) = client_term { + if curr_term != client_term { + bail!("term changed: requested {}, now {}", client_term, curr_term); + } + } + } + self.changed().await?; + } + } } +pub(crate) struct EndWatchView(EndWatch); + +impl EndWatchView { + pub(crate) fn get(&self) -> Lsn { + self.0.get() + } +} /// A half driving sending WAL. struct WalSender<'a, IO> { pgb: &'a mut PostgresBackend, @@ -557,10 +642,11 @@ impl WalSender<'_, IO> { /// Send WAL until /// - an error occurs /// - receiver is caughtup and there is no computes (if streaming up to commit_lsn) + /// - timeline's cancellation token fires /// /// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ? /// convenience. - async fn run(&mut self) -> Result<(), CopyStreamHandlerEnd> { + async fn run(mut self) -> Result<(), CopyStreamHandlerEnd> { loop { // Wait for the next portion if it is not there yet, or just // update our end of WAL available for sending value, we @@ -601,15 +687,14 @@ impl WalSender<'_, IO> { }; let send_buf = &send_buf[..send_size]; - // and send it - self.pgb - .write_message(&BeMessage::XLogData(XLogDataBody { - wal_start: self.start_pos.0, - wal_end: self.end_pos.0, - timestamp: get_current_timestamp(), - data: send_buf, - })) - .await?; + // and send it, while respecting Timeline::cancel + let msg = BeMessage::XLogData(XLogDataBody { + wal_start: self.start_pos.0, + wal_end: self.end_pos.0, + timestamp: get_current_timestamp(), + data: send_buf, + }); + self.pgb.write_message(&msg).await?; if let Some(appname) = &self.appname { if appname == "replica" { @@ -674,13 +759,13 @@ impl WalSender<'_, IO> { } } - self.pgb - .write_message(&BeMessage::KeepAlive(WalSndKeepAlive { - wal_end: self.end_pos.0, - timestamp: get_current_timestamp(), - request_reply: true, - })) - .await?; + let msg = BeMessage::KeepAlive(WalSndKeepAlive { + wal_end: self.end_pos.0, + timestamp: get_current_timestamp(), + request_reply: true, + }); + + self.pgb.write_message(&msg).await?; } } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 85add6bfea..ef928f7633 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -9,6 +9,7 @@ use serde::{Deserialize, Serialize}; use tokio::fs::{self}; use tokio_util::sync::CancellationToken; use utils::id::TenantId; +use utils::sync::gate::Gate; use std::cmp::max; use std::ops::{Deref, DerefMut}; @@ -467,6 +468,10 @@ pub struct Timeline { timeline_dir: Utf8PathBuf, manager_ctl: ManagerCtl, + /// Hold this gate from code that depends on the Timeline's non-shut-down state. While holding + /// this gate, you must respect [`Timeline::cancel`] + pub(crate) gate: Gate, + /// Delete/cancel will trigger this, background tasks should drop out as soon as it fires pub(crate) cancel: CancellationToken, @@ -508,6 +513,7 @@ impl Timeline { mutex: RwLock::new(shared_state), walsenders: WalSenders::new(walreceivers.clone()), walreceivers, + gate: Default::default(), cancel: CancellationToken::default(), manager_ctl: ManagerCtl::new(), broker_active: AtomicBool::new(false), @@ -533,56 +539,6 @@ impl Timeline { )) } - /// Initialize fresh timeline on disk and start background tasks. If init - /// fails, timeline is cancelled and cannot be used anymore. - /// - /// Init is transactional, so if it fails, created files will be deleted, - /// and state on disk should remain unchanged. - pub async fn init_new( - self: &Arc, - shared_state: &mut WriteGuardSharedState<'_>, - conf: &SafeKeeperConf, - broker_active_set: Arc, - partial_backup_rate_limiter: RateLimiter, - ) -> Result<()> { - match fs::metadata(&self.timeline_dir).await { - Ok(_) => { - // Timeline directory exists on disk, we should leave state unchanged - // and return error. - bail!(TimelineError::Invalid(self.ttid)); - } - Err(e) if e.kind() == std::io::ErrorKind::NotFound => {} - Err(e) => { - return Err(e.into()); - } - } - - // Create timeline directory. - fs::create_dir_all(&self.timeline_dir).await?; - - // Write timeline to disk and start background tasks. - if let Err(e) = shared_state.sk.state_mut().flush().await { - // Bootstrap failed, cancel timeline and remove timeline directory. - self.cancel(shared_state); - - if let Err(fs_err) = fs::remove_dir_all(&self.timeline_dir).await { - warn!( - "failed to remove timeline {} directory after bootstrap failure: {}", - self.ttid, fs_err - ); - } - - return Err(e); - } - self.bootstrap( - shared_state, - conf, - broker_active_set, - partial_backup_rate_limiter, - ); - Ok(()) - } - /// Bootstrap new or existing timeline starting background tasks. pub fn bootstrap( self: &Arc, @@ -593,33 +549,61 @@ impl Timeline { ) { let (tx, rx) = self.manager_ctl.bootstrap_manager(); + let Ok(gate_guard) = self.gate.enter() else { + // Init raced with shutdown + return; + }; + // Start manager task which will monitor timeline state and update // background tasks. - tokio::spawn(timeline_manager::main_task( - ManagerTimeline { tli: self.clone() }, - conf.clone(), - broker_active_set, - tx, - rx, - partial_backup_rate_limiter, - )); + tokio::spawn({ + let this = self.clone(); + let conf = conf.clone(); + async move { + let _gate_guard = gate_guard; + timeline_manager::main_task( + ManagerTimeline { tli: this }, + conf, + broker_active_set, + tx, + rx, + partial_backup_rate_limiter, + ) + .await + } + }); + } + + /// Background timeline activities (which hold Timeline::gate) will no + /// longer run once this function completes. + pub async fn shutdown(&self) { + info!("timeline {} shutting down", self.ttid); + self.cancel.cancel(); + + // Wait for any concurrent tasks to stop using this timeline, to avoid e.g. attempts + // to read deleted files. + self.gate.close().await; } /// Delete timeline from disk completely, by removing timeline directory. - /// Background timeline activities will stop eventually. /// /// Also deletes WAL in s3. Might fail if e.g. s3 is unavailable, but /// deletion API endpoint is retriable. + /// + /// Timeline must be in shut-down state (i.e. call [`Self::shutdown`] first) pub async fn delete( &self, shared_state: &mut WriteGuardSharedState<'_>, only_local: bool, ) -> Result { - self.cancel(shared_state); + // Assert that [`Self::shutdown`] was already called + assert!(self.cancel.is_cancelled()); + assert!(self.gate.close_complete()); + + // Close associated FDs. Nobody will be able to touch timeline data once + // it is cancelled, so WAL storage won't be opened again. + shared_state.sk.close_wal_store(); - // TODO: It's better to wait for s3 offloader termination before - // removing data from s3. Though since s3 doesn't have transactions it - // still wouldn't guarantee absense of data after removal. let conf = GlobalTimelines::get_global_config(); if !only_local && conf.is_wal_backup_enabled() { // Note: we concurrently delete remote storage data from multiple @@ -631,16 +615,6 @@ impl Timeline { Ok(dir_existed) } - /// Cancel timeline to prevent further usage. Background tasks will stop - /// eventually after receiving cancellation signal. - fn cancel(&self, shared_state: &mut WriteGuardSharedState<'_>) { - info!("timeline {} is cancelled", self.ttid); - self.cancel.cancel(); - // Close associated FDs. Nobody will be able to touch timeline data once - // it is cancelled, so WAL storage won't be opened again. - shared_state.sk.close_wal_store(); - } - /// Returns if timeline is cancelled. pub fn is_cancelled(&self) -> bool { self.cancel.is_cancelled() diff --git a/safekeeper/src/timeline_guard.rs b/safekeeper/src/timeline_guard.rs index 1ddac573d2..9102a40df8 100644 --- a/safekeeper/src/timeline_guard.rs +++ b/safekeeper/src/timeline_guard.rs @@ -7,6 +7,7 @@ use std::collections::HashSet; use tracing::debug; +use utils::sync::gate::GateGuard; use crate::timeline_manager::ManagerCtlMessage; @@ -16,6 +17,12 @@ pub struct GuardId(u64); pub struct ResidenceGuard { manager_tx: tokio::sync::mpsc::UnboundedSender, guard_id: GuardId, + + /// [`ResidenceGuard`] represents a guarantee that a timeline's data remains resident, + /// which by extension also means the timeline is not shut down (since after shut down + /// our data may be deleted). Therefore everyone holding a residence guard must also + /// hold a guard on [`crate::timeline::Timeline::gate`] + _gate_guard: GateGuard, } impl Drop for ResidenceGuard { @@ -52,7 +59,8 @@ impl AccessService { self.guards.is_empty() } - pub(crate) fn create_guard(&mut self) -> ResidenceGuard { + /// `timeline_gate_guard` is a guarantee that the timeline is not shut down + pub(crate) fn create_guard(&mut self, timeline_gate_guard: GateGuard) -> ResidenceGuard { let guard_id = self.next_guard_id; self.next_guard_id += 1; self.guards.insert(guard_id); @@ -63,6 +71,7 @@ impl AccessService { ResidenceGuard { manager_tx: self.manager_tx.clone(), guard_id, + _gate_guard: timeline_gate_guard, } } diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index e9fed21bf5..c02fb904cf 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -266,8 +266,10 @@ pub async fn main_task( // Start recovery task which always runs on the timeline. if !mgr.is_offloaded && mgr.conf.peer_recovery_enabled { - let tli = mgr.wal_resident_timeline(); - mgr.recovery_task = Some(tokio::spawn(recovery_main(tli, mgr.conf.clone()))); + // Recovery task is only spawned if we can get a residence guard (i.e. timeline is not already shutting down) + if let Ok(tli) = mgr.wal_resident_timeline() { + mgr.recovery_task = Some(tokio::spawn(recovery_main(tli, mgr.conf.clone()))); + } } // If timeline is evicted, reflect that in the metric. @@ -375,6 +377,13 @@ pub async fn main_task( // shutdown background tasks if mgr.conf.is_wal_backup_enabled() { + if let Some(backup_task) = mgr.backup_task.take() { + // If we fell through here, then the timeline is shutting down. This is important + // because otherwise joining on the wal_backup handle might hang. + assert!(mgr.tli.cancel.is_cancelled()); + + backup_task.join().await; + } wal_backup::update_task(&mut mgr, false, &last_state).await; } @@ -442,10 +451,18 @@ impl Manager { /// Get a WalResidentTimeline. /// Manager code must use this function instead of one from `Timeline` /// directly, because it will deadlock. - pub(crate) fn wal_resident_timeline(&mut self) -> WalResidentTimeline { + /// + /// This function is fallible because the guard may not be created if the timeline is + /// shutting down. + pub(crate) fn wal_resident_timeline(&mut self) -> anyhow::Result { assert!(!self.is_offloaded); - let guard = self.access_service.create_guard(); - WalResidentTimeline::new(self.tli.clone(), guard) + let guard = self.access_service.create_guard( + self.tli + .gate + .enter() + .map_err(|_| anyhow::anyhow!("Timeline shutting down"))?, + ); + Ok(WalResidentTimeline::new(self.tli.clone(), guard)) } /// Get a snapshot of the timeline state. @@ -559,6 +576,11 @@ impl Manager { if removal_horizon_segno > self.last_removed_segno { // we need to remove WAL + let Ok(timeline_gate_guard) = self.tli.gate.enter() else { + tracing::info!("Timeline shutdown, not spawning WAL removal task"); + return; + }; + let remover = match self.tli.read_shared_state().await.sk { StateSK::Loaded(ref sk) => { crate::wal_storage::Storage::remove_up_to(&sk.wal_store, removal_horizon_segno) @@ -573,6 +595,8 @@ impl Manager { self.wal_removal_task = Some(tokio::spawn( async move { + let _timeline_gate_guard = timeline_gate_guard; + remover.await?; Ok(removal_horizon_segno) } @@ -619,10 +643,15 @@ impl Manager { return; } + let Ok(resident) = self.wal_resident_timeline() else { + // Shutting down + return; + }; + // Get WalResidentTimeline and start partial backup task. let cancel = CancellationToken::new(); let handle = tokio::spawn(wal_backup_partial::main_task( - self.wal_resident_timeline(), + resident, self.conf.clone(), self.global_rate_limiter.clone(), cancel.clone(), @@ -664,7 +693,7 @@ impl Manager { self.partial_backup_task = None; } - let tli = self.wal_resident_timeline(); + let tli = self.wal_resident_timeline()?; let mut partial_backup = PartialBackup::new(tli, self.conf.clone()).await; // Reset might fail e.g. when cfile is already reset but s3 removal // failed, so set manager state to None beforehand. In any case caller @@ -688,7 +717,12 @@ impl Manager { let guard = if self.is_offloaded { Err(anyhow::anyhow!("timeline is offloaded, can't get a guard")) } else { - Ok(self.access_service.create_guard()) + match self.tli.gate.enter() { + Ok(gate_guard) => Ok(self.access_service.create_guard(gate_guard)), + Err(_) => Err(anyhow::anyhow!( + "timeline is shutting down, can't get a guard" + )), + } }; if tx.send(guard).is_err() { @@ -699,7 +733,10 @@ impl Manager { let result = if self.is_offloaded { None } else { - Some(self.access_service.create_guard()) + match self.tli.gate.enter() { + Ok(gate_guard) => Some(self.access_service.create_guard(gate_guard)), + Err(_) => None, + } }; if tx.send(result).is_err() { diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 33d94da034..067945fd5f 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -457,10 +457,12 @@ impl GlobalTimelines { Ok(timeline) => { let was_active = timeline.broker_active.load(Ordering::Relaxed); + info!("deleting timeline {}, only_local={}", ttid, only_local); + timeline.shutdown().await; + // Take a lock and finish the deletion holding this mutex. let mut shared_state = timeline.write_shared_state().await; - info!("deleting timeline {}, only_local={}", ttid, only_local); let dir_existed = timeline.delete(&mut shared_state, only_local).await?; Ok(TimelineDeleteForceResult { diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 6c87e5a926..34b5dbeaa1 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -25,7 +25,6 @@ use tokio::fs::File; use tokio::select; use tokio::sync::mpsc::{self, Receiver, Sender}; use tokio::sync::{watch, OnceCell}; -use tokio::time::sleep; use tracing::*; use utils::{id::TenantTimelineId, lsn::Lsn}; @@ -46,6 +45,14 @@ pub struct WalBackupTaskHandle { handle: JoinHandle<()>, } +impl WalBackupTaskHandle { + pub(crate) async fn join(self) { + if let Err(e) = self.handle.await { + error!("WAL backup task panicked: {}", e); + } + } +} + /// Do we have anything to upload to S3, i.e. should safekeepers run backup activity? pub(crate) fn is_wal_backup_required( wal_seg_size: usize, @@ -74,11 +81,12 @@ pub(crate) async fn update_task(mgr: &mut Manager, need_backup: bool, state: &St let (shutdown_tx, shutdown_rx) = mpsc::channel(1); - let async_task = backup_task_main( - mgr.wal_resident_timeline(), - mgr.conf.backup_parallel_jobs, - shutdown_rx, - ); + let Ok(resident) = mgr.wal_resident_timeline() else { + info!("Timeline shut down"); + return; + }; + + let async_task = backup_task_main(resident, mgr.conf.backup_parallel_jobs, shutdown_rx); let handle = if mgr.conf.current_thread_runtime { tokio::spawn(async_task) @@ -108,9 +116,7 @@ async fn shut_down_task(entry: &mut Option) { // Tell the task to shutdown. Error means task exited earlier, that's ok. let _ = wb_handle.shutdown_tx.send(()).await; // Await the task itself. TODO: restart panicked tasks earlier. - if let Err(e) = wb_handle.handle.await { - warn!("WAL backup task panicked: {}", e); - } + wb_handle.join().await; } } @@ -214,6 +220,7 @@ async fn backup_task_main( let _guard = WAL_BACKUP_TASKS.guard(); info!("started"); + let cancel = tli.tli.cancel.clone(); let mut wb = WalBackupTask { wal_seg_size: tli.get_wal_seg_size().await, commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(), @@ -230,25 +237,34 @@ async fn backup_task_main( _ = wb.run() => {} _ = shutdown_rx.recv() => { canceled = true; + }, + _ = cancel.cancelled() => { + canceled = true; } } info!("task {}", if canceled { "canceled" } else { "terminated" }); } impl WalBackupTask { + /// This function must be called from a select! that also respects self.timeline's + /// cancellation token. This is done in [`backup_task_main`]. + /// + /// The future returned by this function is safe to drop at any time because it + /// does not write to local disk. async fn run(&mut self) { let mut backup_lsn = Lsn(0); let mut retry_attempt = 0u32; // offload loop - loop { + while !self.timeline.cancel.is_cancelled() { if retry_attempt == 0 { // wait for new WAL to arrive if let Err(e) = self.commit_lsn_watch_rx.changed().await { - // should never happen, as we hold Arc to timeline. + // should never happen, as we hold Arc to timeline and transmitter's lifetime + // is within Timeline's error!("commit_lsn watch shut down: {:?}", e); return; - } + }; } else { // or just sleep if we errored previously let mut retry_delay = UPLOAD_FAILURE_RETRY_MAX_MS; @@ -256,7 +272,7 @@ impl WalBackupTask { { retry_delay = min(retry_delay, backoff_delay); } - sleep(Duration::from_millis(retry_delay)).await; + tokio::time::sleep(Duration::from_millis(retry_delay)).await; } let commit_lsn = *self.commit_lsn_watch_rx.borrow(); diff --git a/safekeeper/src/wal_reader_stream.rs b/safekeeper/src/wal_reader_stream.rs new file mode 100644 index 0000000000..f8c0c502cd --- /dev/null +++ b/safekeeper/src/wal_reader_stream.rs @@ -0,0 +1,149 @@ +use std::sync::Arc; + +use async_stream::try_stream; +use bytes::Bytes; +use futures::Stream; +use postgres_backend::CopyStreamHandlerEnd; +use std::time::Duration; +use tokio::time::timeout; +use utils::lsn::Lsn; + +use crate::{ + safekeeper::Term, + send_wal::{EndWatch, WalSenderGuard}, + timeline::WalResidentTimeline, +}; + +pub(crate) struct WalReaderStreamBuilder { + pub(crate) tli: WalResidentTimeline, + pub(crate) start_pos: Lsn, + pub(crate) end_pos: Lsn, + pub(crate) term: Option, + pub(crate) end_watch: EndWatch, + pub(crate) wal_sender_guard: Arc, +} + +impl WalReaderStreamBuilder { + pub(crate) fn start_pos(&self) -> Lsn { + self.start_pos + } +} + +pub(crate) struct WalBytes { + /// Raw PG WAL + pub(crate) wal: Bytes, + /// Start LSN of [`Self::wal`] + #[allow(dead_code)] + pub(crate) wal_start_lsn: Lsn, + /// End LSN of [`Self::wal`] + pub(crate) wal_end_lsn: Lsn, + /// End LSN of WAL available on the safekeeper. + /// + /// For pagservers this will be commit LSN, + /// while for the compute it will be the flush LSN. + pub(crate) available_wal_end_lsn: Lsn, +} + +impl WalReaderStreamBuilder { + /// Builds a stream of Postgres WAL starting from [`Self::start_pos`]. + /// The stream terminates when the receiver (pageserver) is fully caught up + /// and there's no active computes. + pub(crate) async fn build( + self, + buffer_size: usize, + ) -> anyhow::Result>> { + // TODO(vlad): The code below duplicates functionality from [`crate::send_wal`]. + // We can make the raw WAL sender use this stream too and remove the duplication. + let Self { + tli, + mut start_pos, + mut end_pos, + term, + mut end_watch, + wal_sender_guard, + } = self; + let mut wal_reader = tli.get_walreader(start_pos).await?; + let mut buffer = vec![0; buffer_size]; + + const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); + + Ok(try_stream! { + loop { + let have_something_to_send = end_pos > start_pos; + + if !have_something_to_send { + // wait for lsn + let res = timeout(POLL_STATE_TIMEOUT, end_watch.wait_for_lsn(start_pos, term)).await; + match res { + Ok(ok) => { + end_pos = ok?; + }, + Err(_) => { + if let EndWatch::Commit(_) = end_watch { + if let Some(remote_consistent_lsn) = wal_sender_guard + .walsenders() + .get_ws_remote_consistent_lsn(wal_sender_guard.id()) + { + if tli.should_walsender_stop(remote_consistent_lsn).await { + // Stop streaming if the receivers are caught up and + // there's no active compute. This causes the loop in + // [`crate::send_interpreted_wal::InterpretedWalSender::run`] + // to exit and terminate the WAL stream. + return; + } + } + } + + continue; + } + } + } + + + assert!( + end_pos > start_pos, + "nothing to send after waiting for WAL" + ); + + // try to send as much as available, capped by the buffer size + let mut chunk_end_pos = start_pos + buffer_size as u64; + // if we went behind available WAL, back off + if chunk_end_pos >= end_pos { + chunk_end_pos = end_pos; + } else { + // If sending not up to end pos, round down to page boundary to + // avoid breaking WAL record not at page boundary, as protocol + // demands. See walsender.c (XLogSendPhysical). + chunk_end_pos = chunk_end_pos + .checked_sub(chunk_end_pos.block_offset()) + .unwrap(); + } + let send_size = (chunk_end_pos.0 - start_pos.0) as usize; + let buffer = &mut buffer[..send_size]; + let send_size: usize; + { + // If uncommitted part is being pulled, check that the term is + // still the expected one. + let _term_guard = if let Some(t) = term { + Some(tli.acquire_term(t).await?) + } else { + None + }; + // Read WAL into buffer. send_size can be additionally capped to + // segment boundary here. + send_size = wal_reader.read(buffer).await? + }; + let wal = Bytes::copy_from_slice(&buffer[..send_size]); + + yield WalBytes { + wal, + wal_start_lsn: start_pos, + wal_end_lsn: start_pos + send_size as u64, + available_wal_end_lsn: end_pos + }; + + start_pos += send_size as u64; + } + }) + } +} diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py index 9312f8b3e7..3fb668ed2d 100755 --- a/scripts/flaky_tests.py +++ b/scripts/flaky_tests.py @@ -14,7 +14,7 @@ import psycopg2.extras import toml if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any FLAKY_TESTS_QUERY = """ SELECT @@ -65,7 +65,7 @@ def main(args: argparse.Namespace): pageserver_virtual_file_io_engine_parameter = "" # re-use existing records of flaky tests from before parametrization by compaction_algorithm - def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[dict[str, Any]]: + def get_pageserver_default_tenant_config_compaction_algorithm() -> dict[str, Any] | None: """Duplicated from parametrize.py""" toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM") if toml_table is None: diff --git a/scripts/force_layer_download.py b/scripts/force_layer_download.py index a4fd3f6132..835e28c5d6 100644 --- a/scripts/force_layer_download.py +++ b/scripts/force_layer_download.py @@ -194,9 +194,11 @@ async def main_impl(args, report_out, client: Client): tenant_ids = await client.get_tenant_ids() get_timeline_id_coros = [client.get_timeline_ids(tenant_id) for tenant_id in tenant_ids] gathered = await asyncio.gather(*get_timeline_id_coros, return_exceptions=True) - assert len(tenant_ids) == len(gathered) tenant_and_timline_ids = [] - for tid, tlids in zip(tenant_ids, gathered): + for tid, tlids in zip(tenant_ids, gathered, strict=True): + # TODO: add error handling if tlids isinstance(Exception) + assert isinstance(tlids, list) + for tlid in tlids: tenant_and_timline_ids.append((tid, tlid)) elif len(comps) == 1: diff --git a/scripts/ingest_regress_test_result-new-format.py b/scripts/ingest_regress_test_result-new-format.py index e0dd0a7189..064c516718 100644 --- a/scripts/ingest_regress_test_result-new-format.py +++ b/scripts/ingest_regress_test_result-new-format.py @@ -11,7 +11,7 @@ import re import sys from contextlib import contextmanager from dataclasses import dataclass -from datetime import datetime, timezone +from datetime import UTC, datetime from pathlib import Path import backoff @@ -31,6 +31,7 @@ CREATE TABLE IF NOT EXISTS results ( duration INT NOT NULL, flaky BOOLEAN NOT NULL, arch arch DEFAULT 'X64', + lfc BOOLEAN DEFAULT false NOT NULL, build_type TEXT NOT NULL, pg_version INT NOT NULL, run_id BIGINT NOT NULL, @@ -54,6 +55,7 @@ class Row: duration: int flaky: bool arch: str + lfc: bool build_type: str pg_version: int run_id: int @@ -132,6 +134,7 @@ def ingest_test_result( if p["name"].startswith("__") } arch = parameters.get("arch", "UNKNOWN").strip("'") + lfc = parameters.get("lfc", "False") == "True" build_type, pg_version, unparametrized_name = parse_test_name(test["name"]) labels = {label["name"]: label["value"] for label in test["labels"]} @@ -140,11 +143,12 @@ def ingest_test_result( suite=labels["suite"], name=unparametrized_name, status=test["status"], - started_at=datetime.fromtimestamp(test["time"]["start"] / 1000, tz=timezone.utc), - stopped_at=datetime.fromtimestamp(test["time"]["stop"] / 1000, tz=timezone.utc), + started_at=datetime.fromtimestamp(test["time"]["start"] / 1000, tz=UTC), + stopped_at=datetime.fromtimestamp(test["time"]["stop"] / 1000, tz=UTC), duration=test["time"]["duration"], flaky=test["flaky"] or test["retriesStatusChange"], arch=arch, + lfc=lfc, build_type=build_type, pg_version=pg_version, run_id=run_id, diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index 91668a42a7..b026efbc3b 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -21,7 +21,7 @@ use utils::{backoff, id::TenantId}; use crate::{ cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}, init_remote, list_objects_with_retries, - metadata_stream::{stream_tenant_timelines, stream_tenants}, + metadata_stream::{stream_tenant_timelines, stream_tenants_maybe_prefix}, BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, MAX_RETRIES, }; @@ -118,9 +118,17 @@ pub async fn find_garbage( console_config: ConsoleConfig, depth: TraversingDepth, node_kind: NodeKind, + tenant_id_prefix: Option, output_path: String, ) -> anyhow::Result<()> { - let garbage = find_garbage_inner(bucket_config, console_config, depth, node_kind).await?; + let garbage = find_garbage_inner( + bucket_config, + console_config, + depth, + node_kind, + tenant_id_prefix, + ) + .await?; let serialized = serde_json::to_vec_pretty(&garbage)?; tokio::fs::write(&output_path, &serialized).await?; @@ -152,6 +160,7 @@ async fn find_garbage_inner( console_config: ConsoleConfig, depth: TraversingDepth, node_kind: NodeKind, + tenant_id_prefix: Option, ) -> anyhow::Result { // Construct clients for S3 and for Console API let (remote_client, target) = init_remote(bucket_config.clone(), node_kind).await?; @@ -178,7 +187,7 @@ async fn find_garbage_inner( // Enumerate Tenants in S3, and check if each one exists in Console tracing::info!("Finding all tenants in {}...", bucket_config.desc_str()); - let tenants = stream_tenants(&remote_client, &target); + let tenants = stream_tenants_maybe_prefix(&remote_client, &target, tenant_id_prefix); let tenants_checked = tenants.map_ok(|t| { let api_client = cloud_admin_api_client.clone(); let console_cache = console_cache.clone(); diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index 0ffb570984..92979d609e 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -54,6 +54,8 @@ enum Command { node_kind: NodeKind, #[arg(short, long, default_value_t=TraversingDepth::Tenant)] depth: TraversingDepth, + #[arg(short, long, default_value=None)] + tenant_id_prefix: Option, #[arg(short, long, default_value_t = String::from("garbage.json"))] output_path: String, }, @@ -209,10 +211,19 @@ async fn main() -> anyhow::Result<()> { Command::FindGarbage { node_kind, depth, + tenant_id_prefix, output_path, } => { let console_config = ConsoleConfig::from_env()?; - find_garbage(bucket_config, console_config, depth, node_kind, output_path).await + find_garbage( + bucket_config, + console_config, + depth, + node_kind, + tenant_id_prefix, + output_path, + ) + .await } Command::PurgeGarbage { input_path, diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs index efda7c213d..47447d681c 100644 --- a/storage_scrubber/src/metadata_stream.rs +++ b/storage_scrubber/src/metadata_stream.rs @@ -17,9 +17,20 @@ use utils::id::{TenantId, TimelineId}; pub fn stream_tenants<'a>( remote_client: &'a GenericRemoteStorage, target: &'a RootTarget, +) -> impl Stream> + 'a { + stream_tenants_maybe_prefix(remote_client, target, None) +} +/// Given a remote storage and a target, output a stream of TenantIds discovered via listing prefixes +pub fn stream_tenants_maybe_prefix<'a>( + remote_client: &'a GenericRemoteStorage, + target: &'a RootTarget, + tenant_id_prefix: Option, ) -> impl Stream> + 'a { try_stream! { - let tenants_target = target.tenants_root(); + let mut tenants_target = target.tenants_root(); + if let Some(tenant_id_prefix) = tenant_id_prefix { + tenants_target.prefix_in_bucket += &tenant_id_prefix; + } let mut tenants_stream = std::pin::pin!(stream_objects_with_retries(remote_client, ListingMode::WithDelimiter, &tenants_target)); while let Some(chunk) = tenants_stream.next().await { diff --git a/test_runner/README.md b/test_runner/README.md index 55d8d2faa9..f342ef8aaa 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -113,7 +113,7 @@ The test suite has a Python enum with equal name but different meaning: ```python @enum.unique -class RemoteStorageKind(str, enum.Enum): +class RemoteStorageKind(StrEnum): LOCAL_FS = "local_fs" MOCK_S3 = "mock_s3" REAL_S3 = "real_s3" diff --git a/test_runner/fixtures/auth_tokens.py b/test_runner/fixtures/auth_tokens.py index be16be81de..8382ce20b3 100644 --- a/test_runner/fixtures/auth_tokens.py +++ b/test_runner/fixtures/auth_tokens.py @@ -1,7 +1,7 @@ from __future__ import annotations from dataclasses import dataclass -from enum import Enum +from enum import StrEnum from typing import Any import jwt @@ -37,8 +37,7 @@ class AuthKeys: return self.generate_token(scope=TokenScope.TENANT, tenant_id=str(tenant_id)) -# TODO: Replace with `StrEnum` when we upgrade to python 3.11 -class TokenScope(str, Enum): +class TokenScope(StrEnum): ADMIN = "admin" PAGE_SERVER_API = "pageserverapi" GENERATIONS_API = "generations_api" diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 8e68775471..bb8e75902e 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -9,6 +9,7 @@ import re import timeit from contextlib import contextmanager from datetime import datetime +from enum import StrEnum from pathlib import Path from typing import TYPE_CHECKING @@ -24,8 +25,7 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonPageserver if TYPE_CHECKING: - from collections.abc import Iterator, Mapping - from typing import Callable, Optional + from collections.abc import Callable, Iterator, Mapping """ @@ -61,7 +61,7 @@ class PgBenchRunResult: number_of_threads: int number_of_transactions_actually_processed: int latency_average: float - latency_stddev: Optional[float] + latency_stddev: float | None tps: float run_duration: float run_start_timestamp: int @@ -171,14 +171,14 @@ _PGBENCH_INIT_EXTRACTORS: Mapping[str, re.Pattern[str]] = { @dataclasses.dataclass class PgBenchInitResult: - total: Optional[float] - drop_tables: Optional[float] - create_tables: Optional[float] - client_side_generate: Optional[float] - server_side_generate: Optional[float] - vacuum: Optional[float] - primary_keys: Optional[float] - foreign_keys: Optional[float] + total: float | None + drop_tables: float | None + create_tables: float | None + client_side_generate: float | None + server_side_generate: float | None + vacuum: float | None + primary_keys: float | None + foreign_keys: float | None duration: float start_timestamp: int end_timestamp: int @@ -196,7 +196,7 @@ class PgBenchInitResult: last_line = stderr.splitlines()[-1] - timings: dict[str, Optional[float]] = {} + timings: dict[str, float | None] = {} last_line_items = re.split(r"\(|\)|,", last_line) for item in last_line_items: for key, regex in _PGBENCH_INIT_EXTRACTORS.items(): @@ -227,7 +227,7 @@ class PgBenchInitResult: @enum.unique -class MetricReport(str, enum.Enum): # str is a hack to make it json serializable +class MetricReport(StrEnum): # str is a hack to make it json serializable # this means that this is a constant test parameter # like number of transactions, or number of clients TEST_PARAM = "test_param" @@ -256,9 +256,8 @@ class NeonBenchmarker: metric_value: float, unit: str, report: MetricReport, - labels: Optional[ - dict[str, str] - ] = None, # use this to associate additional key/value pairs in json format for associated Neon object IDs like project ID with the metric + # use this to associate additional key/value pairs in json format for associated Neon object IDs like project ID with the metric + labels: dict[str, str] | None = None, ): """ Record a benchmark result. @@ -412,7 +411,7 @@ class NeonBenchmarker: self, pageserver: NeonPageserver, metric_name: str, - label_filters: Optional[dict[str, str]] = None, + label_filters: dict[str, str] | None = None, ) -> int: """Fetch the value of given int counter from pageserver metrics.""" all_metrics = pageserver.http_client().get_metrics() diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py index 0ea7148f50..6c22b31e00 100644 --- a/test_runner/fixtures/common_types.py +++ b/test_runner/fixtures/common_types.py @@ -2,14 +2,14 @@ from __future__ import annotations import random from dataclasses import dataclass -from enum import Enum +from enum import StrEnum from functools import total_ordering from typing import TYPE_CHECKING, TypeVar from typing_extensions import override if TYPE_CHECKING: - from typing import Any, Union + from typing import Any T = TypeVar("T", bound="Id") @@ -24,7 +24,7 @@ class Lsn: representation is like "1/0123abcd". See also pg_lsn datatype in Postgres """ - def __init__(self, x: Union[int, str]): + def __init__(self, x: int | str): if isinstance(x, int): self.lsn_int = x else: @@ -67,7 +67,7 @@ class Lsn: return NotImplemented return self.lsn_int - other.lsn_int - def __add__(self, other: Union[int, Lsn]) -> Lsn: + def __add__(self, other: int | Lsn) -> Lsn: if isinstance(other, int): return Lsn(self.lsn_int + other) elif isinstance(other, Lsn): @@ -190,8 +190,23 @@ class TenantTimelineId: ) -# Workaround for compat with python 3.9, which does not have `typing.Self` -TTenantShardId = TypeVar("TTenantShardId", bound="TenantShardId") +@dataclass +class ShardIndex: + shard_number: int + shard_count: int + + # cf impl Display for ShardIndex + @override + def __str__(self) -> str: + return f"{self.shard_number:02x}{self.shard_count:02x}" + + @classmethod + def parse(cls: type[ShardIndex], input: str) -> ShardIndex: + assert len(input) == 4 + return cls( + shard_number=int(input[0:2], 16), + shard_count=int(input[2:4], 16), + ) class TenantShardId: @@ -202,7 +217,7 @@ class TenantShardId: assert self.shard_number < self.shard_count or self.shard_count == 0 @classmethod - def parse(cls: type[TTenantShardId], input: str) -> TTenantShardId: + def parse(cls: type[TenantShardId], input: str) -> TenantShardId: if len(input) == 32: return cls( tenant_id=TenantId(input), @@ -226,6 +241,10 @@ class TenantShardId: # Unsharded case: equivalent of Rust TenantShardId::unsharded(tenant_id) return str(self.tenant_id) + @property + def shard_index(self) -> ShardIndex: + return ShardIndex(self.shard_number, self.shard_count) + @override def __repr__(self): return self.__str__() @@ -249,7 +268,6 @@ class TenantShardId: return hash(self._tuple()) -# TODO: Replace with `StrEnum` when we upgrade to python 3.11 -class TimelineArchivalState(str, Enum): +class TimelineArchivalState(StrEnum): ARCHIVED = "Archived" UNARCHIVED = "Unarchived" diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 85b6e7a3b8..c0892399bd 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -99,7 +99,7 @@ class PgCompare(ABC): assert row is not None assert len(row) == len(pg_stat.columns) - for col, val in zip(pg_stat.columns, row): + for col, val in zip(pg_stat.columns, row, strict=False): results[f"{pg_stat.table}.{col}"] = int(val) return results diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py index 6354b7f833..33f01f80fb 100644 --- a/test_runner/fixtures/compute_reconfigure.py +++ b/test_runner/fixtures/compute_reconfigure.py @@ -12,7 +12,8 @@ from fixtures.common_types import TenantId from fixtures.log_helper import log if TYPE_CHECKING: - from typing import Any, Callable, Optional + from collections.abc import Callable + from typing import Any class ComputeReconfigure: @@ -20,12 +21,12 @@ class ComputeReconfigure: self.server = server self.control_plane_compute_hook_api = f"http://{server.host}:{server.port}/notify-attach" self.workloads: dict[TenantId, Any] = {} - self.on_notify: Optional[Callable[[Any], None]] = None + self.on_notify: Callable[[Any], None] | None = None def register_workload(self, workload: Any): self.workloads[workload.tenant_id] = workload - def register_on_notify(self, fn: Optional[Callable[[Any], None]]): + def register_on_notify(self, fn: Callable[[Any], None] | None): """ Add some extra work during a notification, like sleeping to slow things down, or logging what was notified. @@ -68,7 +69,7 @@ def compute_reconfigure_listener(make_httpserver: HTTPServer): # This causes the endpoint to query storage controller for its location, which # is redundant since we already have it here, but this avoids extending the # neon_local CLI to take full lists of locations - reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[no-any-return] + reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[misc] return Response(status=200) diff --git a/test_runner/fixtures/h2server.py b/test_runner/fixtures/h2server.py index e890b2bcf1..3e35af3b5b 100644 --- a/test_runner/fixtures/h2server.py +++ b/test_runner/fixtures/h2server.py @@ -31,7 +31,7 @@ from h2.settings import SettingCodes from typing_extensions import override if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any RequestData = collections.namedtuple("RequestData", ["headers", "data"]) @@ -49,7 +49,7 @@ class H2Protocol(asyncio.Protocol): def __init__(self): config = H2Configuration(client_side=False, header_encoding="utf-8") self.conn = H2Connection(config=config) - self.transport: Optional[asyncio.Transport] = None + self.transport: asyncio.Transport | None = None self.stream_data: dict[int, RequestData] = {} self.flow_control_futures: dict[int, asyncio.Future[Any]] = {} @@ -61,7 +61,7 @@ class H2Protocol(asyncio.Protocol): self.transport.write(self.conn.data_to_send()) @override - def connection_lost(self, exc: Optional[Exception]): + def connection_lost(self, exc: Exception | None): for future in self.flow_control_futures.values(): future.cancel() self.flow_control_futures = {} diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 39c8f70a9c..330f007a77 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -1,16 +1,12 @@ from __future__ import annotations from collections import defaultdict -from typing import TYPE_CHECKING from prometheus_client.parser import text_string_to_metric_families from prometheus_client.samples import Sample from fixtures.log_helper import log -if TYPE_CHECKING: - from typing import Optional - class Metrics: metrics: dict[str, list[Sample]] @@ -20,7 +16,7 @@ class Metrics: self.metrics = defaultdict(list) self.name = name - def query_all(self, name: str, filter: Optional[dict[str, str]] = None) -> list[Sample]: + def query_all(self, name: str, filter: dict[str, str] | None = None) -> list[Sample]: filter = filter or {} res: list[Sample] = [] @@ -32,7 +28,7 @@ class Metrics: pass return res - def query_one(self, name: str, filter: Optional[dict[str, str]] = None) -> Sample: + def query_one(self, name: str, filter: dict[str, str] | None = None) -> Sample: res = self.query_all(name, filter or {}) assert len(res) == 1, f"expected single sample for {name} {filter}, found {res}" return res[0] @@ -47,9 +43,7 @@ class MetricsGetter: def get_metrics(self) -> Metrics: raise NotImplementedError() - def get_metric_value( - self, name: str, filter: Optional[dict[str, str]] = None - ) -> Optional[float]: + def get_metric_value(self, name: str, filter: dict[str, str] | None = None) -> float | None: metrics = self.get_metrics() results = metrics.query_all(name, filter=filter) if not results: @@ -59,7 +53,7 @@ class MetricsGetter: return results[0].value def get_metrics_values( - self, names: list[str], filter: Optional[dict[str, str]] = None, absence_ok: bool = False + self, names: list[str], filter: dict[str, str] | None = None, absence_ok: bool = False ) -> dict[str, float]: """ When fetching multiple named metrics, it is more efficient to use this diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py index 9de6681beb..df80f0683c 100644 --- a/test_runner/fixtures/neon_api.py +++ b/test_runner/fixtures/neon_api.py @@ -8,7 +8,7 @@ import requests from fixtures.log_helper import log if TYPE_CHECKING: - from typing import Any, Literal, Optional + from typing import Any, Literal from fixtures.pg_version import PgVersion @@ -40,11 +40,11 @@ class NeonAPI: def create_project( self, - pg_version: Optional[PgVersion] = None, - name: Optional[str] = None, - branch_name: Optional[str] = None, - branch_role_name: Optional[str] = None, - branch_database_name: Optional[str] = None, + pg_version: PgVersion | None = None, + name: str | None = None, + branch_name: str | None = None, + branch_role_name: str | None = None, + branch_database_name: str | None = None, ) -> dict[str, Any]: data: dict[str, Any] = { "project": { @@ -179,8 +179,8 @@ class NeonAPI: def get_connection_uri( self, project_id: str, - branch_id: Optional[str] = None, - endpoint_id: Optional[str] = None, + branch_id: str | None = None, + endpoint_id: str | None = None, database_name: str = "neondb", role_name: str = "neondb_owner", pooled: bool = True, @@ -249,7 +249,7 @@ class NeonAPI: @final class NeonApiEndpoint: - def __init__(self, neon_api: NeonAPI, pg_version: PgVersion, project_id: Optional[str]): + def __init__(self, neon_api: NeonAPI, pg_version: PgVersion, project_id: str | None): self.neon_api = neon_api self.project_id: str self.endpoint_id: str diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index d220ea57a2..a85a191455 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -20,13 +20,9 @@ from fixtures.pg_version import PgVersion if TYPE_CHECKING: from typing import ( Any, - Optional, - TypeVar, cast, ) - T = TypeVar("T") - # Used to be an ABC. abc.ABC removed due to linter without name change. class AbstractNeonCli: @@ -36,7 +32,7 @@ class AbstractNeonCli: Do not use directly, use specific subclasses instead. """ - def __init__(self, extra_env: Optional[dict[str, str]], binpath: Path): + def __init__(self, extra_env: dict[str, str] | None, binpath: Path): self.extra_env = extra_env self.binpath = binpath @@ -45,7 +41,7 @@ class AbstractNeonCli: def raw_cli( self, arguments: list[str], - extra_env_vars: Optional[dict[str, str]] = None, + extra_env_vars: dict[str, str] | None = None, check_return_code=True, timeout=None, ) -> subprocess.CompletedProcess[str]: @@ -173,7 +169,7 @@ class NeonLocalCli(AbstractNeonCli): def __init__( self, - extra_env: Optional[dict[str, str]], + extra_env: dict[str, str] | None, binpath: Path, repo_dir: Path, pg_distrib_dir: Path, @@ -195,10 +191,10 @@ class NeonLocalCli(AbstractNeonCli): tenant_id: TenantId, timeline_id: TimelineId, pg_version: PgVersion, - conf: Optional[dict[str, Any]] = None, - shard_count: Optional[int] = None, - shard_stripe_size: Optional[int] = None, - placement_policy: Optional[str] = None, + conf: dict[str, Any] | None = None, + shard_count: int | None = None, + shard_stripe_size: int | None = None, + placement_policy: str | None = None, set_default: bool = False, ): """ @@ -302,8 +298,8 @@ class NeonLocalCli(AbstractNeonCli): tenant_id: TenantId, timeline_id: TimelineId, new_branch_name, - ancestor_branch_name: Optional[str] = None, - ancestor_start_lsn: Optional[Lsn] = None, + ancestor_branch_name: str | None = None, + ancestor_start_lsn: Lsn | None = None, ): cmd = [ "timeline", @@ -331,8 +327,8 @@ class NeonLocalCli(AbstractNeonCli): base_lsn: Lsn, base_tarfile: Path, pg_version: PgVersion, - end_lsn: Optional[Lsn] = None, - wal_tarfile: Optional[Path] = None, + end_lsn: Lsn | None = None, + wal_tarfile: Path | None = None, ): cmd = [ "timeline", @@ -380,7 +376,7 @@ class NeonLocalCli(AbstractNeonCli): def init( self, init_config: dict[str, Any], - force: Optional[str] = None, + force: str | None = None, ) -> subprocess.CompletedProcess[str]: with tempfile.NamedTemporaryFile(mode="w+") as init_config_tmpfile: init_config_tmpfile.write(toml.dumps(init_config)) @@ -400,9 +396,9 @@ class NeonLocalCli(AbstractNeonCli): def storage_controller_start( self, - timeout_in_seconds: Optional[int] = None, - instance_id: Optional[int] = None, - base_port: Optional[int] = None, + timeout_in_seconds: int | None = None, + instance_id: int | None = None, + base_port: int | None = None, ): cmd = ["storage_controller", "start"] if timeout_in_seconds is not None: @@ -413,7 +409,7 @@ class NeonLocalCli(AbstractNeonCli): cmd.append(f"--base-port={base_port}") return self.raw_cli(cmd) - def storage_controller_stop(self, immediate: bool, instance_id: Optional[int] = None): + def storage_controller_stop(self, immediate: bool, instance_id: int | None = None): cmd = ["storage_controller", "stop"] if immediate: cmd.extend(["-m", "immediate"]) @@ -424,8 +420,8 @@ class NeonLocalCli(AbstractNeonCli): def pageserver_start( self, id: int, - extra_env_vars: Optional[dict[str, str]] = None, - timeout_in_seconds: Optional[int] = None, + extra_env_vars: dict[str, str] | None = None, + timeout_in_seconds: int | None = None, ) -> subprocess.CompletedProcess[str]: start_args = ["pageserver", "start", f"--id={id}"] if timeout_in_seconds is not None: @@ -442,9 +438,9 @@ class NeonLocalCli(AbstractNeonCli): def safekeeper_start( self, id: int, - extra_opts: Optional[list[str]] = None, - extra_env_vars: Optional[dict[str, str]] = None, - timeout_in_seconds: Optional[int] = None, + extra_opts: list[str] | None = None, + extra_env_vars: dict[str, str] | None = None, + timeout_in_seconds: int | None = None, ) -> subprocess.CompletedProcess[str]: if extra_opts is not None: extra_opts = [f"-e={opt}" for opt in extra_opts] @@ -457,7 +453,7 @@ class NeonLocalCli(AbstractNeonCli): ) def safekeeper_stop( - self, id: Optional[int] = None, immediate=False + self, id: int | None = None, immediate=False ) -> subprocess.CompletedProcess[str]: args = ["safekeeper", "stop"] if id is not None: @@ -467,7 +463,7 @@ class NeonLocalCli(AbstractNeonCli): return self.raw_cli(args) def storage_broker_start( - self, timeout_in_seconds: Optional[int] = None + self, timeout_in_seconds: int | None = None ) -> subprocess.CompletedProcess[str]: cmd = ["storage_broker", "start"] if timeout_in_seconds is not None: @@ -485,10 +481,10 @@ class NeonLocalCli(AbstractNeonCli): http_port: int, tenant_id: TenantId, pg_version: PgVersion, - endpoint_id: Optional[str] = None, + endpoint_id: str | None = None, hot_standby: bool = False, - lsn: Optional[Lsn] = None, - pageserver_id: Optional[int] = None, + lsn: Lsn | None = None, + pageserver_id: int | None = None, allow_multiple=False, ) -> subprocess.CompletedProcess[str]: args = [ @@ -523,11 +519,11 @@ class NeonLocalCli(AbstractNeonCli): def endpoint_start( self, endpoint_id: str, - safekeepers: Optional[list[int]] = None, - remote_ext_config: Optional[str] = None, - pageserver_id: Optional[int] = None, + safekeepers: list[int] | None = None, + remote_ext_config: str | None = None, + pageserver_id: int | None = None, allow_multiple=False, - basebackup_request_tries: Optional[int] = None, + basebackup_request_tries: int | None = None, ) -> subprocess.CompletedProcess[str]: args = [ "endpoint", @@ -555,9 +551,9 @@ class NeonLocalCli(AbstractNeonCli): def endpoint_reconfigure( self, endpoint_id: str, - tenant_id: Optional[TenantId] = None, - pageserver_id: Optional[int] = None, - safekeepers: Optional[list[int]] = None, + tenant_id: TenantId | None = None, + pageserver_id: int | None = None, + safekeepers: list[int] | None = None, check_return_code=True, ) -> subprocess.CompletedProcess[str]: args = ["endpoint", "reconfigure", endpoint_id] @@ -574,7 +570,7 @@ class NeonLocalCli(AbstractNeonCli): endpoint_id: str, destroy=False, check_return_code=True, - mode: Optional[str] = None, + mode: str | None = None, ) -> subprocess.CompletedProcess[str]: args = [ "endpoint", diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 205a47a9d5..07d442b4a6 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -17,7 +17,7 @@ from collections.abc import Iterable, Iterator from contextlib import closing, contextmanager from dataclasses import dataclass from datetime import datetime -from enum import Enum +from enum import StrEnum from functools import cached_property from pathlib import Path from types import TracebackType @@ -90,10 +90,12 @@ from fixtures.safekeeper.utils import wait_walreceivers_absent from fixtures.utils import ( ATTACHMENT_NAME_REGEX, COMPONENT_BINARIES, + USE_LFC, allure_add_grafana_links, assert_no_errors, get_dir_size, print_gc_result, + size_to_bytes, subprocess_capture, wait_until, ) @@ -101,13 +103,8 @@ from fixtures.utils import ( from .neon_api import NeonAPI, NeonApiEndpoint if TYPE_CHECKING: - from typing import ( - Any, - Callable, - Optional, - TypeVar, - Union, - ) + from collections.abc import Callable + from typing import Any, Self, TypeVar from fixtures.paths import SnapshotDirLocked @@ -338,10 +335,10 @@ class NeonEnvBuilder: top_output_dir: Path, test_output_dir: Path, combination, - test_overlay_dir: Optional[Path] = None, - pageserver_remote_storage: Optional[RemoteStorage] = None, + test_overlay_dir: Path | None = None, + pageserver_remote_storage: RemoteStorage | None = None, # toml that will be decomposed into `--config-override` flags during `pageserver --init` - pageserver_config_override: Optional[str | Callable[[dict[str, Any]], None]] = None, + pageserver_config_override: str | Callable[[dict[str, Any]], None] | None = None, num_safekeepers: int = 1, num_pageservers: int = 1, # Use non-standard SK ids to check for various parsing bugs @@ -349,16 +346,16 @@ class NeonEnvBuilder: # fsync is disabled by default to make the tests go faster safekeepers_enable_fsync: bool = False, auth_enabled: bool = False, - rust_log_override: Optional[str] = None, + rust_log_override: str | None = None, default_branch_name: str = DEFAULT_BRANCH_NAME, preserve_database_files: bool = False, - initial_tenant: Optional[TenantId] = None, - initial_timeline: Optional[TimelineId] = None, - pageserver_virtual_file_io_engine: Optional[str] = None, - pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]] = None, - safekeeper_extra_opts: Optional[list[str]] = None, - storage_controller_port_override: Optional[int] = None, - pageserver_virtual_file_io_mode: Optional[str] = None, + initial_tenant: TenantId | None = None, + initial_timeline: TimelineId | None = None, + pageserver_virtual_file_io_engine: str | None = None, + pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None = None, + safekeeper_extra_opts: list[str] | None = None, + storage_controller_port_override: int | None = None, + pageserver_virtual_file_io_mode: str | None = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -367,7 +364,7 @@ class NeonEnvBuilder: # Pageserver remote storage self.pageserver_remote_storage = pageserver_remote_storage # Safekeepers remote storage - self.safekeepers_remote_storage: Optional[RemoteStorage] = None + self.safekeepers_remote_storage: RemoteStorage | None = None self.run_id = run_id self.mock_s3_server: MockS3Server = mock_s3_server @@ -378,7 +375,7 @@ class NeonEnvBuilder: self.safekeepers_enable_fsync = safekeepers_enable_fsync self.auth_enabled = auth_enabled self.default_branch_name = default_branch_name - self.env: Optional[NeonEnv] = None + self.env: NeonEnv | None = None self.keep_remote_storage_contents: bool = True self.neon_binpath = neon_binpath self.neon_local_binpath = neon_binpath @@ -391,14 +388,14 @@ class NeonEnvBuilder: self.test_output_dir = test_output_dir self.test_overlay_dir = test_overlay_dir self.overlay_mounts_created_by_us: list[tuple[str, Path]] = [] - self.config_init_force: Optional[str] = None + self.config_init_force: str | None = None self.top_output_dir = top_output_dir - self.control_plane_compute_hook_api: Optional[str] = None - self.storage_controller_config: Optional[dict[Any, Any]] = None + self.control_plane_compute_hook_api: str | None = None + self.storage_controller_config: dict[Any, Any] | None = None - self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine + self.pageserver_virtual_file_io_engine: str | None = pageserver_virtual_file_io_engine - self.pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]] = ( + self.pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None = ( pageserver_default_tenant_config_compaction_algorithm ) if self.pageserver_default_tenant_config_compaction_algorithm is not None: @@ -440,10 +437,10 @@ class NeonEnvBuilder: def init_start( self, - initial_tenant_conf: Optional[dict[str, Any]] = None, + initial_tenant_conf: dict[str, Any] | None = None, default_remote_storage_if_missing: bool = True, - initial_tenant_shard_count: Optional[int] = None, - initial_tenant_shard_stripe_size: Optional[int] = None, + initial_tenant_shard_count: int | None = None, + initial_tenant_shard_stripe_size: int | None = None, ) -> NeonEnv: """ Default way to create and start NeonEnv. Also creates the initial_tenant with root initial_timeline. @@ -781,8 +778,8 @@ class NeonEnvBuilder: self, kind: RemoteStorageKind, user: RemoteStorageUser, - bucket_name: Optional[str] = None, - bucket_region: Optional[str] = None, + bucket_name: str | None = None, + bucket_region: str | None = None, ) -> RemoteStorage: ret = kind.configure( self.repo_dir, @@ -840,14 +837,14 @@ class NeonEnvBuilder: if isinstance(x, S3Storage): x.do_cleanup() - def __enter__(self) -> NeonEnvBuilder: + def __enter__(self) -> Self: return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc_value: Optional[BaseException], - traceback: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + traceback: TracebackType | None, ): # Stop all the nodes. if self.env: @@ -1136,7 +1133,7 @@ class NeonEnv: force=config.config_init_force, ) - def start(self, timeout_in_seconds: Optional[int] = None): + def start(self, timeout_in_seconds: int | None = None): # Storage controller starts first, so that pageserver /re-attach calls don't # bounce through retries on startup self.storage_controller.start(timeout_in_seconds=timeout_in_seconds) @@ -1150,21 +1147,19 @@ class NeonEnv: with concurrent.futures.ThreadPoolExecutor( max_workers=2 + len(self.pageservers) + len(self.safekeepers) ) as executor: - futs.append( - executor.submit(lambda: self.broker.start() or None) - ) # The `or None` is for the linter + futs.append(executor.submit(lambda: self.broker.start())) for pageserver in self.pageservers: futs.append( executor.submit( - lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds) + lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds) # type: ignore[misc] ) ) for safekeeper in self.safekeepers: futs.append( executor.submit( - lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds) + lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds) # type: ignore[misc] ) ) @@ -1237,7 +1232,7 @@ class NeonEnv: ), "env.pageserver must only be used with single pageserver NeonEnv" return self.pageservers[0] - def get_pageserver(self, id: Optional[int]) -> NeonPageserver: + def get_pageserver(self, id: int | None) -> NeonPageserver: """ Look up a pageserver by its node ID. @@ -1254,7 +1249,7 @@ class NeonEnv: raise RuntimeError(f"Pageserver with ID {id} not found") - def get_tenant_pageserver(self, tenant_id: Union[TenantId, TenantShardId]): + def get_tenant_pageserver(self, tenant_id: TenantId | TenantShardId): """ Get the NeonPageserver where this tenant shard is currently attached, according to the storage controller. @@ -1316,12 +1311,12 @@ class NeonEnv: def create_tenant( self, - tenant_id: Optional[TenantId] = None, - timeline_id: Optional[TimelineId] = None, - conf: Optional[dict[str, Any]] = None, - shard_count: Optional[int] = None, - shard_stripe_size: Optional[int] = None, - placement_policy: Optional[str] = None, + tenant_id: TenantId | None = None, + timeline_id: TimelineId | None = None, + conf: dict[str, Any] | None = None, + shard_count: int | None = None, + shard_stripe_size: int | None = None, + placement_policy: str | None = None, set_default: bool = False, ) -> tuple[TenantId, TimelineId]: """ @@ -1343,7 +1338,7 @@ class NeonEnv: return tenant_id, timeline_id - def config_tenant(self, tenant_id: Optional[TenantId], conf: dict[str, str]): + def config_tenant(self, tenant_id: TenantId | None, conf: dict[str, str]): """ Update tenant config. """ @@ -1353,10 +1348,10 @@ class NeonEnv: def create_branch( self, new_branch_name: str = DEFAULT_BRANCH_NAME, - tenant_id: Optional[TenantId] = None, - ancestor_branch_name: Optional[str] = None, - ancestor_start_lsn: Optional[Lsn] = None, - new_timeline_id: Optional[TimelineId] = None, + tenant_id: TenantId | None = None, + ancestor_branch_name: str | None = None, + ancestor_start_lsn: Lsn | None = None, + new_timeline_id: TimelineId | None = None, ) -> TimelineId: new_timeline_id = new_timeline_id or TimelineId.generate() tenant_id = tenant_id or self.initial_tenant @@ -1370,8 +1365,8 @@ class NeonEnv: def create_timeline( self, new_branch_name: str, - tenant_id: Optional[TenantId] = None, - timeline_id: Optional[TimelineId] = None, + tenant_id: TenantId | None = None, + timeline_id: TimelineId | None = None, ) -> TimelineId: timeline_id = timeline_id or TimelineId.generate() tenant_id = tenant_id or self.initial_tenant @@ -1396,8 +1391,8 @@ def neon_simple_env( compatibility_pg_distrib_dir: Path, pg_version: PgVersion, pageserver_virtual_file_io_engine: str, - pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]], - pageserver_virtual_file_io_mode: Optional[str], + pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None, + pageserver_virtual_file_io_mode: str | None, ) -> Iterator[NeonEnv]: """ Simple Neon environment, with 1 safekeeper and 1 pageserver. No authentication, no fsync. @@ -1453,9 +1448,9 @@ def neon_env_builder( test_overlay_dir: Path, top_output_dir: Path, pageserver_virtual_file_io_engine: str, - pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]], + pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None, record_property: Callable[[str, object], None], - pageserver_virtual_file_io_mode: Optional[str], + pageserver_virtual_file_io_mode: str | None, ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. @@ -1530,7 +1525,7 @@ class LogUtils: def log_contains( self, pattern: str, offset: None | LogCursor = None - ) -> Optional[tuple[str, LogCursor]]: + ) -> tuple[str, LogCursor] | None: """Check that the log contains a line that matches the given regex""" logfile = self.logfile if not logfile.exists(): @@ -1569,14 +1564,13 @@ class StorageControllerApiException(Exception): # See libs/pageserver_api/src/controller_api.rs # for the rust definitions of the enums below -# TODO: Replace with `StrEnum` when we upgrade to python 3.11 -class PageserverAvailability(str, Enum): +class PageserverAvailability(StrEnum): ACTIVE = "Active" UNAVAILABLE = "Unavailable" OFFLINE = "Offline" -class PageserverSchedulingPolicy(str, Enum): +class PageserverSchedulingPolicy(StrEnum): ACTIVE = "Active" DRAINING = "Draining" FILLING = "Filling" @@ -1584,7 +1578,7 @@ class PageserverSchedulingPolicy(str, Enum): PAUSE_FOR_RESTART = "PauseForRestart" -class StorageControllerLeadershipStatus(str, Enum): +class StorageControllerLeadershipStatus(StrEnum): LEADER = "leader" STEPPED_DOWN = "stepped_down" CANDIDATE = "candidate" @@ -1602,16 +1596,16 @@ class NeonStorageController(MetricsGetter, LogUtils): def start( self, - timeout_in_seconds: Optional[int] = None, - instance_id: Optional[int] = None, - base_port: Optional[int] = None, - ): + timeout_in_seconds: int | None = None, + instance_id: int | None = None, + base_port: int | None = None, + ) -> Self: assert not self.running self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port) self.running = True return self - def stop(self, immediate: bool = False) -> NeonStorageController: + def stop(self, immediate: bool = False) -> Self: if self.running: self.env.neon_cli.storage_controller_stop(immediate) self.running = False @@ -1673,7 +1667,7 @@ class NeonStorageController(MetricsGetter, LogUtils): return resp - def headers(self, scope: Optional[TokenScope]) -> dict[str, str]: + def headers(self, scope: TokenScope | None) -> dict[str, str]: headers = {} if self.auth_enabled and scope is not None: jwt_token = self.env.auth_keys.generate_token(scope=scope) @@ -1711,9 +1705,9 @@ class NeonStorageController(MetricsGetter, LogUtils): def attach_hook_issue( self, - tenant_shard_id: Union[TenantId, TenantShardId], + tenant_shard_id: TenantId | TenantShardId, pageserver_id: int, - generation_override: Optional[int] = None, + generation_override: int | None = None, ) -> int: body = {"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id} if generation_override is not None: @@ -1729,7 +1723,7 @@ class NeonStorageController(MetricsGetter, LogUtils): assert isinstance(gen, int) return gen - def attach_hook_drop(self, tenant_shard_id: Union[TenantId, TenantShardId]): + def attach_hook_drop(self, tenant_shard_id: TenantId | TenantShardId): self.request( "POST", f"{self.api}/debug/v1/attach-hook", @@ -1737,7 +1731,7 @@ class NeonStorageController(MetricsGetter, LogUtils): headers=self.headers(TokenScope.ADMIN), ) - def inspect(self, tenant_shard_id: Union[TenantId, TenantShardId]) -> Optional[tuple[int, int]]: + def inspect(self, tenant_shard_id: TenantId | TenantShardId) -> tuple[int, int] | None: """ :return: 2-tuple of (generation, pageserver id), or None if unknown """ @@ -1857,10 +1851,10 @@ class NeonStorageController(MetricsGetter, LogUtils): def tenant_create( self, tenant_id: TenantId, - shard_count: Optional[int] = None, - shard_stripe_size: Optional[int] = None, - tenant_config: Optional[dict[Any, Any]] = None, - placement_policy: Optional[Union[dict[Any, Any], str]] = None, + shard_count: int | None = None, + shard_stripe_size: int | None = None, + tenant_config: dict[Any, Any] | None = None, + placement_policy: dict[Any, Any] | str | None = None, ): """ Use this rather than pageserver_api() when you need to include shard parameters @@ -1891,6 +1885,20 @@ class NeonStorageController(MetricsGetter, LogUtils): response.raise_for_status() log.info(f"tenant_create success: {response.json()}") + def timeline_create( + self, + tenant_id: TenantId, + body: dict[str, Any], + ): + response = self.request( + "POST", + f"{self.api}/v1/tenant/{tenant_id}/timeline", + json=body, + headers=self.headers(TokenScope.PAGE_SERVER_API), + ) + response.raise_for_status() + log.info(f"timeline_create success: {response.json()}") + def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]: """ :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int} @@ -1941,7 +1949,7 @@ class NeonStorageController(MetricsGetter, LogUtils): return response.json() def tenant_shard_split( - self, tenant_id: TenantId, shard_count: int, shard_stripe_size: Optional[int] = None + self, tenant_id: TenantId, shard_count: int, shard_stripe_size: int | None = None ) -> list[TenantShardId]: response = self.request( "PUT", @@ -2039,8 +2047,8 @@ class NeonStorageController(MetricsGetter, LogUtils): def poll_node_status( self, node_id: int, - desired_availability: Optional[PageserverAvailability], - desired_scheduling_policy: Optional[PageserverSchedulingPolicy], + desired_availability: PageserverAvailability | None, + desired_scheduling_policy: PageserverSchedulingPolicy | None, max_attempts: int, backoff: float, ): @@ -2259,7 +2267,7 @@ class NeonStorageController(MetricsGetter, LogUtils): json=body, ) - def get_safekeeper(self, id: int) -> Optional[dict[str, Any]]: + def get_safekeeper(self, id: int) -> dict[str, Any] | None: try: response = self.request( "GET", @@ -2285,14 +2293,14 @@ class NeonStorageController(MetricsGetter, LogUtils): response.raise_for_status() return [TenantShardId.parse(tid) for tid in response.json()["updated"]] - def __enter__(self) -> NeonStorageController: + def __enter__(self) -> Self: return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc: Optional[BaseException], - tb: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, ): self.stop(immediate=True) @@ -2304,10 +2312,10 @@ class NeonProxiedStorageController(NeonStorageController): def start( self, - timeout_in_seconds: Optional[int] = None, - instance_id: Optional[int] = None, - base_port: Optional[int] = None, - ): + timeout_in_seconds: int | None = None, + instance_id: int | None = None, + base_port: int | None = None, + ) -> Self: assert instance_id is not None and base_port is not None self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port) @@ -2317,7 +2325,7 @@ class NeonProxiedStorageController(NeonStorageController): return self def stop_instance( - self, immediate: bool = False, instance_id: Optional[int] = None + self, immediate: bool = False, instance_id: int | None = None ) -> NeonStorageController: assert instance_id in self.instances if self.instances[instance_id]["running"]: @@ -2327,7 +2335,7 @@ class NeonProxiedStorageController(NeonStorageController): self.running = any(meta["running"] for meta in self.instances.values()) return self - def stop(self, immediate: bool = False) -> NeonStorageController: + def stop(self, immediate: bool = False) -> Self: for iid, details in self.instances.items(): if details["running"]: self.env.neon_cli.storage_controller_stop(immediate, iid) @@ -2346,7 +2354,7 @@ class NeonProxiedStorageController(NeonStorageController): def log_contains( self, pattern: str, offset: None | LogCursor = None - ) -> Optional[tuple[str, LogCursor]]: + ) -> tuple[str, LogCursor] | None: raise NotImplementedError() @@ -2393,8 +2401,8 @@ class NeonPageserver(PgProtocol, LogUtils): def timeline_dir( self, - tenant_shard_id: Union[TenantId, TenantShardId], - timeline_id: Optional[TimelineId] = None, + tenant_shard_id: TenantId | TenantShardId, + timeline_id: TimelineId | None = None, ) -> Path: """Get a timeline directory's path based on the repo directory of the test environment""" if timeline_id is None: @@ -2403,7 +2411,7 @@ class NeonPageserver(PgProtocol, LogUtils): def tenant_dir( self, - tenant_shard_id: Optional[Union[TenantId, TenantShardId]] = None, + tenant_shard_id: TenantId | TenantShardId | None = None, ) -> Path: """Get a tenant directory's path based on the repo directory of the test environment""" if tenant_shard_id is None: @@ -2447,9 +2455,9 @@ class NeonPageserver(PgProtocol, LogUtils): def start( self, - extra_env_vars: Optional[dict[str, str]] = None, - timeout_in_seconds: Optional[int] = None, - ) -> NeonPageserver: + extra_env_vars: dict[str, str] | None = None, + timeout_in_seconds: int | None = None, + ) -> Self: """ Start the page server. `overrides` allows to add some config to this pageserver start. @@ -2484,7 +2492,7 @@ class NeonPageserver(PgProtocol, LogUtils): return self - def stop(self, immediate: bool = False) -> NeonPageserver: + def stop(self, immediate: bool = False) -> Self: """ Stop the page server. Returns self. @@ -2497,7 +2505,7 @@ class NeonPageserver(PgProtocol, LogUtils): def restart( self, immediate: bool = False, - timeout_in_seconds: Optional[int] = None, + timeout_in_seconds: int | None = None, ): """ High level wrapper for restart: restarts the process, and waits for @@ -2532,14 +2540,14 @@ class NeonPageserver(PgProtocol, LogUtils): wait_until(20, 0.5, complete) - def __enter__(self) -> NeonPageserver: + def __enter__(self) -> Self: return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc: Optional[BaseException], - tb: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, ): self.stop(immediate=True) @@ -2548,7 +2556,7 @@ class NeonPageserver(PgProtocol, LogUtils): pytest.skip("pageserver was built without 'testing' feature") def http_client( - self, auth_token: Optional[str] = None, retries: Optional[Retry] = None + self, auth_token: str | None = None, retries: Retry | None = None ) -> PageserverHttpClient: return PageserverHttpClient( port=self.service_port.http, @@ -2585,7 +2593,7 @@ class NeonPageserver(PgProtocol, LogUtils): self, tenant_id: TenantId, config: None | dict[str, Any] = None, - generation: Optional[int] = None, + generation: int | None = None, override_storage_controller_generation: bool = False, ): """ @@ -2619,7 +2627,7 @@ class NeonPageserver(PgProtocol, LogUtils): return client.tenant_location_conf(tenant_id, config, **kwargs) def read_tenant_location_conf( - self, tenant_shard_id: Union[TenantId, TenantShardId] + self, tenant_shard_id: TenantId | TenantShardId ) -> dict[str, Any]: path = self.tenant_dir(tenant_shard_id) / "config-v1" log.info(f"Reading location conf from {path}") @@ -2634,9 +2642,9 @@ class NeonPageserver(PgProtocol, LogUtils): def tenant_create( self, tenant_id: TenantId, - conf: Optional[dict[str, Any]] = None, - auth_token: Optional[str] = None, - generation: Optional[int] = None, + conf: dict[str, Any] | None = None, + auth_token: str | None = None, + generation: int | None = None, ) -> TenantId: if generation is None: generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id) @@ -2656,7 +2664,7 @@ class NeonPageserver(PgProtocol, LogUtils): return tenant_id def list_layers( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId ) -> list[Path]: """ Inspect local storage on a pageserver to discover which layer files are present. @@ -2749,7 +2757,7 @@ class PgBin: if "/" not in str(command[0]): command[0] = str(self.pg_bin_path / command[0]) - def _build_env(self, env_add: Optional[Env]) -> Env: + def _build_env(self, env_add: Env | None) -> Env: if env_add is None: return self.env env = self.env.copy() @@ -2766,8 +2774,8 @@ class PgBin: def run_nonblocking( self, command: list[str], - env: Optional[Env] = None, - cwd: Optional[Union[str, Path]] = None, + env: Env | None = None, + cwd: str | Path | None = None, ) -> subprocess.Popen[Any]: """ Run one of the postgres binaries, not waiting for it to finish @@ -2790,8 +2798,8 @@ class PgBin: def run( self, command: list[str], - env: Optional[Env] = None, - cwd: Optional[Union[str, Path]] = None, + env: Env | None = None, + cwd: str | Path | None = None, ) -> None: """ Run one of the postgres binaries, waiting for it to finish @@ -2813,8 +2821,8 @@ class PgBin: def run_capture( self, command: list[str], - env: Optional[Env] = None, - cwd: Optional[str] = None, + env: Env | None = None, + cwd: str | None = None, with_command_header=True, **popen_kwargs: Any, ) -> str: @@ -2941,7 +2949,7 @@ class VanillaPostgres(PgProtocol): conf_file.write("\n".join(hba) + "\n") conf_file.write(data) - def start(self, log_path: Optional[str] = None): + def start(self, log_path: str | None = None): assert not self.running self.running = True @@ -2960,14 +2968,14 @@ class VanillaPostgres(PgProtocol): """Return size of pgdatadir subdirectory in bytes.""" return get_dir_size(self.pgdatadir / subdir) - def __enter__(self) -> VanillaPostgres: + def __enter__(self) -> Self: return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc: Optional[BaseException], - tb: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, ): if self.running: self.stop() @@ -3009,14 +3017,14 @@ class RemotePostgres(PgProtocol): # See https://www.postgresql.org/docs/14/functions-admin.html#FUNCTIONS-ADMIN-GENFILE raise Exception("cannot get size of a Postgres instance") - def __enter__(self) -> RemotePostgres: + def __enter__(self) -> Self: return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc: Optional[BaseException], - tb: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, ): # do nothing pass @@ -3092,7 +3100,7 @@ class PSQL: self.path = full_path self.database_url = f"postgres://{host}:{port}/main?options=project%3Dgeneric-project-name" - async def run(self, query: Optional[str] = None) -> asyncio.subprocess.Process: + async def run(self, query: str | None = None) -> asyncio.subprocess.Process: run_args = [self.path, "--no-psqlrc", "--quiet", "--tuples-only", self.database_url] if query is not None: run_args += ["--command", query] @@ -3138,7 +3146,7 @@ class NeonProxy(PgProtocol): """All auth backends must inherit from this class""" @property - def default_conn_url(self) -> Optional[str]: + def default_conn_url(self) -> str | None: return None @abc.abstractmethod @@ -3155,7 +3163,7 @@ class NeonProxy(PgProtocol): ] class Console(AuthBackend): - def __init__(self, endpoint: str, fixed_rate_limit: Optional[int] = None): + def __init__(self, endpoint: str, fixed_rate_limit: int | None = None): self.endpoint = endpoint self.fixed_rate_limit = fixed_rate_limit @@ -3183,7 +3191,7 @@ class NeonProxy(PgProtocol): pg_conn_url: str @property - def default_conn_url(self) -> Optional[str]: + def default_conn_url(self) -> str | None: return self.pg_conn_url def extra_args(self) -> list[str]: @@ -3202,8 +3210,8 @@ class NeonProxy(PgProtocol): mgmt_port: int, external_http_port: int, auth_backend: NeonProxy.AuthBackend, - metric_collection_endpoint: Optional[str] = None, - metric_collection_interval: Optional[str] = None, + metric_collection_endpoint: str | None = None, + metric_collection_interval: str | None = None, ): host = "127.0.0.1" domain = "proxy.localtest.me" # resolves to 127.0.0.1 @@ -3221,9 +3229,9 @@ class NeonProxy(PgProtocol): self.metric_collection_endpoint = metric_collection_endpoint self.metric_collection_interval = metric_collection_interval self.http_timeout_seconds = 15 - self._popen: Optional[subprocess.Popen[bytes]] = None + self._popen: subprocess.Popen[bytes] | None = None - def start(self) -> NeonProxy: + def start(self) -> Self: assert self._popen is None # generate key of it doesn't exist @@ -3351,14 +3359,14 @@ class NeonProxy(PgProtocol): log.info(f"SUCCESS, found auth url: {line}") return line - def __enter__(self) -> NeonProxy: + def __enter__(self) -> Self: return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc: Optional[BaseException], - tb: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, ): if self._popen is not None: self._popen.terminate() @@ -3439,9 +3447,9 @@ class NeonAuthBroker: self.mgmt_port = mgmt_port self.auth_backend = auth_backend self.http_timeout_seconds = 15 - self._popen: Optional[subprocess.Popen[bytes]] = None + self._popen: subprocess.Popen[bytes] | None = None - def start(self) -> NeonAuthBroker: + def start(self) -> Self: assert self._popen is None # generate key of it doesn't exist @@ -3510,14 +3518,14 @@ class NeonAuthBroker: request_result = requests.get(f"http://{self.host}:{self.http_port}/metrics") return request_result.text - def __enter__(self) -> NeonAuthBroker: + def __enter__(self) -> Self: return self def __exit__( self, - _exc_type: Optional[type[BaseException]], - _exc_value: Optional[BaseException], - _traceback: Optional[TracebackType], + _exc_type: type[BaseException] | None, + _exc_value: BaseException | None, + _traceback: TracebackType | None, ): if self._popen is not None: self._popen.terminate() @@ -3673,9 +3681,9 @@ class Endpoint(PgProtocol, LogUtils): ): super().__init__(host="localhost", port=pg_port, user="cloud_admin", dbname="postgres") self.env = env - self.branch_name: Optional[str] = None # dubious - self.endpoint_id: Optional[str] = None # dubious, see asserts below - self.pgdata_dir: Optional[Path] = None # Path to computenode PGDATA + self.branch_name: str | None = None # dubious + self.endpoint_id: str | None = None # dubious, see asserts below + self.pgdata_dir: Path | None = None # Path to computenode PGDATA self.tenant_id = tenant_id self.pg_port = pg_port self.http_port = http_port @@ -3692,7 +3700,7 @@ class Endpoint(PgProtocol, LogUtils): self._running = threading.Semaphore(0) def http_client( - self, auth_token: Optional[str] = None, retries: Optional[Retry] = None + self, auth_token: str | None = None, retries: Retry | None = None ) -> EndpointHttpClient: return EndpointHttpClient( port=self.http_port, @@ -3701,13 +3709,13 @@ class Endpoint(PgProtocol, LogUtils): def create( self, branch_name: str, - endpoint_id: Optional[str] = None, + endpoint_id: str | None = None, hot_standby: bool = False, - lsn: Optional[Lsn] = None, - config_lines: Optional[list[str]] = None, - pageserver_id: Optional[int] = None, + lsn: Lsn | None = None, + config_lines: list[str] | None = None, + pageserver_id: int | None = None, allow_multiple: bool = False, - ) -> Endpoint: + ) -> Self: """ Create a new Postgres endpoint. Returns self. @@ -3736,24 +3744,57 @@ class Endpoint(PgProtocol, LogUtils): self.pgdata_dir = self.env.repo_dir / path self.logfile = self.endpoint_path() / "compute.log" - config_lines = config_lines or [] - # set small 'max_replication_write_lag' to enable backpressure # and make tests more stable. config_lines = ["max_replication_write_lag=15MB"] + config_lines + # Delete file cache if it exists (and we're recreating the endpoint) + if USE_LFC: + if (lfc_path := Path(self.lfc_path())).exists(): + lfc_path.unlink() + else: + lfc_path.parent.mkdir(parents=True, exist_ok=True) + for line in config_lines: + if ( + line.find("neon.max_file_cache_size") > -1 + or line.find("neon.file_cache_size_limit") > -1 + ): + m = re.search(r"=\s*(\S+)", line) + assert m is not None, f"malformed config line {line}" + size = m.group(1) + assert size_to_bytes(size) >= size_to_bytes( + "1MB" + ), "LFC size cannot be set less than 1MB" + # shared_buffers = 512kB to make postgres use LFC intensively + # neon.max_file_cache_size and neon.file_cache size limit are + # set to 1MB because small LFC is better for testing (helps to find more problems) + config_lines = [ + "shared_buffers = 512kB", + f"neon.file_cache_path = '{self.lfc_path()}'", + "neon.max_file_cache_size = 1MB", + "neon.file_cache_size_limit = 1MB", + ] + config_lines + else: + for line in config_lines: + assert ( + line.find("neon.max_file_cache_size") == -1 + ), "Setting LFC parameters is not allowed when LFC is disabled" + assert ( + line.find("neon.file_cache_size_limit") == -1 + ), "Setting LFC parameters is not allowed when LFC is disabled" + self.config(config_lines) return self def start( self, - remote_ext_config: Optional[str] = None, - pageserver_id: Optional[int] = None, - safekeepers: Optional[list[int]] = None, + remote_ext_config: str | None = None, + pageserver_id: int | None = None, + safekeepers: list[int] | None = None, allow_multiple: bool = False, - basebackup_request_tries: Optional[int] = None, - ) -> Endpoint: + basebackup_request_tries: int | None = None, + ) -> Self: """ Start the Postgres instance. Returns self. @@ -3775,6 +3816,9 @@ class Endpoint(PgProtocol, LogUtils): basebackup_request_tries=basebackup_request_tries, ) self._running.release(1) + self.log_config_value("shared_buffers") + self.log_config_value("neon.max_file_cache_size") + self.log_config_value("neon.file_cache_size_limit") return self @@ -3800,7 +3844,11 @@ class Endpoint(PgProtocol, LogUtils): """Path to the postgresql.conf in the endpoint directory (not the one in pgdata)""" return self.endpoint_path() / "postgresql.conf" - def config(self, lines: list[str]) -> Endpoint: + def lfc_path(self) -> Path: + """Path to the lfc file""" + return self.endpoint_path() / "file_cache" / "file.cache" + + def config(self, lines: list[str]) -> Self: """ Add lines to postgresql.conf. Lines should be an array of valid postgresql.conf rows. @@ -3828,9 +3876,7 @@ class Endpoint(PgProtocol, LogUtils): def is_running(self): return self._running._value > 0 - def reconfigure( - self, pageserver_id: Optional[int] = None, safekeepers: Optional[list[int]] = None - ): + def reconfigure(self, pageserver_id: int | None = None, safekeepers: list[int] | None = None): assert self.endpoint_id is not None # If `safekeepers` is not None, they are remember them as active and use # in the following commands. @@ -3877,8 +3923,8 @@ class Endpoint(PgProtocol, LogUtils): def stop( self, mode: str = "fast", - sks_wait_walreceiver_gone: Optional[tuple[list[Safekeeper], TimelineId]] = None, - ) -> Endpoint: + sks_wait_walreceiver_gone: tuple[list[Safekeeper], TimelineId] | None = None, + ) -> Self: """ Stop the Postgres instance if it's running. @@ -3912,7 +3958,7 @@ class Endpoint(PgProtocol, LogUtils): return self - def stop_and_destroy(self, mode: str = "immediate") -> Endpoint: + def stop_and_destroy(self, mode: str = "immediate") -> Self: """ Stop the Postgres instance, then destroy the endpoint. Returns self. @@ -3931,15 +3977,15 @@ class Endpoint(PgProtocol, LogUtils): def create_start( self, branch_name: str, - endpoint_id: Optional[str] = None, + endpoint_id: str | None = None, hot_standby: bool = False, - lsn: Optional[Lsn] = None, - config_lines: Optional[list[str]] = None, - remote_ext_config: Optional[str] = None, - pageserver_id: Optional[int] = None, + lsn: Lsn | None = None, + config_lines: list[str] | None = None, + remote_ext_config: str | None = None, + pageserver_id: int | None = None, allow_multiple: bool = False, - basebackup_request_tries: Optional[int] = None, - ) -> Endpoint: + basebackup_request_tries: int | None = None, + ) -> Self: """ Create an endpoint, apply config, and start Postgres. Returns self. @@ -3962,14 +4008,14 @@ class Endpoint(PgProtocol, LogUtils): return self - def __enter__(self) -> Endpoint: + def __enter__(self) -> Self: return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc: Optional[BaseException], - tb: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, ): self.stop() @@ -3980,16 +4026,46 @@ class Endpoint(PgProtocol, LogUtils): assert self.pgdata_dir is not None # please mypy return get_dir_size(self.pgdata_dir / "pg_wal") / 1024 / 1024 - def clear_shared_buffers(self, cursor: Optional[Any] = None): + def clear_buffers(self, cursor: Any | None = None): """ Best-effort way to clear postgres buffers. Pinned buffers will not be 'cleared.' - - Might also clear LFC. + It clears LFC as well by setting neon.file_cache_size_limit to 0 and then returning it to the previous value, + if LFC is enabled """ if cursor is not None: cursor.execute("select clear_buffer_cache()") + if not USE_LFC: + return + cursor.execute("SHOW neon.file_cache_size_limit") + res = cursor.fetchone() + assert res, "Cannot get neon.file_cache_size_limit" + file_cache_size_limit = res[0] + if file_cache_size_limit == 0: + return + cursor.execute("ALTER SYSTEM SET neon.file_cache_size_limit=0") + cursor.execute("SELECT pg_reload_conf()") + cursor.execute(f"ALTER SYSTEM SET neon.file_cache_size_limit='{file_cache_size_limit}'") + cursor.execute("SELECT pg_reload_conf()") else: self.safe_psql("select clear_buffer_cache()") + if not USE_LFC: + return + file_cache_size_limit = self.safe_psql_scalar( + "SHOW neon.file_cache_size_limit", log_query=False + ) + if file_cache_size_limit == 0: + return + self.safe_psql("ALTER SYSTEM SET neon.file_cache_size_limit=0") + self.safe_psql("SELECT pg_reload_conf()") + self.safe_psql(f"ALTER SYSTEM SET neon.file_cache_size_limit='{file_cache_size_limit}'") + self.safe_psql("SELECT pg_reload_conf()") + + def log_config_value(self, param): + """ + Writes the config value param to log + """ + res = self.safe_psql_scalar(f"SHOW {param}", log_query=False) + log.info("%s = %s", param, res) class EndpointFactory: @@ -4003,14 +4079,14 @@ class EndpointFactory: def create_start( self, branch_name: str, - endpoint_id: Optional[str] = None, - tenant_id: Optional[TenantId] = None, - lsn: Optional[Lsn] = None, + endpoint_id: str | None = None, + tenant_id: TenantId | None = None, + lsn: Lsn | None = None, hot_standby: bool = False, - config_lines: Optional[list[str]] = None, - remote_ext_config: Optional[str] = None, - pageserver_id: Optional[int] = None, - basebackup_request_tries: Optional[int] = None, + config_lines: list[str] | None = None, + remote_ext_config: str | None = None, + pageserver_id: int | None = None, + basebackup_request_tries: int | None = None, ) -> Endpoint: ep = Endpoint( self.env, @@ -4035,12 +4111,12 @@ class EndpointFactory: def create( self, branch_name: str, - endpoint_id: Optional[str] = None, - tenant_id: Optional[TenantId] = None, - lsn: Optional[Lsn] = None, + endpoint_id: str | None = None, + tenant_id: TenantId | None = None, + lsn: Lsn | None = None, hot_standby: bool = False, - config_lines: Optional[list[str]] = None, - pageserver_id: Optional[int] = None, + config_lines: list[str] | None = None, + pageserver_id: int | None = None, ) -> Endpoint: ep = Endpoint( self.env, @@ -4063,7 +4139,7 @@ class EndpointFactory: pageserver_id=pageserver_id, ) - def stop_all(self, fail_on_error=True) -> EndpointFactory: + def stop_all(self, fail_on_error=True) -> Self: exception = None for ep in self.endpoints: try: @@ -4078,7 +4154,7 @@ class EndpointFactory: return self def new_replica( - self, origin: Endpoint, endpoint_id: str, config_lines: Optional[list[str]] = None + self, origin: Endpoint, endpoint_id: str, config_lines: list[str] | None = None ): branch_name = origin.branch_name assert origin in self.endpoints @@ -4094,7 +4170,7 @@ class EndpointFactory: ) def new_replica_start( - self, origin: Endpoint, endpoint_id: str, config_lines: Optional[list[str]] = None + self, origin: Endpoint, endpoint_id: str, config_lines: list[str] | None = None ): branch_name = origin.branch_name assert origin in self.endpoints @@ -4132,7 +4208,7 @@ class Safekeeper(LogUtils): port: SafekeeperPort, id: int, running: bool = False, - extra_opts: Optional[list[str]] = None, + extra_opts: list[str] | None = None, ): self.env = env self.port = port @@ -4158,8 +4234,8 @@ class Safekeeper(LogUtils): self.extra_opts = extra_opts def start( - self, extra_opts: Optional[list[str]] = None, timeout_in_seconds: Optional[int] = None - ) -> Safekeeper: + self, extra_opts: list[str] | None = None, timeout_in_seconds: int | None = None + ) -> Self: if extra_opts is None: # Apply either the extra_opts passed in, or the ones from our constructor: we do not merge the two. extra_opts = self.extra_opts @@ -4194,7 +4270,7 @@ class Safekeeper(LogUtils): break # success return self - def stop(self, immediate: bool = False) -> Safekeeper: + def stop(self, immediate: bool = False) -> Self: self.env.neon_cli.safekeeper_stop(self.id, immediate) self.running = False return self @@ -4238,7 +4314,7 @@ class Safekeeper(LogUtils): return res def http_client( - self, auth_token: Optional[str] = None, gen_sk_wide_token: bool = True + self, auth_token: str | None = None, gen_sk_wide_token: bool = True ) -> SafekeeperHttpClient: """ When auth_token is None but gen_sk_wide is True creates safekeeper wide @@ -4371,14 +4447,14 @@ class NeonBroker(LogUtils): def start( self, - timeout_in_seconds: Optional[int] = None, - ): + timeout_in_seconds: int | None = None, + ) -> Self: assert not self.running self.env.neon_cli.storage_broker_start(timeout_in_seconds) self.running = True return self - def stop(self): + def stop(self) -> Self: if self.running: self.env.neon_cli.storage_broker_stop() self.running = False @@ -4394,8 +4470,7 @@ class NeonBroker(LogUtils): assert_no_errors(self.logfile, "storage_controller", []) -# TODO: Replace with `StrEnum` when we upgrade to python 3.11 -class NodeKind(str, Enum): +class NodeKind(StrEnum): PAGESERVER = "pageserver" SAFEKEEPER = "safekeeper" @@ -4406,7 +4481,7 @@ class StorageScrubber: self.log_dir = log_dir def scrubber_cli( - self, args: list[str], timeout, extra_env: Optional[dict[str, str]] = None + self, args: list[str], timeout, extra_env: dict[str, str] | None = None ) -> str: assert isinstance(self.env.pageserver_remote_storage, S3Storage) s3_storage = self.env.pageserver_remote_storage @@ -4469,8 +4544,8 @@ class StorageScrubber: self, post_to_storage_controller: bool = False, node_kind: NodeKind = NodeKind.PAGESERVER, - timeline_lsns: Optional[list[dict[str, Any]]] = None, - extra_env: Optional[dict[str, str]] = None, + timeline_lsns: list[dict[str, Any]] | None = None, + extra_env: dict[str, str] | None = None, ) -> tuple[bool, Any]: """ Returns the health status and the metadata summary. @@ -4504,8 +4579,8 @@ class StorageScrubber: def pageserver_physical_gc( self, min_age_secs: int, - tenant_ids: Optional[list[TenantId]] = None, - mode: Optional[str] = None, + tenant_ids: list[TenantId] | None = None, + mode: str | None = None, ): args = ["pageserver-physical-gc", "--min-age", f"{min_age_secs}s"] @@ -4619,7 +4694,7 @@ def check_restored_datadir_content( test_output_dir: Path, env: NeonEnv, endpoint: Endpoint, - ignored_files: Optional[list[str]] = None, + ignored_files: list[str] | None = None, ): pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) @@ -4721,7 +4796,7 @@ def logical_replication_sync(subscriber: PgProtocol, publisher: PgProtocol) -> L def tenant_get_shards( - env: NeonEnv, tenant_id: TenantId, pageserver_id: Optional[int] = None + env: NeonEnv, tenant_id: TenantId, pageserver_id: int | None = None ) -> list[tuple[TenantShardId, NeonPageserver]]: """ Helper for when you want to talk to one or more pageservers, and the @@ -4784,8 +4859,8 @@ def wait_for_last_flush_lsn( endpoint: Endpoint, tenant: TenantId, timeline: TimelineId, - pageserver_id: Optional[int] = None, - auth_token: Optional[str] = None, + pageserver_id: int | None = None, + auth_token: str | None = None, ) -> Lsn: """Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn.""" @@ -4814,7 +4889,7 @@ def flush_ep_to_pageserver( ep: Endpoint, tenant: TenantId, timeline: TimelineId, - pageserver_id: Optional[int] = None, + pageserver_id: int | None = None, ) -> Lsn: """ Stop endpoint and wait until all committed WAL reaches the pageserver @@ -4857,7 +4932,7 @@ def wait_for_wal_insert_lsn( endpoint: Endpoint, tenant: TenantId, timeline: TimelineId, - pageserver_id: Optional[int] = None, + pageserver_id: int | None = None, ) -> Lsn: """Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn.""" last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_insert_lsn()")[0][0]) @@ -4878,7 +4953,7 @@ def fork_at_current_lsn( endpoint: Endpoint, new_branch_name: str, ancestor_branch_name: str, - tenant_id: Optional[TenantId] = None, + tenant_id: TenantId | None = None, ) -> TimelineId: """ Create new branch at the last LSN of an existing branch. @@ -4951,8 +5026,9 @@ def last_flush_lsn_upload( endpoint: Endpoint, tenant_id: TenantId, timeline_id: TimelineId, - pageserver_id: Optional[int] = None, - auth_token: Optional[str] = None, + pageserver_id: int | None = None, + auth_token: str | None = None, + wait_until_uploaded: bool = True, ) -> Lsn: """ Wait for pageserver to catch to the latest flush LSN of given endpoint, @@ -4966,7 +5042,9 @@ def last_flush_lsn_upload( for tenant_shard_id, pageserver in shards: ps_http = pageserver.http_client(auth_token=auth_token) wait_for_last_record_lsn(ps_http, tenant_shard_id, timeline_id, last_flush_lsn) - ps_http.timeline_checkpoint(tenant_shard_id, timeline_id, wait_until_uploaded=True) + ps_http.timeline_checkpoint( + tenant_shard_id, timeline_id, wait_until_uploaded=wait_until_uploaded + ) return last_flush_lsn @@ -4987,10 +5065,11 @@ def generate_uploads_and_deletions( env: NeonEnv, *, init: bool = True, - tenant_id: Optional[TenantId] = None, - timeline_id: Optional[TimelineId] = None, - data: Optional[str] = None, + tenant_id: TenantId | None = None, + timeline_id: TimelineId | None = None, + data: str | None = None, pageserver: NeonPageserver, + wait_until_uploaded: bool = True, ): """ Using the environment's default tenant + timeline, generate a load pattern @@ -5013,7 +5092,12 @@ def generate_uploads_and_deletions( if init: endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") last_flush_lsn_upload( - env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id + env, + endpoint, + tenant_id, + timeline_id, + pageserver_id=pageserver.id, + wait_until_uploaded=wait_until_uploaded, ) def churn(data): @@ -5036,7 +5120,12 @@ def generate_uploads_and_deletions( # in a state where there are "future layers" in remote storage that will generate deletions # after a restart. last_flush_lsn_upload( - env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id + env, + endpoint, + tenant_id, + timeline_id, + pageserver_id=pageserver.id, + wait_until_uploaded=wait_until_uploaded, ) # Compaction should generate some GC-elegible layers @@ -5052,4 +5141,4 @@ def generate_uploads_and_deletions( # background ingest, no more uploads pending, and therefore no non-determinism # in subsequent actions like pageserver restarts. flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id) - ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) + ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=wait_until_uploaded) diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index d05704c8e0..5059039678 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -25,8 +25,14 @@ def scan_pageserver_log_for_errors( # It's an ERROR or WARN. Is it in the allow-list? for a in allowed_errors: - if re.match(a, line): - break + try: + if re.match(a, line): + break + # We can switch `re.error` with `re.PatternError` after 3.13 + # https://docs.python.org/3/library/re.html#re.PatternError + except re.error: + print(f"Invalid regex: '{a}'", file=sys.stderr) + raise else: errors.append((lineno, line)) return errors diff --git a/test_runner/fixtures/pageserver/common_types.py b/test_runner/fixtures/pageserver/common_types.py index 2319701e0b..0e068db593 100644 --- a/test_runner/fixtures/pageserver/common_types.py +++ b/test_runner/fixtures/pageserver/common_types.py @@ -2,7 +2,7 @@ from __future__ import annotations import re from dataclasses import dataclass -from typing import TYPE_CHECKING, Union +from typing import TYPE_CHECKING from fixtures.common_types import KEY_MAX, KEY_MIN, Key, Lsn @@ -46,7 +46,7 @@ class DeltaLayerName: return ret -LayerName = Union[ImageLayerName, DeltaLayerName] +LayerName = ImageLayerName | DeltaLayerName class InvalidFileName(Exception): diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 01583757fa..4cf3ece396 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -1,24 +1,32 @@ from __future__ import annotations +import dataclasses +import json +import random +import string import time from collections import defaultdict from dataclasses import dataclass from datetime import datetime -from typing import TYPE_CHECKING, Any +from typing import Any import requests from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry -from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineArchivalState, TimelineId +from fixtures.common_types import ( + Id, + Lsn, + TenantId, + TenantShardId, + TimelineArchivalState, + TimelineId, +) from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pg_version import PgVersion from fixtures.utils import Fn -if TYPE_CHECKING: - from typing import Optional, Union - class PageserverApiException(Exception): def __init__(self, message, status_code: int): @@ -27,6 +35,69 @@ class PageserverApiException(Exception): self.status_code = status_code +@dataclass +class ImportPgdataIdemptencyKey: + key: str + + @staticmethod + def random() -> ImportPgdataIdemptencyKey: + return ImportPgdataIdemptencyKey( + "".join(random.choices(string.ascii_letters + string.digits, k=20)) + ) + + +@dataclass +class LocalFs: + path: str + + +@dataclass +class AwsS3: + region: str + bucket: str + key: str + + +@dataclass +class ImportPgdataLocation: + LocalFs: None | LocalFs = None + AwsS3: None | AwsS3 = None + + +@dataclass +class TimelineCreateRequestModeImportPgdata: + location: ImportPgdataLocation + idempotency_key: ImportPgdataIdemptencyKey + + +@dataclass +class TimelineCreateRequestMode: + Branch: None | dict[str, Any] = None + Bootstrap: None | dict[str, Any] = None + ImportPgdata: None | TimelineCreateRequestModeImportPgdata = None + + +@dataclass +class TimelineCreateRequest: + new_timeline_id: TimelineId + mode: TimelineCreateRequestMode + + def to_json(self) -> str: + class EnhancedJSONEncoder(json.JSONEncoder): + def default(self, o): + if dataclasses.is_dataclass(o) and not isinstance(o, type): + return dataclasses.asdict(o) + elif isinstance(o, Id): + return o.id.hex() + return super().default(o) + + # mode is flattened + this = dataclasses.asdict(self) + mode = this.pop("mode") + this.update(mode) + return json.dumps(self, cls=EnhancedJSONEncoder) + + class TimelineCreate406(PageserverApiException): def __init__(self, res: requests.Response): assert res.status_code == 406 @@ -43,7 +114,7 @@ class TimelineCreate409(PageserverApiException): class InMemoryLayerInfo: kind: str lsn_start: str - lsn_end: Optional[str] + lsn_end: str | None @classmethod def from_json(cls, d: dict[str, Any]) -> InMemoryLayerInfo: @@ -60,10 +131,10 @@ class HistoricLayerInfo: layer_file_name: str layer_file_size: int lsn_start: str - lsn_end: Optional[str] + lsn_end: str | None remote: bool # None for image layers, true if pageserver thinks this is an L0 delta layer - l0: Optional[bool] + l0: bool | None visible: bool @classmethod @@ -180,8 +251,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter): self, port: int, is_testing_enabled_or_skip: Fn, - auth_token: Optional[str] = None, - retries: Optional[Retry] = None, + auth_token: str | None = None, + retries: Retry | None = None, ): super().__init__() self.port = port @@ -278,7 +349,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def tenant_attach( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, generation: int, config: None | dict[str, Any] = None, ): @@ -305,7 +376,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): }, ) - def tenant_reset(self, tenant_id: Union[TenantId, TenantShardId], drop_cache: bool): + def tenant_reset(self, tenant_id: TenantId | TenantShardId, drop_cache: bool): params = {} if drop_cache: params["drop_cache"] = "true" @@ -315,10 +386,10 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def tenant_location_conf( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, location_conf: dict[str, Any], flush_ms=None, - lazy: Optional[bool] = None, + lazy: bool | None = None, ): body = location_conf.copy() @@ -346,20 +417,20 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json["tenant_shards"], list) return res_json - def tenant_get_location(self, tenant_id: TenantShardId): + def tenant_get_location(self, tenant_id: TenantId | TenantShardId): res = self.get( f"http://localhost:{self.port}/v1/location_config/{tenant_id}", ) self.verbose_error(res) return res.json() - def tenant_delete(self, tenant_id: Union[TenantId, TenantShardId]): + def tenant_delete(self, tenant_id: TenantId | TenantShardId): res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") self.verbose_error(res) return res def tenant_status( - self, tenant_id: Union[TenantId, TenantShardId], activate: bool = False + self, tenant_id: TenantId | TenantShardId, activate: bool = False ) -> dict[Any, Any]: """ :activate: hint the server not to accelerate activation of this tenant in response @@ -378,17 +449,17 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json - def tenant_config(self, tenant_id: Union[TenantId, TenantShardId]) -> TenantConfig: + def tenant_config(self, tenant_id: TenantId | TenantShardId) -> TenantConfig: res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/config") self.verbose_error(res) return TenantConfig.from_json(res.json()) - def tenant_heatmap_upload(self, tenant_id: Union[TenantId, TenantShardId]): + def tenant_heatmap_upload(self, tenant_id: TenantId | TenantShardId): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/heatmap_upload") self.verbose_error(res) def tenant_secondary_download( - self, tenant_id: Union[TenantId, TenantShardId], wait_ms: Optional[int] = None + self, tenant_id: TenantId | TenantShardId, wait_ms: int | None = None ) -> tuple[int, dict[Any, Any]]: url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download" if wait_ms is not None: @@ -397,13 +468,13 @@ class PageserverHttpClient(requests.Session, MetricsGetter): self.verbose_error(res) return (res.status_code, res.json()) - def tenant_secondary_status(self, tenant_id: Union[TenantId, TenantShardId]): + def tenant_secondary_status(self, tenant_id: TenantId | TenantShardId): url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/status" res = self.get(url) self.verbose_error(res) return res.json() - def set_tenant_config(self, tenant_id: Union[TenantId, TenantShardId], config: dict[str, Any]): + def set_tenant_config(self, tenant_id: TenantId | TenantShardId, config: dict[str, Any]): """ Only use this via storage_controller.pageserver_api(). @@ -420,8 +491,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def patch_tenant_config_client_side( self, tenant_id: TenantId, - inserts: Optional[dict[str, Any]] = None, - removes: Optional[list[str]] = None, + inserts: dict[str, Any] | None = None, + removes: list[str] | None = None, ): """ Only use this via storage_controller.pageserver_api(). @@ -436,11 +507,11 @@ class PageserverHttpClient(requests.Session, MetricsGetter): del current[key] self.set_tenant_config(tenant_id, current) - def tenant_size(self, tenant_id: Union[TenantId, TenantShardId]) -> int: + def tenant_size(self, tenant_id: TenantId | TenantShardId) -> int: return self.tenant_size_and_modelinputs(tenant_id)[0] def tenant_size_and_modelinputs( - self, tenant_id: Union[TenantId, TenantShardId] + self, tenant_id: TenantId | TenantShardId ) -> tuple[int, dict[str, Any]]: """ Returns the tenant size, together with the model inputs as the second tuple item. @@ -456,7 +527,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert isinstance(inputs, dict) return (size, inputs) - def tenant_size_debug(self, tenant_id: Union[TenantId, TenantShardId]) -> str: + def tenant_size_debug(self, tenant_id: TenantId | TenantShardId) -> str: """ Returns the tenant size debug info, as an HTML string """ @@ -468,10 +539,10 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def tenant_time_travel_remote_storage( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timestamp: datetime, done_if_after: datetime, - shard_counts: Optional[list[int]] = None, + shard_counts: list[int] | None = None, ): """ Issues a request to perform time travel operations on the remote storage @@ -490,7 +561,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_list( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, include_non_incremental_logical_size: bool = False, include_timeline_dir_layer_file_size_sum: bool = False, ) -> list[dict[str, Any]]: @@ -510,7 +581,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_and_offloaded_list( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, ) -> TimelinesInfoAndOffloaded: res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline_and_offloaded", @@ -523,11 +594,11 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_create( self, pg_version: PgVersion, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, new_timeline_id: TimelineId, - ancestor_timeline_id: Optional[TimelineId] = None, - ancestor_start_lsn: Optional[Lsn] = None, - existing_initdb_timeline_id: Optional[TimelineId] = None, + ancestor_timeline_id: TimelineId | None = None, + ancestor_start_lsn: Lsn | None = None, + existing_initdb_timeline_id: TimelineId | None = None, **kwargs, ) -> dict[Any, Any]: body: dict[str, Any] = { @@ -558,7 +629,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_detail( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, include_non_incremental_logical_size: bool = False, include_timeline_dir_layer_file_size_sum: bool = False, @@ -584,7 +655,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): return res_json def timeline_delete( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, **kwargs + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, **kwargs ): """ Note that deletion is not instant, it is scheduled and performed mostly in the background. @@ -600,9 +671,9 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_gc( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, - gc_horizon: Optional[int], + gc_horizon: int | None, ) -> dict[str, Any]: """ Unlike most handlers, this will wait for the layers to be actually @@ -624,16 +695,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json - def timeline_block_gc(self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId): + def timeline_block_gc(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId): res = self.post( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/block_gc", ) log.info(f"Got GC request response code: {res.status_code}") self.verbose_error(res) - def timeline_unblock_gc( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId - ): + def timeline_unblock_gc(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId): res = self.post( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/unblock_gc", ) @@ -642,7 +711,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_offload( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, ): self.is_testing_enabled_or_skip() @@ -658,14 +727,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_compact( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, force_repartition=False, force_image_layer_creation=False, force_l0_compaction=False, wait_until_uploaded=False, enhanced_gc_bottom_most_compaction=False, - body: Optional[dict[str, Any]] = None, + body: dict[str, Any] | None = None, ): self.is_testing_enabled_or_skip() query = {} @@ -692,7 +761,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert res_json is None def timeline_preserve_initdb_archive( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId ): log.info( f"Requesting initdb archive preservation for tenant {tenant_id} and timeline {timeline_id}" @@ -704,7 +773,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_archival_config( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, state: TimelineArchivalState, ): @@ -720,7 +789,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_get_lsn_by_timestamp( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, timestamp: datetime, with_lease: bool = False, @@ -739,7 +808,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): return res_json def timeline_lsn_lease( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, lsn: Lsn + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn ): data = { "lsn": str(lsn), @@ -755,7 +824,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): return res_json def timeline_get_timestamp_of_lsn( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, lsn: Lsn + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn ): log.info(f"Requesting time range of lsn {lsn}, tenant {tenant_id}, timeline {timeline_id}") res = self.get( @@ -765,9 +834,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res_json = res.json() return res_json - def timeline_layer_map_info( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId - ): + def timeline_layer_map_info(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId): log.info(f"Requesting layer map info of tenant {tenant_id}, timeline {timeline_id}") res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer", @@ -778,13 +845,13 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_checkpoint( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, force_repartition=False, force_image_layer_creation=False, force_l0_compaction=False, wait_until_uploaded=False, - compact: Optional[bool] = None, + compact: bool | None = None, **kwargs, ): self.is_testing_enabled_or_skip() @@ -801,7 +868,9 @@ class PageserverHttpClient(requests.Session, MetricsGetter): if compact is not None: query["compact"] = "true" if compact else "false" - log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}") + log.info( + f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}, wait_until_uploaded={wait_until_uploaded}" + ) res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint", params=query, @@ -814,7 +883,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_spawn_download_remote_layers( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, max_concurrent_downloads: int, ) -> dict[str, Any]: @@ -833,7 +902,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_poll_download_remote_layers_status( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, spawn_response: dict[str, Any], poll_state=None, @@ -855,7 +924,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_download_remote_layers( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, max_concurrent_downloads: int, errors_ok=False, @@ -905,7 +974,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): timeline_id: TimelineId, file_kind: str, op_kind: str, - ) -> Optional[int]: + ) -> int | None: metrics = [ "pageserver_remote_timeline_client_calls_started_total", "pageserver_remote_timeline_client_calls_finished_total", @@ -929,7 +998,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def layer_map_info( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, ) -> LayerMapInfo: res = self.get( @@ -939,7 +1008,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): return LayerMapInfo.from_json(res.json()) def timeline_layer_scan_disposable_keys( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, layer_name: str ) -> ScanDisposableKeysResponse: res = self.post( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}/scan_disposable_keys", @@ -949,7 +1018,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): return ScanDisposableKeysResponse.from_json(res.json()) def download_layer( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, layer_name: str ): res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}", @@ -958,9 +1027,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert res.status_code == 200 - def download_all_layers( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId - ): + def download_all_layers(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId): info = self.layer_map_info(tenant_id, timeline_id) for layer in info.historic_layers: if not layer.remote: @@ -969,9 +1036,9 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def detach_ancestor( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, - batch_size: Optional[int] = None, + batch_size: int | None = None, **kwargs, ) -> set[TimelineId]: params = {} @@ -987,7 +1054,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): return set(map(TimelineId, json["reparented_timelines"])) def evict_layer( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, layer_name: str ): res = self.delete( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}", @@ -996,7 +1063,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert res.status_code in (200, 304) - def evict_all_layers(self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId): + def evict_all_layers(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId): info = self.layer_map_info(tenant_id, timeline_id) for layer in info.historic_layers: self.evict_layer(tenant_id, timeline_id, layer.layer_file_name) @@ -1009,7 +1076,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): self.verbose_error(res) return res.json() - def tenant_break(self, tenant_id: Union[TenantId, TenantShardId]): + def tenant_break(self, tenant_id: TenantId | TenantShardId): res = self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break") self.verbose_error(res) @@ -1058,7 +1125,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def perf_info( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, ): self.is_testing_enabled_or_skip() diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index 37b4246d40..b6d19af84c 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -13,7 +13,8 @@ from fixtures.neon_fixtures import ( from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind if TYPE_CHECKING: - from typing import Any, Callable + from collections.abc import Callable + from typing import Any def single_timeline( diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index ac7497ee6c..46700e3fe3 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -17,14 +17,14 @@ from fixtures.remote_storage import RemoteStorage, RemoteStorageKind, S3Storage from fixtures.utils import wait_until if TYPE_CHECKING: - from typing import Any, Optional, Union + from typing import Any def assert_tenant_state( pageserver_http: PageserverHttpClient, tenant: TenantId, expected_state: str, - message: Optional[str] = None, + message: str | None = None, ) -> None: tenant_status = pageserver_http.tenant_status(tenant) log.info(f"tenant_status: {tenant_status}") @@ -33,7 +33,7 @@ def assert_tenant_state( def remote_consistent_lsn( pageserver_http: PageserverHttpClient, - tenant: Union[TenantId, TenantShardId], + tenant: TenantId | TenantShardId, timeline: TimelineId, ) -> Lsn: detail = pageserver_http.timeline_detail(tenant, timeline) @@ -51,7 +51,7 @@ def remote_consistent_lsn( def wait_for_upload( pageserver_http: PageserverHttpClient, - tenant: Union[TenantId, TenantShardId], + tenant: TenantId | TenantShardId, timeline: TimelineId, lsn: Lsn, ): @@ -138,7 +138,7 @@ def wait_until_all_tenants_state( def wait_until_timeline_state( pageserver_http: PageserverHttpClient, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, expected_state: str, iterations: int, @@ -188,7 +188,7 @@ def wait_until_tenant_active( def last_record_lsn( pageserver_http_client: PageserverHttpClient, - tenant: Union[TenantId, TenantShardId], + tenant: TenantId | TenantShardId, timeline: TimelineId, ) -> Lsn: detail = pageserver_http_client.timeline_detail(tenant, timeline) @@ -200,7 +200,7 @@ def last_record_lsn( def wait_for_last_record_lsn( pageserver_http: PageserverHttpClient, - tenant: Union[TenantId, TenantShardId], + tenant: TenantId | TenantShardId, timeline: TimelineId, lsn: Lsn, ) -> Lsn: @@ -267,10 +267,10 @@ def wait_for_upload_queue_empty( def wait_timeline_detail_404( pageserver_http: PageserverHttpClient, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, iterations: int, - interval: Optional[float] = None, + interval: float | None = None, ): if interval is None: interval = 0.25 @@ -292,10 +292,10 @@ def wait_timeline_detail_404( def timeline_delete_wait_completed( pageserver_http: PageserverHttpClient, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, iterations: int = 20, - interval: Optional[float] = None, + interval: float | None = None, **delete_args, ) -> None: pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args) @@ -304,9 +304,9 @@ def timeline_delete_wait_completed( # remote_storage must not be None, but that's easier for callers to make mypy happy def assert_prefix_empty( - remote_storage: Optional[RemoteStorage], - prefix: Optional[str] = None, - allowed_postfix: Optional[str] = None, + remote_storage: RemoteStorage | None, + prefix: str | None = None, + allowed_postfix: str | None = None, delimiter: str = "/", ) -> None: assert remote_storage is not None @@ -348,8 +348,8 @@ def assert_prefix_empty( # remote_storage must not be None, but that's easier for callers to make mypy happy def assert_prefix_not_empty( - remote_storage: Optional[RemoteStorage], - prefix: Optional[str] = None, + remote_storage: RemoteStorage | None, + prefix: str | None = None, delimiter: str = "/", ): assert remote_storage is not None @@ -358,7 +358,7 @@ def assert_prefix_not_empty( def list_prefix( - remote: RemoteStorage, prefix: Optional[str] = None, delimiter: str = "/" + remote: RemoteStorage, prefix: str | None = None, delimiter: str = "/" ) -> ListObjectsV2OutputTypeDef: """ Note that this function takes into account prefix_in_bucket. diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index 1131bf090f..f57c0f801f 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -11,7 +11,7 @@ from _pytest.python import Metafunc from fixtures.pg_version import PgVersion if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any """ @@ -20,31 +20,31 @@ Dynamically parametrize tests by different parameters @pytest.fixture(scope="function", autouse=True) -def pg_version() -> Optional[PgVersion]: +def pg_version() -> PgVersion | None: return None @pytest.fixture(scope="function", autouse=True) -def build_type() -> Optional[str]: +def build_type() -> str | None: return None @pytest.fixture(scope="session", autouse=True) -def platform() -> Optional[str]: +def platform() -> str | None: return None @pytest.fixture(scope="function", autouse=True) -def pageserver_virtual_file_io_engine() -> Optional[str]: +def pageserver_virtual_file_io_engine() -> str | None: return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE") @pytest.fixture(scope="function", autouse=True) -def pageserver_virtual_file_io_mode() -> Optional[str]: +def pageserver_virtual_file_io_mode() -> str | None: return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_MODE") -def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[dict[str, Any]]: +def get_pageserver_default_tenant_config_compaction_algorithm() -> dict[str, Any] | None: toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM") if toml_table is None: return None @@ -54,7 +54,7 @@ def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[dict @pytest.fixture(scope="function", autouse=True) -def pageserver_default_tenant_config_compaction_algorithm() -> Optional[dict[str, Any]]: +def pageserver_default_tenant_config_compaction_algorithm() -> dict[str, Any] | None: return get_pageserver_default_tenant_config_compaction_algorithm() @@ -66,6 +66,7 @@ def pytest_generate_tests(metafunc: Metafunc): metafunc.parametrize("build_type", build_types) + pg_versions: list[PgVersion] if (v := os.getenv("DEFAULT_PG_VERSION")) is None: pg_versions = [version for version in PgVersion if version != PgVersion.NOT_SET] else: @@ -115,5 +116,6 @@ def pytest_runtest_makereport(*args, **kwargs): }.get(os.uname().machine, "UNKNOWN") arch = os.getenv("RUNNER_ARCH", uname_m) allure.dynamic.parameter("__arch", arch) + allure.dynamic.parameter("__lfc", os.getenv("USE_LFC") != "false") yield diff --git a/test_runner/fixtures/paths.py b/test_runner/fixtures/paths.py index 60221573eb..1c71abea19 100644 --- a/test_runner/fixtures/paths.py +++ b/test_runner/fixtures/paths.py @@ -18,7 +18,6 @@ from fixtures.utils import allure_attach_from_dir if TYPE_CHECKING: from collections.abc import Iterator - from typing import Optional BASE_DIR = Path(__file__).parents[2] @@ -26,9 +25,7 @@ COMPUTE_CONFIG_DIR = BASE_DIR / "compute" / "etc" DEFAULT_OUTPUT_DIR: str = "test_output" -def get_test_dir( - request: FixtureRequest, top_output_dir: Path, prefix: Optional[str] = None -) -> Path: +def get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str | None = None) -> Path: """Compute the path to a working directory for an individual test.""" test_name = request.node.name test_dir = top_output_dir / f"{prefix or ''}{test_name.replace('/', '-')}" @@ -112,7 +109,7 @@ def compatibility_snapshot_dir() -> Iterator[Path]: @pytest.fixture(scope="session") -def compatibility_neon_binpath() -> Iterator[Optional[Path]]: +def compatibility_neon_binpath() -> Iterator[Path | None]: if os.getenv("REMOTE_ENV"): return comp_binpath = None @@ -133,7 +130,7 @@ def pg_distrib_dir(base_dir: Path) -> Iterator[Path]: @pytest.fixture(scope="session") -def compatibility_pg_distrib_dir() -> Iterator[Optional[Path]]: +def compatibility_pg_distrib_dir() -> Iterator[Path | None]: compat_distrib_dir = None if env_compat_postgres_bin := os.environ.get("COMPATIBILITY_POSTGRES_DISTRIB_DIR"): compat_distrib_dir = Path(env_compat_postgres_bin).resolve() @@ -197,7 +194,7 @@ class FileAndThreadLock: def __init__(self, path: Path): self.path = path self.thread_lock = threading.Lock() - self.fd: Optional[int] = None + self.fd: int | None = None def __enter__(self): self.fd = os.open(self.path, os.O_CREAT | os.O_WRONLY) @@ -208,9 +205,9 @@ class FileAndThreadLock: def __exit__( self, - exc_type: Optional[type[BaseException]], - exc_value: Optional[BaseException], - exc_traceback: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + exc_traceback: TracebackType | None, ): assert self.fd is not None assert self.thread_lock.locked() # ... by us @@ -263,9 +260,9 @@ class SnapshotDir: def __exit__( self, - exc_type: Optional[type[BaseException]], - exc_value: Optional[BaseException], - exc_traceback: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + exc_traceback: TracebackType | None, ): self._lock.__exit__(exc_type, exc_value, exc_traceback) @@ -277,7 +274,7 @@ def shared_snapshot_dir(top_output_dir: Path, ident: str) -> SnapshotDir: @pytest.fixture(scope="function") -def test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Optional[Path]: +def test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Path | None: """ Idempotently create a test's overlayfs mount state directory. If the functionality isn't enabled via env var, returns None. diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py index 798db1e8d9..46423e8c76 100644 --- a/test_runner/fixtures/pg_version.py +++ b/test_runner/fixtures/pg_version.py @@ -1,22 +1,16 @@ from __future__ import annotations -import enum -from typing import TYPE_CHECKING +from enum import StrEnum from typing_extensions import override -if TYPE_CHECKING: - from typing import Optional - - """ This fixture is used to determine which version of Postgres to use for tests. """ # Inherit PgVersion from str rather than int to make it easier to pass as a command-line argument -# TODO: use enum.StrEnum for Python >= 3.11 -class PgVersion(str, enum.Enum): +class PgVersion(StrEnum): V14 = "14" V15 = "15" V16 = "16" @@ -34,7 +28,6 @@ class PgVersion(str, enum.Enum): def __repr__(self) -> str: return f"'{self.value}'" - # Make this explicit for Python 3.11 compatibility, which changes the behavior of enums @override def __str__(self) -> str: return self.value @@ -47,16 +40,18 @@ class PgVersion(str, enum.Enum): @classmethod @override - def _missing_(cls, value: object) -> Optional[PgVersion]: - known_values = {v.value for _, v in cls.__members__.items()} + def _missing_(cls, value: object) -> PgVersion | None: + if not isinstance(value, str): + return None - # Allow passing version as a string with "v" prefix (e.g. "v14") - if isinstance(value, str) and value.lower().startswith("v") and value[1:] in known_values: - return cls(value[1:]) - # Allow passing version as an int (e.g. 15 or 150002, both will be converted to PgVersion.V15) - elif isinstance(value, int) and str(value)[:2] in known_values: - return cls(str(value)[:2]) + known_values = set(cls.__members__.values()) + + # Allow passing version as v-prefixed string (e.g. "v14") + if value.lower().startswith("v") and (v := value[1:]) in known_values: + return cls(v) + + # Allow passing version as an int (i.e. both "15" and "150002" matches PgVersion.V15) + if value.isdigit() and (v := value[:2]) in known_values: + return cls(v) - # Make mypy happy - # See https://github.com/python/mypy/issues/3974 return None diff --git a/test_runner/fixtures/port_distributor.py b/test_runner/fixtures/port_distributor.py index df0eb2a809..6a829a9399 100644 --- a/test_runner/fixtures/port_distributor.py +++ b/test_runner/fixtures/port_distributor.py @@ -3,13 +3,9 @@ from __future__ import annotations import re import socket from contextlib import closing -from typing import TYPE_CHECKING from fixtures.log_helper import log -if TYPE_CHECKING: - from typing import Union - def can_bind(host: str, port: int) -> bool: """ @@ -49,17 +45,19 @@ class PortDistributor: "port range configured for test is exhausted, consider enlarging the range" ) - def replace_with_new_port(self, value: Union[int, str]) -> Union[int, str]: + def replace_with_new_port(self, value: int | str) -> int | str: """ Returns a new port for a port number in a string (like "localhost:1234") or int. Replacements are memorised, so a substitution for the same port is always the same. """ - # TODO: replace with structural pattern matching for Python >= 3.10 - if isinstance(value, int): - return self._replace_port_int(value) - - return self._replace_port_str(value) + match value: + case int(): + return self._replace_port_int(value) + case str(): + return self._replace_port_str(value) + case _: + raise TypeError(f"Unsupported type {type(value)}, should be int | str") def _replace_port_int(self, value: int) -> int: known_port = self.port_map.get(value) diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 7024953661..4e1e8a884f 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -6,8 +6,9 @@ import json import os import re from dataclasses import dataclass +from enum import StrEnum from pathlib import Path -from typing import TYPE_CHECKING, Union +from typing import TYPE_CHECKING import boto3 import toml @@ -20,7 +21,7 @@ from fixtures.log_helper import log from fixtures.pageserver.common_types import IndexPartDump if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any TIMELINE_INDEX_PART_FILE_NAME = "index_part.json" @@ -28,7 +29,7 @@ TENANT_HEATMAP_FILE_NAME = "heatmap-v1.json" @enum.unique -class RemoteStorageUser(str, enum.Enum): +class RemoteStorageUser(StrEnum): """ Instead of using strings for the users, use a more strict enum. """ @@ -77,19 +78,19 @@ class MockS3Server: class LocalFsStorage: root: Path - def tenant_path(self, tenant_id: TenantId) -> Path: + def tenant_path(self, tenant_id: TenantId | TenantShardId) -> Path: return self.root / "tenants" / str(tenant_id) - def timeline_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path: + def timeline_path(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId) -> Path: return self.tenant_path(tenant_id) / "timelines" / str(timeline_id) def timeline_latest_generation( - self, tenant_id: TenantId, timeline_id: TimelineId - ) -> Optional[int]: + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId + ) -> int | None: timeline_files = os.listdir(self.timeline_path(tenant_id, timeline_id)) index_parts = [f for f in timeline_files if f.startswith("index_part")] - def parse_gen(filename: str) -> Optional[int]: + def parse_gen(filename: str) -> int | None: log.info(f"parsing index_part '{filename}'") parts = filename.split("-") if len(parts) == 2: @@ -102,7 +103,7 @@ class LocalFsStorage: raise RuntimeError(f"No index_part found for {tenant_id}/{timeline_id}") return generations[-1] - def index_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path: + def index_path(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId) -> Path: latest_gen = self.timeline_latest_generation(tenant_id, timeline_id) if latest_gen is None: filename = TIMELINE_INDEX_PART_FILE_NAME @@ -116,7 +117,7 @@ class LocalFsStorage: tenant_id: TenantId, timeline_id: TimelineId, local_name: str, - generation: Optional[int] = None, + generation: int | None = None, ): if generation is None: generation = self.timeline_latest_generation(tenant_id, timeline_id) @@ -126,7 +127,7 @@ class LocalFsStorage: filename = f"{local_name}-{generation:08x}" return self.timeline_path(tenant_id, timeline_id) / filename - def index_content(self, tenant_id: TenantId, timeline_id: TimelineId) -> Any: + def index_content(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId) -> Any: with self.index_path(tenant_id, timeline_id).open("r") as f: return json.load(f) @@ -158,17 +159,17 @@ class LocalFsStorage: class S3Storage: bucket_name: str bucket_region: str - access_key: Optional[str] - secret_key: Optional[str] - aws_profile: Optional[str] + access_key: str | None + secret_key: str | None + aws_profile: str | None prefix_in_bucket: str client: S3Client cleanup: bool """Is this MOCK_S3 (false) or REAL_S3 (true)""" real: bool - endpoint: Optional[str] = None + endpoint: str | None = None """formatting deserialized with humantime crate, for example "1s".""" - custom_timeout: Optional[str] = None + custom_timeout: str | None = None def access_env_vars(self) -> dict[str, str]: if self.aws_profile is not None: @@ -266,12 +267,10 @@ class S3Storage: def tenants_path(self) -> str: return f"{self.prefix_in_bucket}/tenants" - def tenant_path(self, tenant_id: Union[TenantShardId, TenantId]) -> str: + def tenant_path(self, tenant_id: TenantShardId | TenantId) -> str: return f"{self.tenants_path()}/{tenant_id}" - def timeline_path( - self, tenant_id: Union[TenantShardId, TenantId], timeline_id: TimelineId - ) -> str: + def timeline_path(self, tenant_id: TenantShardId | TenantId, timeline_id: TimelineId) -> str: return f"{self.tenant_path(tenant_id)}/timelines/{timeline_id}" def get_latest_index_key(self, index_keys: list[str]) -> str: @@ -309,11 +308,11 @@ class S3Storage: assert self.real is False -RemoteStorage = Union[LocalFsStorage, S3Storage] +RemoteStorage = LocalFsStorage | S3Storage @enum.unique -class RemoteStorageKind(str, enum.Enum): +class RemoteStorageKind(StrEnum): LOCAL_FS = "local_fs" MOCK_S3 = "mock_s3" REAL_S3 = "real_s3" @@ -325,8 +324,8 @@ class RemoteStorageKind(str, enum.Enum): run_id: str, test_name: str, user: RemoteStorageUser, - bucket_name: Optional[str] = None, - bucket_region: Optional[str] = None, + bucket_name: str | None = None, + bucket_region: str | None = None, ) -> RemoteStorage: if self == RemoteStorageKind.LOCAL_FS: return LocalFsStorage(LocalFsStorage.component_path(repo_dir, user)) diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 5d9a3bd149..094188c0b5 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -13,7 +13,7 @@ from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.utils import wait_until if TYPE_CHECKING: - from typing import Any, Optional, Union + from typing import Any # Walreceiver as returned by sk's timeline status endpoint. @@ -72,7 +72,7 @@ class TermBumpResponse: class SafekeeperHttpClient(requests.Session, MetricsGetter): HTTPError = requests.HTTPError - def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False): + def __init__(self, port: int, auth_token: str | None = None, is_testing_enabled=False): super().__init__() self.port = port self.auth_token = auth_token @@ -98,7 +98,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): if not self.is_testing_enabled: pytest.skip("safekeeper was built without 'testing' feature") - def configure_failpoints(self, config_strings: Union[tuple[str, str], list[tuple[str, str]]]): + def configure_failpoints(self, config_strings: tuple[str, str] | list[tuple[str, str]]): self.is_testing_enabled_or_skip() if isinstance(config_strings, tuple): @@ -195,7 +195,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json - def debug_dump(self, params: Optional[dict[str, str]] = None) -> dict[str, Any]: + def debug_dump(self, params: dict[str, str] | None = None) -> dict[str, Any]: params = params or {} res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params) res.raise_for_status() @@ -204,7 +204,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): return res_json def debug_dump_timeline( - self, timeline_id: TimelineId, params: Optional[dict[str, str]] = None + self, timeline_id: TimelineId, params: dict[str, str] | None = None ) -> Any: params = params or {} params["timeline_id"] = str(timeline_id) @@ -285,7 +285,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): self, tenant_id: TenantId, timeline_id: TimelineId, - term: Optional[int], + term: int | None, ) -> TermBumpResponse: body = {} if term is not None: diff --git a/test_runner/fixtures/storage_controller_proxy.py b/test_runner/fixtures/storage_controller_proxy.py index c174358ef5..be95a98ff9 100644 --- a/test_runner/fixtures/storage_controller_proxy.py +++ b/test_runner/fixtures/storage_controller_proxy.py @@ -13,14 +13,14 @@ from werkzeug.wrappers.response import Response from fixtures.log_helper import log if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any class StorageControllerProxy: def __init__(self, server: HTTPServer): self.server: HTTPServer = server self.listen: str = f"http://{server.host}:{server.port}" - self.routing_to: Optional[str] = None + self.routing_to: str | None = None def route_to(self, storage_controller_api: str): self.routing_to = storage_controller_api diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 96a651f0db..04e98fe494 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -8,10 +8,10 @@ import subprocess import tarfile import threading import time -from collections.abc import Iterable +from collections.abc import Callable, Iterable from hashlib import sha256 from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, TypeVar +from typing import TYPE_CHECKING, Any, TypeVar from urllib.parse import urlencode import allure @@ -29,7 +29,7 @@ from fixtures.pg_version import PgVersion if TYPE_CHECKING: from collections.abc import Iterable - from typing import IO, Optional + from typing import IO from fixtures.common_types import TimelineId from fixtures.neon_fixtures import PgBin @@ -57,6 +57,10 @@ VERSIONS_COMBINATIONS = ( ) # fmt: on +# If the environment variable USE_LFC is set and its value is "false", then LFC is disabled for tests. +# If it is not set or set to a value not equal to "false", LFC is enabled by default. +USE_LFC = os.environ.get("USE_LFC") != "false" + def subprocess_capture( capture_dir: Path, @@ -66,10 +70,10 @@ def subprocess_capture( echo_stderr: bool = False, echo_stdout: bool = False, capture_stdout: bool = False, - timeout: Optional[float] = None, + timeout: float | None = None, with_command_header: bool = True, **popen_kwargs: Any, -) -> tuple[str, Optional[str], int]: +) -> tuple[str, str | None, int]: """Run a process and bifurcate its output to files and the `log` logger stderr and stdout are always captured in files. They are also optionally @@ -495,8 +499,14 @@ def scan_log_for_errors(input: Iterable[str], allowed_errors: list[str]) -> list # It's an ERROR or WARN. Is it in the allow-list? for a in allowed_errors: - if re.match(a, line): - break + try: + if re.match(a, line): + break + # We can switch `re.error` with `re.PatternError` after 3.13 + # https://docs.python.org/3/library/re.html#re.PatternError + except re.error: + log.error(f"Invalid regex: '{a}'") + raise else: errors.append((lineno, line)) return errors @@ -530,7 +540,7 @@ def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: set[str """ started_at = time.time() - def hash_extracted(reader: Optional[IO[bytes]]) -> bytes: + def hash_extracted(reader: IO[bytes] | None) -> bytes: assert reader is not None digest = sha256(usedforsecurity=False) while True: @@ -557,7 +567,7 @@ def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: set[str mismatching: set[str] = set() - for left_tuple, right_tuple in zip(left_list, right_list): + for left_tuple, right_tuple in zip(left_list, right_list, strict=False): left_path, left_hash = left_tuple right_path, right_hash = right_tuple assert ( @@ -589,7 +599,7 @@ class PropagatingThread(threading.Thread): self.exc = e @override - def join(self, timeout: Optional[float] = None) -> Any: + def join(self, timeout: float | None = None) -> Any: super().join(timeout) if self.exc: raise self.exc @@ -647,6 +657,23 @@ def allpairs_versions(): return {"argnames": "combination", "argvalues": tuple(argvalues), "ids": ids} +def size_to_bytes(hr_size: str) -> int: + """ + Gets human-readable size from postgresql.conf (e.g. 512kB, 10MB) + returns size in bytes + """ + units = {"B": 1, "kB": 1024, "MB": 1024**2, "GB": 1024**3, "TB": 1024**4, "PB": 1024**5} + match = re.search(r"^\'?(\d+)\s*([kMGTP]?B)?\'?$", hr_size) + assert match is not None, f'"{hr_size}" is not a well-formatted human-readable size' + number, unit = match.groups() + + if unit: + amp = units[unit] + else: + amp = 8192 + return int(number) * amp + + def skip_on_postgres(version: PgVersion, reason: str): return pytest.mark.skipif( PgVersion(os.getenv("DEFAULT_PG_VERSION", PgVersion.DEFAULT)) is version, @@ -668,6 +695,13 @@ def run_only_on_default_postgres(reason: str): ) +def run_only_on_postgres(versions: Iterable[PgVersion], reason: str): + return pytest.mark.skipif( + PgVersion(os.getenv("DEFAULT_PG_VERSION", PgVersion.DEFAULT)) not in versions, + reason=reason, + ) + + def skip_in_debug_build(reason: str): return pytest.mark.skipif( os.getenv("BUILD_TYPE", "debug") == "debug", diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index e869c43185..1b8c9fef44 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -15,7 +15,7 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.utils import wait_for_last_record_lsn if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any # neon_local doesn't handle creating/modifying endpoints concurrently, so we use a mutex # to ensure we don't do that: this enables running lots of Workloads in parallel safely. @@ -36,8 +36,8 @@ class Workload: env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId, - branch_name: Optional[str] = None, - endpoint_opts: Optional[dict[str, Any]] = None, + branch_name: str | None = None, + endpoint_opts: dict[str, Any] | None = None, ): self.env = env self.tenant_id = tenant_id @@ -50,10 +50,10 @@ class Workload: self.expect_rows = 0 self.churn_cursor = 0 - self._endpoint: Optional[Endpoint] = None + self._endpoint: Endpoint | None = None self._endpoint_opts = endpoint_opts or {} - def reconfigure(self): + def reconfigure(self) -> None: """ Request the endpoint to reconfigure based on location reported by storage controller """ @@ -61,7 +61,7 @@ class Workload: with ENDPOINT_LOCK: self._endpoint.reconfigure() - def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint: + def endpoint(self, pageserver_id: int | None = None) -> Endpoint: # We may be running alongside other Workloads for different tenants. Full TTID is # obnoxiously long for use here, but a cut-down version is still unique enough for tests. endpoint_id = f"ep-workload-{str(self.tenant_id)[0:4]}-{str(self.timeline_id)[0:4]}" @@ -94,16 +94,17 @@ class Workload: def __del__(self): self.stop() - def init(self, pageserver_id: Optional[int] = None): + def init(self, pageserver_id: int | None = None, allow_recreate=False): endpoint = self.endpoint(pageserver_id) - + if allow_recreate: + endpoint.safe_psql(f"DROP TABLE IF EXISTS {self.table};") endpoint.safe_psql(f"CREATE TABLE {self.table} (id INTEGER PRIMARY KEY, val text);") endpoint.safe_psql("CREATE EXTENSION IF NOT EXISTS neon_test_utils;") last_flush_lsn_upload( self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id ) - def write_rows(self, n: int, pageserver_id: Optional[int] = None, upload: bool = True): + def write_rows(self, n: int, pageserver_id: int | None = None, upload: bool = True): endpoint = self.endpoint(pageserver_id) start = self.expect_rows end = start + n - 1 @@ -125,7 +126,7 @@ class Workload: return False def churn_rows( - self, n: int, pageserver_id: Optional[int] = None, upload: bool = True, ingest: bool = True + self, n: int, pageserver_id: int | None = None, upload: bool = True, ingest: bool = True ): assert self.expect_rows >= n @@ -190,9 +191,9 @@ class Workload: else: log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}") - def validate(self, pageserver_id: Optional[int] = None): + def validate(self, pageserver_id: int | None = None): endpoint = self.endpoint(pageserver_id) - endpoint.clear_shared_buffers() + endpoint.clear_buffers() result = endpoint.safe_psql(f"SELECT COUNT(*) FROM {self.table}") log.info(f"validate({self.expect_rows}): {result}") diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md index 70d75a6dcf..85096d3770 100644 --- a/test_runner/performance/README.md +++ b/test_runner/performance/README.md @@ -15,6 +15,7 @@ Some handy pytest flags for local development: - `-k` selects a test to run - `--timeout=0` disables our default timeout of 300s (see `setup.cfg`) - `--preserve-database-files` to skip cleanup +- `--out-dir` to produce a JSON with the recorded test metrics # What performance tests do we have and how we run them @@ -36,6 +37,6 @@ All tests run only once. Usually to obtain more consistent performance numbers, ## Results collection -Local test results for main branch, and results of daily performance tests, are stored in a neon project deployed in production environment. There is a Grafana dashboard that visualizes the results. Here is the [dashboard](https://observer.zenith.tech/d/DGKBm9Jnz/perf-test-results?orgId=1). The main problem with it is the unavailability to point at particular commit, though the data for that is available in the database. Needs some tweaking from someone who knows Grafana tricks. +Local test results for main branch, and results of daily performance tests, are stored in a [neon project](https://console.neon.tech/app/projects/withered-sky-69117821) deployed in production environment. There is a Grafana dashboard that visualizes the results. Here is the [dashboard](https://observer.zenith.tech/d/DGKBm9Jnz/perf-test-results?orgId=1). The main problem with it is the unavailability to point at particular commit, though the data for that is available in the database. Needs some tweaking from someone who knows Grafana tricks. There is also an inconsistency in test naming. Test name should be the same across platforms, and results can be differentiated by the platform field. But currently, platform is sometimes included in test name because of the way how parametrization works in pytest. I.e. there is a platform switch in the dashboard with neon-local-ci and neon-staging variants. I.e. some tests under neon-local-ci value for a platform switch are displayed as `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]` and `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]` which is highly confusing. diff --git a/test_runner/performance/pageserver/test_pageserver_getpage_merge.py b/test_runner/performance/pageserver/test_pageserver_getpage_merge.py new file mode 100644 index 0000000000..34cce9900b --- /dev/null +++ b/test_runner/performance/pageserver/test_pageserver_getpage_merge.py @@ -0,0 +1,307 @@ +import dataclasses +import json +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn +from fixtures.utils import humantime_to_ms + +TARGET_RUNTIME = 60 + + +@pytest.mark.skip("See https://github.com/neondatabase/neon/pull/9820#issue-2675856095") +@pytest.mark.parametrize( + "tablesize_mib, batch_timeout, target_runtime, effective_io_concurrency, readhead_buffer_size, name", + [ + # the next 4 cases demonstrate how not-batchable workloads suffer from batching timeout + (50, None, TARGET_RUNTIME, 1, 128, "not batchable no batching"), + (50, "10us", TARGET_RUNTIME, 1, 128, "not batchable 10us timeout"), + (50, "1ms", TARGET_RUNTIME, 1, 128, "not batchable 1ms timeout"), + # the next 4 cases demonstrate how batchable workloads benefit from batching + (50, None, TARGET_RUNTIME, 100, 128, "batchable no batching"), + (50, "10us", TARGET_RUNTIME, 100, 128, "batchable 10us timeout"), + (50, "100us", TARGET_RUNTIME, 100, 128, "batchable 100us timeout"), + (50, "1ms", TARGET_RUNTIME, 100, 128, "batchable 1ms timeout"), + ], +) +def test_getpage_merge_smoke( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + tablesize_mib: int, + batch_timeout: str | None, + target_runtime: int, + effective_io_concurrency: int, + readhead_buffer_size: int, + name: str, +): + """ + Do a bunch of sequential scans and ensure that the pageserver does some merging. + """ + + # + # record perf-related parameters as metrics to simplify processing of results + # + params: dict[str, tuple[float | int, dict[str, Any]]] = {} + + params.update( + { + "tablesize_mib": (tablesize_mib, {"unit": "MiB"}), + "batch_timeout": ( + -1 if batch_timeout is None else 1e3 * humantime_to_ms(batch_timeout), + {"unit": "us"}, + ), + # target_runtime is just a polite ask to the workload to run for this long + "effective_io_concurrency": (effective_io_concurrency, {}), + "readhead_buffer_size": (readhead_buffer_size, {}), + # name is not a metric + } + ) + + log.info("params: %s", params) + + for param, (value, kwargs) in params.items(): + zenbenchmark.record( + param, + metric_value=value, + unit=kwargs.pop("unit", ""), + report=MetricReport.TEST_PARAM, + **kwargs, + ) + + # + # Setup + # + + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + endpoint = env.endpoints.create_start("main") + conn = endpoint.connect() + cur = conn.cursor() + + cur.execute("SET max_parallel_workers_per_gather=0") # disable parallel backends + cur.execute(f"SET effective_io_concurrency={effective_io_concurrency}") + cur.execute( + f"SET neon.readahead_buffer_size={readhead_buffer_size}" + ) # this is the current default value, but let's hard-code that + + cur.execute("CREATE EXTENSION IF NOT EXISTS neon;") + cur.execute("CREATE EXTENSION IF NOT EXISTS neon_test_utils;") + + log.info("Filling the table") + cur.execute("CREATE TABLE t (data char(1000)) with (fillfactor=10)") + tablesize = tablesize_mib * 1024 * 1024 + npages = tablesize // (8 * 1024) + cur.execute("INSERT INTO t SELECT generate_series(1, %s)", (npages,)) + # TODO: can we force postgres to do sequential scans? + + # + # Run the workload, collect `Metrics` before and after, calculate difference, normalize. + # + + @dataclass + class Metrics: + time: float + pageserver_getpage_count: float + pageserver_vectored_get_count: float + compute_getpage_count: float + pageserver_cpu_seconds_total: float + + def __sub__(self, other: "Metrics") -> "Metrics": + return Metrics( + time=self.time - other.time, + pageserver_getpage_count=self.pageserver_getpage_count + - other.pageserver_getpage_count, + pageserver_vectored_get_count=self.pageserver_vectored_get_count + - other.pageserver_vectored_get_count, + compute_getpage_count=self.compute_getpage_count - other.compute_getpage_count, + pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total + - other.pageserver_cpu_seconds_total, + ) + + def normalize(self, by) -> "Metrics": + return Metrics( + time=self.time / by, + pageserver_getpage_count=self.pageserver_getpage_count / by, + pageserver_vectored_get_count=self.pageserver_vectored_get_count / by, + compute_getpage_count=self.compute_getpage_count / by, + pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total / by, + ) + + def get_metrics() -> Metrics: + with conn.cursor() as cur: + cur.execute( + "select value from neon_perf_counters where metric='getpage_wait_seconds_count';" + ) + compute_getpage_count = cur.fetchall()[0][0] + pageserver_metrics = ps_http.get_metrics() + return Metrics( + time=time.time(), + pageserver_getpage_count=pageserver_metrics.query_one( + "pageserver_smgr_query_seconds_count", {"smgr_query_type": "get_page_at_lsn"} + ).value, + pageserver_vectored_get_count=pageserver_metrics.query_one( + "pageserver_get_vectored_seconds_count", {"task_kind": "PageRequestHandler"} + ).value, + compute_getpage_count=compute_getpage_count, + pageserver_cpu_seconds_total=pageserver_metrics.query_one( + "libmetrics_process_cpu_seconds_highres" + ).value, + ) + + def workload() -> Metrics: + start = time.time() + iters = 0 + while time.time() - start < target_runtime or iters < 2: + log.info("Seqscan %d", iters) + if iters == 1: + # round zero for warming up + before = get_metrics() + cur.execute( + "select clear_buffer_cache()" + ) # TODO: what about LFC? doesn't matter right now because LFC isn't enabled by default in tests + cur.execute("select sum(data::bigint) from t") + assert cur.fetchall()[0][0] == npages * (npages + 1) // 2 + iters += 1 + after = get_metrics() + return (after - before).normalize(iters - 1) + + env.pageserver.patch_config_toml_nonrecursive({"server_side_batch_timeout": batch_timeout}) + env.pageserver.restart() + metrics = workload() + + log.info("Results: %s", metrics) + + # + # Sanity-checks on the collected data + # + # assert that getpage counts roughly match between compute and ps + assert metrics.pageserver_getpage_count == pytest.approx( + metrics.compute_getpage_count, rel=0.01 + ) + + # + # Record the results + # + + for metric, value in dataclasses.asdict(metrics).items(): + zenbenchmark.record(f"counters.{metric}", value, unit="", report=MetricReport.TEST_PARAM) + + zenbenchmark.record( + "perfmetric.batching_factor", + metrics.pageserver_getpage_count / metrics.pageserver_vectored_get_count, + unit="", + report=MetricReport.HIGHER_IS_BETTER, + ) + + +@pytest.mark.skip("See https://github.com/neondatabase/neon/pull/9820#issue-2675856095") +@pytest.mark.parametrize( + "batch_timeout", [None, "10us", "20us", "50us", "100us", "200us", "500us", "1ms"] +) +def test_timer_precision( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + batch_timeout: str | None, +): + """ + Determine the batching timeout precision (mean latency) and tail latency impact. + + The baseline is `None`; an ideal batching timeout implementation would increase + the mean latency by exactly `batch_timeout`. + + That is not the case with the current implementation, will be addressed in future changes. + """ + + # + # Setup + # + + def patch_ps_config(ps_config): + ps_config["server_side_batch_timeout"] = batch_timeout + + neon_env_builder.pageserver_config_override = patch_ps_config + + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start("main") + conn = endpoint.connect() + cur = conn.cursor() + + cur.execute("SET max_parallel_workers_per_gather=0") # disable parallel backends + cur.execute("SET effective_io_concurrency=1") + + cur.execute("CREATE EXTENSION IF NOT EXISTS neon;") + cur.execute("CREATE EXTENSION IF NOT EXISTS neon_test_utils;") + + log.info("Filling the table") + cur.execute("CREATE TABLE t (data char(1000)) with (fillfactor=10)") + tablesize = 50 * 1024 * 1024 + npages = tablesize // (8 * 1024) + cur.execute("INSERT INTO t SELECT generate_series(1, %s)", (npages,)) + # TODO: can we force postgres to do sequential scans? + + cur.close() + conn.close() + + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) + + endpoint.stop() + + for sk in env.safekeepers: + sk.stop() + + # + # Run single-threaded pagebench (TODO: dedup with other benchmark code) + # + + env.pageserver.allowed_errors.append( + # https://github.com/neondatabase/neon/issues/6925 + r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*" + ) + + ps_http = env.pageserver.http_client() + + cmd = [ + str(env.neon_binpath / "pagebench"), + "get-page-latest-lsn", + "--mgmt-api-endpoint", + ps_http.base_url, + "--page-service-connstring", + env.pageserver.connstr(password=None), + "--num-clients", + "1", + "--runtime", + "10s", + ] + log.info(f"command: {' '.join(cmd)}") + basepath = pg_bin.run_capture(cmd, with_command_header=False) + results_path = Path(basepath + ".stdout") + log.info(f"Benchmark results at: {results_path}") + + with open(results_path) as f: + results = json.load(f) + log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}") + + total = results["total"] + + metric = "latency_mean" + zenbenchmark.record( + metric, + metric_value=humantime_to_ms(total[metric]), + unit="ms", + report=MetricReport.LOWER_IS_BETTER, + ) + + metric = "latency_percentiles" + for k, v in total[metric].items(): + zenbenchmark.record( + f"{metric}.{k}", + metric_value=humantime_to_ms(v), + unit="ms", + report=MetricReport.LOWER_IS_BETTER, + ) diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py index 227319c425..bcc3db69f0 100644 --- a/test_runner/performance/pageserver/util.py +++ b/test_runner/performance/pageserver/util.py @@ -16,7 +16,8 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.utils import wait_until_all_tenants_state if TYPE_CHECKING: - from typing import Any, Callable, Optional + from collections.abc import Callable + from typing import Any def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int): @@ -46,7 +47,7 @@ def setup_pageserver_with_tenants( name: str, n_tenants: int, setup: Callable[[NeonEnv], tuple[TenantId, TimelineId, dict[str, Any]]], - timeout_in_seconds: Optional[int] = None, + timeout_in_seconds: int | None = None, ) -> NeonEnv: """ Utility function to set up a pageserver with a given number of identical tenants. diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 36090dcad7..680eb62b39 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -56,7 +56,7 @@ def test_bulk_insert(neon_with_baseline: PgCompare): def measure_recovery_time(env: NeonCompare): client = env.env.pageserver.http_client() - pg_version = PgVersion(client.timeline_detail(env.tenant, env.timeline)["pg_version"]) + pg_version = PgVersion(str(client.timeline_detail(env.tenant, env.timeline)["pg_version"])) # Delete the Tenant in the pageserver: this will drop local and remote layers, such that # when we "create" the Tenant again, we will replay the WAL from the beginning. diff --git a/test_runner/performance/test_compaction.py b/test_runner/performance/test_compaction.py index 8868dddf39..0cd1080fa7 100644 --- a/test_runner/performance/test_compaction.py +++ b/test_runner/performance/test_compaction.py @@ -103,6 +103,9 @@ def test_compaction_l0_memory(neon_compare: NeonCompare): cur.execute(f"update tbl{i} set j = {j};") wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + pageserver_http.timeline_checkpoint( + tenant_id, timeline_id, compact=False + ) # ^1: flush all in-memory layers endpoint.stop() # Check we have generated the L0 stack we expected @@ -118,7 +121,9 @@ def test_compaction_l0_memory(neon_compare: NeonCompare): return v * 1024 before = rss_hwm() - pageserver_http.timeline_compact(tenant_id, timeline_id) + pageserver_http.timeline_compact( + tenant_id, timeline_id + ) # ^1: we must ensure during this process no new L0 layers are flushed after = rss_hwm() log.info(f"RSS across compaction: {before} -> {after} (grew {after - before})") @@ -137,7 +142,7 @@ def test_compaction_l0_memory(neon_compare: NeonCompare): # To be fixed in https://github.com/neondatabase/neon/issues/8184, after which # this memory estimate can be revised far downwards to something that doesn't scale # linearly with the layer sizes. - MEMORY_ESTIMATE = (initial_l0s_size - final_l0s_size) * 1.5 + MEMORY_ESTIMATE = (initial_l0s_size - final_l0s_size) * 1.25 # If we find that compaction is using more memory, this may indicate a regression assert compaction_mapped_rss < MEMORY_ESTIMATE diff --git a/test_runner/performance/test_copy.py b/test_runner/performance/test_copy.py index d571fab6b5..0e56fdc96f 100644 --- a/test_runner/performance/test_copy.py +++ b/test_runner/performance/test_copy.py @@ -2,7 +2,7 @@ from __future__ import annotations from contextlib import closing from io import BufferedReader, RawIOBase -from typing import Optional, final +from typing import final from fixtures.compare_fixtures import PgCompare from typing_extensions import override @@ -13,7 +13,7 @@ class CopyTestData(RawIOBase): def __init__(self, rows: int): self.rows = rows self.rownum = 0 - self.linebuf: Optional[bytes] = None + self.linebuf: bytes | None = None self.ptr = 0 @override diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py index d56f6dce09..38b04b9114 100644 --- a/test_runner/performance/test_physical_replication.py +++ b/test_runner/performance/test_physical_replication.py @@ -18,7 +18,7 @@ from fixtures.neon_api import connection_parameters_to_env from fixtures.pg_version import PgVersion if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any from fixtures.benchmark_fixture import NeonBenchmarker from fixtures.neon_api import NeonAPI @@ -247,7 +247,7 @@ def test_replication_start_stop( ], env=master_env, ) - replica_pgbench: list[Optional[subprocess.Popen[Any]]] = [None for _ in range(num_replicas)] + replica_pgbench: list[subprocess.Popen[Any] | None] = [None] * num_replicas # Use the bits of iconfig to tell us which configuration we are on. For example # a iconfig of 2 is 10 in binary, indicating replica 0 is suspended and replica 1 is diff --git a/test_runner/performance/test_sharded_ingest.py b/test_runner/performance/test_sharded_ingest.py index 77e8f2cf17..e965aae5a0 100644 --- a/test_runner/performance/test_sharded_ingest.py +++ b/test_runner/performance/test_sharded_ingest.py @@ -15,16 +15,21 @@ from fixtures.neon_fixtures import ( @pytest.mark.timeout(600) @pytest.mark.parametrize("shard_count", [1, 8, 32]) +@pytest.mark.parametrize("wal_receiver_protocol", ["vanilla", "interpreted"]) def test_sharded_ingest( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, shard_count: int, + wal_receiver_protocol: str, ): """ Benchmarks sharded ingestion throughput, by ingesting a large amount of WAL into a Safekeeper and fanning out to a large number of shards on dedicated Pageservers. Comparing the base case (shard_count=1) to the sharded case indicates the overhead of sharding. """ + neon_env_builder.pageserver_config_override = ( + f"wal_receiver_protocol = '{wal_receiver_protocol}'" + ) ROW_COUNT = 100_000_000 # about 7 GB of WAL @@ -50,7 +55,6 @@ def test_sharded_ingest( # Start the endpoint. endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) start_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) - # Ingest data and measure WAL volume and duration. with closing(endpoint.connect()) as conn: with conn.cursor() as cur: @@ -68,4 +72,48 @@ def test_sharded_ingest( wal_written_mb = round((end_lsn - start_lsn) / (1024 * 1024)) zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM) + total_ingested = 0 + total_records_received = 0 + ingested_by_ps = [] + for pageserver in env.pageservers: + ingested = pageserver.http_client().get_metric_value( + "pageserver_wal_ingest_bytes_received_total" + ) + records_received = pageserver.http_client().get_metric_value( + "pageserver_wal_ingest_records_received_total" + ) + + if ingested is None: + ingested = 0 + + if records_received is None: + records_received = 0 + + ingested_by_ps.append( + ( + pageserver.id, + { + "ingested": ingested, + "records_received": records_received, + }, + ) + ) + + total_ingested += int(ingested) + total_records_received += int(records_received) + + total_ingested_mb = total_ingested / (1024 * 1024) + zenbenchmark.record("wal_ingested", total_ingested_mb, "MB", MetricReport.LOWER_IS_BETTER) + zenbenchmark.record( + "records_received", total_records_received, "records", MetricReport.LOWER_IS_BETTER + ) + + ingested_by_ps.sort(key=lambda x: x[0]) + for _, stats in ingested_by_ps: + for k in stats: + if k != "records_received": + stats[k] /= 1024**2 + + log.info(f"WAL ingested by each pageserver {ingested_by_ps}") + assert tenant_get_shards(env, tenant_id) == shards, "shards moved" diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index dc051483f8..142bd3d669 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -4,7 +4,7 @@ import concurrent.futures import random import time from collections import defaultdict -from enum import Enum +from enum import StrEnum import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineArchivalState, TimelineId @@ -139,7 +139,7 @@ def test_storage_controller_many_tenants( tenant_timelines_count = 100 # These lists are maintained for use with rng.choice - tenants_with_timelines = list(rng.sample(tenants.keys(), tenant_timelines_count)) + tenants_with_timelines = list(rng.sample(list(tenants.keys()), tenant_timelines_count)) tenants_without_timelines = list( tenant_id for tenant_id in tenants if tenant_id not in tenants_with_timelines ) @@ -171,7 +171,7 @@ def test_storage_controller_many_tenants( # start timing on test nodes if we aren't a bit careful. create_concurrency = 16 - class Operation(str, Enum): + class Operation(StrEnum): TIMELINE_OPS = "timeline_ops" SHARD_MIGRATE = "shard_migrate" TENANT_PASSTHROUGH = "tenant_passthrough" diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py index 576a4f0467..c6d795ce4d 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -17,7 +17,8 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, flush_ep_to_pageserver from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix if TYPE_CHECKING: - from typing import Any, Callable + from collections.abc import Callable + from typing import Any @pytest.fixture(params=["vanilla", "neon_off", "neon_on"]) diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 7d19ba3b5d..5744c445f6 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -2,7 +2,6 @@ from __future__ import annotations from collections.abc import Generator from dataclasses import dataclass -from typing import Optional import pytest from fixtures.common_types import TenantId @@ -105,7 +104,7 @@ def test_null_config(negative_env: NegativeTests): @pytest.mark.parametrize("content_type", [None, "application/json"]) -def test_empty_config(positive_env: NeonEnv, content_type: Optional[str]): +def test_empty_config(positive_env: NeonEnv, content_type: str | None): """ When the 'config' body attribute is omitted, the request should be accepted and the tenant should use the default configuration diff --git a/test_runner/regress/test_combocid.py b/test_runner/regress/test_combocid.py index 57d5b2d8b3..2db16d9f64 100644 --- a/test_runner/regress/test_combocid.py +++ b/test_runner/regress/test_combocid.py @@ -5,12 +5,7 @@ from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver def do_combocid_op(neon_env_builder: NeonEnvBuilder, op): env = neon_env_builder.init_start() - endpoint = env.endpoints.create_start( - "main", - config_lines=[ - "shared_buffers='1MB'", - ], - ) + endpoint = env.endpoints.create_start("main") conn = endpoint.connect() cur = conn.cursor() @@ -36,7 +31,7 @@ def do_combocid_op(neon_env_builder: NeonEnvBuilder, op): # Clear the cache, so that we exercise reconstructing the pages # from WAL - endpoint.clear_shared_buffers() + endpoint.clear_buffers() # Check that the cursor opened earlier still works. If the # combocids are not restored correctly, it won't. @@ -65,12 +60,7 @@ def test_combocid_lock(neon_env_builder: NeonEnvBuilder): def test_combocid_multi_insert(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - endpoint = env.endpoints.create_start( - "main", - config_lines=[ - "shared_buffers='1MB'", - ], - ) + endpoint = env.endpoints.create_start("main") conn = endpoint.connect() cur = conn.cursor() @@ -98,7 +88,7 @@ def test_combocid_multi_insert(neon_env_builder: NeonEnvBuilder): cur.execute("delete from t") # Clear the cache, so that we exercise reconstructing the pages # from WAL - endpoint.clear_shared_buffers() + endpoint.clear_buffers() # Check that the cursor opened earlier still works. If the # combocids are not restored correctly, it won't. diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 48950a5a50..79fd256304 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -1,9 +1,8 @@ from __future__ import annotations -import enum import json import time -from typing import TYPE_CHECKING +from enum import StrEnum import pytest from fixtures.log_helper import log @@ -15,10 +14,6 @@ from fixtures.pageserver.http import PageserverApiException from fixtures.utils import skip_in_debug_build, wait_until from fixtures.workload import Workload -if TYPE_CHECKING: - from typing import Optional - - AGGRESIVE_COMPACTION_TENANT_CONF = { # Disable gc and compaction. The test runs compaction manually. "gc_period": "0s", @@ -32,7 +27,8 @@ AGGRESIVE_COMPACTION_TENANT_CONF = { @skip_in_debug_build("only run with release build") -def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("wal_receiver_protocol", ["vanilla", "interpreted"]) +def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder, wal_receiver_protocol: str): """ This is a smoke test that compaction kicks in. The workload repeatedly churns a small number of rows and manually instructs the pageserver to run compaction @@ -43,8 +39,8 @@ def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder): # Effectively disable the page cache to rely only on image layers # to shorten reads. - neon_env_builder.pageserver_config_override = """ -page_cache_size=10 + neon_env_builder.pageserver_config_override = f""" +page_cache_size=10; wal_receiver_protocol='{wal_receiver_protocol}' """ env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF) @@ -172,7 +168,7 @@ LARGE_STRIPES = 32768 def test_sharding_compaction( neon_env_builder: NeonEnvBuilder, stripe_size: int, - shard_count: Optional[int], + shard_count: int | None, gc_compaction: bool, ): """ @@ -277,7 +273,7 @@ def test_sharding_compaction( ) -class CompactionAlgorithm(str, enum.Enum): +class CompactionAlgorithm(StrEnum): LEGACY = "legacy" TIERED = "tiered" diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 96ba3dd5a4..ba7305148f 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -7,7 +7,6 @@ import subprocess import tempfile from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING import fixtures.utils import pytest @@ -28,10 +27,6 @@ from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage from fixtures.workload import Workload -if TYPE_CHECKING: - from typing import Optional - - # # A test suite that help to prevent unintentionally breaking backward or forward compatibility between Neon releases. # - `test_create_snapshot` a script wrapped in a test that creates a data snapshot. @@ -385,7 +380,7 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r def dump_differs( - first: Path, second: Path, output: Path, allowed_diffs: Optional[list[str]] = None + first: Path, second: Path, output: Path, allowed_diffs: list[str] | None = None ) -> bool: """ Runs diff(1) command on two SQL dumps and write the output to the given output file. diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py index c5e3034591..1b15c5f15e 100644 --- a/test_runner/regress/test_compute_metrics.py +++ b/test_runner/regress/test_compute_metrics.py @@ -3,6 +3,7 @@ from __future__ import annotations import enum import os import shutil +from enum import StrEnum from pathlib import Path from typing import TYPE_CHECKING, cast @@ -16,7 +17,7 @@ from fixtures.paths import BASE_DIR, COMPUTE_CONFIG_DIR if TYPE_CHECKING: from types import TracebackType - from typing import Optional, TypedDict, Union + from typing import Self, TypedDict from fixtures.neon_fixtures import NeonEnv from fixtures.pg_version import PgVersion @@ -26,15 +27,15 @@ if TYPE_CHECKING: metric_name: str type: str help: str - key_labels: Optional[list[str]] - values: Optional[list[str]] - query: Optional[str] - query_ref: Optional[str] + key_labels: list[str] | None + values: list[str] | None + query: str | None + query_ref: str | None class Collector(TypedDict): collector_name: str metrics: list[Metric] - queries: Optional[list[Query]] + queries: list[Query] | None class Query(TypedDict): query_name: str @@ -53,12 +54,12 @@ def __import_callback(dir: str, rel: str) -> tuple[str, bytes]: if not rel: raise RuntimeError("Empty filename") - full_path: Optional[str] = None + full_path: str | None = None if os.path.isabs(rel): full_path = rel else: for p in (dir, *JSONNET_PATH): - assert isinstance(p, (str, Path)), "for mypy" + assert isinstance(p, str | Path), "for mypy" full_path = os.path.join(p, rel) assert isinstance(full_path, str), "for mypy" @@ -82,9 +83,9 @@ def __import_callback(dir: str, rel: str) -> tuple[str, bytes]: def jsonnet_evaluate_file( - jsonnet_file: Union[str, Path], - ext_vars: Optional[Union[str, dict[str, str]]] = None, - tla_vars: Optional[Union[str, dict[str, str]]] = None, + jsonnet_file: str | Path, + ext_vars: str | dict[str, str] | None = None, + tla_vars: str | dict[str, str] | None = None, ) -> str: return cast( "str", @@ -102,7 +103,7 @@ def evaluate_collector(jsonnet_file: Path, pg_version: PgVersion) -> str: def evaluate_config( - jsonnet_file: Path, collector_name: str, collector_file: Union[str, Path], connstr: str + jsonnet_file: Path, collector_name: str, collector_file: str | Path, connstr: str ) -> str: return jsonnet_evaluate_file( jsonnet_file, @@ -115,7 +116,7 @@ def evaluate_config( @enum.unique -class SqlExporterProcess(str, enum.Enum): +class SqlExporterProcess(StrEnum): COMPUTE = "compute" AUTOSCALING = "autoscaling" @@ -184,16 +185,16 @@ class SqlExporterRunner: def stop(self) -> None: raise NotImplementedError() - def __enter__(self) -> SqlExporterRunner: + def __enter__(self) -> Self: self.start() return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc: Optional[BaseException], - tb: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, ): self.stop() @@ -241,8 +242,7 @@ if SQL_EXPORTER is None: self.with_volume_mapping(str(config_file), container_config_file, "z") self.with_volume_mapping(str(collector_file), container_collector_file, "z") - @override - def start(self) -> SqlExporterContainer: + def start(self) -> Self: super().start() log.info("Waiting for sql_exporter to be ready") diff --git a/test_runner/regress/test_crafted_wal_end.py b/test_runner/regress/test_crafted_wal_end.py index 23c6fa3a5a..70e71d99cd 100644 --- a/test_runner/regress/test_crafted_wal_end.py +++ b/test_runner/regress/test_crafted_wal_end.py @@ -19,7 +19,14 @@ from fixtures.neon_fixtures import NeonEnvBuilder "wal_record_crossing_segment_followed_by_small_one", ], ) -def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str): +@pytest.mark.parametrize("wal_receiver_protocol", ["vanilla", "interpreted"]) +def test_crafted_wal_end( + neon_env_builder: NeonEnvBuilder, wal_type: str, wal_receiver_protocol: str +): + neon_env_builder.pageserver_config_override = ( + f"wal_receiver_protocol = '{wal_receiver_protocol}'" + ) + env = neon_env_builder.init_start() env.create_branch("test_crafted_wal_end") env.pageserver.allowed_errors.extend( diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py index e517e83e6f..1c5554c379 100644 --- a/test_runner/regress/test_ddl_forwarding.py +++ b/test_runner/regress/test_ddl_forwarding.py @@ -13,7 +13,7 @@ from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any, Self def handle_db(dbs, roles, operation): @@ -91,15 +91,15 @@ class DdlForwardingContext: lambda request: ddl_forward_handler(request, self.dbs, self.roles, self) ) - def __enter__(self): + def __enter__(self) -> Self: self.pg.start() return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc: Optional[BaseException], - tb: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, ): self.pg.stop() diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index c8d3b2ff3e..1807511008 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -5,6 +5,7 @@ import time from collections import Counter from collections.abc import Iterable from dataclasses import dataclass +from enum import StrEnum from typing import TYPE_CHECKING import pytest @@ -80,7 +81,7 @@ def test_min_resident_size_override_handling( @enum.unique -class EvictionOrder(str, enum.Enum): +class EvictionOrder(StrEnum): RELATIVE_ORDER_EQUAL = "relative_equal" RELATIVE_ORDER_SPARE = "relative_spare" diff --git a/test_runner/regress/test_explain_with_lfc_stats.py b/test_runner/regress/test_explain_with_lfc_stats.py index 2128bd93dd..382556fd7e 100644 --- a/test_runner/regress/test_explain_with_lfc_stats.py +++ b/test_runner/regress/test_explain_with_lfc_stats.py @@ -2,10 +2,13 @@ from __future__ import annotations from pathlib import Path +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import USE_LFC +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") def test_explain_with_lfc_stats(neon_simple_env: NeonEnv): env = neon_simple_env @@ -16,8 +19,6 @@ def test_explain_with_lfc_stats(neon_simple_env: NeonEnv): endpoint = env.endpoints.create_start( "main", config_lines=[ - "shared_buffers='1MB'", - f"neon.file_cache_path='{cache_dir}/file.cache'", "neon.max_file_cache_size='128MB'", "neon.file_cache_size_limit='64MB'", ], diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index a906e7a243..0b1ac11c16 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -170,7 +170,7 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool): # re-execute the query, it will make GetPage # requests. This does not clear the last-written LSN cache # so we still remember the LSNs of the pages. - secondary.clear_shared_buffers(cursor=s_cur) + secondary.clear_buffers(cursor=s_cur) if pause_apply: s_cur.execute("SELECT pg_wal_replay_pause()") diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py new file mode 100644 index 0000000000..29229b73c1 --- /dev/null +++ b/test_runner/regress/test_import_pgdata.py @@ -0,0 +1,307 @@ +import json +import re +import time +from enum import Enum + +import psycopg2 +import psycopg2.errors +import pytest +from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, VanillaPostgres +from fixtures.pageserver.http import ( + ImportPgdataIdemptencyKey, + PageserverApiException, +) +from fixtures.pg_version import PgVersion +from fixtures.remote_storage import RemoteStorageKind +from fixtures.utils import run_only_on_postgres +from pytest_httpserver import HTTPServer +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response + +num_rows = 1000 + + +class RelBlockSize(Enum): + ONE_STRIPE_SIZE = 1 + TWO_STRPES_PER_SHARD = 2 + MULTIPLE_RELATION_SEGMENTS = 3 + + +smoke_params = [ + # unsharded (the stripe size needs to be given for rel block size calculations) + *[(None, 1024, s) for s in RelBlockSize], + # many shards, small stripe size to speed up test + *[(8, 1024, s) for s in RelBlockSize], +] + + +@run_only_on_postgres( + [PgVersion.V14, PgVersion.V15, PgVersion.V16], + "newer control file catalog version and struct format isn't supported", +) +@pytest.mark.parametrize("shard_count,stripe_size,rel_block_size", smoke_params) +def test_pgdata_import_smoke( + vanilla_pg: VanillaPostgres, + neon_env_builder: NeonEnvBuilder, + shard_count: int | None, + stripe_size: int, + rel_block_size: RelBlockSize, + make_httpserver: HTTPServer, +): + # + # Setup fake control plane for import progress + # + def handler(request: Request) -> Response: + log.info(f"control plane request: {request.json}") + return Response(json.dumps({}), status=200) + + cplane_mgmt_api_server = make_httpserver + cplane_mgmt_api_server.expect_request(re.compile(".*")).respond_with_handler(handler) + + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + env = neon_env_builder.init_start() + + env.pageserver.patch_config_toml_nonrecursive( + { + "import_pgdata_upcall_api": f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/path/to/mgmt/api" + } + ) + env.pageserver.stop() + env.pageserver.start() + + # + # Put data in vanilla pg + # + + vanilla_pg.start() + vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser") + + log.info("create relblock data") + if rel_block_size == RelBlockSize.ONE_STRIPE_SIZE: + target_relblock_size = stripe_size * 8192 + elif rel_block_size == RelBlockSize.TWO_STRPES_PER_SHARD: + target_relblock_size = (shard_count or 1) * stripe_size * 8192 * 2 + elif rel_block_size == RelBlockSize.MULTIPLE_RELATION_SEGMENTS: + target_relblock_size = int(((2.333 * 1024 * 1024 * 1024) // 8192) * 8192) + else: + raise ValueError + + # fillfactor so we don't need to produce that much data + # 900 byte per row is > 10% => 1 row per page + vanilla_pg.safe_psql("""create table t (data char(900)) with (fillfactor = 10)""") + + nrows = 0 + while True: + relblock_size = vanilla_pg.safe_psql_scalar("select pg_relation_size('t')") + log.info( + f"relblock size: {relblock_size/8192} pages (target: {target_relblock_size//8192}) pages" + ) + if relblock_size >= target_relblock_size: + break + addrows = int((target_relblock_size - relblock_size) // 8192) + assert addrows >= 1, "forward progress" + vanilla_pg.safe_psql(f"insert into t select generate_series({nrows+1}, {nrows + addrows})") + nrows += addrows + expect_nrows = nrows + expect_sum = ( + (nrows) * (nrows + 1) // 2 + ) # https://stackoverflow.com/questions/43901484/sum-of-the-integers-from-1-to-n + + def validate_vanilla_equivalence(ep): + # TODO: would be nicer to just compare pgdump + assert ep.safe_psql("select count(*), sum(data::bigint)::bigint from t") == [ + (expect_nrows, expect_sum) + ] + + validate_vanilla_equivalence(vanilla_pg) + + vanilla_pg.stop() + + # + # We have a Postgres data directory now. + # Make a localfs remote storage that looks like how after `fast_import` ran. + # TODO: actually exercise fast_import here + # TODO: test s3 remote storage + # + importbucket = neon_env_builder.repo_dir / "importbucket" + importbucket.mkdir() + # what cplane writes before scheduling fast_import + specpath = importbucket / "spec.json" + specpath.write_text(json.dumps({"branch_id": "somebranch", "project_id": "someproject"})) + # what fast_import writes + vanilla_pg.pgdatadir.rename(importbucket / "pgdata") + statusdir = importbucket / "status" + statusdir.mkdir() + (statusdir / "pgdata").write_text(json.dumps({"done": True})) + + # + # Do the import + # + + tenant_id = TenantId.generate() + env.storage_controller.tenant_create( + tenant_id, shard_count=shard_count, shard_stripe_size=stripe_size + ) + + timeline_id = TimelineId.generate() + log.info("starting import") + start = time.monotonic() + + idempotency = ImportPgdataIdemptencyKey.random() + log.info(f"idempotency key {idempotency}") + # TODO: teach neon_local CLI about the idempotency & 429 error so we can run inside the loop + # and check for 429 + + import_branch_name = "imported" + env.storage_controller.timeline_create( + tenant_id, + { + "new_timeline_id": str(timeline_id), + "import_pgdata": { + "idempotency_key": str(idempotency), + "location": {"LocalFs": {"path": str(importbucket.absolute())}}, + }, + }, + ) + env.neon_cli.mappings_map_branch(import_branch_name, tenant_id, timeline_id) + + while True: + locations = env.storage_controller.locate(tenant_id) + active_count = 0 + for location in locations: + shard_id = TenantShardId.parse(location["shard_id"]) + ps = env.get_pageserver(location["node_id"]) + try: + detail = ps.http_client().timeline_detail(shard_id, timeline_id) + state = detail["state"] + log.info(f"shard {shard_id} state: {state}") + if state == "Active": + active_count += 1 + except PageserverApiException as e: + if e.status_code == 404: + log.info("not found, import is in progress") + continue + elif e.status_code == 429: + log.info("import is in progress") + continue + else: + raise + + shard_status_file = statusdir / f"shard-{shard_id.shard_index}" + if state == "Active": + shard_status_file_contents = ( + shard_status_file.read_text() + ) # Active state implies import is done + shard_status = json.loads(shard_status_file_contents) + assert shard_status["done"] is True + + if active_count == len(locations): + log.info("all shards are active") + break + time.sleep(1) + + import_duration = time.monotonic() - start + log.info(f"import complete; duration={import_duration:.2f}s") + + # + # Get some timeline details for later. + # + locations = env.storage_controller.locate(tenant_id) + [shard_zero] = [ + loc for loc in locations if TenantShardId.parse(loc["shard_id"]).shard_number == 0 + ] + shard_zero_ps = env.get_pageserver(shard_zero["node_id"]) + shard_zero_http = shard_zero_ps.http_client() + shard_zero_timeline_info = shard_zero_http.timeline_detail(shard_zero["shard_id"], timeline_id) + initdb_lsn = Lsn(shard_zero_timeline_info["initdb_lsn"]) + latest_gc_cutoff_lsn = Lsn(shard_zero_timeline_info["latest_gc_cutoff_lsn"]) + last_record_lsn = Lsn(shard_zero_timeline_info["last_record_lsn"]) + disk_consistent_lsn = Lsn(shard_zero_timeline_info["disk_consistent_lsn"]) + _remote_consistent_lsn = Lsn(shard_zero_timeline_info["remote_consistent_lsn"]) + remote_consistent_lsn_visible = Lsn(shard_zero_timeline_info["remote_consistent_lsn_visible"]) + # assert remote_consistent_lsn_visible == remote_consistent_lsn TODO: this fails initially and after restart, presumably because `UploadQueue::clean.1` is still `None` + assert remote_consistent_lsn_visible == disk_consistent_lsn + assert initdb_lsn == latest_gc_cutoff_lsn + assert disk_consistent_lsn == initdb_lsn + 8 + assert last_record_lsn == disk_consistent_lsn + # TODO: assert these values are the same everywhere + + # + # Validate the resulting remote storage state. + # + + # + # Validate the imported data + # + + ro_endpoint = env.endpoints.create_start( + branch_name=import_branch_name, endpoint_id="ro", tenant_id=tenant_id, lsn=last_record_lsn + ) + + validate_vanilla_equivalence(ro_endpoint) + + # ensure the import survives restarts + ro_endpoint.stop() + env.pageserver.stop(immediate=True) + env.pageserver.start() + ro_endpoint.start() + validate_vanilla_equivalence(ro_endpoint) + + # + # validate the layer files in each shard only have the shard-specific data + # (the implementation would be functional but not efficient without this characteristic) + # + + shards = env.storage_controller.locate(tenant_id) + for shard in shards: + shard_ps = env.get_pageserver(shard["node_id"]) + result = shard_ps.timeline_scan_no_disposable_keys(shard["shard_id"], timeline_id) + assert result.tally.disposable_count == 0 + assert ( + result.tally.not_disposable_count > 0 + ), "sanity check, each shard should have some data" + + # + # validate that we can write + # + rw_endpoint = env.endpoints.create_start( + branch_name=import_branch_name, endpoint_id="rw", tenant_id=tenant_id + ) + rw_endpoint.safe_psql("create table othertable(values text)") + rw_lsn = Lsn(rw_endpoint.safe_psql_scalar("select pg_current_wal_flush_lsn()")) + + # TODO: consider using `class Workload` here + # to do compaction and whatnot? + + # + # validate that we can branch (important use case) + # + + # ... at the tip + _ = env.create_branch( + new_branch_name="br-tip", + ancestor_branch_name=import_branch_name, + tenant_id=tenant_id, + ancestor_start_lsn=rw_lsn, + ) + br_tip_endpoint = env.endpoints.create_start( + branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id + ) + validate_vanilla_equivalence(br_tip_endpoint) + br_tip_endpoint.safe_psql("select * from othertable") + + # ... at the initdb lsn + _ = env.create_branch( + new_branch_name="br-initdb", + ancestor_branch_name=import_branch_name, + tenant_id=tenant_id, + ancestor_start_lsn=initdb_lsn, + ) + br_initdb_endpoint = env.endpoints.create_start( + branch_name="br-initdb", endpoint_id="br-initdb-ro", tenant_id=tenant_id + ) + validate_vanilla_equivalence(br_initdb_endpoint) + with pytest.raises(psycopg2.errors.UndefinedTable): + br_initdb_endpoint.safe_psql("select * from othertable") diff --git a/test_runner/regress/test_ingestion_layer_size.py b/test_runner/regress/test_ingestion_layer_size.py index 2916748925..9c9bc5b519 100644 --- a/test_runner/regress/test_ingestion_layer_size.py +++ b/test_runner/regress/test_ingestion_layer_size.py @@ -2,16 +2,12 @@ from __future__ import annotations from collections.abc import Iterable from dataclasses import dataclass -from typing import TYPE_CHECKING from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn from fixtures.pageserver.http import HistoricLayerInfo, LayerMapInfo from fixtures.utils import human_bytes, skip_in_debug_build -if TYPE_CHECKING: - from typing import Union - @skip_in_debug_build("debug run is unnecessarily slow") def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder): @@ -109,14 +105,12 @@ def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder): @dataclass class Histogram: - buckets: list[Union[int, float]] + buckets: list[int | float] counts: list[int] sums: list[int] -def histogram_historic_layers( - infos: LayerMapInfo, minimum_sizes: list[Union[int, float]] -) -> Histogram: +def histogram_historic_layers(infos: LayerMapInfo, minimum_sizes: list[int | float]) -> Histogram: def log_layer(layer: HistoricLayerInfo) -> HistoricLayerInfo: log.info( f"{layer.layer_file_name} {human_bytes(layer.layer_file_size)} ({layer.layer_file_size} bytes)" @@ -128,7 +122,7 @@ def histogram_historic_layers( return histogram(sizes, minimum_sizes) -def histogram(sizes: Iterable[int], minimum_sizes: list[Union[int, float]]) -> Histogram: +def histogram(sizes: Iterable[int], minimum_sizes: list[int | float]) -> Histogram: assert all(minimum_sizes[i] < minimum_sizes[i + 1] for i in range(len(minimum_sizes) - 1)) buckets = list(enumerate(minimum_sizes)) counts = [0 for _ in buckets] diff --git a/test_runner/regress/test_installed_extensions.py b/test_runner/regress/test_installed_extensions.py index 54ce7c8340..04ccec5875 100644 --- a/test_runner/regress/test_installed_extensions.py +++ b/test_runner/regress/test_installed_extensions.py @@ -99,11 +99,15 @@ def test_installed_extensions(neon_simple_env: NeonEnv): res = client.metrics() info("Metrics: %s", res) m = parse_metrics(res) - neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.2"}) + neon_m = m.query_all( + "compute_installed_extensions", {"extension_name": "neon", "version": "1.2"} + ) assert len(neon_m) == 1 for sample in neon_m: assert sample.value == 2 - neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.3"}) + neon_m = m.query_all( + "compute_installed_extensions", {"extension_name": "neon", "version": "1.3"} + ) assert len(neon_m) == 1 for sample in neon_m: assert sample.value == 1 @@ -116,7 +120,7 @@ def test_installed_extensions(neon_simple_env: NeonEnv): try: res = client.metrics() timeout = -1 - if len(parse_metrics(res).query_all("installed_extensions")) < 4: + if len(parse_metrics(res).query_all("compute_installed_extensions")) < 4: # Assume that not all metrics that are collected yet time.sleep(1) timeout -= 1 @@ -128,17 +132,21 @@ def test_installed_extensions(neon_simple_env: NeonEnv): continue assert ( - len(parse_metrics(res).query_all("installed_extensions")) >= 4 + len(parse_metrics(res).query_all("compute_installed_extensions")) >= 4 ), "Not all metrics are collected" info("After restart metrics: %s", res) m = parse_metrics(res) - neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.2"}) + neon_m = m.query_all( + "compute_installed_extensions", {"extension_name": "neon", "version": "1.2"} + ) assert len(neon_m) == 1 for sample in neon_m: assert sample.value == 1 - neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.3"}) + neon_m = m.query_all( + "compute_installed_extensions", {"extension_name": "neon", "version": "1.3"} + ) assert len(neon_m) == 1 for sample in neon_m: assert sample.value == 1 diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 309e0f3015..761ec7568f 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -2,6 +2,7 @@ from __future__ import annotations import time +import pytest from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver @@ -19,7 +20,11 @@ from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.utils import query_scalar, wait_until -def test_issue_5878(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize( + "attach_mode", + ["default_generation", "same_generation"], +) +def test_issue_5878(neon_env_builder: NeonEnvBuilder, attach_mode: str): """ Regression test for issue https://github.com/neondatabase/neon/issues/5878 . @@ -168,11 +173,32 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): tenant_conf = ps_http.tenant_config(tenant_id) generation_before_detach = get_generation_number() env.pageserver.tenant_detach(tenant_id) - failpoint_name = "before-delete-layer-pausable" + failpoint_deletion_queue = "deletion-queue-before-execute-pause" - ps_http.configure_failpoints((failpoint_name, "pause")) - env.pageserver.tenant_attach(tenant_id, tenant_conf.tenant_specific_overrides) - generation_after_reattach = get_generation_number() + ps_http.configure_failpoints((failpoint_deletion_queue, "pause")) + + if attach_mode == "default_generation": + env.pageserver.tenant_attach(tenant_id, tenant_conf.tenant_specific_overrides) + elif attach_mode == "same_generation": + # Attach with the same generation number -- this is possible with timeline offload and detach ancestor + env.pageserver.tenant_attach( + tenant_id, + tenant_conf.tenant_specific_overrides, + generation=generation_before_detach, + # We want to avoid the generation bump and don't want to talk with the storcon + override_storage_controller_generation=False, + ) + else: + raise AssertionError(f"Unknown attach_mode: {attach_mode}") + + # Get it from pageserver API instead of storcon API b/c we might not have attached using the storcon + # API if attach_mode == "same_generation" + tenant_location = env.pageserver.http_client().tenant_get_location(tenant_id) + generation_after_reattach = tenant_location["generation"] + + if attach_mode == "same_generation": + # The generation number should be the same as before the detach + assert generation_before_detach == generation_after_reattach wait_until_tenant_active(ps_http, tenant_id) # Ensure the IndexPart upload that unlinks the layer file finishes, i.e., doesn't clog the queue. @@ -182,15 +208,8 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): wait_until(10, 0.5, future_layer_is_gone_from_index_part) - # NB: the layer file is unlinked index part now, but, because we made the delete - # operation stuck, the layer file itself is still in the remote_storage - wait_until( - 10, - 0.5, - lambda: env.pageserver.assert_log_contains( - f".*{tenant_id}.*at failpoint.*{failpoint_name}" - ), - ) + # We already make deletion stuck here, but we don't necessarily hit the failpoint + # because deletions are batched. future_layer_path = env.pageserver_remote_storage.remote_layer_path( tenant_id, timeline_id, future_layer.to_str(), generation=generation_before_detach ) @@ -224,11 +243,13 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): break time.sleep(1) - # Window has passed, unstuck the delete, let upload queue drain. + # Window has passed, unstuck the delete, let deletion queue drain; the upload queue should + # have drained because we put these layer deletion operations into the deletion queue and + # have consumed the operation from the upload queue. log.info("unstuck the DELETE") - ps_http.configure_failpoints(("before-delete-layer-pausable", "off")) - + ps_http.configure_failpoints((failpoint_deletion_queue, "off")) wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id) + env.pageserver.http_client().deletion_queue_flush(True) # Examine the resulting S3 state. log.info("integrity-check the remote storage") @@ -247,3 +268,12 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): final_stat = future_layer_path.stat() log.info(f"future layer path: {future_layer_path}") assert final_stat.st_mtime != pre_stat.st_mtime + + # Ensure no weird errors in the end... + wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id) + + if attach_mode == "same_generation": + # we should have detected a race upload and deferred it + env.pageserver.assert_log_contains( + "waiting for deletion queue flush to complete before uploading layer" + ) diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py index 3083128d87..377b0fb4d4 100644 --- a/test_runner/regress/test_lfc_resize.py +++ b/test_runner/regress/test_lfc_resize.py @@ -1,6 +1,5 @@ from __future__ import annotations -import os import random import re import subprocess @@ -10,20 +9,24 @@ import time import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, PgBin +from fixtures.utils import USE_LFC @pytest.mark.timeout(600) +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): """ Test resizing the Local File Cache """ env = neon_simple_env + cache_dir = env.repo_dir / "file_cache" + cache_dir.mkdir(exist_ok=True) + env.create_branch("test_lfc_resize") endpoint = env.endpoints.create_start( "main", config_lines=[ - "neon.file_cache_path='file.cache'", - "neon.max_file_cache_size=512MB", - "neon.file_cache_size_limit=512MB", + "neon.max_file_cache_size=1GB", + "neon.file_cache_size_limit=1GB", ], ) n_resize = 10 @@ -63,8 +66,8 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): cur.execute("select pg_reload_conf()") nretries = 10 while True: - lfc_file_path = f"{endpoint.pg_data_dir_path()}/file.cache" - lfc_file_size = os.path.getsize(lfc_file_path) + lfc_file_path = endpoint.lfc_path() + lfc_file_size = lfc_file_path.stat().st_size res = subprocess.run( ["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True ) diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py index 36dfec969f..17068849d4 100644 --- a/test_runner/regress/test_lfc_working_set_approximation.py +++ b/test_runner/regress/test_lfc_working_set_approximation.py @@ -3,11 +3,13 @@ from __future__ import annotations import time from pathlib import Path +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv -from fixtures.utils import query_scalar +from fixtures.utils import USE_LFC, query_scalar +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") def test_lfc_working_set_approximation(neon_simple_env: NeonEnv): env = neon_simple_env @@ -18,8 +20,6 @@ def test_lfc_working_set_approximation(neon_simple_env: NeonEnv): endpoint = env.endpoints.create_start( "main", config_lines=[ - "shared_buffers='1MB'", - f"neon.file_cache_path='{cache_dir}/file.cache'", "neon.max_file_cache_size='128MB'", "neon.file_cache_size_limit='64MB'", ], @@ -72,9 +72,10 @@ WITH (fillfactor='100'); # verify working set size after some index access of a few select pages only blocks = query_scalar(cur, "select approximate_working_set_size(true)") log.info(f"working set size after some index access of a few select pages only {blocks}") - assert blocks < 10 + assert blocks < 12 +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") def test_sliding_working_set_approximation(neon_simple_env: NeonEnv): env = neon_simple_env diff --git a/test_runner/regress/test_local_file_cache.py b/test_runner/regress/test_local_file_cache.py index fbf018a167..94c630ffcf 100644 --- a/test_runner/regress/test_local_file_cache.py +++ b/test_runner/regress/test_local_file_cache.py @@ -6,10 +6,12 @@ import random import threading import time +import pytest from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.utils import query_scalar +from fixtures.utils import USE_LFC, query_scalar +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() @@ -19,8 +21,6 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder): endpoint = env.endpoints.create_start( "main", config_lines=[ - "shared_buffers='1MB'", - f"neon.file_cache_path='{cache_dir}/file.cache'", "neon.max_file_cache_size='64MB'", "neon.file_cache_size_limit='10MB'", ], diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index df83ca1c44..ba471b7147 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -12,7 +12,7 @@ from fixtures.neon_fixtures import ( logical_replication_sync, wait_for_last_flush_lsn, ) -from fixtures.utils import wait_until +from fixtures.utils import USE_LFC, wait_until if TYPE_CHECKING: from fixtures.neon_fixtures import ( @@ -576,7 +576,15 @@ def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg: Van # We want all data to fit into shared_buffers because later we stop # safekeeper and insert more; this shouldn't cause page requests as they # will be stuck. - sub = env.endpoints.create("subscriber", config_lines=["shared_buffers=128MB"]) + sub = env.endpoints.create( + "subscriber", + config_lines=[ + "neon.max_file_cache_size = 32MB", + "neon.file_cache_size_limit = 32MB", + ] + if USE_LFC + else [], + ) sub.start() with vanilla_pg.cursor() as pcur: diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 8b41d0cb1c..7f0b541128 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -3,7 +3,7 @@ from __future__ import annotations import re import time from concurrent.futures import ThreadPoolExecutor -from datetime import datetime, timedelta, timezone +from datetime import UTC, datetime, timedelta import pytest from fixtures.common_types import Lsn @@ -207,7 +207,7 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): for i in range(1000): cur.execute("INSERT INTO foo VALUES(%s)", (i,)) # Get the timestamp at UTC - after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=timezone.utc) + after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=UTC) after_lsn = query_scalar(cur, "SELECT pg_current_wal_lsn()") tbl.append([i, after_timestamp, after_lsn]) time.sleep(0.02) @@ -273,11 +273,7 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): ) log.info("result: %s, after_ts: %s", result, after_timestamp) - # TODO use fromisoformat once we have Python 3.11+ - # which has https://github.com/python/cpython/pull/92177 - timestamp = datetime.strptime(result, "%Y-%m-%dT%H:%M:%S.%f000Z").replace( - tzinfo=timezone.utc - ) + timestamp = datetime.fromisoformat(result).replace(tzinfo=UTC) assert timestamp < after_timestamp, "after_timestamp after timestamp" if i > 1: before_timestamp = tbl[i - step_size][1] diff --git a/test_runner/regress/test_oid_overflow.py b/test_runner/regress/test_oid_overflow.py index f69c1112c7..e2bde8be6f 100644 --- a/test_runner/regress/test_oid_overflow.py +++ b/test_runner/regress/test_oid_overflow.py @@ -39,7 +39,7 @@ def test_oid_overflow(neon_env_builder: NeonEnvBuilder): oid = cur.fetchall()[0][0] log.info(f"t2.relfilenode={oid}") - endpoint.clear_shared_buffers(cursor=cur) + endpoint.clear_buffers(cursor=cur) cur.execute("SELECT x from t1") assert cur.fetchone() == (1,) diff --git a/test_runner/regress/test_ondemand_slru_download.py b/test_runner/regress/test_ondemand_slru_download.py index 5eaba78331..f0f12290cc 100644 --- a/test_runner/regress/test_ondemand_slru_download.py +++ b/test_runner/regress/test_ondemand_slru_download.py @@ -1,7 +1,5 @@ from __future__ import annotations -from typing import Optional - import pytest from fixtures.common_types import Lsn from fixtures.log_helper import log @@ -13,7 +11,7 @@ from fixtures.utils import query_scalar # Test on-demand download of the pg_xact SLRUs # @pytest.mark.parametrize("shard_count", [None, 4]) -def test_ondemand_download_pg_xact(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): +def test_ondemand_download_pg_xact(neon_env_builder: NeonEnvBuilder, shard_count: int | None): if shard_count is not None: neon_env_builder.num_pageservers = shard_count @@ -79,7 +77,7 @@ def test_ondemand_download_pg_xact(neon_env_builder: NeonEnvBuilder, shard_count @pytest.mark.parametrize("shard_count", [None, 4]) -def test_ondemand_download_replica(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): +def test_ondemand_download_replica(neon_env_builder: NeonEnvBuilder, shard_count: int | None): if shard_count is not None: neon_env_builder.num_pageservers = shard_count diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index d1b70b9ee6..05e81b82e0 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -1,7 +1,5 @@ from __future__ import annotations -from typing import Optional - from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, @@ -82,7 +80,7 @@ def expect_updated_msg_lsn( client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId, - prev_msg_lsn: Optional[Lsn], + prev_msg_lsn: Lsn | None, ) -> Lsn: timeline_details = client.timeline_detail(tenant_id, timeline_id=timeline_id) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 4f59efb8b3..6ba5753420 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -11,11 +11,10 @@ of the pageserver are: from __future__ import annotations -import enum import os import re import time -from typing import TYPE_CHECKING +from enum import StrEnum import pytest from fixtures.common_types import TenantId, TimelineId @@ -41,10 +40,6 @@ from fixtures.remote_storage import ( from fixtures.utils import run_only_on_default_postgres, wait_until from fixtures.workload import Workload -if TYPE_CHECKING: - from typing import Optional - - # A tenant configuration that is convenient for generating uploads and deletions # without a large amount of postgres traffic. TENANT_CONF = { @@ -65,7 +60,7 @@ TENANT_CONF = { def read_all( - env: NeonEnv, tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None + env: NeonEnv, tenant_id: TenantId | None = None, timeline_id: TimelineId | None = None ): if tenant_id is None: tenant_id = env.initial_tenant @@ -286,12 +281,12 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): assert get_deletion_queue_unexpected_errors(ps_http) == 0 -class KeepAttachment(str, enum.Enum): +class KeepAttachment(StrEnum): KEEP = "keep" LOSE = "lose" -class ValidateBefore(str, enum.Enum): +class ValidateBefore(StrEnum): VALIDATE = "validate" NO_VALIDATE = "no-validate" @@ -464,7 +459,11 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): env.pageserver.start() # The pageserver should provide service to clients - generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver) + # Because it is in emergency mode, it will not attempt to validate deletions required by the initial barrier, and therefore + # other files cannot be uploaded b/c it's waiting for the initial barrier to be validated. + generate_uploads_and_deletions( + env, init=False, pageserver=env.pageserver, wait_until_uploaded=False + ) # The pageserver should neither validate nor execute any deletions, it should have # loaded the DeletionLists from before though diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index 200a323a3a..f6a7bfa1ad 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -2,7 +2,6 @@ from __future__ import annotations import asyncio import time -from typing import TYPE_CHECKING import psutil import pytest @@ -17,17 +16,13 @@ from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload from fixtures.utils import skip_in_debug_build, wait_until -if TYPE_CHECKING: - from typing import Optional - - TIMELINE_COUNT = 10 ENTRIES_PER_TIMELINE = 10_000 CHECKPOINT_TIMEOUT_SECONDS = 60 async def run_worker_for_tenant( - env: NeonEnv, entries: int, tenant: TenantId, offset: Optional[int] = None + env: NeonEnv, entries: int, tenant: TenantId, offset: int | None = None ) -> Lsn: if offset is None: offset = 0 @@ -136,7 +131,7 @@ def test_pageserver_small_inmemory_layers( wait_until_pageserver_is_caught_up(env, last_flush_lsns) # We didn't write enough data to trigger a size-based checkpoint: we should see dirty data. - wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore + wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) ps_http_client = env.pageserver.http_client() total_wal_ingested_before_restart = wait_for_wal_ingest_metric(ps_http_client) @@ -144,7 +139,7 @@ def test_pageserver_small_inmemory_layers( # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed, # such that there are zero bytes of ephemeral layer left on the pageserver log.info("Waiting for background checkpoints...") - wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # type: ignore + wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # Zero ephemeral layer bytes does not imply that all the frozen layers were uploaded: they # must be uploaded to remain visible to the pageserver after restart. @@ -185,7 +180,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): wait_until_pageserver_is_caught_up(env, last_flush_lsns) # We didn't write enough data to trigger a size-based checkpoint: we should see dirty data. - wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore + wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # Stop the safekeepers, so that we cannot have any more WAL receiver connections for sk in env.safekeepers: @@ -198,7 +193,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed, # such that there are zero bytes of ephemeral layer left on the pageserver log.info("Waiting for background checkpoints...") - wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # type: ignore + wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # The code below verifies that we do not flush on the first write # after an idle period longer than the checkpoint timeout. @@ -215,7 +210,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): run_worker_for_tenant(env, 5, tenant_with_extra_writes, offset=ENTRIES_PER_TIMELINE) ) - dirty_after_write = wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore + dirty_after_write = wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # We shouldn't flush since we've just opened a new layer waited_for = 0 @@ -317,4 +312,4 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder): dirty_bytes = get_dirty_bytes(env) assert dirty_bytes < max_dirty_data - wait_until(compaction_period_s * 2, 1, lambda: assert_dirty_data_limited()) # type: ignore + wait_until(compaction_period_s * 2, 1, lambda: assert_dirty_data_limited()) diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index fb6050689c..4bf5705517 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -2,7 +2,6 @@ from __future__ import annotations import random from contextlib import closing -from typing import Optional import pytest from fixtures.log_helper import log @@ -156,7 +155,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): @pytest.mark.timeout(540) @pytest.mark.parametrize("shard_count", [None, 4]) @skip_in_debug_build("times out in debug builds") -def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): +def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, shard_count: int | None): # same rationale as with the immediate stop; we might leave orphan layers behind. neon_env_builder.disable_scrub_on_exit() neon_env_builder.enable_pageserver_remote_storage(s3_storage()) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index d4aef96735..a264f4d3c9 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -23,7 +23,7 @@ from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response if TYPE_CHECKING: - from typing import Any, Optional, Union + from typing import Any # A tenant configuration that is convenient for generating uploads and deletions @@ -199,7 +199,7 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver, # state if it was running attached with a stale generation last_state[pageserver.id] = ("Detached", None) else: - secondary_conf: Optional[dict[str, Any]] = None + secondary_conf: dict[str, Any] | None = None if mode == "Secondary": secondary_conf = {"warm": rng.choice([True, False])} @@ -365,6 +365,19 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): workload.validate(pageserver_a.id) workload.validate(pageserver_b.id) + # Force compaction on destination pageserver + pageserver_b.http_client().timeline_compact(tenant_id, timeline_id, force_l0_compaction=True) + + # Destination pageserver is in AttachedMulti, it should have generated deletions but + # not enqueued them yet. + # Check deletion metrics via prometheus - should be 0 since we're in AttachedMulti + assert ( + pageserver_b.http_client().get_metric_value( + "pageserver_deletion_queue_submitted_total", + ) + == 0 + ) + # Revert the origin to secondary log.info("Setting origin to Secondary") pageserver_a.tenant_location_configure( @@ -389,6 +402,17 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): }, ) + # Transition to AttachedSingle should have drained deletions generated by doing a compaction + # while in AttachedMulti. + def blocked_deletions_drained(): + submitted = pageserver_b.http_client().get_metric_value( + "pageserver_deletion_queue_submitted_total" + ) + assert submitted is not None + assert submitted > 0 + + wait_until(10, 0.1, blocked_deletions_drained) + workload.churn_rows(64, pageserver_b.id) workload.validate(pageserver_b.id) del workload @@ -445,7 +469,7 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): def list_elegible_layers( - pageserver, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + pageserver, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId ) -> list[Path]: """ The subset of layer filenames that are elegible for secondary download: at time of writing this @@ -678,7 +702,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): else: timeout = int(deadline - now) + 1 try: - wait_until(timeout, 1, lambda: pageserver.assert_log_contains(expression)) # type: ignore + wait_until(timeout, 1, lambda: pageserver.assert_log_contains(expression)) except: log.error(f"Timed out waiting for '{expression}'") raise diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index f4698191eb..2877f14e0e 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -21,8 +21,6 @@ from fixtures.remote_storage import s3_storage from fixtures.utils import skip_in_debug_build if TYPE_CHECKING: - from typing import Optional - from fixtures.neon_fixtures import PgBin from pytest import CaptureFixture @@ -48,7 +46,7 @@ def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: End data properly. """ - ignored_files: Optional[list[str]] = None + ignored_files: list[str] | None = None # Neon handles unlogged relations in a special manner. During a # basebackup, we ship the init fork as the main fork. This presents a @@ -110,13 +108,15 @@ def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: End check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files) - # Ensure that compaction works, on a timeline containing all the diversity that postgres regression tests create. + # Ensure that compaction/GC works, on a timeline containing all the diversity that postgres regression tests create. # There should have been compactions mid-test as well, this final check is in addition those. for shard, pageserver in tenant_get_shards(env, env.initial_tenant): pageserver.http_client().timeline_checkpoint( shard, env.initial_timeline, force_repartition=True, force_image_layer_creation=True ) + pageserver.http_client().timeline_gc(shard, env.initial_timeline, None) + # Run the main PostgreSQL regression tests, in src/test/regress. # @@ -129,7 +129,7 @@ def test_pg_regress( capsys: CaptureFixture[str], base_dir: Path, pg_distrib_dir: Path, - shard_count: Optional[int], + shard_count: int | None, ): DBNAME = "regression" @@ -203,7 +203,7 @@ def test_isolation( capsys: CaptureFixture[str], base_dir: Path, pg_distrib_dir: Path, - shard_count: Optional[int], + shard_count: int | None, ): DBNAME = "isolation_regression" @@ -272,7 +272,7 @@ def test_sql_regress( capsys: CaptureFixture[str], base_dir: Path, pg_distrib_dir: Path, - shard_count: Optional[int], + shard_count: int | None, ): DBNAME = "regression" diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index e59d46e352..5a01d90d85 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -13,7 +13,7 @@ import requests from fixtures.neon_fixtures import PSQL, NeonProxy, VanillaPostgres if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any GET_CONNECTION_PID_QUERY = "SELECT pid FROM pg_stat_activity WHERE state = 'active'" @@ -228,7 +228,7 @@ def test_sql_over_http_serverless_driver(static_proxy: NeonProxy): def test_sql_over_http(static_proxy: NeonProxy): static_proxy.safe_psql("create role http with login password 'http' superuser") - def q(sql: str, params: Optional[list[Any]] = None) -> Any: + def q(sql: str, params: list[Any] | None = None) -> Any: params = params or [] connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres" response = requests.post( @@ -291,7 +291,7 @@ def test_sql_over_http_db_name_with_space(static_proxy: NeonProxy): ) ) - def q(sql: str, params: Optional[list[Any]] = None) -> Any: + def q(sql: str, params: list[Any] | None = None) -> Any: params = params or [] connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/{urllib.parse.quote(db)}" response = requests.post( @@ -310,7 +310,7 @@ def test_sql_over_http_db_name_with_space(static_proxy: NeonProxy): def test_sql_over_http_output_options(static_proxy: NeonProxy): static_proxy.safe_psql("create role http2 with login password 'http2' superuser") - def q(sql: str, raw_text: bool, array_mode: bool, params: Optional[list[Any]] = None) -> Any: + def q(sql: str, raw_text: bool, array_mode: bool, params: list[Any] | None = None) -> Any: params = params or [] connstr = ( f"postgresql://http2:http2@{static_proxy.domain}:{static_proxy.proxy_port}/postgres" @@ -346,7 +346,7 @@ def test_sql_over_http_batch(static_proxy: NeonProxy): static_proxy.safe_psql("create role http with login password 'http' superuser") def qq( - queries: list[tuple[str, Optional[list[Any]]]], + queries: list[tuple[str, list[Any] | None]], read_only: bool = False, deferrable: bool = False, ) -> Any: diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py index 471a3b406a..70a7a675df 100644 --- a/test_runner/regress/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -54,7 +54,7 @@ def test_read_validation(neon_simple_env: NeonEnv): log.info("Clear buffer cache to ensure no stale pages are brought into the cache") - endpoint.clear_shared_buffers(cursor=c) + endpoint.clear_buffers(cursor=c) cache_entries = query_scalar( c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 826136d5f9..70d558ac5a 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -1,7 +1,6 @@ from __future__ import annotations import time -from typing import Union import pytest from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId @@ -122,7 +121,6 @@ def test_readonly_node(neon_simple_env: NeonEnv): ) -@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/9754") def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): """ Test static endpoint is protected from GC by acquiring and renewing lsn leases. @@ -175,7 +173,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): def get_layers_protected_by_lease( ps_http: PageserverHttpClient, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lease_lsn: Lsn, ) -> set[str]: @@ -232,7 +230,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): return offset # Insert some records on main branch - with env.endpoints.create_start("main") as ep_main: + with env.endpoints.create_start("main", config_lines=["shared_buffers=1MB"]) as ep_main: with ep_main.cursor() as cur: cur.execute("CREATE TABLE t0(v0 int primary key, v1 text)") lsn = Lsn(0) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 79b5ebe39a..137e75f784 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -5,7 +5,6 @@ import queue import shutil import threading import time -from typing import TYPE_CHECKING import pytest from fixtures.common_types import Lsn, TenantId, TimelineId @@ -37,9 +36,6 @@ from fixtures.utils import ( ) from requests import ReadTimeout -if TYPE_CHECKING: - from typing import Optional - # # Tests that a piece of data is backed up and restored correctly: @@ -452,7 +448,7 @@ def test_remote_timeline_client_calls_started_metric( for (file_kind, op_kind), observations in calls_started.items(): log.info(f"ensure_calls_started_grew: {file_kind} {op_kind}: {observations}") assert all( - x < y for x, y in zip(observations, observations[1:]) + x < y for x, y in zip(observations, observations[1:], strict=False) ), f"observations for {file_kind} {op_kind} did not grow monotonically: {observations}" def churn(data_pass1, data_pass2): @@ -731,7 +727,7 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv # sleep a bit to force the upload task go into exponential backoff time.sleep(1) - q: queue.Queue[Optional[PageserverApiException]] = queue.Queue() + q: queue.Queue[PageserverApiException | None] = queue.Queue() barrier = threading.Barrier(2) def create_in_background(): diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py index 7a9e6d62b2..8764da3c2f 100644 --- a/test_runner/regress/test_s3_restore.py +++ b/test_runner/regress/test_s3_restore.py @@ -1,7 +1,7 @@ from __future__ import annotations import time -from datetime import datetime, timezone +from datetime import UTC, datetime from fixtures.common_types import Lsn from fixtures.log_helper import log @@ -77,7 +77,7 @@ def test_tenant_s3_restore( # These sleeps are important because they fend off differences in clocks between us and S3 time.sleep(4) - ts_before_deletion = datetime.now(tz=timezone.utc).replace(tzinfo=None) + ts_before_deletion = datetime.now(tz=UTC).replace(tzinfo=None) time.sleep(4) assert ( @@ -104,7 +104,7 @@ def test_tenant_s3_restore( ) time.sleep(4) - ts_after_deletion = datetime.now(tz=timezone.utc).replace(tzinfo=None) + ts_after_deletion = datetime.now(tz=UTC).replace(tzinfo=None) time.sleep(4) ps_http.tenant_time_travel_remote_storage( diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 0a4a53356d..411574bd86 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -3,7 +3,7 @@ from __future__ import annotations import os import time from collections import defaultdict -from typing import TYPE_CHECKING, Any +from typing import Any import pytest import requests @@ -19,7 +19,7 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, ) from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty -from fixtures.remote_storage import s3_storage +from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, s3_storage from fixtures.utils import skip_in_debug_build, wait_until from fixtures.workload import Workload from pytest_httpserver import HTTPServer @@ -27,9 +27,6 @@ from typing_extensions import override from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response -if TYPE_CHECKING: - from typing import Optional, Union - def test_sharding_smoke( neon_env_builder: NeonEnvBuilder, @@ -189,7 +186,7 @@ def test_sharding_split_unsharded( ], ) def test_sharding_split_compaction( - neon_env_builder: NeonEnvBuilder, failpoint: Optional[str], build_type: str + neon_env_builder: NeonEnvBuilder, failpoint: str | None, build_type: str ): """ Test that after a split, we clean up parent layer data in the child shards via compaction. @@ -515,11 +512,12 @@ def test_sharding_split_smoke( """ - # We will start with 4 shards and split into 8, then migrate all those - # 8 shards onto separate pageservers - shard_count = 4 - split_shard_count = 8 - neon_env_builder.num_pageservers = split_shard_count * 2 + # Shard count we start with + shard_count = 2 + # Shard count we split into + split_shard_count = 4 + # We will have 2 shards per pageserver once done (including secondaries) + neon_env_builder.num_pageservers = split_shard_count # 1MiB stripes: enable getting some meaningful data distribution without # writing large quantities of data in this test. The stripe size is given @@ -591,7 +589,7 @@ def test_sharding_split_smoke( workload.validate() - assert len(pre_split_pageserver_ids) == 4 + assert len(pre_split_pageserver_ids) == shard_count def shards_on_disk(shard_ids): for pageserver in env.pageservers: @@ -654,9 +652,9 @@ def test_sharding_split_smoke( # - shard_count reconciles for the original setup of the tenant # - shard_count reconciles for detaching the original secondary locations during split # - split_shard_count reconciles during shard splitting, for setting up secondaries. - # - shard_count of the child shards will need to fail over to their secondaries - # - shard_count of the child shard secondary locations will get moved to emptier nodes - expect_reconciles = shard_count * 2 + split_shard_count + shard_count * 2 + # - split_shard_count/2 of the child shards will need to fail over to their secondaries (since we have 8 shards and 4 pageservers, only 4 will move) + expect_reconciles = shard_count * 2 + split_shard_count + split_shard_count / 2 + reconcile_ok = env.storage_controller.get_metric_value( "storage_controller_reconcile_complete_total", filter={"status": "ok"} ) @@ -720,22 +718,10 @@ def test_sharding_split_smoke( # dominated by shard count. log.info(f"total: {total}") assert total == { - 1: 1, - 2: 1, - 3: 1, - 4: 1, - 5: 1, - 6: 1, - 7: 1, - 8: 1, - 9: 1, - 10: 1, - 11: 1, - 12: 1, - 13: 1, - 14: 1, - 15: 1, - 16: 1, + 1: 2, + 2: 2, + 3: 2, + 4: 2, } # The controller is not required to lay out the attached locations in any particular way, but @@ -793,7 +779,7 @@ def test_sharding_split_stripe_size( tenant_id = env.initial_tenant assert len(notifications) == 1 - expect: dict[str, Union[list[dict[str, int]], str, None, int]] = { + expect: dict[str, list[dict[str, int]] | str | None | int] = { "tenant_id": str(env.initial_tenant), "stripe_size": None, "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], @@ -809,7 +795,7 @@ def test_sharding_split_stripe_size( # Check that we ended up with the stripe size that we expected, both on the pageserver # and in the notifications to compute assert len(notifications) == 2 - expect_after: dict[str, Union[list[dict[str, int]], str, None, int]] = { + expect_after: dict[str, list[dict[str, int]] | str | None | int] = { "tenant_id": str(env.initial_tenant), "stripe_size": new_stripe_size, "shards": [ @@ -1057,7 +1043,7 @@ def test_sharding_ingest_gaps( class Failure: - pageserver_id: Optional[int] + pageserver_id: int | None def apply(self, env: NeonEnv): raise NotImplementedError() @@ -1381,7 +1367,7 @@ def test_sharding_split_failures( assert attached_count == initial_shard_count - def assert_split_done(exclude_ps_id: Optional[int] = None) -> None: + def assert_split_done(exclude_ps_id: int | None = None) -> None: secondary_count = 0 attached_count = 0 for ps in env.pageservers: @@ -1419,7 +1405,7 @@ def test_sharding_split_failures( # e.g. while waiting for a storage controller to re-attach a parent shard if we failed # inside the pageserver and the storage controller responds by detaching children and attaching # parents concurrently (https://github.com/neondatabase/neon/issues/7148) - wait_until(10, 1, lambda: workload.churn_rows(10, upload=False, ingest=False)) # type: ignore + wait_until(10, 1, lambda: workload.churn_rows(10, upload=False, ingest=False)) workload.validate() @@ -1685,3 +1671,111 @@ def test_top_tenants(neon_env_builder: NeonEnvBuilder): ) assert len(top["shards"]) == n_tenants - 4 assert set(i["id"] for i in top["shards"]) == set(str(i[0]) for i in tenants[4:]) + + +def test_sharding_gc( + neon_env_builder: NeonEnvBuilder, +): + """ + Exercise GC in a sharded tenant: because only shard 0 holds SLRU content, it acts as + the "leader" for GC, and other shards read its index to learn what LSN they should + GC up to. + """ + + shard_count = 4 + neon_env_builder.num_pageservers = shard_count + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": 128 * 1024, + "compaction_threshold": 1, + "compaction_target_size": 128 * 1024, + # A short PITR horizon, so that we won't have to sleep too long in the test to wait for it to + # happen. + "pitr_interval": "1s", + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # Disable automatic creation of image layers, as we will create them explicitly when we want them + "image_creation_threshold": 9999, + "image_layer_creation_check_threshold": 0, + "lsn_lease_length": "0s", + } + env = neon_env_builder.init_start( + initial_tenant_shard_count=shard_count, initial_tenant_conf=TENANT_CONF + ) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # Create a branch and write some data + workload = Workload(env, tenant_id, timeline_id) + initial_lsn = Lsn(workload.endpoint().safe_psql("SELECT pg_current_wal_lsn()")[0][0]) + log.info(f"Started at LSN: {initial_lsn}") + + workload.init() + + # Write enough data to generate multiple layers + for _i in range(10): + last_lsn = workload.write_rows(32) + + assert last_lsn > initial_lsn + + log.info(f"Wrote up to last LSN: {last_lsn}") + + # Do full image layer generation. When we subsequently wait for PITR, all historic deltas + # should be GC-able + for shard_number in range(shard_count): + shard = TenantShardId(tenant_id, shard_number, shard_count) + env.get_tenant_pageserver(shard).http_client().timeline_compact( + shard, timeline_id, force_image_layer_creation=True + ) + + workload.churn_rows(32) + + time.sleep(5) + + # Invoke GC on a non-zero shard and verify its GC cutoff LSN does not advance + shard_one = TenantShardId(tenant_id, 1, shard_count) + env.get_tenant_pageserver(shard_one).http_client().timeline_gc( + shard_one, timeline_id, gc_horizon=None + ) + + # Check shard 1's index - GC cutoff LSN should not have advanced + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + shard_1_index = env.pageserver_remote_storage.index_content( + tenant_id=shard_one, timeline_id=timeline_id + ) + shard_1_gc_cutoff_lsn = Lsn(shard_1_index["metadata_bytes"]["latest_gc_cutoff_lsn"]) + log.info(f"Shard 1 cutoff LSN: {shard_1_gc_cutoff_lsn}") + assert shard_1_gc_cutoff_lsn <= last_lsn + + shard_zero = TenantShardId(tenant_id, 0, shard_count) + env.get_tenant_pageserver(shard_zero).http_client().timeline_gc( + shard_zero, timeline_id, gc_horizon=None + ) + + # TODO: observe that GC LSN of shard 0 has moved forward in remote storage + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + shard_0_index = env.pageserver_remote_storage.index_content( + tenant_id=shard_zero, timeline_id=timeline_id + ) + shard_0_gc_cutoff_lsn = Lsn(shard_0_index["metadata_bytes"]["latest_gc_cutoff_lsn"]) + log.info(f"Shard 0 cutoff LSN: {shard_0_gc_cutoff_lsn}") + assert shard_0_gc_cutoff_lsn >= last_lsn + + # Invoke GC on all other shards and verify their GC cutoff LSNs + for shard_number in range(1, shard_count): + shard = TenantShardId(tenant_id, shard_number, shard_count) + env.get_tenant_pageserver(shard).http_client().timeline_gc( + shard, timeline_id, gc_horizon=None + ) + + # Verify GC cutoff LSN advanced to match shard 0 + shard_index = env.pageserver_remote_storage.index_content( + tenant_id=shard, timeline_id=timeline_id + ) + shard_gc_cutoff_lsn = Lsn(shard_index["metadata_bytes"]["latest_gc_cutoff_lsn"]) + log.info(f"Shard {shard_number} cutoff LSN: {shard_gc_cutoff_lsn}") + assert shard_gc_cutoff_lsn == shard_0_gc_cutoff_lsn diff --git a/test_runner/regress/test_sni_router.py b/test_runner/regress/test_sni_router.py index 402f27b384..2a26fef59a 100644 --- a/test_runner/regress/test_sni_router.py +++ b/test_runner/regress/test_sni_router.py @@ -3,7 +3,6 @@ from __future__ import annotations import socket import subprocess from pathlib import Path -from types import TracebackType from typing import TYPE_CHECKING import backoff @@ -12,7 +11,8 @@ from fixtures.neon_fixtures import PgProtocol, VanillaPostgres from fixtures.port_distributor import PortDistributor if TYPE_CHECKING: - from typing import Optional + from types import TracebackType + from typing import Self def generate_tls_cert(cn, certout, keyout): @@ -55,10 +55,10 @@ class PgSniRouter(PgProtocol): self.destination = destination self.tls_cert = tls_cert self.tls_key = tls_key - self._popen: Optional[subprocess.Popen[bytes]] = None + self._popen: subprocess.Popen[bytes] | None = None self.test_output_dir = test_output_dir - def start(self) -> PgSniRouter: + def start(self) -> Self: assert self._popen is None args = [ str(self.neon_binpath / "pg_sni_router"), @@ -91,14 +91,14 @@ class PgSniRouter(PgProtocol): if self._popen: self._popen.wait(timeout=2) - def __enter__(self) -> PgSniRouter: + def __enter__(self) -> Self: return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc: Optional[BaseException], - tb: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, ): if self._popen is not None: self._popen.terminate() diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 2c3d79b18a..13bc54a114 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -5,7 +5,7 @@ import json import threading import time from collections import defaultdict -from datetime import datetime, timezone +from datetime import UTC, datetime from enum import Enum from typing import TYPE_CHECKING @@ -56,7 +56,7 @@ from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response if TYPE_CHECKING: - from typing import Any, Optional, Union + from typing import Any def get_node_shard_counts(env: NeonEnv, tenant_ids): @@ -593,7 +593,7 @@ def test_storage_controller_compute_hook( # Initial notification from tenant creation assert len(notifications) == 1 - expect: dict[str, Union[list[dict[str, int]], str, None, int]] = { + expect: dict[str, list[dict[str, int]] | str | None | int] = { "tenant_id": str(env.initial_tenant), "stripe_size": None, "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], @@ -708,7 +708,7 @@ def test_storage_controller_stuck_compute_hook( # Initial notification from tenant creation assert len(notifications) == 1 - expect: dict[str, Union[list[dict[str, int]], str, None, int]] = { + expect: dict[str, list[dict[str, int]] | str | None | int] = { "tenant_id": str(env.initial_tenant), "stripe_size": None, "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], @@ -1048,7 +1048,7 @@ def test_storage_controller_s3_time_travel_recovery( ) time.sleep(4) - ts_before_disaster = datetime.now(tz=timezone.utc).replace(tzinfo=None) + ts_before_disaster = datetime.now(tz=UTC).replace(tzinfo=None) time.sleep(4) # Simulate a "disaster": delete some random files from remote storage for one of the shards @@ -1072,7 +1072,7 @@ def test_storage_controller_s3_time_travel_recovery( pass time.sleep(4) - ts_after_disaster = datetime.now(tz=timezone.utc).replace(tzinfo=None) + ts_after_disaster = datetime.now(tz=UTC).replace(tzinfo=None) time.sleep(4) # Do time travel recovery @@ -2274,7 +2274,7 @@ def test_storage_controller_node_deletion( @pytest.mark.parametrize("shard_count", [None, 2]) def test_storage_controller_metadata_health( neon_env_builder: NeonEnvBuilder, - shard_count: Optional[int], + shard_count: int | None, ): """ Create three tenants A, B, C. @@ -2494,14 +2494,14 @@ def start_env(env: NeonEnv, storage_controller_port: int): for pageserver in env.pageservers: futs.append( executor.submit( - lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds) + lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds) # type: ignore[misc] ) ) for safekeeper in env.safekeepers: futs.append( executor.submit( - lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds) + lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds) # type: ignore[misc] ) ) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 11ad2173ae..3991bd7061 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -6,7 +6,6 @@ import shutil import threading import time from concurrent.futures import ThreadPoolExecutor -from typing import TYPE_CHECKING import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId @@ -20,12 +19,9 @@ from fixtures.remote_storage import S3Storage, s3_storage from fixtures.utils import wait_until from fixtures.workload import Workload -if TYPE_CHECKING: - from typing import Optional - @pytest.mark.parametrize("shard_count", [None, 4]) -def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): +def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: int | None): """ Test the `tenant-snapshot` subcommand, which grabs data from remote storage @@ -131,7 +127,7 @@ def drop_local_state(env: NeonEnv, tenant_id: TenantId): @pytest.mark.parametrize("shard_count", [None, 4]) -def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): +def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: int | None): neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.num_pageservers = 2 @@ -179,9 +175,7 @@ def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Opt @pytest.mark.parametrize("shard_count", [None, 2]) -def test_scrubber_physical_gc_ancestors( - neon_env_builder: NeonEnvBuilder, shard_count: Optional[int] -): +def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_count: int | None): neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.num_pageservers = 2 @@ -499,7 +493,7 @@ def test_scrubber_physical_gc_ancestors_split(neon_env_builder: NeonEnvBuilder): @pytest.mark.parametrize("shard_count", [None, 4]) def test_scrubber_scan_pageserver_metadata( - neon_env_builder: NeonEnvBuilder, shard_count: Optional[int] + neon_env_builder: NeonEnvBuilder, shard_count: int | None ): """ Create some layers. Delete an object listed in index. Run scrubber and see if it detects the defect. diff --git a/test_runner/regress/test_subxacts.py b/test_runner/regress/test_subxacts.py index 7a46f0140c..1d86c353be 100644 --- a/test_runner/regress/test_subxacts.py +++ b/test_runner/regress/test_subxacts.py @@ -1,6 +1,7 @@ from __future__ import annotations -from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content +import pytest +from fixtures.neon_fixtures import NeonEnvBuilder, check_restored_datadir_content # Test subtransactions @@ -9,8 +10,13 @@ from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content # maintained in the pageserver, so subtransactions are not very exciting for # Neon. They are included in the commit record though and updated in the # CLOG. -def test_subxacts(neon_simple_env: NeonEnv, test_output_dir): - env = neon_simple_env +@pytest.mark.parametrize("wal_receiver_protocol", ["vanilla", "interpreted"]) +def test_subxacts(neon_env_builder: NeonEnvBuilder, test_output_dir, wal_receiver_protocol): + neon_env_builder.pageserver_config_override = ( + f"wal_receiver_protocol = '{wal_receiver_protocol}'" + ) + + env = neon_env_builder.init_start() endpoint = env.endpoints.create_start("main") pg_conn = endpoint.connect() diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 59c14b3263..8d7ca7bc4e 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -1,11 +1,10 @@ from __future__ import annotations import asyncio -import enum import random import time +from enum import StrEnum from threading import Thread -from typing import TYPE_CHECKING import asyncpg import pytest @@ -28,10 +27,6 @@ from fixtures.remote_storage import ( from fixtures.utils import query_scalar, wait_until from prometheus_client.samples import Sample -if TYPE_CHECKING: - from typing import Optional - - # In tests that overlap endpoint activity with tenant attach/detach, there are # a variety of warnings that the page service may emit when it cannot acquire # an active tenant to serve a request @@ -57,7 +52,7 @@ def do_gc_target( log.info("gc http thread returning") -class ReattachMode(str, enum.Enum): +class ReattachMode(StrEnum): REATTACH_EXPLICIT = "explicit" REATTACH_RESET = "reset" REATTACH_RESET_DROP = "reset_drop" @@ -498,7 +493,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( r".* Changing Active tenant to Broken state, reason: broken from test" ) - def only_int(samples: list[Sample]) -> Optional[int]: + def only_int(samples: list[Sample]) -> int | None: if len(samples) == 1: return int(samples[0].value) assert len(samples) == 0 diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index fc9adb14c9..bf6120aa0a 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -28,7 +28,7 @@ from fixtures.utils import ( ) if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float): @@ -78,7 +78,7 @@ def populate_branch( tenant_id: TenantId, ps_http: PageserverHttpClient, create_table: bool, - expected_sum: Optional[int], + expected_sum: int | None, ) -> tuple[TimelineId, Lsn]: # insert some data with pg_cur(endpoint) as cur: diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 0650f12cd1..bc2e048f69 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -4,7 +4,6 @@ import json import random import threading import time -from typing import Optional import pytest import requests @@ -661,7 +660,7 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): ], ) def test_timeline_retain_lsn( - neon_env_builder: NeonEnvBuilder, with_intermediary: bool, offload_child: Optional[str] + neon_env_builder: NeonEnvBuilder, with_intermediary: bool, offload_child: str | None ): """ Ensure that retain_lsn functionality for timelines works, both for offloaded and non-offloaded ones diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index ef0eb05612..9c7e851ba8 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -5,6 +5,7 @@ import enum import threading import time from concurrent.futures import ThreadPoolExecutor +from enum import StrEnum from queue import Empty, Queue from threading import Barrier @@ -22,7 +23,8 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_timeline_detail_404 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind -from fixtures.utils import assert_pageserver_backups_equal, wait_until +from fixtures.utils import assert_pageserver_backups_equal, skip_in_debug_build, wait_until +from fixtures.workload import Workload from requests import ReadTimeout @@ -36,7 +38,7 @@ def layer_name(info: HistoricLayerInfo) -> str: @enum.unique -class Branchpoint(str, enum.Enum): +class Branchpoint(StrEnum): """ Have branches at these Lsns possibly relative to L0 layer boundary. """ @@ -414,7 +416,7 @@ def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEn assert client.timeline_detail(env.initial_tenant, timeline_id)["ancestor_timeline_id"] is None - ep.clear_shared_buffers() + ep.clear_buffers() assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows assert ep.safe_psql("SELECT SUM(LENGTH(aux)) FROM foo")[0][0] != 0 ep.stop() @@ -1549,6 +1551,57 @@ def test_timeline_is_deleted_before_timeline_detach_ancestor_completes( env.pageserver.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset) +@skip_in_debug_build("only run with release build") +def test_pageserver_compaction_detach_ancestor_smoke(neon_env_builder: NeonEnvBuilder): + SMOKE_CONF = { + # Run both gc and gc-compaction. + "gc_period": "5s", + "compaction_period": "5s", + # No PiTR interval and small GC horizon + "pitr_interval": "0s", + "gc_horizon": f"{1024 ** 2}", + "lsn_lease_length": "0s", + # Small checkpoint distance to create many layers + "checkpoint_distance": 1024**2, + # Compact small layers + "compaction_target_size": 1024**2, + "image_creation_threshold": 2, + } + + env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + row_count = 10000 + churn_rounds = 50 + + ps_http = env.pageserver.http_client() + + workload_parent = Workload(env, tenant_id, timeline_id) + workload_parent.init(env.pageserver.id) + log.info("Writing initial data ...") + workload_parent.write_rows(row_count, env.pageserver.id) + branch_id = env.create_branch("child") + workload_child = Workload(env, tenant_id, branch_id, branch_name="child") + workload_child.init(env.pageserver.id, allow_recreate=True) + log.info("Writing initial data on child...") + workload_child.write_rows(row_count, env.pageserver.id) + + for i in range(1, churn_rounds + 1): + if i % 10 == 0: + log.info(f"Running churn round {i}/{churn_rounds} ...") + + workload_parent.churn_rows(row_count, env.pageserver.id) + workload_child.churn_rows(row_count, env.pageserver.id) + + ps_http.detach_ancestor(tenant_id, branch_id) + + log.info("Validating at workload end ...") + workload_parent.validate(env.pageserver.id) + workload_child.validate(env.pageserver.id) + + # TODO: # - branch near existing L1 boundary, image layers? # - investigate: why are layers started at uneven lsn? not just after branching, but in general. diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py index c19c78e251..5a5ca3290a 100644 --- a/test_runner/regress/test_timeline_gc_blocking.py +++ b/test_runner/regress/test_timeline_gc_blocking.py @@ -3,7 +3,6 @@ from __future__ import annotations import time from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass -from typing import TYPE_CHECKING import pytest from fixtures.log_helper import log @@ -14,9 +13,6 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.utils import wait_timeline_detail_404 -if TYPE_CHECKING: - from typing import Optional - @pytest.mark.parametrize("sharded", [True, False]) def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool): @@ -89,7 +85,7 @@ def wait_for_another_gc_round(): @dataclass class ScrollableLog: pageserver: NeonPageserver - offset: Optional[LogCursor] + offset: LogCursor | None def assert_log_contains(self, what: str): msg, offset = self.pageserver.assert_log_contains(what, offset=self.offset) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 85c6d17142..4528bc6180 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -7,7 +7,6 @@ import time from collections import defaultdict from contextlib import closing from pathlib import Path -from typing import Optional import psycopg2.errors import psycopg2.extras @@ -668,7 +667,7 @@ def test_tenant_physical_size(neon_env_builder: NeonEnvBuilder): class TimelinePhysicalSizeValues: api_current_physical: int prometheus_resident_physical: float - prometheus_remote_physical: Optional[float] = None + prometheus_remote_physical: float | None = None python_timelinedir_layerfiles_physical: int layer_map_file_size_sum: int diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index d4c2ca7e07..f93fc6bd8b 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -63,7 +63,7 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): # Clear the buffer cache, to force the VM page to be re-fetched from # the page server - endpoint.clear_shared_buffers(cursor=cur) + endpoint.clear_buffers(cursor=cur) # Check that an index-only scan doesn't see the deleted row. If the # clearing of the VM bit was not replayed correctly, this would incorrectly diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 0676b3dd9a..8fa33b81a9 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -61,7 +61,7 @@ from fixtures.utils import ( ) if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any, Self def wait_lsn_force_checkpoint( @@ -189,7 +189,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): m.flush_lsns.append(Lsn(int(sk_m.flush_lsn_inexact(tenant_id, timeline_id)))) m.commit_lsns.append(Lsn(int(sk_m.commit_lsn_inexact(tenant_id, timeline_id)))) - for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns): + for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns, strict=False): # Invariant. May be < when transaction is in progress. assert ( commit_lsn <= flush_lsn @@ -224,7 +224,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): def __init__(self) -> None: super().__init__(daemon=True) self.should_stop = threading.Event() - self.exception: Optional[BaseException] = None + self.exception: BaseException | None = None def run(self) -> None: try: @@ -521,7 +521,7 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder): # Shut down subsequently each of safekeepers and fill a segment while sk is # down; ensure segment gets offloaded by others. offloaded_seg_end = [Lsn("0/2000000"), Lsn("0/3000000"), Lsn("0/4000000")] - for victim, seg_end in zip(env.safekeepers, offloaded_seg_end): + for victim, seg_end in zip(env.safekeepers, offloaded_seg_end, strict=False): victim.stop() # roughly fills one segment cur.execute("insert into t select generate_series(1,250000), 'payload'") @@ -666,7 +666,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder): # recreate timeline on pageserver from scratch ps_http.timeline_create( - pg_version=PgVersion(pg_version), + pg_version=PgVersion(str(pg_version)), tenant_id=tenant_id, new_timeline_id=timeline_id, ) @@ -1177,14 +1177,14 @@ def cmp_sk_wal(sks: list[Safekeeper], tenant_id: TenantId, timeline_id: Timeline # report/understand if WALs are different due to that. statuses = [sk_http_cli.timeline_status(tenant_id, timeline_id) for sk_http_cli in sk_http_clis] term_flush_lsns = [(s.last_log_term, s.flush_lsn) for s in statuses] - for tfl, sk in zip(term_flush_lsns[1:], sks[1:]): + for tfl, sk in zip(term_flush_lsns[1:], sks[1:], strict=False): assert ( term_flush_lsns[0] == tfl ), f"(last_log_term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}" # check that WALs are identic. segs = [sk.list_segments(tenant_id, timeline_id) for sk in sks] - for cmp_segs, sk in zip(segs[1:], sks[1:]): + for cmp_segs, sk in zip(segs[1:], sks[1:], strict=False): assert ( segs[0] == cmp_segs ), f"lists of segments on sks {sks[0].id} and {sk.id} are not identic: {segs[0]} and {cmp_segs}" @@ -1455,12 +1455,12 @@ class SafekeeperEnv: self.pg_bin = pg_bin self.num_safekeepers = num_safekeepers self.bin_safekeeper = str(neon_binpath / "safekeeper") - self.safekeepers: Optional[list[subprocess.CompletedProcess[Any]]] = None - self.postgres: Optional[ProposerPostgres] = None - self.tenant_id: Optional[TenantId] = None - self.timeline_id: Optional[TimelineId] = None + self.safekeepers: list[subprocess.CompletedProcess[Any]] | None = None + self.postgres: ProposerPostgres | None = None + self.tenant_id: TenantId | None = None + self.timeline_id: TimelineId | None = None - def init(self) -> SafekeeperEnv: + def init(self) -> Self: assert self.postgres is None, "postgres is already initialized" assert self.safekeepers is None, "safekeepers are already initialized" @@ -1541,7 +1541,7 @@ class SafekeeperEnv: log.info(f"Killing safekeeper with pid {pid}") os.kill(pid, signal.SIGKILL) - def __enter__(self): + def __enter__(self) -> Self: return self def __exit__(self, exc_type, exc_value, traceback): @@ -1784,6 +1784,89 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): cur.execute("INSERT INTO t (key) VALUES (123)") +def test_delete_timeline_under_load(neon_env_builder: NeonEnvBuilder): + """ + Test deleting timelines on a safekeeper while they're under load. + + This should not happen under normal operation, but it can happen if + there is some rogue compute/pageserver that is writing/reading to a + safekeeper that we're migrating a timeline away from, or if the timeline + is being deleted while such a rogue client is running. + """ + neon_env_builder.auth_enabled = True + env = neon_env_builder.init_start() + + # Create two endpoints that will generate load + timeline_id_a = env.create_branch("deleteme_a") + timeline_id_b = env.create_branch("deleteme_b") + + endpoint_a = env.endpoints.create("deleteme_a") + endpoint_a.start() + endpoint_b = env.endpoints.create("deleteme_b") + endpoint_b.start() + + # Get tenant and timeline IDs + tenant_id = env.initial_tenant + + # Start generating load on both timelines + def generate_load(endpoint: Endpoint): + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE IF NOT EXISTS t(key int, value text)") + while True: + try: + cur.execute("INSERT INTO t SELECT generate_series(1,1000), 'data'") + except: # noqa + # Ignore errors since timeline may be deleted + break + + t_a = threading.Thread(target=generate_load, args=(endpoint_a,)) + t_b = threading.Thread(target=generate_load, args=(endpoint_b,)) + try: + t_a.start() + t_b.start() + + # Let the load run for a bit + log.info("Warming up...") + time.sleep(2) + + # Safekeeper errors will propagate to the pageserver: it is correct that these are + # logged at error severity because they indicate the pageserver is trying to read + # a timeline that it shouldn't. + env.pageserver.allowed_errors.extend( + [ + ".*Timeline.*was cancelled.*", + ".*Timeline.*was not found.*", + ] + ) + + # Try deleting timelines while under load + sk = env.safekeepers[0] + sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) + + # Delete first timeline + log.info(f"Deleting {timeline_id_a}...") + assert sk_http.timeline_delete(tenant_id, timeline_id_a, only_local=True)["dir_existed"] + + # Delete second timeline + log.info(f"Deleting {timeline_id_b}...") + assert sk_http.timeline_delete(tenant_id, timeline_id_b, only_local=True)["dir_existed"] + + # Verify timelines are gone from disk + sk_data_dir = sk.data_dir + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_a)).exists() + # assert not (sk_data_dir / str(tenant_id) / str(timeline_id_b)).exists() + + finally: + log.info("Stopping endpoints...") + # Stop endpoints with immediate mode because we deleted the timeline out from under the compute, which may cause it to hang + endpoint_a.stop(mode="immediate") + endpoint_b.stop(mode="immediate") + log.info("Joining threads...") + t_a.join() + t_b.join() + + # Basic pull_timeline test. # When live_sk_change is False, compute is restarted to change set of # safekeepers; otherwise it is live reload. @@ -2363,7 +2446,7 @@ def test_broker_discovery(neon_env_builder: NeonEnvBuilder): # generate some data to commit WAL on safekeepers endpoint.safe_psql("insert into t select generate_series(1,100), 'action'") # clear the buffers - endpoint.clear_shared_buffers() + endpoint.clear_buffers() # read data to fetch pages from pageserver endpoint.safe_psql("select sum(i) from t") diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index d3e989afa8..094b10b576 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -5,7 +5,6 @@ import random import time from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING import asyncpg import pytest @@ -16,10 +15,6 @@ from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper from fixtures.remote_storage import RemoteStorageKind from fixtures.utils import skip_in_debug_build -if TYPE_CHECKING: - from typing import Optional - - log = getLogger("root.safekeeper_async") @@ -261,7 +256,7 @@ def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder): def endpoint_create_start( - env: NeonEnv, branch: str, pgdir_name: Optional[str], allow_multiple: bool = False + env: NeonEnv, branch: str, pgdir_name: str | None, allow_multiple: bool = False ): endpoint = Endpoint( env, @@ -287,7 +282,7 @@ async def exec_compute_query( env: NeonEnv, branch: str, query: str, - pgdir_name: Optional[str] = None, + pgdir_name: str | None = None, allow_multiple: bool = False, ): with endpoint_create_start( @@ -627,8 +622,12 @@ async def run_segment_init_failure(env: NeonEnv): # Test (injected) failure during WAL segment init. # https://github.com/neondatabase/neon/issues/6401 # https://github.com/neondatabase/neon/issues/6402 -def test_segment_init_failure(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("wal_receiver_protocol", ["vanilla", "interpreted"]) +def test_segment_init_failure(neon_env_builder: NeonEnvBuilder, wal_receiver_protocol: str): neon_env_builder.num_safekeepers = 1 + neon_env_builder.pageserver_config_override = ( + f"wal_receiver_protocol = '{wal_receiver_protocol}'" + ) env = neon_env_builder.init_start() asyncio.run(run_segment_init_failure(env)) @@ -705,7 +704,7 @@ async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint, test_output_dir: Pat # invalid, to make them unavailable to the endpoint. We use # ports 10, 11 and 12 to simulate unavailable safekeepers. config = toml.load(test_output_dir / "repo" / "config") - for i, (_sk, active) in enumerate(zip(env.safekeepers, active_sk)): + for i, (_sk, active) in enumerate(zip(env.safekeepers, active_sk, strict=False)): if active: config["safekeepers"][i]["pg_port"] = env.safekeepers[i].port.pg else: diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index aeecd27b1f..284ae56be2 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit aeecd27b1f0775b606409d1cbb9c8aa9853a82af +Subproject commit 284ae56be2397fd3eaf20777fa220b2d0ad968f5 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 544620db4c..aed79ee87b 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 544620db4ca6945be4f1f686a7fbd2cdfb0bf96f +Subproject commit aed79ee87b94779cc52ec13e3b74eba6ada93f05 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 3cc152ae2d..f5cfc6fa89 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 3cc152ae2d17b19679c7102486bdb94677705c02 +Subproject commit f5cfc6fa898544050e821ac688adafece1ac3cff diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index e5d795a1a0..3c15b6565f 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit e5d795a1a0c25da907176d37c905badab70e00c0 +Subproject commit 3c15b6565f6c8d36d169ed9ea7412cf90cfb2a8f diff --git a/vendor/revisions.json b/vendor/revisions.json index a13ef29e45..4dae88e73d 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.2", - "e5d795a1a0c25da907176d37c905badab70e00c0" + "3c15b6565f6c8d36d169ed9ea7412cf90cfb2a8f" ], "v16": [ "16.6", - "3cc152ae2d17b19679c7102486bdb94677705c02" + "f5cfc6fa898544050e821ac688adafece1ac3cff" ], "v15": [ "15.10", - "544620db4ca6945be4f1f686a7fbd2cdfb0bf96f" + "aed79ee87b94779cc52ec13e3b74eba6ada93f05" ], "v14": [ "14.15", - "aeecd27b1f0775b606409d1cbb9c8aa9853a82af" + "284ae56be2397fd3eaf20777fa220b2d0ad968f5" ] } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 53d3a7364b..a73d9d6352 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -19,7 +19,8 @@ ahash = { version = "0.8" } anyhow = { version = "1", features = ["backtrace"] } axum = { version = "0.7", features = ["ws"] } axum-core = { version = "0.4", default-features = false, features = ["tracing"] } -base64 = { version = "0.21", features = ["alloc"] } +base64-594e8ee84c453af0 = { package = "base64", version = "0.13", features = ["alloc"] } +base64-647d43efb71741da = { package = "base64", version = "0.21", features = ["alloc"] } base64ct = { version = "1", default-features = false, features = ["std"] } bytes = { version = "1", features = ["serde"] } camino = { version = "1", default-features = false, features = ["serde1"] } @@ -52,6 +53,7 @@ lazy_static = { version = "1", default-features = false, features = ["spin_no_st libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } +nix = { version = "0.26" } nom = { version = "7" } num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] }