diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 1d1b50e458..b7e0be761a 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -33,9 +33,14 @@ config-variables: - REMOTE_STORAGE_AZURE_CONTAINER - REMOTE_STORAGE_AZURE_REGION - SLACK_CICD_CHANNEL_ID + - SLACK_COMPUTE_CHANNEL_ID - SLACK_ON_CALL_DEVPROD_STREAM - SLACK_ON_CALL_QA_STAGING_STREAM - SLACK_ON_CALL_STORAGE_STAGING_STREAM + - SLACK_ONCALL_COMPUTE_GROUP + - SLACK_ONCALL_PROXY_GROUP + - SLACK_ONCALL_STORAGE_GROUP + - SLACK_PROXY_CHANNEL_ID - SLACK_RUST_CHANNEL_ID - SLACK_STORAGE_CHANNEL_ID - SLACK_UPCOMING_RELEASE_CHANNEL_ID diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index c27311f24e..fca6a0cfa5 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -7,7 +7,7 @@ inputs: type: boolean required: false default: false - aws-oicd-role-arn: + aws-oidc-role-arn: description: 'OIDC role arn to interract with S3' required: true @@ -88,7 +88,7 @@ runs: if: ${{ !cancelled() }} with: aws-region: eu-central-1 - role-to-assume: ${{ inputs.aws-oicd-role-arn }} + role-to-assume: ${{ inputs.aws-oidc-role-arn }} role-duration-seconds: 3600 # 1 hour should be more than enough to upload report # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this diff --git a/.github/actions/allure-report-store/action.yml b/.github/actions/allure-report-store/action.yml index 687bfd49af..2844344739 100644 --- a/.github/actions/allure-report-store/action.yml +++ b/.github/actions/allure-report-store/action.yml @@ -8,7 +8,7 @@ inputs: unique-key: description: 'string to distinguish different results in the same run' required: true - aws-oicd-role-arn: + aws-oidc-role-arn: description: 'OIDC role arn to interract with S3' required: true @@ -39,7 +39,7 @@ runs: if: ${{ !cancelled() }} with: aws-region: eu-central-1 - role-to-assume: ${{ inputs.aws-oicd-role-arn }} + role-to-assume: ${{ inputs.aws-oidc-role-arn }} role-duration-seconds: 3600 # 1 hour should be more than enough to upload report - name: Upload test results diff --git a/.github/actions/download/action.yml b/.github/actions/download/action.yml index 14b2ef8eac..d3829618da 100644 --- a/.github/actions/download/action.yml +++ b/.github/actions/download/action.yml @@ -15,7 +15,7 @@ inputs: prefix: description: "S3 prefix. 
Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" required: false - aws-oicd-role-arn: + aws-oidc-role-arn: description: 'OIDC role arn to interract with S3' required: true @@ -25,7 +25,7 @@ runs: - uses: aws-actions/configure-aws-credentials@v4 with: aws-region: eu-central-1 - role-to-assume: ${{ inputs.aws-oicd-role-arn }} + role-to-assume: ${{ inputs.aws-oidc-role-arn }} role-duration-seconds: 3600 - name: Download artifact diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 1c65244ef4..6f2b48444a 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -53,7 +53,7 @@ inputs: description: 'benchmark durations JSON' required: false default: '{}' - aws-oicd-role-arn: + aws-oidc-role-arn: description: 'OIDC role arn to interract with S3' required: true @@ -66,7 +66,7 @@ runs: with: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact path: /tmp/neon - aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} + aws-oidc-role-arn: ${{ inputs.aws-oidc-role-arn }} - name: Download Neon binaries for the previous release if: inputs.build_type != 'remote' @@ -75,7 +75,7 @@ runs: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact path: /tmp/neon-previous prefix: latest - aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} + aws-oidc-role-arn: ${{ inputs.aws-oidc-role-arn }} - name: Download compatibility snapshot if: inputs.build_type != 'remote' @@ -87,7 +87,7 @@ runs: # The lack of compatibility snapshot (for example, for the new Postgres version) # shouldn't fail the whole job. Only relevant test should fail. skip-if-does-not-exist: true - aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} + aws-oidc-role-arn: ${{ inputs.aws-oidc-role-arn }} - name: Checkout if: inputs.needs_postgres_source == 'true' @@ -228,13 +228,13 @@ runs: # The lack of compatibility snapshot shouldn't fail the job # (for example if we didn't run the test for non build-and-test workflow) skip-if-does-not-exist: true - aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} + aws-oidc-role-arn: ${{ inputs.aws-oidc-role-arn }} - uses: aws-actions/configure-aws-credentials@v4 if: ${{ !cancelled() }} with: aws-region: eu-central-1 - role-to-assume: ${{ inputs.aws-oicd-role-arn }} + role-to-assume: ${{ inputs.aws-oidc-role-arn }} role-duration-seconds: 3600 # 1 hour should be more than enough to upload report - name: Upload test results @@ -243,4 +243,4 @@ runs: with: report-dir: /tmp/test_output/allure/results unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }}-${{ runner.arch }} - aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} + aws-oidc-role-arn: ${{ inputs.aws-oidc-role-arn }} diff --git a/.github/actions/save-coverage-data/action.yml b/.github/actions/save-coverage-data/action.yml index 1bbea5400f..a0e2ab8521 100644 --- a/.github/actions/save-coverage-data/action.yml +++ b/.github/actions/save-coverage-data/action.yml @@ -14,11 +14,11 @@ runs: name: coverage-data-artifact path: /tmp/coverage skip-if-does-not-exist: true # skip if there's no previous coverage to download - aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} + aws-oidc-role-arn: ${{ inputs.aws-oidc-role-arn }} - name: Upload coverage data uses: ./.github/actions/upload with: name: coverage-data-artifact path: /tmp/coverage - aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} + aws-oidc-role-arn: ${{ 
inputs.aws-oidc-role-arn }} diff --git a/.github/actions/upload/action.yml b/.github/actions/upload/action.yml index ac5579ccea..ebb2443476 100644 --- a/.github/actions/upload/action.yml +++ b/.github/actions/upload/action.yml @@ -14,7 +14,7 @@ inputs: prefix: description: "S3 prefix. Default is '${GITHUB_SHA}/${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" required: false - aws-oicd-role-arn: + aws-oidc-role-arn: description: "the OIDC role arn for aws auth" required: false default: "" @@ -61,7 +61,7 @@ runs: uses: aws-actions/configure-aws-credentials@v4 with: aws-region: eu-central-1 - role-to-assume: ${{ inputs.aws-oicd-role-arn }} + role-to-assume: ${{ inputs.aws-oidc-role-arn }} role-duration-seconds: 3600 - name: Upload artifact diff --git a/.github/scripts/lint-release-pr.sh b/.github/scripts/lint-release-pr.sh index 6dc5b99f0e..d3badf9562 100755 --- a/.github/scripts/lint-release-pr.sh +++ b/.github/scripts/lint-release-pr.sh @@ -41,7 +41,7 @@ echo "Merge base of ${MAIN_BRANCH} and ${RELEASE_BRANCH}: ${MERGE_BASE}" LAST_COMMIT=$(git rev-parse HEAD) MERGE_COMMIT_MESSAGE=$(git log -1 --format=%s "${LAST_COMMIT}") -EXPECTED_MESSAGE_REGEX="^$COMPONENT release [0-9]{4}-[0-9]{2}-[0-9]{2}$" +EXPECTED_MESSAGE_REGEX="^$COMPONENT release [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2} UTC$" if ! [[ "${MERGE_COMMIT_MESSAGE}" =~ ${EXPECTED_MESSAGE_REGEX} ]]; then report_error "Merge commit message does not match expected pattern: ' release YYYY-MM-DD' diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml index df107920c1..38d956bda7 100644 --- a/.github/workflows/_benchmarking_preparation.yml +++ b/.github/workflows/_benchmarking_preparation.yml @@ -81,7 +81,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # we create a table that has one row for each database that we want to restore with the status whether the restore is done - name: Create benchmark_restore_status table if it does not exist diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 4f7d6026f2..7cede309f3 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -323,7 +323,7 @@ jobs: with: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact path: /tmp/neon - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Check diesel schema if: inputs.build-type == 'release' && inputs.arch == 'x64' @@ -394,7 +394,7 @@ jobs: rerun_failed: ${{ inputs.test-run-count == 1 }} pg_version: ${{ matrix.pg_version }} sanitizers: ${{ inputs.sanitizers }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds. # Attempt to stop tests gracefully to generate test reports # until they are forcibly stopped by the stricter `timeout-minutes` limit. 
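Note on the lint-release-pr.sh change above: the expected merge-commit message now requires an "HH:MM UTC" suffix after the date, presumably to match timestamped titles produced by the new release tooling. A minimal standalone sketch of the tightened check, runnable outside CI; the component name and message below are illustrative, not taken from a real run:

    COMPONENT="Storage"
    MERGE_COMMIT_MESSAGE="Storage release 2025-03-14 06:00 UTC"  # hypothetical example message
    EXPECTED_MESSAGE_REGEX="^$COMPONENT release [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2} UTC$"
    if [[ "${MERGE_COMMIT_MESSAGE}" =~ ${EXPECTED_MESSAGE_REGEX} ]]; then
      echo "ok: matches '<Component> release YYYY-MM-DD HH:MM UTC'"
    else
      echo "error: merge commit message does not match the expected pattern" >&2
    fi

A date-only message such as "Storage release 2025-03-14" passed the previous pattern but fails this one.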
diff --git a/.github/workflows/_create-release-pr.yml b/.github/workflows/_create-release-pr.yml deleted file mode 100644 index f96ed7d69b..0000000000 --- a/.github/workflows/_create-release-pr.yml +++ /dev/null @@ -1,103 +0,0 @@ -name: Create Release PR - -on: - workflow_call: - inputs: - component-name: - description: 'Component name' - required: true - type: string - source-branch: - description: 'Source branch' - required: true - type: string - secrets: - ci-access-token: - description: 'CI access token' - required: true - -defaults: - run: - shell: bash -euo pipefail {0} - -permissions: - contents: read - -jobs: - create-release-branch: - runs-on: ubuntu-22.04 - - permissions: - contents: write # for `git push` - - steps: - - name: Harden the runner (Audit all outbound calls) - uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 - with: - egress-policy: audit - - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - ref: ${{ inputs.source-branch }} - fetch-depth: 0 - - - name: Set variables - id: vars - env: - COMPONENT_NAME: ${{ inputs.component-name }} - RELEASE_BRANCH: >- - ${{ - false - || inputs.component-name == 'Storage' && 'release' - || inputs.component-name == 'Proxy' && 'release-proxy' - || inputs.component-name == 'Compute' && 'release-compute' - }} - run: | - now_date=$(date -u +'%Y-%m-%d') - now_time=$(date -u +'%H-%M-%Z') - { - echo "title=${COMPONENT_NAME} release ${now_date}" - echo "rc-branch=rc/${RELEASE_BRANCH}/${now_date}_${now_time}" - echo "release-branch=${RELEASE_BRANCH}" - } | tee -a ${GITHUB_OUTPUT} - - - name: Configure git - run: | - git config user.name "github-actions[bot]" - git config user.email "41898282+github-actions[bot]@users.noreply.github.com" - - - name: Create RC branch - env: - RELEASE_BRANCH: ${{ steps.vars.outputs.release-branch }} - RC_BRANCH: ${{ steps.vars.outputs.rc-branch }} - TITLE: ${{ steps.vars.outputs.title }} - run: | - git switch -c "${RC_BRANCH}" - - # Manually create a merge commit on the current branch, keeping the - # tree and setting the parents to the current HEAD and the HEAD of the - # release branch. This commit is what we'll fast-forward the release - # branch to when merging the release branch. 
- # For details on why, look at - # https://docs.neon.build/overview/repositories/neon.html#background-on-commit-history-of-release-prs - current_tree=$(git rev-parse 'HEAD^{tree}') - release_head=$(git rev-parse "origin/${RELEASE_BRANCH}") - current_head=$(git rev-parse HEAD) - merge_commit=$(git commit-tree -p "${current_head}" -p "${release_head}" -m "${TITLE}" "${current_tree}") - - # Fast-forward the current branch to the newly created merge_commit - git merge --ff-only ${merge_commit} - - git push origin "${RC_BRANCH}" - - - name: Create a PR into ${{ steps.vars.outputs.release-branch }} - env: - GH_TOKEN: ${{ secrets.ci-access-token }} - RC_BRANCH: ${{ steps.vars.outputs.rc-branch }} - RELEASE_BRANCH: ${{ steps.vars.outputs.release-branch }} - TITLE: ${{ steps.vars.outputs.title }} - run: | - gh pr create --title "${TITLE}" \ - --body "" \ - --head "${RC_BRANCH}" \ - --base "${RELEASE_BRANCH}" diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 8af23820f4..79371ec704 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -53,6 +53,77 @@ concurrency: cancel-in-progress: true jobs: + cleanup: + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: ghcr.io/neondatabase/build-tools:pinned-bookworm + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + options: --init + env: + ORG_ID: org-solitary-dew-09443886 + LIMIT: 100 + SEARCH: "GITHUB_RUN_ID=" + BASE_URL: https://console-stage.neon.build/api/v2 + DRY_RUN: "false" # Set to "true" to just test out the workflow + + steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Cleanup inactive Neon projects left over from prior runs + env: + API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} + run: | + set -euo pipefail + + NOW=$(date -u +%s) + DAYS_AGO=$((NOW - 5 * 86400)) + + REQUEST_URL="$BASE_URL/projects?limit=$LIMIT&search=$(printf '%s' "$SEARCH" | jq -sRr @uri)&org_id=$ORG_ID" + + echo "Requesting project list from:" + echo "$REQUEST_URL" + + response=$(curl -s -X GET "$REQUEST_URL" \ + --header "Accept: application/json" \ + --header "Content-Type: application/json" \ + --header "Authorization: Bearer ${API_KEY}" ) + + echo "Response:" + echo "$response" | jq . + + projects_to_delete=$(echo "$response" | jq --argjson cutoff "$DAYS_AGO" ' + .projects[] + | select(.compute_last_active_at != null) + | select((.compute_last_active_at | fromdateiso8601) < $cutoff) + | {id, name, compute_last_active_at} + ') + + if [ -z "$projects_to_delete" ]; then + echo "No projects eligible for deletion." + exit 0 + fi + + echo "Projects that will be deleted:" + echo "$projects_to_delete" | jq -r '.id' + + if [ "$DRY_RUN" = "false" ]; then + echo "$projects_to_delete" | jq -r '.id' | while read -r project_id; do + echo "Deleting project: $project_id" + curl -s -X DELETE "$BASE_URL/projects/$project_id" \ + --header "Accept: application/json" \ + --header "Content-Type: application/json" \ + --header "Authorization: Bearer ${API_KEY}" + done + else + echo "Dry run enabled — no projects were deleted." 
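For reference, the core of the cleanup step above is the jq cutoff filter over the project list. A self-contained sketch of the same selection against canned input; the project ids and timestamps are made up, and the 5-day threshold mirrors the workflow:

    NOW=$(date -u +%s)
    CUTOFF=$((NOW - 5 * 86400))
    response='{"projects":[
      {"id":"proj-old","name":"GITHUB_RUN_ID=111","compute_last_active_at":"2023-01-01T00:00:00Z"},
      {"id":"proj-idle","name":"GITHUB_RUN_ID=222","compute_last_active_at":null}
    ]}'
    # Only proj-old is selected: proj-idle has no recorded compute activity at all.
    echo "$response" | jq --argjson cutoff "$CUTOFF" '
      .projects[]
      | select(.compute_last_active_at != null)
      | select((.compute_last_active_at | fromdateiso8601) < $cutoff)
      | {id, name, compute_last_active_at}'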
+ fi bench: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} permissions: @@ -114,7 +185,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project id: create-neon-project @@ -132,7 +203,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} pg_version: ${{ env.PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # Set --sparse-ordering option of pytest-order plugin # to ensure tests are running in order of appears in the file. # It's important for test_perf_pgbench.py::test_pgbench_remote_* tests @@ -165,7 +236,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -222,8 +293,8 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + - name: Verify that cumulative statistics are preserved uses: ./.github/actions/run-python-test-set with: @@ -233,7 +304,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 3600 pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -282,7 +353,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Run Logical Replication benchmarks uses: ./.github/actions/run-python-test-set @@ -293,7 +364,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 5400 pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -310,7 +381,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 5400 pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -322,7 +393,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} @@ -505,7 +576,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ 
vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project if: contains(fromJSON('["neonvm-captest-new", "neonvm-captest-new-many-tables", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform) @@ -557,7 +628,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_perf_many_relations pg_version: ${{ env.PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -573,7 +644,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init pg_version: ${{ env.PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -588,7 +659,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update pg_version: ${{ env.PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -603,7 +674,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only pg_version: ${{ env.PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -621,7 +692,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -694,7 +765,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Set up Connection String id: set-up-connstr @@ -726,7 +797,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing pg_version: ${{ env.PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -741,7 +812,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 pg_version: ${{ env.PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -752,7 +823,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: 
Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -828,7 +899,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Set up Connection String id: set-up-connstr @@ -871,7 +942,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 43200 -k test_clickbench pg_version: ${{ env.PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -885,7 +956,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -954,7 +1025,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Get Connstring Secret Name run: | @@ -1003,7 +1074,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_tpch pg_version: ${{ env.PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -1015,7 +1086,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -1078,7 +1149,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Set up Connection String id: set-up-connstr @@ -1121,7 +1192,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_user_examples pg_version: ${{ env.PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -1132,7 +1203,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} diff --git a/.github/workflows/build-macos.yml b/.github/workflows/build-macos.yml index ecd135cc3d..0f7fa3e813 100644 --- a/.github/workflows/build-macos.yml +++ b/.github/workflows/build-macos.yml @@ -34,11 +34,10 @@ permissions: jobs: build-pgxn: if: | - (inputs.pg_versions != '[]' || inputs.rebuild_everything) && ( - contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || - contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || - github.ref_name == 'main' - ) + 
inputs.pg_versions != '[]' || inputs.rebuild_everything || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || + github.ref_name == 'main' timeout-minutes: 30 runs-on: macos-15 strategy: @@ -100,13 +99,21 @@ jobs: run: | make postgres-headers-${{ matrix.postgres-version }} -j$(sysctl -n hw.ncpu) + - name: Upload "pg_install/${{ matrix.postgres-version }}" artifact + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + with: + name: pg_install--${{ matrix.postgres-version }} + path: pg_install/${{ matrix.postgres-version }} + # The artifact is supposed to be used by the next job in the same workflow, + # so there’s no need to store it for too long. + retention-days: 1 + build-walproposer-lib: if: | - (inputs.pg_versions != '[]' || inputs.rebuild_everything) && ( - contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || - contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || - github.ref_name == 'main' - ) + inputs.pg_versions != '[]' || inputs.rebuild_everything || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || + github.ref_name == 'main' timeout-minutes: 30 runs-on: macos-15 needs: [build-pgxn] @@ -127,12 +134,11 @@ jobs: id: pg_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) | tee -a "${GITHUB_OUTPUT}" - - name: Cache postgres v17 build - id: cache_pg - uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 + - name: Download "pg_install/v17" artifact + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: + name: pg_install--v17 path: pg_install/v17 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache walproposer-lib id: cache_walproposer_lib @@ -163,13 +169,21 @@ jobs: run: make walproposer-lib -j$(sysctl -n hw.ncpu) + - name: Upload "pg_install/build/walproposer-lib" artifact + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + with: + name: pg_install--build--walproposer-lib + path: pg_install/build/walproposer-lib + # The artifact is supposed to be used by the next job in the same workflow, + # so there’s no need to store it for too long. 
+ retention-days: 1 + cargo-build: if: | - (inputs.pg_versions != '[]' || inputs.rebuild_rust_code || inputs.rebuild_everything) && ( - contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || - contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || - github.ref_name == 'main' - ) + inputs.pg_versions != '[]' || inputs.rebuild_rust_code || inputs.rebuild_everything || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || + github.ref_name == 'main' timeout-minutes: 30 runs-on: macos-15 needs: [build-pgxn, build-walproposer-lib] @@ -188,45 +202,43 @@ jobs: with: submodules: true - - name: Set pg v14 for caching - id: pg_rev_v14 - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) | tee -a "${GITHUB_OUTPUT}" - - name: Set pg v15 for caching - id: pg_rev_v15 - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) | tee -a "${GITHUB_OUTPUT}" - - name: Set pg v16 for caching - id: pg_rev_v16 - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) | tee -a "${GITHUB_OUTPUT}" - - name: Set pg v17 for caching - id: pg_rev_v17 - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) | tee -a "${GITHUB_OUTPUT}" - - - name: Cache postgres v14 build - id: cache_pg - uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 + - name: Download "pg_install/v14" artifact + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: + name: pg_install--v14 path: pg_install/v14 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v14-${{ steps.pg_rev_v14.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - name: Cache postgres v15 build - id: cache_pg_v15 - uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 - with: - path: pg_install/v15 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v15-${{ steps.pg_rev_v15.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - name: Cache postgres v16 build - id: cache_pg_v16 - uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 - with: - path: pg_install/v16 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v16-${{ steps.pg_rev_v16.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - name: Cache postgres v17 build - id: cache_pg_v17 - uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 - with: - path: pg_install/v17 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v17-${{ steps.pg_rev_v17.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - name: Cache cargo deps (only for v17) + - name: Download "pg_install/v15" artifact + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 + with: + name: pg_install--v15 + path: pg_install/v15 + + - name: Download "pg_install/v16" artifact + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 + with: + name: pg_install--v16 + path: pg_install/v16 + + - name: Download "pg_install/v17" artifact + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 + with: + name: pg_install--v17 + path: pg_install/v17 + + - name: Download "pg_install/build/walproposer-lib" artifact + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 + with: + name: pg_install--build--walproposer-lib + path: pg_install/build/walproposer-lib + + # `actions/download-artifact` doesn't preserve permissions: + # 
https://github.com/actions/download-artifact?tab=readme-ov-file#permission-loss + - name: Make pg_install/v*/bin/* executable + run: | + chmod +x pg_install/v*/bin/* + + - name: Cache cargo deps uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 with: path: | @@ -236,13 +248,6 @@ jobs: target key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust - - name: Cache walproposer-lib - id: cache_walproposer_lib - uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 - with: - path: pg_install/build/walproposer-lib - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev_v17.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - name: Install build dependencies run: | brew install flex bison openssl protobuf icu4c @@ -252,8 +257,8 @@ jobs: echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV - - name: Run cargo build (only for v17) + - name: Run cargo build run: cargo build --all --release -j$(sysctl -n hw.ncpu) - - name: Check that no warnings are produced (only for v17) + - name: Check that no warnings are produced run: ./run_clippy.sh diff --git a/.github/workflows/build_and_run_selected_test.yml b/.github/workflows/build_and_run_selected_test.yml index f22fe310ab..7f1eb991c4 100644 --- a/.github/workflows/build_and_run_selected_test.yml +++ b/.github/workflows/build_and_run_selected_test.yml @@ -93,7 +93,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_DEV }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 3a334490f8..f27897e774 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -69,7 +69,7 @@ jobs: submodules: true - name: Check for file changes - uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2 + uses: step-security/paths-filter@v3 id: files-changed with: token: ${{ secrets.GITHUB_TOKEN }} @@ -317,7 +317,7 @@ jobs: extra_params: --splits 5 --group ${{ matrix.pytest_split_group }} benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }} pg_version: v16 - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -384,7 +384,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} @@ -451,14 +451,14 @@ jobs: with: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact path: /tmp/neon - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Get coverage artifact uses: ./.github/actions/download with: name: coverage-data-artifact path: /tmp/coverage - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Merge coverage data run: scripts/coverage 
"--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge @@ -824,7 +824,7 @@ jobs: - pg: v17 debian: bookworm env: - VM_BUILDER_VERSION: v0.42.2 + VM_BUILDER_VERSION: v0.46.0 steps: - name: Harden the runner (Audit all outbound calls) @@ -1434,10 +1434,10 @@ jobs: ;; esac - notify-storage-release-deploy-failure: - needs: [ deploy ] + notify-release-deploy-failure: + needs: [ meta, deploy ] # We want this to run even if (transitive) dependencies are skipped, because deploy should really be successful on release branch workflow runs. - if: github.ref_name == 'release' && needs.deploy.result != 'success' && always() + if: contains(fromJSON('["storage-release", "compute-release", "proxy-release"]'), needs.meta.outputs.run-kind) && needs.deploy.result != 'success' && always() runs-on: ubuntu-22.04 steps: - name: Harden the runner (Audit all outbound calls) @@ -1445,15 +1445,40 @@ jobs: with: egress-policy: audit - - name: Post release-deploy failure to team-storage slack channel + - name: Post release-deploy failure to team slack channel uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0 + env: + TEAM_ONCALL: >- + ${{ + fromJSON(format('{ + "storage-release": "", + "compute-release": "", + "proxy-release": "" + }', + vars.SLACK_ONCALL_STORAGE_GROUP, + vars.SLACK_ONCALL_COMPUTE_GROUP, + vars.SLACK_ONCALL_PROXY_GROUP + ))[needs.meta.outputs.run-kind] + }} + CHANNEL: >- + ${{ + fromJSON(format('{ + "storage-release": "{0}", + "compute-release": "{1}", + "proxy-release": "{2}" + }', + vars.SLACK_STORAGE_CHANNEL_ID, + vars.SLACK_COMPUTE_CHANNEL_ID, + vars.SLACK_PROXY_CHANNEL_ID + ))[needs.meta.outputs.run-kind] + }} with: method: chat.postMessage token: ${{ secrets.SLACK_BOT_TOKEN }} payload: | - channel: ${{ vars.SLACK_STORAGE_CHANNEL_ID }} + channel: ${{ env.CHANNEL }} text: | - 🔴 : deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>. + 🔴 ${{ env.TEAM_ONCALL }}: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>. # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory promote-compatibility-data: diff --git a/.github/workflows/build_and_test_with_sanitizers.yml b/.github/workflows/build_and_test_with_sanitizers.yml index c31b05fea2..c54448dedc 100644 --- a/.github/workflows/build_and_test_with_sanitizers.yml +++ b/.github/workflows/build_and_test_with_sanitizers.yml @@ -117,7 +117,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} diff --git a/.github/workflows/cloud-extensions.yml b/.github/workflows/cloud-extensions.yml index 7d60469f92..4114f0f9b4 100644 --- a/.github/workflows/cloud-extensions.yml +++ b/.github/workflows/cloud-extensions.yml @@ -68,7 +68,7 @@ jobs: id: create-neon-project uses: ./.github/actions/neon-project-create with: - region_id: ${{ inputs.region_id }} + region_id: ${{ inputs.region_id || 'aws-us-east-2' }} postgres_version: ${{ matrix.pg-version }} project_settings: ${{ steps.project-settings.outputs.settings }} # We need these settings to get the expected output results. 
diff --git a/.github/workflows/cloud-regress.yml b/.github/workflows/cloud-regress.yml index 7ae8d46000..d96c595294 100644 --- a/.github/workflows/cloud-regress.yml +++ b/.github/workflows/cloud-regress.yml @@ -89,7 +89,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create a new branch id: create-branch @@ -105,7 +105,7 @@ jobs: test_selection: cloud_regress pg_version: ${{matrix.pg-version}} extra_params: -m remote_cluster - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{steps.create-branch.outputs.dsn}} @@ -122,7 +122,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} diff --git a/.github/workflows/ingest_benchmark.yml b/.github/workflows/ingest_benchmark.yml index 35e4838a86..a8d2d69807 100644 --- a/.github/workflows/ingest_benchmark.yml +++ b/.github/workflows/ingest_benchmark.yml @@ -32,7 +32,7 @@ jobs: fail-fast: false # allow other variants to continue even if one fails matrix: include: - - target_project: new_empty_project_stripe_size_2048 + - target_project: new_empty_project_stripe_size_2048 stripe_size: 2048 # 16 MiB postgres_version: 16 disable_sharding: false @@ -98,7 +98,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project if: ${{ startsWith(matrix.target_project, 'new_empty_project') }} @@ -110,10 +110,10 @@ jobs: compute_units: '[7, 7]' # we want to test large compute here to avoid compute-side bottleneck api_key: ${{ secrets.NEON_STAGING_API_KEY }} shard_split_project: ${{ matrix.stripe_size != null && 'true' || 'false' }} - admin_api_key: ${{ secrets.NEON_STAGING_ADMIN_API_KEY }} + admin_api_key: ${{ secrets.NEON_STAGING_ADMIN_API_KEY }} shard_count: 8 stripe_size: ${{ matrix.stripe_size }} - disable_sharding: ${{ matrix.disable_sharding }} + disable_sharding: ${{ matrix.disable_sharding }} - name: Initialize Neon project if: ${{ startsWith(matrix.target_project, 'new_empty_project') }} @@ -171,7 +171,7 @@ jobs: extra_params: -s -m remote_cluster --timeout 86400 -k test_ingest_performance_using_pgcopydb pg_version: v${{ matrix.postgres_version }} save_perf_report: true - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_INGEST_SOURCE_CONNSTR: ${{ secrets.BENCHMARK_INGEST_SOURCE_CONNSTR }} TARGET_PROJECT_TYPE: ${{ matrix.target_project }} diff --git a/.github/workflows/large_oltp_benchmark.yml b/.github/workflows/large_oltp_benchmark.yml index 2b63cbd044..42dcc8e918 100644 --- a/.github/workflows/large_oltp_benchmark.yml +++ b/.github/workflows/large_oltp_benchmark.yml @@ -33,9 +33,9 @@ jobs: fail-fast: false # allow other variants to continue even if one fails matrix: include: - - target: new_branch + - target: new_branch custom_scripts: insert_webhooks.sql@200 select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3 IUD_one_transaction.sql@100 - - target: reuse_branch + - target: 
reuse_branch custom_scripts: insert_webhooks.sql@200 select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3 IUD_one_transaction.sql@100 max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results permissions: @@ -43,7 +43,7 @@ jobs: statuses: write id-token: write # aws-actions/configure-aws-credentials env: - TEST_PG_BENCH_DURATIONS_MATRIX: "1h" # todo update to > 1 h + TEST_PG_BENCH_DURATIONS_MATRIX: "1h" # todo update to > 1 h TEST_PGBENCH_CUSTOM_SCRIPTS: ${{ matrix.custom_scripts }} POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install PG_VERSION: 16 # pre-determined by pre-determined project @@ -85,7 +85,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Branch for large tenant if: ${{ matrix.target == 'new_branch' }} @@ -129,7 +129,7 @@ jobs: ${PSQL} "${BENCHMARK_CONNSTR}" -c "SET statement_timeout = 0; DELETE FROM webhook.incoming_webhooks WHERE created_at > '2025-02-27 23:59:59+00';" echo "$(date '+%Y-%m-%d %H:%M:%S') - Finished deleting rows in table webhook.incoming_webhooks from prior runs" - - name: Benchmark pgbench with custom-scripts + - name: Benchmark pgbench with custom-scripts uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} @@ -138,7 +138,7 @@ jobs: save_perf_report: true extra_params: -m remote_cluster --timeout 7200 -k test_perf_oltp_large_tenant_pgbench pg_version: ${{ env.PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -153,7 +153,7 @@ jobs: save_perf_report: true extra_params: -m remote_cluster --timeout 172800 -k test_perf_oltp_large_tenant_maintenance pg_version: ${{ env.PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr_without_pooler }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -179,8 +179,8 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 11aa4d2c94..9c504eb5bf 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -53,7 +53,7 @@ jobs: submodules: true - name: Check for Postgres changes - uses: dorny/paths-filter@1441771bbfdd59dcd748680ee64ebd8faab1a242 #v3 + uses: step-security/paths-filter@v3 id: files_changed with: token: ${{ github.token }} @@ -69,10 +69,6 @@ jobs: check-macos-build: needs: [ check-permissions, files-changed ] - if: | - contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || - contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || - github.ref_name == 'main' uses: ./.github/workflows/build-macos.yml with: pg_versions: ${{ needs.files-changed.outputs.postgres_changes }} diff --git 
a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index 0fe002bc07..532da435c2 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -147,7 +147,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml index fa4fd73b12..6efe0b4c8c 100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -103,7 +103,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project id: create-neon-project @@ -122,7 +122,7 @@ jobs: run_in_parallel: false extra_params: -m remote_cluster pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} @@ -139,7 +139,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} @@ -178,7 +178,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project id: create-neon-project @@ -195,7 +195,7 @@ jobs: run_in_parallel: false extra_params: -m remote_cluster pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} @@ -212,7 +212,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} diff --git a/.github/workflows/random-ops-test.yml b/.github/workflows/random-ops-test.yml index 7c19537744..6098126e90 100644 --- a/.github/workflows/random-ops-test.yml +++ b/.github/workflows/random-ops-test.yml @@ -66,7 +66,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Run tests uses: ./.github/actions/run-python-test-set @@ -76,7 +76,7 @@ jobs: run_in_parallel: false extra_params: -m remote_cluster pg_version: ${{ matrix.pg-version }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} RANDOM_SEED: ${{ inputs.random_seed }} @@ -88,6 +88,6 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: 
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} diff --git a/.github/workflows/release-compute.yml b/.github/workflows/release-compute.yml new file mode 100644 index 0000000000..f123dd2f44 --- /dev/null +++ b/.github/workflows/release-compute.yml @@ -0,0 +1,12 @@ +name: Create compute release PR + +on: + schedule: + - cron: '0 7 * * FRI' + +jobs: + create-release-pr: + uses: ./.github/workflows/release.yml + with: + component: compute + secrets: inherit diff --git a/.github/workflows/release-proxy.yml b/.github/workflows/release-proxy.yml new file mode 100644 index 0000000000..d9055984d2 --- /dev/null +++ b/.github/workflows/release-proxy.yml @@ -0,0 +1,12 @@ +name: Create proxy release PR + +on: + schedule: + - cron: '0 6 * * TUE' + +jobs: + create-release-pr: + uses: ./.github/workflows/release.yml + with: + component: proxy + secrets: inherit diff --git a/.github/workflows/release-storage.yml b/.github/workflows/release-storage.yml new file mode 100644 index 0000000000..91f02fddda --- /dev/null +++ b/.github/workflows/release-storage.yml @@ -0,0 +1,12 @@ +name: Create storage release PR + +on: + schedule: + - cron: '0 6 * * FRI' + +jobs: + create-release-pr: + uses: ./.github/workflows/release.yml + with: + component: storage + secrets: inherit diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 4068eafb95..0f97cf7c87 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,25 +1,34 @@ -name: Create Release Branch +name: Create release PR on: - schedule: - # It should be kept in sync with if-condition in jobs - - cron: '0 6 * * TUE' # Proxy release - - cron: '0 6 * * FRI' # Storage release - - cron: '0 7 * * FRI' # Compute release workflow_dispatch: inputs: - create-storage-release-branch: - type: boolean - description: 'Create Storage release PR' + component: + description: "Component to release" + required: true + type: choice + options: + - compute + - proxy + - storage + cherry-pick: + description: "Commits to cherry-pick (space separated, makes this a hotfix based on previous release)" required: false - create-proxy-release-branch: - type: boolean - description: 'Create Proxy release PR' - required: false - create-compute-release-branch: - type: boolean - description: 'Create Compute release PR' + type: string + default: '' + + workflow_call: + inputs: + component: + description: "Component to release" + required: true + type: string + cherry-pick: + description: "Commits to cherry-pick (space separated, makes this a hotfix based on previous release)" required: false + type: string + default: '' + # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. 
permissions: {} @@ -29,41 +38,31 @@ defaults: shell: bash -euo pipefail {0} jobs: - create-storage-release-branch: - if: ${{ github.event.schedule == '0 6 * * FRI' || inputs.create-storage-release-branch }} + create-release-pr: + runs-on: ubuntu-22.04 permissions: contents: write - uses: ./.github/workflows/_create-release-pr.yml - with: - component-name: 'Storage' - source-branch: ${{ github.ref_name }} - secrets: - ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} + steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit - create-proxy-release-branch: - if: ${{ github.event.schedule == '0 6 * * TUE' || inputs.create-proxy-release-branch }} + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 - permissions: - contents: write + - name: Configure git + run: | + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" - uses: ./.github/workflows/_create-release-pr.yml - with: - component-name: 'Proxy' - source-branch: ${{ github.ref_name }} - secrets: - ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} - - create-compute-release-branch: - if: ${{ github.event.schedule == '0 7 * * FRI' || inputs.create-compute-release-branch }} - - permissions: - contents: write - - uses: ./.github/workflows/_create-release-pr.yml - with: - component-name: 'Compute' - source-branch: ${{ github.ref_name }} - secrets: - ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} + - name: Create release PR + uses: neondatabase/dev-actions/release-pr@290dec821d86fa8a93f019e8c69720f5865b5677 + with: + component: ${{ inputs.component }} + cherry-pick: ${{ inputs.cherry-pick }} + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} diff --git a/Cargo.lock b/Cargo.lock index 4c464c62b8..fe4cc35029 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1284,6 +1284,7 @@ name = "compute_tools" version = "0.1.0" dependencies = [ "anyhow", + "async-compression", "aws-config", "aws-sdk-kms", "aws-sdk-s3", @@ -1420,6 +1421,7 @@ dependencies = [ "clap", "comfy-table", "compute_api", + "endpoint_storage", "futures", "http-utils", "humantime", diff --git a/Cargo.toml b/Cargo.toml index 1c203af9e0..8d4cc4a75a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -243,6 +243,7 @@ azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rus ## Local libraries compute_api = { version = "0.1", path = "./libs/compute_api/" } consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } +endpoint_storage = { version = "0.0.1", path = "./endpoint_storage/" } http-utils = { version = "0.1", path = "./libs/http-utils/" } metrics = { version = "0.1", path = "./libs/metrics/" } pageserver = { path = "./pageserver" } diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 97c4756d27..a722fc0c51 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1086,6 +1086,23 @@ RUN cargo install --locked --version 0.12.9 cargo-pgrx && \ USER root +######################################################################################### +# +# Layer "rust extensions pgrx14" +# +# Version 14 is now required by a few +# This layer should be used as a base for new pgrx extensions, +# and eventually get merged with `rust-extensions-build` +# +######################################################################################### +FROM 
pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx14 +ARG PG_VERSION + +RUN cargo install --locked --version 0.14.1 cargo-pgrx && \ + /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' + +USER root + ######################################################################################### # # Layers "pg-onnx-build" and "pgrag-build" @@ -1101,11 +1118,11 @@ RUN wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar. mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \ echo "#nothing to test here" > neon-test.sh -RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz && \ - echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.1.1.tar.gz -O pgrag.tar.gz && \ + echo "087b2ecd11ba307dc968042ef2e9e43dc04d9ba60e8306e882c407bbe1350a50 pgrag.tar.gz" | sha256sum --check && \ mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . -FROM rust-extensions-build-pgrx12 AS pgrag-build +FROM rust-extensions-build-pgrx14 AS pgrag-build COPY --from=pgrag-src /ext-src/ /ext-src/ # Install build-time dependencies @@ -1125,19 +1142,19 @@ RUN . venv/bin/activate && \ WORKDIR /ext-src/pgrag-src RUN cd exts/rag && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control RUN cd exts/rag_bge_small_en_v15 && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \ REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \ cargo pgrx install --release --features remote_onnx && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control RUN cd exts/rag_jina_reranker_v1_tiny_en && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \ REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \ cargo pgrx install --release --features remote_onnx && \ @@ -1306,8 +1323,8 @@ ARG PG_VERSION # Do not update without approve from proxy team # Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs WORKDIR /ext-src -RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.3.0.tar.gz -O pg_session_jwt.tar.gz && \ - echo "19be2dc0b3834d643706ed430af998bb4c2cdf24b3c45e7b102bb3a550e8660c pg_session_jwt.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.3.1.tar.gz -O pg_session_jwt.tar.gz && \ + echo "62fec9e472cb805c53ba24a0765afdb8ea2720cfc03ae7813e61687b36d1b0ad pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . 
&& \ sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ sed -i 's/version = "0.12.6"/version = "0.12.9"/g' pgrx-tests/Cargo.toml && \ @@ -1320,6 +1337,40 @@ COPY --from=pg_session_jwt-src /ext-src/ /ext-src/ WORKDIR /ext-src/pg_session_jwt-src RUN cargo pgrx install --release +######################################################################################### +# +# Layer "pg-anon-pg-build" +# compile anon extension +# +######################################################################################### +FROM pg-build AS pg_anon-src +ARG PG_VERSION +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +WORKDIR /ext-src +COPY compute/patches/anon_v2.patch . + +# This is an experimental extension, never got to real production. +# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found. +ENV PATH="/usr/local/pgsql/bin/:$PATH" +RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/2.1.0/postgresql_anonymizer-latest.tar.gz -O pg_anon.tar.gz && \ + echo "48e7f5ae2f1ca516df3da86c5c739d48dd780a4e885705704ccaad0faa89d6c0 pg_anon.tar.gz" | sha256sum --check && \ + mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \ + find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt && \ + sed -i 's/pgrx = "0.14.1"/pgrx = { version = "=0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + patch -p1 < /ext-src/anon_v2.patch + +FROM rust-extensions-build-pgrx14 AS pg-anon-pg-build +ARG PG_VERSION +COPY --from=pg_anon-src /ext-src/ /ext-src/ +WORKDIR /ext-src +RUN cd pg_anon-src && \ + make -j $(getconf _NPROCESSORS_ONLN) extension PG_CONFIG=/usr/local/pgsql/bin/pg_config PGVER=pg$(echo "$PG_VERSION" | sed 's/^v//') && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config PGVER=pg$(echo "$PG_VERSION" | sed 's/^v//') && \ + chmod -R a+r ../pg_anon-src && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control; + +######################################################################################## + ######################################################################################### # # Layer "wal2json-build" @@ -1616,6 +1667,7 @@ COPY --from=pg_uuidv7-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_roaringbitmap-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_semver-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=wal2json-build /usr/local/pgsql /usr/local/pgsql +COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet index 449e1199d0..e64d907fe4 100644 --- a/compute/etc/neon_collector.jsonnet +++ b/compute/etc/neon_collector.jsonnet @@ -23,6 +23,8 @@ import 'sql_exporter/getpage_prefetch_requests_total.libsonnet', import 'sql_exporter/getpage_prefetches_buffered.libsonnet', import 'sql_exporter/getpage_sync_requests_total.libsonnet', + import 'sql_exporter/compute_getpage_stuck_requests_total.libsonnet', + import 'sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet', import 'sql_exporter/getpage_wait_seconds_bucket.libsonnet', import 'sql_exporter/getpage_wait_seconds_count.libsonnet', import 
'sql_exporter/getpage_wait_seconds_sum.libsonnet', diff --git a/compute/etc/sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet b/compute/etc/sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet new file mode 100644 index 0000000000..bc1100c832 --- /dev/null +++ b/compute/etc/sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'compute_getpage_max_inflight_stuck_time_ms', + type: 'gauge', + help: 'Max wait time for stuck requests among all backends. Includes only active stuck requests, terminated or disconnected ones are not accounted for', + values: [ + 'compute_getpage_max_inflight_stuck_time_ms', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/compute_getpage_stuck_requests_total.libsonnet b/compute/etc/sql_exporter/compute_getpage_stuck_requests_total.libsonnet new file mode 100644 index 0000000000..5f72f43254 --- /dev/null +++ b/compute/etc/sql_exporter/compute_getpage_stuck_requests_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'compute_getpage_stuck_requests_total', + type: 'counter', + help: 'Total number of Getpage requests left without an answer for more than pageserver_response_log_timeout but less than pageserver_response_disconnect_timeout', + values: [ + 'compute_getpage_stuck_requests_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/neon_perf_counters.sql b/compute/etc/sql_exporter/neon_perf_counters.sql index 4a36f3bf2f..39a9d03412 100644 --- a/compute/etc/sql_exporter/neon_perf_counters.sql +++ b/compute/etc/sql_exporter/neon_perf_counters.sql @@ -9,6 +9,8 @@ SELECT d.* FROM pg_catalog.jsonb_to_record((SELECT jb FROM c)) AS d( getpage_wait_seconds_sum numeric, getpage_prefetch_requests_total numeric, getpage_sync_requests_total numeric, + compute_getpage_stuck_requests_total numeric, + compute_getpage_max_inflight_stuck_time_ms numeric, getpage_prefetch_misses_total numeric, getpage_prefetch_discards_total numeric, getpage_prefetches_buffered numeric, diff --git a/compute/patches/anon_v2.patch b/compute/patches/anon_v2.patch new file mode 100644 index 0000000000..e833a6dfd3 --- /dev/null +++ b/compute/patches/anon_v2.patch @@ -0,0 +1,129 @@ +diff --git a/sql/anon.sql b/sql/anon.sql +index 0cdc769..f6cc950 100644 +--- a/sql/anon.sql ++++ b/sql/anon.sql +@@ -1141,3 +1141,8 @@ $$ + -- TODO : https://en.wikipedia.org/wiki/L-diversity + + -- TODO : https://en.wikipedia.org/wiki/T-closeness ++ ++-- NEON Patches ++ ++GRANT ALL ON SCHEMA anon to neon_superuser; ++GRANT ALL ON ALL TABLES IN SCHEMA anon TO neon_superuser; +diff --git a/sql/init.sql b/sql/init.sql +index 7da6553..9b6164b 100644 +--- a/sql/init.sql ++++ b/sql/init.sql +@@ -74,50 +74,49 @@ $$ + + SECURITY LABEL FOR anon ON FUNCTION anon.load_csv IS 'UNTRUSTED'; + +--- load fake data from a given path +-CREATE OR REPLACE FUNCTION anon.init( +- datapath TEXT +-) ++CREATE OR REPLACE FUNCTION anon.load_fake_data() + RETURNS BOOLEAN + AS $$ + DECLARE +- datapath_check TEXT; + success BOOLEAN; ++ sharedir TEXT; ++ datapath TEXT; + BEGIN + +- IF anon.is_initialized() THEN +- RAISE NOTICE 'The anon extension is already initialized.'; +- RETURN TRUE; +- END IF; ++ datapath := '/extension/anon/'; ++ -- find the local extension directory ++ SELECT setting INTO sharedir ++ FROM pg_catalog.pg_config ++ WHERE name = 'SHAREDIR'; + + SELECT bool_or(results) INTO success + FROM unnest(array[ +- anon.load_csv('anon.identifiers_category',datapath||'/identifiers_category.csv'), +- 
anon.load_csv('anon.identifier',datapath ||'/identifier.csv'), +- anon.load_csv('anon.address',datapath ||'/address.csv'), +- anon.load_csv('anon.city',datapath ||'/city.csv'), +- anon.load_csv('anon.company',datapath ||'/company.csv'), +- anon.load_csv('anon.country',datapath ||'/country.csv'), +- anon.load_csv('anon.email', datapath ||'/email.csv'), +- anon.load_csv('anon.first_name',datapath ||'/first_name.csv'), +- anon.load_csv('anon.iban',datapath ||'/iban.csv'), +- anon.load_csv('anon.last_name',datapath ||'/last_name.csv'), +- anon.load_csv('anon.postcode',datapath ||'/postcode.csv'), +- anon.load_csv('anon.siret',datapath ||'/siret.csv'), +- anon.load_csv('anon.lorem_ipsum',datapath ||'/lorem_ipsum.csv') ++ anon.load_csv('anon.identifiers_category',sharedir || datapath || '/identifiers_category.csv'), ++ anon.load_csv('anon.identifier',sharedir || datapath || '/identifier.csv'), ++ anon.load_csv('anon.address',sharedir || datapath || '/address.csv'), ++ anon.load_csv('anon.city',sharedir || datapath || '/city.csv'), ++ anon.load_csv('anon.company',sharedir || datapath || '/company.csv'), ++ anon.load_csv('anon.country',sharedir || datapath || '/country.csv'), ++ anon.load_csv('anon.email', sharedir || datapath || '/email.csv'), ++ anon.load_csv('anon.first_name',sharedir || datapath || '/first_name.csv'), ++ anon.load_csv('anon.iban',sharedir || datapath || '/iban.csv'), ++ anon.load_csv('anon.last_name',sharedir || datapath || '/last_name.csv'), ++ anon.load_csv('anon.postcode',sharedir || datapath || '/postcode.csv'), ++ anon.load_csv('anon.siret',sharedir || datapath || '/siret.csv'), ++ anon.load_csv('anon.lorem_ipsum',sharedir || datapath || '/lorem_ipsum.csv') + ]) results; + RETURN success; +- + END; + $$ +- LANGUAGE PLPGSQL ++ LANGUAGE plpgsql + VOLATILE + RETURNS NULL ON NULL INPUT +- PARALLEL UNSAFE -- because load_csv is unsafe +- SECURITY INVOKER ++ PARALLEL UNSAFE -- because of the EXCEPTION ++ SECURITY DEFINER + SET search_path='' + ; +-SECURITY LABEL FOR anon ON FUNCTION anon.init(TEXT) IS 'UNTRUSTED'; ++ ++SECURITY LABEL FOR anon ON FUNCTION anon.load_fake_data IS 'UNTRUSTED'; + + -- People tend to forget the anon.init() step + -- This is a friendly notice for them +@@ -144,7 +143,7 @@ SECURITY LABEL FOR anon ON FUNCTION anon.notice_if_not_init IS 'UNTRUSTED'; + CREATE OR REPLACE FUNCTION anon.load(TEXT) + RETURNS BOOLEAN AS + $$ +- SELECT anon.init($1); ++ SELECT anon.init(); + $$ + LANGUAGE SQL + VOLATILE +@@ -159,16 +158,16 @@ SECURITY LABEL FOR anon ON FUNCTION anon.load(TEXT) IS 'UNTRUSTED'; + CREATE OR REPLACE FUNCTION anon.init() + RETURNS BOOLEAN + AS $$ +- WITH conf AS ( +- -- find the local extension directory +- SELECT setting AS sharedir +- FROM pg_catalog.pg_config +- WHERE name = 'SHAREDIR' +- ) +- SELECT anon.init(conf.sharedir || '/extension/anon/') +- FROM conf; ++BEGIN ++ IF anon.is_initialized() THEN ++ RAISE NOTICE 'The anon extension is already initialized.'; ++ RETURN TRUE; ++ END IF; ++ ++ RETURN anon.load_fake_data(); ++END; + $$ +- LANGUAGE SQL ++ LANGUAGE plpgsql + VOLATILE + PARALLEL UNSAFE -- because init is unsafe + SECURITY INVOKER diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index ec24d73242..057099994a 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -22,7 +22,7 @@ commands: - name: local_proxy user: postgres sysvInitAction: respawn - shell: 'RUST_LOG="info,proxy::serverless::sql_over_http=warn" /usr/local/bin/local_proxy --config-path 
/etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432' + shell: 'RUST_LOG="error" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432' - name: postgres-exporter user: nobody sysvInitAction: respawn diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index b40bdecebc..d048e20b2e 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -22,7 +22,7 @@ commands: - name: local_proxy user: postgres sysvInitAction: respawn - shell: 'RUST_LOG="info,proxy::serverless::sql_over_http=warn" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432' + shell: 'RUST_LOG="error" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432' - name: postgres-exporter user: nobody sysvInitAction: respawn diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 8c1e7ad149..8ee5dd0665 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -10,6 +10,7 @@ default = [] testing = ["fail/failpoints"] [dependencies] +async-compression.workspace = true base64.workspace = true aws-config.workspace = true aws-sdk-s3.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index e337ee7b15..20b5e567a8 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -60,12 +60,16 @@ use utils::failpoint_support; // Compatibility hack: if the control plane specified any remote-ext-config // use the default value for extension storage proxy gateway. // Remove this once the control plane is updated to pass the gateway URL -fn parse_remote_ext_config(arg: &str) -> Result<String> { - if arg.starts_with("http") { - Ok(arg.trim_end_matches('/').to_string()) +fn parse_remote_ext_base_url(arg: &str) -> Result<String> { + const FALLBACK_PG_EXT_GATEWAY_BASE_URL: &str = + "http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local"; + + Ok(if arg.starts_with("http") { + arg } else { - Ok("http://pg-ext-s3-gateway".to_string()) + FALLBACK_PG_EXT_GATEWAY_BASE_URL } + .to_owned()) } #[derive(Parser)] @@ -74,8 +78,10 @@ struct Cli { #[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")] pub pgbin: String, - #[arg(short = 'r', long, value_parser = parse_remote_ext_config)] - pub remote_ext_config: Option<String>, + /// The base URL for the remote extension storage proxy gateway. + /// Should be in the form of `http(s)://<host>[:<port>]`. + #[arg(short = 'r', long, value_parser = parse_remote_ext_base_url, alias = "remote-ext-config")] + pub remote_ext_base_url: Option<String>, /// The port to bind the external listening HTTP server to. Clients running /// outside the compute will talk to the compute through this port. 
Keep @@ -164,7 +170,7 @@ fn main() -> Result<()> { pgversion: get_pg_version_string(&cli.pgbin), external_http_port: cli.external_http_port, internal_http_port: cli.internal_http_port, - ext_remote_storage: cli.remote_ext_config.clone(), + remote_ext_base_url: cli.remote_ext_base_url.clone(), resize_swap_on_bind: cli.resize_swap_on_bind, set_disk_quota_for_fs: cli.set_disk_quota_for_fs, #[cfg(target_os = "linux")] @@ -265,4 +271,18 @@ mod test { fn verify_cli() { Cli::command().debug_assert() } + + #[test] + fn parse_pg_ext_gateway_base_url() { + let arg = "http://pg-ext-s3-gateway2"; + let result = super::parse_remote_ext_base_url(arg).unwrap(); + assert_eq!(result, arg); + + let arg = "pg-ext-s3-gateway"; + let result = super::parse_remote_ext_base_url(arg).unwrap(); + assert_eq!( + result, + "http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local" + ); + } } diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 537028cde1..78acd78585 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -348,6 +348,7 @@ async fn run_dump_restore( "--no-security-labels".to_string(), "--no-subscriptions".to_string(), "--no-tablespaces".to_string(), + "--no-event-triggers".to_string(), // format "--format".to_string(), "directory".to_string(), diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 8834f0d63d..25920675c1 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1,17 +1,10 @@ -use std::collections::HashMap; -use std::os::unix::fs::{PermissionsExt, symlink}; -use std::path::Path; -use std::process::{Command, Stdio}; -use std::str::FromStr; -use std::sync::atomic::{AtomicU32, Ordering}; -use std::sync::{Arc, Condvar, Mutex, RwLock}; -use std::time::{Duration, Instant}; -use std::{env, fs}; - use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; use compute_api::privilege::Privilege; -use compute_api::responses::{ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus}; +use compute_api::responses::{ + ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus, LfcOffloadState, + LfcPrewarmState, +}; use compute_api::spec::{ ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent, }; @@ -25,6 +18,16 @@ use postgres; use postgres::NoTls; use postgres::error::SqlState; use remote_storage::{DownloadError, RemotePath}; +use std::collections::HashMap; +use std::net::SocketAddr; +use std::os::unix::fs::{PermissionsExt, symlink}; +use std::path::Path; +use std::process::{Command, Stdio}; +use std::str::FromStr; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::{Arc, Condvar, Mutex, RwLock}; +use std::time::{Duration, Instant}; +use std::{env, fs}; use tokio::spawn; use tracing::{Instrument, debug, error, info, instrument, warn}; use utils::id::{TenantId, TimelineId}; @@ -92,7 +95,7 @@ pub struct ComputeNodeParams { pub internal_http_port: u16, /// the address of extension storage proxy gateway - pub ext_remote_storage: Option, + pub remote_ext_base_url: Option, } /// Compute node info shared across several `compute_ctl` threads. @@ -150,6 +153,9 @@ pub struct ComputeState { /// set up the span relationship ourselves. 
pub startup_span: Option, + pub lfc_prewarm_state: LfcPrewarmState, + pub lfc_offload_state: LfcOffloadState, + pub metrics: ComputeMetrics, } @@ -163,6 +169,8 @@ impl ComputeState { pspec: None, startup_span: None, metrics: ComputeMetrics::default(), + lfc_prewarm_state: LfcPrewarmState::default(), + lfc_offload_state: LfcOffloadState::default(), } } @@ -198,6 +206,8 @@ pub struct ParsedSpec { pub pageserver_connstr: String, pub safekeeper_connstrings: Vec, pub storage_auth_token: Option, + pub endpoint_storage_addr: Option, + pub endpoint_storage_token: Option, } impl TryFrom for ParsedSpec { @@ -251,6 +261,18 @@ impl TryFrom for ParsedSpec { .or(Err("invalid timeline id"))? }; + let endpoint_storage_addr: Option = spec + .endpoint_storage_addr + .clone() + .or_else(|| spec.cluster.settings.find("neon.endpoint_storage_addr")) + .unwrap_or_default() + .parse() + .ok(); + let endpoint_storage_token = spec + .endpoint_storage_token + .clone() + .or_else(|| spec.cluster.settings.find("neon.endpoint_storage_token")); + Ok(ParsedSpec { spec, pageserver_connstr, @@ -258,6 +280,8 @@ impl TryFrom for ParsedSpec { storage_auth_token, tenant_id, timeline_id, + endpoint_storage_addr, + endpoint_storage_token, }) } } @@ -305,11 +329,39 @@ struct StartVmMonitorResult { impl ComputeNode { pub fn new(params: ComputeNodeParams, config: ComputeConfig) -> Result { let connstr = params.connstr.as_str(); - let conn_conf = postgres::config::Config::from_str(connstr) + let mut conn_conf = postgres::config::Config::from_str(connstr) .context("cannot build postgres config from connstr")?; - let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr) + let mut tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr) .context("cannot build tokio postgres config from connstr")?; + // Users can set some configuration parameters per database with + // ALTER DATABASE ... SET ... + // + // There are at least these parameters: + // + // - role=some_other_role + // - default_transaction_read_only=on + // - statement_timeout=1, i.e., 1ms, which will cause most of the queries to fail + // - search_path=non_public_schema, this should be actually safe because + // we don't call any functions in user databases, but better to always reset + // it to public. + // + // that can affect `compute_ctl` and prevent it from properly configuring the database schema. + // Unset them via connection string options before connecting to the database. + // N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`. + // + // TODO(ololobus): we currently pass `-c default_transaction_read_only=off` from control plane + // as well. After rolling out this code, we can remove this parameter from control plane. + // In the meantime, double-passing is fine, the last value is applied. 
+ // See: + const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0"; + let options = match conn_conf.get_options() { + Some(options) => format!("{} {}", options, EXTRA_OPTIONS), + None => EXTRA_OPTIONS.to_string(), + }; + conn_conf.options(&options); + tokio_conn_conf.options(&options); + let mut new_state = ComputeState::new(); if let Some(spec) = config.spec { let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?; @@ -736,6 +788,9 @@ impl ComputeNode { // Log metrics so that we can search for slow operations in logs info!(?metrics, postmaster_pid = %postmaster_pid, "compute start finished"); + if pspec.spec.prewarm_lfc_on_startup { + self.prewarm_lfc(); + } Ok(()) } @@ -1422,15 +1477,20 @@ impl ComputeNode { Err(e) => match e.code() { Some(&SqlState::INVALID_PASSWORD) | Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION) => { - // Connect with zenith_admin if cloud_admin could not authenticate + // Connect with `zenith_admin` if `cloud_admin` could not authenticate info!( - "cannot connect to postgres: {}, retrying with `zenith_admin` username", + "cannot connect to Postgres: {}, retrying with 'zenith_admin' username", e ); let mut zenith_admin_conf = postgres::config::Config::from(conf.clone()); zenith_admin_conf.application_name("compute_ctl:apply_config"); zenith_admin_conf.user("zenith_admin"); + // It doesn't matter what were the options before, here we just want + // to connect and create a new superuser role. + const ZENITH_OPTIONS: &str = "-c role=zenith_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0"; + zenith_admin_conf.options(ZENITH_OPTIONS); + let mut client = zenith_admin_conf.connect(NoTls) .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?; @@ -1596,9 +1656,7 @@ impl ComputeNode { self.pg_reload_conf()?; if spec.mode == ComputeMode::Primary { - let mut conf = - tokio_postgres::Config::from_str(self.params.connstr.as_str()).unwrap(); - conf.application_name("apply_config"); + let conf = self.get_tokio_conn_conf(Some("compute_ctl:reconfigure")); let conf = Arc::new(conf); let spec = Arc::new(spec.clone()); @@ -1838,9 +1896,9 @@ LIMIT 100", real_ext_name: String, ext_path: RemotePath, ) -> Result { - let ext_remote_storage = + let remote_ext_base_url = self.params - .ext_remote_storage + .remote_ext_base_url .as_ref() .ok_or(DownloadError::BadInput(anyhow::anyhow!( "Remote extensions storage is not configured", @@ -1902,7 +1960,7 @@ LIMIT 100", let download_size = extension_server::download_extension( &real_ext_name, &ext_path, - ext_remote_storage, + remote_ext_base_url, &self.params.pgbin, ) .await @@ -2011,7 +2069,7 @@ LIMIT 100", &self, spec: &ComputeSpec, ) -> Result { - if self.params.ext_remote_storage.is_none() { + if self.params.remote_ext_base_url.is_none() { return Ok(RemoteExtensionMetrics { num_ext_downloaded: 0, largest_ext_size: 0, diff --git a/compute_tools/src/compute_prewarm.rs b/compute_tools/src/compute_prewarm.rs new file mode 100644 index 0000000000..a6a84b3f1f --- /dev/null +++ b/compute_tools/src/compute_prewarm.rs @@ -0,0 +1,202 @@ +use crate::compute::ComputeNode; +use anyhow::{Context, Result, bail}; +use async_compression::tokio::bufread::{ZstdDecoder, ZstdEncoder}; +use compute_api::responses::LfcOffloadState; +use compute_api::responses::LfcPrewarmState; +use http::StatusCode; +use reqwest::Client; +use 
std::sync::Arc; +use tokio::{io::AsyncReadExt, spawn}; +use tracing::{error, info}; + +#[derive(serde::Serialize, Default)] +pub struct LfcPrewarmStateWithProgress { + #[serde(flatten)] + base: LfcPrewarmState, + total: i32, + prewarmed: i32, + skipped: i32, +} + +/// A URL and token pair used to query endpoint storage for LFC prewarm-related tasks +struct EndpointStoragePair { + url: String, + token: String, +} + +const KEY: &str = "lfc_state"; +impl TryFrom<&crate::compute::ParsedSpec> for EndpointStoragePair { + type Error = anyhow::Error; + fn try_from(pspec: &crate::compute::ParsedSpec) -> Result<Self> { + let Some(ref endpoint_id) = pspec.spec.endpoint_id else { + bail!("pspec.endpoint_id missing") + }; + let Some(ref base_uri) = pspec.endpoint_storage_addr else { + bail!("pspec.endpoint_storage_addr missing") + }; + let tenant_id = pspec.tenant_id; + let timeline_id = pspec.timeline_id; + + let url = format!("http://{base_uri}/{tenant_id}/{timeline_id}/{endpoint_id}/{KEY}"); + let Some(ref token) = pspec.endpoint_storage_token else { + bail!("pspec.endpoint_storage_token missing") + }; + let token = token.clone(); + Ok(EndpointStoragePair { url, token }) + } +} + +impl ComputeNode { + // If prewarm failed, we still want to get the overall number of segments as well as the number already done. + // However, this function should be reliable even if querying postgres fails. + pub async fn lfc_prewarm_state(&self) -> LfcPrewarmStateWithProgress { + info!("requesting LFC prewarm state from postgres"); + let mut state = LfcPrewarmStateWithProgress::default(); + { + state.base = self.state.lock().unwrap().lfc_prewarm_state.clone(); + } + + let client = match ComputeNode::get_maintenance_client(&self.tokio_conn_conf).await { + Ok(client) => client, + Err(err) => { + error!(%err, "connecting to postgres"); + return state; + } + }; + let row = match client + .query_one("select * from get_prewarm_info()", &[]) + .await + { + Ok(row) => row, + Err(err) => { + error!(%err, "querying LFC prewarm status"); + return state; + } + }; + state.total = row.try_get(0).unwrap_or_default(); + state.prewarmed = row.try_get(1).unwrap_or_default(); + state.skipped = row.try_get(2).unwrap_or_default(); + state + } + + pub fn lfc_offload_state(&self) -> LfcOffloadState { + self.state.lock().unwrap().lfc_offload_state.clone() + } + + /// Returns false if a prewarm request is already in progress, true otherwise + pub fn prewarm_lfc(self: &Arc<Self>) -> bool { + crate::metrics::LFC_PREWARM_REQUESTS.inc(); + { + let state = &mut self.state.lock().unwrap().lfc_prewarm_state; + if let LfcPrewarmState::Prewarming = + std::mem::replace(state, LfcPrewarmState::Prewarming) + { + return false; + } + } + + let cloned = self.clone(); + spawn(async move { + let Err(err) = cloned.prewarm_impl().await else { + cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Completed; + return; + }; + error!(%err); + cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Failed { + error: err.to_string(), + }; + }); + true + } + + fn endpoint_storage_pair(&self) -> Result<EndpointStoragePair> { + let state = self.state.lock().unwrap(); + state.pspec.as_ref().unwrap().try_into() + } + + async fn prewarm_impl(&self) -> Result<()> { + let EndpointStoragePair { url, token } = self.endpoint_storage_pair()?; + info!(%url, "requesting LFC state from endpoint storage"); + + let request = Client::new().get(&url).bearer_auth(token); + let res = request.send().await.context("querying endpoint storage")?; + let status = res.status(); + if status != StatusCode::OK { + bail!("{status} 
querying endpoint storage") + } + + let mut uncompressed = Vec::new(); + let lfc_state = res + .bytes() + .await + .context("getting request body from endpoint storage")?; + ZstdDecoder::new(lfc_state.iter().as_slice()) + .read_to_end(&mut uncompressed) + .await + .context("decoding LFC state")?; + let uncompressed_len = uncompressed.len(); + info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}, loading into postgres"); + + ComputeNode::get_maintenance_client(&self.tokio_conn_conf) + .await + .context("connecting to postgres")? + .query_one("select prewarm_local_cache($1)", &[&uncompressed]) + .await + .context("loading LFC state into postgres") + .map(|_| ()) + } + + /// Returns false if there is an offload request ongoing, true otherwise + pub fn offload_lfc(self: &Arc) -> bool { + crate::metrics::LFC_OFFLOAD_REQUESTS.inc(); + { + let state = &mut self.state.lock().unwrap().lfc_offload_state; + if let LfcOffloadState::Offloading = + std::mem::replace(state, LfcOffloadState::Offloading) + { + return false; + } + } + + let cloned = self.clone(); + spawn(async move { + let Err(err) = cloned.offload_lfc_impl().await else { + cloned.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed; + return; + }; + error!(%err); + cloned.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed { + error: err.to_string(), + }; + }); + true + } + + async fn offload_lfc_impl(&self) -> Result<()> { + let EndpointStoragePair { url, token } = self.endpoint_storage_pair()?; + info!(%url, "requesting LFC state from postgres"); + + let mut compressed = Vec::new(); + ComputeNode::get_maintenance_client(&self.tokio_conn_conf) + .await + .context("connecting to postgres")? + .query_one("select get_local_cache_state()", &[]) + .await + .context("querying LFC state")? + .try_get::(0) + .context("deserializing LFC state") + .map(ZstdEncoder::new)? + .read_to_end(&mut compressed) + .await + .context("compressing LFC state")?; + let compressed_len = compressed.len(); + info!(%url, "downloaded LFC state, compressed size {compressed_len}, writing to endpoint storage"); + + let request = Client::new().put(url).bearer_auth(token).body(compressed); + match request.send().await { + Ok(res) if res.status() == StatusCode::OK => Ok(()), + Ok(res) => bail!("Error writing to endpoint storage: {}", res.status()), + Err(err) => Err(err).context("writing to endpoint storage"), + } + } +} diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 71c6123c3b..42d245f55a 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -223,6 +223,9 @@ pub fn write_postgres_conf( // TODO: tune this after performance testing writeln!(file, "pgaudit.log_rotation_age=5")?; + // Enable audit logs for pg_session_jwt extension + writeln!(file, "pg_session_jwt.audit_log=on")?; + // Add audit shared_preload_libraries, if they are not present. 
// // The caller who sets the flag is responsible for ensuring that the necessary diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index ee889e0c40..3439383699 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -158,14 +158,14 @@ fn parse_pg_version(human_version: &str) -> PostgresMajorVersion { pub async fn download_extension( ext_name: &str, ext_path: &RemotePath, - ext_remote_storage: &str, + remote_ext_base_url: &str, pgbin: &str, ) -> Result { info!("Download extension {:?} from {:?}", ext_name, ext_path); // TODO add retry logic let download_buffer = - match download_extension_tar(ext_remote_storage, &ext_path.to_string()).await { + match download_extension_tar(remote_ext_base_url, &ext_path.to_string()).await { Ok(buffer) => buffer, Err(error_message) => { return Err(anyhow::anyhow!( @@ -272,8 +272,8 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) { // Do request to extension storage proxy, e.g., // curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst // using HTTP GET and return the response body as bytes. -async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result { - let uri = format!("{}/{}", ext_remote_storage, ext_path); +async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Result { + let uri = format!("{}/{}", remote_ext_base_url, ext_path); let filename = Path::new(ext_path) .file_name() .unwrap_or_else(|| std::ffi::OsStr::new("unknown")) diff --git a/compute_tools/src/http/middleware/authorize.rs b/compute_tools/src/http/middleware/authorize.rs index 2d0f411d7a..a82f46e062 100644 --- a/compute_tools/src/http/middleware/authorize.rs +++ b/compute_tools/src/http/middleware/authorize.rs @@ -1,12 +1,10 @@ -use std::collections::HashSet; - use anyhow::{Result, anyhow}; use axum::{RequestExt, body::Body}; use axum_extra::{ TypedHeader, headers::{Authorization, authorization::Bearer}, }; -use compute_api::requests::ComputeClaims; +use compute_api::requests::{COMPUTE_AUDIENCE, ComputeClaims, ComputeClaimsScope}; use futures::future::BoxFuture; use http::{Request, Response, StatusCode}; use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet}; @@ -25,13 +23,14 @@ pub(in crate::http) struct Authorize { impl Authorize { pub fn new(compute_id: String, jwks: JwkSet) -> Self { let mut validation = Validation::new(Algorithm::EdDSA); - // Nothing is currently required - validation.required_spec_claims = HashSet::new(); validation.validate_exp = true; // Unused by the control plane - validation.validate_aud = false; - // Unused by the control plane validation.validate_nbf = false; + // Unused by the control plane + validation.validate_aud = false; + validation.set_audience(&[COMPUTE_AUDIENCE]); + // Nothing is currently required + validation.set_required_spec_claims(&[] as &[&str; 0]); Self { compute_id, @@ -64,11 +63,47 @@ impl AsyncAuthorizeRequest for Authorize { Err(e) => return Err(JsonResponse::error(StatusCode::UNAUTHORIZED, e)), }; - if data.claims.compute_id != compute_id { - return Err(JsonResponse::error( - StatusCode::UNAUTHORIZED, - "invalid compute ID in authorization token claims", - )); + match data.claims.scope { + // TODO: We should validate audience for every token, but + // instead of this ad-hoc validation, we should turn + // [`Validation::validate_aud`] on. This is merely a stopgap + // while we roll out `aud` deployment. 
We return a 401 + // Unauthorized because when we eventually do use + // [`Validation`], we will hit the above `Err` match arm which + // returns 401 Unauthorized. + Some(ComputeClaimsScope::Admin) => { + let Some(ref audience) = data.claims.audience else { + return Err(JsonResponse::error( + StatusCode::UNAUTHORIZED, + "missing audience in authorization token claims", + )); + }; + + if !audience.iter().any(|a| a == COMPUTE_AUDIENCE) { + return Err(JsonResponse::error( + StatusCode::UNAUTHORIZED, + "invalid audience in authorization token claims", + )); + } + } + + // If the scope is not [`ComputeClaimsScope::Admin`], then we + // must validate the compute_id + _ => { + let Some(ref claimed_compute_id) = data.claims.compute_id else { + return Err(JsonResponse::error( + StatusCode::FORBIDDEN, + "missing compute_id in authorization token claims", + )); + }; + + if *claimed_compute_id != compute_id { + return Err(JsonResponse::error( + StatusCode::FORBIDDEN, + "invalid compute ID in authorization token claims", + )); + } + } } // Make claims available to any subsequent middleware or request diff --git a/compute_tools/src/http/routes/extension_server.rs b/compute_tools/src/http/routes/extension_server.rs index 6508de6eee..e141a48b7f 100644 --- a/compute_tools/src/http/routes/extension_server.rs +++ b/compute_tools/src/http/routes/extension_server.rs @@ -22,7 +22,7 @@ pub(in crate::http) async fn download_extension( State(compute): State>, ) -> Response { // Don't even try to download extensions if no remote storage is configured - if compute.params.ext_remote_storage.is_none() { + if compute.params.remote_ext_base_url.is_none() { return JsonResponse::error( StatusCode::PRECONDITION_FAILED, "remote storage is not configured", diff --git a/compute_tools/src/http/routes/lfc.rs b/compute_tools/src/http/routes/lfc.rs new file mode 100644 index 0000000000..07bcc6bfb7 --- /dev/null +++ b/compute_tools/src/http/routes/lfc.rs @@ -0,0 +1,39 @@ +use crate::compute_prewarm::LfcPrewarmStateWithProgress; +use crate::http::JsonResponse; +use axum::response::{IntoResponse, Response}; +use axum::{Json, http::StatusCode}; +use compute_api::responses::LfcOffloadState; +type Compute = axum::extract::State>; + +pub(in crate::http) async fn prewarm_state(compute: Compute) -> Json { + Json(compute.lfc_prewarm_state().await) +} + +// Following functions are marked async for axum, as it's more convenient than wrapping these +// in async lambdas at call site + +pub(in crate::http) async fn offload_state(compute: Compute) -> Json { + Json(compute.lfc_offload_state()) +} + +pub(in crate::http) async fn prewarm(compute: Compute) -> Response { + if compute.prewarm_lfc() { + StatusCode::ACCEPTED.into_response() + } else { + JsonResponse::error( + StatusCode::TOO_MANY_REQUESTS, + "Multiple requests for prewarm are not allowed", + ) + } +} + +pub(in crate::http) async fn offload(compute: Compute) -> Response { + if compute.offload_lfc() { + StatusCode::ACCEPTED.into_response() + } else { + JsonResponse::error( + StatusCode::TOO_MANY_REQUESTS, + "Multiple requests for prewarm offload are not allowed", + ) + } +} diff --git a/compute_tools/src/http/routes/mod.rs b/compute_tools/src/http/routes/mod.rs index a67be7fd5a..432e66a830 100644 --- a/compute_tools/src/http/routes/mod.rs +++ b/compute_tools/src/http/routes/mod.rs @@ -11,6 +11,7 @@ pub(in crate::http) mod extensions; pub(in crate::http) mod failpoints; pub(in crate::http) mod grants; pub(in crate::http) mod insights; +pub(in crate::http) mod lfc; pub(in crate::http) mod 
metrics; pub(in crate::http) mod metrics_json; pub(in crate::http) mod status; diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index 10f767e97c..d5d2427971 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -23,7 +23,7 @@ use super::{ middleware::authorize::Authorize, routes::{ check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions, - grants, insights, metrics, metrics_json, status, terminate, + grants, insights, lfc, metrics, metrics_json, status, terminate, }, }; use crate::compute::ComputeNode; @@ -85,6 +85,8 @@ impl From<&Server> for Router> { Router::>::new().route("/metrics", get(metrics::get_metrics)); let authenticated_router = Router::>::new() + .route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm)) + .route("/lfc/offload", get(lfc::offload_state).post(lfc::offload)) .route("/check_writability", post(check_writability::is_writable)) .route("/configure", post(configure::configure)) .route("/database_schema", get(database_schema::get_schema_dump)) diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index a681fad0b0..7218067a8a 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -11,6 +11,7 @@ pub mod http; pub mod logger; pub mod catalog; pub mod compute; +pub mod compute_prewarm; pub mod disk_quota; pub mod extension_server; pub mod installed_extensions; diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs index e37d6120ac..90326b2074 100644 --- a/compute_tools/src/metrics.rs +++ b/compute_tools/src/metrics.rs @@ -1,7 +1,7 @@ use metrics::core::{AtomicF64, AtomicU64, Collector, GenericCounter, GenericGauge}; use metrics::proto::MetricFamily; use metrics::{ - IntCounterVec, IntGaugeVec, UIntGaugeVec, register_gauge, register_int_counter, + IntCounter, IntCounterVec, IntGaugeVec, UIntGaugeVec, register_gauge, register_int_counter, register_int_counter_vec, register_int_gauge_vec, register_uint_gauge_vec, }; use once_cell::sync::Lazy; @@ -97,6 +97,24 @@ pub(crate) static PG_TOTAL_DOWNTIME_MS: Lazy> = Lazy:: .expect("failed to define a metric") }); +/// Needed as neon.file_cache_prewarm_batch == 0 doesn't mean we never tried to prewarm. 
+/// On the other hand, LFC_PREWARMED_PAGES is excessive as we can GET /lfc/prewarm +pub(crate) static LFC_PREWARM_REQUESTS: Lazy = Lazy::new(|| { + register_int_counter!( + "compute_ctl_lfc_prewarm_requests_total", + "Total number of LFC prewarm requests made by compute_ctl", + ) + .expect("failed to define a metric") +}); + +pub(crate) static LFC_OFFLOAD_REQUESTS: Lazy = Lazy::new(|| { + register_int_counter!( + "compute_ctl_lfc_offload_requests_total", + "Total number of LFC offload requests made by compute_ctl", + ) + .expect("failed to define a metric") +}); + pub fn collect() -> Vec { let mut metrics = COMPUTE_CTL_UP.collect(); metrics.extend(INSTALLED_EXTENSIONS.collect()); @@ -106,5 +124,7 @@ pub fn collect() -> Vec { metrics.extend(AUDIT_LOG_DIR_SIZE.collect()); metrics.extend(PG_CURR_DOWNTIME_MS.collect()); metrics.extend(PG_TOTAL_DOWNTIME_MS.collect()); + metrics.extend(LFC_PREWARM_REQUESTS.collect()); + metrics.extend(LFC_OFFLOAD_REQUESTS.collect()); metrics } diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 5a07eec833..3311ee47b3 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -424,10 +424,10 @@ pub fn launch_monitor(compute: &Arc) -> thread::JoinHandle<()> { experimental, }; - let span = span!(Level::INFO, "compute_monitor"); thread::Builder::new() .name("compute-monitor".into()) .spawn(move || { + let span = span!(Level::INFO, "compute_monitor"); let _enter = span.enter(); monitor.run(); }) diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index b72c1293ee..53f2ddad84 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -30,6 +30,7 @@ mod pg_helpers_tests { r#"fsync = off wal_level = logical hot_standby = on +prewarm_lfc_on_startup = off neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501' wal_log_hints = on log_connections = on diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 92f0071bac..62c039047f 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -41,7 +41,7 @@ storage_broker.workspace = true http-utils.workspace = true utils.workspace = true whoami.workspace = true - +endpoint_storage.workspace = true compute_api.workspace = true workspace_hack.workspace = true tracing.workspace = true diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 6f55c0310f..191a22f1de 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -16,10 +16,11 @@ use std::time::Duration; use anyhow::{Context, Result, anyhow, bail}; use clap::Parser; +use compute_api::requests::ComputeClaimsScope; use compute_api::spec::ComputeMode; use control_plane::broker::StorageBroker; use control_plane::endpoint::ComputeControlPlane; -use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_PORT, EndpointStorage}; +use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage}; use control_plane::local_env; use control_plane::local_env::{ EndpointStorageConf, InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, @@ -643,9 +644,10 @@ struct EndpointStartCmdArgs { #[clap( long, - help = "Configure the remote extensions storage proxy gateway to request for extensions." 
+ help = "Configure the remote extensions storage proxy gateway URL to request for extensions.", + alias = "remote-ext-config" )] - remote_ext_config: Option, + remote_ext_base_url: Option, #[clap( long, @@ -705,6 +707,9 @@ struct EndpointStopCmdArgs { struct EndpointGenerateJwtCmdArgs { #[clap(help = "Postgres endpoint id")] endpoint_id: String, + + #[clap(short = 's', long, help = "Scope to generate the JWT with", value_parser = ComputeClaimsScope::from_str)] + scope: Option, } #[derive(clap::Subcommand)] @@ -1018,7 +1023,7 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result { }) .collect(), endpoint_storage: EndpointStorageConf { - port: ENDPOINT_STORAGE_DEFAULT_PORT, + listen_addr: ENDPOINT_STORAGE_DEFAULT_ADDR, }, pg_distrib_dir: None, neon_distrib_dir: None, @@ -1410,9 +1415,16 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res EndpointCmd::Start(args) => { let endpoint_id = &args.endpoint_id; let pageserver_id = args.endpoint_pageserver_id; - let remote_ext_config = &args.remote_ext_config; + let remote_ext_base_url = &args.remote_ext_base_url; - let safekeepers_generation = args.safekeepers_generation.map(SafekeeperGeneration::new); + let default_generation = env + .storage_controller + .timelines_onto_safekeepers + .then_some(1); + let safekeepers_generation = args + .safekeepers_generation + .or(default_generation) + .map(SafekeeperGeneration::new); // If --safekeepers argument is given, use only the listed // safekeeper nodes; otherwise all from the env. let safekeepers = if let Some(safekeepers) = parse_safekeepers(&args.safekeepers)? { @@ -1484,14 +1496,29 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res None }; + let exp = (std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)? + + Duration::from_secs(86400)) + .as_secs(); + let claims = endpoint_storage::claims::EndpointStorageClaims { + tenant_id: endpoint.tenant_id, + timeline_id: endpoint.timeline_id, + endpoint_id: endpoint_id.to_string(), + exp, + }; + + let endpoint_storage_token = env.generate_auth_token(&claims)?; + let endpoint_storage_addr = env.endpoint_storage.listen_addr.to_string(); + println!("Starting existing endpoint {endpoint_id}..."); endpoint .start( &auth_token, + endpoint_storage_token, + endpoint_storage_addr, safekeepers_generation, safekeepers, pageservers, - remote_ext_config.as_ref(), + remote_ext_base_url.as_ref(), stripe_size.0 as usize, args.create_test_user, args.start_timeout, @@ -1540,12 +1567,16 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res endpoint.stop(&args.mode, args.destroy)?; } EndpointCmd::GenerateJwt(args) => { - let endpoint_id = &args.endpoint_id; - let endpoint = cplane - .endpoints - .get(endpoint_id) - .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?; - let jwt = endpoint.generate_jwt()?; + let endpoint = { + let endpoint_id = &args.endpoint_id; + + cplane + .endpoints + .get(endpoint_id) + .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))? 
+ }; + + let jwt = endpoint.generate_jwt(args.scope)?; print!("{jwt}"); } diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 4071b620d6..708745446d 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -45,7 +45,9 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use anyhow::{Context, Result, anyhow, bail}; -use compute_api::requests::{ComputeClaims, ConfigurationRequest}; +use compute_api::requests::{ + COMPUTE_AUDIENCE, ComputeClaims, ComputeClaimsScope, ConfigurationRequest, +}; use compute_api::responses::{ ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse, TlsConfig, }; @@ -630,9 +632,17 @@ impl Endpoint { } /// Generate a JWT with the correct claims. - pub fn generate_jwt(&self) -> Result { + pub fn generate_jwt(&self, scope: Option) -> Result { self.env.generate_auth_token(&ComputeClaims { - compute_id: self.endpoint_id.clone(), + audience: match scope { + Some(ComputeClaimsScope::Admin) => Some(vec![COMPUTE_AUDIENCE.to_owned()]), + _ => None, + }, + compute_id: match scope { + Some(ComputeClaimsScope::Admin) => None, + _ => Some(self.endpoint_id.clone()), + }, + scope, }) } @@ -640,10 +650,12 @@ impl Endpoint { pub async fn start( &self, auth_token: &Option, + endpoint_storage_token: String, + endpoint_storage_addr: String, safekeepers_generation: Option, safekeepers: Vec, pageservers: Vec<(Host, u16)>, - remote_ext_config: Option<&String>, + remote_ext_base_url: Option<&String>, shard_stripe_size: usize, create_test_user: bool, start_timeout: Duration, @@ -733,6 +745,9 @@ impl Endpoint { drop_subscriptions_before_start: self.drop_subscriptions_before_start, audit_log_level: ComputeAudit::Disabled, logs_export_host: None::, + endpoint_storage_addr: Some(endpoint_storage_addr), + endpoint_storage_token: Some(endpoint_storage_token), + prewarm_lfc_on_startup: false, }; // this strange code is needed to support respec() in tests @@ -810,8 +825,8 @@ impl Endpoint { .stderr(logfile.try_clone()?) .stdout(logfile); - if let Some(remote_ext_config) = remote_ext_config { - cmd.args(["--remote-ext-config", remote_ext_config]); + if let Some(remote_ext_base_url) = remote_ext_base_url { + cmd.args(["--remote-ext-base-url", remote_ext_base_url]); } let child = cmd.spawn()?; @@ -903,7 +918,7 @@ impl Endpoint { self.external_http_address.port() ), ) - .bearer_auth(self.generate_jwt()?) + .bearer_auth(self.generate_jwt(None::)?) .send() .await?; @@ -980,7 +995,7 @@ impl Endpoint { self.external_http_address.port() )) .header(CONTENT_TYPE.as_str(), "application/json") - .bearer_auth(self.generate_jwt()?) + .bearer_auth(self.generate_jwt(None::)?) .body( serde_json::to_string(&ConfigurationRequest { spec, diff --git a/control_plane/src/endpoint_storage.rs b/control_plane/src/endpoint_storage.rs index 102db91a22..171aaeddb4 100644 --- a/control_plane/src/endpoint_storage.rs +++ b/control_plane/src/endpoint_storage.rs @@ -3,17 +3,19 @@ use crate::local_env::LocalEnv; use anyhow::{Context, Result}; use camino::Utf8PathBuf; use std::io::Write; +use std::net::SocketAddr; use std::time::Duration; /// Directory within .neon which will be used by default for LocalFs remote storage. 
pub const ENDPOINT_STORAGE_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/endpoint_storage"; -pub const ENDPOINT_STORAGE_DEFAULT_PORT: u16 = 9993; +pub const ENDPOINT_STORAGE_DEFAULT_ADDR: SocketAddr = + SocketAddr::new(std::net::IpAddr::V4(std::net::Ipv4Addr::LOCALHOST), 9993); pub struct EndpointStorage { pub bin: Utf8PathBuf, pub data_dir: Utf8PathBuf, pub pemfile: Utf8PathBuf, - pub port: u16, + pub addr: SocketAddr, } impl EndpointStorage { @@ -22,7 +24,7 @@ impl EndpointStorage { bin: Utf8PathBuf::from_path_buf(env.endpoint_storage_bin()).unwrap(), data_dir: Utf8PathBuf::from_path_buf(env.endpoint_storage_data_dir()).unwrap(), pemfile: Utf8PathBuf::from_path_buf(env.public_key_path.clone()).unwrap(), - port: env.endpoint_storage.port, + addr: env.endpoint_storage.listen_addr, } } @@ -31,7 +33,7 @@ impl EndpointStorage { } fn listen_addr(&self) -> Utf8PathBuf { - format!("127.0.0.1:{}", self.port).into() + format!("{}:{}", self.addr.ip(), self.addr.port()).into() } pub fn init(&self) -> Result<()> { diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index a18b34daa4..4a8892c6de 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -20,7 +20,9 @@ use utils::auth::encode_from_key_file; use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; use crate::broker::StorageBroker; -use crate::endpoint_storage::{ENDPOINT_STORAGE_REMOTE_STORAGE_DIR, EndpointStorage}; +use crate::endpoint_storage::{ + ENDPOINT_STORAGE_DEFAULT_ADDR, ENDPOINT_STORAGE_REMOTE_STORAGE_DIR, EndpointStorage, +}; use crate::pageserver::{PAGESERVER_REMOTE_STORAGE_DIR, PageServerNode}; use crate::safekeeper::SafekeeperNode; @@ -151,10 +153,10 @@ pub struct NeonLocalInitConf { pub generate_local_ssl_certs: bool, } -#[derive(Serialize, Default, Deserialize, PartialEq, Eq, Clone, Debug)] +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] pub struct EndpointStorageConf { - pub port: u16, + pub listen_addr: SocketAddr, } /// Broker config for cluster internal communication. @@ -241,6 +243,14 @@ impl Default for NeonStorageControllerConf { } } +impl Default for EndpointStorageConf { + fn default() -> Self { + Self { + listen_addr: ENDPOINT_STORAGE_DEFAULT_ADDR, + } + } +} + impl NeonBroker { pub fn client_url(&self) -> Url { let url = if let Some(addr) = self.listen_https_addr { diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 948e3c8c93..eec2c997e6 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -112,7 +112,7 @@ impl SafekeeperNode { } /// Initializes a safekeeper node by creating all necessary files, - /// e.g. SSL certificates. + /// e.g. SSL certificates and JWT token file. 
pub fn initialize(&self) -> anyhow::Result<()> { if self.env.generate_local_ssl_certs { self.env.generate_ssl_cert( @@ -120,6 +120,17 @@ impl SafekeeperNode { &self.datadir_path().join("server.key"), )?; } + + // Generate a token file for authentication with other safekeepers + if self.conf.auth_enabled { + let token = self + .env + .generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?; + + let token_path = self.datadir_path().join("peer_jwt_token"); + std::fs::write(token_path, token)?; + } + Ok(()) } @@ -218,14 +229,26 @@ impl SafekeeperNode { args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap())); } + if self.conf.auth_enabled { + let token_path = self.datadir_path().join("peer_jwt_token"); + let token_path_str = token_path + .to_str() + .with_context(|| { + format!("Token path {token_path:?} cannot be represented as a unicode string") + })? + .to_owned(); + args.extend(["--auth-token-path".to_owned(), token_path_str]); + } + args.extend_from_slice(extra_opts); + let env_variables = Vec::new(); background_process::start_process( &format!("safekeeper-{id}"), &datadir, &self.env.safekeeper_bin(), &args, - self.safekeeper_env_variables()?, + env_variables, background_process::InitialPidFile::Expect(self.pid_file()), retry_timeout, || async { @@ -239,18 +262,6 @@ impl SafekeeperNode { .await } - fn safekeeper_env_variables(&self) -> anyhow::Result> { - // Generate a token to connect from safekeeper to peers - if self.conf.auth_enabled { - let token = self - .env - .generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?; - Ok(vec![("SAFEKEEPER_AUTH_TOKEN".to_owned(), token)]) - } else { - Ok(Vec::new()) - } - } - /// /// Stop the server. /// diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index a36815d27e..755d67a7ad 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -10,7 +10,8 @@ use camino::{Utf8Path, Utf8PathBuf}; use hyper0::Uri; use nix::unistd::Pid; use pageserver_api::controller_api::{ - NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest, + NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, + SafekeeperSchedulingPolicyRequest, SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, TenantLocateResponse, }; use pageserver_api::models::{ @@ -20,7 +21,7 @@ use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api::ResponseErrorMessageExt; use pem::Pem; use postgres_backend::AuthType; -use reqwest::Method; +use reqwest::{Method, Response}; use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; use tokio::process::Command; @@ -570,6 +571,11 @@ impl StorageController { let peer_jwt_token = encode_from_key_file(&peer_claims, private_key) .expect("failed to generate jwt token"); args.push(format!("--peer-jwt-token={peer_jwt_token}")); + + let claims = Claims::new(None, Scope::SafekeeperData); + let jwt_token = + encode_from_key_file(&claims, private_key).expect("failed to generate jwt token"); + args.push(format!("--safekeeper-jwt-token={jwt_token}")); } if let Some(public_key) = &self.public_key { @@ -614,6 +620,10 @@ impl StorageController { self.env.base_data_dir.display() )); + if self.env.safekeepers.iter().any(|sk| sk.auth_enabled) && self.private_key.is_none() { + anyhow::bail!("Safekeeper set up for auth but no private key specified"); + } + if self.config.timelines_onto_safekeepers { args.push("--timelines-onto-safekeepers".to_string()); } @@ -640,6 +650,10 @@ impl 
StorageController { ) .await?; + if self.config.timelines_onto_safekeepers { + self.register_safekeepers().await?; + } + Ok(()) } @@ -743,6 +757,23 @@ impl StorageController { where RQ: Serialize + Sized, RS: DeserializeOwned + Sized, + { + let response = self.dispatch_inner(method, path, body).await?; + Ok(response + .json() + .await + .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?) + } + + /// Simple HTTP request wrapper for calling into storage controller + async fn dispatch_inner( + &self, + method: reqwest::Method, + path: String, + body: Option, + ) -> anyhow::Result + where + RQ: Serialize + Sized, { // In the special case of the `storage_controller start` subcommand, we wish // to use the API endpoint of the newly started storage controller in order @@ -785,10 +816,31 @@ impl StorageController { let response = builder.send().await?; let response = response.error_from_body().await?; - Ok(response - .json() - .await - .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?) + Ok(response) + } + + /// Register the safekeepers in the storage controller + #[instrument(skip(self))] + async fn register_safekeepers(&self) -> anyhow::Result<()> { + for sk in self.env.safekeepers.iter() { + let sk_id = sk.id; + let body = serde_json::json!({ + "id": sk_id, + "created_at": "2023-10-25T09:11:25Z", + "updated_at": "2024-08-28T11:32:43Z", + "region_id": "aws-us-east-2", + "host": "127.0.0.1", + "port": sk.pg_port, + "http_port": sk.http_port, + "https_port": sk.https_port, + "version": 5957, + "availability_zone_id": format!("us-east-2b-{sk_id}"), + }); + self.upsert_safekeeper(sk_id, body).await?; + self.safekeeper_scheduling_policy(sk_id, SkSchedulingPolicy::Active) + .await?; + } + Ok(()) } /// Call into the attach_hook API, for use before handing out attachments to pageservers @@ -816,6 +868,42 @@ impl StorageController { Ok(response.generation) } + #[instrument(skip(self))] + pub async fn upsert_safekeeper( + &self, + node_id: NodeId, + request: serde_json::Value, + ) -> anyhow::Result<()> { + let resp = self + .dispatch_inner::( + Method::POST, + format!("control/v1/safekeeper/{node_id}"), + Some(request), + ) + .await?; + if !resp.status().is_success() { + anyhow::bail!( + "setting scheduling policy unsuccessful for safekeeper {node_id}: {}", + resp.status() + ); + } + Ok(()) + } + + #[instrument(skip(self))] + pub async fn safekeeper_scheduling_policy( + &self, + node_id: NodeId, + scheduling_policy: SkSchedulingPolicy, + ) -> anyhow::Result<()> { + self.dispatch::( + Method::POST, + format!("control/v1/safekeeper/{node_id}/scheduling_policy"), + Some(SafekeeperSchedulingPolicyRequest { scheduling_policy }), + ) + .await + } + #[instrument(skip(self))] pub async fn inspect( &self, diff --git a/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out b/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out index ca54864ecd..ff6a7404cb 100644 --- a/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out +++ b/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out @@ -12,6 +12,7 @@ ERROR: invalid JWT encoding -- Test creating a session with an expired JWT SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjE3NDI1NjQ0MzIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MjQyNDIsInN1YiI6InVzZXIxMjMifQ.A6FwKuaSduHB9O7Gz37g0uoD_U9qVS0JNtT7YABGVgB7HUD1AMFc9DeyhNntWBqncg8k5brv-hrNTuUh5JYMAw'); ERROR: Token used after it has expired +DETAIL: exp=1742564432 -- Test creating a session with a valid JWT SELECT 
auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjQ4OTYxNjQyNTIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MzQzNDMsInN1YiI6InVzZXIxMjMifQ.2TXVgjb6JSUq6_adlvp-m_SdOxZSyGS30RS9TLB0xu2N83dMSs2NybwE1NMU8Fb0tcAZR_ET7M2rSxbTrphfCg'); jwt_session_init diff --git a/docker-compose/pageserver_config/pageserver.toml b/docker-compose/pageserver_config/pageserver.toml index 76935453b6..7d603b6c65 100644 --- a/docker-compose/pageserver_config/pageserver.toml +++ b/docker-compose/pageserver_config/pageserver.toml @@ -3,3 +3,5 @@ pg_distrib_dir='/usr/local/' listen_pg_addr='0.0.0.0:6400' listen_http_addr='0.0.0.0:9898' remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' } +control_plane_api='http://0.0.0.0:6666' # No storage controller in docker compose, specify a junk address +control_plane_emergency_mode=true diff --git a/docs/consumption_metrics.md b/docs/consumption_metrics.md index 6bcd28ab10..eb211af646 100644 --- a/docs/consumption_metrics.md +++ b/docs/consumption_metrics.md @@ -38,11 +38,6 @@ Currently, the following metrics are collected: Amount of WAL produced , by a timeline, i.e. last_record_lsn This is an absolute, per-timeline metric. -- `resident_size` - -Size of all the layer files in the tenant's directory on disk on the pageserver. -This is an absolute, per-tenant metric. - - `remote_storage_size` Size of the remote storage (S3) directory. diff --git a/endpoint_storage/src/app.rs b/endpoint_storage/src/app.rs index f07ef06328..0bd7fe5f28 100644 --- a/endpoint_storage/src/app.rs +++ b/endpoint_storage/src/app.rs @@ -343,7 +343,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH TimelineId::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 7]); const ENDPOINT_ID: &str = "ep-winter-frost-a662z3vg"; fn token() -> String { - let claims = endpoint_storage::Claims { + let claims = endpoint_storage::claims::EndpointStorageClaims { tenant_id: TENANT_ID, timeline_id: TIMELINE_ID, endpoint_id: ENDPOINT_ID.into(), @@ -489,16 +489,8 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH } fn delete_prefix_token(uri: &str) -> String { - use serde::Serialize; let parts = uri.split("/").collect::>(); - #[derive(Serialize)] - struct PrefixClaims { - tenant_id: TenantId, - timeline_id: Option, - endpoint_id: Option, - exp: u64, - } - let claims = PrefixClaims { + let claims = endpoint_storage::claims::DeletePrefixClaims { tenant_id: parts.get(1).map(|c| c.parse().unwrap()).unwrap(), timeline_id: parts.get(2).map(|c| c.parse().unwrap()), endpoint_id: parts.get(3).map(ToString::to_string), diff --git a/endpoint_storage/src/claims.rs b/endpoint_storage/src/claims.rs new file mode 100644 index 0000000000..ef0f0eb0b4 --- /dev/null +++ b/endpoint_storage/src/claims.rs @@ -0,0 +1,52 @@ +use serde::{Deserialize, Serialize}; +use std::fmt::Display; +use utils::id::{EndpointId, TenantId, TimelineId}; + +/// Claims to add, remove, or retrieve endpoint data. Used by compute_ctl +#[derive(Deserialize, Serialize, PartialEq)] +pub struct EndpointStorageClaims { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub endpoint_id: EndpointId, + pub exp: u64, +} + +/// Claims to remove tenant, timeline, or endpoint data. 
Used by control plane +#[derive(Deserialize, Serialize, PartialEq)] +pub struct DeletePrefixClaims { + pub tenant_id: TenantId, + /// None when tenant is deleted (endpoint_id is also None in this case) + pub timeline_id: Option, + /// None when timeline is deleted + pub endpoint_id: Option, + pub exp: u64, +} + +impl Display for EndpointStorageClaims { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "EndpointClaims(tenant_id={} timeline_id={} endpoint_id={} exp={})", + self.tenant_id, self.timeline_id, self.endpoint_id, self.exp + ) + } +} + +impl Display for DeletePrefixClaims { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "DeletePrefixClaims(tenant_id={} timeline_id={} endpoint_id={}, exp={})", + self.tenant_id, + self.timeline_id + .as_ref() + .map(ToString::to_string) + .unwrap_or("".to_string()), + self.endpoint_id + .as_ref() + .map(ToString::to_string) + .unwrap_or("".to_string()), + self.exp + ) + } +} diff --git a/endpoint_storage/src/lib.rs b/endpoint_storage/src/lib.rs index eb6b80c487..d1625dc843 100644 --- a/endpoint_storage/src/lib.rs +++ b/endpoint_storage/src/lib.rs @@ -1,3 +1,5 @@ +pub mod claims; +use crate::claims::{DeletePrefixClaims, EndpointStorageClaims}; use anyhow::Result; use axum::extract::{FromRequestParts, Path}; use axum::response::{IntoResponse, Response}; @@ -13,7 +15,7 @@ use std::result::Result as StdResult; use std::sync::Arc; use tokio_util::sync::CancellationToken; use tracing::{debug, error}; -use utils::id::{TenantId, TimelineId}; +use utils::id::{EndpointId, TenantId, TimelineId}; // simplified version of utils::auth::JwtAuth pub struct JwtAuth { @@ -79,26 +81,6 @@ pub struct Storage { pub max_upload_file_limit: usize, } -pub type EndpointId = String; // If needed, reuse small string from proxy/src/types.rc - -#[derive(Deserialize, Serialize, PartialEq)] -pub struct Claims { - pub tenant_id: TenantId, - pub timeline_id: TimelineId, - pub endpoint_id: EndpointId, - pub exp: u64, -} - -impl Display for Claims { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "Claims(tenant_id {} timeline_id {} endpoint_id {} exp {})", - self.tenant_id, self.timeline_id, self.endpoint_id, self.exp - ) - } -} - #[derive(Deserialize, Serialize)] struct KeyRequest { tenant_id: TenantId, @@ -107,6 +89,13 @@ struct KeyRequest { path: String, } +#[derive(Deserialize, Serialize, PartialEq)] +struct PrefixKeyRequest { + tenant_id: TenantId, + timeline_id: Option, + endpoint_id: Option, +} + #[derive(Debug, PartialEq)] pub struct S3Path { pub path: RemotePath, @@ -165,7 +154,7 @@ impl FromRequestParts> for S3Path { .extract::>>() .await .map_err(|e| bad_request(e, "invalid token"))?; - let claims: Claims = state + let claims: EndpointStorageClaims = state .auth .decode(bearer.token()) .map_err(|e| bad_request(e, "decoding token"))?; @@ -178,7 +167,7 @@ impl FromRequestParts> for S3Path { path.endpoint_id.clone() }; - let route = Claims { + let route = EndpointStorageClaims { tenant_id: path.tenant_id, timeline_id: path.timeline_id, endpoint_id, @@ -193,38 +182,13 @@ impl FromRequestParts> for S3Path { } } -#[derive(Deserialize, Serialize, PartialEq)] -pub struct PrefixKeyPath { - pub tenant_id: TenantId, - pub timeline_id: Option, - pub endpoint_id: Option, -} - -impl Display for PrefixKeyPath { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "PrefixKeyPath(tenant_id {} timeline_id {} endpoint_id {})", - 
self.tenant_id, - self.timeline_id - .as_ref() - .map(ToString::to_string) - .unwrap_or("".to_string()), - self.endpoint_id - .as_ref() - .map(ToString::to_string) - .unwrap_or("".to_string()) - ) - } -} - #[derive(Debug, PartialEq)] pub struct PrefixS3Path { pub path: RemotePath, } -impl From<&PrefixKeyPath> for PrefixS3Path { - fn from(path: &PrefixKeyPath) -> Self { +impl From<&DeletePrefixClaims> for PrefixS3Path { + fn from(path: &DeletePrefixClaims) -> Self { let timeline_id = path .timeline_id .as_ref() @@ -250,21 +214,27 @@ impl FromRequestParts> for PrefixS3Path { state: &Arc, ) -> Result { let Path(path) = parts - .extract::>() + .extract::>() .await .map_err(|e| bad_request(e, "invalid route"))?; let TypedHeader(Authorization(bearer)) = parts .extract::>>() .await .map_err(|e| bad_request(e, "invalid token"))?; - let claims: PrefixKeyPath = state + let claims: DeletePrefixClaims = state .auth .decode(bearer.token()) .map_err(|e| bad_request(e, "invalid token"))?; - if path != claims { - return Err(unauthorized(path, claims)); + let route = DeletePrefixClaims { + tenant_id: path.tenant_id, + timeline_id: path.timeline_id, + endpoint_id: path.endpoint_id, + exp: claims.exp, + }; + if route != claims { + return Err(unauthorized(route, claims)); } - Ok((&path).into()) + Ok((&route).into()) } } @@ -297,7 +267,7 @@ mod tests { #[test] fn s3_path() { - let auth = Claims { + let auth = EndpointStorageClaims { tenant_id: TENANT_ID, timeline_id: TIMELINE_ID, endpoint_id: ENDPOINT_ID.into(), @@ -327,10 +297,11 @@ mod tests { #[test] fn prefix_s3_path() { - let mut path = PrefixKeyPath { + let mut path = DeletePrefixClaims { tenant_id: TENANT_ID, timeline_id: None, endpoint_id: None, + exp: 0, }; let prefix_path = |s: String| RemotePath::from_string(&s).unwrap(); assert_eq!( diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs index 98f2fc297c..bbab271474 100644 --- a/libs/compute_api/src/requests.rs +++ b/libs/compute_api/src/requests.rs @@ -1,16 +1,58 @@ //! Structs representing the JSON formats used in the compute_ctl's HTTP API. +use std::str::FromStr; + use serde::{Deserialize, Serialize}; use crate::privilege::Privilege; use crate::responses::ComputeCtlConfig; use crate::spec::{ComputeSpec, ExtVersion, PgIdent}; +/// The value to place in the [`ComputeClaims::audience`] claim. +pub static COMPUTE_AUDIENCE: &str = "compute"; + +/// Available scopes for a compute's JWT. +#[derive(Copy, Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum ComputeClaimsScope { + /// An admin-scoped token allows access to all of `compute_ctl`'s authorized + /// facilities. + Admin, +} + +impl FromStr for ComputeClaimsScope { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "admin" => Ok(ComputeClaimsScope::Admin), + _ => Err(anyhow::anyhow!("invalid compute claims scope \"{s}\"")), + } + } +} + /// When making requests to the `compute_ctl` external HTTP server, the client /// must specify a set of claims in `Authorization` header JWTs such that /// `compute_ctl` can authorize the request. #[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(rename = "snake_case")] pub struct ComputeClaims { - pub compute_id: String, + /// The compute ID that will validate the token. The only case in which this + /// can be [`None`] is if [`Self::scope`] is + /// [`ComputeClaimsScope::Admin`]. + pub compute_id: Option, + + /// The scope of what the token authorizes. 
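+    ///
+    /// Illustrative only: an admin-scoped token could carry a claim set such as
+    /// `{"scope": "admin", "aud": ["compute"]}` and omit `compute_id` entirely,
+    /// whereas a non-admin token must identify a single compute via `compute_id`.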
+ pub scope: Option, + + /// The recipient the token is intended for. + /// + /// See [RFC 7519](https://www.rfc-editor.org/rfc/rfc7519#section-4.1.3) for + /// more information. + /// + /// TODO: Remove the [`Option`] wrapper when control plane learns to send + /// the claim. + #[serde(rename = "aud")] + pub audience: Option>, } /// Request of the /configure API diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index b7d6b7ca34..24d371c6eb 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -46,6 +46,30 @@ pub struct ExtensionInstallResponse { pub version: ExtVersion, } +#[derive(Serialize, Default, Debug, Clone)] +#[serde(tag = "status", rename_all = "snake_case")] +pub enum LfcPrewarmState { + #[default] + NotPrewarmed, + Prewarming, + Completed, + Failed { + error: String, + }, +} + +#[derive(Serialize, Default, Debug, Clone)] +#[serde(tag = "status", rename_all = "snake_case")] +pub enum LfcOffloadState { + #[default] + NotOffloaded, + Offloading, + Completed, + Failed { + error: String, + }, +} + /// Response of the /status API #[derive(Serialize, Debug, Deserialize)] #[serde(rename_all = "snake_case")] diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index ad246c48ec..09b550b96c 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -172,6 +172,15 @@ pub struct ComputeSpec { /// Hostname and the port of the otel collector. Leave empty to disable Postgres logs forwarding. /// Example: config-shy-breeze-123-collector-monitoring.neon-telemetry.svc.cluster.local:10514 pub logs_export_host: Option, + + /// Address of endpoint storage service + pub endpoint_storage_addr: Option, + /// JWT for authorizing requests to endpoint storage service + pub endpoint_storage_token: Option, + + /// If true, download LFC state from endpoint_storage and pass it to Postgres on startup + #[serde(default)] + pub prewarm_lfc_on_startup: bool, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. diff --git a/libs/compute_api/tests/cluster_spec.json b/libs/compute_api/tests/cluster_spec.json index 37de24be5b..30e788a601 100644 --- a/libs/compute_api/tests/cluster_spec.json +++ b/libs/compute_api/tests/cluster_spec.json @@ -84,6 +84,11 @@ "value": "on", "vartype": "bool" }, + { + "name": "prewarm_lfc_on_startup", + "value": "off", + "vartype": "bool" + }, { "name": "neon.safekeepers", "value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501", diff --git a/libs/metrics/src/more_process_metrics.rs b/libs/metrics/src/more_process_metrics.rs index 13a745e031..f91800685f 100644 --- a/libs/metrics/src/more_process_metrics.rs +++ b/libs/metrics/src/more_process_metrics.rs @@ -16,6 +16,7 @@ pub struct Collector { const NMETRICS: usize = 2; static CLK_TCK_F64: Lazy = Lazy::new(|| { + // SAFETY: libc::sysconf is safe, it merely returns a value. 
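+    // _SC_CLK_TCK reports the kernel's USER_HZ value (clock ticks per second, typically
+    // 100 on Linux). CPU times exposed via /proc are measured in these ticks, so dividing
+    // by this value converts them to seconds.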
let long = unsafe { libc::sysconf(libc::_SC_CLK_TCK) }; if long == -1 { panic!("sysconf(_SC_CLK_TCK) failed"); diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index b64c42a808..5b0c13dd89 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -182,6 +182,7 @@ pub struct ConfigToml { pub tracing: Option, pub enable_tls_page_service_api: bool, pub dev_mode: bool, + pub timeline_import_config: TimelineImportConfig, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -300,6 +301,12 @@ impl From for tracing_utils::Protocol { } } +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct TimelineImportConfig { + pub import_job_concurrency: NonZeroUsize, + pub import_job_soft_size_limit: NonZeroUsize, +} + pub mod statvfs { pub mod mock { #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -659,6 +666,10 @@ impl Default for ConfigToml { tracing: None, enable_tls_page_service_api: false, dev_mode: false, + timeline_import_config: TimelineImportConfig { + import_job_concurrency: NonZeroUsize::new(128).unwrap(), + import_job_soft_size_limit: NonZeroUsize::new(1024 * 1024 * 1024).unwrap(), + }, } } } diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 654dde8da6..714d8ac403 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -841,6 +841,10 @@ impl PostgresBackend { let expected_end = match &end { ServerInitiated(_) | CopyDone | CopyFail | Terminate | EOF | Cancelled => true, + // The timeline doesn't exist and we have been requested to not auto-create it. + // Compute requests for timelines that haven't been created yet + // might reach us before the storcon request to create those timelines. + TimelineNoCreate => true, CopyStreamHandlerEnd::Disconnected(ConnectionError::Io(io_error)) if is_expected_io_error(io_error) => { @@ -1059,6 +1063,8 @@ pub enum CopyStreamHandlerEnd { Terminate, #[error("EOF on COPY stream")] EOF, + #[error("timeline not found, and allow_timeline_creation is false")] + TimelineNoCreate, /// The connection was lost #[error("connection error: {0}")] Disconnected(#[from] ConnectionError), diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 51f88625da..cc31b38fe7 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -303,7 +303,8 @@ pub struct PullTimelineRequest { #[derive(Debug, Serialize, Deserialize)] pub struct PullTimelineResponse { - // Donor safekeeper host - pub safekeeper_host: String, + /// Donor safekeeper host. + /// None if no pull happened because the timeline already exists. + pub safekeeper_host: Option, // TODO: add more fields? } diff --git a/libs/tenant_size_model/src/calculation.rs b/libs/tenant_size_model/src/calculation.rs index d54876ba2c..77e72035ee 100644 --- a/libs/tenant_size_model/src/calculation.rs +++ b/libs/tenant_size_model/src/calculation.rs @@ -77,7 +77,9 @@ impl StorageModel { } SizeResult { - total_size, + // If total_size is 0, it means that the tenant has all timelines offloaded; we need to report 1 + // here so that the data point shows up in the s3 files. 
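+            // (e.g. 0 becomes 1, while any non-zero size passes through unchanged. This mirrors
+            // how the removed per-tenant resident_size metric was force-set to 1 byte for
+            // fully-offloaded tenants so that a data point is still emitted.)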
+ total_size: total_size.max(1), segments: segment_results, } } diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index 6016c23a01..68cb1f0209 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -295,6 +295,9 @@ pub struct TenantId(Id); id_newtype!(TenantId); +/// If needed, reuse small string from proxy/src/types.rc +pub type EndpointId = String; + // A pair uniquely identifying Neon instance. #[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct TenantTimelineId { diff --git a/libs/utils/src/rate_limit.rs b/libs/utils/src/rate_limit.rs index 945f710b1d..700cd5792b 100644 --- a/libs/utils/src/rate_limit.rs +++ b/libs/utils/src/rate_limit.rs @@ -17,7 +17,7 @@ impl std::fmt::Display for RateLimitStats { } impl RateLimit { - pub fn new(interval: Duration) -> Self { + pub const fn new(interval: Duration) -> Self { Self { last: None, interval, diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index 293c01eff0..79f56a5a51 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -10,6 +10,7 @@ use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer, delta_layer, ima use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use pageserver::virtual_file::api::IoMode; use pageserver::{page_cache, virtual_file}; +use pageserver_api::key::Key; use utils::id::{TenantId, TimelineId}; use crate::layer_map_analyzer::parse_filename; @@ -27,6 +28,7 @@ pub(crate) enum LayerCmd { path: PathBuf, tenant: String, timeline: String, + key: Option, }, /// Dump all information of a layer file DumpLayer { @@ -100,6 +102,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { path, tenant, timeline, + key, } => { let timeline_path = path .join(TENANTS_SEGMENT_NAME) @@ -107,21 +110,37 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { .join(TIMELINES_SEGMENT_NAME) .join(timeline); let mut idx = 0; + let mut to_print = Vec::default(); for layer in fs::read_dir(timeline_path)? 
{ let layer = layer?; if let Ok(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) { - println!( - "[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}", - idx, - layer_file.key_range.start, - layer_file.key_range.end, - layer_file.lsn_range.start, - layer_file.lsn_range.end, - layer_file.is_delta, - ); + if let Some(key) = key { + if layer_file.key_range.start <= *key && *key < layer_file.key_range.end { + to_print.push((idx, layer_file)); + } + } else { + to_print.push((idx, layer_file)); + } idx += 1; } } + + if key.is_some() { + to_print + .sort_by_key(|(_idx, layer_file)| std::cmp::Reverse(layer_file.lsn_range.end)); + } + + for (idx, layer_file) in to_print { + println!( + "[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}", + idx, + layer_file.key_range.start, + layer_file.key_range.end, + layer_file.lsn_range.start, + layer_file.lsn_range.end, + layer_file.is_delta, + ); + } Ok(()) } LayerCmd::DumpLayer { diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 6cfaec955b..4c2572a577 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -504,7 +504,7 @@ fn start_pageserver( // Set up deletion queue let (deletion_queue, deletion_workers) = DeletionQueue::new( remote_storage.clone(), - StorageControllerUpcallClient::new(conf, &shutdown_pageserver)?, + StorageControllerUpcallClient::new(conf, &shutdown_pageserver), conf, ); deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle()); diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 95143e58b7..7e773f56b3 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -150,7 +150,7 @@ pub struct PageServerConf { /// not terrible. pub background_task_maximum_delay: Duration, - pub control_plane_api: Option, + pub control_plane_api: Url, /// JWT token for use with the control plane API. pub control_plane_api_token: Option, @@ -230,6 +230,8 @@ pub struct PageServerConf { /// such as authentication requirements for HTTP and PostgreSQL APIs. /// This is insecure and should only be used in development environments. 
pub dev_mode: bool, + + pub timeline_import_config: pageserver_api::config::TimelineImportConfig, } /// Token for authentication to safekeepers @@ -404,6 +406,7 @@ impl PageServerConf { tracing, enable_tls_page_service_api, dev_mode, + timeline_import_config, } = config_toml; let mut conf = PageServerConf { @@ -438,7 +441,8 @@ impl PageServerConf { test_remote_failures, ondemand_download_behavior_treat_error_as_warn, background_task_maximum_delay, - control_plane_api, + control_plane_api: control_plane_api + .ok_or_else(|| anyhow::anyhow!("`control_plane_api` must be set"))?, control_plane_emergency_mode, heatmap_upload_concurrency, secondary_download_concurrency, @@ -456,6 +460,7 @@ impl PageServerConf { tracing, enable_tls_page_service_api, dev_mode, + timeline_import_config, // ------------------------------------------------------------ // fields that require additional validation or custom handling @@ -573,6 +578,7 @@ impl PageServerConf { background_task_maximum_delay: Duration::ZERO, load_previous_heatmap: Some(true), generate_unarchival_heatmap: Some(true), + control_plane_api: Some(Url::parse("http://localhost:6666").unwrap()), ..Default::default() }; PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap() @@ -641,9 +647,12 @@ mod tests { use super::PageServerConf; #[test] - fn test_empty_config_toml_is_valid() { - // we use Default impl of everything in this situation + fn test_minimal_config_toml_is_valid() { + // The minimal valid config for running a pageserver: + // - control_plane_api is mandatory, as pageservers cannot run in isolation + // - we use Default impl of everything else in this situation let input = r#" + control_plane_api = "http://localhost:6666" "#; let config_toml = toml_edit::de::from_str::(input) .expect("empty config is valid"); diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 08ab69f349..acdf514101 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -30,9 +30,6 @@ pub(super) enum Name { /// Tenant remote size #[serde(rename = "remote_storage_size")] RemoteSize, - /// Tenant resident size - #[serde(rename = "resident_size")] - ResidentSize, /// Tenant synthetic size #[serde(rename = "synthetic_storage_size")] SyntheticSize, @@ -187,18 +184,6 @@ impl MetricsKey { .absolute_values() } - /// Sum of [`Timeline::resident_physical_size`] for each `Tenant`. - /// - /// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size - const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory { - MetricsKey { - tenant_id, - timeline_id: None, - metric: Name::ResidentSize, - } - .absolute_values() - } - /// [`TenantShard::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`]. /// /// [`TenantShard::cached_synthetic_size`]: crate::tenant::TenantShard::cached_synthetic_size @@ -261,10 +246,7 @@ where let mut tenants = std::pin::pin!(tenants); while let Some((tenant_id, tenant)) = tenants.next().await { - let mut tenant_resident_size = 0; - let timelines = tenant.list_timelines(); - let timelines_len = timelines.len(); for timeline in timelines { let timeline_id = timeline.timeline_id; @@ -287,16 +269,9 @@ where continue; } } - - tenant_resident_size += timeline.resident_physical_size(); } - if timelines_len == 0 { - // Force set it to 1 byte to avoid not being reported -- all timelines are offloaded. 
- tenant_resident_size = 1; - } - - let snap = TenantSnapshot::collect(&tenant, tenant_resident_size); + let snap = TenantSnapshot::collect(&tenant); snap.to_metrics(tenant_id, Utc::now(), cache, &mut current_metrics); } @@ -305,19 +280,14 @@ where /// In-between abstraction to allow testing metrics without actual Tenants. struct TenantSnapshot { - resident_size: u64, remote_size: u64, synthetic_size: u64, } impl TenantSnapshot { /// Collect tenant status to have metrics created out of it. - /// - /// `resident_size` is calculated of the timelines we had access to for other metrics, so we - /// cannot just list timelines here. - fn collect(t: &Arc, resident_size: u64) -> Self { + fn collect(t: &Arc) -> Self { TenantSnapshot { - resident_size, remote_size: t.remote_size(), // Note that this metric is calculated in a separate bgworker // Here we only use cached value, which may lag behind the real latest one @@ -334,8 +304,6 @@ impl TenantSnapshot { ) { let remote_size = MetricsKey::remote_storage_size(tenant_id).at(now, self.remote_size); - let resident_size = MetricsKey::resident_size(tenant_id).at(now, self.resident_size); - let synthetic_size = { let factory = MetricsKey::synthetic_size(tenant_id); let mut synthetic_size = self.synthetic_size; @@ -355,11 +323,7 @@ impl TenantSnapshot { } }; - metrics.extend( - [Some(remote_size), Some(resident_size), synthetic_size] - .into_iter() - .flatten(), - ); + metrics.extend([Some(remote_size), synthetic_size].into_iter().flatten()); } } diff --git a/pageserver/src/consumption_metrics/metrics/tests.rs b/pageserver/src/consumption_metrics/metrics/tests.rs index 52b4fb8680..5cfb361e40 100644 --- a/pageserver/src/consumption_metrics/metrics/tests.rs +++ b/pageserver/src/consumption_metrics/metrics/tests.rs @@ -224,7 +224,6 @@ fn post_restart_synthetic_size_uses_cached_if_available() { let tenant_id = TenantId::generate(); let ts = TenantSnapshot { - resident_size: 1000, remote_size: 1000, // not yet calculated synthetic_size: 0, @@ -245,7 +244,6 @@ fn post_restart_synthetic_size_uses_cached_if_available() { metrics, &[ MetricsKey::remote_storage_size(tenant_id).at(now, 1000), - MetricsKey::resident_size(tenant_id).at(now, 1000), MetricsKey::synthetic_size(tenant_id).at(now, 1000), ] ); @@ -256,7 +254,6 @@ fn post_restart_synthetic_size_is_not_sent_when_not_cached() { let tenant_id = TenantId::generate(); let ts = TenantSnapshot { - resident_size: 1000, remote_size: 1000, // not yet calculated synthetic_size: 0, @@ -274,7 +271,6 @@ fn post_restart_synthetic_size_is_not_sent_when_not_cached() { metrics, &[ MetricsKey::remote_storage_size(tenant_id).at(now, 1000), - MetricsKey::resident_size(tenant_id).at(now, 1000), // no synthetic size here ] ); @@ -295,14 +291,13 @@ pub(crate) const fn metric_examples_old( timeline_id: TimelineId, now: DateTime, before: DateTime, -) -> [RawMetric; 6] { +) -> [RawMetric; 5] { [ MetricsKey::written_size(tenant_id, timeline_id).at_old_format(now, 0), MetricsKey::written_size_delta(tenant_id, timeline_id) .from_until_old_format(before, now, 0), MetricsKey::timeline_logical_size(tenant_id, timeline_id).at_old_format(now, 0), MetricsKey::remote_storage_size(tenant_id).at_old_format(now, 0), - MetricsKey::resident_size(tenant_id).at_old_format(now, 0), MetricsKey::synthetic_size(tenant_id).at_old_format(now, 1), ] } @@ -312,13 +307,12 @@ pub(crate) const fn metric_examples( timeline_id: TimelineId, now: DateTime, before: DateTime, -) -> [NewRawMetric; 6] { +) -> [NewRawMetric; 5] { [ MetricsKey::written_size(tenant_id, 
timeline_id).at(now, 0), MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0), MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0), MetricsKey::remote_storage_size(tenant_id).at(now, 0), - MetricsKey::resident_size(tenant_id).at(now, 0), MetricsKey::synthetic_size(tenant_id).at(now, 1), ] } diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 59e0145a5b..19c5aec5b3 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -521,10 +521,6 @@ mod tests { line!(), r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"remote_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#, ), - ( - line!(), - r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"resident_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#, - ), ( line!(), r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"synthetic_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":1,"tenant_id":"00000000000000000000000000000000"}"#, @@ -564,7 +560,7 @@ mod tests { assert_eq!(upgraded_samples, new_samples); } - fn metric_samples_old() -> [RawMetric; 6] { + fn metric_samples_old() -> [RawMetric; 5] { let tenant_id = TenantId::from_array([0; 16]); let timeline_id = TimelineId::from_array([0xff; 16]); @@ -576,7 +572,7 @@ mod tests { super::super::metrics::metric_examples_old(tenant_id, timeline_id, now, before) } - fn metric_samples() -> [NewRawMetric; 6] { + fn metric_samples() -> [NewRawMetric; 5] { let tenant_id = TenantId::from_array([0; 16]); let timeline_id = TimelineId::from_array([0xff; 16]); diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index 59c94f1549..468e5463b0 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -58,14 +58,8 @@ pub trait StorageControllerUpcallApi { impl StorageControllerUpcallClient { /// A None return value indicates that the input `conf` object does not have control /// plane API enabled. - pub fn new( - conf: &'static PageServerConf, - cancel: &CancellationToken, - ) -> Result, reqwest::Error> { - let mut url = match conf.control_plane_api.as_ref() { - Some(u) => u.clone(), - None => return Ok(None), - }; + pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Self { + let mut url = conf.control_plane_api.clone(); if let Ok(mut segs) = url.path_segments_mut() { // This ensures that `url` ends with a slash if it doesn't already. 
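+        // (With the `url` crate, a base URL without a trailing slash treats its last path
+        //  segment as a "file" that join() replaces: "http://host/a/b".join("c") yields
+        //  "http://host/a/c", whereas "http://host/a/b/".join("c") yields "http://host/a/b/c".)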
@@ -85,15 +79,17 @@ impl StorageControllerUpcallClient { } for cert in &conf.ssl_ca_certs { - client = client.add_root_certificate(Certificate::from_der(cert.contents())?); + client = client.add_root_certificate( + Certificate::from_der(cert.contents()).expect("Invalid certificate in config"), + ); } - Ok(Some(Self { - http_client: client.build()?, + Self { + http_client: client.build().expect("Failed to construct HTTP client"), base_url: url, node_id: conf.id, cancel: cancel.clone(), - })) + } } #[tracing::instrument(skip_all)] diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 6dd7d741c1..4d62bc4ab5 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -585,7 +585,7 @@ impl DeletionQueue { /// we don't spawn those inside new() so that the caller can use their runtime/spans of choice. pub fn new( remote_storage: GenericRemoteStorage, - controller_upcall_client: Option, + controller_upcall_client: C, conf: &'static PageServerConf, ) -> (Self, DeletionQueueWorkers) where @@ -701,7 +701,7 @@ mod test { async fn restart(&mut self) { let (deletion_queue, workers) = DeletionQueue::new( self.storage.clone(), - Some(self.mock_control_plane.clone()), + self.mock_control_plane.clone(), self.harness.conf, ); @@ -821,11 +821,8 @@ mod test { let mock_control_plane = MockStorageController::new(); - let (deletion_queue, worker) = DeletionQueue::new( - storage.clone(), - Some(mock_control_plane.clone()), - harness.conf, - ); + let (deletion_queue, worker) = + DeletionQueue::new(storage.clone(), mock_control_plane.clone(), harness.conf); let worker_join = worker.spawn_with(&tokio::runtime::Handle::current()); diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs index 4e775f15eb..363b1427f5 100644 --- a/pageserver/src/deletion_queue/validator.rs +++ b/pageserver/src/deletion_queue/validator.rs @@ -53,7 +53,7 @@ where tx: tokio::sync::mpsc::Sender, // Client for calling into control plane API for validation of deletes - controller_upcall_client: Option, + controller_upcall_client: C, // DeletionLists which are waiting generation validation. Not safe to // execute until [`validate`] has processed them. @@ -86,7 +86,7 @@ where conf: &'static PageServerConf, rx: tokio::sync::mpsc::Receiver, tx: tokio::sync::mpsc::Sender, - controller_upcall_client: Option, + controller_upcall_client: C, lsn_table: Arc>, cancel: CancellationToken, ) -> Self { @@ -137,20 +137,16 @@ where return Ok(()); } - let tenants_valid = if let Some(controller_upcall_client) = &self.controller_upcall_client { - match controller_upcall_client - .validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect()) - .await - { - Ok(tenants) => tenants, - Err(RetryForeverError::ShuttingDown) => { - // The only way a validation call returns an error is when the cancellation token fires - return Err(DeletionQueueError::ShuttingDown); - } + let tenants_valid = match self + .controller_upcall_client + .validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect()) + .await + { + Ok(tenants) => tenants, + Err(RetryForeverError::ShuttingDown) => { + // The only way a validation call returns an error is when the cancellation token fires + return Err(DeletionQueueError::ShuttingDown); } - } else { - // Control plane API disabled. In legacy mode we consider everything valid. 
- tenant_generations.keys().map(|k| (*k, true)).collect() }; let mut validated_sequence: Option = None; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index b16970c911..8e4dbd6c3e 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -497,6 +497,24 @@ pub(crate) static WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS: Lazy = Lazy::n .expect("failed to define a metric") }); +pub(crate) static ONDEMAND_DOWNLOAD_BYTES: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_ondemand_download_bytes_total", + "Total bytes of layers on-demand downloaded", + &["task_kind"] + ) + .expect("failed to define a metric") +}); + +pub(crate) static ONDEMAND_DOWNLOAD_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_ondemand_download_count", + "Total count of layers on-demand downloaded", + &["task_kind"] + ) + .expect("failed to define a metric") +}); + pub(crate) mod wait_ondemand_download_time { use super::*; const WAIT_ONDEMAND_DOWNLOAD_TIME_BUCKETS: &[f64] = &[ @@ -1774,8 +1792,12 @@ static SMGR_QUERY_STARTED_PER_TENANT_TIMELINE: Lazy = Lazy::new(| .expect("failed to define a metric") }); -// Alias so all histograms recording per-timeline smgr timings use the same buckets. -static SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS: &[f64] = CRITICAL_OP_BUCKETS; +/// Per-timeline smgr histogram buckets should be the same as the compute buckets, such that the +/// metrics are comparable across compute and Pageserver. See also: +/// +/// +static SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS: &[f64] = + &[0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.1, 1.0, 3.0]; static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { register_histogram_vec!( @@ -2176,6 +2198,10 @@ impl BasebackupQueryTimeOngoingRecording<'_> { // If you want to change categorize of a specific error, also change it in `log_query_error`. let metric = match res { Ok(_) => &self.parent.ok, + Err(QueryError::Shutdown) => { + // Do not observe ok/err for shutdown + return; + } Err(QueryError::Disconnected(ConnectionError::Io(io_error))) if is_expected_io_error(io_error) => { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d1a210a786..bca1cb5b49 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1035,10 +1035,27 @@ impl PageServerHandler { // avoid a somewhat costly Span::record() by constructing the entire span in one go. macro_rules! 
mkspan { (before shard routing) => {{ - tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn) + tracing::info_span!( + parent: &parent_span, + "handle_get_page_request", + request_id = %req.hdr.reqid, + rel = %req.rel, + blkno = %req.blkno, + req_lsn = %req.hdr.request_lsn, + not_modified_since_lsn = %req.hdr.not_modified_since, + ) }}; ($shard_id:expr) => {{ - tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, shard_id = %$shard_id) + tracing::info_span!( + parent: &parent_span, + "handle_get_page_request", + request_id = %req.hdr.reqid, + rel = %req.rel, + blkno = %req.blkno, + req_lsn = %req.hdr.request_lsn, + not_modified_since_lsn = %req.hdr.not_modified_since, + shard_id = %$shard_id, + ) }}; } @@ -1102,6 +1119,7 @@ impl PageServerHandler { shard_id = %shard.get_shard_identity().shard_slug(), timeline_id = %timeline_id, lsn = %req.hdr.request_lsn, + not_modified_since_lsn = %req.hdr.not_modified_since, request_id = %req.hdr.reqid, key = %key, ) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 81e548a095..d770946580 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -40,7 +40,7 @@ use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; use super::tenant::{PageReconstructError, Timeline}; use crate::aux_file; -use crate::context::{PerfInstrumentFutureExt, RequestContext}; +use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder}; use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::metrics::{ RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD, @@ -275,24 +275,30 @@ impl Timeline { continue; } - let nblocks = match self - .get_rel_size(*tag, Version::Lsn(lsn), &ctx) - .maybe_perf_instrument(&ctx, |crnt_perf_span| { - info_span!( - target: PERF_TRACE_TARGET, - parent: crnt_perf_span, - "GET_REL_SIZE", - reltag=%tag, - lsn=%lsn, - ) - }) - .await - { - Ok(nblocks) => nblocks, - Err(err) => { - result_slots[response_slot_idx].write(Err(err)); - slots_filled += 1; - continue; + let nblocks = { + let ctx = RequestContextBuilder::from(&ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "GET_REL_SIZE", + reltag=%tag, + lsn=%lsn, + ) + }) + .attached_child(); + + match self + .get_rel_size(*tag, Version::Lsn(lsn), &ctx) + .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) + .await + { + Ok(nblocks) => nblocks, + Err(err) => { + result_slots[response_slot_idx].write(Err(err)); + slots_filled += 1; + continue; + } } }; @@ -308,6 +314,17 @@ impl Timeline { let key = rel_block_to_key(*tag, *blknum); + let ctx = RequestContextBuilder::from(&ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "GET_BATCH", + batch_size = %page_count, + ) + }) + .attached_child(); + let key_slots = keys_slots.entry(key).or_default(); key_slots.push((response_slot_idx, ctx)); @@ -323,14 +340,7 @@ impl Timeline { let query = VersionedKeySpaceQuery::scattered(query); let res = self .get_vectored(query, io_concurrency, ctx) - .maybe_perf_instrument(ctx, |current_perf_span| { - info_span!( - target: PERF_TRACE_TARGET, - parent: current_perf_span, - "GET_BATCH", - batch_size = %page_count, - ) - }) + .maybe_perf_instrument(ctx, |current_perf_span| 
current_perf_span.clone()) .await; match res { @@ -1084,8 +1094,17 @@ impl Timeline { let mut result = HashMap::new(); for (k, v) in kv { let v = v?; + if v.is_empty() { + // This is a tombstone -- we can skip it. + // Originally, the replorigin code uses `Lsn::INVALID` to represent a tombstone. However, as it part of + // the sparse keyspace and the sparse keyspace uses an empty image to universally represent a tombstone, + // we also need to consider that. Such tombstones might be written on the detach ancestor code path to + // avoid the value going into the child branch. (See [`crate::tenant::timeline::detach_ancestor::generate_tombstone_image_layer`] for more details.) + continue; + } let origin_id = k.field6 as RepOriginId; - let origin_lsn = Lsn::des(&v).unwrap(); + let origin_lsn = Lsn::des(&v) + .with_context(|| format!("decode replorigin value for {}: {v:?}", origin_id))?; if origin_lsn != Lsn::INVALID { result.insert(origin_id, origin_lsn); } @@ -2578,6 +2597,11 @@ impl DatadirModification<'_> { } } + #[cfg(test)] + pub fn put_for_unit_test(&mut self, key: Key, val: Value) { + self.put(key, val); + } + fn put(&mut self, key: Key, val: Value) { if Self::is_data_key(&key) { self.put_data(key.to_compact(), val) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 698579e8fb..e59db74479 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4254,9 +4254,7 @@ impl TenantShard { deletion_queue_client: DeletionQueueClient, l0_flush_global_state: L0FlushGlobalState, ) -> TenantShard { - debug_assert!( - !attached_conf.location.generation.is_none() || conf.control_plane_api.is_none() - ); + assert!(!attached_conf.location.generation.is_none()); let (state, mut rx) = watch::channel(state); @@ -5949,7 +5947,9 @@ mod tests { use itertools::Itertools; #[cfg(feature = "testing")] use models::CompactLsnRange; - use pageserver_api::key::{AUX_KEY_PREFIX, Key, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX}; + use pageserver_api::key::{ + AUX_KEY_PREFIX, Key, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX, repl_origin_key, + }; use pageserver_api::keyspace::KeySpace; #[cfg(feature = "testing")] use pageserver_api::keyspace::KeySpaceRandomAccum; @@ -8185,6 +8185,54 @@ mod tests { assert_eq!(files.get("pg_logical/mappings/test2"), None); } + #[tokio::test] + async fn test_repl_origin_tombstones() { + let harness = TenantHarness::create("test_repl_origin_tombstones") + .await + .unwrap(); + + let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); + + let mut lsn = Lsn(0x08); + + let tline: Arc = tenant + .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + let repl_lsn = Lsn(0x10); + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification.put_for_unit_test(repl_origin_key(2), Value::Image(Bytes::new())); + modification.set_replorigin(1, repl_lsn).await.unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + // we can read everything from the storage + let repl_origins = tline + .get_replorigins(lsn, &ctx, io_concurrency.clone()) + .await + .unwrap(); + assert_eq!(repl_origins.len(), 1); + assert_eq!(repl_origins[&1], lsn); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification.put_for_unit_test( + repl_origin_key(3), + Value::Image(Bytes::copy_from_slice(b"cannot_decode_this")), + ); + modification.commit(&ctx).await.unwrap(); + } + let result = tline + .get_replorigins(lsn, &ctx, io_concurrency.clone()) + .await; + 
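+        // b"cannot_decode_this" is not a valid serialized Lsn, so get_replorigins must now
+        // surface the decode failure as an error rather than panicking on unwrap().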
assert!(result.is_err()); + } + #[tokio::test] async fn test_metadata_image_creation() -> anyhow::Result<()> { let harness = TenantHarness::create("test_metadata_image_creation").await?; diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 8cf3c548c9..ed541c4f12 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -94,10 +94,23 @@ impl Header { pub enum WriteBlobError { #[error(transparent)] Flush(FlushTaskError), - #[error("blob too large ({len} bytes)")] - BlobTooLarge { len: usize }, #[error(transparent)] - WriteBlobRaw(anyhow::Error), + Other(anyhow::Error), +} + +impl WriteBlobError { + pub fn is_cancel(&self) -> bool { + match self { + WriteBlobError::Flush(e) => e.is_cancel(), + WriteBlobError::Other(_) => false, + } + } + pub fn into_anyhow(self) -> anyhow::Error { + match self { + WriteBlobError::Flush(e) => e.into_anyhow(), + WriteBlobError::Other(e) => e, + } + } } impl BlockCursor<'_> { @@ -327,7 +340,9 @@ where return ( ( io_buf.slice_len(), - Err(WriteBlobError::BlobTooLarge { len }), + Err(WriteBlobError::Other(anyhow::anyhow!( + "blob too large ({len} bytes)" + ))), ), srcbuf, ); @@ -391,7 +406,7 @@ where // Verify the header, to ensure we don't write invalid/corrupt data. let header = match Header::decode(&raw_with_header) .context("decoding blob header") - .map_err(WriteBlobError::WriteBlobRaw) + .map_err(WriteBlobError::Other) { Ok(header) => header, Err(err) => return (raw_with_header, Err(err)), @@ -401,7 +416,7 @@ where let raw_len = raw_with_header.len(); return ( raw_with_header, - Err(WriteBlobError::WriteBlobRaw(anyhow::anyhow!( + Err(WriteBlobError::Other(anyhow::anyhow!( "header length mismatch: {header_total_len} != {raw_len}" ))), ); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 2ae7e1e875..86aef9b42c 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -346,7 +346,8 @@ async fn init_load_generations( "Emergency mode! Tenants will be attached unsafely using their last known generation" ); emergency_generations(tenant_confs) - } else if let Some(client) = StorageControllerUpcallClient::new(conf, cancel)? { + } else { + let client = StorageControllerUpcallClient::new(conf, cancel); info!("Calling {} API to re-attach tenants", client.base_url()); // If we are configured to use the control plane API, then it is the source of truth for what tenants to load. match client.re_attach(conf).await { @@ -360,9 +361,6 @@ async fn init_load_generations( anyhow::bail!("Shut down while waiting for control plane re-attach response") } } - } else { - info!("Control plane API not configured, tenant generations are disabled"); - return Ok(None); }; // The deletion queue needs to know about the startup attachment state to decide which (if any) stored @@ -1153,17 +1151,8 @@ impl TenantManager { // Testing hack: if we are configured with no control plane, then drop the generation // from upserts. This enables creating generation-less tenants even though neon_local // always uses generations when calling the location conf API. - let attached_conf = if cfg!(feature = "testing") { - let mut conf = AttachedTenantConf::try_from(new_location_config) - .map_err(UpsertLocationError::BadRequest)?; - if self.conf.control_plane_api.is_none() { - conf.location.generation = Generation::none(); - } - conf - } else { - AttachedTenantConf::try_from(new_location_config) - .map_err(UpsertLocationError::BadRequest)? 
- }; + let attached_conf = AttachedTenantConf::try_from(new_location_config) + .map_err(UpsertLocationError::BadRequest)?; let tenant = tenant_spawn( self.conf, diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 796ad01e54..5dfa961b71 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -2,6 +2,7 @@ pub mod batch_split_writer; pub mod delta_layer; +pub mod errors; pub mod filter_iterator; pub mod image_layer; pub mod inmemory_layer; diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index 39cd02d101..51f2e909a2 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -10,6 +10,7 @@ use utils::id::TimelineId; use utils::lsn::Lsn; use utils::shard::TenantShardId; +use super::errors::PutError; use super::layer::S3_UPLOAD_LIMIT; use super::{ DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer, @@ -235,7 +236,7 @@ impl<'a> SplitImageLayerWriter<'a> { key: Key, img: Bytes, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { // The current estimation is an upper bound of the space that the key/image could take // because we did not consider compression in this estimation. The resulting image layer // could be smaller than the target size. @@ -253,7 +254,8 @@ impl<'a> SplitImageLayerWriter<'a> { self.cancel.clone(), ctx, ) - .await?; + .await + .map_err(PutError::Other)?; let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer); self.batches.add_unfinished_image_writer( prev_image_writer, @@ -346,7 +348,7 @@ impl<'a> SplitDeltaLayerWriter<'a> { lsn: Lsn, val: Value, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate // number, and therefore the final layer size could be a little bit larger or smaller than the target. // @@ -366,7 +368,8 @@ impl<'a> SplitDeltaLayerWriter<'a> { self.cancel.clone(), ctx, ) - .await?, + .await + .map_err(PutError::Other)?, )); } let (_, inner) = self.inner.as_mut().unwrap(); @@ -386,7 +389,8 @@ impl<'a> SplitDeltaLayerWriter<'a> { self.cancel.clone(), ctx, ) - .await?; + .await + .map_err(PutError::Other)?; let (start_key, prev_delta_writer) = self.inner.replace((key, next_delta_writer)).unwrap(); self.batches.add_unfinished_delta_writer( @@ -396,11 +400,11 @@ impl<'a> SplitDeltaLayerWriter<'a> { ); } else if inner.estimated_size() >= S3_UPLOAD_LIMIT { // We have to produce a very large file b/c a key is updated too often. 
- anyhow::bail!( + return Err(PutError::Other(anyhow::anyhow!( "a single key is updated too often: key={}, estimated_size={}, and the layer file cannot be produced", key, inner.estimated_size() - ); + ))); } } self.last_key_written = key; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 607b0d513c..2c1b27c8d5 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -55,6 +55,7 @@ use utils::bin_ser::SerializeError; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; +use super::errors::PutError; use super::{ AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, ValuesReconstructState, @@ -477,12 +478,15 @@ impl DeltaLayerWriterInner { lsn: Lsn, val: Value, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { let (_, res) = self .put_value_bytes( key, lsn, - Value::ser(&val)?.slice_len(), + Value::ser(&val) + .map_err(anyhow::Error::new) + .map_err(PutError::Other)? + .slice_len(), val.will_init(), ctx, ) @@ -497,7 +501,7 @@ impl DeltaLayerWriterInner { val: FullSlice, will_init: bool, ctx: &RequestContext, - ) -> (FullSlice, anyhow::Result<()>) + ) -> (FullSlice, Result<(), PutError>) where Buf: IoBuf + Send, { @@ -513,19 +517,24 @@ impl DeltaLayerWriterInner { .blob_writer .write_blob_maybe_compressed(val, ctx, compression) .await; + let res = res.map_err(PutError::WriteBlob); let off = match res { Ok((off, _)) => off, - Err(e) => return (val, Err(anyhow::anyhow!(e))), + Err(e) => return (val, Err(e)), }; let blob_ref = BlobRef::new(off, will_init); let delta_key = DeltaKey::from_key_lsn(&key, lsn); - let res = self.tree.append(&delta_key.0, blob_ref.0); + let res = self + .tree + .append(&delta_key.0, blob_ref.0) + .map_err(anyhow::Error::new) + .map_err(PutError::Other); self.num_keys += 1; - (val, res.map_err(|e| anyhow::anyhow!(e))) + (val, res) } fn size(&self) -> u64 { @@ -694,7 +703,7 @@ impl DeltaLayerWriter { lsn: Lsn, val: Value, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { self.inner .as_mut() .unwrap() @@ -709,7 +718,7 @@ impl DeltaLayerWriter { val: FullSlice, will_init: bool, ctx: &RequestContext, - ) -> (FullSlice, anyhow::Result<()>) + ) -> (FullSlice, Result<(), PutError>) where Buf: IoBuf + Send, { @@ -1441,14 +1450,6 @@ impl DeltaLayerInner { offset } - pub fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> { - self.iter_with_options( - ctx, - 1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer. - 1024, // The default value. 
Unit tests might use a different value - ) - } - pub fn iter_with_options<'a>( &'a self, ctx: &'a RequestContext, @@ -1634,7 +1635,6 @@ pub(crate) mod test { use crate::tenant::disk_btree::tests::TestDisk; use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; use crate::tenant::storage_layer::{Layer, ResidentLayer}; - use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; use crate::tenant::{TenantShard, Timeline}; /// Construct an index for a fictional delta layer and and then @@ -2311,8 +2311,7 @@ pub(crate) mod test { for batch_size in [1, 2, 4, 8, 3, 7, 13] { println!("running with batch_size={batch_size} max_read_size={max_read_size}"); // Test if the batch size is correctly determined - let mut iter = delta_layer.iter(&ctx); - iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + let mut iter = delta_layer.iter_with_options(&ctx, max_read_size, batch_size); let mut num_items = 0; for _ in 0..3 { iter.next_batch().await.unwrap(); @@ -2329,8 +2328,7 @@ pub(crate) mod test { iter.key_values_batch.clear(); } // Test if the result is correct - let mut iter = delta_layer.iter(&ctx); - iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + let mut iter = delta_layer.iter_with_options(&ctx, max_read_size, batch_size); assert_delta_iter_equal(&mut iter, &test_deltas).await; } } diff --git a/pageserver/src/tenant/storage_layer/errors.rs b/pageserver/src/tenant/storage_layer/errors.rs new file mode 100644 index 0000000000..591e489faa --- /dev/null +++ b/pageserver/src/tenant/storage_layer/errors.rs @@ -0,0 +1,24 @@ +use crate::tenant::blob_io::WriteBlobError; + +#[derive(Debug, thiserror::Error)] +pub enum PutError { + #[error(transparent)] + WriteBlob(WriteBlobError), + #[error(transparent)] + Other(anyhow::Error), +} + +impl PutError { + pub fn is_cancel(&self) -> bool { + match self { + PutError::WriteBlob(e) => e.is_cancel(), + PutError::Other(_) => false, + } + } + pub fn into_anyhow(self) -> anyhow::Error { + match self { + PutError::WriteBlob(e) => e.into_anyhow(), + PutError::Other(e) => e, + } + } +} diff --git a/pageserver/src/tenant/storage_layer/filter_iterator.rs b/pageserver/src/tenant/storage_layer/filter_iterator.rs index 8d172a1c19..1a330ecfc2 100644 --- a/pageserver/src/tenant/storage_layer/filter_iterator.rs +++ b/pageserver/src/tenant/storage_layer/filter_iterator.rs @@ -157,7 +157,7 @@ mod tests { .await .unwrap(); - let merge_iter = MergeIterator::create( + let merge_iter = MergeIterator::create_for_testing( &[resident_layer_1.get_as_delta(&ctx).await.unwrap()], &[], &ctx, @@ -182,7 +182,7 @@ mod tests { result.extend(test_deltas1[90..100].iter().cloned()); assert_filter_iter_equal(&mut filter_iter, &result).await; - let merge_iter = MergeIterator::create( + let merge_iter = MergeIterator::create_for_testing( &[resident_layer_1.get_as_delta(&ctx).await.unwrap()], &[], &ctx, diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 2f7c5715bb..740f53f928 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -53,6 +53,7 @@ use utils::bin_ser::SerializeError; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; +use super::errors::PutError; use super::layer_name::ImageLayerName; use super::{ AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, @@ -684,14 +685,6 @@ impl ImageLayerInner { } } - pub(crate) fn iter<'a>(&'a self, ctx: &'a 
RequestContext) -> ImageLayerIterator<'a> { - self.iter_with_options( - ctx, - 1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer. - 1024, // The default value. Unit tests might use a different value - ) - } - pub(crate) fn iter_with_options<'a>( &'a self, ctx: &'a RequestContext, @@ -850,8 +843,14 @@ impl ImageLayerWriterInner { key: Key, img: Bytes, ctx: &RequestContext, - ) -> anyhow::Result<()> { - ensure!(self.key_range.contains(&key)); + ) -> Result<(), PutError> { + if !self.key_range.contains(&key) { + return Err(PutError::Other(anyhow::anyhow!( + "key {:?} not in range {:?}", + key, + self.key_range + ))); + } let compression = self.conf.image_compression; let uncompressed_len = img.len() as u64; self.uncompressed_bytes += uncompressed_len; @@ -861,7 +860,7 @@ impl ImageLayerWriterInner { .write_blob_maybe_compressed(img.slice_len(), ctx, compression) .await; // TODO: re-use the buffer for `img` further upstack - let (off, compression_info) = res?; + let (off, compression_info) = res.map_err(PutError::WriteBlob)?; if compression_info.compressed_size.is_some() { // The image has been considered for compression at least self.uncompressed_bytes_eligible += uncompressed_len; @@ -873,7 +872,10 @@ impl ImageLayerWriterInner { let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; key.write_to_byte_slice(&mut keybuf); - self.tree.append(&keybuf, off)?; + self.tree + .append(&keybuf, off) + .map_err(anyhow::Error::new) + .map_err(PutError::Other)?; #[cfg(feature = "testing")] { @@ -1093,7 +1095,7 @@ impl ImageLayerWriter { key: Key, img: Bytes, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { self.inner.as_mut().unwrap().put_image(key, img, ctx).await } @@ -1240,7 +1242,6 @@ mod test { use crate::context::RequestContext; use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; use crate::tenant::storage_layer::{Layer, ResidentLayer}; - use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; use crate::tenant::{TenantShard, Timeline}; #[tokio::test] @@ -1507,8 +1508,7 @@ mod test { for batch_size in [1, 2, 4, 8, 3, 7, 13] { println!("running with batch_size={batch_size} max_read_size={max_read_size}"); // Test if the batch size is correctly determined - let mut iter = img_layer.iter(&ctx); - iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + let mut iter = img_layer.iter_with_options(&ctx, max_read_size, batch_size); let mut num_items = 0; for _ in 0..3 { iter.next_batch().await.unwrap(); @@ -1525,8 +1525,7 @@ mod test { iter.key_values_batch.clear(); } // Test if the result is correct - let mut iter = img_layer.iter(&ctx); - iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + let mut iter = img_layer.iter_with_options(&ctx, max_read_size, batch_size); assert_img_iter_equal(&mut iter, &test_imgs, Lsn(0x10)).await; } } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index b7f6e5dc77..3d55972017 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -4,6 +4,7 @@ use std::sync::{Arc, Weak}; use std::time::{Duration, SystemTime}; use crate::PERF_TRACE_TARGET; +use crate::metrics::{ONDEMAND_DOWNLOAD_BYTES, ONDEMAND_DOWNLOAD_COUNT}; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::keyspace::KeySpace; @@ -22,7 +23,7 @@ use super::{ LayerVisibilityHint, PerfInstrumentFutureExt, PersistentLayerDesc, 
ValuesReconstructState, }; use crate::config::PageServerConf; -use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; +use crate::context::{RequestContext, RequestContextBuilder}; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::task_mgr::TaskKind; use crate::tenant::Timeline; @@ -1075,24 +1076,17 @@ impl LayerInner { return Err(DownloadError::DownloadRequired); } - let ctx = if ctx.has_perf_span() { - let dl_ctx = RequestContextBuilder::from(ctx) - .task_kind(TaskKind::LayerDownload) - .download_behavior(DownloadBehavior::Download) - .root_perf_span(|| { - info_span!( - target: PERF_TRACE_TARGET, - "DOWNLOAD_LAYER", - layer = %self, - reason = %reason - ) - }) - .detached_child(); - ctx.perf_follows_from(&dl_ctx); - dl_ctx - } else { - ctx.attached_child() - }; + let ctx = RequestContextBuilder::from(ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "DOWNLOAD_LAYER", + layer = %self, + reason = %reason, + ) + }) + .attached_child(); async move { tracing::info!(%reason, "downloading on-demand"); @@ -1100,7 +1094,7 @@ impl LayerInner { let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); let res = self .download_init_and_wait(timeline, permit, ctx.attached_child()) - .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) + .maybe_perf_instrument(&ctx, |current_perf_span| current_perf_span.clone()) .await?; scopeguard::ScopeGuard::into_inner(init_cancelled); @@ -1255,6 +1249,14 @@ impl LayerInner { self.access_stats.record_residence_event(); + let task_kind: &'static str = ctx.task_kind().into(); + ONDEMAND_DOWNLOAD_BYTES + .with_label_values(&[task_kind]) + .inc_by(self.desc.file_size); + ONDEMAND_DOWNLOAD_COUNT + .with_label_values(&[task_kind]) + .inc(); + Ok(self.initialize_after_layer_is_on_disk(permit)) } Err(e) => { @@ -1700,7 +1702,7 @@ impl DownloadError { } } -#[derive(Debug, PartialEq)] +#[derive(Debug, PartialEq, Copy, Clone)] pub(crate) enum NeedsDownload { NotFound, NotFile(std::fs::FileType), diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index e084e3d567..ea3dea50c3 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -19,14 +19,6 @@ pub(crate) enum LayerRef<'a> { } impl<'a> LayerRef<'a> { - #[allow(dead_code)] - fn iter(self, ctx: &'a RequestContext) -> LayerIterRef<'a> { - match self { - Self::Image(x) => LayerIterRef::Image(x.iter(ctx)), - Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)), - } - } - fn iter_with_options( self, ctx: &'a RequestContext, @@ -322,6 +314,28 @@ impl MergeIteratorItem for ((Key, Lsn, Value), Arc) { } impl<'a> MergeIterator<'a> { + #[cfg(test)] + pub(crate) fn create_for_testing( + deltas: &[&'a DeltaLayerInner], + images: &[&'a ImageLayerInner], + ctx: &'a RequestContext, + ) -> Self { + Self::create_with_options(deltas, images, ctx, 1024 * 8192, 1024) + } + + /// Create a new merge iterator with custom options. + /// + /// Adjust `max_read_size` and `max_batch_size` to trade memory usage for performance. The size should scale + /// with the number of layers to compact. If there are a lot of layers, consider reducing the values, so that + /// the buffer does not take too much memory. 
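+    ///
+    /// As a rough sketch (an estimate, not a measured bound): each layer iterator buffers up to
+    /// `max_read_size` bytes at a time, so merging `N` layers can hold on the order of
+    /// `N * max_read_size` bytes of read buffers, e.g. 100 layers * 8 MiB = ~800 MiB with the
+    /// defaults below.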
+ /// + /// The default options for L0 compactions are: + /// - max_read_size: 1024 * 8192 (8MB) + /// - max_batch_size: 1024 + /// + /// The default options for gc-compaction are: + /// - max_read_size: 128 * 8192 (1MB) + /// - max_batch_size: 128 pub fn create_with_options( deltas: &[&'a DeltaLayerInner], images: &[&'a ImageLayerInner], @@ -351,14 +365,6 @@ impl<'a> MergeIterator<'a> { } } - pub fn create( - deltas: &[&'a DeltaLayerInner], - images: &[&'a ImageLayerInner], - ctx: &'a RequestContext, - ) -> Self { - Self::create_with_options(deltas, images, ctx, 1024 * 8192, 1024) - } - pub(crate) async fn next_inner(&mut self) -> anyhow::Result> { while let Some(mut iter) = self.heap.peek_mut() { if !iter.is_loaded() { @@ -477,7 +483,7 @@ mod tests { let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx) .await .unwrap(); - let mut merge_iter = MergeIterator::create( + let mut merge_iter = MergeIterator::create_for_testing( &[ resident_layer_2.get_as_delta(&ctx).await.unwrap(), resident_layer_1.get_as_delta(&ctx).await.unwrap(), @@ -549,7 +555,7 @@ mod tests { let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx) .await .unwrap(); - let mut merge_iter = MergeIterator::create( + let mut merge_iter = MergeIterator::create_for_testing( &[ resident_layer_1.get_as_delta(&ctx).await.unwrap(), resident_layer_2.get_as_delta(&ctx).await.unwrap(), @@ -670,7 +676,7 @@ mod tests { // Test with different layer order for MergeIterator::create to ensure the order // is stable. - let mut merge_iter = MergeIterator::create( + let mut merge_iter = MergeIterator::create_for_testing( &[ resident_layer_4.get_as_delta(&ctx).await.unwrap(), resident_layer_1.get_as_delta(&ctx).await.unwrap(), @@ -682,7 +688,7 @@ mod tests { ); assert_merge_iter_equal(&mut merge_iter, &expect).await; - let mut merge_iter = MergeIterator::create( + let mut merge_iter = MergeIterator::create_for_testing( &[ resident_layer_1.get_as_delta(&ctx).await.unwrap(), resident_layer_4.get_as_delta(&ctx).await.unwrap(), diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 1112a5330b..4709a6d616 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -340,7 +340,7 @@ pub(crate) fn log_compaction_error( } else { match level { Level::ERROR if degrade_to_warning => warn!("Compaction failed and discarded: {err:#}"), - Level::ERROR => error!("Compaction failed: {err:#}"), + Level::ERROR => error!("Compaction failed: {err:?}"), Level::INFO => info!("Compaction failed: {err:#}"), level => unimplemented!("unexpected level {level:?}"), } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index cfeab77598..d7f5958128 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -987,6 +987,16 @@ impl From for CreateImageLayersError { } } +impl From for CreateImageLayersError { + fn from(e: super::storage_layer::errors::PutError) -> Self { + if e.is_cancel() { + CreateImageLayersError::Cancelled + } else { + CreateImageLayersError::Other(e.into_anyhow()) + } + } +} + impl From for CreateImageLayersError { fn from(e: GetVectoredError) -> Self { match e { @@ -2117,22 +2127,14 @@ impl Timeline { debug_assert_current_span_has_tenant_and_timeline_id(); // Regardless of whether we're going to try_freeze_and_flush - // or not, stop ingesting any more data. Walreceiver only provides - // cancellation but no "wait until gone", because it uses the Timeline::gate. 
- // So, only after the self.gate.close() below will we know for sure that - // no walreceiver tasks are left. - // For `try_freeze_and_flush=true`, this means that we might still be ingesting - // data during the call to `self.freeze_and_flush()` below. - // That's not ideal, but, we don't have the concept of a ChildGuard, - // which is what we'd need to properly model early shutdown of the walreceiver - // task sub-tree before the other Timeline task sub-trees. + // or not, stop ingesting any more data. let walreceiver = self.walreceiver.lock().unwrap().take(); tracing::debug!( is_some = walreceiver.is_some(), "Waiting for WalReceiverManager..." ); if let Some(walreceiver) = walreceiver { - walreceiver.cancel(); + walreceiver.shutdown().await; } // ... and inform any waiters for newer LSNs that there won't be any. self.last_record_lsn.shutdown(); @@ -5923,6 +5925,16 @@ impl From for CompactionError { } } +impl From for CompactionError { + fn from(e: super::storage_layer::errors::PutError) -> Self { + if e.is_cancel() { + CompactionError::ShuttingDown + } else { + CompactionError::Other(e.into_anyhow()) + } + } +} + #[serde_as] #[derive(serde::Serialize)] struct RecordedDuration(#[serde_as(as = "serde_with::DurationMicroSeconds")] Duration); diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 9086d29d50..07cd274a41 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1994,7 +1994,13 @@ impl Timeline { let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?; deltas.push(l); } - MergeIterator::create(&deltas, &[], ctx) + MergeIterator::create_with_options( + &deltas, + &[], + ctx, + 1024 * 8192, /* 8 MiB buffer per layer iterator */ + 1024, + ) }; // This iterator walks through all keys and is needed to calculate size used by each key @@ -2198,8 +2204,7 @@ impl Timeline { .as_mut() .unwrap() .put_value(key, lsn, value, ctx) - .await - .map_err(CompactionError::Other)?; + .await?; } else { let owner = self.shard_identity.get_shard_number(&key); @@ -2828,7 +2833,7 @@ impl Timeline { Ok(()) } - /// Check if the memory usage is within the limit. + /// Check to bail out of gc compaction early if it would use too much memory. async fn check_memory_usage( self: &Arc, layer_selection: &[Layer], @@ -2841,7 +2846,8 @@ impl Timeline { let layer_desc = layer.layer_desc(); if layer_desc.is_delta() { // Delta layers at most have 1MB buffer; 3x to make it safe (there're deltas as large as 16KB). - // Multiply the layer size so that tests can pass. + // Scale it by target_layer_size_bytes so that tests can pass (some tests, e.g., `test_pageserver_gc_compaction_preempt + // use 3MB layer size and we need to account for that). 
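+                // Rough arithmetic for the line below: a delta layer whose size equals
+                // `target_layer_size_bytes` contributes 3.0 * 1 = 3 MB to the estimate, a layer
+                // twice that size contributes 6 MB, and so on (integer division, so smaller
+                // layers round down to 0).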
estimated_memory_usage_mb += 3.0 * (layer_desc.file_size / target_layer_size_bytes) as f64; num_delta_layers += 1; diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 8e95c3a8ff..649b33e294 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -178,7 +178,7 @@ impl Attempt { } } -async fn generate_tombstone_image_layer( +pub(crate) async fn generate_tombstone_image_layer( detached: &Arc, ancestor: &Arc, ancestor_lsn: Lsn, diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs index b917fdbfd8..c4a8df39a3 100644 --- a/pageserver/src/tenant/timeline/import_pgdata.rs +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -149,22 +149,14 @@ pub async fn doit( } .await?; - flow::run( - timeline.clone(), - base_lsn, - control_file, - storage.clone(), - ctx, - ) - .await?; + flow::run(timeline.clone(), control_file, storage.clone(), ctx).await?; // // Communicate that shard is done. // Ensure at-least-once delivery of the upcall to storage controller // before we mark the task as done and never come here again. // - let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &cancel)? - .expect("storcon configured"); + let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &cancel); storcon_client .put_timeline_import_status( timeline.tenant_shard_id, diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index c6d2944769..34c073365d 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -34,7 +34,9 @@ use std::sync::Arc; use anyhow::{bail, ensure}; use bytes::Bytes; +use futures::stream::FuturesOrdered; use itertools::Itertools; +use pageserver_api::config::TimelineImportConfig; use pageserver_api::key::{ CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, Key, TWOPHASEDIR_KEY, rel_block_to_key, rel_dir_to_key, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key, @@ -46,8 +48,9 @@ use pageserver_api::shard::ShardIdentity; use postgres_ffi::relfile_utils::parse_relfilename; use postgres_ffi::{BLCKSZ, pg_constants}; use remote_storage::RemotePath; -use tokio::task::JoinSet; -use tracing::{Instrument, debug, info_span, instrument}; +use tokio::sync::Semaphore; +use tokio_stream::StreamExt; +use tracing::{debug, instrument}; use utils::bin_ser::BeSer; use utils::lsn::Lsn; @@ -63,37 +66,39 @@ use crate::tenant::storage_layer::{ImageLayerWriter, Layer}; pub async fn run( timeline: Arc, - pgdata_lsn: Lsn, control_file: ControlFile, storage: RemoteStorageWrapper, ctx: &RequestContext, ) -> anyhow::Result<()> { - Flow { - timeline, - pgdata_lsn, + let planner = Planner { control_file, - tasks: Vec::new(), - storage, - } - .run(ctx) - .await + storage: storage.clone(), + shard: timeline.shard_identity, + tasks: Vec::default(), + }; + + let import_config = &timeline.conf.timeline_import_config; + let plan = planner.plan(import_config).await?; + plan.execute(timeline, import_config, ctx).await } -struct Flow { - timeline: Arc, - pgdata_lsn: Lsn, +struct Planner { control_file: ControlFile, - tasks: Vec, storage: RemoteStorageWrapper, + shard: ShardIdentity, + tasks: Vec, } -impl Flow { - /// Perform the ingestion into [`Self::timeline`]. - /// Assumes the timeline is empty (= no layers). 
- pub async fn run(mut self, ctx: &RequestContext) -> anyhow::Result<()> { - let pgdata_lsn = Lsn(self.control_file.control_file_data().checkPoint).align(); +struct Plan { + jobs: Vec, +} - self.pgdata_lsn = pgdata_lsn; +impl Planner { + /// Creates an import plan + /// + /// This function is and must remain pure: given the same input, it will generate the same import plan. + async fn plan(mut self, import_config: &TimelineImportConfig) -> anyhow::Result { + let pgdata_lsn = Lsn(self.control_file.control_file_data().checkPoint).align(); let datadir = PgDataDir::new(&self.storage).await?; @@ -115,7 +120,7 @@ impl Flow { } // Import SLRUs - if self.timeline.tenant_shard_id.is_shard_zero() { + if self.shard.is_shard_zero() { // pg_xact (01:00 keyspace) self.import_slru(SlruKind::Clog, &self.storage.pgdata().join("pg_xact")) .await?; @@ -166,14 +171,16 @@ impl Flow { let mut last_end_key = Key::MIN; let mut current_chunk = Vec::new(); let mut current_chunk_size: usize = 0; - let mut parallel_jobs = Vec::new(); + let mut jobs = Vec::new(); for task in std::mem::take(&mut self.tasks).into_iter() { - if current_chunk_size + task.total_size() > 1024 * 1024 * 1024 { + if current_chunk_size + task.total_size() + > import_config.import_job_soft_size_limit.into() + { let key_range = last_end_key..task.key_range().start; - parallel_jobs.push(ChunkProcessingJob::new( + jobs.push(ChunkProcessingJob::new( key_range.clone(), std::mem::take(&mut current_chunk), - &self, + pgdata_lsn, )); last_end_key = key_range.end; current_chunk_size = 0; @@ -181,45 +188,13 @@ impl Flow { current_chunk_size += task.total_size(); current_chunk.push(task); } - parallel_jobs.push(ChunkProcessingJob::new( + jobs.push(ChunkProcessingJob::new( last_end_key..Key::MAX, current_chunk, - &self, + pgdata_lsn, )); - // Start all jobs simultaneosly - let mut work = JoinSet::new(); - // TODO: semaphore? 
- for job in parallel_jobs { - let ctx: RequestContext = - ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Error); - work.spawn(async move { job.run(&ctx).await }.instrument(info_span!("parallel_job"))); - } - let mut results = Vec::new(); - while let Some(result) = work.join_next().await { - match result { - Ok(res) => { - results.push(res); - } - Err(_joinset_err) => { - results.push(Err(anyhow::anyhow!( - "parallel job panicked or cancelled, check pageserver logs" - ))); - } - } - } - - if results.iter().all(|r| r.is_ok()) { - Ok(()) - } else { - let mut msg = String::new(); - for result in results { - if let Err(err) = result { - msg.push_str(&format!("{err:?}\n\n")); - } - } - bail!("Some parallel jobs failed:\n\n{msg}"); - } + Ok(Plan { jobs }) } #[instrument(level = tracing::Level::DEBUG, skip_all, fields(dboid=%db.dboid, tablespace=%db.spcnode, path=%db.path))] @@ -266,7 +241,7 @@ impl Flow { let end_key = rel_block_to_key(file.rel_tag, start_blk + (len / 8192) as u32); self.tasks .push(AnyImportTask::RelBlocks(ImportRelBlocksTask::new( - *self.timeline.get_shard_identity(), + self.shard, start_key..end_key, &file.path, self.storage.clone(), @@ -289,7 +264,7 @@ impl Flow { } async fn import_slru(&mut self, kind: SlruKind, path: &RemotePath) -> anyhow::Result<()> { - assert!(self.timeline.tenant_shard_id.is_shard_zero()); + assert!(self.shard.is_shard_zero()); let segments = self.storage.listfilesindir(path).await?; let segments: Vec<(String, u32, usize)> = segments @@ -344,6 +319,68 @@ impl Flow { } } +impl Plan { + async fn execute( + self, + timeline: Arc, + import_config: &TimelineImportConfig, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + let mut work = FuturesOrdered::new(); + let semaphore = Arc::new(Semaphore::new(import_config.import_job_concurrency.into())); + + let jobs_in_plan = self.jobs.len(); + + let mut jobs = self.jobs.into_iter().enumerate().peekable(); + let mut results = Vec::new(); + + // Run import jobs concurrently up to the limit specified by the pageserver configuration. + // Note that we process completed futures in the oreder of insertion. This will be the + // building block for resuming imports across pageserver restarts or tenant migrations. + while results.len() < jobs_in_plan { + tokio::select! 
{ + permit = semaphore.clone().acquire_owned(), if jobs.peek().is_some() => { + let permit = permit.expect("never closed"); + let (job_idx, job) = jobs.next().expect("we peeked"); + let job_timeline = timeline.clone(); + let ctx = ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Error); + + work.push_back(tokio::task::spawn(async move { + let _permit = permit; + let res = job.run(job_timeline, &ctx).await; + (job_idx, res) + })); + }, + maybe_complete_job_idx = work.next() => { + match maybe_complete_job_idx { + Some(Ok((_job_idx, res))) => { + results.push(res); + }, + Some(Err(_)) => { + results.push(Err(anyhow::anyhow!( + "parallel job panicked or cancelled, check pageserver logs" + ))); + } + None => {} + } + } + } + } + + if results.iter().all(|r| r.is_ok()) { + Ok(()) + } else { + let mut msg = String::new(); + for result in results { + if let Err(err) = result { + msg.push_str(&format!("{err:?}\n\n")); + } + } + bail!("Some parallel jobs failed:\n\n{msg}"); + } + } +} + // // dbdir iteration tools // @@ -713,7 +750,6 @@ impl From for AnyImportTask { } struct ChunkProcessingJob { - timeline: Arc, range: Range, tasks: Vec, @@ -721,25 +757,24 @@ struct ChunkProcessingJob { } impl ChunkProcessingJob { - fn new(range: Range, tasks: Vec, env: &Flow) -> Self { - assert!(env.pgdata_lsn.is_valid()); + fn new(range: Range, tasks: Vec, pgdata_lsn: Lsn) -> Self { + assert!(pgdata_lsn.is_valid()); Self { - timeline: env.timeline.clone(), range, tasks, - pgdata_lsn: env.pgdata_lsn, + pgdata_lsn, } } - async fn run(self, ctx: &RequestContext) -> anyhow::Result<()> { + async fn run(self, timeline: Arc, ctx: &RequestContext) -> anyhow::Result<()> { let mut writer = ImageLayerWriter::new( - self.timeline.conf, - self.timeline.timeline_id, - self.timeline.tenant_shard_id, + timeline.conf, + timeline.timeline_id, + timeline.tenant_shard_id, &self.range, self.pgdata_lsn, - &self.timeline.gate, - self.timeline.cancel.clone(), + &timeline.gate, + timeline.cancel.clone(), ctx, ) .await?; @@ -751,24 +786,20 @@ impl ChunkProcessingJob { let resident_layer = if nimages > 0 { let (desc, path) = writer.finish(ctx).await?; - Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)? + Layer::finish_creating(timeline.conf, &timeline, desc, &path)? } else { // dropping the writer cleans up return Ok(()); }; // this is sharing the same code as create_image_layers - let mut guard = self.timeline.layers.write().await; + let mut guard = timeline.layers.write().await; guard .open_mut()? - .track_new_image_layers(&[resident_layer.clone()], &self.timeline.metrics); + .track_new_image_layers(&[resident_layer.clone()], &timeline.metrics); crate::tenant::timeline::drop_wlock(guard); - // Schedule the layer for upload but don't add barriers such as - // wait for completion or index upload, so we don't inhibit upload parallelism. - // TODO: limit upload parallelism somehow (e.g. by limiting concurrency of jobs?) - // TODO: or regulate parallelism by upload queue depth? Prob should happen at a higher level. - self.timeline + timeline .remote_client .schedule_layer_file_upload(resident_layer)?; diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 4f80073cc3..0f73eb839b 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -63,6 +63,7 @@ pub struct WalReceiver { /// All task spawned by [`WalReceiver::start`] and its children are sensitive to this token. 
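+    /// Note that cancelling alone does not wait for the tasks to exit; [`WalReceiver::shutdown`]
+    /// below also awaits the manager `task` join handle to provide "wait until gone" semantics.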
/// It's a child token of [`Timeline`] so that timeline shutdown can cancel WalReceiver tasks early for `freeze_and_flush=true`. cancel: CancellationToken, + task: tokio::task::JoinHandle<()>, } impl WalReceiver { @@ -79,7 +80,7 @@ impl WalReceiver { let loop_status = Arc::new(std::sync::RwLock::new(None)); let manager_status = Arc::clone(&loop_status); let cancel = timeline.cancel.child_token(); - WALRECEIVER_RUNTIME.spawn({ + let task = WALRECEIVER_RUNTIME.spawn({ let cancel = cancel.clone(); async move { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -120,14 +121,25 @@ impl WalReceiver { Self { manager_status, cancel, + task, } } #[instrument(skip_all, level = tracing::Level::DEBUG)] - pub fn cancel(&self) { + pub async fn shutdown(self) { debug_assert_current_span_has_tenant_and_timeline_id(); debug!("cancelling walreceiver tasks"); self.cancel.cancel(); + match self.task.await { + Ok(()) => debug!("Shutdown success"), + Err(je) if je.is_cancelled() => unreachable!("not used"), + Err(je) if je.is_panic() => { + // already logged by panic hook + } + Err(je) => { + error!("shutdown walreceiver task join error: {je}") + } + } } pub(crate) fn status(&self) -> Option { diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 58953407b1..f429e59ef3 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -14,8 +14,6 @@ use std::fs::File; use std::io::{Error, ErrorKind}; use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; -#[cfg(target_os = "linux")] -use std::os::unix::fs::OpenOptionsExt; use std::sync::LazyLock; use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering}; @@ -99,7 +97,7 @@ impl VirtualFile { pub async fn open_with_options_v2>( path: P, - open_options: &OpenOptions, + #[cfg_attr(not(target_os = "linux"), allow(unused_mut))] mut open_options: OpenOptions, ctx: &RequestContext, ) -> Result { let mode = get_io_mode(); @@ -112,21 +110,16 @@ impl VirtualFile { #[cfg(target_os = "linux")] (IoMode::DirectRw, _) => true, }; - let open_options = open_options.clone(); - let open_options = if set_o_direct { + if set_o_direct { #[cfg(target_os = "linux")] { - let mut open_options = open_options; - open_options.custom_flags(nix::libc::O_DIRECT); - open_options + open_options = open_options.custom_flags(nix::libc::O_DIRECT); } #[cfg(not(target_os = "linux"))] unreachable!( "O_DIRECT is not supported on this platform, IoMode's that result in set_o_direct=true shouldn't even be defined" ); - } else { - open_options - }; + } let inner = VirtualFileInner::open_with_options(path, open_options, ctx).await?; Ok(VirtualFile { inner, _mode: mode }) } @@ -530,7 +523,7 @@ impl VirtualFileInner { path: P, ctx: &RequestContext, ) -> Result { - Self::open_with_options(path.as_ref(), OpenOptions::new().read(true).clone(), ctx).await + Self::open_with_options(path.as_ref(), OpenOptions::new().read(true), ctx).await } /// Open a file with given options. @@ -558,10 +551,11 @@ impl VirtualFileInner { // It would perhaps be nicer to check just for the read and write flags // explicitly, but OpenOptions doesn't contain any functions to read flags, // only to set them. 
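+        // For example (illustrative): options built as `read(true).write(true).create(true).truncate(true)`
+        // are stored for reopen as the equivalent of `read(true).write(true)`, so re-opening the
+        // file later never truncates or re-creates it.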
- let mut reopen_options = open_options.clone(); - reopen_options.create(false); - reopen_options.create_new(false); - reopen_options.truncate(false); + let reopen_options = open_options + .clone() + .create(false) + .create_new(false) + .truncate(false); let vfile = VirtualFileInner { handle: RwLock::new(handle), @@ -1307,7 +1301,7 @@ mod tests { opts: OpenOptions, ctx: &RequestContext, ) -> Result { - let vf = VirtualFile::open_with_options_v2(&path, &opts, ctx).await?; + let vf = VirtualFile::open_with_options_v2(&path, opts, ctx).await?; Ok(MaybeVirtualFile::VirtualFile(vf)) } } @@ -1374,7 +1368,7 @@ mod tests { let _ = file_a.read_string_at(0, 1, &ctx).await.unwrap_err(); // Close the file and re-open for reading - let mut file_a = A::open(path_a, OpenOptions::new().read(true).to_owned(), &ctx).await?; + let mut file_a = A::open(path_a, OpenOptions::new().read(true), &ctx).await?; // cannot write to a file opened in read-only mode let _ = file_a @@ -1393,8 +1387,7 @@ mod tests { .read(true) .write(true) .create(true) - .truncate(true) - .to_owned(), + .truncate(true), &ctx, ) .await?; @@ -1412,12 +1405,7 @@ mod tests { let mut vfiles = Vec::new(); for _ in 0..100 { - let mut vfile = A::open( - path_b.clone(), - OpenOptions::new().read(true).to_owned(), - &ctx, - ) - .await?; + let mut vfile = A::open(path_b.clone(), OpenOptions::new().read(true), &ctx).await?; assert_eq!("FOOBAR", vfile.read_string_at(0, 6, &ctx).await?); vfiles.push(vfile); } @@ -1466,7 +1454,7 @@ mod tests { for _ in 0..VIRTUAL_FILES { let f = VirtualFileInner::open_with_options( &test_file_path, - OpenOptions::new().read(true).clone(), + OpenOptions::new().read(true), &ctx, ) .await?; diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index dd04fb561a..7827682498 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -111,13 +111,17 @@ pub(crate) fn get() -> IoEngine { use std::os::unix::prelude::FileExt; use std::sync::atomic::{AtomicU8, Ordering}; +#[cfg(target_os = "linux")] +use {std::time::Duration, tracing::info}; use super::owned_buffers_io::io_buf_ext::FullSlice; use super::owned_buffers_io::slice::SliceMutExt; use super::{FileGuard, Metadata}; #[cfg(target_os = "linux")] -fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error) -> std::io::Error { +pub(super) fn epoll_uring_error_to_std( + e: tokio_epoll_uring::Error, +) -> std::io::Error { match e { tokio_epoll_uring::Error::Op(e) => e, tokio_epoll_uring::Error::System(system) => { @@ -149,7 +153,11 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.read(file_guard, offset, slice).await; + let (resources, res) = + retry_ecanceled_once((file_guard, slice), |(file_guard, slice)| async { + system.read(file_guard, offset, slice).await + }) + .await; (resources, res.map_err(epoll_uring_error_to_std)) } } @@ -164,7 +172,10 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.fsync(file_guard).await; + let (resources, res) = retry_ecanceled_once(file_guard, |file_guard| async { + system.fsync(file_guard).await + }) + .await; (resources, res.map_err(epoll_uring_error_to_std)) } } @@ -182,7 +193,10 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = 
tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.fdatasync(file_guard).await; + let (resources, res) = retry_ecanceled_once(file_guard, |file_guard| async { + system.fdatasync(file_guard).await + }) + .await; (resources, res.map_err(epoll_uring_error_to_std)) } } @@ -201,7 +215,10 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.statx(file_guard).await; + let (resources, res) = retry_ecanceled_once(file_guard, |file_guard| async { + system.statx(file_guard).await + }) + .await; ( resources, res.map_err(epoll_uring_error_to_std).map(Metadata::from), @@ -224,6 +241,7 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { // TODO: ftruncate op for tokio-epoll-uring + // Don't forget to use retry_ecanceled_once let res = file_guard.with_std_file(|std_file| std_file.set_len(len)); (file_guard, res) } @@ -245,8 +263,11 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let ((file_guard, slice), res) = - system.write(file_guard, offset, buf.into_raw_slice()).await; + let ((file_guard, slice), res) = retry_ecanceled_once( + (file_guard, buf.into_raw_slice()), + async |(file_guard, buf)| system.write(file_guard, offset, buf).await, + ) + .await; ( (file_guard, FullSlice::must_new(slice)), res.map_err(epoll_uring_error_to_std), @@ -282,6 +303,56 @@ impl IoEngine { } } +/// We observe in tests that stop pageserver with SIGTERM immediately after it was ingesting data, +/// occasionally buffered writers fail (and get retried by BufferedWriter) with ECANCELED. +/// The problem is believed to be a race condition in how io_uring handles punted async work (io-wq) and signals. +/// Investigation ticket: +/// +/// This function retries the operation once if it fails with ECANCELED. +/// ONLY USE FOR IDEMPOTENT [`super::VirtualFile`] operations. 
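+///
+/// A minimal usage sketch (illustrative; it mirrors how the read path above wraps the io_uring
+/// call, passing the owned resources in and getting them back together with the result):
+///
+/// ```ignore
+/// let (resources, res) = retry_ecanceled_once((file_guard, slice), |(file_guard, slice)| async {
+///     system.read(file_guard, offset, slice).await
+/// })
+/// .await;
+/// ```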
+#[cfg(target_os = "linux")] +pub(super) async fn retry_ecanceled_once( + resources: T, + f: F, +) -> (T, Result>) +where + F: Fn(T) -> Fut, + Fut: std::future::Future>)>, + T: Send, + V: Send, +{ + let (resources, res) = f(resources).await; + let Err(e) = res else { + return (resources, res); + }; + let tokio_epoll_uring::Error::Op(err) = e else { + return (resources, Err(e)); + }; + if err.raw_os_error() != Some(nix::libc::ECANCELED) { + return (resources, Err(tokio_epoll_uring::Error::Op(err))); + } + { + static RATE_LIMIT: std::sync::Mutex = + std::sync::Mutex::new(utils::rate_limit::RateLimit::new(Duration::from_secs(1))); + let mut guard = RATE_LIMIT.lock().unwrap(); + guard.call2(|rate_limit_stats| { + info!( + %rate_limit_stats, "ECANCELED observed, assuming it is due to a signal being received by the submitting thread, retrying after a delay; this message is rate-limited" + ); + }); + drop(guard); + } + tokio::time::sleep(Duration::from_millis(100)).await; // something big enough to beat even heavily overcommitted CI runners + let (resources, res) = f(resources).await; + (resources, res) +} + +pub(super) fn panic_operation_must_be_idempotent() { + panic!( + "unsupported; io_engine may retry operations internally and thus needs them to be idempotent (retry_ecanceled_once)" + ) +} + pub enum FeatureTestResult { PlatformPreferred(IoEngineKind), Worse { diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs index 7d323f3d8f..a40dfed4a4 100644 --- a/pageserver/src/virtual_file/open_options.rs +++ b/pageserver/src/virtual_file/open_options.rs @@ -1,6 +1,7 @@ //! Enum-dispatch to the `OpenOptions` type of the respective [`super::IoEngineKind`]; use std::os::fd::OwnedFd; +use std::os::unix::fs::OpenOptionsExt; use std::path::Path; use super::io_engine::IoEngine; @@ -43,7 +44,7 @@ impl OpenOptions { self.write } - pub fn read(&mut self, read: bool) -> &mut OpenOptions { + pub fn read(mut self, read: bool) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.read(read); @@ -56,7 +57,7 @@ impl OpenOptions { self } - pub fn write(&mut self, write: bool) -> &mut OpenOptions { + pub fn write(mut self, write: bool) -> Self { self.write = write; match &mut self.inner { Inner::StdFs(x) => { @@ -70,7 +71,7 @@ impl OpenOptions { self } - pub fn create(&mut self, create: bool) -> &mut OpenOptions { + pub fn create(mut self, create: bool) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.create(create); @@ -83,7 +84,7 @@ impl OpenOptions { self } - pub fn create_new(&mut self, create_new: bool) -> &mut OpenOptions { + pub fn create_new(mut self, create_new: bool) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.create_new(create_new); @@ -96,7 +97,7 @@ impl OpenOptions { self } - pub fn truncate(&mut self, truncate: bool) -> &mut OpenOptions { + pub fn truncate(mut self, truncate: bool) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.truncate(truncate); @@ -109,25 +110,28 @@ impl OpenOptions { self } + /// Don't use, `O_APPEND` is not supported. 
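+    /// (Appends are not idempotent: the io engine may internally retry an operation after
+    /// ECANCELED, see `retry_ecanceled_once`, so an appending write could be applied twice.)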
+ pub fn append(&mut self, _append: bool) { + super::io_engine::panic_operation_must_be_idempotent(); + } + pub(in crate::virtual_file) async fn open(&self, path: &Path) -> std::io::Result { match &self.inner { Inner::StdFs(x) => x.open(path).map(|file| file.into()), #[cfg(target_os = "linux")] Inner::TokioEpollUring(x) => { let system = super::io_engine::tokio_epoll_uring_ext::thread_local_system().await; - system.open(path, x).await.map_err(|e| match e { - tokio_epoll_uring::Error::Op(e) => e, - tokio_epoll_uring::Error::System(system) => { - std::io::Error::new(std::io::ErrorKind::Other, system) - } + let (_, res) = super::io_engine::retry_ecanceled_once((), |()| async { + let res = system.open(path, x).await; + ((), res) }) + .await; + res.map_err(super::io_engine::epoll_uring_error_to_std) } } } -} -impl std::os::unix::prelude::OpenOptionsExt for OpenOptions { - fn mode(&mut self, mode: u32) -> &mut OpenOptions { + pub fn mode(mut self, mode: u32) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.mode(mode); @@ -140,7 +144,10 @@ impl std::os::unix::prelude::OpenOptionsExt for OpenOptions { self } - fn custom_flags(&mut self, flags: i32) -> &mut OpenOptions { + pub fn custom_flags(mut self, flags: i32) -> Self { + if flags & nix::libc::O_APPEND != 0 { + super::io_engine::panic_operation_must_be_idempotent(); + } match &mut self.inner { Inner::StdFs(x) => { let _ = x.custom_flags(flags); diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index b41a9f6cd2..ac9867e8b4 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -247,6 +247,19 @@ pub enum FlushTaskError { Cancelled, } +impl FlushTaskError { + pub fn is_cancel(&self) -> bool { + match self { + FlushTaskError::Cancelled => true, + } + } + pub fn into_anyhow(self) -> anyhow::Error { + match self { + FlushTaskError::Cancelled => anyhow::anyhow!(self), + } + } +} + impl FlushBackgroundTask where Buf: IoBufAligned + Send + Sync, diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 426b176af9..8bcc6bf924 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -36,6 +36,8 @@ DATA = \ neon--1.2--1.3.sql \ neon--1.3--1.4.sql \ neon--1.4--1.5.sql \ + neon--1.5--1.6.sql \ + neon--1.6--1.5.sql \ neon--1.5--1.4.sql \ neon--1.4--1.3.sql \ neon--1.3--1.2.sql \ diff --git a/pgxn/neon/communicator.c b/pgxn/neon/communicator.c index db3e053321..818a149499 100644 --- a/pgxn/neon/communicator.c +++ b/pgxn/neon/communicator.c @@ -88,9 +88,6 @@ typedef PGAlignedBlock PGIOAlignedBlock; page_server_api *page_server; -static uint32 local_request_counter; -#define GENERATE_REQUEST_ID() (((NeonRequestId)MyProcPid << 32) | ++local_request_counter) - /* * Various settings related to prompt (fast) handling of PageStream responses * at any CHECK_FOR_INTERRUPTS point. @@ -690,8 +687,14 @@ prefetch_wait_for(uint64 ring_index) END_PREFETCH_RECEIVE_WORK(); CHECK_FOR_INTERRUPTS(); } - - return result; + if (result) + { + /* Check that slot is actually received (srver can be disconnected in prefetch_pump_state called from CHECK_FOR_INTERRUPTS */ + PrefetchRequest *slot = GetPrfSlot(ring_index); + return slot->status == PRFS_RECEIVED; + } + return false; +; } /* @@ -788,6 +791,27 @@ prefetch_read(PrefetchRequest *slot) } } + +/* + * Wait completion of previosly registered prefetch request. + * Prefetch result should be placed in LFC by prefetch_wait_for. 
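+ * Returns true if the page has been received and stored in the LFC; returns false if no
+ * prefetch request is registered for this buffer or the request was dropped (e.g. the page
+ * server connection was lost), in which case the caller must read the page itself.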
+ */ +bool +communicator_prefetch_receive(BufferTag tag) +{ + PrfHashEntry *entry; + PrefetchRequest hashkey; + + hashkey.buftag = tag; + entry = prfh_lookup(MyPState->prf_hash, &hashkey); + if (entry != NULL && prefetch_wait_for(entry->slot->my_ring_index)) + { + prefetch_set_unused(entry->slot->my_ring_index); + return true; + } + return false; +} + /* * Disconnect hook - drop prefetches when the connection drops * @@ -906,7 +930,6 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns NeonGetPageRequest request = { .hdr.tag = T_NeonGetPageRequest, - .hdr.reqid = GENERATE_REQUEST_ID(), /* lsn and not_modified_since are filled in below */ .rinfo = BufTagGetNRelFileInfo(slot->buftag), .forknum = slot->buftag.forkNum, @@ -915,8 +938,6 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns Assert(mySlotNo == MyPState->ring_unused); - slot->reqid = request.hdr.reqid; - if (force_request_lsns) slot->request_lsns = *force_request_lsns; else @@ -934,6 +955,7 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns Assert(mySlotNo == MyPState->ring_unused); /* loop */ } + slot->reqid = request.hdr.reqid; /* update prefetch state */ MyPState->n_requests_inflight += 1; @@ -1937,7 +1959,6 @@ communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns *r { NeonExistsRequest request = { .hdr.tag = T_NeonExistsRequest, - .hdr.reqid = GENERATE_REQUEST_ID(), .hdr.lsn = request_lsns->request_lsn, .hdr.not_modified_since = request_lsns->not_modified_since, .rinfo = rinfo, @@ -2212,7 +2233,6 @@ communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns * { NeonNblocksRequest request = { .hdr.tag = T_NeonNblocksRequest, - .hdr.reqid = GENERATE_REQUEST_ID(), .hdr.lsn = request_lsns->request_lsn, .hdr.not_modified_since = request_lsns->not_modified_since, .rinfo = rinfo, @@ -2285,7 +2305,6 @@ communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns) { NeonDbSizeRequest request = { .hdr.tag = T_NeonDbSizeRequest, - .hdr.reqid = GENERATE_REQUEST_ID(), .hdr.lsn = request_lsns->request_lsn, .hdr.not_modified_since = request_lsns->not_modified_since, .dbNode = dbNode, @@ -2353,7 +2372,6 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re request = (NeonGetSlruSegmentRequest) { .hdr.tag = T_NeonGetSlruSegmentRequest, - .hdr.reqid = GENERATE_REQUEST_ID(), .hdr.lsn = request_lsns->request_lsn, .hdr.not_modified_since = request_lsns->not_modified_since, .kind = kind, diff --git a/pgxn/neon/communicator.h b/pgxn/neon/communicator.h index 72cba526c1..f55c4b10f1 100644 --- a/pgxn/neon/communicator.h +++ b/pgxn/neon/communicator.h @@ -37,6 +37,8 @@ extern int communicator_prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber nblocks, void **buffers, bits8 *mask); extern void communicator_prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, BlockNumber nblocks, const bits8 *mask); +extern bool communicator_prefetch_receive(BufferTag tag); + extern int communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *request_lsns, void *buffer); diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 8c2990e57a..ecc55bb540 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -25,6 +25,7 @@ #include "pgstat.h" #include "port/pg_iovec.h" #include "postmaster/bgworker.h" +#include "postmaster/interrupt.h" #include RELFILEINFO_HDR #include "storage/buf_internals.h" #include "storage/fd.h" @@ -32,6 +33,8 
@@ #include "storage/latch.h" #include "storage/lwlock.h" #include "storage/pg_shmem.h" +#include "storage/procsignal.h" +#include "tcop/tcopprot.h" #include "utils/builtins.h" #include "utils/dynahash.h" #include "utils/guc.h" @@ -46,6 +49,8 @@ #include "neon.h" #include "neon_lwlsncache.h" #include "neon_perf_counters.h" +#include "pagestore_client.h" +#include "communicator.h" #define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "LFC: assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0) @@ -87,14 +92,13 @@ * 1Mb chunks can reduce hash map size to 320Mb. * 2. Improve access locality, subsequent pages will be allocated together improving seqscan speed */ -#define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */ -/* - * Smaller chunk seems to be better for OLTP workload - */ -// #define BLOCKS_PER_CHUNK 8 /* 64kb chunk */ +#define MAX_BLOCKS_PER_CHUNK_LOG 7 /* 1Mb chunk */ +#define MAX_BLOCKS_PER_CHUNK (1 << MAX_BLOCKS_PER_CHUNK_LOG) + #define MB ((uint64)1024*1024) -#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK)) +#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ >> lfc_chunk_size_log)) +#define BLOCK_TO_CHUNK_OFF(blkno) ((blkno) & (lfc_blocks_per_chunk-1)) /* * Blocks are read or written to LFC file outside LFC critical section. @@ -119,16 +123,26 @@ typedef struct FileCacheEntry uint32 hash; uint32 offset; uint32 access_count; - uint32 state[(BLOCKS_PER_CHUNK + 31) / 32 * 2]; /* two bits per block */ dlist_node list_node; /* LRU/holes list node */ + uint32 state[FLEXIBLE_ARRAY_MEMBER]; /* two bits per block */ } FileCacheEntry; +#define FILE_CACHE_ENRTY_SIZE MAXALIGN(offsetof(FileCacheEntry, state) + (lfc_blocks_per_chunk*2+31)/32*4) #define GET_STATE(entry, i) (((entry)->state[(i) / 16] >> ((i) % 16 * 2)) & 3) #define SET_STATE(entry, i, new_state) (entry)->state[(i) / 16] = ((entry)->state[(i) / 16] & ~(3 << ((i) % 16 * 2))) | ((new_state) << ((i) % 16 * 2)) #define N_COND_VARS 64 #define CV_WAIT_TIMEOUT 10 +#define MAX_PREWARM_WORKERS 8 + +typedef struct PrewarmWorkerState +{ + uint32 prewarmed_pages; + uint32 skipped_pages; + TimestampTz completed; +} PrewarmWorkerState; + typedef struct FileCacheControl { uint64 generation; /* generation is needed to handle correct hash @@ -136,6 +150,7 @@ typedef struct FileCacheControl uint32 size; /* size of cache file in chunks */ uint32 used; /* number of used chunks */ uint32 used_pages; /* number of used pages */ + uint32 pinned; /* number of pinned chunks */ uint32 limit; /* shared copy of lfc_size_limit */ uint64 hits; uint64 misses; @@ -149,23 +164,43 @@ typedef struct FileCacheControl dlist_head holes; /* double linked list of punched holes */ HyperLogLogState wss_estimation; /* estimation of working set size */ ConditionVariable cv[N_COND_VARS]; /* turnstile of condition variables */ + PrewarmWorkerState prewarm_workers[MAX_PREWARM_WORKERS]; + size_t n_prewarm_workers; + size_t n_prewarm_entries; + size_t total_prewarm_pages; + size_t prewarm_batch; + bool prewarm_active; + bool prewarm_canceled; + dsm_handle prewarm_lfc_state_handle; } FileCacheControl; -bool lfc_store_prefetch_result; +#define FILE_CACHE_STATE_MAGIC 0xfcfcfcfc + +#define FILE_CACHE_STATE_BITMAP(fcs) ((uint8*)&(fcs)->chunks[(fcs)->n_chunks]) +#define FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_chunks) (sizeof(FileCacheState) + (n_chunks)*sizeof(BufferTag) + (((n_chunks) * lfc_blocks_per_chunk)+7)/8) +#define FILE_CACHE_STATE_SIZE(fcs) (sizeof(FileCacheState) + (fcs->n_chunks)*sizeof(BufferTag) + (((fcs->n_chunks) << 
fcs->chunk_size_log)+7)/8) static HTAB *lfc_hash; static int lfc_desc = -1; static LWLockId lfc_lock; static int lfc_max_size; static int lfc_size_limit; +static int lfc_prewarm_limit; +static int lfc_prewarm_batch; +static int lfc_chunk_size_log = MAX_BLOCKS_PER_CHUNK_LOG; +static int lfc_blocks_per_chunk = MAX_BLOCKS_PER_CHUNK; static char *lfc_path; static uint64 lfc_generation; static FileCacheControl *lfc_ctl; +static bool lfc_do_prewarm; static shmem_startup_hook_type prev_shmem_startup_hook; #if PG_VERSION_NUM>=150000 static shmem_request_hook_type prev_shmem_request_hook; #endif +bool lfc_store_prefetch_result; +bool lfc_prewarm_update_ws_estimation; + #define LFC_ENABLED() (lfc_ctl->limit != 0) /* @@ -206,7 +241,9 @@ lfc_switch_off(void) } lfc_ctl->generation += 1; lfc_ctl->size = 0; + lfc_ctl->pinned = 0; lfc_ctl->used = 0; + lfc_ctl->used_pages = 0; lfc_ctl->limit = 0; dlist_init(&lfc_ctl->lru); dlist_init(&lfc_ctl->holes); @@ -296,7 +333,7 @@ lfc_shmem_startup(void) lfc_lock = (LWLockId) GetNamedLWLockTranche("lfc_lock"); info.keysize = sizeof(BufferTag); - info.entrysize = sizeof(FileCacheEntry); + info.entrysize = FILE_CACHE_ENRTY_SIZE; /* * n_chunks+1 because we add new element to hash table before eviction @@ -342,7 +379,7 @@ lfc_shmem_request(void) prev_shmem_request_hook(); #endif - RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size) + 1, sizeof(FileCacheEntry))); + RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size) + 1, FILE_CACHE_ENRTY_SIZE)); RequestNamedLWLockTranche("lfc_lock", 1); } @@ -359,6 +396,24 @@ is_normal_backend(void) return lfc_ctl && MyProc && UsedShmemSegAddr && !IsParallelWorker(); } +static bool +lfc_check_chunk_size(int *newval, void **extra, GucSource source) +{ + if (*newval & (*newval - 1)) + { + elog(ERROR, "LFC chunk size should be power of two"); + return false; + } + return true; +} + +static void +lfc_change_chunk_size(int newval, void* extra) +{ + lfc_chunk_size_log = pg_ceil_log2_32(newval); +} + + static bool lfc_check_limit_hook(int *newval, void **extra, GucSource source) { @@ -415,11 +470,11 @@ lfc_change_limit_hook(int newval, void *extra) CriticalAssert(victim->access_count == 0); #ifdef FALLOC_FL_PUNCH_HOLE - if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0) + if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * lfc_blocks_per_chunk * BLCKSZ, lfc_blocks_per_chunk * BLCKSZ) < 0) neon_log(LOG, "Failed to punch hole in file: %m"); #endif /* We remove the old entry, and re-enter a hole to the hash table */ - for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + for (int i = 0; i < lfc_blocks_per_chunk; i++) { bool is_page_cached = GET_STATE(victim, i) == AVAILABLE; lfc_ctl->used_pages -= is_page_cached; @@ -471,6 +526,17 @@ lfc_init(void) NULL, NULL); + DefineCustomBoolVariable("neon.prewarm_update_ws_estimation", + "Consider prewarmed pages for working set estimation", + NULL, + &lfc_prewarm_update_ws_estimation, + true, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + DefineCustomIntVariable("neon.max_file_cache_size", "Maximal size of Neon local file cache", NULL, @@ -508,6 +574,45 @@ lfc_init(void) NULL, NULL); + DefineCustomIntVariable("neon.file_cache_chunk_size", + "LFC chunk size in blocks (should be power of two)", + NULL, + &lfc_blocks_per_chunk, + MAX_BLOCKS_PER_CHUNK, + 1, + MAX_BLOCKS_PER_CHUNK, + 
PGC_POSTMASTER, + GUC_UNIT_BLOCKS, + lfc_check_chunk_size, + lfc_change_chunk_size, + NULL); + + DefineCustomIntVariable("neon.file_cache_prewarm_limit", + "Maximal number of prewarmed chunks", + NULL, + &lfc_prewarm_limit, + INT_MAX, /* no limit by default */ + 0, + INT_MAX, + PGC_SIGHUP, + 0, + NULL, + NULL, + NULL); + + DefineCustomIntVariable("neon.file_cache_prewarm_batch", + "Number of pages retrivied by prewarm from page server", + NULL, + &lfc_prewarm_batch, + 64, + 1, + INT_MAX, + PGC_SIGHUP, + 0, + NULL, + NULL, + NULL); + if (lfc_max_size == 0) return; @@ -521,6 +626,317 @@ lfc_init(void) #endif } +FileCacheState* +lfc_get_state(size_t max_entries) +{ + FileCacheState* fcs = NULL; + + if (lfc_maybe_disabled() || max_entries == 0) /* fast exit if file cache is disabled */ + return NULL; + + LWLockAcquire(lfc_lock, LW_SHARED); + + if (LFC_ENABLED()) + { + dlist_iter iter; + size_t i = 0; + uint8* bitmap; + size_t n_pages = 0; + size_t n_entries = Min(max_entries, lfc_ctl->used - lfc_ctl->pinned); + size_t state_size = FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_entries); + fcs = (FileCacheState*)palloc0(state_size); + SET_VARSIZE(fcs, state_size); + fcs->magic = FILE_CACHE_STATE_MAGIC; + fcs->chunk_size_log = lfc_chunk_size_log; + fcs->n_chunks = n_entries; + bitmap = FILE_CACHE_STATE_BITMAP(fcs); + + dlist_reverse_foreach(iter, &lfc_ctl->lru) + { + FileCacheEntry *entry = dlist_container(FileCacheEntry, list_node, iter.cur); + fcs->chunks[i] = entry->key; + for (int j = 0; j < lfc_blocks_per_chunk; j++) + { + if (GET_STATE(entry, j) != UNAVAILABLE) + { + BITMAP_SET(bitmap, i*lfc_blocks_per_chunk + j); + n_pages += 1; + } + } + if (++i == n_entries) + break; + } + Assert(i == n_entries); + fcs->n_pages = n_pages; + Assert(pg_popcount((char*)bitmap, ((n_entries << lfc_chunk_size_log) + 7)/8) == n_pages); + elog(LOG, "LFC: save state of %d chunks %d pages", (int)n_entries, (int)n_pages); + } + + LWLockRelease(lfc_lock); + + return fcs; +} + +/* + * Prewarm LFC cache to the specified state. It uses lfc_prefetch function to load prewarmed page without hoilding shared buffer lock + * and avoid race conditions with other backends. + */ +void +lfc_prewarm(FileCacheState* fcs, uint32 n_workers) +{ + size_t fcs_chunk_size_log; + size_t n_entries; + size_t prewarm_batch = Min(lfc_prewarm_batch, readahead_buffer_size); + size_t fcs_size; + dsm_segment *seg; + BackgroundWorkerHandle* bgw_handle[MAX_PREWARM_WORKERS]; + + + if (!lfc_ensure_opened()) + return; + + if (prewarm_batch == 0 || lfc_prewarm_limit == 0 || n_workers == 0) + { + elog(LOG, "LFC: prewarm is disabled"); + return; + } + + if (n_workers > MAX_PREWARM_WORKERS) + { + elog(ERROR, "LFC: Too much prewarm workers, maximum is %d", MAX_PREWARM_WORKERS); + } + + if (fcs == NULL || fcs->n_chunks == 0) + { + elog(LOG, "LFC: nothing to prewarm"); + return; + } + + if (fcs->magic != FILE_CACHE_STATE_MAGIC) + { + elog(ERROR, "LFC: Invalid file cache state magic: %X", fcs->magic); + } + + fcs_size = VARSIZE(fcs); + if (FILE_CACHE_STATE_SIZE(fcs) != fcs_size) + { + elog(ERROR, "LFC: Invalid file cache state size: %u vs. 
%u", (unsigned)FILE_CACHE_STATE_SIZE(fcs), VARSIZE(fcs)); + } + + fcs_chunk_size_log = fcs->chunk_size_log; + if (fcs_chunk_size_log > MAX_BLOCKS_PER_CHUNK_LOG) + { + elog(ERROR, "LFC: Invalid chunk size log: %u", fcs->chunk_size_log); + } + + n_entries = Min(fcs->n_chunks, lfc_prewarm_limit); + Assert(n_entries != 0); + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + /* Do not prewarm more entries than LFC limit */ + if (lfc_ctl->limit <= lfc_ctl->size) + { + elog(LOG, "LFC: skip prewarm because LFC is already filled"); + LWLockRelease(lfc_lock); + return; + } + + if (lfc_ctl->prewarm_active) + { + LWLockRelease(lfc_lock); + elog(ERROR, "LFC: skip prewarm because another prewarm is still active"); + } + lfc_ctl->n_prewarm_entries = n_entries; + lfc_ctl->n_prewarm_workers = n_workers; + lfc_ctl->prewarm_active = true; + lfc_ctl->prewarm_canceled = false; + lfc_ctl->prewarm_batch = prewarm_batch; + memset(lfc_ctl->prewarm_workers, 0, n_workers*sizeof(PrewarmWorkerState)); + + LWLockRelease(lfc_lock); + + /* Calculate total number of pages to be prewarmed */ + lfc_ctl->total_prewarm_pages = fcs->n_pages; + + seg = dsm_create(fcs_size, 0); + memcpy(dsm_segment_address(seg), fcs, fcs_size); + lfc_ctl->prewarm_lfc_state_handle = dsm_segment_handle(seg); + + /* Spawn background workers */ + for (uint32 i = 0; i < n_workers; i++) + { + BackgroundWorker worker = {0}; + + worker.bgw_flags = BGWORKER_SHMEM_ACCESS; + worker.bgw_start_time = BgWorkerStart_ConsistentState; + worker.bgw_restart_time = BGW_NEVER_RESTART; + strcpy(worker.bgw_library_name, "neon"); + strcpy(worker.bgw_function_name, "lfc_prewarm_main"); + snprintf(worker.bgw_name, BGW_MAXLEN, "LFC prewarm worker %d", i+1); + strcpy(worker.bgw_type, "LFC prewarm worker"); + worker.bgw_main_arg = Int32GetDatum(i); + /* must set notify PID to wait for shutdown */ + worker.bgw_notify_pid = MyProcPid; + + if (!RegisterDynamicBackgroundWorker(&worker, &bgw_handle[i])) + { + ereport(LOG, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("LFC: registering dynamic bgworker prewarm failed"), + errhint("Consider increasing the configuration parameter \"%s\".", "max_worker_processes"))); + n_workers = i; + lfc_ctl->prewarm_canceled = true; + break; + } + } + + for (uint32 i = 0; i < n_workers; i++) + { + bool interrupted; + do + { + interrupted = false; + PG_TRY(); + { + BgwHandleStatus status = WaitForBackgroundWorkerShutdown(bgw_handle[i]); + if (status != BGWH_STOPPED && status != BGWH_POSTMASTER_DIED) + { + elog(LOG, "LFC: Unexpected status of prewarm worker termination: %d", status); + } + } + PG_CATCH(); + { + elog(LOG, "LFC: cancel prewarm"); + lfc_ctl->prewarm_canceled = true; + interrupted = true; + } + PG_END_TRY(); + } while (interrupted); + + if (!lfc_ctl->prewarm_workers[i].completed) + { + /* Background worker doesn't set completion time: it means that it was abnormally terminated */ + elog(LOG, "LFC: prewarm worker %d failed", i+1); + /* Set completion time to prevent get_prewarm_info from considering this worker as active */ + lfc_ctl->prewarm_workers[i].completed = GetCurrentTimestamp(); + } + } + dsm_detach(seg); + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + lfc_ctl->prewarm_active = false; + LWLockRelease(lfc_lock); +} + +void +lfc_prewarm_main(Datum main_arg) +{ + size_t snd_idx = 0, rcv_idx = 0; + size_t n_sent = 0, n_received = 0; + size_t fcs_chunk_size_log; + size_t max_prefetch_pages; + size_t prewarm_batch; + size_t n_workers; + dsm_segment *seg; + FileCacheState* fcs; + uint8* bitmap; + BufferTag tag; + PrewarmWorkerState* 
ws; + uint32 worker_id = DatumGetInt32(main_arg); + + pqsignal(SIGTERM, die); + BackgroundWorkerUnblockSignals(); + + seg = dsm_attach(lfc_ctl->prewarm_lfc_state_handle); + if (seg == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not map dynamic shared memory segment"))); + + fcs = (FileCacheState*) dsm_segment_address(seg); + prewarm_batch = lfc_ctl->prewarm_batch; + fcs_chunk_size_log = fcs->chunk_size_log; + n_workers = lfc_ctl->n_prewarm_workers; + max_prefetch_pages = lfc_ctl->n_prewarm_entries << fcs_chunk_size_log; + ws = &lfc_ctl->prewarm_workers[worker_id]; + bitmap = FILE_CACHE_STATE_BITMAP(fcs); + + /* enable prefetch in LFC */ + lfc_store_prefetch_result = true; + lfc_do_prewarm = true; /* Flag for lfc_prefetch preventing replacement of existed entries if LFC cache is full */ + + elog(LOG, "LFC: worker %d start prewarming", worker_id); + while (!lfc_ctl->prewarm_canceled) + { + if (snd_idx < max_prefetch_pages) + { + if ((snd_idx >> fcs_chunk_size_log) % n_workers != worker_id) + { + /* If there are multiple workers, split chunks between them */ + snd_idx += 1 << fcs_chunk_size_log; + } + else + { + if (BITMAP_ISSET(bitmap, snd_idx)) + { + tag = fcs->chunks[snd_idx >> fcs_chunk_size_log]; + tag.blockNum += snd_idx & ((1 << fcs_chunk_size_log) - 1); + if (!lfc_cache_contains(BufTagGetNRelFileInfo(tag), tag.forkNum, tag.blockNum)) + { + (void)communicator_prefetch_register_bufferv(tag, NULL, 1, NULL); + n_sent += 1; + } + else + { + ws->skipped_pages += 1; + BITMAP_CLR(bitmap, snd_idx); + } + } + snd_idx += 1; + } + } + if (n_sent >= n_received + prewarm_batch || snd_idx == max_prefetch_pages) + { + if (n_received == n_sent && snd_idx == max_prefetch_pages) + { + break; + } + if ((rcv_idx >> fcs_chunk_size_log) % n_workers != worker_id) + { + /* Skip chunks processed by other workers */ + rcv_idx += 1 << fcs_chunk_size_log; + continue; + } + + /* Locate next block to prefetch */ + while (!BITMAP_ISSET(bitmap, rcv_idx)) + { + rcv_idx += 1; + } + tag = fcs->chunks[rcv_idx >> fcs_chunk_size_log]; + tag.blockNum += rcv_idx & ((1 << fcs_chunk_size_log) - 1); + if (communicator_prefetch_receive(tag)) + { + ws->prewarmed_pages += 1; + } + else + { + ws->skipped_pages += 1; + } + rcv_idx += 1; + n_received += 1; + } + } + /* No need to perform prefetch cleanup here because prewarm worker will be terminated and + * connection to PS dropped just after return from this function. + */ + Assert(n_sent == n_received || lfc_ctl->prewarm_canceled); + elog(LOG, "LFC: worker %d complete prewarming: loaded %ld pages", worker_id, (long)n_received); + lfc_ctl->prewarm_workers[worker_id].completed = GetCurrentTimestamp(); +} + + /* * Check if page is present in the cache. * Returns true if page is found in local cache. 
@@ -530,7 +946,7 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) { BufferTag tag; FileCacheEntry *entry; - int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); + int chunk_offs = BLOCK_TO_CHUNK_OFF(blkno); bool found = false; uint32 hash; @@ -539,7 +955,7 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; - tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); + tag.blockNum = blkno - chunk_offs; CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); hash = get_hash_value(lfc_hash, &tag); @@ -577,9 +993,9 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); + chunk_offs = BLOCK_TO_CHUNK_OFF(blkno); + tag.blockNum = blkno - chunk_offs; hash = get_hash_value(lfc_hash, &tag); - chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); LWLockAcquire(lfc_lock, LW_SHARED); @@ -590,12 +1006,12 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, } while (true) { - int this_chunk = Min(nblocks - i, BLOCKS_PER_CHUNK - chunk_offs); + int this_chunk = Min(nblocks - i, lfc_blocks_per_chunk - chunk_offs); entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); if (entry != NULL) { - for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++) + for (; chunk_offs < lfc_blocks_per_chunk && i < nblocks; chunk_offs++, i++) { if (GET_STATE(entry, chunk_offs) != UNAVAILABLE) { @@ -619,9 +1035,9 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, * Prepare for the next iteration. We don't unlock here, as that'd * probably be more expensive than the gains it'd get us. 
*/ - tag.blockNum = (blkno + i) & ~(BLOCKS_PER_CHUNK - 1); + chunk_offs = BLOCK_TO_CHUNK_OFF(blkno + i); + tag.blockNum = (blkno + i) - chunk_offs; hash = get_hash_value(lfc_hash, &tag); - chunk_offs = (blkno + i) & (BLOCKS_PER_CHUNK - 1); } LWLockRelease(lfc_lock); @@ -696,9 +1112,9 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, while (nblocks > 0) { struct iovec iov[PG_IOV_MAX]; - int8 chunk_mask[BLOCKS_PER_CHUNK / 8] = {0}; - int chunk_offs = (blkno & (BLOCKS_PER_CHUNK - 1)); - int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); + uint8 chunk_mask[MAX_BLOCKS_PER_CHUNK / 8] = {0}; + int chunk_offs = BLOCK_TO_CHUNK_OFF(blkno); + int blocks_in_chunk = Min(nblocks, lfc_blocks_per_chunk - chunk_offs); int iteration_hits = 0; int iteration_misses = 0; uint64 io_time_us = 0; @@ -786,8 +1202,10 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, /* Unlink entry from LRU list to pin it for the duration of IO operation */ if (entry->access_count++ == 0) + { + lfc_ctl->pinned += 1; dlist_delete(&entry->list_node); - + } generation = lfc_ctl->generation; entry_offset = entry->offset; @@ -836,7 +1254,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (iteration_hits != 0) { /* chunk offset (# of pages) into the LFC file */ - off_t first_read_offset = (off_t) entry_offset * BLOCKS_PER_CHUNK; + off_t first_read_offset = (off_t) entry_offset * lfc_blocks_per_chunk; int nwrite = iov_last_used - first_block_in_chunk_read; /* offset of first IOV */ first_read_offset += chunk_offs + first_block_in_chunk_read; @@ -884,7 +1302,10 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, CriticalAssert(entry->access_count > 0); if (--entry->access_count == 0) + { + lfc_ctl->pinned -= 1; dlist_push_tail(&lfc_ctl->lru, &entry->list_node); + } } else { @@ -954,14 +1375,17 @@ lfc_init_new_entry(FileCacheEntry* entry, uint32 hash) * If we can't (e.g. because all other slots are being accessed) * then we will remove this entry from the hash and continue * on to the next chunk, as we may not exceed the limit. + * + * While prewarming the LFC we do not want to replace existing entries, + * so we simply stop prewarming if the LFC cache is full.
*/ - else if (!dlist_is_empty(&lfc_ctl->lru)) + else if (!dlist_is_empty(&lfc_ctl->lru) && !lfc_do_prewarm) { /* Cache overflow: evict least recently used chunk */ FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru)); - for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + for (int i = 0; i < lfc_blocks_per_chunk; i++) { bool is_page_cached = GET_STATE(victim, i) == AVAILABLE; lfc_ctl->used_pages -= is_page_cached; @@ -979,14 +1403,15 @@ lfc_init_new_entry(FileCacheEntry* entry, uint32 hash) /* Can't add this chunk - we don't have the space for it */ hash_search_with_hash_value(lfc_hash, &entry->key, hash, HASH_REMOVE, NULL); - + lfc_ctl->prewarm_canceled = true; /* cancel prewarm if LFC limit is reached */ return false; } entry->access_count = 1; entry->hash = hash; + lfc_ctl->pinned += 1; - for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + for (int i = 0; i < lfc_blocks_per_chunk; i++) SET_STATE(entry, i, UNAVAILABLE); return true; @@ -1031,7 +1456,7 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, FileCacheBlockState state; XLogRecPtr lwlsn; - int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); + int chunk_offs = BLOCK_TO_CHUNK_OFF(blkno); if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ return false; @@ -1041,7 +1466,7 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); + tag.blockNum = blkno - chunk_offs; hash = get_hash_value(lfc_hash, &tag); cv = &lfc_ctl->cv[hash % N_COND_VARS]; @@ -1052,7 +1477,7 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, LWLockRelease(lfc_lock); return false; } - + lwlsn = neon_get_lwlsn(rinfo, forknum, blkno); if (lwlsn > lsn) @@ -1065,9 +1490,11 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); - tag.blockNum = blkno; - addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); - + if (lfc_prewarm_update_ws_estimation) + { + tag.blockNum = blkno; + addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + } if (found) { state = GET_STATE(entry, chunk_offs); @@ -1081,7 +1508,10 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, * operation */ if (entry->access_count++ == 0) + { + lfc_ctl->pinned += 1; dlist_delete(&entry->list_node); + } } else { @@ -1106,7 +1536,7 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE); INSTR_TIME_SET_CURRENT(io_start); rc = pwrite(lfc_desc, buffer, BLCKSZ, - ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + ((off_t) entry_offset * lfc_blocks_per_chunk + chunk_offs) * BLCKSZ); INSTR_TIME_SET_CURRENT(io_end); pgstat_report_wait_end(); @@ -1132,7 +1562,10 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, inc_page_cache_write_wait(time_spent_us); if (--entry->access_count == 0) + { + lfc_ctl->pinned -= 1; dlist_push_tail(&lfc_ctl->lru, &entry->list_node); + } state = GET_STATE(entry, chunk_offs); if (state == REQUESTED) { @@ -1199,8 +1632,8 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, while (nblocks > 0) { struct iovec iov[PG_IOV_MAX]; - int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); - int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); + 
int chunk_offs = BLOCK_TO_CHUNK_OFF(blkno); + int blocks_in_chunk = Min(nblocks, lfc_blocks_per_chunk - chunk_offs); instr_time io_start, io_end; ConditionVariable* cv; @@ -1212,7 +1645,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, iov[i].iov_len = BLCKSZ; } - tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); + tag.blockNum = blkno - chunk_offs; hash = get_hash_value(lfc_hash, &tag); cv = &lfc_ctl->cv[hash % N_COND_VARS]; @@ -1232,7 +1665,10 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, * operation */ if (entry->access_count++ == 0) + { + lfc_ctl->pinned += 1; dlist_delete(&entry->list_node); + } } else { @@ -1285,7 +1721,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE); INSTR_TIME_SET_CURRENT(io_start); rc = pwritev(lfc_desc, iov, blocks_in_chunk, - ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + ((off_t) entry_offset * lfc_blocks_per_chunk + chunk_offs) * BLCKSZ); INSTR_TIME_SET_CURRENT(io_end); pgstat_report_wait_end(); @@ -1312,7 +1748,10 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, inc_page_cache_write_wait(time_spent_us); if (--entry->access_count == 0) + { + lfc_ctl->pinned -= 1; dlist_push_tail(&lfc_ctl->lru, &entry->list_node); + } for (int i = 0; i < blocks_in_chunk; i++) { @@ -1438,7 +1877,12 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS) break; case 8: key = "file_cache_chunk_size_pages"; - value = BLOCKS_PER_CHUNK; + value = lfc_blocks_per_chunk; + break; + case 9: + key = "file_cache_chunks_pinned"; + if (lfc_ctl) + value = lfc_ctl->pinned; break; default: SRF_RETURN_DONE(funcctx); @@ -1566,7 +2010,7 @@ local_cache_pages(PG_FUNCTION_ARGS) /* Skip hole tags */ if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0) { - for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + for (int i = 0; i < lfc_blocks_per_chunk; i++) n_pages += GET_STATE(entry, i) == AVAILABLE; } } @@ -1594,13 +2038,13 @@ local_cache_pages(PG_FUNCTION_ARGS) hash_seq_init(&status, lfc_hash); while ((entry = hash_seq_search(&status)) != NULL) { - for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + for (int i = 0; i < lfc_blocks_per_chunk; i++) { if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0) { if (GET_STATE(entry, i) == AVAILABLE) { - fctx->record[n].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i; + fctx->record[n].pageoffs = entry->offset * lfc_blocks_per_chunk + i; fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)); fctx->record[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key)); fctx->record[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key)); @@ -1684,3 +2128,82 @@ approximate_working_set_size(PG_FUNCTION_ARGS) } PG_RETURN_NULL(); } + +PG_FUNCTION_INFO_V1(get_local_cache_state); + +Datum +get_local_cache_state(PG_FUNCTION_ARGS) +{ + size_t max_entries = PG_ARGISNULL(0) ? 
lfc_prewarm_limit : PG_GETARG_INT32(0); + FileCacheState* fcs = lfc_get_state(max_entries); + if (fcs != NULL) + PG_RETURN_BYTEA_P((bytea*)fcs); + else + PG_RETURN_NULL(); +} + +PG_FUNCTION_INFO_V1(prewarm_local_cache); + +Datum +prewarm_local_cache(PG_FUNCTION_ARGS) +{ + bytea* state = PG_GETARG_BYTEA_PP(0); + uint32 n_workers = PG_GETARG_INT32(1); + FileCacheState* fcs = (FileCacheState*)state; + + lfc_prewarm(fcs, n_workers); + + PG_RETURN_NULL(); +} + +PG_FUNCTION_INFO_V1(get_prewarm_info); + +Datum +get_prewarm_info(PG_FUNCTION_ARGS) +{ + Datum values[4]; + bool nulls[4]; + TupleDesc tupdesc; + uint32 prewarmed_pages = 0; + uint32 skipped_pages = 0; + uint32 active_workers = 0; + uint32 total_pages; + size_t n_workers; + + if (lfc_size_limit == 0) + PG_RETURN_NULL(); + + LWLockAcquire(lfc_lock, LW_SHARED); + if (!lfc_ctl || lfc_ctl->n_prewarm_workers == 0) + { + LWLockRelease(lfc_lock); + PG_RETURN_NULL(); + } + n_workers = lfc_ctl->n_prewarm_workers; + total_pages = lfc_ctl->total_prewarm_pages; + for (size_t i = 0; i < n_workers; i++) + { + PrewarmWorkerState* ws = &lfc_ctl->prewarm_workers[i]; + prewarmed_pages += ws->prewarmed_pages; + skipped_pages += ws->skipped_pages; + active_workers += ws->completed != 0; + } + LWLockRelease(lfc_lock); + + tupdesc = CreateTemplateTupleDesc(4); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "total_pages", INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "prewarmed_pages", INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "skipped_pages", INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "active_workers", INT4OID, -1, 0); + tupdesc = BlessTupleDesc(tupdesc); + + MemSet(nulls, 0, sizeof(nulls)); + + values[0] = Int32GetDatum(total_pages); + values[1] = Int32GetDatum(prewarmed_pages); + values[2] = Int32GetDatum(skipped_pages); + values[3] = Int32GetDatum(active_workers); + + PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); +} + diff --git a/pgxn/neon/file_cache.h b/pgxn/neon/file_cache.h index 849558b83d..c7b6b09f72 100644 --- a/pgxn/neon/file_cache.h +++ b/pgxn/neon/file_cache.h @@ -13,6 +13,17 @@ #include "neon_pgversioncompat.h" +typedef struct FileCacheState +{ + int32 vl_len_; /* varlena header (do not touch directly!) 
*/ + uint32 magic; + uint32 n_chunks; + uint32 n_pages; + uint16 chunk_size_log; + BufferTag chunks[FLEXIBLE_ARRAY_MEMBER]; + /* followed by bitmap */ +} FileCacheState; + /* GUCs */ extern bool lfc_store_prefetch_result; @@ -32,7 +43,10 @@ extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, extern void lfc_init(void); extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, const void* buffer, XLogRecPtr lsn); +extern FileCacheState* lfc_get_state(size_t max_entries); +extern void lfc_prewarm(FileCacheState* fcs, uint32 n_workers); +PGDLLEXPORT void lfc_prewarm_main(Datum main_arg); static inline bool lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 64d38e7913..ee4e6ccc5b 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -26,6 +26,7 @@ #include "portability/instr_time.h" #include "postmaster/interrupt.h" #include "storage/buf_internals.h" +#include "storage/fd.h" #include "storage/ipc.h" #include "storage/lwlock.h" #include "storage/pg_shmem.h" @@ -48,7 +49,6 @@ #define MIN_RECONNECT_INTERVAL_USEC 1000 #define MAX_RECONNECT_INTERVAL_USEC 1000000 - enum NeonComputeMode { CP_MODE_PRIMARY = 0, CP_MODE_REPLICA, @@ -80,6 +80,7 @@ int neon_protocol_version = 3; static int neon_compute_mode = 0; static int max_reconnect_attempts = 60; static int stripe_size; +static int max_sockets; static int pageserver_response_log_timeout = 10000; /* 2.5 minutes. A bit higher than highest default TCP retransmission timeout */ @@ -167,6 +168,9 @@ typedef struct WaitEventSet *wes_read; } PageServer; +static uint32 local_request_counter; +#define GENERATE_REQUEST_ID() (((NeonRequestId)MyProcPid << 32) | ++local_request_counter) + static PageServer page_servers[MAX_SHARDS]; static bool pageserver_flush(shardno_t shard_no); @@ -334,6 +338,13 @@ load_shard_map(shardno_t shard_no, char *connstr_p, shardno_t *num_shards_p) pageserver_disconnect(i); } pagestore_local_counter = end_update_counter; + + /* Reserve file descriptors for sockets */ + while (max_sockets < num_shards) + { + max_sockets += 1; + ReserveExternalFD(); + } } if (num_shards_p) @@ -734,8 +745,8 @@ pageserver_connect(shardno_t shard_no, int elevel) default: neon_shard_log(shard_no, ERROR, "libpagestore: invalid connection state %d", shard->state); } - /* This shouldn't be hit */ - Assert(false); + + pg_unreachable(); } static void @@ -875,6 +886,7 @@ retry: int port; int sndbuf; int recvbuf; + uint64* max_wait; get_local_port(PQsocket(pageserver_conn), &port); get_socket_stats(PQsocket(pageserver_conn), &sndbuf, &recvbuf); @@ -885,7 +897,10 @@ retry: shard->nrequests_sent, shard->nresponses_received, port, sndbuf, recvbuf, pageserver_conn->inStart, pageserver_conn->inEnd); shard->receive_last_log_time = now; + MyNeonCounters->compute_getpage_stuck_requests_total += !shard->receive_logged; shard->receive_logged = true; + max_wait = &MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms; + *max_wait = Max(*max_wait, INSTR_TIME_GET_MILLISEC(since_start)); } /* @@ -908,6 +923,7 @@ retry: get_local_port(PQsocket(pageserver_conn), &port); neon_shard_log(shard_no, LOG, "no response from pageserver for %0.3f s, disconnecting (socket port=%d)", INSTR_TIME_GET_DOUBLE(since_start), port); + MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms = 0; pageserver_disconnect(shard_no); return -1; } @@ -931,6 +947,7 @@ retry: INSTR_TIME_SET_ZERO(shard->receive_start_time); 
INSTR_TIME_SET_ZERO(shard->receive_last_log_time); shard->receive_logged = false; + MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms = 0; return ret; } @@ -994,6 +1011,7 @@ pageserver_send(shardno_t shard_no, NeonRequest *request) pageserver_conn = NULL; } + request->reqid = GENERATE_REQUEST_ID(); req_buff = nm_pack_request(request); /* diff --git a/pgxn/neon/neon--1.5--1.6.sql b/pgxn/neon/neon--1.5--1.6.sql new file mode 100644 index 0000000000..c05f0f87aa --- /dev/null +++ b/pgxn/neon/neon--1.5--1.6.sql @@ -0,0 +1,22 @@ +\echo Use "ALTER EXTENSION neon UPDATE TO '1.6'" to load this file. \quit + +CREATE FUNCTION get_prewarm_info(out total_pages integer, out prewarmed_pages integer, out skipped_pages integer, out active_workers integer) +RETURNS record +AS 'MODULE_PATHNAME', 'get_prewarm_info' +LANGUAGE C STRICT +PARALLEL SAFE; + +CREATE FUNCTION get_local_cache_state(max_chunks integer default null) +RETURNS bytea +AS 'MODULE_PATHNAME', 'get_local_cache_state' +LANGUAGE C +PARALLEL UNSAFE; + +CREATE FUNCTION prewarm_local_cache(state bytea, n_workers integer default 1) +RETURNS void +AS 'MODULE_PATHNAME', 'prewarm_local_cache' +LANGUAGE C STRICT +PARALLEL UNSAFE; + + + diff --git a/pgxn/neon/neon--1.6--1.5.sql b/pgxn/neon/neon--1.6--1.5.sql new file mode 100644 index 0000000000..57512980f5 --- /dev/null +++ b/pgxn/neon/neon--1.6--1.5.sql @@ -0,0 +1,7 @@ +DROP FUNCTION IF EXISTS get_prewarm_info(out total_pages integer, out prewarmed_pages integer, out skipped_pages integer, out active_workers integer); + +DROP FUNCTION IF EXISTS get_local_cache_state(max_chunks integer); + +DROP FUNCTION IF EXISTS prewarm_local_cache(state bytea, n_workers integer default 1); + + diff --git a/pgxn/neon/neon_lwlsncache.c b/pgxn/neon/neon_lwlsncache.c index 59222eb855..a8cfa0f825 100644 --- a/pgxn/neon/neon_lwlsncache.c +++ b/pgxn/neon/neon_lwlsncache.c @@ -4,6 +4,7 @@ #include "miscadmin.h" #include "access/xlog.h" +#include "access/xlog_internal.h" #include "storage/ipc.h" #include "storage/shmem.h" #include "storage/buf_internals.h" @@ -399,6 +400,7 @@ neon_set_lwlsn_block_range(XLogRecPtr lsn, NRelFileInfo rlocator, ForkNumber for if (lsn == InvalidXLogRecPtr || n_blocks == 0 || LwLsnCache->lastWrittenLsnCacheSize == 0) return lsn; + Assert(lsn >= WalSegMinSize); LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); lsn = SetLastWrittenLSNForBlockRangeInternal(lsn, rlocator, forknum, from, n_blocks); LWLockRelease(LastWrittenLsnLock); @@ -435,7 +437,6 @@ neon_set_lwlsn_block_v(const XLogRecPtr *lsns, NRelFileInfo relfilenode, NInfoGetRelNumber(relfilenode) == InvalidOid) return InvalidXLogRecPtr; - BufTagInit(key, relNumber, forknum, blockno, spcOid, dbOid); LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); @@ -444,6 +445,10 @@ neon_set_lwlsn_block_v(const XLogRecPtr *lsns, NRelFileInfo relfilenode, { XLogRecPtr lsn = lsns[i]; + if (lsn == InvalidXLogRecPtr) + continue; + + Assert(lsn >= WalSegMinSize); key.blockNum = blockno + i; entry = hash_search(lastWrittenLsnCache, &key, HASH_ENTER, &found); if (found) @@ -505,4 +510,5 @@ neon_set_lwlsn_db(XLogRecPtr lsn) { NRelFileInfo dummyNode = {InvalidOid, InvalidOid, InvalidOid}; return neon_set_lwlsn_block(lsn, dummyNode, MAIN_FORKNUM, 0); -} \ No newline at end of file +} + diff --git a/pgxn/neon/neon_perf_counters.c b/pgxn/neon/neon_perf_counters.c index 05db187076..c77d99d636 100644 --- a/pgxn/neon/neon_perf_counters.c +++ b/pgxn/neon/neon_perf_counters.c @@ -148,7 +148,7 @@ histogram_to_metrics(IOHistogram histogram, static metric_t * 
neon_perf_counters_to_metrics(neon_per_backend_counters *counters) { -#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 10) +#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 12) metric_t *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t)); int i = 0; @@ -166,6 +166,8 @@ neon_perf_counters_to_metrics(neon_per_backend_counters *counters) APPEND_METRIC(getpage_prefetch_requests_total); APPEND_METRIC(getpage_sync_requests_total); + APPEND_METRIC(compute_getpage_stuck_requests_total); + APPEND_METRIC(compute_getpage_max_inflight_stuck_time_ms); APPEND_METRIC(getpage_prefetch_misses_total); APPEND_METRIC(getpage_prefetch_discards_total); APPEND_METRIC(pageserver_requests_sent_total); @@ -294,6 +296,11 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) totals.file_cache_hits_total += counters->file_cache_hits_total; histogram_merge_into(&totals.file_cache_read_hist, &counters->file_cache_read_hist); histogram_merge_into(&totals.file_cache_write_hist, &counters->file_cache_write_hist); + + totals.compute_getpage_stuck_requests_total += counters->compute_getpage_stuck_requests_total; + totals.compute_getpage_max_inflight_stuck_time_ms = Max( + totals.compute_getpage_max_inflight_stuck_time_ms, + counters->compute_getpage_max_inflight_stuck_time_ms); } metrics = neon_perf_counters_to_metrics(&totals); diff --git a/pgxn/neon/neon_perf_counters.h b/pgxn/neon/neon_perf_counters.h index 5f5330bb69..10cf094d4a 100644 --- a/pgxn/neon/neon_perf_counters.h +++ b/pgxn/neon/neon_perf_counters.h @@ -57,6 +57,18 @@ typedef struct uint64 getpage_prefetch_requests_total; uint64 getpage_sync_requests_total; + /* + * Total number of Getpage requests left without an answer for more than + * pageserver_response_log_timeout but less than pageserver_response_disconnect_timeout + */ + uint64 compute_getpage_stuck_requests_total; + + /* + * Longest waiting time for active stuck requests. If a stuck request gets a + * response or disconnects, this metric is updated + */ + uint64 compute_getpage_max_inflight_stuck_time_ms; + /* * Total number of readahead misses; consisting of either prefetches that * don't satisfy the LSN bounds, or cases where no readahead was issued diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c index be2c4ddf79..d5e3a38dbb 100644 --- a/pgxn/neon/neon_walreader.c +++ b/pgxn/neon/neon_walreader.c @@ -150,7 +150,7 @@ NeonWALReaderFree(NeonWALReader *state) * fetched from timeline 'tli'. * * Returns NEON_WALREAD_SUCCESS if succeeded, NEON_WALREAD_ERROR if an error - * occurs, in which case 'err' has the desciption. Error always closes remote + * occurs, in which case 'err' has the description. Error always closes remote * connection, if there was any, so socket subscription should be removed. * * NEON_WALREAD_WOULDBLOCK means caller should obtain socket to wait for with diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 0ab539fe56..9df202290d 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -65,7 +65,6 @@ typedef enum { SLRU_MULTIXACT_OFFSETS } SlruKind; - /*-- * supertype of all the Neon*Request structs below. * @@ -129,6 +128,7 @@ typedef struct int segno; } NeonGetSlruSegmentRequest; + /* supertype of all the Neon*Response structs below */ typedef NeonMessage NeonResponse; @@ -187,6 +187,7 @@ typedef struct { /* * Send this request to the PageServer associated with this shard. + * This function assigns request_id to the request which can be extracted by caller from request struct. 
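The `send` comment above refers to the request id that `pageserver_send` now assigns through `GENERATE_REQUEST_ID()`: the backend pid occupies the high 32 bits and a per-backend counter the low 32 bits, so the caller can read a unique, per-backend-monotonic id back from the request struct after sending. A standalone sketch of that packing follows; it assumes `NeonRequestId` is a 64-bit unsigned integer, and only the shift-and-or layout is taken from the patch.

```c
#include <stdint.h>
#include <stdio.h>

typedef uint64_t NeonRequestId;      /* assumption: 64-bit request id, as implied by the macro */

static uint32_t fake_pid = 12345;    /* stands in for MyProcPid */
static uint32_t local_request_counter;

static NeonRequestId generate_request_id(void)
{
	/* Same shape as GENERATE_REQUEST_ID() in libpagestore.c */
	return ((NeonRequestId) fake_pid << 32) | ++local_request_counter;
}

int main(void)
{
	for (int i = 0; i < 3; i++)
	{
		NeonRequestId id = generate_request_id();
		printf("id=%llx pid=%u seq=%u\n",
		       (unsigned long long) id,
		       (unsigned) (id >> 32),
		       (unsigned) (id & 0xffffffffu));
	}
	return 0;
}
```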
*/ bool (*send) (shardno_t shard_no, NeonRequest * request); /* diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 3bf0bedf99..87eb420717 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1989,8 +1989,14 @@ neon_start_unlogged_build(SMgrRelation reln) neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } +#if PG_MAJORVERSION_NUM >= 17 + /* + * We have to disable this check for pg14-16 because a sorted build of a GIST index requires + * performing the unlogged build several times + */ if (smgrnblocks(reln, MAIN_FORKNUM) != 0) neon_log(ERROR, "cannot perform unlogged index build, index is not empty "); +#endif unlogged_build_rel = reln; unlogged_build_phase = UNLOGGED_BUILD_PHASE_1; diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index b95b1451e4..3befb42030 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -124,6 +124,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) } else { + wp->safekeepers_generation = INVALID_GENERATION; host = wp->config->safekeepers_list; } wp_log(LOG, "safekeepers_generation=%u", wp->safekeepers_generation); @@ -756,7 +757,7 @@ UpdateMemberSafekeeperPtr(WalProposer *wp, Safekeeper *sk) { SafekeeperId *sk_id = &wp->mconf.members.m[i]; - if (wp->mconf.members.m[i].node_id == sk->greetResponse.nodeId) + if (sk_id->node_id == sk->greetResponse.nodeId) { /* * If mconf or list of safekeepers to connect to changed (the @@ -781,7 +782,7 @@ UpdateMemberSafekeeperPtr(WalProposer *wp, Safekeeper *sk) { SafekeeperId *sk_id = &wp->mconf.new_members.m[i]; - if (wp->mconf.new_members.m[i].node_id == sk->greetResponse.nodeId) + if (sk_id->node_id == sk->greetResponse.nodeId) { if (wp->new_members_safekeepers[i] != NULL && wp->new_members_safekeepers[i] != sk) { @@ -836,7 +837,7 @@ TermsCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInf { uint32 n_greeted = 0; - for (uint32 i = 0; i < wp->mconf.members.len; i++) + for (uint32 i = 0; i < mset->len; i++) { Safekeeper *sk = msk[i]; @@ -1071,7 +1072,6 @@ RecvVoteResponse(Safekeeper *sk) /* ready for elected message */ sk->state = SS_WAIT_ELECTED; - wp->n_votes++; /* Are we already elected? */ if (wp->state == WPS_CAMPAIGN) { @@ -1106,7 +1106,7 @@ VotesCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInf { uint32 n_votes = 0; - for (uint32 i = 0; i < wp->mconf.members.len; i++) + for (uint32 i = 0; i < mset->len; i++) { Safekeeper *sk = msk[i]; diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 648b0015ad..83ef72d3d7 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -845,9 +845,6 @@ typedef struct WalProposer /* timeline globally starts at this LSN */ XLogRecPtr timelineStartLsn; - /* number of votes collected from safekeepers */ - int n_votes; - /* number of successful connections over the lifetime of walproposer */ int n_connected; diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index a061639815..17582405db 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -63,7 +63,7 @@ char *wal_acceptors_list = ""; int wal_acceptor_reconnect_timeout = 1000; int wal_acceptor_connection_timeout = 10000; -int safekeeper_proto_version = 2; +int safekeeper_proto_version = 3; /* Set to true in the walproposer bgw.
*/ static bool am_walproposer; @@ -228,7 +228,7 @@ nwp_register_gucs(void) "Version of compute <-> safekeeper protocol.", "Used while migrating from 2 to 3.", &safekeeper_proto_version, - 2, 0, INT_MAX, + 3, 0, INT_MAX, PGC_POSTMASTER, 0, NULL, NULL, NULL); diff --git a/proxy/README.md b/proxy/README.md index 1156bfd352..583db36f28 100644 --- a/proxy/README.md +++ b/proxy/README.md @@ -32,7 +32,7 @@ To play with it locally one may start proxy over a local postgres installation (see end of this page on how to generate certs with openssl): ``` -./target/debug/proxy -c server.crt -k server.key --auth-backend=postgres --auth-endpoint=postgres://stas@127.0.0.1:5432/stas --wss 0.0.0.0:4444 +LOGFMT=text ./target/debug/proxy -c server.crt -k server.key --auth-backend=postgres --auth-endpoint=postgres://stas@127.0.0.1:5432/stas --wss 0.0.0.0:4444 ``` If both postgres and proxy are running you may send a SQL query: @@ -130,7 +130,7 @@ openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key Then we need to build proxy with 'testing' feature and run, e.g.: ```sh -RUST_LOG=proxy cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' -c server.crt -k server.key +RUST_LOG=proxy LOGFMT=text cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' -c server.crt -k server.key ``` Now from client you can start a new session: diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 44a6a42665..a48f67199a 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -409,14 +409,22 @@ impl JwkCacheEntryLock { if let Some(exp) = payload.expiration { if now >= exp + CLOCK_SKEW_LEEWAY { - return Err(JwtError::InvalidClaims(JwtClaimsError::JwtTokenHasExpired)); + return Err(JwtError::InvalidClaims(JwtClaimsError::JwtTokenHasExpired( + exp.duration_since(SystemTime::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(), + ))); } } if let Some(nbf) = payload.not_before { if nbf >= now + CLOCK_SKEW_LEEWAY { return Err(JwtError::InvalidClaims( - JwtClaimsError::JwtTokenNotYetReadyToUse, + JwtClaimsError::JwtTokenNotYetReadyToUse( + nbf.duration_since(SystemTime::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(), + ), )); } } @@ -534,10 +542,10 @@ struct JwtPayload<'a> { #[serde(rename = "aud", default)] audience: OneOrMany, /// Expiration - Time after which the JWT expires - #[serde(deserialize_with = "numeric_date_opt", rename = "exp", default)] + #[serde(rename = "exp", deserialize_with = "numeric_date_opt", default)] expiration: Option, - /// Not before - Time after which the JWT expires - #[serde(deserialize_with = "numeric_date_opt", rename = "nbf", default)] + /// Not before - Time before which the JWT is not valid + #[serde(rename = "nbf", deserialize_with = "numeric_date_opt", default)] not_before: Option, // the following entries are only extracted for the sake of debug logging. @@ -609,8 +617,15 @@ impl<'de> Deserialize<'de> for OneOrMany { } fn numeric_date_opt<'de, D: Deserializer<'de>>(d: D) -> Result, D::Error> { - let d = >::deserialize(d)?; - Ok(d.map(|n| SystemTime::UNIX_EPOCH + Duration::from_secs(n))) + >::deserialize(d)? 
+ .map(|t| { + SystemTime::UNIX_EPOCH + .checked_add(Duration::from_secs(t)) + .ok_or_else(|| { + serde::de::Error::custom(format_args!("timestamp out of bounds: {t}")) + }) + }) + .transpose() } struct JwkRenewalPermit<'a> { @@ -746,11 +761,11 @@ pub enum JwtClaimsError { #[error("invalid JWT token audience")] InvalidJwtTokenAudience, - #[error("JWT token has expired")] - JwtTokenHasExpired, + #[error("JWT token has expired (exp={0})")] + JwtTokenHasExpired(u64), - #[error("JWT token is not yet ready to use")] - JwtTokenNotYetReadyToUse, + #[error("JWT token is not yet ready to use (nbf={0})")] + JwtTokenNotYetReadyToUse(u64), } #[allow(dead_code, reason = "Debug use only")] @@ -1233,14 +1248,14 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL "nbf": now + 60, "aud": "neon", }}, - error: JwtClaimsError::JwtTokenNotYetReadyToUse, + error: JwtClaimsError::JwtTokenNotYetReadyToUse(now + 60), }, Test { body: json! {{ "exp": now - 60, "aud": ["neon"], }}, - error: JwtClaimsError::JwtTokenHasExpired, + error: JwtClaimsError::JwtTokenHasExpired(now - 60), }, Test { body: json! {{ diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index c55af325e3..526d0df7f2 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -12,9 +12,9 @@ use tracing::{debug, warn}; use crate::auth::password_hack::parse_endpoint_param; use crate::context::RequestContext; use crate::error::{ReportableError, UserFacingError}; -use crate::metrics::{Metrics, SniKind}; +use crate::metrics::{Metrics, SniGroup, SniKind}; use crate::proxy::NeonOptions; -use crate::serverless::SERVERLESS_DRIVER_SNI; +use crate::serverless::{AUTH_BROKER_SNI, SERVERLESS_DRIVER_SNI}; use crate::types::{EndpointId, RoleName}; #[derive(Debug, Error, PartialEq, Eq, Clone)] @@ -32,12 +32,6 @@ pub(crate) enum ComputeUserInfoParseError { option: EndpointId, }, - #[error( - "Common name inferred from SNI ('{}') is not known", - .cn, - )] - UnknownCommonName { cn: String }, - #[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")] MalformedProjectName(EndpointId), } @@ -66,22 +60,15 @@ impl ComputeUserInfoMaybeEndpoint { } } -pub(crate) fn endpoint_sni( - sni: &str, - common_names: &HashSet, -) -> Result, ComputeUserInfoParseError> { - let Some((subdomain, common_name)) = sni.split_once('.') else { - return Err(ComputeUserInfoParseError::UnknownCommonName { cn: sni.into() }); - }; +pub(crate) fn endpoint_sni(sni: &str, common_names: &HashSet) -> Option { + let (subdomain, common_name) = sni.split_once('.')?; if !common_names.contains(common_name) { - return Err(ComputeUserInfoParseError::UnknownCommonName { - cn: common_name.into(), - }); + return None; } - if subdomain == SERVERLESS_DRIVER_SNI { - return Ok(None); + if subdomain == SERVERLESS_DRIVER_SNI || subdomain == AUTH_BROKER_SNI { + return None; } - Ok(Some(EndpointId::from(subdomain))) + Some(EndpointId::from(subdomain)) } impl ComputeUserInfoMaybeEndpoint { @@ -113,15 +100,8 @@ impl ComputeUserInfoMaybeEndpoint { }) .map(|name| name.into()); - let endpoint_from_domain = if let Some(sni_str) = sni { - if let Some(cn) = common_names { - endpoint_sni(sni_str, cn)? - } else { - None - } - } else { - None - }; + let endpoint_from_domain = + sni.and_then(|sni_str| common_names.and_then(|cn| endpoint_sni(sni_str, cn))); let endpoint = match (endpoint_option, endpoint_from_domain) { // Invariant: if we have both project name variants, they should match. 
@@ -148,22 +128,23 @@ impl ComputeUserInfoMaybeEndpoint { let metrics = Metrics::get(); debug!(%user, "credentials"); - if sni.is_some() { + + let protocol = ctx.protocol(); + let kind = if sni.is_some() { debug!("Connection with sni"); - metrics.proxy.accepted_connections_by_sni.inc(SniKind::Sni); + SniKind::Sni } else if endpoint.is_some() { - metrics - .proxy - .accepted_connections_by_sni - .inc(SniKind::NoSni); debug!("Connection without sni"); + SniKind::NoSni } else { - metrics - .proxy - .accepted_connections_by_sni - .inc(SniKind::PasswordHack); debug!("Connection with password hack"); - } + SniKind::PasswordHack + }; + + metrics + .proxy + .accepted_connections_by_sni + .inc(SniGroup { protocol, kind }); let options = NeonOptions::parse_params(params); @@ -424,21 +405,34 @@ mod tests { } #[test] - fn parse_inconsistent_sni() { + fn parse_unknown_sni() { let options = StartupMessageParams::new([("user", "john_doe")]); let sni = Some("project.localhost"); let common_names = Some(["example.com".into()].into()); let ctx = RequestContext::test(); - let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) - .expect_err("should fail"); - match err { - UnknownCommonName { cn } => { - assert_eq!(cn, "localhost"); - } - _ => panic!("bad error: {err:?}"), - } + let info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) + .unwrap(); + + assert!(info.endpoint_id.is_none()); + } + + #[test] + fn parse_unknown_sni_with_options() { + let options = StartupMessageParams::new([ + ("user", "john_doe"), + ("options", "endpoint=foo-bar-baz-1234"), + ]); + + let sni = Some("project.localhost"); + let common_names = Some(["example.com".into()].into()); + + let ctx = RequestContext::test(); + let info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) + .unwrap(); + + assert_eq!(info.endpoint_id.as_deref(), Some("foo-bar-baz-1234")); } #[test] diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs index aef5c9383e..19be058ac3 100644 --- a/proxy/src/binary/pg_sni_router.rs +++ b/proxy/src/binary/pg_sni_router.rs @@ -7,13 +7,14 @@ use std::{net::SocketAddr, sync::Arc}; use anyhow::{Context, anyhow, bail, ensure}; use clap::Arg; -use futures::TryFutureExt; use futures::future::Either; +use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use rustls::crypto::ring; -use rustls::pki_types::PrivateKeyDer; -use tokio::io::{AsyncRead, AsyncWrite}; +use rustls::pki_types::{DnsName, PrivateKeyDer}; +use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; use tokio::net::TcpListener; +use tokio_rustls::TlsConnector; use tokio_util::sync::CancellationToken; use tracing::{Instrument, error, info}; use utils::project_git_version; @@ -38,6 +39,12 @@ fn cli() -> clap::Command { .help("listen for incoming client connections on ip:port") .default_value("127.0.0.1:4432"), ) + .arg( + Arg::new("listen-tls") + .long("listen-tls") + .help("listen for incoming client connections on ip:port, requiring TLS to compute") + .default_value("127.0.0.1:4433"), + ) .arg( Arg::new("tls-key") .short('k') @@ -122,31 +129,58 @@ pub async fn run() -> anyhow::Result<()> { _ => bail!("tls-key and tls-cert must be specified"), }; + let compute_tls_config = + Arc::new(crate::tls::client_config::compute_client_config_with_root_certs()?); + // Start listening for incoming client connections let proxy_address: SocketAddr = args .get_one::("listen") - .expect("string argument defined") + 
.expect("listen argument defined") .parse()?; + let proxy_address_compute_tls: SocketAddr = args + .get_one::("listen-tls") + .expect("listen-tls argument defined") + .parse()?; + info!("Starting sni router on {proxy_address}"); + info!("Starting sni router on {proxy_address_compute_tls}"); let proxy_listener = TcpListener::bind(proxy_address).await?; + let proxy_listener_compute_tls = TcpListener::bind(proxy_address_compute_tls).await?; let cancellation_token = CancellationToken::new(); + let dest = Arc::new(destination); let main = tokio::spawn(task_main( - Arc::new(destination), - tls_config, + dest.clone(), + tls_config.clone(), + None, tls_server_end_point, proxy_listener, cancellation_token.clone(), - )); + )) + .map(crate::error::flatten_err); + + let main_tls = tokio::spawn(task_main( + dest, + tls_config, + Some(compute_tls_config), + tls_server_end_point, + proxy_listener_compute_tls, + cancellation_token.clone(), + )) + .map(crate::error::flatten_err); let signals_task = tokio::spawn(crate::signals::handle(cancellation_token, || {})); // the signal task cant ever succeed. // the main task can error, or can succeed on cancellation. // we want to immediately exit on either of these cases + let main = futures::future::try_join(main, main_tls); let signal = match futures::future::select(signals_task, main).await { Either::Left((res, _)) => crate::error::flatten_err(res)?, - Either::Right((res, _)) => return crate::error::flatten_err(res), + Either::Right((res, _)) => { + res?; + return Ok(()); + } }; // maintenance tasks return `Infallible` success values, this is an impossible value @@ -157,6 +191,7 @@ pub async fn run() -> anyhow::Result<()> { async fn task_main( dest_suffix: Arc, tls_config: Arc, + compute_tls_config: Option>, tls_server_end_point: TlsServerEndPoint, listener: tokio::net::TcpListener, cancellation_token: CancellationToken, @@ -175,6 +210,7 @@ async fn task_main( let session_id = uuid::Uuid::new_v4(); let tls_config = Arc::clone(&tls_config); let dest_suffix = Arc::clone(&dest_suffix); + let compute_tls_config = compute_tls_config.clone(); connections.spawn( async move { @@ -192,7 +228,15 @@ async fn task_main( crate::metrics::Protocol::SniRouter, "sni", ); - handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await + handle_client( + ctx, + dest_suffix, + tls_config, + compute_tls_config, + tls_server_end_point, + socket, + ) + .await } .unwrap_or_else(|e| { // Acknowledge that the task has finished with an error. @@ -268,6 +312,7 @@ async fn handle_client( ctx: RequestContext, dest_suffix: Arc, tls_config: Arc, + compute_tls_config: Option>, tls_server_end_point: TlsServerEndPoint, stream: impl AsyncRead + AsyncWrite + Unpin, ) -> anyhow::Result<()> { @@ -288,7 +333,33 @@ async fn handle_client( info!("destination: {}", destination); - let mut client = tokio::net::TcpStream::connect(destination).await?; + let mut client = tokio::net::TcpStream::connect(&destination).await?; + + let client = if let Some(compute_tls_config) = compute_tls_config { + info!("upgrading TLS"); + + // send SslRequest + client + .write_all(b"\x00\x00\x00\x08\x04\xd2\x16\x2f") + .await?; + + // wait for S/N respons + let mut resp = b'N'; + client.read_exact(std::slice::from_mut(&mut resp)).await?; + + // error if not S + ensure!(resp == b'S', "compute refused TLS"); + + // upgrade to TLS. 
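The eight hard-coded bytes written to the compute above are the standard PostgreSQL SSLRequest startup packet: a big-endian int32 length of 8 followed by the 4-byte request code 80877103 (0x04D2162F), to which the server replies with a single 'S' (TLS accepted) or 'N'. A small sketch deriving those bytes, for illustration only; the router itself just writes the literal:

```c
#include <arpa/inet.h>   /* htonl */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* SSLRequest: int32 length = 8, int32 code = (1234 << 16) | 5679 = 80877103 */
	uint32_t words[2] = { htonl(8), htonl(80877103) };
	unsigned char buf[8];
	memcpy(buf, words, sizeof(buf));

	for (int i = 0; i < 8; i++)
		printf("\\x%02x", buf[i]);
	printf("\n");            /* prints \x00\x00\x00\x08\x04\xd2\x16\x2f */
	return 0;
}
```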
+ let domain = DnsName::try_from(destination)?; + let domain = rustls::pki_types::ServerName::DnsName(domain); + let client = TlsConnector::from(compute_tls_config) + .connect(domain, client) + .await?; + Connection::Tls(client) + } else { + Connection::Raw(client) + }; // doesn't yet matter as pg-sni-router doesn't report analytics logs ctx.set_success(); @@ -297,9 +368,19 @@ async fn handle_client( // Starting from here we only proxy the client's traffic. info!("performing the proxy pass..."); - match copy_bidirectional_client_compute(&mut tls_stream, &mut client).await { + let res = match client { + Connection::Raw(mut c) => copy_bidirectional_client_compute(&mut tls_stream, &mut c).await, + Connection::Tls(mut c) => copy_bidirectional_client_compute(&mut tls_stream, &mut c).await, + }; + + match res { Ok(_) => Ok(()), Err(ErrorSource::Client(err)) => Err(err).context("client"), Err(ErrorSource::Compute(err)) => Err(err).context("compute"), } } + +enum Connection { + Raw(tokio::net::TcpStream), + Tls(tokio_rustls::client::TlsStream), +} diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index b83b03bc4f..efa3c0b514 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -132,11 +132,10 @@ impl Drop for LoggingGuard { } } -// TODO: make JSON the default #[derive(Copy, Clone, PartialEq, Eq, Default, Debug)] enum LogFormat { + Text, #[default] - Text = 1, Json, } diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index e5fc0b724b..4b22c912eb 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -115,8 +115,8 @@ pub struct ProxyMetrics { #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] pub allowed_vpc_endpoint_ids: Histogram<10>, - /// Number of connections (per sni). - pub accepted_connections_by_sni: CounterVec>, + /// Number of connections, by the method we used to determine the endpoint. + pub accepted_connections_by_sni: CounterVec, /// Number of connection failures (per kind). pub connection_failures_total: CounterVec>, @@ -342,11 +342,20 @@ pub enum LatencyExclusions { ClientCplaneComputeRetry, } +#[derive(LabelGroup)] +#[label(set = SniSet)] +pub struct SniGroup { + pub protocol: Protocol, + pub kind: SniKind, +} + #[derive(FixedCardinalityLabel, Copy, Clone)] -#[label(singleton = "kind")] pub enum SniKind { + /// Domain name based routing. SNI for libpq/websockets. Host for HTTP Sni, + /// Metadata based routing. `options` for libpq/websockets. Header for HTTP NoSni, + /// Metadata based routing, using the password field. PasswordHack, } diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index c05031ad97..54c02f2c15 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -24,9 +24,6 @@ pub(crate) enum HandshakeError { #[error("protocol violation")] ProtocolViolation, - #[error("missing certificate")] - MissingCertificate, - #[error("{0}")] StreamUpgradeError(#[from] StreamUpgradeError), @@ -42,10 +39,6 @@ impl ReportableError for HandshakeError { match self { HandshakeError::EarlyData => crate::error::ErrorKind::User, HandshakeError::ProtocolViolation => crate::error::ErrorKind::User, - // This error should not happen, but will if we have no default certificate and - // the client sends no SNI extension. - // If they provide SNI then we can be sure there is a certificate that matches. 
- HandshakeError::MissingCertificate => crate::error::ErrorKind::Service, HandshakeError::StreamUpgradeError(upgrade) => match upgrade { StreamUpgradeError::AlreadyTls => crate::error::ErrorKind::Service, StreamUpgradeError::Io(_) => crate::error::ErrorKind::ClientDisconnect, @@ -146,7 +139,7 @@ pub(crate) async fn handshake( // try parse endpoint let ep = conn_info .server_name() - .and_then(|sni| endpoint_sni(sni, &tls.common_names).ok().flatten()); + .and_then(|sni| endpoint_sni(sni, &tls.common_names)); if let Some(ep) = ep { ctx.set_endpoint_id(ep); } @@ -161,10 +154,8 @@ pub(crate) async fn handshake( } } - let (_, tls_server_end_point) = tls - .cert_resolver - .resolve(conn_info.server_name()) - .ok_or(HandshakeError::MissingCertificate)?; + let (_, tls_server_end_point) = + tls.cert_resolver.resolve(conn_info.server_name()); stream = PqStream { framed: Framed { diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 9a6864c33e..f47636cd71 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -98,8 +98,7 @@ fn generate_tls_config<'a>( .with_no_client_auth() .with_single_cert(vec![cert.clone()], key.clone_key())?; - let mut cert_resolver = CertResolver::new(); - cert_resolver.add_cert(key, vec![cert], true)?; + let cert_resolver = CertResolver::new(key, vec![cert])?; let common_names = cert_resolver.get_common_names(); diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 1d9b35f41d..bb5637cd5f 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -41,7 +41,7 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; pub(crate) const EXT_NAME: &str = "pg_session_jwt"; -pub(crate) const EXT_VERSION: &str = "0.3.0"; +pub(crate) const EXT_VERSION: &str = "0.3.1"; pub(crate) const EXT_SCHEMA: &str = "auth"; #[derive(Clone)] diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 6f24ad3dec..2a7069b1c2 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -56,6 +56,7 @@ use crate::serverless::backend::PoolingBackend; use crate::serverless::http_util::{api_error_into_response, json_response}; pub(crate) const SERVERLESS_DRIVER_SNI: &str = "api"; +pub(crate) const AUTH_BROKER_SNI: &str = "apiauth"; pub async fn task_main( config: &'static ProxyConfig, diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 7fb39553f9..dfaeedaeae 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -38,7 +38,7 @@ use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig}; use crate::context::RequestContext; use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::http::{ReadBodyError, read_body_with_limit}; -use crate::metrics::{HttpDirection, Metrics}; +use crate::metrics::{HttpDirection, Metrics, SniGroup, SniKind}; use crate::proxy::{NeonOptions, run_until_cancelled}; use crate::serverless::backend::HttpConnError; use crate::types::{DbName, RoleName}; @@ -199,8 +199,7 @@ fn get_conn_info( let endpoint = match connection_url.host() { Some(url::Host::Domain(hostname)) => { if let Some(tls) = tls { - endpoint_sni(hostname, &tls.common_names)? - .ok_or(ConnInfoError::MalformedEndpoint)? + endpoint_sni(hostname, &tls.common_names).ok_or(ConnInfoError::MalformedEndpoint)? 
} else { hostname .split_once('.') @@ -228,6 +227,32 @@ fn get_conn_info( } } + // check the URL that was used, for metrics + { + let host_endpoint = headers + // get the host header + .get("host") + // extract the domain + .and_then(|h| { + let (host, _port) = h.to_str().ok()?.split_once(':')?; + Some(host) + }) + // get the endpoint prefix + .map(|h| h.split_once('.').map_or(h, |(prefix, _)| prefix)); + + let kind = if host_endpoint == Some(&*endpoint) { + SniKind::Sni + } else { + SniKind::NoSni + }; + + let protocol = ctx.protocol(); + Metrics::get() + .proxy + .accepted_connections_by_sni + .inc(SniGroup { protocol, kind }); + } + ctx.set_user_agent( headers .get(hyper::header::USER_AGENT) diff --git a/proxy/src/tls/server_config.rs b/proxy/src/tls/server_config.rs index 5a95e69fde..8f8917ef62 100644 --- a/proxy/src/tls/server_config.rs +++ b/proxy/src/tls/server_config.rs @@ -5,6 +5,7 @@ use anyhow::{Context, bail}; use itertools::Itertools; use rustls::crypto::ring::{self, sign}; use rustls::pki_types::{CertificateDer, PrivateKeyDer}; +use rustls::sign::CertifiedKey; use x509_cert::der::{Reader, SliceReader}; use super::{PG_ALPN_PROTOCOL, TlsServerEndPoint}; @@ -25,10 +26,8 @@ pub fn configure_tls( certs_dir: Option<&String>, allow_tls_keylogfile: bool, ) -> anyhow::Result { - let mut cert_resolver = CertResolver::new(); - // add default certificate - cert_resolver.add_cert_path(key_path, cert_path, true)?; + let mut cert_resolver = CertResolver::parse_new(key_path, cert_path)?; // add extra certificates if let Some(certs_dir) = certs_dir { @@ -40,11 +39,8 @@ pub fn configure_tls( let key_path = path.join("tls.key"); let cert_path = path.join("tls.crt"); if key_path.exists() && cert_path.exists() { - cert_resolver.add_cert_path( - &key_path.to_string_lossy(), - &cert_path.to_string_lossy(), - false, - )?; + cert_resolver + .add_cert_path(&key_path.to_string_lossy(), &cert_path.to_string_lossy())?; } } } @@ -83,92 +79,42 @@ pub fn configure_tls( }) } -#[derive(Default, Debug)] +#[derive(Debug)] pub struct CertResolver { certs: HashMap, TlsServerEndPoint)>, - default: Option<(Arc, TlsServerEndPoint)>, + default: (Arc, TlsServerEndPoint), } impl CertResolver { - pub fn new() -> Self { - Self::default() + fn parse_new(key_path: &str, cert_path: &str) -> anyhow::Result { + let (priv_key, cert_chain) = parse_key_cert(key_path, cert_path)?; + Self::new(priv_key, cert_chain) } - fn add_cert_path( - &mut self, - key_path: &str, - cert_path: &str, - is_default: bool, - ) -> anyhow::Result<()> { - let priv_key = { - let key_bytes = std::fs::read(key_path) - .with_context(|| format!("Failed to read TLS keys at '{key_path}'"))?; - rustls_pemfile::private_key(&mut &key_bytes[..]) - .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? - .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? - }; + pub fn new( + priv_key: PrivateKeyDer<'static>, + cert_chain: Vec>, + ) -> anyhow::Result { + let (common_name, cert, tls_server_end_point) = process_key_cert(priv_key, cert_chain)?; - let cert_chain_bytes = std::fs::read(cert_path) - .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; - - let cert_chain = { - rustls_pemfile::certs(&mut &cert_chain_bytes[..]) - .try_collect() - .with_context(|| { - format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") - })? 
- }; - - self.add_cert(priv_key, cert_chain, is_default) + let mut certs = HashMap::new(); + let default = (cert.clone(), tls_server_end_point); + certs.insert(common_name, (cert, tls_server_end_point)); + Ok(Self { certs, default }) } - pub fn add_cert( + fn add_cert_path(&mut self, key_path: &str, cert_path: &str) -> anyhow::Result<()> { + let (priv_key, cert_chain) = parse_key_cert(key_path, cert_path)?; + self.add_cert(priv_key, cert_chain) + } + + fn add_cert( &mut self, priv_key: PrivateKeyDer<'static>, cert_chain: Vec>, - is_default: bool, ) -> anyhow::Result<()> { - let key = sign::any_supported_type(&priv_key).context("invalid private key")?; - - let first_cert = &cert_chain[0]; - let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - - let certificate = SliceReader::new(first_cert) - .context("Failed to parse cerficiate")? - .decode::() - .context("Failed to parse cerficiate")?; - - let common_name = certificate.tbs_certificate.subject.to_string(); - - // We need to get the canonical name for this certificate so we can match them against any domain names - // seen within the proxy codebase. - // - // In scram-proxy we use wildcard certificates only, with the database endpoint as the wildcard subdomain, taken from SNI. - // We need to remove the wildcard prefix for the purposes of certificate selection. - // - // auth-broker does not use SNI and instead uses the Neon-Connection-String header. - // Auth broker has the subdomain `apiauth` we need to remove for the purposes of validating the Neon-Connection-String. - // - // Console Redirect proxy does not use any wildcard domains and does not need any certificate selection or conn string - // validation, so let's we can continue with any common-name - let common_name = if let Some(s) = common_name.strip_prefix("CN=*.") { - s.to_string() - } else if let Some(s) = common_name.strip_prefix("CN=apiauth.") { - s.to_string() - } else if let Some(s) = common_name.strip_prefix("CN=") { - s.to_string() - } else { - bail!("Failed to parse common name from certificate") - }; - - let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key)); - - if is_default { - self.default = Some((cert.clone(), tls_server_end_point)); - } - + let (common_name, cert, tls_server_end_point) = process_key_cert(priv_key, cert_chain)?; self.certs.insert(common_name, (cert, tls_server_end_point)); - Ok(()) } @@ -177,12 +123,82 @@ impl CertResolver { } } +fn parse_key_cert( + key_path: &str, + cert_path: &str, +) -> anyhow::Result<(PrivateKeyDer<'static>, Vec>)> { + let priv_key = { + let key_bytes = std::fs::read(key_path) + .with_context(|| format!("Failed to read TLS keys at '{key_path}'"))?; + rustls_pemfile::private_key(&mut &key_bytes[..]) + .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? + .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? + }; + + let cert_chain_bytes = std::fs::read(cert_path) + .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; + + let cert_chain = { + rustls_pemfile::certs(&mut &cert_chain_bytes[..]) + .try_collect() + .with_context(|| { + format!( + "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." + ) + })? 
+ }; + + Ok((priv_key, cert_chain)) +} + +fn process_key_cert( + priv_key: PrivateKeyDer<'static>, + cert_chain: Vec>, +) -> anyhow::Result<(String, Arc, TlsServerEndPoint)> { + let key = sign::any_supported_type(&priv_key).context("invalid private key")?; + + let first_cert = &cert_chain[0]; + let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; + + let certificate = SliceReader::new(first_cert) + .context("Failed to parse cerficiate")? + .decode::() + .context("Failed to parse cerficiate")?; + + let common_name = certificate.tbs_certificate.subject.to_string(); + + // We need to get the canonical name for this certificate so we can match them against any domain names + // seen within the proxy codebase. + // + // In scram-proxy we use wildcard certificates only, with the database endpoint as the wildcard subdomain, taken from SNI. + // We need to remove the wildcard prefix for the purposes of certificate selection. + // + // auth-broker does not use SNI and instead uses the Neon-Connection-String header. + // Auth broker has the subdomain `apiauth` we need to remove for the purposes of validating the Neon-Connection-String. + // + // Console Redirect proxy does not use any wildcard domains and does not need any certificate selection or conn string + // validation, so let's we can continue with any common-name + let common_name = if let Some(s) = common_name.strip_prefix("CN=*.") { + s.to_string() + } else if let Some(s) = common_name.strip_prefix("CN=apiauth.") { + s.to_string() + } else if let Some(s) = common_name.strip_prefix("CN=") { + s.to_string() + } else { + bail!("Failed to parse common name from certificate") + }; + + let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key)); + + Ok((common_name, cert, tls_server_end_point)) +} + impl rustls::server::ResolvesServerCert for CertResolver { fn resolve( &self, client_hello: rustls::server::ClientHello<'_>, ) -> Option> { - self.resolve(client_hello.server_name()).map(|x| x.0) + Some(self.resolve(client_hello.server_name()).0) } } @@ -190,7 +206,7 @@ impl CertResolver { pub fn resolve( &self, server_name: Option<&str>, - ) -> Option<(Arc, TlsServerEndPoint)> { + ) -> (Arc, TlsServerEndPoint) { // loop here and cut off more and more subdomains until we find // a match to get a proper wildcard support. OTOH, we now do not // use nested domains, so keep this simple for now. @@ -200,12 +216,17 @@ impl CertResolver { if let Some(mut sni_name) = server_name { loop { if let Some(cert) = self.certs.get(sni_name) { - return Some(cert.clone()); + return cert.clone(); } if let Some((_, rest)) = sni_name.split_once('.') { sni_name = rest; } else { - return None; + // The customer has some custom DNS mapping - just return + // a default certificate. + // + // This will error if the customer uses anything stronger + // than sslmode=require. That's a choice they can make. 
+ return self.default.clone(); } } } else { diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index 5849df0343..b364ac8e48 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -121,6 +121,20 @@ impl Client { resp.json().await.map_err(Error::ReceiveBody) } + pub async fn switch_timeline_membership( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: &models::TimelineMembershipSwitchRequest, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/membership", + self.mgmt_api_endpoint, tenant_id, timeline_id + ); + let resp = self.put(&uri, req).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + pub async fn delete_tenant(&self, tenant_id: TenantId) -> Result { let uri = format!("{}/v1/tenant/{}", self.mgmt_api_endpoint, tenant_id); let resp = self diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index dd71420efb..c267a55cb6 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -1,7 +1,6 @@ // // Main entry point for the safekeeper executable // -use std::env::{VarError, var}; use std::fs::{self, File}; use std::io::{ErrorKind, Write}; use std::str::FromStr; @@ -354,29 +353,13 @@ async fn main() -> anyhow::Result<()> { }; // Load JWT auth token to connect to other safekeepers for pull_timeline. - // First check if the env var is present, then check the arg with the path. - // We want to deprecate and remove the env var method in the future. - let sk_auth_token = match var("SAFEKEEPER_AUTH_TOKEN") { - Ok(v) => { - info!("loaded JWT token for authentication with safekeepers"); - Some(SecretString::from(v)) - } - Err(VarError::NotPresent) => { - if let Some(auth_token_path) = args.auth_token_path.as_ref() { - info!( - "loading JWT token for authentication with safekeepers from {auth_token_path}" - ); - let auth_token = tokio::fs::read_to_string(auth_token_path).await?; - Some(SecretString::from(auth_token.trim().to_owned())) - } else { - info!("no JWT token for authentication with safekeepers detected"); - None - } - } - Err(_) => { - warn!("JWT token for authentication with safekeepers is not unicode"); - None - } + let sk_auth_token = if let Some(auth_token_path) = args.auth_token_path.as_ref() { + info!("loading JWT token for authentication with safekeepers from {auth_token_path}"); + let auth_token = tokio::fs::read_to_string(auth_token_path).await?; + Some(SecretString::from(auth_token.trim().to_owned())) + } else { + info!("no JWT token for authentication with safekeepers detected"); + None }; let ssl_ca_certs = match args.ssl_ca_file.as_ref() { diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 2b2d721db2..1a25b07496 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -243,8 +243,7 @@ async fn timeline_pull_handler(mut request: Request) -> Result, ssl_ca_certs: Vec, global_timelines: Arc, -) -> Result { +) -> Result { let existing_tli = global_timelines.get(TenantTimelineId::new( request.tenant_id, request.timeline_id, )); if existing_tli.is_ok() { - bail!("Timeline {} already exists", request.timeline_id); + info!("Timeline {} already exists", request.timeline_id); + return Ok(PullTimelineResponse { + safekeeper_host: None, + }); } let mut http_client = reqwest::Client::builder(); for ssl_ca_cert in ssl_ca_certs { http_client = http_client.add_root_certificate(ssl_ca_cert); } - let http_client = http_client.build()?; + let http_client = http_client + .build() 
+        .map_err(|e| ApiError::InternalServerError(e.into()))?;

     let http_hosts = request.http_hosts.clone();
@@ -425,8 +431,25 @@ pub async fn handle_request(
     let mut statuses = Vec::new();
     for (i, response) in responses.into_iter().enumerate() {
-        let status = response.context(format!("fetching status from {}", http_hosts[i]))?;
-        statuses.push((status, i));
+        match response {
+            Ok(status) => {
+                statuses.push((status, i));
+            }
+            Err(e) => {
+                info!("error fetching status from {}: {e}", http_hosts[i]);
+            }
+        }
+    }
+
+    // Allow missing responses from up to one safekeeper (say due to downtime)
+    // e.g. if we created a timeline on PS A and B, with C being offline. Then B goes
+    // offline and C comes online. Then we want a pull on C with A and B as hosts to work.
+    let min_required_successful = (http_hosts.len() - 1).max(1);
+    if statuses.len() < min_required_successful {
+        return Err(ApiError::InternalServerError(anyhow::anyhow!(
+            "only got {} successful status responses. required: {min_required_successful}",
+            statuses.len()
+        )));
     }

     // Find the most advanced safekeeper
@@ -445,7 +468,7 @@ pub async fn handle_request(
     assert!(status.tenant_id == request.tenant_id);
     assert!(status.timeline_id == request.timeline_id);

-    pull_timeline(
+    match pull_timeline(
         status,
         safekeeper_host,
         sk_auth_token,
@@ -453,6 +476,21 @@ pub async fn handle_request(
         global_timelines,
     )
     .await
+    {
+        Ok(resp) => Ok(resp),
+        Err(e) => {
+            match e.downcast_ref::<TimelineError>() {
+                Some(TimelineError::AlreadyExists(_)) => Ok(PullTimelineResponse {
+                    safekeeper_host: None,
+                }),
+                Some(TimelineError::CreationInProgress(_)) => {
+                    // We don't return success here because creation might still fail.
+                    Err(ApiError::Conflict("Creation in progress".to_owned()))
+                }
+                _ => Err(ApiError::InternalServerError(e)),
+            }
+        }
+    }
 }

 async fn pull_timeline(
@@ -536,6 +574,6 @@ async fn pull_timeline(
     .await?;

     Ok(PullTimelineResponse {
-        safekeeper_host: host,
+        safekeeper_host: Some(host),
     })
 }
diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs
index 9975153f6c..eb8eee6ab8 100644
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -32,7 +32,7 @@ use crate::metrics::{
     WAL_RECEIVERS,
 };
 use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage};
-use crate::timeline::WalResidentTimeline;
+use crate::timeline::{TimelineError, WalResidentTimeline};

 const DEFAULT_FEEDBACK_CAPACITY: usize = 8;

@@ -357,9 +357,14 @@ impl NetworkReader<'_, IO> {
                 .await
                 .context("create timeline")?
         } else {
-            self.global_timelines
-                .get(self.ttid)
-                .context("get timeline")?
+            let timeline_res = self.global_timelines.get(self.ttid);
+            match timeline_res {
+                Ok(tl) => tl,
+                Err(TimelineError::NotFound(_)) => {
+                    return Err(CopyStreamHandlerEnd::TimelineNoCreate);
+                }
+                other => other.context("get_timeline")?,
+            }
         };
         tli.wal_residence_guard().await?
} diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 71dde9e126..2eea2f9d10 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -19,7 +19,8 @@ use storage_controller::service::chaos_injector::ChaosInjector; use storage_controller::service::{ Config, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, - PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, Service, + PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, + SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT, Service, }; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; @@ -132,6 +133,10 @@ struct Cli { #[arg(long)] priority_reconciler_concurrency: Option, + /// Maximum number of safekeeper reconciliations that may run in parallel (per safekeeper) + #[arg(long)] + safekeeper_reconciler_concurrency: Option, + /// Tenant API rate limit, as requests per second per tenant. #[arg(long, default_value = "10")] tenant_rate_limit: NonZeroU32, @@ -403,6 +408,9 @@ async fn async_main() -> anyhow::Result<()> { priority_reconciler_concurrency: args .priority_reconciler_concurrency .unwrap_or(PRIORITY_RECONCILER_CONCURRENCY_DEFAULT), + safekeeper_reconciler_concurrency: args + .safekeeper_reconciler_concurrency + .unwrap_or(SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT), tenant_rate_limit: args.tenant_rate_limit, split_threshold: args.split_threshold, max_split_shards: args.max_split_shards, diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs index 988159af4a..1f3ea96d96 100644 --- a/storage_controller/src/safekeeper_client.rs +++ b/storage_controller/src/safekeeper_client.rs @@ -98,6 +98,23 @@ impl SafekeeperClient { ) } + #[allow(unused)] + pub(crate) async fn switch_timeline_membership( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: &models::TimelineMembershipSwitchRequest, + ) -> Result { + measured_request!( + "switch_timeline_membership", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner + .switch_timeline_membership(tenant_id, timeline_id, req) + .await + ) + } + pub(crate) async fn delete_tenant( &self, tenant_id: TenantId, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index ca9b911c4d..fdb791c2cf 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -194,6 +194,7 @@ pub(crate) enum LeadershipStatus { pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256; +pub const SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT: usize = 32; // Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately. // This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly @@ -382,6 +383,9 @@ pub struct Config { /// How many high-priority Reconcilers may be spawned concurrently pub priority_reconciler_concurrency: usize, + /// How many safekeeper reconciles may happen concurrently (per safekeeper) + pub safekeeper_reconciler_concurrency: usize, + /// How many API requests per second to allow per tenant, across all /// tenant-scoped API endpoints. Further API requests queue until ready. 
pub tenant_rate_limit: NonZeroU32, @@ -3659,7 +3663,7 @@ impl Service { locations: ShardMutationLocations, http_client: reqwest::Client, jwt: Option, - create_req: TimelineCreateRequest, + mut create_req: TimelineCreateRequest, ) -> Result { let latest = locations.latest.node; @@ -3678,6 +3682,15 @@ impl Service { .await .map_err(|e| passthrough_api_error(&latest, e))?; + // If we are going to create the timeline on some stale locations for shard 0, then ask them to re-use + // the initdb generated by the latest location, rather than generating their own. This avoids racing uploads + // of initdb to S3 which might not be binary-identical if different pageservers have different postgres binaries. + if tenant_shard_id.is_shard_zero() { + if let models::TimelineCreateRequestMode::Bootstrap { existing_initdb_timeline_id, .. } = &mut create_req.mode { + *existing_initdb_timeline_id = Some(create_req.new_timeline_id); + } + } + // We propagate timeline creations to all attached locations such that a compute // for the new timeline is able to start regardless of the current state of the // tenant shard reconciliation. @@ -3720,6 +3733,10 @@ impl Service { // Because the caller might not provide an explicit LSN, we must do the creation first on a single shard, and then // use whatever LSN that shard picked when creating on subsequent shards. We arbitrarily use shard zero as the shard // that will get the first creation request, and propagate the LSN to all the >0 shards. + // + // This also enables non-zero shards to use the initdb that shard 0 generated and uploaded to S3, rather than + // independently generating their own initdb. This guarantees that shards cannot end up with different initial + // states if e.g. they have different postgres binary versions. let timeline_info = create_one( shard_zero_tid, shard_zero_locations, @@ -3729,11 +3746,16 @@ impl Service { ) .await?; - // Propagate the LSN that shard zero picked, if caller didn't provide one + // Update the create request for shards >= 0 match &mut create_req.mode { models::TimelineCreateRequestMode::Branch { ancestor_start_lsn, .. } if ancestor_start_lsn.is_none() => { + // Propagate the LSN that shard zero picked, if caller didn't provide one *ancestor_start_lsn = timeline_info.ancestor_lsn; }, + models::TimelineCreateRequestMode::Bootstrap { existing_initdb_timeline_id, .. } => { + // For shards >= 0, do not run initdb: use the one that shard 0 uploaded to S3 + *existing_initdb_timeline_id = Some(create_req.new_timeline_id) + } _ => {} } @@ -5159,7 +5181,8 @@ impl Service { } // We don't expect any new_shard_count shards to exist here, but drop them just in case - tenants.retain(|_id, s| s.shard.count != *new_shard_count); + tenants + .retain(|id, s| !(id.tenant_id == *tenant_id && s.shard.count == *new_shard_count)); detach_locations }; @@ -8462,7 +8485,7 @@ impl Service { // By default, live migrations are generous about the wait time for getting // the secondary location up to speed. When draining, give up earlier in order // to not stall the operation when a cold secondary is encountered. 
- const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); + const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(30); const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal) .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) @@ -8795,7 +8818,7 @@ impl Service { node_id: NodeId, cancel: CancellationToken, ) -> Result<(), OperationError> { - const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); + const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(30); const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal) .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) diff --git a/storage_controller/src/service/safekeeper_reconciler.rs b/storage_controller/src/service/safekeeper_reconciler.rs index b15772a36c..71c73a0112 100644 --- a/storage_controller/src/service/safekeeper_reconciler.rs +++ b/storage_controller/src/service/safekeeper_reconciler.rs @@ -3,7 +3,10 @@ use std::{collections::HashMap, str::FromStr, sync::Arc, time::Duration}; use clashmap::{ClashMap, Entry}; use safekeeper_api::models::PullTimelineRequest; use safekeeper_client::mgmt_api; -use tokio::sync::mpsc::{self, UnboundedReceiver, UnboundedSender}; +use tokio::sync::{ + Semaphore, + mpsc::{self, UnboundedReceiver, UnboundedSender}, +}; use tokio_util::sync::CancellationToken; use tracing::Instrument; use utils::{ @@ -206,18 +209,27 @@ impl ReconcilerHandle { } pub(crate) struct SafekeeperReconciler { - service: Arc, + inner: SafekeeperReconcilerInner, + concurrency_limiter: Arc, rx: UnboundedReceiver<(ScheduleRequest, CancellationToken)>, cancel: CancellationToken, } +/// Thin wrapper over `Service` to not clutter its inherent functions +#[derive(Clone)] +struct SafekeeperReconcilerInner { + service: Arc, +} + impl SafekeeperReconciler { fn spawn(cancel: CancellationToken, service: Arc) -> ReconcilerHandle { // We hold the ServiceInner lock so we don't want to make sending to the reconciler channel to be blocking. let (tx, rx) = mpsc::unbounded_channel(); + let concurrency = service.config.safekeeper_reconciler_concurrency; let mut reconciler = SafekeeperReconciler { - service, + inner: SafekeeperReconcilerInner { service }, rx, + concurrency_limiter: Arc::new(Semaphore::new(concurrency)), cancel: cancel.clone(), }; let handle = ReconcilerHandle { @@ -230,31 +242,44 @@ impl SafekeeperReconciler { } async fn run(&mut self) { loop { - // TODO add parallelism with semaphore here let req = tokio::select! { req = self.rx.recv() => req, _ = self.cancel.cancelled() => break, }; let Some((req, req_cancel)) = req else { break }; + + let permit_res = tokio::select! 
{ + req = self.concurrency_limiter.clone().acquire_owned() => req, + _ = self.cancel.cancelled() => break, + }; + let Ok(_permit) = permit_res else { return }; + + let inner = self.inner.clone(); if req_cancel.is_cancelled() { continue; } - let kind = req.kind; - let tenant_id = req.tenant_id; - let timeline_id = req.timeline_id; - let node_id = req.safekeeper.skp.id; - self.reconcile_one(req, req_cancel) - .instrument(tracing::info_span!( - "reconcile_one", - ?kind, - %tenant_id, - ?timeline_id, - %node_id, - )) - .await; + tokio::task::spawn(async move { + let kind = req.kind; + let tenant_id = req.tenant_id; + let timeline_id = req.timeline_id; + let node_id = req.safekeeper.skp.id; + inner + .reconcile_one(req, req_cancel) + .instrument(tracing::info_span!( + "reconcile_one", + ?kind, + %tenant_id, + ?timeline_id, + %node_id, + )) + .await; + }); } } +} + +impl SafekeeperReconcilerInner { async fn reconcile_one(&self, req: ScheduleRequest, req_cancel: CancellationToken) { let req_host = req.safekeeper.skp.host.clone(); match req.kind { @@ -281,10 +306,11 @@ impl SafekeeperReconciler { req, async |client| client.pull_timeline(&pull_req).await, |resp| { - tracing::info!( - "pulled timeline from {} onto {req_host}", - resp.safekeeper_host, - ); + if let Some(host) = resp.safekeeper_host { + tracing::info!("pulled timeline from {host} onto {req_host}"); + } else { + tracing::info!("timeline already present on safekeeper on {req_host}"); + } }, req_cancel, ) diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index f0ba632fd4..b151b612bf 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -165,16 +165,17 @@ pub(crate) async fn branch_cleanup_and_check_errors( .head_object(&path, &CancellationToken::new()) .await; - if response.is_err() { + if let Err(e) = response { // Object is not present. let is_l0 = LayerMap::is_l0(layer.key_range(), layer.is_delta()); let msg = format!( - "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})", + "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {}) with error: {}", layer, metadata.generation.get_suffix(), metadata.shard, is_l0, + e, ); if is_l0 || ignore_error { diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index f14341c7bc..e1a4095a3c 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -137,11 +137,10 @@ struct TenantRefAccumulator { impl TenantRefAccumulator { fn update(&mut self, ttid: TenantShardTimelineId, index_part: &IndexPart) { let this_shard_idx = ttid.tenant_shard_id.to_index(); - (*self - .shards_seen + self.shards_seen .entry(ttid.tenant_shard_id.tenant_id) - .or_default()) - .insert(this_shard_idx); + .or_default() + .insert(this_shard_idx); let mut ancestor_refs = Vec::new(); for (layer_name, layer_metadata) in &index_part.layer_metadata { @@ -767,10 +766,13 @@ pub async fn pageserver_physical_gc( stream_tenant_timelines(remote_client_ref, target_ref, tenant_shard_id).await?, ); Ok(try_stream! 
{ + let mut cnt = 0; while let Some(ttid_res) = timelines.next().await { let ttid = ttid_res?; + cnt += 1; yield (ttid, tenant_manifest_arc.clone()); } + tracing::info!(%tenant_shard_id, "Found {} timelines", cnt); }) } }); @@ -790,6 +792,7 @@ pub async fn pageserver_physical_gc( &accumulator, tenant_manifest_arc, ) + .instrument(info_span!("gc_timeline", %ttid)) }); let timelines = timelines.try_buffered(CONCURRENCY); let mut timelines = std::pin::pin!(timelines); diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index ba75f25984..77c7987aa7 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -153,7 +153,10 @@ pub async fn scan_pageserver_metadata( const CONCURRENCY: usize = 32; // Generate a stream of TenantTimelineId - let timelines = tenants.map_ok(|t| stream_tenant_timelines(&remote_client, &target, t)); + let timelines = tenants.map_ok(|t| { + tracing::info!("Found tenant: {}", t); + stream_tenant_timelines(&remote_client, &target, t) + }); let timelines = timelines.try_buffered(CONCURRENCY); let timelines = timelines.try_flatten(); diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs index 24231e32fc..d0ca53f8ab 100644 --- a/storage_scrubber/src/tenant_snapshot.rs +++ b/storage_scrubber/src/tenant_snapshot.rs @@ -24,7 +24,6 @@ pub struct SnapshotDownloader { remote_client: GenericRemoteStorage, #[allow(dead_code)] target: RootTarget, - bucket_config: BucketConfig, tenant_id: TenantId, output_path: Utf8PathBuf, concurrency: usize, @@ -43,7 +42,6 @@ impl SnapshotDownloader { Ok(Self { remote_client, target, - bucket_config, tenant_id, output_path, concurrency, @@ -218,11 +216,9 @@ impl SnapshotDownloader { } pub async fn download(&self) -> anyhow::Result<()> { - let (remote_client, target) = - init_remote(self.bucket_config.clone(), NodeKind::Pageserver).await?; - // Generate a stream of TenantShardId - let shards = stream_tenant_shards(&remote_client, &target, self.tenant_id).await?; + let shards = + stream_tenant_shards(&self.remote_client, &self.target, self.tenant_id).await?; let shards: Vec = shards.try_collect().await?; // Only read from shards that have the highest count: avoids redundantly downloading @@ -240,7 +236,8 @@ impl SnapshotDownloader { for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) { // Generate a stream of TenantTimelineId - let timelines = stream_tenant_timelines(&remote_client, &target, shard).await?; + let timelines = + stream_tenant_timelines(&self.remote_client, &self.target, shard).await?; // Generate a stream of S3TimelineBlobData async fn load_timeline_index( @@ -251,8 +248,8 @@ impl SnapshotDownloader { let data = list_timeline_blobs(remote_client, ttid, target).await?; Ok((ttid, data)) } - let timelines = - timelines.map_ok(|ttid| load_timeline_index(&remote_client, &target, ttid)); + let timelines = timelines + .map_ok(|ttid| load_timeline_index(&self.remote_client, &self.target, ttid)); let mut timelines = std::pin::pin!(timelines.try_buffered(8)); while let Some(i) = timelines.next().await { diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index 652c38f5c3..4b4b98aa6c 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -1,6 +1,7 @@ from __future__ import annotations import urllib.parse +from enum import StrEnum from typing import TYPE_CHECKING, final import requests @@ 
-9,11 +10,23 @@ from requests.auth import AuthBase from typing_extensions import override from fixtures.log_helper import log +from fixtures.utils import wait_until if TYPE_CHECKING: from requests import PreparedRequest +COMPUTE_AUDIENCE = "compute" +""" +The value to place in the `aud` claim. +""" + + +@final +class ComputeClaimsScope(StrEnum): + ADMIN = "admin" + + @final class BearerAuth(AuthBase): """ @@ -50,6 +63,35 @@ class EndpointHttpClient(requests.Session): res.raise_for_status() return res.json() + def prewarm_lfc_status(self) -> dict[str, str]: + res = self.get(f"http://localhost:{self.external_port}/lfc/prewarm") + res.raise_for_status() + json: dict[str, str] = res.json() + return json + + def prewarm_lfc(self): + self.post(f"http://localhost:{self.external_port}/lfc/prewarm").raise_for_status() + + def prewarmed(): + json = self.prewarm_lfc_status() + status, err = json["status"], json.get("error") + assert status == "completed", f"{status}, error {err}" + + wait_until(prewarmed) + + def offload_lfc(self): + url = f"http://localhost:{self.external_port}/lfc/offload" + self.post(url).raise_for_status() + + def offloaded(): + res = self.get(url) + res.raise_for_status() + json = res.json() + status, err = json["status"], json.get("error") + assert status == "completed", f"{status}, error {err}" + + wait_until(offloaded) + def database_schema(self, database: str): res = self.get( f"http://localhost:{self.external_port}/database_schema?database={urllib.parse.quote(database, safe='')}", diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index b5d69b5ab6..4eaa4b7d99 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -21,6 +21,7 @@ if TYPE_CHECKING: Any, ) + from fixtures.endpoint.http import ComputeClaimsScope from fixtures.pg_version import PgVersion @@ -535,12 +536,16 @@ class NeonLocalCli(AbstractNeonCli): res.check_returncode() return res - def endpoint_generate_jwt(self, endpoint_id: str) -> str: + def endpoint_generate_jwt( + self, endpoint_id: str, scope: ComputeClaimsScope | None = None + ) -> str: """ Generate a JWT for making requests to the endpoint's external HTTP server. 
""" args = ["endpoint", "generate-jwt", endpoint_id] + if scope: + args += ["--scope", str(scope)] cmd = self.raw_cli(args) cmd.check_returncode() @@ -552,7 +557,7 @@ class NeonLocalCli(AbstractNeonCli): endpoint_id: str, safekeepers_generation: int | None = None, safekeepers: list[int] | None = None, - remote_ext_config: str | None = None, + remote_ext_base_url: str | None = None, pageserver_id: int | None = None, allow_multiple: bool = False, create_test_user: bool = False, @@ -567,8 +572,8 @@ class NeonLocalCli(AbstractNeonCli): extra_env_vars = env or {} if basebackup_request_tries is not None: extra_env_vars["NEON_COMPUTE_TESTING_BASEBACKUP_TRIES"] = str(basebackup_request_tries) - if remote_ext_config is not None: - args.extend(["--remote-ext-config", remote_ext_config]) + if remote_ext_base_url is not None: + args.extend(["--remote-ext-base-url", remote_ext_base_url]) if safekeepers_generation is not None: args.extend(["--safekeepers-generation", str(safekeepers_generation)]) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 1d668d4b2d..1b4562c0b3 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -51,7 +51,7 @@ from fixtures.common_types import ( TimelineId, ) from fixtures.compute_migrations import NUM_COMPUTE_MIGRATIONS -from fixtures.endpoint.http import EndpointHttpClient +from fixtures.endpoint.http import ComputeClaimsScope, EndpointHttpClient from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.neon_cli import NeonLocalCli, Pagectl @@ -1185,7 +1185,9 @@ class NeonEnv: "broker": {}, "safekeepers": [], "pageservers": [], - "endpoint_storage": {"port": self.port_distributor.get_port()}, + "endpoint_storage": { + "listen_addr": f"127.0.0.1:{self.port_distributor.get_port()}", + }, "generate_local_ssl_certs": self.generate_local_ssl_certs, } @@ -1194,8 +1196,7 @@ class NeonEnv: else: cfg["broker"]["listen_addr"] = self.broker.listen_addr() - if self.control_plane_api is not None: - cfg["control_plane_api"] = self.control_plane_api + cfg["control_plane_api"] = self.control_plane_api if self.control_plane_hooks_api is not None: cfg["control_plane_hooks_api"] = self.control_plane_hooks_api @@ -1280,7 +1281,8 @@ class NeonEnv: ) tenant_config = ps_cfg.setdefault("tenant_config", {}) - tenant_config["rel_size_v2_enabled"] = True # Enable relsize_v2 by default in tests + # This feature is pending rollout. 
+ # tenant_config["rel_size_v2_enabled"] = True if self.pageserver_remote_storage is not None: ps_cfg["remote_storage"] = remote_storage_to_toml_dict( @@ -1297,13 +1299,6 @@ class NeonEnv: for key, value in override.items(): ps_cfg[key] = value - if self.pageserver_virtual_file_io_mode is not None: - # TODO(christian): https://github.com/neondatabase/neon/issues/11598 - if not config.test_may_use_compatibility_snapshot_binaries: - ps_cfg["virtual_file_io_mode"] = self.pageserver_virtual_file_io_mode - else: - log.info("ignoring virtual_file_io_mode parametrization for compatibility test") - if self.pageserver_wal_receiver_protocol is not None: key, value = PageserverWalReceiverProtocol.to_config_key_value( self.pageserver_wal_receiver_protocol @@ -1407,30 +1402,6 @@ class NeonEnv: for f in futs: f.result() - # Last step: register safekeepers at the storage controller - if ( - self.storage_controller_config is not None - and self.storage_controller_config.get("timelines_onto_safekeepers") is True - ): - for sk_id, sk in enumerate(self.safekeepers): - # 0 is an invalid safekeeper id - sk_id = sk_id + 1 - body = { - "id": sk_id, - "created_at": "2023-10-25T09:11:25Z", - "updated_at": "2024-08-28T11:32:43Z", - "region_id": "aws-us-east-2", - "host": "127.0.0.1", - "port": sk.port.pg, - "http_port": sk.port.http, - "https_port": None, - "version": 5957, - "availability_zone_id": f"us-east-2b-{sk_id}", - } - - self.storage_controller.on_safekeeper_deploy(sk_id, body) - self.storage_controller.safekeeper_scheduling_policy(sk_id, "Active") - self.endpoint_storage.start(timeout_in_seconds=timeout_in_seconds) def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True): @@ -3864,7 +3835,7 @@ class NeonAuthBroker: external_http_port: int, auth_backend: NeonAuthBroker.ProxyV1, ): - self.domain = "apiauth.local.neon.build" # resolves to 127.0.0.1 + self.domain = "local.neon.build" # resolves to 127.0.0.1 self.host = "127.0.0.1" self.http_port = http_port self.external_http_port = external_http_port @@ -3881,7 +3852,7 @@ class NeonAuthBroker: # generate key of it doesn't exist crt_path = self.test_output_dir / "proxy.crt" key_path = self.test_output_dir / "proxy.key" - generate_proxy_tls_certs("apiauth.local.neon.build", key_path, crt_path) + generate_proxy_tls_certs(f"apiauth.{self.domain}", key_path, crt_path) args = [ str(self.neon_binpath / "proxy"), @@ -3925,10 +3896,10 @@ class NeonAuthBroker: log.info(f"Executing http query: {query}") - connstr = f"postgresql://{user}@{self.domain}/postgres" + connstr = f"postgresql://{user}@ep-foo-bar-1234.{self.domain}/postgres" async with httpx.AsyncClient(verify=str(self.test_output_dir / "proxy.crt")) as client: response = await client.post( - f"https://{self.domain}:{self.external_http_port}/sql", + f"https://apiauth.{self.domain}:{self.external_http_port}/sql", json={"query": query, "params": args}, headers={ "Neon-Connection-String": connstr, @@ -4218,13 +4189,13 @@ class Endpoint(PgProtocol, LogUtils): self.config(config_lines) - self.__jwt = self.env.neon_cli.endpoint_generate_jwt(self.endpoint_id) + self.__jwt = self.generate_jwt() return self def start( self, - remote_ext_config: str | None = None, + remote_ext_base_url: str | None = None, pageserver_id: int | None = None, safekeeper_generation: int | None = None, safekeepers: list[int] | None = None, @@ -4250,7 +4221,7 @@ class Endpoint(PgProtocol, LogUtils): self.endpoint_id, safekeepers_generation=safekeeper_generation, safekeepers=self.active_safekeepers, - 
remote_ext_config=remote_ext_config, + remote_ext_base_url=remote_ext_base_url, pageserver_id=pageserver_id, allow_multiple=allow_multiple, create_test_user=create_test_user, @@ -4265,6 +4236,14 @@ class Endpoint(PgProtocol, LogUtils): return self + def generate_jwt(self, scope: ComputeClaimsScope | None = None) -> str: + """ + Generate a JWT for making requests to the endpoint's external HTTP + server. + """ + assert self.endpoint_id is not None + return self.env.neon_cli.endpoint_generate_jwt(self.endpoint_id, scope) + def endpoint_path(self) -> Path: """Path to endpoint directory""" assert self.endpoint_id @@ -4457,7 +4436,7 @@ class Endpoint(PgProtocol, LogUtils): hot_standby: bool = False, lsn: Lsn | None = None, config_lines: list[str] | None = None, - remote_ext_config: str | None = None, + remote_ext_base_url: str | None = None, pageserver_id: int | None = None, allow_multiple: bool = False, basebackup_request_tries: int | None = None, @@ -4476,7 +4455,7 @@ class Endpoint(PgProtocol, LogUtils): pageserver_id=pageserver_id, allow_multiple=allow_multiple, ).start( - remote_ext_config=remote_ext_config, + remote_ext_base_url=remote_ext_base_url, pageserver_id=pageserver_id, allow_multiple=allow_multiple, basebackup_request_tries=basebackup_request_tries, @@ -4560,7 +4539,7 @@ class EndpointFactory: lsn: Lsn | None = None, hot_standby: bool = False, config_lines: list[str] | None = None, - remote_ext_config: str | None = None, + remote_ext_base_url: str | None = None, pageserver_id: int | None = None, basebackup_request_tries: int | None = None, ) -> Endpoint: @@ -4580,7 +4559,7 @@ class EndpointFactory: hot_standby=hot_standby, config_lines=config_lines, lsn=lsn, - remote_ext_config=remote_ext_config, + remote_ext_base_url=remote_ext_base_url, pageserver_id=pageserver_id, basebackup_request_tries=basebackup_request_tries, ) @@ -5467,6 +5446,13 @@ def wait_for_last_flush_lsn( if last_flush_lsn is None: last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + # The last_flush_lsn may not correspond to a record boundary. + # For example, if the compute flushed WAL on a page boundary, + # the remaining part of the record might not be flushed for a long time. + # This would prevent the pageserver from reaching last_flush_lsn promptly. + # To ensure the rest of the record reaches the pageserver quickly, + # we forcibly flush the WAL by using CHECKPOINT. 
+ endpoint.safe_psql("CHECKPOINT") results = [] for tenant_shard_id, pageserver in shards: diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 24c856e279..43bffd919c 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -122,6 +122,10 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ ".*Call to node.*management API.*failed.*Timeout.*", ".*Failed to update node .+ after heartbeat round.*error sending request for url.*", ".*background_reconcile: failed to fetch top tenants:.*client error \\(Connect\\).*", + # Many tests will take safekeepers offline + ".*Call to safekeeper.*management API.*failed.*receive body.*", + ".*Call to safekeeper.*management API.*failed.*ReceiveBody.*", + ".*Call to safekeeper.*management API.*failed.*Timeout.*", # Many tests will start up with a node offline ".*startup_reconcile: Could not scan node.*", # Tests run in dev mode diff --git a/test_runner/performance/test_cumulative_statistics_persistence.py b/test_runner/performance/test_cumulative_statistics_persistence.py index 061467bbad..5e9e55cb0f 100644 --- a/test_runner/performance/test_cumulative_statistics_persistence.py +++ b/test_runner/performance/test_cumulative_statistics_persistence.py @@ -1,4 +1,5 @@ import math # Add this import +import os import time import traceback from pathlib import Path @@ -87,7 +88,10 @@ def test_cumulative_statistics_persistence( - insert additional tuples that by itself are not enough to trigger auto-vacuum but in combination with the previous tuples are - verify that autovacuum is triggered by the combination of tuples inserted before and after endpoint suspension """ - project = neon_api.create_project(pg_version) + project = neon_api.create_project( + pg_version, + f"Test cumulative statistics persistence, GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}", + ) project_id = project["project"]["id"] neon_api.wait_for_operation_to_finish(project_id) endpoint_id = project["endpoints"][0]["id"] diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py index bdafa2d657..c580bfcc14 100644 --- a/test_runner/performance/test_physical_replication.py +++ b/test_runner/performance/test_physical_replication.py @@ -62,7 +62,9 @@ def test_ro_replica_lag( pgbench_duration = f"-T{test_duration_min * 60 * 2}" - project = neon_api.create_project(pg_version) + project = neon_api.create_project( + pg_version, f"Test readonly replica lag, GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}" + ) project_id = project["project"]["id"] log.info("Project ID: %s", project_id) log.info("Primary endpoint ID: %s", project["endpoints"][0]["id"]) @@ -195,7 +197,9 @@ def test_replication_start_stop( pgbench_duration = f"-T{2**num_replicas * configuration_test_time_sec}" error_occurred = False - project = neon_api.create_project(pg_version) + project = neon_api.create_project( + pg_version, f"Test replication start stop, GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}" + ) project_id = project["project"]["id"] log.info("Project ID: %s", project_id) log.info("Primary endpoint ID: %s", project["endpoints"][0]["id"]) diff --git a/test_runner/random_ops/test_random_ops.py b/test_runner/random_ops/test_random_ops.py index 643151fa11..645c9b7b9d 100644 --- a/test_runner/random_ops/test_random_ops.py +++ b/test_runner/random_ops/test_random_ops.py @@ -206,7 +206,7 @@ class NeonProject: self.neon_api = neon_api self.pg_bin = pg_bin proj = 
self.neon_api.create_project( - pg_version, f"Automatic random API test {os.getenv('GITHUB_RUN_ID')}" + pg_version, f"Automatic random API test GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}" ) self.id: str = proj["project"]["id"] self.name: str = proj["project"]["name"] diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index ee408e3c65..3616467c00 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -186,7 +186,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "type": "interpreted", "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}}, }, - "rel_size_v2_enabled": False, # test suite enables it by default as of https://github.com/neondatabase/neon/issues/11081, so, custom config means disabling it + "rel_size_v2_enabled": True, "gc_compaction_enabled": True, "gc_compaction_verification": False, "gc_compaction_initial_threshold_kb": 1024000, diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 53edf9f79e..370f57b19d 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -202,6 +202,8 @@ def test_pageserver_gc_compaction_preempt( env = neon_env_builder.init_start(initial_tenant_conf=conf) env.pageserver.allowed_errors.append(".*The timeline or pageserver is shutting down.*") + env.pageserver.allowed_errors.append(".*flush task cancelled.*") + env.pageserver.allowed_errors.append(".*failed to pipe.*") tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -229,7 +231,7 @@ def test_pageserver_gc_compaction_preempt( @skip_in_debug_build("only run with release build") -@pytest.mark.timeout(600) # This test is slow with sanitizers enabled, especially on ARM +@pytest.mark.timeout(900) # This test is slow with sanitizers enabled, especially on ARM @pytest.mark.parametrize( "with_branches", ["with_branches", "no_branches"], diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index 37208c9fff..b66b326360 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -544,3 +544,69 @@ def test_drop_role_with_table_privileges_from_non_neon_superuser(neon_simple_env ) role = cursor.fetchone() assert role is None + + +def test_db_with_custom_settings(neon_simple_env: NeonEnv): + """ + Test that compute_ctl can work with databases that have some custom settings. + For example, role=some_other_role, default_transaction_read_only=on, + search_path=non_public_schema, statement_timeout=1 (1ms). 
+ """ + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + + TEST_ROLE = "some_other_role" + TEST_DB = "db_with_custom_settings" + TEST_SCHEMA = "non_public_schema" + + endpoint.respec_deep( + **{ + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "databases": [ + { + "name": TEST_DB, + "owner": TEST_ROLE, + } + ], + "roles": [ + { + "name": TEST_ROLE, + } + ], + }, + } + } + ) + + endpoint.reconfigure() + + with endpoint.cursor(dbname=TEST_DB) as cursor: + cursor.execute(f"CREATE SCHEMA {TEST_SCHEMA}") + cursor.execute(f"ALTER DATABASE {TEST_DB} SET role = {TEST_ROLE}") + cursor.execute(f"ALTER DATABASE {TEST_DB} SET default_transaction_read_only = on") + cursor.execute(f"ALTER DATABASE {TEST_DB} SET search_path = {TEST_SCHEMA}") + cursor.execute(f"ALTER DATABASE {TEST_DB} SET statement_timeout = 1") + + with endpoint.cursor(dbname=TEST_DB) as cursor: + cursor.execute("SELECT current_role") + role = cursor.fetchone() + assert role is not None + assert role[0] == TEST_ROLE + + cursor.execute("SHOW default_transaction_read_only") + default_transaction_read_only = cursor.fetchone() + assert default_transaction_read_only is not None + assert default_transaction_read_only[0] == "on" + + cursor.execute("SHOW search_path") + search_path = cursor.fetchone() + assert search_path is not None + assert search_path[0] == TEST_SCHEMA + + # Do not check statement_timeout, because we force it to 2min + # in `endpoint.cursor()` fixture. + + endpoint.reconfigure() diff --git a/test_runner/regress/test_compute_http.py b/test_runner/regress/test_compute_http.py new file mode 100644 index 0000000000..9846d44ce2 --- /dev/null +++ b/test_runner/regress/test_compute_http.py @@ -0,0 +1,78 @@ +from __future__ import annotations + +from http.client import FORBIDDEN, UNAUTHORIZED +from typing import TYPE_CHECKING + +import jwt +import pytest +from fixtures.endpoint.http import COMPUTE_AUDIENCE, ComputeClaimsScope, EndpointHttpClient +from fixtures.utils import run_only_on_default_postgres +from requests import RequestException + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv + + +@run_only_on_default_postgres("The code path being tested is not dependent on Postgres version") +def test_compute_no_scope_claim(neon_simple_env: NeonEnv): + """ + Test that if the JWT scope is not admin and no compute_id is specified, + the external HTTP server returns a 403 Forbidden error. + """ + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + + # Encode nothing in the token + token = jwt.encode({}, env.auth_keys.priv, algorithm="EdDSA") + + # Create an admin-scoped HTTP client + client = EndpointHttpClient( + external_port=endpoint.external_http_port, + internal_port=endpoint.internal_http_port, + jwt=token, + ) + + try: + client.status() + pytest.fail("Exception should have been raised") + except RequestException as e: + assert e.response is not None + assert e.response.status_code == FORBIDDEN + + +@pytest.mark.parametrize( + "audience", + (COMPUTE_AUDIENCE, "invalid", None), + ids=["with_audience", "with_invalid_audience", "without_audience"], +) +@run_only_on_default_postgres("The code path being tested is not dependent on Postgres version") +def test_compute_admin_scope_claim(neon_simple_env: NeonEnv, audience: str | None): + """ + Test that an admin-scoped JWT can access the compute's external HTTP server + without the compute_id being specified in the claims. 
+ """ + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + + data: dict[str, str | list[str]] = {"scope": str(ComputeClaimsScope.ADMIN)} + if audience: + data["aud"] = [audience] + + token = jwt.encode(data, env.auth_keys.priv, algorithm="EdDSA") + + # Create an admin-scoped HTTP client + client = EndpointHttpClient( + external_port=endpoint.external_http_port, + internal_port=endpoint.internal_http_port, + jwt=token, + ) + + try: + client.status() + if audience != COMPUTE_AUDIENCE: + pytest.fail("Exception should have been raised") + except RequestException as e: + assert e.response is not None + assert e.response.status_code == UNAUTHORIZED diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index 3b6c94a268..24ba0713d2 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -14,7 +14,7 @@ from fixtures.log_helper import log from fixtures.metrics import parse_metrics from fixtures.paths import BASE_DIR from fixtures.pg_config import PgConfigKey -from fixtures.utils import subprocess_capture +from fixtures.utils import WITH_SANITIZERS, subprocess_capture from werkzeug.wrappers.response import Response if TYPE_CHECKING: @@ -148,6 +148,15 @@ def test_remote_extensions( pg_config: PgConfig, extension: RemoteExtension, ): + if WITH_SANITIZERS and extension is RemoteExtension.WITH_LIB: + pytest.skip( + """ + For this test to work with sanitizers enabled, we would need to + compile the dummy Postgres extension with the same CFLAGS that we + compile Postgres and the neon extension with to link the sanitizers. + """ + ) + # Setup a mock nginx S3 gateway which will return our test extension. (host, port) = httpserver_listen_address extensions_endpoint = f"http://{host}:{port}/pg-ext-s3-gateway" @@ -212,7 +221,7 @@ def test_remote_extensions( endpoint.create_remote_extension_spec(spec) - endpoint.start(remote_ext_config=extensions_endpoint) + endpoint.start(remote_ext_base_url=extensions_endpoint) with endpoint.connect() as conn: with conn.cursor() as cur: @@ -240,7 +249,7 @@ def test_remote_extensions( # Remove the extension files to force a redownload of the extension. extension.remove(test_output_dir, pg_version) - endpoint.start(remote_ext_config=extensions_endpoint) + endpoint.start(remote_ext_base_url=extensions_endpoint) # Test that ALTER EXTENSION UPDATE statements also fetch remote extensions. 
with endpoint.connect() as conn: diff --git a/test_runner/regress/test_endpoint_storage.py b/test_runner/regress/test_endpoint_storage.py index 04029114ec..1e27ef4b14 100644 --- a/test_runner/regress/test_endpoint_storage.py +++ b/test_runner/regress/test_endpoint_storage.py @@ -4,10 +4,12 @@ import pytest from aiohttp import ClientSession from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import run_only_on_default_postgres from jwcrypto import jwk, jwt @pytest.mark.asyncio +@run_only_on_default_postgres("test doesn't use postgres") async def test_endpoint_storage_insert_retrieve_delete(neon_simple_env: NeonEnv): """ Inserts, retrieves, and deletes test file using a JWT token @@ -35,7 +37,6 @@ async def test_endpoint_storage_insert_retrieve_delete(neon_simple_env: NeonEnv) key = f"http://{base_url}/{tenant_id}/{timeline_id}/{endpoint_id}/key" headers = {"Authorization": f"Bearer {token}"} log.info(f"cache key url {key}") - log.info(f"token {token}") async with ClientSession(headers=headers) as session: async with session.get(key) as res: diff --git a/test_runner/regress/test_gist.py b/test_runner/regress/test_gist.py new file mode 100644 index 0000000000..89e3b9b2b1 --- /dev/null +++ b/test_runner/regress/test_gist.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv + + +# +# Test unlogged build for GIST index +# +def test_gist(neon_simple_env: NeonEnv): + env = neon_simple_env + endpoint = env.endpoints.create_start("main") + con = endpoint.connect() + cur = con.cursor() + iterations = 100 + + for _ in range(iterations): + cur.execute( + "CREATE TABLE pvactst (i INT, a INT[], p POINT) with (autovacuum_enabled = off)" + ) + cur.execute( + "INSERT INTO pvactst SELECT i, array[1,2,3], point(i, i+1) FROM generate_series(1,1000) i" + ) + cur.execute("CREATE INDEX gist_pvactst ON pvactst USING gist (p)") + cur.execute("VACUUM pvactst") + cur.execute("DROP TABLE pvactst") diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index a26c3994a5..2fda1991f7 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -641,6 +641,55 @@ def test_fast_import_binary( assert res[0][0] == 10 +def test_fast_import_event_triggers( + test_output_dir, + vanilla_pg: VanillaPostgres, + port_distributor: PortDistributor, + fast_import: FastImport, +): + vanilla_pg.start() + vanilla_pg.safe_psql(""" + CREATE FUNCTION test_event_trigger_for_drops() + RETURNS event_trigger LANGUAGE plpgsql AS $$ + DECLARE + obj record; + BEGIN + FOR obj IN SELECT * FROM pg_event_trigger_dropped_objects() + LOOP + RAISE NOTICE '% dropped object: % %.% %', + tg_tag, + obj.object_type, + obj.schema_name, + obj.object_name, + obj.object_identity; + END LOOP; + END + $$; + + CREATE EVENT TRIGGER test_event_trigger_for_drops + ON sql_drop + EXECUTE PROCEDURE test_event_trigger_for_drops(); + """) + + pg_port = port_distributor.get_port() + p = fast_import.run_pgdata(pg_port=pg_port, source_connection_string=vanilla_pg.connstr()) + assert p.returncode == 0 + + vanilla_pg.stop() + + pgbin = PgBin(test_output_dir, fast_import.pg_distrib_dir, fast_import.pg_version) + with VanillaPostgres( + fast_import.workdir / "pgdata", pgbin, pg_port, False + ) as new_pgdata_vanilla_pg: + new_pgdata_vanilla_pg.start() + + # database name and user are hardcoded in fast_import binary, and they are different from 
normal vanilla postgres + conn = PgProtocol(dsn=f"postgresql://cloud_admin@localhost:{pg_port}/neondb") + res = conn.safe_psql("SELECT count(*) FROM pg_event_trigger;") + log.info(f"Result: {res}") + assert res[0][0] == 0, f"Neon does not support importing event triggers, got: {res[0][0]}" + + def test_fast_import_restore_to_connstring( test_output_dir, vanilla_pg: VanillaPostgres, diff --git a/test_runner/regress/test_lfc_prewarm.py b/test_runner/regress/test_lfc_prewarm.py new file mode 100644 index 0000000000..82e1e9fcba --- /dev/null +++ b/test_runner/regress/test_lfc_prewarm.py @@ -0,0 +1,221 @@ +import random +import threading +import time +from enum import Enum + +import pytest +from fixtures.endpoint.http import EndpointHttpClient +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import USE_LFC +from prometheus_client.parser import text_string_to_metric_families as prom_parse_impl + + +class LfcQueryMethod(Enum): + COMPUTE_CTL = False + POSTGRES = True + + +PREWARM_LABEL = "compute_ctl_lfc_prewarm_requests_total" +OFFLOAD_LABEL = "compute_ctl_lfc_offload_requests_total" +QUERY_OPTIONS = LfcQueryMethod.POSTGRES, LfcQueryMethod.COMPUTE_CTL + + +def check_pinned_entries(cur): + # some LFC buffer can be temporary locked by autovacuum or background writer + for _ in range(10): + cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_chunks_pinned'") + n_pinned = cur.fetchall()[0][0] + if n_pinned == 0: + break + time.sleep(1) + assert n_pinned == 0 + + +def prom_parse(client: EndpointHttpClient) -> dict[str, float]: + return { + sample.name: sample.value + for family in prom_parse_impl(client.metrics()) + for sample in family.samples + if sample.name in (PREWARM_LABEL, OFFLOAD_LABEL) + } + + +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") +@pytest.mark.parametrize("query", QUERY_OPTIONS, ids=["postgres", "compute-ctl"]) +def test_lfc_prewarm(neon_simple_env: NeonEnv, query: LfcQueryMethod): + env = neon_simple_env + n_records = 1000000 + endpoint = env.endpoints.create_start( + branch_name="main", + config_lines=[ + "autovacuum = off", + "shared_buffers=1MB", + "neon.max_file_cache_size=1GB", + "neon.file_cache_size_limit=1GB", + "neon.file_cache_prewarm_limit=1000", + ], + ) + + pg_conn = endpoint.connect() + pg_cur = pg_conn.cursor() + pg_cur.execute("create extension neon version '1.6'") + pg_cur.execute("create database lfc") + + lfc_conn = endpoint.connect(dbname="lfc") + lfc_cur = lfc_conn.cursor() + log.info(f"Inserting {n_records} rows") + lfc_cur.execute("create table t(pk integer primary key, payload text default repeat('?', 128))") + lfc_cur.execute(f"insert into t (pk) values (generate_series(1,{n_records}))") + log.info(f"Inserted {n_records} rows") + + http_client = endpoint.http_client() + if query is LfcQueryMethod.COMPUTE_CTL: + status = http_client.prewarm_lfc_status() + assert status["status"] == "not_prewarmed" + assert "error" not in status + http_client.offload_lfc() + assert http_client.prewarm_lfc_status()["status"] == "not_prewarmed" + assert prom_parse(http_client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0} + else: + pg_cur.execute("select get_local_cache_state()") + lfc_state = pg_cur.fetchall()[0][0] + + endpoint.stop() + endpoint.start() + + # wait until compute_ctl completes downgrade of extension to default version + time.sleep(1) + pg_conn = endpoint.connect() + pg_cur = pg_conn.cursor() + pg_cur.execute("alter extension neon update to '1.6'") + + lfc_conn = 
endpoint.connect(dbname="lfc") + lfc_cur = lfc_conn.cursor() + + if query is LfcQueryMethod.COMPUTE_CTL: + http_client.prewarm_lfc() + else: + pg_cur.execute("select prewarm_local_cache(%s)", (lfc_state,)) + + pg_cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_used_pages'") + lfc_used_pages = pg_cur.fetchall()[0][0] + log.info(f"Used LFC size: {lfc_used_pages}") + pg_cur.execute("select * from get_prewarm_info()") + prewarm_info = pg_cur.fetchall()[0] + log.info(f"Prewarm info: {prewarm_info}") + total, prewarmed, skipped, _ = prewarm_info + progress = (prewarmed + skipped) * 100 // total + log.info(f"Prewarm progress: {progress}%") + + assert lfc_used_pages > 10000 + assert ( + prewarm_info[0] > 0 + and prewarm_info[1] > 0 + and prewarm_info[0] == prewarm_info[1] + prewarm_info[2] + ) + + lfc_cur.execute("select sum(pk) from t") + assert lfc_cur.fetchall()[0][0] == n_records * (n_records + 1) / 2 + + check_pinned_entries(pg_cur) + + desired = {"status": "completed", "total": total, "prewarmed": prewarmed, "skipped": skipped} + if query is LfcQueryMethod.COMPUTE_CTL: + assert http_client.prewarm_lfc_status() == desired + assert prom_parse(http_client) == {OFFLOAD_LABEL: 0, PREWARM_LABEL: 1} + + +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") +@pytest.mark.parametrize("query", QUERY_OPTIONS, ids=["postgres", "compute-ctl"]) +def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMethod): + env = neon_simple_env + n_records = 10000 + n_threads = 4 + endpoint = env.endpoints.create_start( + branch_name="main", + config_lines=[ + "shared_buffers=1MB", + "neon.max_file_cache_size=1GB", + "neon.file_cache_size_limit=1GB", + "neon.file_cache_prewarm_limit=1000000", + ], + ) + + pg_conn = endpoint.connect() + pg_cur = pg_conn.cursor() + pg_cur.execute("create extension neon version '1.6'") + pg_cur.execute("CREATE DATABASE lfc") + + lfc_conn = endpoint.connect(dbname="lfc") + lfc_cur = lfc_conn.cursor() + lfc_cur.execute( + "create table accounts(id integer primary key, balance bigint default 0, payload text default repeat('?', 1000)) with (fillfactor=10)" + ) + log.info(f"Inserting {n_records} rows") + lfc_cur.execute(f"insert into accounts(id) values (generate_series(1,{n_records}))") + log.info(f"Inserted {n_records} rows") + + http_client = endpoint.http_client() + if query is LfcQueryMethod.COMPUTE_CTL: + http_client.offload_lfc() + else: + pg_cur.execute("select get_local_cache_state()") + lfc_state = pg_cur.fetchall()[0][0] + + running = True + n_prewarms = 0 + + def workload(): + lfc_conn = endpoint.connect(dbname="lfc") + lfc_cur = lfc_conn.cursor() + n_transfers = 0 + while running: + src = random.randint(1, n_records) + dst = random.randint(1, n_records) + lfc_cur.execute("update accounts set balance=balance-100 where id=%s", (src,)) + lfc_cur.execute("update accounts set balance=balance+100 where id=%s", (dst,)) + n_transfers += 1 + log.info(f"Number of transfers: {n_transfers}") + + def prewarm(): + pg_conn = endpoint.connect() + pg_cur = pg_conn.cursor() + while running: + pg_cur.execute("alter system set neon.file_cache_size_limit='1MB'") + pg_cur.execute("select pg_reload_conf()") + pg_cur.execute("alter system set neon.file_cache_size_limit='1GB'") + pg_cur.execute("select pg_reload_conf()") + + if query is LfcQueryMethod.COMPUTE_CTL: + http_client.prewarm_lfc() + else: + pg_cur.execute("select prewarm_local_cache(%s)", (lfc_state,)) + + nonlocal n_prewarms + n_prewarms += 1 + log.info(f"Number of prewarms: 
{n_prewarms}") + + workload_threads = [] + for _ in range(n_threads): + t = threading.Thread(target=workload) + workload_threads.append(t) + t.start() + + prewarm_thread = threading.Thread(target=prewarm) + prewarm_thread.start() + + time.sleep(20) + + running = False + for t in workload_threads: + t.join() + prewarm_thread.join() + + lfc_cur.execute("select sum(balance) from accounts") + total_balance = lfc_cur.fetchall()[0][0] + assert total_balance == 0 + + check_pinned_entries(pg_cur) + if query is LfcQueryMethod.COMPUTE_CTL: + assert prom_parse(http_client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: n_prewarms} diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index fa1cd61206..e3f9982486 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -3,7 +3,7 @@ Tests in this module exercise the pageserver's behavior around generation numbers, as defined in docs/rfcs/025-generation-numbers.md. Briefly, the behaviors we require of the pageserver are: -- Do not start a tenant without a generation number if control_plane_api is set +- Do not start a tenant without a generation number - Remote objects must be suffixed with generation - Deletions may only be executed after validating generation - Updates to remote_consistent_lsn may only be made visible after validating generation diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index acec0ba44a..ffde08a73f 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -506,7 +506,6 @@ class SyntheticSizeVerifier: PER_METRIC_VERIFIERS = { "remote_storage_size": CannotVerifyAnything, - "resident_size": CannotVerifyAnything, "written_size": WrittenDataVerifier, "written_data_bytes_delta": WrittenDataDeltaVerifier, "timeline_logical_size": CannotVerifyAnything, diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 0fea706888..474002353b 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -471,7 +471,7 @@ def test_tx_abort_with_many_relations( try: # Rollback phase should be fast: this is one WAL record that we should process efficiently fut = exec.submit(rollback_and_wait) - fut.result(timeout=15) + fut.result(timeout=15 if reldir_type == "v1" else 30) except: exec.shutdown(wait=False, cancel_futures=True) raise diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 0bfc4b1d8c..4c9887fb92 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1334,6 +1334,13 @@ def test_sharding_split_failures( tenant_id, timeline_id, shard_count=initial_shard_count, placement_policy='{"Attached":1}' ) + # Create bystander tenants with various shard counts. They should not be affected by the aborted + # splits. Regression test for https://github.com/neondatabase/cloud/issues/28589. 
+ bystanders = {} # id → shard_count + for bystander_shard_count in [1, 2, 4, 8]: + id, _ = env.create_tenant(shard_count=bystander_shard_count) + bystanders[id] = bystander_shard_count + env.storage_controller.allowed_errors.extend( [ # All split failures log a warning when then enqueue the abort operation @@ -1394,6 +1401,8 @@ def test_sharding_split_failures( locations = ps.http_client().tenant_list_locations()["tenant_shards"] for loc in locations: tenant_shard_id = TenantShardId.parse(loc[0]) + if tenant_shard_id.tenant_id != tenant_id: + continue # skip bystanders log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}") assert tenant_shard_id.shard_count == initial_shard_count if loc[1]["mode"] == "Secondary": @@ -1414,6 +1423,8 @@ def test_sharding_split_failures( locations = ps.http_client().tenant_list_locations()["tenant_shards"] for loc in locations: tenant_shard_id = TenantShardId.parse(loc[0]) + if tenant_shard_id.tenant_id != tenant_id: + continue # skip bystanders log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}") assert tenant_shard_id.shard_count == split_shard_count if loc[1]["mode"] == "Secondary": @@ -1496,6 +1507,12 @@ def test_sharding_split_failures( # the scheduler reaches an idle state env.storage_controller.reconcile_until_idle(timeout_secs=30) + # Check that all bystanders are still around. + for bystander_id, bystander_shard_count in bystanders.items(): + response = env.storage_controller.tenant_describe(bystander_id) + assert TenantId(response["tenant_id"]) == bystander_id + assert len(response["shards"]) == bystander_shard_count + env.storage_controller.consistency_check() diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index a71652af8a..d42c5d403e 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -1822,7 +1822,7 @@ def test_timeline_detach_with_aux_files_with_detach_v1( endpoint2.safe_psql( "SELECT pg_create_logical_replication_slot('test_slot_restore', 'pgoutput')" ) - lsn3 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id) + lsn3 = wait_for_last_flush_lsn(env, endpoint2, env.initial_tenant, branch_timeline_id) assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([]) assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn3).keys()) == set( ["pg_replslot/test_slot_restore/state"] @@ -1839,7 +1839,7 @@ def test_timeline_detach_with_aux_files_with_detach_v1( assert all_reparented == set([]) # We need to ensure all safekeeper data are ingested before checking aux files: the API does not wait for LSN. 
- wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id) + wait_for_last_flush_lsn(env, endpoint2, env.initial_tenant, branch_timeline_id) assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn2).keys()) == set( ["pg_replslot/test_slot_parent_1/state", "pg_replslot/test_slot_parent_2/state"] ), "main branch unaffected" diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index d3c9d61fb7..108856a4ae 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit d3c9d61fb7a362a165dac7060819dd9d6ad68c28 +Subproject commit 108856a4ae76be285b04497a0ed08fcbe60ddbe9 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 8ecb12f21d..b838c8969b 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 8ecb12f21d862dfa39f7204b8f5e1c00a2a225b3 +Subproject commit b838c8969b7c63f3e637a769656f5f36793b797c diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 37496f87b5..05ddf212e2 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 37496f87b5324af53c56127e278ee5b1e8435253 +Subproject commit 05ddf212e2e07b788b5c8b88bdcf98630941f6ae diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index eab3a37834..b763ab54b9 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit eab3a37834cac6ec0719bf817ac918a201712d66 +Subproject commit b763ab54b98d232a0959371ab1d07f06ed77c49e diff --git a/vendor/revisions.json b/vendor/revisions.json index 90d878d0f7..4307fd1c3f 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.4", - "eab3a37834cac6ec0719bf817ac918a201712d66" + "b763ab54b98d232a0959371ab1d07f06ed77c49e" ], "v16": [ "16.8", - "37496f87b5324af53c56127e278ee5b1e8435253" + "05ddf212e2e07b788b5c8b88bdcf98630941f6ae" ], "v15": [ "15.12", - "8ecb12f21d862dfa39f7204b8f5e1c00a2a225b3" + "b838c8969b7c63f3e637a769656f5f36793b797c" ], "v14": [ "14.17", - "d3c9d61fb7a362a165dac7060819dd9d6ad68c28" + "108856a4ae76be285b04497a0ed08fcbe60ddbe9" ] }
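The safekeeper binary change above drops the `SAFEKEEPER_AUTH_TOKEN` environment variable and reads the peer-auth JWT only from `--auth-token-path`. Below is a minimal standalone sketch of that flow, assuming the `tokio` (with `fs`, `rt` and `macros` features) and `anyhow` crates; `load_auth_token` is an invented helper, not the actual safekeeper code.

```rust
use std::path::PathBuf;

// Hypothetical helper mirroring the new behaviour: no env-var fallback,
// just an optional file path whose contents are trimmed and used as the token.
async fn load_auth_token(auth_token_path: Option<&PathBuf>) -> anyhow::Result<Option<String>> {
    match auth_token_path {
        Some(path) => {
            let token = tokio::fs::read_to_string(path).await?;
            // Trim the trailing newline so the token can be sent verbatim in headers.
            Ok(Some(token.trim().to_owned()))
        }
        None => Ok(None),
    }
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let path = std::env::args().nth(1).map(PathBuf::from);
    let token = load_auth_token(path.as_ref()).await?;
    println!("JWT token configured: {}", token.is_some());
    Ok(())
}
```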
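The new status-gathering logic in `safekeeper/src/http/routes.rs` tolerates one unreachable safekeeper: with N candidate hosts it requires N-1 successful status responses, but never fewer than one. A self-contained sketch of that rule (the helper name is invented; the real code computes the value inline):

```rust
// With 3 hosts one may be down (2 responses suffice); with 2 hosts a single
// response is accepted; a lone host must still answer.
fn min_required_successful(num_hosts: usize) -> usize {
    // saturating_sub avoids underflow for num_hosts == 0; the real code
    // assumes at least one host is listed.
    num_hosts.saturating_sub(1).max(1)
}

fn main() {
    assert_eq!(min_required_successful(3), 2);
    assert_eq!(min_required_successful(2), 1);
    assert_eq!(min_required_successful(1), 1);
    println!("quorum rule holds");
}
```

For the three-safekeeper scenario described in the diff's comment, this means a pull on C with A and B listed as hosts proceeds even when exactly one of them is unreachable.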
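The `storage_controller/src/service.rs` hunks above patch the timeline create request after shard zero has run, so that shards > 0 branch at the LSN shard zero picked and reuse its initdb instead of generating their own. A simplified sketch of that mutation; `CreateMode`, `Lsn` and `TimelineId` below are stand-ins for illustration, not the real `models` types.

```rust
#[derive(Debug)]
struct Lsn(u64);

#[derive(Debug)]
struct TimelineId(u128);

#[derive(Debug)]
enum CreateMode {
    Branch { ancestor_start_lsn: Option<Lsn> },
    Bootstrap { existing_initdb_timeline_id: Option<TimelineId> },
}

// Patch the request before sending it to shards other than shard zero.
fn patch_for_nonzero_shards(
    mode: &mut CreateMode,
    new_timeline_id: TimelineId,
    shard_zero_ancestor_lsn: Option<Lsn>,
) {
    match mode {
        // Caller gave no LSN: pin every shard to the LSN shard zero picked.
        CreateMode::Branch { ancestor_start_lsn } if ancestor_start_lsn.is_none() => {
            *ancestor_start_lsn = shard_zero_ancestor_lsn;
        }
        // Bootstrap: shards > 0 must not run initdb themselves.
        CreateMode::Bootstrap { existing_initdb_timeline_id } => {
            *existing_initdb_timeline_id = Some(new_timeline_id);
        }
        _ => {}
    }
}

fn main() {
    let mut bootstrap = CreateMode::Bootstrap { existing_initdb_timeline_id: None };
    patch_for_nonzero_shards(&mut bootstrap, TimelineId(42), None);
    println!("patched: {bootstrap:?}");

    let mut branch = CreateMode::Branch { ancestor_start_lsn: None };
    patch_for_nonzero_shards(&mut branch, TimelineId(42), Some(Lsn(0x1000)));
    println!("patched: {branch:?}");
}
```

Per the comments in the diff, the point of the design is that all shards share the one initdb image shard zero uploads to S3, so shards cannot end up with different initial states if pageservers carry different Postgres binaries.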
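`safekeeper_reconciler.rs` above introduces a `tokio::sync::Semaphore` sized by the new `safekeeper_reconciler_concurrency` setting to bound how many reconciles run at once. The following is a standalone sketch of the underlying technique only, assuming `tokio` with its full feature set: the job names are made up, and the owned permit is moved into the spawned task so it is released when the task completes, which is one common way to apply such a limit to detached tasks (the diff structures its loop differently).

```rust
use std::sync::Arc;
use std::time::Duration;

use tokio::sync::Semaphore;

#[tokio::main]
async fn main() {
    // At most 4 jobs in flight, analogous to the per-safekeeper limit.
    let limiter = Arc::new(Semaphore::new(4));
    let mut handles = Vec::new();

    for job in 0..16u32 {
        // acquire_owned() yields a permit that can be moved into the task and
        // is released automatically when the permit is dropped.
        let permit = limiter
            .clone()
            .acquire_owned()
            .await
            .expect("semaphore closed");
        handles.push(tokio::spawn(async move {
            let _permit = permit; // held for the task's lifetime
            tokio::time::sleep(Duration::from_millis(50)).await;
            println!("finished job {job}");
        }));
    }

    for handle in handles {
        handle.await.unwrap();
    }
}
```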
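The scrubber change wraps each per-timeline GC future in `.instrument(info_span!("gc_timeline", %ttid))` so every log line carries the timeline it belongs to. A minimal sketch of that pattern, assuming the `tracing`, `tracing-subscriber` and `tokio` crates; `gc_one_timeline` and the ttid strings are invented.

```rust
use tracing::{Instrument, info, info_span};

// Invented stand-in for the per-timeline GC work in the scrubber.
async fn gc_one_timeline(ttid: &str) {
    info!("running GC for {ttid}");
}

#[tokio::main]
async fn main() {
    tracing_subscriber::fmt::init();

    for ttid in ["tenant-a/timeline-1", "tenant-a/timeline-2"] {
        // Every event emitted inside gc_one_timeline() now carries the
        // gc_timeline span with its ttid field.
        gc_one_timeline(ttid)
            .instrument(info_span!("gc_timeline", %ttid))
            .await;
    }
}
```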