diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 39a30d9a39..edc456d611 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -8,6 +8,7 @@ self-hosted-runner: - small-arm64 - us-east-2 config-variables: + - AWS_ECR_REGION - AZURE_DEV_CLIENT_ID - AZURE_DEV_REGISTRY_NAME - AZURE_DEV_SUBSCRIPTION_ID @@ -15,23 +16,25 @@ config-variables: - AZURE_PROD_REGISTRY_NAME - AZURE_PROD_SUBSCRIPTION_ID - AZURE_TENANT_ID + - BENCHMARK_INGEST_TARGET_PROJECTID + - BENCHMARK_LARGE_OLTP_PROJECTID - BENCHMARK_PROJECT_ID_PUB - BENCHMARK_PROJECT_ID_SUB - - REMOTE_STORAGE_AZURE_CONTAINER - - REMOTE_STORAGE_AZURE_REGION - - SLACK_UPCOMING_RELEASE_CHANNEL_ID - DEV_AWS_OIDC_ROLE_ARN - - BENCHMARK_INGEST_TARGET_PROJECTID - - PGREGRESS_PG16_PROJECT_ID - - PGREGRESS_PG17_PROJECT_ID - - SLACK_ON_CALL_QA_STAGING_STREAM - DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN - - SLACK_ON_CALL_STORAGE_STAGING_STREAM - - SLACK_CICD_CHANNEL_ID - - SLACK_STORAGE_CHANNEL_ID + - HETZNER_CACHE_BUCKET + - HETZNER_CACHE_ENDPOINT + - HETZNER_CACHE_REGION - NEON_DEV_AWS_ACCOUNT_ID - NEON_PROD_AWS_ACCOUNT_ID - - AWS_ECR_REGION - - BENCHMARK_LARGE_OLTP_PROJECTID + - PGREGRESS_PG16_PROJECT_ID + - PGREGRESS_PG17_PROJECT_ID + - REMOTE_STORAGE_AZURE_CONTAINER + - REMOTE_STORAGE_AZURE_REGION + - SLACK_CICD_CHANNEL_ID - SLACK_ON_CALL_DEVPROD_STREAM + - SLACK_ON_CALL_QA_STAGING_STREAM + - SLACK_ON_CALL_STORAGE_STAGING_STREAM - SLACK_RUST_CHANNEL_ID + - SLACK_STORAGE_CHANNEL_ID + - SLACK_UPCOMING_RELEASE_CHANNEL_ID diff --git a/.github/scripts/generate_image_maps.py b/.github/scripts/generate_image_maps.py index d8f910271b..d3ec048409 100644 --- a/.github/scripts/generate_image_maps.py +++ b/.github/scripts/generate_image_maps.py @@ -39,12 +39,18 @@ registries = { ], } +release_branches = ["release", "release-proxy", "release-compute"] + outputs: dict[str, dict[str, list[str]]] = {} -target_tags = [target_tag, "latest"] if branch == "main" else [target_tag] -target_stages = ( - ["dev", 
"prod"] if branch in ["release", "release-proxy", "release-compute"] else ["dev"] +target_tags = ( + [target_tag, "latest"] + if branch == "main" + else [target_tag, "released"] + if branch in release_branches + else [target_tag] ) +target_stages = ["dev", "prod"] if branch in release_branches else ["dev"] for component_name, component_images in components.items(): for stage in target_stages: diff --git a/.github/scripts/push_with_image_map.py b/.github/scripts/push_with_image_map.py index c68f6ad407..85e2eb1937 100644 --- a/.github/scripts/push_with_image_map.py +++ b/.github/scripts/push_with_image_map.py @@ -2,6 +2,9 @@ import json import os import subprocess +RED = "\033[91m" +RESET = "\033[0m" + image_map = os.getenv("IMAGE_MAP") if not image_map: raise ValueError("IMAGE_MAP environment variable is not set") @@ -11,12 +14,32 @@ try: except json.JSONDecodeError as e: raise ValueError("Failed to parse IMAGE_MAP as JSON") from e -for source, targets in parsed_image_map.items(): - for target in targets: - cmd = ["docker", "buildx", "imagetools", "create", "-t", target, source] - print(f"Running: {' '.join(cmd)}") - result = subprocess.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) +failures = [] - if result.returncode != 0: - print(f"Error: {result.stdout}") - raise RuntimeError(f"Command failed: {' '.join(cmd)}") +pending = [(source, target) for source, targets in parsed_image_map.items() for target in targets] + +while len(pending) > 0: + if len(failures) > 10: + print("Error: more than 10 failures!") + for failure in failures: + print(f'"{failure[0]}" failed with the following output:') + print(failure[1]) + raise RuntimeError("Retry limit reached.") + + source, target = pending.pop(0) + cmd = ["docker", "buildx", "imagetools", "create", "-t", target, source] + print(f"Running: {' '.join(cmd)}") + result = subprocess.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + + if result.returncode != 0: + failures.append((" 
".join(cmd), result.stdout, target)) + pending.append((source, target)) + print( + f"{RED}[RETRY]{RESET} Push failed for {target}. Retrying... (failure count: {len(failures)})" + ) + print(result.stdout) + +if len(failures) > 0 and (github_output := os.getenv("GITHUB_OUTPUT")): + failed_targets = [target for _, _, target in failures] + with open(github_output, "a") as f: + f.write(f"push_failures={json.dumps(failed_targets)}\n") diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml index 0703e2c4d6..df107920c1 100644 --- a/.github/workflows/_benchmarking_preparation.yml +++ b/.github/workflows/_benchmarking_preparation.yml @@ -8,6 +8,9 @@ defaults: run: shell: bash -euxo pipefail {0} +permissions: + contents: read + jobs: setup-databases: permissions: @@ -34,6 +37,11 @@ jobs: options: --init steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - name: Set up Connection String id: set-up-prep-connstr run: | @@ -58,10 +66,10 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - - uses: actions/checkout@v4 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 + uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index b950187fe1..8b1314f95b 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -37,6 +37,9 @@ env: RUST_BACKTRACE: 1 COPT: '-Werror' +permissions: + contents: read + jobs: build-neon: runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 
'large')) }} @@ -59,7 +62,12 @@ jobs: BUILD_TAG: ${{ inputs.build-tag }} steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: true @@ -120,29 +128,49 @@ jobs: - name: Cache postgres v14 build id: cache_pg_14 - uses: actions/cache@v4 + uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1 # v1.8.0 with: + endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }} + bucket: ${{ vars.HETZNER_CACHE_BUCKET }} + accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }} + secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }} + use-fallback: false path: pg_install/v14 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }} - name: Cache postgres v15 build id: cache_pg_15 - uses: actions/cache@v4 + uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1 # v1.8.0 with: + endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }} + bucket: ${{ vars.HETZNER_CACHE_BUCKET }} + accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }} + secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }} + use-fallback: false path: pg_install/v15 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }} - name: Cache postgres v16 build id: cache_pg_16 - uses: actions/cache@v4 + uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1 # v1.8.0 with: + endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }} + bucket: ${{ vars.HETZNER_CACHE_BUCKET }} + accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }} + secretKey: ${{ 
secrets.HETZNER_CACHE_SECRET_KEY }} + use-fallback: false path: pg_install/v16 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }} - name: Cache postgres v17 build id: cache_pg_17 - uses: actions/cache@v4 + uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1 # v1.8.0 with: + endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }} + bucket: ${{ vars.HETZNER_CACHE_BUCKET }} + accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }} + secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }} + use-fallback: false path: pg_install/v17 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }} @@ -221,7 +249,7 @@ jobs: fi - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 + uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} @@ -330,7 +358,12 @@ jobs: fail-fast: false matrix: ${{ fromJSON(format('{{"include":{0}}}', inputs.test-cfg)) }} steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: true diff --git a/.github/workflows/_check-codestyle-python.yml b/.github/workflows/_check-codestyle-python.yml index 868ac15f3c..2def92ffa2 100644 --- a/.github/workflows/_check-codestyle-python.yml +++ b/.github/workflows/_check-codestyle-python.yml @@ -12,6 +12,9 @@ defaults: run: shell: bash -euxo pipefail {0} +permissions: + contents: read + jobs: check-codestyle-python: runs-on: [ self-hosted, small ] @@ -27,10 +30,21 
@@ jobs: options: --init steps: - - uses: actions/checkout@v4 - - - uses: actions/cache@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Cache poetry deps + uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1 # v1.8.0 + with: + endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }} + bucket: ${{ vars.HETZNER_CACHE_BUCKET }} + accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }} + secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }} + use-fallback: false path: ~/.cache/pypoetry/virtualenvs key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} diff --git a/.github/workflows/_check-codestyle-rust.yml b/.github/workflows/_check-codestyle-rust.yml index 83eeb83e45..4f844b0bf6 100644 --- a/.github/workflows/_check-codestyle-rust.yml +++ b/.github/workflows/_check-codestyle-rust.yml @@ -37,14 +37,24 @@ jobs: options: --init steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: true - name: Cache cargo deps - uses: actions/cache@v4 + uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1 # v1.8.0 with: + endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }} + bucket: ${{ vars.HETZNER_CACHE_BUCKET }} + accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }} + secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }} + use-fallback: false path: | ~/.cargo/registry !~/.cargo/registry/src diff --git a/.github/workflows/_create-release-pr.yml b/.github/workflows/_create-release-pr.yml index 
9b1d1aa454..bfbb45e30b 100644 --- a/.github/workflows/_create-release-pr.yml +++ b/.github/workflows/_create-release-pr.yml @@ -20,6 +20,9 @@ defaults: run: shell: bash -euo pipefail {0} +permissions: + contents: read + jobs: create-release-branch: runs-on: ubuntu-22.04 @@ -28,7 +31,12 @@ jobs: contents: write # for `git push` steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: ref: ${{ inputs.source-branch }} fetch-depth: 0 diff --git a/.github/workflows/_meta.yml b/.github/workflows/_meta.yml index 44802f0525..1ad37f13ed 100644 --- a/.github/workflows/_meta.yml +++ b/.github/workflows/_meta.yml @@ -5,10 +5,16 @@ on: github-event-name: type: string required: true + github-event-json: + type: string + required: true outputs: build-tag: description: "Tag for the current workflow run" value: ${{ jobs.tags.outputs.build-tag }} + release-tag: + description: "Tag for the release if this is an RC PR run" + value: ${{ jobs.tags.outputs.release-tag }} previous-storage-release: description: "Tag of the last storage release" value: ${{ jobs.tags.outputs.storage }} @@ -24,6 +30,9 @@ on: release-pr-run-id: description: "Only available if `run-kind in [storage-release, proxy-release, compute-release]`. Contains the run ID of the `Build and Test` workflow, assuming one with the current commit can be found." 
value: ${{ jobs.tags.outputs.release-pr-run-id }} + sha: + description: "github.event.pull_request.head.sha on release PRs, github.sha otherwise" + value: ${{ jobs.tags.outputs.sha }} permissions: {} @@ -35,19 +44,22 @@ jobs: tags: runs-on: ubuntu-22.04 outputs: - build-tag: ${{ steps.build-tag.outputs.tag }} + build-tag: ${{ steps.build-tag.outputs.build-tag }} + release-tag: ${{ steps.build-tag.outputs.release-tag }} compute: ${{ steps.previous-releases.outputs.compute }} proxy: ${{ steps.previous-releases.outputs.proxy }} storage: ${{ steps.previous-releases.outputs.storage }} run-kind: ${{ steps.run-kind.outputs.run-kind }} release-pr-run-id: ${{ steps.release-pr-run-id.outputs.release-pr-run-id }} + sha: ${{ steps.sha.outputs.sha }} permissions: contents: read steps: # Need `fetch-depth: 0` to count the number of commits in the branch - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: - fetch-depth: 0 + egress-policy: audit - name: Get run kind id: run-kind @@ -69,6 +81,23 @@ jobs: run: | echo "run-kind=$RUN_KIND" | tee -a $GITHUB_OUTPUT + - name: Get the right SHA + id: sha + env: + SHA: > + ${{ + contains(fromJSON('["storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), steps.run-kind.outputs.run-kind) + && fromJSON(inputs.github-event-json).pull_request.head.sha + || github.sha + }} + run: | + echo "sha=$SHA" | tee -a $GITHUB_OUTPUT + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 + ref: ${{ steps.sha.outputs.sha }} + - name: Get build tag id: build-tag env: @@ -79,16 +108,16 @@ jobs: run: | case $RUN_KIND in push-main) - echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + echo "build-tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT ;; storage-release) - echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + echo "build-tag=release-$(git 
rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT ;; proxy-release) - echo "tag=release-proxy-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + echo "build-tag=release-proxy-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT ;; compute-release) - echo "tag=release-compute-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + echo "build-tag=release-compute-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT ;; pr|storage-rc-pr|compute-rc-pr|proxy-rc-pr) BUILD_AND_TEST_RUN_ID=$(gh api --paginate \ @@ -96,10 +125,21 @@ jobs: -H "X-GitHub-Api-Version: 2022-11-28" \ "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=${CURRENT_SHA}&branch=${CURRENT_BRANCH}" \ | jq '[.workflow_runs[] | select(.name == "Build and Test")][0].id // ("Error: No matching workflow run found." | halt_error(1))') - echo "tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT + echo "build-tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT + case $RUN_KIND in + storage-rc-pr) + echo "release-tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + ;; + proxy-rc-pr) + echo "release-tag=release-proxy-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + ;; + compute-rc-pr) + echo "release-tag=release-compute-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + ;; + esac ;; workflow-dispatch) - echo "tag=$GITHUB_RUN_ID" | tee -a $GITHUB_OUTPUT + echo "build-tag=$GITHUB_RUN_ID" | tee -a $GITHUB_OUTPUT ;; *) echo "Unexpected RUN_KIND ('${RUN_KIND}'), failing to assign build-tag!" 
@@ -123,7 +163,7 @@ jobs: if: ${{ contains(fromJSON('["storage-release", "compute-release", "proxy-release"]'), steps.run-kind.outputs.run-kind) }} env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + CURRENT_SHA: ${{ github.sha }} run: | RELEASE_PR_RUN_ID=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=$CURRENT_SHA" | jq '[.workflow_runs[] | select(.name == "Build and Test") | select(.head_branch | test("^rc/release(-(proxy|compute))?/[0-9]{4}-[0-9]{2}-[0-9]{2}$"; "s"))] | first | .id // ("Failed to find Build and Test run from RC PR!" | halt_error(1))') echo "release-pr-run-id=$RELEASE_PR_RUN_ID" | tee -a $GITHUB_OUTPUT diff --git a/.github/workflows/_push-to-container-registry.yml b/.github/workflows/_push-to-container-registry.yml index 949eeca4b1..7d3a11409b 100644 --- a/.github/workflows/_push-to-container-registry.yml +++ b/.github/workflows/_push-to-container-registry.yml @@ -49,7 +49,12 @@ jobs: id-token: write # Required for aws/azure login packages: write # required for pushing to GHCR steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: .github/scripts/push_with_image_map.py sparse-checkout-cone-mode: false @@ -59,7 +64,7 @@ jobs: - name: Configure AWS credentials if: contains(inputs.image-map, 'amazonaws.com/') - uses: aws-actions/configure-aws-credentials@v4 + uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: "${{ inputs.aws-region }}" role-to-assume: "arn:aws:iam::${{ inputs.aws-account-id }}:role/${{ inputs.aws-role-to-assume }}" @@ -67,7 +72,7 @@ jobs: - name: Login to ECR if: contains(inputs.image-map, 'amazonaws.com/') - uses: aws-actions/amazon-ecr-login@v2 
+ uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 with: registries: "${{ inputs.aws-account-id }}" @@ -86,19 +91,38 @@ jobs: - name: Login to GHCR if: contains(inputs.image-map, 'ghcr.io/') - uses: docker/login-action@v3 + uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Log in to Docker Hub - uses: docker/login-action@v3 + uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - name: Copy docker images to target registries + id: push run: python3 .github/scripts/push_with_image_map.py env: IMAGE_MAP: ${{ inputs.image-map }} + + - name: Notify Slack if container image pushing fails + if: steps.push.outputs.push_failures || failure() + uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0 + with: + method: chat.postMessage + token: ${{ secrets.SLACK_BOT_TOKEN }} + payload: | + channel: ${{ vars.SLACK_ON_CALL_DEVPROD_STREAM }} + text: > + *Container image pushing ${{ + steps.push.outcome == 'failure' && 'failed completely' || 'succeeded with some retries' + }}* in + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + + ${{ steps.push.outputs.push_failures && format( + '*Failed targets:*\n• {0}', join(fromJson(steps.push.outputs.push_failures), '\n• ') + ) || '' }} diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml index 0e53830040..da291bc67e 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -26,8 +26,13 @@ jobs: needs: [ check-permissions ] runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v4 - - uses: reviewdog/action-actionlint@v1 + - name: Harden the runner (Audit all outbound calls) + uses: 
step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - uses: reviewdog/action-actionlint@a5524e1c19e62881d79c1f1b9b6f09f16356e281 # v1.65.2 env: # SC2046 - Quote this to prevent word splitting. - https://www.shellcheck.net/wiki/SC2046 # SC2086 - Double quote to prevent globbing and word splitting. - https://www.shellcheck.net/wiki/SC2086 diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml index f4e1e2e96c..fecb86770b 100644 --- a/.github/workflows/approved-for-ci-run.yml +++ b/.github/workflows/approved-for-ci-run.yml @@ -47,6 +47,11 @@ jobs: runs-on: ubuntu-22.04 steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run" create-or-update-pr-for-ci-run: @@ -63,9 +68,14 @@ jobs: runs-on: ubuntu-22.04 steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run" - - uses: actions/checkout@v4 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: ref: ${{ github.event.pull_request.head.sha }} token: ${{ secrets.CI_ACCESS_TOKEN }} @@ -153,6 +163,11 @@ jobs: runs-on: ubuntu-22.04 steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - name: Close PR and delete `ci-run/pr-${{ env.PR_NUMBER }}` branch run: | CLOSED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --json 'closed' --jq 
'.[].closed')" diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 0cffb3787b..8af23820f4 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -94,10 +94,15 @@ jobs: options: --init steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Configure AWS credentials # necessary on Azure runners - uses: aws-actions/configure-aws-credentials@v4 + uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} @@ -164,7 +169,7 @@ jobs: - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} - uses: slackapi/slack-github-action@v1 + uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 with: channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | @@ -197,10 +202,15 @@ jobs: options: --init steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 + uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} @@ -252,10 +262,15 @@ jobs: options: --init steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - 
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 + uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} @@ -314,7 +329,7 @@ jobs: # Post both success and failure to the Slack channel - name: Post to a Slack channel if: ${{ github.event.schedule && !cancelled() }} - uses: slackapi/slack-github-action@v1 + uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 with: channel-id: "C06T9AMNDQQ" # on-call-compute-staging-stream slack-message: | @@ -346,6 +361,11 @@ jobs: tpch-compare-matrix: ${{ steps.tpch-compare-matrix.outputs.matrix }} steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - name: Generate matrix for pgbench benchmark id: pgbench-compare-matrix run: | @@ -465,10 +485,15 @@ jobs: timeout-minutes: 480 steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 + uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} @@ -600,7 +625,7 @@ jobs: - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} - uses: slackapi/slack-github-action@v1 + uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 with: channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | @@ -649,10 +674,15 @@ jobs: 
options: --init steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 + uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} @@ -726,7 +756,7 @@ jobs: - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} - uses: slackapi/slack-github-action@v1 + uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 with: channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | @@ -778,10 +808,15 @@ jobs: timeout-minutes: 720 steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 + uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} @@ -854,7 +889,7 @@ jobs: - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} - uses: slackapi/slack-github-action@v1 + uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 with: channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | @@ -899,10 +934,15 @@ jobs: options: --init steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # 
v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 + uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} @@ -979,7 +1019,7 @@ jobs: - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} - uses: slackapi/slack-github-action@v1 + uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 with: channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | @@ -1018,10 +1058,15 @@ jobs: options: --init steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 + uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} @@ -1091,7 +1136,7 @@ jobs: - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} - uses: slackapi/slack-github-action@v1 + uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 with: channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index f7c91e7412..133c8635b6 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -53,9 +53,14 @@ jobs: packages: read steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: 
step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit - - uses: docker/login-action@v3 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: registry: ghcr.io username: ${{ github.actor }} @@ -108,31 +113,36 @@ jobs: runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 - - uses: docker/setup-buildx-action@v3 + - uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0 with: cache-binary: false - - uses: docker/login-action@v3 + - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - uses: docker/login-action@v3 + - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - - uses: docker/login-action@v3 + - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: registry: cache.neon.build username: ${{ secrets.NEON_CI_DOCKERCACHE_USERNAME }} password: ${{ secrets.NEON_CI_DOCKERCACHE_PASSWORD }} - - uses: docker/build-push-action@v6 + - uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4 # v6.15.0 with: file: build-tools.Dockerfile context: . 
@@ -154,12 +164,17 @@ jobs: packages: write steps: - - uses: docker/login-action@v3 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - uses: docker/login-action@v3 + - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: registry: ghcr.io username: ${{ github.actor }} diff --git a/.github/workflows/build-macos.yml b/.github/workflows/build-macos.yml index b24a872152..148c1ef5af 100644 --- a/.github/workflows/build-macos.yml +++ b/.github/workflows/build-macos.yml @@ -28,6 +28,9 @@ env: # - You can connect up to four levels of workflows # - You can call a maximum of 20 unique reusable workflows from a single workflow file. # https://docs.github.com/en/actions/sharing-automations/reusing-workflows#limitations +permissions: + contents: read + jobs: build-pgxn: if: | @@ -46,8 +49,13 @@ jobs: # Hence keeping target/ (and general cache size) smaller BUILD_TYPE: release steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - name: Checkout main repo - uses: actions/checkout@v4 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Set pg ${{ matrix.postgres-version }} for caching id: pg_rev @@ -55,8 +63,13 @@ jobs: - name: Cache postgres ${{ matrix.postgres-version }} build id: cache_pg - uses: actions/cache@v4 + uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1 # v1.8.0 with: + endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }} + bucket: ${{ vars.HETZNER_CACHE_BUCKET }} + accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }} + secretKey: ${{ 
secrets.HETZNER_CACHE_SECRET_KEY }} + use-fallback: false path: pg_install/${{ matrix.postgres-version }} key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ matrix.postgres-version }}-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} @@ -107,8 +120,13 @@ jobs: # Hence keeping target/ (and general cache size) smaller BUILD_TYPE: release steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - name: Checkout main repo - uses: actions/checkout@v4 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Set pg v17 for caching id: pg_rev @@ -116,15 +134,25 @@ jobs: - name: Cache postgres v17 build id: cache_pg - uses: actions/cache@v4 + uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1 # v1.8.0 with: + endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }} + bucket: ${{ vars.HETZNER_CACHE_BUCKET }} + accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }} + secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }} + use-fallback: false path: pg_install/v17 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache walproposer-lib id: cache_walproposer_lib - uses: actions/cache@v4 + uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1 # v1.8.0 with: + endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }} + bucket: ${{ vars.HETZNER_CACHE_BUCKET }} + accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }} + secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }} + use-fallback: false path: pg_install/build/walproposer-lib key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} @@ -165,8 +193,13 @@ jobs: # Hence keeping target/ (and general cache 
size) smaller BUILD_TYPE: release steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - name: Checkout main repo - uses: actions/checkout@v4 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: true @@ -185,32 +218,57 @@ jobs: - name: Cache postgres v14 build id: cache_pg - uses: actions/cache@v4 + uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1 # v1.8.0 with: + endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }} + bucket: ${{ vars.HETZNER_CACHE_BUCKET }} + accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }} + secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }} + use-fallback: false path: pg_install/v14 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v14-${{ steps.pg_rev_v14.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v15 build id: cache_pg_v15 - uses: actions/cache@v4 + uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1 # v1.8.0 with: + endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }} + bucket: ${{ vars.HETZNER_CACHE_BUCKET }} + accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }} + secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }} + use-fallback: false path: pg_install/v15 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v15-${{ steps.pg_rev_v15.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v16 build id: cache_pg_v16 - uses: actions/cache@v4 + uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1 # v1.8.0 with: + endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }} + bucket: ${{ vars.HETZNER_CACHE_BUCKET }} + accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }} + secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }} + use-fallback: false path: pg_install/v16 key: v1-${{ runner.os }}-${{ 
runner.arch }}-${{ env.BUILD_TYPE }}-pg-v16-${{ steps.pg_rev_v16.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v17 build id: cache_pg_v17 - uses: actions/cache@v4 + uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1 # v1.8.0 with: + endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }} + bucket: ${{ vars.HETZNER_CACHE_BUCKET }} + accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }} + secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }} + use-fallback: false path: pg_install/v17 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v17-${{ steps.pg_rev_v17.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache cargo deps (only for v17) - uses: actions/cache@v4 + uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1 # v1.8.0 with: + endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }} + bucket: ${{ vars.HETZNER_CACHE_BUCKET }} + accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }} + secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }} + use-fallback: false path: | ~/.cargo/registry !~/.cargo/registry/src @@ -220,8 +278,13 @@ jobs: - name: Cache walproposer-lib id: cache_walproposer_lib - uses: actions/cache@v4 + uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1 # v1.8.0 with: + endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }} + bucket: ${{ vars.HETZNER_CACHE_BUCKET }} + accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }} + secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }} + use-fallback: false path: pg_install/build/walproposer-lib key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev_v17.outputs.pg_rev }}-${{ hashFiles('Makefile') }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index bc88da316a..46c8cd6fc9 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -37,6 +37,11 
@@ jobs: runs-on: ubuntu-22.04 steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - name: Cancel previous e2e-tests runs for this PR env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} @@ -53,8 +58,13 @@ jobs: check-rust-dependencies: ${{ steps.files-changed.outputs.rust_dependencies }} steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: true @@ -70,6 +80,7 @@ jobs: uses: ./.github/workflows/_meta.yml with: github-event-name: ${{ github.event_name }} + github-event-json: ${{ toJSON(github.event) }} build-build-tools-image: needs: [ check-permissions ] @@ -78,8 +89,8 @@ jobs: check-codestyle-python: needs: [ meta, check-permissions, build-build-tools-image ] - # No need to run on `main` because we this in the merge queue - if: ${{ needs.meta.outputs.run-kind == 'pr' }} + # No need to run on `main` because we this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes. + if: ${{ contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} uses: ./.github/workflows/_check-codestyle-python.yml with: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm @@ -87,7 +98,8 @@ jobs: check-codestyle-jsonnet: needs: [ meta, check-permissions, build-build-tools-image ] - if: ${{ contains(fromJSON('["pr", "push-main"]'), needs.meta.outputs.run-kind) }} + # We do need to run this in `.*-rc-pr` because of hotfixes. 
+ if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }} @@ -97,8 +109,13 @@ jobs: options: --init steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Check Jsonnet code formatting run: | @@ -110,12 +127,17 @@ jobs: needs: [ check-permissions ] runs-on: ubuntu-22.04 steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: true - - uses: dorny/paths-filter@v3 + - uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2 id: check-if-submodules-changed with: filters: | @@ -124,7 +146,7 @@ jobs: - name: Check vendor/postgres-v14 submodule reference if: steps.check-if-submodules-changed.outputs.vendor == 'true' - uses: jtmullen/submodule-branch-check-action@v1 + uses: jtmullen/submodule-branch-check-action@ab0d3a69278e3fa0a2d4f3be3199d2514b676e13 # v1.3.0 with: path: "vendor/postgres-v14" fetch_depth: "50" @@ -133,7 +155,7 @@ jobs: - name: Check vendor/postgres-v15 submodule reference if: steps.check-if-submodules-changed.outputs.vendor == 'true' - uses: jtmullen/submodule-branch-check-action@v1 + uses: jtmullen/submodule-branch-check-action@ab0d3a69278e3fa0a2d4f3be3199d2514b676e13 # v1.3.0 with: path: "vendor/postgres-v15" fetch_depth: "50" @@ -142,7 +164,7 @@ jobs: - name: Check vendor/postgres-v16 submodule reference if: 
steps.check-if-submodules-changed.outputs.vendor == 'true' - uses: jtmullen/submodule-branch-check-action@v1 + uses: jtmullen/submodule-branch-check-action@ab0d3a69278e3fa0a2d4f3be3199d2514b676e13 # v1.3.0 with: path: "vendor/postgres-v16" fetch_depth: "50" @@ -151,7 +173,7 @@ jobs: - name: Check vendor/postgres-v17 submodule reference if: steps.check-if-submodules-changed.outputs.vendor == 'true' - uses: jtmullen/submodule-branch-check-action@v1 + uses: jtmullen/submodule-branch-check-action@ab0d3a69278e3fa0a2d4f3be3199d2514b676e13 # v1.3.0 with: path: "vendor/postgres-v17" fetch_depth: "50" @@ -160,8 +182,8 @@ jobs: check-codestyle-rust: needs: [ meta, check-permissions, build-build-tools-image ] - # No need to run on `main` because we this in the merge queue - if: ${{ needs.meta.outputs.run-kind == 'pr' }} + # No need to run on `main` because we this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes. + if: ${{ contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} uses: ./.github/workflows/_check-codestyle-rust.yml with: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm @@ -170,7 +192,8 @@ jobs: check-dependencies-rust: needs: [ meta, files-changed, build-build-tools-image ] - if: ${{ needs.files-changed.outputs.check-rust-dependencies == 'true' && needs.meta.outputs.run-kind == 'pr' }} + # No need to run on `main` because we this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes. 
+ if: ${{ needs.files-changed.outputs.check-rust-dependencies == 'true' && contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} uses: ./.github/workflows/cargo-deny.yml with: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm @@ -178,7 +201,8 @@ jobs: build-and-test-locally: needs: [ meta, build-build-tools-image ] - if: ${{ contains(fromJSON('["pr", "push-main"]'), needs.meta.outputs.run-kind) }} + # We do need to run this in `.*-rc-pr` because of hotfixes. + if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} strategy: fail-fast: false matrix: @@ -219,12 +243,22 @@ jobs: password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Cache poetry deps - uses: actions/cache@v4 + uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1 # v1.8.0 with: + endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }} + bucket: ${{ vars.HETZNER_CACHE_BUCKET }} + accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }} + secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }} + use-fallback: false path: ~/.cache/pypoetry/virtualenvs key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} @@ -265,8 +299,13 @@ jobs: pytest_split_group: [ 1, 2, 3, 4, 5 ] build_type: [ release ] steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - name: Checkout - uses: actions/checkout@v4 + uses: 
actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Pytest benchmarks uses: ./.github/actions/run-python-test-set @@ -294,7 +333,12 @@ jobs: runs-on: ubuntu-22.04 steps: - - uses: slackapi/slack-github-action@v2 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0 with: method: chat.postMessage token: ${{ secrets.SLACK_BOT_TOKEN }} @@ -325,7 +369,12 @@ jobs: options: --init steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Create Allure report if: ${{ !cancelled() }} @@ -337,7 +386,7 @@ jobs: env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} - - uses: actions/github-script@v7 + - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 if: ${{ !cancelled() }} with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries @@ -385,7 +434,12 @@ jobs: coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }} steps: # Need `fetch-depth: 0` for differential coverage (to get diff between two commits) - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: true fetch-depth: 0 @@ -456,7 +510,7 @@ jobs: REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/summary.json echo "summary-json=${REPORT_URL}" >> $GITHUB_OUTPUT - - uses: 
actions/github-script@v7 + - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 env: REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }} COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} @@ -495,6 +549,7 @@ jobs: uses: ./.github/workflows/trigger-e2e-tests.yml with: github-event-name: ${{ github.event_name }} + github-event-json: ${{ toJSON(github.event) }} secrets: inherit neon-image-arch: @@ -510,33 +565,39 @@ jobs: packages: write steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: true + ref: ${{ needs.meta.outputs.sha }} - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 - - uses: docker/setup-buildx-action@v3 + - uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0 with: cache-binary: false - - uses: docker/login-action@v3 + - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - uses: docker/login-action@v3 + - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - - uses: docker/login-action@v3 + - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: registry: cache.neon.build username: ${{ secrets.NEON_CI_DOCKERCACHE_USERNAME }} password: ${{ secrets.NEON_CI_DOCKERCACHE_PASSWORD }} - - uses: docker/build-push-action@v6 + - uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4 # v6.15.0 with: context: . 
# ARM-specific flags are recommended for Graviton ≥ 2, these flags are also supported by Ampere Altra (Azure) @@ -544,7 +605,7 @@ jobs: build-args: | ADDITIONAL_RUSTFLAGS=${{ matrix.arch == 'arm64' && '-Ctarget-feature=+lse -Ctarget-cpu=neoverse-n1' || '' }} GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} - BUILD_TAG=${{ needs.meta.outputs.build-tag }} + BUILD_TAG=${{ needs.meta.outputs.release-tag || needs.meta.outputs.build-tag }} TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-bookworm DEBIAN_VERSION=bookworm provenance: false @@ -567,7 +628,12 @@ jobs: packages: write steps: - - uses: docker/login-action@v3 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: registry: ghcr.io username: ${{ github.actor }} @@ -609,12 +675,18 @@ jobs: runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: true + ref: ${{ needs.meta.outputs.sha }} - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 - - uses: docker/setup-buildx-action@v3 + - uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0 with: cache-binary: false # Disable parallelism for docker buildkit. 
@@ -623,31 +695,31 @@ jobs: [worker.oci] max-parallelism = 1 - - uses: docker/login-action@v3 + - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - uses: docker/login-action@v3 + - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - - uses: docker/login-action@v3 + - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: registry: cache.neon.build username: ${{ secrets.NEON_CI_DOCKERCACHE_USERNAME }} password: ${{ secrets.NEON_CI_DOCKERCACHE_PASSWORD }} - name: Build compute-node image - uses: docker/build-push-action@v6 + uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4 # v6.15.0 with: context: . build-args: | GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} PG_VERSION=${{ matrix.version.pg }} - BUILD_TAG=${{ needs.meta.outputs.build-tag }} + BUILD_TAG=${{ needs.meta.outputs.release-tag || needs.meta.outputs.build-tag }} TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false @@ -661,13 +733,13 @@ jobs: - name: Build neon extensions test image if: matrix.version.pg >= 'v16' - uses: docker/build-push-action@v6 + uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4 # v6.15.0 with: context: . 
build-args: | GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} PG_VERSION=${{ matrix.version.pg }} - BUILD_TAG=${{ needs.meta.outputs.build-tag }} + BUILD_TAG=${{ needs.meta.outputs.release-tag || needs.meta.outputs.build-tag }} TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false @@ -703,7 +775,12 @@ jobs: debian: bookworm steps: - - uses: docker/login-action@v3 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: registry: ghcr.io username: ${{ github.actor }} @@ -748,7 +825,12 @@ jobs: VM_BUILDER_VERSION: v0.42.2 steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Downloading vm-builder run: | @@ -756,7 +838,7 @@ jobs: chmod +x vm-builder - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 - - uses: docker/login-action@v3 + - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: registry: ghcr.io username: ${{ github.actor }} @@ -796,7 +878,12 @@ jobs: - pg: v16 - pg: v17 steps: - - uses: docker/login-action@v3 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: registry: ghcr.io username: ${{ github.actor }} @@ -830,16 +917,21 @@ jobs: runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 
'small-arm64' || 'small')) }} steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 - - uses: docker/login-action@v3 + - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - uses: docker/login-action@v3 + - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: registry: ghcr.io username: ${{ github.actor }} @@ -888,7 +980,7 @@ jobs: TEST_EXTENSIONS_TAG: >- ${{ contains(fromJSON('["storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) - && 'latest' + && needs.meta.outputs.previous-compute-release || needs.meta.outputs.build-tag }} TEST_VERSION_ONLY: ${{ matrix.pg_version }} @@ -930,7 +1022,12 @@ jobs: compute-dev: ${{ steps.generate.outputs.compute-dev }} compute-prod: ${{ steps.generate.outputs.compute-prod }} steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: .github/scripts/generate_image_maps.py sparse-checkout-cone-mode: false @@ -1098,6 +1195,11 @@ jobs: contents: write pull-requests: write steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - name: Set PR's status to pending and request a remote CI test run: | COMMIT_SHA=${{ github.event.pull_request.head.sha || github.sha }} @@ 
-1179,11 +1281,16 @@ jobs: runs-on: [ self-hosted, small ] container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/ansible:latest steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Create git tag and GitHub release if: ${{ contains(fromJSON('["storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) }} - uses: actions/github-script@v7 + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 env: TAG: "${{ needs.meta.outputs.build-tag }}" BRANCH: "${{ github.ref_name }}" @@ -1331,8 +1438,13 @@ jobs: if: github.ref_name == 'release' && needs.deploy.result != 'success' && always() runs-on: ubuntu-22.04 steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - name: Post release-deploy failure to team-storage slack channel - uses: slackapi/slack-github-action@v2 + uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0 with: method: chat.postMessage token: ${{ secrets.SLACK_BOT_TOKEN }} @@ -1353,7 +1465,12 @@ jobs: runs-on: ubuntu-22.04 steps: - - uses: aws-actions/configure-aws-credentials@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} @@ -1441,15 +1558,20 @@ jobs: steps: # The list of possible results: # 
https://docs.github.com/en/actions/learn-github-actions/contexts#needs-context + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - name: Fail the job if any of the dependencies do not succeed run: exit 1 if: | contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') - || (needs.check-dependencies-rust.result == 'skipped' && needs.files-changed.outputs.check-rust-dependencies == 'true' && needs.meta.outputs.run-kind == 'pr') - || (needs.build-and-test-locally.result == 'skipped' && needs.meta.outputs.run-kind == 'pr') - || (needs.check-codestyle-python.result == 'skipped' && needs.meta.outputs.run-kind == 'pr') - || (needs.check-codestyle-rust.result == 'skipped' && needs.meta.outputs.run-kind == 'pr') + || (needs.check-dependencies-rust.result == 'skipped' && needs.files-changed.outputs.check-rust-dependencies == 'true' && contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) + || (needs.build-and-test-locally.result == 'skipped' && contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) + || (needs.check-codestyle-python.result == 'skipped' && contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) + || (needs.check-codestyle-rust.result == 'skipped' && contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) || needs.files-changed.result == 'skipped' || (needs.push-compute-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) || (needs.push-neon-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)) 
diff --git a/.github/workflows/build_and_test_with_sanitizers.yml b/.github/workflows/build_and_test_with_sanitizers.yml index 389b59c1a5..c31b05fea2 100644 --- a/.github/workflows/build_and_test_with_sanitizers.yml +++ b/.github/workflows/build_and_test_with_sanitizers.yml @@ -33,7 +33,12 @@ jobs: steps: # Need `fetch-depth: 0` to count the number of commits in the branch - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 @@ -99,7 +104,12 @@ jobs: options: --init steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Create Allure report if: ${{ !cancelled() }} @@ -111,7 +121,7 @@ jobs: env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} - - uses: actions/github-script@v7 + - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 if: ${{ !cancelled() }} with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries diff --git a/.github/workflows/cargo-deny.yml b/.github/workflows/cargo-deny.yml index a4f476c99a..4f1f8d344b 100644 --- a/.github/workflows/cargo-deny.yml +++ b/.github/workflows/cargo-deny.yml @@ -9,6 +9,9 @@ on: schedule: - cron: '0 10 * * *' +permissions: + contents: read + jobs: cargo-deny: strategy: @@ -35,8 +38,13 @@ jobs: options: --init steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - name: Checkout - uses: actions/checkout@v4 + uses: 
actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: ref: ${{ matrix.ref }} @@ -48,7 +56,7 @@ jobs: - name: Post to a Slack channel if: ${{ github.event_name == 'schedule' && failure() }} - uses: slackapi/slack-github-action@v2 + uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0 with: method: chat.postMessage token: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.github/workflows/check-permissions.yml b/.github/workflows/check-permissions.yml index 9c42794797..407f612887 100644 --- a/.github/workflows/check-permissions.yml +++ b/.github/workflows/check-permissions.yml @@ -18,6 +18,11 @@ jobs: check-permissions: runs-on: ubuntu-22.04 steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@v2 + with: + egress-policy: audit + - name: Disallow CI runs on PRs from forks if: | inputs.github-event-name == 'pull_request' && diff --git a/.github/workflows/cleanup-caches-by-a-branch.yml b/.github/workflows/cleanup-caches-by-a-branch.yml index 0c074e36dc..3608d8b074 100644 --- a/.github/workflows/cleanup-caches-by-a-branch.yml +++ b/.github/workflows/cleanup-caches-by-a-branch.yml @@ -11,6 +11,11 @@ jobs: cleanup: runs-on: ubuntu-22.04 steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@v2 + with: + egress-policy: audit + - name: Cleanup run: | gh extension install actions/gh-actions-cache diff --git a/.github/workflows/cloud-regress.yml b/.github/workflows/cloud-regress.yml index 566629e15c..7ae8d46000 100644 --- a/.github/workflows/cloud-regress.yml +++ b/.github/workflows/cloud-regress.yml @@ -44,7 +44,12 @@ jobs: options: --init steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: true 
@@ -121,7 +126,7 @@ jobs: - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} - uses: slackapi/slack-github-action@v1 + uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 with: channel-id: ${{ vars.SLACK_ON_CALL_QA_STAGING_STREAM }} slack-message: | diff --git a/.github/workflows/fast-forward.yml b/.github/workflows/fast-forward.yml index bc63ff120d..a292522b88 100644 --- a/.github/workflows/fast-forward.yml +++ b/.github/workflows/fast-forward.yml @@ -13,6 +13,11 @@ jobs: runs-on: ubuntu-22.04 steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@v2 + with: + egress-policy: audit + - name: Remove fast-forward label to PR env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} diff --git a/.github/workflows/force-test-extensions-upgrade.yml b/.github/workflows/force-test-extensions-upgrade.yml index f2376306dc..9c9357055d 100644 --- a/.github/workflows/force-test-extensions-upgrade.yml +++ b/.github/workflows/force-test-extensions-upgrade.yml @@ -34,7 +34,12 @@ jobs: runs-on: small steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: false @@ -50,7 +55,7 @@ jobs: echo tag=${tag} >> ${GITHUB_OUTPUT} - name: Test extension upgrade - timeout-minutes: 20 + timeout-minutes: 60 env: NEW_COMPUTE_TAG: latest OLD_COMPUTE_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }} @@ -67,7 +72,7 @@ jobs: - name: Post to the Slack channel if: ${{ github.event.schedule && failure() }} - uses: slackapi/slack-github-action@v1 + uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 with: channel-id: ${{ vars.SLACK_ON_CALL_QA_STAGING_STREAM }} slack-message: | diff --git 
a/.github/workflows/ingest_benchmark.yml b/.github/workflows/ingest_benchmark.yml index 37ee371311..35e4838a86 100644 --- a/.github/workflows/ingest_benchmark.yml +++ b/.github/workflows/ingest_benchmark.yml @@ -23,6 +23,9 @@ concurrency: group: ingest-bench-workflow cancel-in-progress: true +permissions: + contents: read + jobs: ingest: strategy: @@ -75,10 +78,15 @@ jobs: timeout-minutes: 1440 steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Configure AWS credentials # necessary to download artefacts - uses: aws-actions/configure-aws-credentials@v4 + uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} diff --git a/.github/workflows/label-for-external-users.yml b/.github/workflows/label-for-external-users.yml index b7cbc06a73..02d128179d 100644 --- a/.github/workflows/label-for-external-users.yml +++ b/.github/workflows/label-for-external-users.yml @@ -27,6 +27,11 @@ jobs: is-member: ${{ steps.check-user.outputs.is-member }} steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@v2 + with: + egress-policy: audit + - name: Check whether `${{ github.actor }}` is a member of `${{ github.repository_owner }}` id: check-user env: @@ -69,6 +74,11 @@ jobs: issues: write # for `gh issue edit` steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@v2 + with: + egress-policy: audit + - name: Add `${{ env.LABEL }}` label env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/large_oltp_benchmark.yml b/.github/workflows/large_oltp_benchmark.yml index fea21877f8..2b63cbd044 100644 --- 
a/.github/workflows/large_oltp_benchmark.yml +++ b/.github/workflows/large_oltp_benchmark.yml @@ -24,6 +24,9 @@ concurrency: group: large-oltp-bench-workflow cancel-in-progress: false +permissions: + contents: read + jobs: oltp: strategy: @@ -62,10 +65,15 @@ jobs: timeout-minutes: 2880 steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Configure AWS credentials # necessary to download artefacts - uses: aws-actions/configure-aws-credentials@v4 + uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} @@ -160,7 +168,7 @@ jobs: api_key: ${{ secrets.NEON_STAGING_API_KEY }} - name: Configure AWS credentials # again because prior steps could have exceeded 5 hours - uses: aws-actions/configure-aws-credentials@v4 + uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} @@ -175,7 +183,7 @@ jobs: - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} - uses: slackapi/slack-github-action@v1 + uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 with: channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | diff --git a/.github/workflows/lint-release-pr.yml b/.github/workflows/lint-release-pr.yml index b7d010f66d..226a060595 100644 --- a/.github/workflows/lint-release-pr.yml +++ b/.github/workflows/lint-release-pr.yml @@ -7,12 +7,20 @@ on: - release-proxy - release-compute +permissions: + contents: read + jobs: lint-release-pr: runs-on: ubuntu-22.04 steps: + - name: Harden the runner (Audit all outbound calls) + uses: 
step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - name: Checkout PR branch - uses: actions/checkout@v4 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 # Fetch full history for git operations ref: ${{ github.event.pull_request.head.ref }} diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 558aba1e2e..11aa4d2c94 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -42,8 +42,13 @@ jobs: rebuild_everything: ${{ steps.files_changed.outputs.rebuild_neon_extra || steps.files_changed.outputs.rebuild_macos }} steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: true @@ -101,8 +106,13 @@ jobs: CARGO_INCREMENTAL: 0 steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: true @@ -117,7 +127,7 @@ jobs: run: cargo build --all --release --timings -j$(nproc) - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 + uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} @@ -134,7 +144,7 @@ jobs: echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT - name: Publish build stats report - uses: actions/github-script@v7 + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 env: REPORT_URL: ${{ 
steps.upload-stats.outputs.report-url }} SHA: ${{ github.event.pull_request.head.sha || github.sha }} diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index 433b969b0c..0fe002bc07 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -25,6 +25,9 @@ concurrency: group: ${{ github.workflow }} cancel-in-progress: false +permissions: + contents: read + jobs: trigger_bench_on_ec2_machine_in_eu_central_1: permissions: @@ -48,13 +51,18 @@ jobs: steps: # we don't need the neon source code because we run everything remotely # however we still need the local github actions to run the allure step below - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Show my own (github runner) external IP address - usefull for IP allowlisting run: curl https://ifconfig.me - name: Assume AWS OIDC role that allows to manage (start/stop/describe... EC machine) - uses: aws-actions/configure-aws-credentials@v4 + uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN }} @@ -143,7 +151,7 @@ jobs: - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} - uses: slackapi/slack-github-action@v1 + uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 with: channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" @@ -161,7 +169,7 @@ jobs: - name: Assume AWS OIDC role that allows to manage (start/stop/describe... 
EC machine) if: always() && steps.poll_step.outputs.too_many_runs != 'true' - uses: aws-actions/configure-aws-credentials@v4 + uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN }} diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml index cb5ae556d8..098503769e 100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -88,7 +88,12 @@ jobs: ports: - 8083:8083 steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Download Neon artifact uses: ./.github/actions/download @@ -138,7 +143,7 @@ jobs: - name: Post to a Slack channel if: github.event.schedule && failure() - uses: slackapi/slack-github-action@v1 + uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 with: channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | @@ -158,7 +163,12 @@ jobs: options: --init --user root steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Download Neon artifact uses: ./.github/actions/download @@ -206,7 +216,7 @@ jobs: - name: Post to a Slack channel if: github.event.schedule && failure() - uses: slackapi/slack-github-action@v1 + uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 with: channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | diff --git a/.github/workflows/pin-build-tools-image.yml 
b/.github/workflows/pin-build-tools-image.yml index ddeefe0128..f8d8172cb0 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -40,6 +40,11 @@ jobs: skip: ${{ steps.check-manifests.outputs.skip }} steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@v2 + with: + egress-policy: audit + - name: Check if we really need to pin the image id: check-manifests env: diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index bbe4638235..6fb4753fc0 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -27,7 +27,12 @@ jobs: branch: ${{ steps.group-metadata.outputs.branch }} pr-number: ${{ steps.group-metadata.outputs.pr-number }} steps: - - uses: actions/checkout@v4 + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1 id: python-src @@ -125,8 +130,13 @@ jobs: - check-codestyle-rust runs-on: ubuntu-22.04 steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - name: Create fake `neon-cloud-e2e` check - uses: actions/github-script@v7 + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries retries: 5 diff --git a/.github/workflows/regenerate-pg-setting.yml b/.github/workflows/regenerate-pg-setting.yml index 1e9d2ec5e2..bda3098b0e 100644 --- a/.github/workflows/regenerate-pg-setting.yml +++ b/.github/workflows/regenerate-pg-setting.yml @@ -23,8 +23,13 @@ jobs: runs-on: 
ubuntu-22.04 steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - name: Add comment - uses: thollander/actions-comment-pull-request@v3 + uses: thollander/actions-comment-pull-request@65f9e5c9a1f2cd378bd74b2e057c9736982a8e74 # v3 with: comment-tag: ${{ github.job }} pr-number: ${{ github.event.number }} diff --git a/.github/workflows/release-notify.yml b/.github/workflows/release-notify.yml index 8bd10e993c..8cbed725a4 100644 --- a/.github/workflows/release-notify.yml +++ b/.github/workflows/release-notify.yml @@ -22,7 +22,12 @@ jobs: runs-on: ubuntu-22.04 steps: - - uses: neondatabase/dev-actions/release-pr-notify@main + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: neondatabase/dev-actions/release-pr-notify@483a843f2a8bcfbdc4c69d27630528a3ddc4e14b # main with: slack-token: ${{ secrets.SLACK_BOT_TOKEN }} slack-channel-id: ${{ vars.SLACK_UPCOMING_RELEASE_CHANNEL_ID || 'C05QQ9J1BRC' }} # if not set, then `#test-release-notifications` diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index a88ddecd0a..4068eafb95 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -3,7 +3,7 @@ name: Create Release Branch on: schedule: # It should be kept in sync with if-condition in jobs - - cron: '0 6 * * THU' # Proxy release + - cron: '0 6 * * TUE' # Proxy release - cron: '0 6 * * FRI' # Storage release - cron: '0 7 * * FRI' # Compute release workflow_dispatch: @@ -43,7 +43,7 @@ jobs: ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} create-proxy-release-branch: - if: ${{ github.event.schedule == '0 6 * * THU' || inputs.create-proxy-release-branch }} + if: ${{ github.event.schedule == '0 6 * * TUE' || inputs.create-proxy-release-branch }} permissions: contents: write 
diff --git a/.github/workflows/report-workflow-stats-batch.yml b/.github/workflows/report-workflow-stats-batch.yml index 2ed044b780..6e5093ebd6 100644 --- a/.github/workflows/report-workflow-stats-batch.yml +++ b/.github/workflows/report-workflow-stats-batch.yml @@ -6,6 +6,9 @@ on: - cron: '25 0 * * *' - cron: '25 1 * * 6' +permissions: + contents: read + jobs: gh-workflow-stats-batch-2h: name: GitHub Workflow Stats Batch 2 hours @@ -14,8 +17,13 @@ jobs: permissions: actions: read steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - name: Export Workflow Run for the past 2 hours - uses: neondatabase/gh-workflow-stats-action@v0.2.1 + uses: neondatabase/gh-workflow-stats-action@701b1f202666d0b82e67b4d387e909af2b920127 # v0.2.2 with: db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} db_table: "gh_workflow_stats_neon" @@ -29,8 +37,13 @@ jobs: permissions: actions: read steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - name: Export Workflow Run for the past 48 hours - uses: neondatabase/gh-workflow-stats-action@v0.2.1 + uses: neondatabase/gh-workflow-stats-action@701b1f202666d0b82e67b4d387e909af2b920127 # v0.2.2 with: db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} db_table: "gh_workflow_stats_neon" @@ -44,8 +57,13 @@ jobs: permissions: actions: read steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - name: Export Workflow Run for the past 30 days - uses: neondatabase/gh-workflow-stats-action@v0.2.1 + uses: neondatabase/gh-workflow-stats-action@701b1f202666d0b82e67b4d387e909af2b920127 # v0.2.2 with: db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} db_table: 
"gh_workflow_stats_neon" diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index a30da35502..ca4c465931 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -9,6 +9,9 @@ on: github-event-name: type: string required: true + github-event-json: + type: string + required: true defaults: run: @@ -31,6 +34,11 @@ jobs: runs-on: ubuntu-22.04 steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@v2 + with: + egress-policy: audit + - name: Cancel previous e2e-tests runs for this PR env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} @@ -43,6 +51,7 @@ jobs: uses: ./.github/workflows/_meta.yml with: github-event-name: ${{ inputs.github-event-name || github.event_name }} + github-event-json: ${{ inputs.github-event-json || toJSON(github.event) }} trigger-e2e-tests: needs: [ meta ] @@ -63,6 +72,11 @@ jobs: || needs.meta.outputs.build-tag }} steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@v2 + with: + egress-policy: audit + - name: Wait for `push-{neon,compute}-image-dev` job to finish # It's important to have a timeout here, the script in the step can run infinitely timeout-minutes: 60 diff --git a/Cargo.lock b/Cargo.lock index a8e400524e..dbbf2c3357 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -148,9 +148,9 @@ dependencies = [ [[package]] name = "arc-swap" -version = "1.6.0" +version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bddcadddf5e9015d310179a59bb28c4d4b9920ad0f11e8e14dbadf654890c9a6" +checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457" [[package]] name = "archery" @@ -2809,6 +2809,7 @@ name = "http-utils" version = "0.1.0" dependencies = [ "anyhow", + "arc-swap", "bytes", "camino", "fail", @@ -2821,6 +2822,7 @@ dependencies = [ "pprof", "regex", "routerify", + "rustls 0.23.18", "rustls-pemfile 2.1.1", "serde", "serde_json", 
@@ -3859,11 +3861,10 @@ dependencies = [ [[package]] name = "num-bigint" -version = "0.4.3" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f93ab6289c7b344a8a9f60f88d80aa20032336fe78da341afc91c8a2341fc75f" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" dependencies = [ - "autocfg", "num-integer", "num-traits", ] @@ -3912,11 +3913,10 @@ dependencies = [ [[package]] name = "num-integer" -version = "0.1.45" +version = "0.1.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" dependencies = [ - "autocfg", "num-traits", ] @@ -3945,9 +3945,9 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.15" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", "libm", @@ -4329,6 +4329,7 @@ dependencies = [ "strum", "strum_macros", "thiserror 1.0.69", + "tracing-utils", "utils", ] @@ -5360,26 +5361,25 @@ dependencies = [ [[package]] name = "redis" -version = "0.25.2" +version = "0.29.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71d64e978fd98a0e6b105d066ba4889a7301fca65aeac850a877d8797343feeb" +checksum = "b110459d6e323b7cda23980c46c77157601199c9da6241552b284cd565a7a133" dependencies = [ - "async-trait", + "arc-swap", "bytes", "combine", "futures-util", "itoa", + "num-bigint", "percent-encoding", "pin-project-lite", - "rustls 0.22.4", - "rustls-native-certs 0.7.0", - "rustls-pemfile 2.1.1", - "rustls-pki-types", + "rustls 0.23.18", + "rustls-native-certs 0.8.0", "ryu", "sha1_smol", "socket2", "tokio", - "tokio-rustls 0.25.0", + "tokio-rustls 0.26.0", "tokio-util", 
"url", ] @@ -6605,6 +6605,7 @@ version = "0.1.0" dependencies = [ "anyhow", "bytes", + "camino", "chrono", "clap", "clashmap", @@ -6648,6 +6649,7 @@ dependencies = [ "tokio", "tokio-postgres", "tokio-postgres-rustls", + "tokio-rustls 0.26.0", "tokio-util", "tracing", "utils", @@ -7114,9 +7116,9 @@ dependencies = [ [[package]] name = "tokio" -version = "1.43.0" +version = "1.43.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d61fa4ffa3de412bfea335c6ecff681de2b609ba3c77ef3e00e521813a9ed9e" +checksum = "492a604e2fd7f814268a378409e6c92b5525d747d10db9a229723f55a417958c" dependencies = [ "backtrace", "bytes", @@ -7213,15 +7215,14 @@ dependencies = [ "bytes", "fallible-iterator", "futures-util", - "log", "parking_lot 0.12.1", - "phf", "pin-project-lite", "postgres-protocol2", "postgres-types2", "serde", "tokio", "tokio-util", + "tracing", ] [[package]] @@ -7603,6 +7604,7 @@ dependencies = [ "opentelemetry-otlp", "opentelemetry-semantic-conventions", "opentelemetry_sdk", + "pin-project-lite", "tokio", "tracing", "tracing-opentelemetry", diff --git a/Cargo.toml b/Cargo.toml index 9bbc5a1a38..1f605681db 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -50,7 +50,7 @@ license = "Apache-2.0" [workspace.dependencies] ahash = "0.8" anyhow = { version = "1.0", features = ["backtrace"] } -arc-swap = "1.6" +arc-swap = "1.7" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } atomic-take = "1.1.0" flate2 = "1.0.26" @@ -130,7 +130,7 @@ nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal" # on compute startup metrics (start_postgres_ms), >= 25% degradation. 
notify = "6.0.0" num_cpus = "1.15" -num-traits = "0.2.15" +num-traits = "0.2.19" once_cell = "1.13" opentelemetry = "0.27" opentelemetry_sdk = "0.27" @@ -146,7 +146,7 @@ procfs = "0.16" prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency prost = "0.13" rand = "0.8" -redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] } +redis = { version = "0.29.2", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] } reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_27"] } @@ -183,7 +183,7 @@ test-context = "0.3" thiserror = "1.0" tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] } tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] } -tokio = { version = "1.41", features = ["macros"] } +tokio = { version = "1.43.1", features = ["macros"] } tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } tokio-io-timeout = "1.2.0" tokio-postgres-rustls = "0.12.0" diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index c103ceaea5..7766991a0a 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -292,7 +292,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.85.0 +ENV RUSTC_VERSION=1.86.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index bdc73ab174..83cbacf034 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -369,7 +369,7 @@ FROM build-deps AS plv8-src ARG PG_VERSION WORKDIR /ext-src -COPY compute/patches/plv8-3.1.10.patch . +COPY compute/patches/plv8* . 
# plv8 3.2.3 supports v17 # last release v3.2.3 - Sep 7, 2024 @@ -393,7 +393,7 @@ RUN case "${PG_VERSION:?}" in \ git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \ tar -czf plv8.tar.gz --exclude .git plv8-src && \ cd plv8-src && \ - if [[ "${PG_VERSION:?}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi + if [[ "${PG_VERSION:?}" < "v17" ]]; then patch -p1 < /ext-src/plv8_v3.1.10.patch; else patch -p1 < /ext-src/plv8_v3.2.3.patch; fi # Step 1: Build the vendored V8 engine. It doesn't depend on PostgreSQL, so use # 'build-deps' as the base. This enables caching and avoids unnecessary rebuilds. @@ -1022,67 +1022,6 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control -######################################################################################### -# -# Layer "pg_embedding-build" -# compile pg_embedding extension -# -######################################################################################### -FROM build-deps AS pg_embedding-src -ARG PG_VERSION - -# This is our extension, support stopped in favor of pgvector -# TODO: deprecate it -WORKDIR /ext-src -RUN case "${PG_VERSION:?}" in \ - "v14" | "v15") \ - export PG_EMBEDDING_VERSION=0.3.5 \ - export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \ - ;; \ - *) \ - echo "pg_embedding not supported on this PostgreSQL version. Use pgvector instead." && exit 0;; \ - esac && \ - wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \ - echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \ - mkdir pg_embedding-src && cd pg_embedding-src && tar xzf ../pg_embedding.tar.gz --strip-components=1 -C . 
- -FROM pg-build AS pg_embedding-build -COPY --from=pg_embedding-src /ext-src/ /ext-src/ -WORKDIR /ext-src/ -RUN if [ -d pg_embedding-src ]; then \ - cd pg_embedding-src && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install; \ - fi - -######################################################################################### -# -# Layer "pg_anon-build" -# compile anon extension -# -######################################################################################### -FROM build-deps AS pg_anon-src -ARG PG_VERSION - -# This is an experimental extension, never got to real production. -# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found. -WORKDIR /ext-src -RUN case "${PG_VERSION:?}" in "v17") \ - echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \ - esac && \ - wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ - echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ - mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . 
- -FROM pg-build AS pg_anon-build -COPY --from=pg_anon-src /ext-src/ /ext-src/ -WORKDIR /ext-src -RUN if [ -d pg_anon-src ]; then \ - cd pg_anon-src && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control; \ - fi - ######################################################################################### # # Layer "pg build with nonroot user and cargo installed" @@ -1366,8 +1305,8 @@ ARG PG_VERSION # Do not update without approve from proxy team # Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs WORKDIR /ext-src -RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0.tar.gz -O pg_session_jwt.tar.gz && \ - echo "5ace028e591f2e000ca10afa5b1ca62203ebff014c2907c0ec3b29c36f28a1bb pg_session_jwt.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.3.0.tar.gz -O pg_session_jwt.tar.gz && \ + echo "19be2dc0b3834d643706ed430af998bb4c2cdf24b3c45e7b102bb3a550e8660c pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . 
&& \ sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ sed -i 's/version = "0.12.6"/version = "0.12.9"/g' pgrx-tests/Cargo.toml && \ @@ -1675,9 +1614,7 @@ COPY --from=rdkit-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_uuidv7-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_roaringbitmap-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_semver-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg_embedding-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=wal2json-build /usr/local/pgsql /usr/local/pgsql -COPY --from=pg_anon-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -1853,7 +1790,6 @@ COPY --from=pg_cron-src /ext-src/ /ext-src/ COPY --from=pg_uuidv7-src /ext-src/ /ext-src/ COPY --from=pg_roaringbitmap-src /ext-src/ /ext-src/ COPY --from=pg_semver-src /ext-src/ /ext-src/ -#COPY --from=pg_embedding-src /ext-src/ /ext-src/ #COPY --from=wal2json-src /ext-src/ /ext-src/ COPY --from=pg_ivm-src /ext-src/ /ext-src/ COPY --from=pg_partman-src /ext-src/ /ext-src/ @@ -1916,26 +1852,30 @@ RUN apt update && \ ;; \ esac && \ apt install --no-install-recommends -y \ + ca-certificates \ gdb \ - liblz4-1 \ - libreadline8 \ + iproute2 \ libboost-iostreams1.74.0 \ libboost-regex1.74.0 \ libboost-serialization1.74.0 \ libboost-system1.74.0 \ - libossp-uuid16 \ + libcurl4 \ + libevent-2.1-7 \ libgeos-c1v5 \ + liblz4-1 \ + libossp-uuid16 \ libprotobuf-c1 \ + libreadline8 \ libsfcgal1 \ libxml2 \ libxslt1.1 \ libzstd1 \ - libcurl4 \ - libevent-2.1-7 \ locales \ + lsof \ procps \ - ca-certificates \ rsyslog \ + screen \ + tcpdump \ $VERSION_INSTALLS && \ apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 diff --git 
a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet index da2b86d542..449e1199d0 100644 --- a/compute/etc/neon_collector.jsonnet +++ b/compute/etc/neon_collector.jsonnet @@ -33,6 +33,7 @@ import 'sql_exporter/lfc_hits.libsonnet', import 'sql_exporter/lfc_misses.libsonnet', import 'sql_exporter/lfc_used.libsonnet', + import 'sql_exporter/lfc_used_pages.libsonnet', import 'sql_exporter/lfc_writes.libsonnet', import 'sql_exporter/logical_slot_restart_lsn.libsonnet', import 'sql_exporter/max_cluster_size.libsonnet', diff --git a/compute/etc/sql_exporter/lfc_used_pages.libsonnet b/compute/etc/sql_exporter/lfc_used_pages.libsonnet new file mode 100644 index 0000000000..1e39a93482 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_used_pages.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_used_pages', + type: 'gauge', + help: 'LFC pages used', + key_labels: null, + values: [ + 'lfc_used_pages', + ], + query: importstr 'sql_exporter/lfc_used_pages.sql', +} diff --git a/compute/etc/sql_exporter/lfc_used_pages.sql b/compute/etc/sql_exporter/lfc_used_pages.sql new file mode 100644 index 0000000000..56d92f8514 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_used_pages.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_used_pages FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_used_pages'; diff --git a/compute/patches/cloud_regress_pg16.patch b/compute/patches/cloud_regress_pg16.patch index 3f0bb84ae7..ae415a5412 100644 --- a/compute/patches/cloud_regress_pg16.patch +++ b/compute/patches/cloud_regress_pg16.patch @@ -202,10 +202,10 @@ index cf0b80d616..e8e2a14a4a 100644 COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS 'no, the comment'; ERROR: must be owner of relation constraint_comments_tbl diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out -index 442e7aff2b..525f732b03 100644 +index d785f92561..16377e5ac9 100644 --- a/src/test/regress/expected/conversion.out +++ 
b/src/test/regress/expected/conversion.out -@@ -8,7 +8,7 @@ +@@ -15,7 +15,7 @@ SELECT FROM test_enc_setup(); CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, result OUT bytea) AS :'regresslib', 'test_enc_conversion' LANGUAGE C STRICT; @@ -587,16 +587,15 @@ index f551624afb..57f1e432d4 100644 SELECT * INTO TABLE ramp diff --git a/src/test/regress/expected/database.out b/src/test/regress/expected/database.out -index 454db91ec0..01378d7081 100644 +index 4cbdbdf84d..573362850e 100644 --- a/src/test/regress/expected/database.out +++ b/src/test/regress/expected/database.out -@@ -1,8 +1,7 @@ +@@ -1,8 +1,6 @@ CREATE DATABASE regression_tbd ENCODING utf8 LC_COLLATE "C" LC_CTYPE "C" TEMPLATE template0; ALTER DATABASE regression_tbd RENAME TO regression_utf8; -ALTER DATABASE regression_utf8 SET TABLESPACE regress_tblspace; -ALTER DATABASE regression_utf8 RESET TABLESPACE; -+WARNING: you need to manually restart any running background workers after this command ALTER DATABASE regression_utf8 CONNECTION_LIMIT 123; -- Test PgDatabaseToastTable. Doing this with GRANT would be slow. 
BEGIN; @@ -700,7 +699,7 @@ index 6ed50fdcfa..caa00a345d 100644 COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless'; CREATE FOREIGN DATA WRAPPER postgresql VALIDATOR postgresql_fdw_validator; diff --git a/src/test/regress/expected/foreign_key.out b/src/test/regress/expected/foreign_key.out -index 6b8c2f2414..8e13b7fa46 100644 +index 84745b9f60..4883c12351 100644 --- a/src/test/regress/expected/foreign_key.out +++ b/src/test/regress/expected/foreign_key.out @@ -1985,7 +1985,7 @@ ALTER TABLE fk_partitioned_fk_6 ATTACH PARTITION fk_partitioned_pk_6 FOR VALUES @@ -1112,7 +1111,7 @@ index 8475231735..0653946337 100644 DROP ROLE regress_passwd_sha_len1; DROP ROLE regress_passwd_sha_len2; diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out -index 5b9dba7b32..cc408dad42 100644 +index 620fbe8c52..0570102357 100644 --- a/src/test/regress/expected/privileges.out +++ b/src/test/regress/expected/privileges.out @@ -20,19 +20,19 @@ SELECT lo_unlink(oid) FROM pg_largeobject_metadata WHERE oid >= 1000 AND oid < 3 @@ -1174,8 +1173,8 @@ index 5b9dba7b32..cc408dad42 100644 +CREATE GROUP regress_priv_group2 WITH ADMIN regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER USER regress_priv_user2; ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4; GRANT regress_priv_group2 TO regress_priv_user2 GRANTED BY regress_priv_user1; - SET SESSION AUTHORIZATION regress_priv_user1; -@@ -239,12 +239,16 @@ GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY regre + SET SESSION AUTHORIZATION regress_priv_user3; +@@ -246,12 +246,16 @@ GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY regre ERROR: permission denied to grant privileges as role "regress_priv_role" DETAIL: The grantor must have the ADMIN option on role "regress_priv_role". 
GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY CURRENT_ROLE; @@ -1192,7 +1191,7 @@ index 5b9dba7b32..cc408dad42 100644 DROP ROLE regress_priv_role; SET SESSION AUTHORIZATION regress_priv_user1; SELECT session_user, current_user; -@@ -1776,7 +1780,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP +@@ -1783,7 +1787,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP -- security-restricted operations \c - @@ -1201,7 +1200,7 @@ index 5b9dba7b32..cc408dad42 100644 -- Check that index expressions and predicates are run as the table's owner -- A dummy index function checking current_user CREATE FUNCTION sro_ifun(int) RETURNS int AS $$ -@@ -2668,8 +2672,8 @@ drop cascades to function testns.priv_testagg(integer) +@@ -2675,8 +2679,8 @@ drop cascades to function testns.priv_testagg(integer) drop cascades to function testns.priv_testproc(integer) -- Change owner of the schema & and rename of new schema owner \c - @@ -1212,7 +1211,7 @@ index 5b9dba7b32..cc408dad42 100644 SET SESSION ROLE regress_schemauser1; CREATE SCHEMA testns; SELECT nspname, rolname FROM pg_namespace, pg_roles WHERE pg_namespace.nspname = 'testns' AND pg_namespace.nspowner = pg_roles.oid; -@@ -2792,7 +2796,7 @@ DROP USER regress_priv_user7; +@@ -2799,7 +2803,7 @@ DROP USER regress_priv_user7; DROP USER regress_priv_user8; -- does not exist ERROR: role "regress_priv_user8" does not exist -- permissions with LOCK TABLE @@ -1221,7 +1220,7 @@ index 5b9dba7b32..cc408dad42 100644 CREATE TABLE lock_table (a int); -- LOCK TABLE and SELECT permission GRANT SELECT ON lock_table TO regress_locktable_user; -@@ -2874,7 +2878,7 @@ DROP USER regress_locktable_user; +@@ -2881,7 +2885,7 @@ DROP USER regress_locktable_user; -- pg_backend_memory_contexts. 
-- switch to superuser \c - @@ -1230,7 +1229,7 @@ index 5b9dba7b32..cc408dad42 100644 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no has_table_privilege --------------------- -@@ -2918,10 +2922,10 @@ RESET ROLE; +@@ -2925,10 +2929,10 @@ RESET ROLE; -- clean up DROP ROLE regress_readallstats; -- test role grantor machinery @@ -1245,7 +1244,7 @@ index 5b9dba7b32..cc408dad42 100644 GRANT regress_group TO regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE; GRANT regress_group_direct_manager TO regress_group_indirect_manager; SET SESSION AUTHORIZATION regress_group_direct_manager; -@@ -2950,9 +2954,9 @@ DROP ROLE regress_group_direct_manager; +@@ -2957,9 +2961,9 @@ DROP ROLE regress_group_direct_manager; DROP ROLE regress_group_indirect_manager; DROP ROLE regress_group_member; -- test SET and INHERIT options with object ownership changes @@ -1841,7 +1840,7 @@ index 09a255649b..15895f0c53 100644 CREATE TABLE ruletest_t2 (x int); CREATE VIEW ruletest_v1 WITH (security_invoker=true) AS diff --git a/src/test/regress/expected/security_label.out b/src/test/regress/expected/security_label.out -index a8e01a6220..5a9cef4ede 100644 +index a8e01a6220..83543b250a 100644 --- a/src/test/regress/expected/security_label.out +++ b/src/test/regress/expected/security_label.out @@ -6,8 +6,8 @@ SET client_min_messages TO 'warning'; @@ -1855,34 +1854,6 @@ index a8e01a6220..5a9cef4ede 100644 CREATE TABLE seclabel_tbl1 (a int, b text); CREATE TABLE seclabel_tbl2 (x int, y text); CREATE VIEW seclabel_view1 AS SELECT * FROM seclabel_tbl2; -@@ -19,21 +19,21 @@ ALTER TABLE seclabel_tbl2 OWNER TO regress_seclabel_user2; - -- Test of SECURITY LABEL statement without a plugin - -- - SECURITY LABEL ON TABLE seclabel_tbl1 IS 'classified'; -- fail --ERROR: no security label providers have been loaded -+ERROR: must specify provider when multiple security label providers have been loaded - SECURITY LABEL FOR 'dummy' ON TABLE seclabel_tbl1 IS 
'classified'; -- fail - ERROR: security label provider "dummy" is not loaded - SECURITY LABEL ON TABLE seclabel_tbl1 IS '...invalid label...'; -- fail --ERROR: no security label providers have been loaded -+ERROR: must specify provider when multiple security label providers have been loaded - SECURITY LABEL ON TABLE seclabel_tbl3 IS 'unclassified'; -- fail --ERROR: no security label providers have been loaded -+ERROR: must specify provider when multiple security label providers have been loaded - SECURITY LABEL ON ROLE regress_seclabel_user1 IS 'classified'; -- fail --ERROR: no security label providers have been loaded -+ERROR: must specify provider when multiple security label providers have been loaded - SECURITY LABEL FOR 'dummy' ON ROLE regress_seclabel_user1 IS 'classified'; -- fail - ERROR: security label provider "dummy" is not loaded - SECURITY LABEL ON ROLE regress_seclabel_user1 IS '...invalid label...'; -- fail --ERROR: no security label providers have been loaded -+ERROR: must specify provider when multiple security label providers have been loaded - SECURITY LABEL ON ROLE regress_seclabel_user3 IS 'unclassified'; -- fail --ERROR: no security label providers have been loaded -+ERROR: must specify provider when multiple security label providers have been loaded - -- clean up objects - DROP FUNCTION seclabel_four(); - DROP DOMAIN seclabel_domain; diff --git a/src/test/regress/expected/select_into.out b/src/test/regress/expected/select_into.out index b79fe9a1c0..e29fab88ab 100644 --- a/src/test/regress/expected/select_into.out @@ -2413,10 +2384,10 @@ index e3e3bea709..fa86ddc326 100644 COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS 'no, the comment'; COMMENT ON CONSTRAINT the_constraint ON DOMAIN constraint_comments_dom IS 'no, another comment'; diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql -index 9a65fca91f..58431a3056 100644 +index b567a1a572..4d1ac2e631 100644 --- 
a/src/test/regress/sql/conversion.sql +++ b/src/test/regress/sql/conversion.sql -@@ -12,7 +12,7 @@ CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, r +@@ -17,7 +17,7 @@ CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, r AS :'regresslib', 'test_enc_conversion' LANGUAGE C STRICT; @@ -2780,7 +2751,7 @@ index ae6841308b..47bc792e30 100644 SELECT * diff --git a/src/test/regress/sql/database.sql b/src/test/regress/sql/database.sql -index 0367c0e37a..a23b98c4bd 100644 +index 46ad263478..eb05584ed5 100644 --- a/src/test/regress/sql/database.sql +++ b/src/test/regress/sql/database.sql @@ -1,8 +1,6 @@ @@ -2893,7 +2864,7 @@ index aa147b14a9..370e0dd570 100644 CREATE FOREIGN DATA WRAPPER dummy; COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless'; diff --git a/src/test/regress/sql/foreign_key.sql b/src/test/regress/sql/foreign_key.sql -index 45c7a534cb..32dd26b8cd 100644 +index 9f4210b26e..620d3fc87e 100644 --- a/src/test/regress/sql/foreign_key.sql +++ b/src/test/regress/sql/foreign_key.sql @@ -1435,7 +1435,7 @@ ALTER TABLE fk_partitioned_fk_6 ATTACH PARTITION fk_partitioned_pk_6 FOR VALUES @@ -3246,7 +3217,7 @@ index 53e86b0b6c..0303fdfe96 100644 -- Check that the invalid secrets were re-hashed. A re-hashed secret -- should not contain the original salt. 
diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql -index 249df17a58..b258e7f26a 100644 +index 259f1aedd1..6e1a3d17b7 100644 --- a/src/test/regress/sql/privileges.sql +++ b/src/test/regress/sql/privileges.sql @@ -24,18 +24,18 @@ RESET client_min_messages; @@ -3308,7 +3279,7 @@ index 249df17a58..b258e7f26a 100644 ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4; -@@ -1157,7 +1157,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP +@@ -1160,7 +1160,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP -- security-restricted operations \c - @@ -3317,7 +3288,7 @@ index 249df17a58..b258e7f26a 100644 -- Check that index expressions and predicates are run as the table's owner -@@ -1653,8 +1653,8 @@ DROP SCHEMA testns CASCADE; +@@ -1656,8 +1656,8 @@ DROP SCHEMA testns CASCADE; -- Change owner of the schema & and rename of new schema owner \c - @@ -3328,7 +3299,7 @@ index 249df17a58..b258e7f26a 100644 SET SESSION ROLE regress_schemauser1; CREATE SCHEMA testns; -@@ -1748,7 +1748,7 @@ DROP USER regress_priv_user8; -- does not exist +@@ -1751,7 +1751,7 @@ DROP USER regress_priv_user8; -- does not exist -- permissions with LOCK TABLE @@ -3337,7 +3308,7 @@ index 249df17a58..b258e7f26a 100644 CREATE TABLE lock_table (a int); -- LOCK TABLE and SELECT permission -@@ -1836,7 +1836,7 @@ DROP USER regress_locktable_user; +@@ -1839,7 +1839,7 @@ DROP USER regress_locktable_user; -- switch to superuser \c - @@ -3346,7 +3317,7 @@ index 249df17a58..b258e7f26a 100644 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no -@@ -1856,10 +1856,10 @@ RESET ROLE; +@@ -1859,10 +1859,10 @@ RESET ROLE; DROP ROLE regress_readallstats; -- test role grantor machinery @@ -3361,7 +3332,7 @@ index 249df17a58..b258e7f26a 100644 GRANT regress_group TO 
regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE; GRANT regress_group_direct_manager TO regress_group_indirect_manager; -@@ -1881,9 +1881,9 @@ DROP ROLE regress_group_indirect_manager; +@@ -1884,9 +1884,9 @@ DROP ROLE regress_group_indirect_manager; DROP ROLE regress_group_member; -- test SET and INHERIT options with object ownership changes diff --git a/compute/patches/cloud_regress_pg17.patch b/compute/patches/cloud_regress_pg17.patch index e57447a2c6..4f10f8563a 100644 --- a/compute/patches/cloud_regress_pg17.patch +++ b/compute/patches/cloud_regress_pg17.patch @@ -202,10 +202,10 @@ index cf0b80d616..e8e2a14a4a 100644 COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS 'no, the comment'; ERROR: must be owner of relation constraint_comments_tbl diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out -index 442e7aff2b..525f732b03 100644 +index d785f92561..16377e5ac9 100644 --- a/src/test/regress/expected/conversion.out +++ b/src/test/regress/expected/conversion.out -@@ -8,7 +8,7 @@ +@@ -15,7 +15,7 @@ SELECT FROM test_enc_setup(); CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, result OUT bytea) AS :'regresslib', 'test_enc_conversion' LANGUAGE C STRICT; @@ -587,16 +587,15 @@ index f551624afb..57f1e432d4 100644 SELECT * INTO TABLE ramp diff --git a/src/test/regress/expected/database.out b/src/test/regress/expected/database.out -index 454db91ec0..01378d7081 100644 +index 4cbdbdf84d..573362850e 100644 --- a/src/test/regress/expected/database.out +++ b/src/test/regress/expected/database.out -@@ -1,8 +1,7 @@ +@@ -1,8 +1,6 @@ CREATE DATABASE regression_tbd ENCODING utf8 LC_COLLATE "C" LC_CTYPE "C" TEMPLATE template0; ALTER DATABASE regression_tbd RENAME TO regression_utf8; -ALTER DATABASE regression_utf8 SET TABLESPACE regress_tblspace; -ALTER DATABASE regression_utf8 RESET TABLESPACE; -+WARNING: you need to manually restart any running background workers after this command 
ALTER DATABASE regression_utf8 CONNECTION_LIMIT 123; -- Test PgDatabaseToastTable. Doing this with GRANT would be slow. BEGIN; @@ -700,7 +699,7 @@ index 6ed50fdcfa..caa00a345d 100644 COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless'; CREATE FOREIGN DATA WRAPPER postgresql VALIDATOR postgresql_fdw_validator; diff --git a/src/test/regress/expected/foreign_key.out b/src/test/regress/expected/foreign_key.out -index 69994c98e3..129abcfbe8 100644 +index fe6a1015f2..614b387b7d 100644 --- a/src/test/regress/expected/foreign_key.out +++ b/src/test/regress/expected/foreign_key.out @@ -1985,7 +1985,7 @@ ALTER TABLE fk_partitioned_fk_6 ATTACH PARTITION fk_partitioned_pk_6 FOR VALUES @@ -1147,7 +1146,7 @@ index 924d6e001d..7fdda73439 100644 DROP ROLE regress_passwd_sha_len1; DROP ROLE regress_passwd_sha_len2; diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out -index 1296da0d57..f43fffa44c 100644 +index e8c668e0a1..03be5c2120 100644 --- a/src/test/regress/expected/privileges.out +++ b/src/test/regress/expected/privileges.out @@ -20,19 +20,19 @@ SELECT lo_unlink(oid) FROM pg_largeobject_metadata WHERE oid >= 1000 AND oid < 3 @@ -1209,8 +1208,8 @@ index 1296da0d57..f43fffa44c 100644 +CREATE GROUP regress_priv_group2 WITH ADMIN regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER USER regress_priv_user2; ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4; GRANT regress_priv_group2 TO regress_priv_user2 GRANTED BY regress_priv_user1; - SET SESSION AUTHORIZATION regress_priv_user1; -@@ -239,12 +239,16 @@ GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY regre + SET SESSION AUTHORIZATION regress_priv_user3; +@@ -246,12 +246,16 @@ GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY regre ERROR: permission denied to grant privileges as role "regress_priv_role" DETAIL: The grantor must have the ADMIN option on role "regress_priv_role". 
GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY CURRENT_ROLE; @@ -1227,7 +1226,7 @@ index 1296da0d57..f43fffa44c 100644 DROP ROLE regress_priv_role; SET SESSION AUTHORIZATION regress_priv_user1; SELECT session_user, current_user; -@@ -1776,7 +1780,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP +@@ -1783,7 +1787,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP -- security-restricted operations \c - @@ -1236,7 +1235,7 @@ index 1296da0d57..f43fffa44c 100644 -- Check that index expressions and predicates are run as the table's owner -- A dummy index function checking current_user CREATE FUNCTION sro_ifun(int) RETURNS int AS $$ -@@ -2668,8 +2672,8 @@ drop cascades to function testns.priv_testagg(integer) +@@ -2675,8 +2679,8 @@ drop cascades to function testns.priv_testagg(integer) drop cascades to function testns.priv_testproc(integer) -- Change owner of the schema & and rename of new schema owner \c - @@ -1247,7 +1246,7 @@ index 1296da0d57..f43fffa44c 100644 SET SESSION ROLE regress_schemauser1; CREATE SCHEMA testns; SELECT nspname, rolname FROM pg_namespace, pg_roles WHERE pg_namespace.nspname = 'testns' AND pg_namespace.nspowner = pg_roles.oid; -@@ -2792,7 +2796,7 @@ DROP USER regress_priv_user7; +@@ -2799,7 +2803,7 @@ DROP USER regress_priv_user7; DROP USER regress_priv_user8; -- does not exist ERROR: role "regress_priv_user8" does not exist -- permissions with LOCK TABLE @@ -1256,7 +1255,7 @@ index 1296da0d57..f43fffa44c 100644 CREATE TABLE lock_table (a int); -- LOCK TABLE and SELECT permission GRANT SELECT ON lock_table TO regress_locktable_user; -@@ -2888,7 +2892,7 @@ DROP USER regress_locktable_user; +@@ -2895,7 +2899,7 @@ DROP USER regress_locktable_user; -- pg_backend_memory_contexts. 
-- switch to superuser \c - @@ -1265,7 +1264,7 @@ index 1296da0d57..f43fffa44c 100644 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no has_table_privilege --------------------- -@@ -2932,10 +2936,10 @@ RESET ROLE; +@@ -2939,10 +2943,10 @@ RESET ROLE; -- clean up DROP ROLE regress_readallstats; -- test role grantor machinery @@ -1280,7 +1279,7 @@ index 1296da0d57..f43fffa44c 100644 GRANT regress_group TO regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE; GRANT regress_group_direct_manager TO regress_group_indirect_manager; SET SESSION AUTHORIZATION regress_group_direct_manager; -@@ -2964,9 +2968,9 @@ DROP ROLE regress_group_direct_manager; +@@ -2971,9 +2975,9 @@ DROP ROLE regress_group_direct_manager; DROP ROLE regress_group_indirect_manager; DROP ROLE regress_group_member; -- test SET and INHERIT options with object ownership changes @@ -1293,7 +1292,7 @@ index 1296da0d57..f43fffa44c 100644 CREATE SCHEMA regress_roleoption; GRANT CREATE, USAGE ON SCHEMA regress_roleoption TO PUBLIC; GRANT regress_roleoption_donor TO regress_roleoption_protagonist WITH INHERIT TRUE, SET FALSE; -@@ -2995,9 +2999,9 @@ DROP ROLE regress_roleoption_protagonist; +@@ -3002,9 +3006,9 @@ DROP ROLE regress_roleoption_protagonist; DROP ROLE regress_roleoption_donor; DROP ROLE regress_roleoption_recipient; -- MAINTAIN @@ -2433,10 +2432,10 @@ index e3e3bea709..fa86ddc326 100644 COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS 'no, the comment'; COMMENT ON CONSTRAINT the_constraint ON DOMAIN constraint_comments_dom IS 'no, another comment'; diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql -index 9a65fca91f..58431a3056 100644 +index b567a1a572..4d1ac2e631 100644 --- a/src/test/regress/sql/conversion.sql +++ b/src/test/regress/sql/conversion.sql -@@ -12,7 +12,7 @@ CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, r +@@ -17,7 +17,7 @@ CREATE FUNCTION 
test_enc_conversion(bytea, name, name, bool, validlen OUT int, r AS :'regresslib', 'test_enc_conversion' LANGUAGE C STRICT; @@ -2800,7 +2799,7 @@ index ae6841308b..47bc792e30 100644 SELECT * diff --git a/src/test/regress/sql/database.sql b/src/test/regress/sql/database.sql -index 0367c0e37a..a23b98c4bd 100644 +index 46ad263478..eb05584ed5 100644 --- a/src/test/regress/sql/database.sql +++ b/src/test/regress/sql/database.sql @@ -1,8 +1,6 @@ @@ -2913,7 +2912,7 @@ index aa147b14a9..370e0dd570 100644 CREATE FOREIGN DATA WRAPPER dummy; COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless'; diff --git a/src/test/regress/sql/foreign_key.sql b/src/test/regress/sql/foreign_key.sql -index 2e710e419c..89cd481a54 100644 +index 8c4e4c7c83..e946cd2119 100644 --- a/src/test/regress/sql/foreign_key.sql +++ b/src/test/regress/sql/foreign_key.sql @@ -1435,7 +1435,7 @@ ALTER TABLE fk_partitioned_fk_6 ATTACH PARTITION fk_partitioned_pk_6 FOR VALUES @@ -3301,7 +3300,7 @@ index bb82aa4aa2..dd8a05e24d 100644 -- Check that the invalid secrets were re-hashed. A re-hashed secret -- should not contain the original salt. 
diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql -index 5880bc018d..27aa952b18 100644 +index b7e1cb6cdd..6e5a2217f1 100644 --- a/src/test/regress/sql/privileges.sql +++ b/src/test/regress/sql/privileges.sql @@ -24,18 +24,18 @@ RESET client_min_messages; @@ -3363,7 +3362,7 @@ index 5880bc018d..27aa952b18 100644 ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4; -@@ -1157,7 +1157,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP +@@ -1160,7 +1160,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP -- security-restricted operations \c - @@ -3372,7 +3371,7 @@ index 5880bc018d..27aa952b18 100644 -- Check that index expressions and predicates are run as the table's owner -@@ -1653,8 +1653,8 @@ DROP SCHEMA testns CASCADE; +@@ -1656,8 +1656,8 @@ DROP SCHEMA testns CASCADE; -- Change owner of the schema & and rename of new schema owner \c - @@ -3383,7 +3382,7 @@ index 5880bc018d..27aa952b18 100644 SET SESSION ROLE regress_schemauser1; CREATE SCHEMA testns; -@@ -1748,7 +1748,7 @@ DROP USER regress_priv_user8; -- does not exist +@@ -1751,7 +1751,7 @@ DROP USER regress_priv_user8; -- does not exist -- permissions with LOCK TABLE @@ -3392,7 +3391,7 @@ index 5880bc018d..27aa952b18 100644 CREATE TABLE lock_table (a int); -- LOCK TABLE and SELECT permission -@@ -1851,7 +1851,7 @@ DROP USER regress_locktable_user; +@@ -1854,7 +1854,7 @@ DROP USER regress_locktable_user; -- switch to superuser \c - @@ -3401,7 +3400,7 @@ index 5880bc018d..27aa952b18 100644 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no -@@ -1871,10 +1871,10 @@ RESET ROLE; +@@ -1874,10 +1874,10 @@ RESET ROLE; DROP ROLE regress_readallstats; -- test role grantor machinery @@ -3416,7 +3415,7 @@ index 5880bc018d..27aa952b18 100644 GRANT regress_group TO 
regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE; GRANT regress_group_direct_manager TO regress_group_indirect_manager; -@@ -1896,9 +1896,9 @@ DROP ROLE regress_group_indirect_manager; +@@ -1899,9 +1899,9 @@ DROP ROLE regress_group_indirect_manager; DROP ROLE regress_group_member; -- test SET and INHERIT options with object ownership changes @@ -3429,7 +3428,7 @@ index 5880bc018d..27aa952b18 100644 CREATE SCHEMA regress_roleoption; GRANT CREATE, USAGE ON SCHEMA regress_roleoption TO PUBLIC; GRANT regress_roleoption_donor TO regress_roleoption_protagonist WITH INHERIT TRUE, SET FALSE; -@@ -1926,9 +1926,9 @@ DROP ROLE regress_roleoption_donor; +@@ -1929,9 +1929,9 @@ DROP ROLE regress_roleoption_donor; DROP ROLE regress_roleoption_recipient; -- MAINTAIN diff --git a/compute/patches/pg_hint_plan_v16.patch b/compute/patches/pg_hint_plan_v16.patch index 1fc3ffa609..e9df2a3446 100644 --- a/compute/patches/pg_hint_plan_v16.patch +++ b/compute/patches/pg_hint_plan_v16.patch @@ -2,23 +2,6 @@ diff --git a/expected/ut-A.out b/expected/ut-A.out index da723b8..5328114 100644 --- a/expected/ut-A.out +++ b/expected/ut-A.out -@@ -9,13 +9,16 @@ SET search_path TO public; - ---- - -- No.A-1-1-3 - CREATE EXTENSION pg_hint_plan; -+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan - -- No.A-1-2-3 - DROP EXTENSION pg_hint_plan; - -- No.A-1-1-4 - CREATE SCHEMA other_schema; - CREATE EXTENSION pg_hint_plan SCHEMA other_schema; -+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan - ERROR: extension "pg_hint_plan" must be installed in schema "hint_plan" - CREATE EXTENSION pg_hint_plan; -+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan - DROP SCHEMA other_schema; - ---- - ---- No. 
A-5-1 comment pattern @@ -3175,6 +3178,7 @@ SELECT s.query, s.calls FROM public.pg_stat_statements s JOIN pg_catalog.pg_database d @@ -27,18 +10,6 @@ index da723b8..5328114 100644 ORDER BY 1; query | calls --------------------------------------+------- -diff --git a/expected/ut-fdw.out b/expected/ut-fdw.out -index d372459..6282afe 100644 ---- a/expected/ut-fdw.out -+++ b/expected/ut-fdw.out -@@ -7,6 +7,7 @@ SET pg_hint_plan.debug_print TO on; - SET client_min_messages TO LOG; - SET pg_hint_plan.enable_hint TO on; - CREATE EXTENSION file_fdw; -+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/file_fdw - CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw; - CREATE USER MAPPING FOR PUBLIC SERVER file_server; - CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename'); diff --git a/sql/ut-A.sql b/sql/ut-A.sql index 7c7d58a..4fd1a07 100644 --- a/sql/ut-A.sql diff --git a/compute/patches/pg_hint_plan_v17.patch b/compute/patches/pg_hint_plan_v17.patch index 3442a094eb..a244452cfe 100644 --- a/compute/patches/pg_hint_plan_v17.patch +++ b/compute/patches/pg_hint_plan_v17.patch @@ -1,24 +1,3 @@ -diff --git a/expected/ut-A.out b/expected/ut-A.out -index e7d68a1..65a056c 100644 ---- a/expected/ut-A.out -+++ b/expected/ut-A.out -@@ -9,13 +9,16 @@ SET search_path TO public; - ---- - -- No.A-1-1-3 - CREATE EXTENSION pg_hint_plan; -+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan - -- No.A-1-2-3 - DROP EXTENSION pg_hint_plan; - -- No.A-1-1-4 - CREATE SCHEMA other_schema; - CREATE EXTENSION pg_hint_plan SCHEMA other_schema; -+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan - ERROR: extension "pg_hint_plan" must be installed in schema "hint_plan" - CREATE EXTENSION pg_hint_plan; -+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan - DROP SCHEMA other_schema; - ---- - ---- No. 
A-5-1 comment pattern diff --git a/expected/ut-J.out b/expected/ut-J.out index 2fa3c70..314e929 100644 --- a/expected/ut-J.out @@ -160,15 +139,3 @@ index a09bd34..0ad227c 100644 error hint: explain_filter -diff --git a/expected/ut-fdw.out b/expected/ut-fdw.out -index 017fa4b..98d989b 100644 ---- a/expected/ut-fdw.out -+++ b/expected/ut-fdw.out -@@ -7,6 +7,7 @@ SET pg_hint_plan.debug_print TO on; - SET client_min_messages TO LOG; - SET pg_hint_plan.enable_hint TO on; - CREATE EXTENSION file_fdw; -+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/file_fdw - CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw; - CREATE USER MAPPING FOR PUBLIC SERVER file_server; - CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename'); diff --git a/compute/patches/pgvector.patch b/compute/patches/pgvector.patch index da41c86140..6fe3d073ed 100644 --- a/compute/patches/pgvector.patch +++ b/compute/patches/pgvector.patch @@ -15,7 +15,7 @@ index 7a4b88c..56678af 100644 HEADERS = src/halfvec.h src/sparsevec.h src/vector.h diff --git a/src/hnswbuild.c b/src/hnswbuild.c -index b667478..fc1897c 100644 +index b667478..dc95d89 100644 --- a/src/hnswbuild.c +++ b/src/hnswbuild.c @@ -843,9 +843,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc) @@ -36,7 +36,7 @@ index b667478..fc1897c 100644 /* Close relations within worker */ index_close(indexRel, indexLockmode); table_close(heapRel, heapLockmode); -@@ -1100,12 +1108,38 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, +@@ -1100,12 +1108,39 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, SeedRandom(42); #endif @@ -62,10 +62,11 @@ index b667478..fc1897c 100644 +#else + RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node; +#endif -+ -+ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator, -+ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); -+ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, 
MAIN_FORKNUM); ++ if (set_lwlsn_block_range_hook) ++ set_lwlsn_block_range_hook(XactLastRecEnd, rlocator, ++ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); ++ if (set_lwlsn_relation_hook) ++ set_lwlsn_relation_hook(XactLastRecEnd, rlocator, MAIN_FORKNUM); + } +#endif + } diff --git a/compute/patches/plv8-3.1.10.patch b/compute/patches/plv8_v3.1.10.patch similarity index 80% rename from compute/patches/plv8-3.1.10.patch rename to compute/patches/plv8_v3.1.10.patch index 43cdb479f7..5cf96426d0 100644 --- a/compute/patches/plv8-3.1.10.patch +++ b/compute/patches/plv8_v3.1.10.patch @@ -1,12 +1,6 @@ -commit 46b38d3e46f9cd6c70d9b189dd6ff4abaa17cf5e -Author: Alexander Bayandin -Date: Sat Nov 30 18:29:32 2024 +0000 - - Fix v8 9.7.37 compilation on Debian 12 - diff --git a/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch b/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch new file mode 100644 -index 0000000..f0a5dc7 +index 0000000..fae1cb3 --- /dev/null +++ b/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch @@ -0,0 +1,30 @@ @@ -35,8 +29,21 @@ index 0000000..f0a5dc7 +@@ -5,6 +5,7 @@ + #ifndef V8_HEAP_CPPGC_PREFINALIZER_HANDLER_H_ + #define V8_HEAP_CPPGC_PREFINALIZER_HANDLER_H_ -+ ++ ++#include + #include -+ ++ + #include "include/cppgc/prefinalizer.h" +diff --git a/plv8.cc b/plv8.cc +index c1ce883..6e47e94 100644 +--- a/plv8.cc ++++ b/plv8.cc +@@ -379,7 +379,7 @@ _PG_init(void) + NULL, + &plv8_v8_flags, + NULL, +- PGC_USERSET, 0, ++ PGC_SUSET, 0, + #if PG_VERSION_NUM >= 90100 + NULL, + #endif diff --git a/compute/patches/plv8_v3.2.3.patch b/compute/patches/plv8_v3.2.3.patch new file mode 100644 index 0000000000..5cf4ae2fa2 --- /dev/null +++ b/compute/patches/plv8_v3.2.3.patch @@ -0,0 +1,13 @@ +diff --git a/plv8.cc b/plv8.cc +index edfa2aa..623e7f2 100644 +--- a/plv8.cc ++++ b/plv8.cc +@@ -385,7 +385,7 @@ _PG_init(void) + NULL, + &plv8_v8_flags, + NULL, +- PGC_USERSET, 0, ++ PGC_SUSET, 0, + #if PG_VERSION_NUM >= 90100 + NULL, + #endif diff 
--git a/compute/patches/rum.patch b/compute/patches/rum.patch index 3041f8df81..5bc5d739b3 100644 --- a/compute/patches/rum.patch +++ b/compute/patches/rum.patch @@ -1,11 +1,5 @@ -commit 68f3b3b0d594f08aacc4a082ee210749ed5677eb -Author: Anastasia Lubennikova -Date: Mon Jul 15 12:31:56 2024 +0100 - - Neon: fix unlogged index build patch - diff --git a/src/ruminsert.c b/src/ruminsert.c -index e8b209d..e89bf2a 100644 +index 255e616..7a2240f 100644 --- a/src/ruminsert.c +++ b/src/ruminsert.c @@ -628,6 +628,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) @@ -30,7 +24,7 @@ index e8b209d..e89bf2a 100644 /* * Write index to xlog */ -@@ -713,6 +721,21 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) +@@ -713,6 +721,22 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) UnlockReleaseBuffer(buffer); } @@ -41,9 +35,10 @@ index e8b209d..e89bf2a 100644 +#else + RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node; +#endif -+ -+ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); -+ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM); ++ if (set_lwlsn_block_range_hook) ++ set_lwlsn_block_range_hook(XactLastRecEnd, rlocator, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); ++ if (set_lwlsn_relation_hook) ++ set_lwlsn_relation_hook(XactLastRecEnd, rlocator, MAIN_FORKNUM); + + smgr_end_unlogged_build(index->rd_smgr); + } diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index fc7a3e2827..da11ac2860 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -45,7 +45,9 @@ use anyhow::{Context, Result}; use clap::Parser; use compute_api::responses::ComputeCtlConfig; use compute_api::spec::ComputeSpec; -use compute_tools::compute::{ComputeNode, ComputeNodeParams, forward_termination_signal}; +use compute_tools::compute::{ + BUILD_TAG, ComputeNode, 
ComputeNodeParams, forward_termination_signal, +}; use compute_tools::extension_server::get_pg_version_string; use compute_tools::logger::*; use compute_tools::params::*; @@ -57,10 +59,6 @@ use tracing::{error, info}; use url::Url; use utils::failpoint_support; -// this is an arbitrary build tag. Fine as a default / for testing purposes -// in-case of not-set environment var -const BUILD_TAG_DEFAULT: &str = "latest"; - // Compatibility hack: if the control plane specified any remote-ext-config // use the default value for extension storage proxy gateway. // Remove this once the control plane is updated to pass the gateway URL @@ -147,7 +145,7 @@ fn main() -> Result<()> { .build()?; let _rt_guard = runtime.enter(); - let build_tag = runtime.block_on(init())?; + runtime.block_on(init())?; // enable core dumping for all child processes setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?; @@ -174,8 +172,6 @@ fn main() -> Result<()> { cgroup: cli.cgroup, #[cfg(target_os = "linux")] vm_monitor_addr: cli.vm_monitor_addr, - build_tag, - live_config_allowed: cli_spec.live_config_allowed, }, cli_spec.spec, @@ -189,7 +185,7 @@ fn main() -> Result<()> { deinit_and_exit(exit_code); } -async fn init() -> Result { +async fn init() -> Result<()> { init_tracing_and_logging(DEFAULT_LOG_LEVEL).await?; let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; @@ -199,12 +195,9 @@ async fn init() -> Result { } }); - let build_tag = option_env!("BUILD_TAG") - .unwrap_or(BUILD_TAG_DEFAULT) - .to_string(); - info!("build_tag: {build_tag}"); + info!("compute build_tag: {}", &BUILD_TAG.to_string()); - Ok(build_tag) + Ok(()) } fn try_spec_from_cli(cli: &Cli) -> Result { diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 47558be7a0..537028cde1 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -31,6 +31,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use clap::{Parser, Subcommand}; use 
compute_tools::extension_server::{PostgresMajorVersion, get_pg_version}; use nix::unistd::Pid; +use std::ops::Not; use tracing::{Instrument, error, info, info_span, warn}; use utils::fs_ext::is_directory_empty; @@ -44,7 +45,7 @@ mod s3_uri; const PG_WAIT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(600); const PG_WAIT_RETRY_INTERVAL: std::time::Duration = std::time::Duration::from_millis(300); -#[derive(Subcommand, Debug)] +#[derive(Subcommand, Debug, Clone, serde::Serialize)] enum Command { /// Runs local postgres (neon binary), restores into it, /// uploads pgdata to s3 to be consumed by pageservers @@ -84,6 +85,15 @@ enum Command { }, } +impl Command { + fn as_str(&self) -> &'static str { + match self { + Command::Pgdata { .. } => "pgdata", + Command::DumpRestore { .. } => "dump-restore", + } + } +} + #[derive(clap::Parser)] struct Args { #[clap(long, env = "NEON_IMPORTER_WORKDIR")] @@ -437,7 +447,7 @@ async fn run_dump_restore( #[allow(clippy::too_many_arguments)] async fn cmd_pgdata( - s3_client: Option, + s3_client: Option<&aws_sdk_s3::Client>, kms_client: Option, maybe_s3_prefix: Option, maybe_spec: Option, @@ -506,14 +516,14 @@ async fn cmd_pgdata( if let Some(s3_prefix) = maybe_s3_prefix { info!("upload pgdata"); aws_s3_sync::upload_dir_recursive( - s3_client.as_ref().unwrap(), + s3_client.unwrap(), Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/"), ) .await .context("sync dump directory to destination")?; - info!("write status"); + info!("write pgdata status to s3"); { let status_dir = workdir.join("status"); std::fs::create_dir(&status_dir).context("create status directory")?; @@ -550,13 +560,15 @@ async fn cmd_dumprestore( &key_id, spec.source_connstring_ciphertext_base64, ) - .await?; + .await + .context("decrypt source connection string")?; let dest = if let Some(dest_ciphertext) = spec.destination_connstring_ciphertext_base64 { decode_connstring(kms_client.as_ref().unwrap(), &key_id, dest_ciphertext) - .await? 
+ .await + .context("decrypt destination connection string")? } else { bail!( "destination connection string must be provided in spec for dump_restore command" @@ -601,7 +613,18 @@ pub(crate) async fn main() -> anyhow::Result<()> { // Initialize AWS clients only if s3_prefix is specified let (s3_client, kms_client) = if args.s3_prefix.is_some() { - let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; + // Create AWS config with enhanced retry settings + let config = aws_config::defaults(BehaviorVersion::v2024_03_28()) + .retry_config( + aws_config::retry::RetryConfig::standard() + .with_max_attempts(5) // Retry up to 5 times + .with_initial_backoff(std::time::Duration::from_millis(200)) // Start with 200ms delay + .with_max_backoff(std::time::Duration::from_secs(5)), // Cap at 5 seconds + ) + .load() + .await; + + // Create clients from the config with enhanced retry settings let s3_client = aws_sdk_s3::Client::new(&config); let kms = aws_sdk_kms::Client::new(&config); (Some(s3_client), Some(kms)) @@ -609,79 +632,108 @@ pub(crate) async fn main() -> anyhow::Result<()> { (None, None) }; - let spec: Option = if let Some(s3_prefix) = &args.s3_prefix { - let spec_key = s3_prefix.append("/spec.json"); - let object = s3_client - .as_ref() - .unwrap() - .get_object() - .bucket(&spec_key.bucket) - .key(spec_key.key) - .send() - .await - .context("get spec from s3")? - .body - .collect() - .await - .context("download spec body")?; - serde_json::from_slice(&object.into_bytes()).context("parse spec as json")? 
- } else { - None - }; - - match tokio::fs::create_dir(&args.working_directory).await { - Ok(()) => {} - Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { - if !is_directory_empty(&args.working_directory) + // Capture everything from spec assignment onwards to handle errors + let res = async { + let spec: Option = if let Some(s3_prefix) = &args.s3_prefix { + let spec_key = s3_prefix.append("/spec.json"); + let object = s3_client + .as_ref() + .unwrap() + .get_object() + .bucket(&spec_key.bucket) + .key(spec_key.key) + .send() .await - .context("check if working directory is empty")? - { - bail!("working directory is not empty"); - } else { - // ok - } - } - Err(e) => return Err(anyhow::Error::new(e).context("create working directory")), - } + .context("get spec from s3")? + .body + .collect() + .await + .context("download spec body")?; + serde_json::from_slice(&object.into_bytes()).context("parse spec as json")? + } else { + None + }; - match args.command { - Command::Pgdata { - source_connection_string, - interactive, - pg_port, - num_cpus, - memory_mb, - } => { - cmd_pgdata( - s3_client, - kms_client, - args.s3_prefix, - spec, + match tokio::fs::create_dir(&args.working_directory).await { + Ok(()) => {} + Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { + if !is_directory_empty(&args.working_directory) + .await + .context("check if working directory is empty")? 
+ { + bail!("working directory is not empty"); + } else { + // ok + } + } + Err(e) => return Err(anyhow::Error::new(e).context("create working directory")), + } + + match args.command.clone() { + Command::Pgdata { source_connection_string, interactive, pg_port, - args.working_directory, - args.pg_bin_dir, - args.pg_lib_dir, num_cpus, memory_mb, - ) - .await?; - } - Command::DumpRestore { - source_connection_string, - destination_connection_string, - } => { - cmd_dumprestore( - kms_client, - spec, + } => { + cmd_pgdata( + s3_client.as_ref(), + kms_client, + args.s3_prefix.clone(), + spec, + source_connection_string, + interactive, + pg_port, + args.working_directory.clone(), + args.pg_bin_dir, + args.pg_lib_dir, + num_cpus, + memory_mb, + ) + .await + } + Command::DumpRestore { source_connection_string, destination_connection_string, - args.working_directory, - args.pg_bin_dir, - args.pg_lib_dir, + } => { + cmd_dumprestore( + kms_client, + spec, + source_connection_string, + destination_connection_string, + args.working_directory.clone(), + args.pg_bin_dir, + args.pg_lib_dir, + ) + .await + } + } + } + .await; + + if let Some(s3_prefix) = args.s3_prefix { + info!("write job status to s3"); + { + let status_dir = args.working_directory.join("status"); + if std::fs::exists(&status_dir)?.not() { + std::fs::create_dir(&status_dir).context("create status directory")?; + } + let status_file = status_dir.join("fast_import"); + let res_obj = match res { + Ok(_) => serde_json::json!({"command": args.command.as_str(), "done": true}), + Err(err) => { + serde_json::json!({"command": args.command.as_str(), "done": false, "error": err.to_string()}) + } + }; + std::fs::write(&status_file, res_obj.to_string()).context("write status file")?; + aws_s3_sync::upload_dir_recursive( + s3_client.as_ref().unwrap(), + &status_dir, + &s3_prefix.append("/status/"), ) - .await?; + .await + .context("sync status directory to destination")?; } } diff --git a/compute_tools/src/compute.rs 
b/compute_tools/src/compute.rs index d31472b0c1..70b91c781a 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -20,6 +20,7 @@ use futures::future::join_all; use futures::stream::FuturesUnordered; use nix::sys::signal::{Signal, kill}; use nix::unistd::Pid; +use once_cell::sync::Lazy; use postgres; use postgres::NoTls; use postgres::error::SqlState; @@ -35,6 +36,7 @@ use crate::disk_quota::set_disk_quota; use crate::installed_extensions::get_installed_extensions; use crate::logger::startup_context_from_env; use crate::lsn_lease::launch_lsn_lease_bg_task_for_static; +use crate::metrics::COMPUTE_CTL_UP; use crate::monitor::launch_monitor; use crate::pg_helpers::*; use crate::rsyslog::{ @@ -49,6 +51,17 @@ use crate::{config, extension_server, local_proxy}; pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0); pub static PG_PID: AtomicU32 = AtomicU32::new(0); +// This is an arbitrary build tag. Fine as a default / for testing purposes +// in-case of not-set environment var +const BUILD_TAG_DEFAULT: &str = "latest"; +/// Build tag/version of the compute node binaries/image. It's tricky and ugly +/// to pass it everywhere as a part of `ComputeNodeParams`, so we use a +/// global static variable. +pub static BUILD_TAG: Lazy = Lazy::new(|| { + option_env!("BUILD_TAG") + .unwrap_or(BUILD_TAG_DEFAULT) + .to_string() +}); /// Static configuration params that don't change after startup. These mostly /// come from the CLI args, or are derived from them. 
@@ -72,7 +85,6 @@ pub struct ComputeNodeParams { pub pgdata: String, pub pgbin: String, pub pgversion: String, - pub build_tag: String, /// The port that the compute's external HTTP server listens on pub external_http_port: u16, @@ -173,6 +185,11 @@ impl ComputeState { info!("Changing compute status from {} to {}", prev, status); self.status = status; state_changed.notify_all(); + + COMPUTE_CTL_UP.reset(); + COMPUTE_CTL_UP + .with_label_values(&[&BUILD_TAG, status.to_string().as_str()]) + .set(1); } pub fn set_failed_status(&mut self, err: anyhow::Error, state_changed: &Condvar) { @@ -343,6 +360,14 @@ impl ComputeNode { this.prewarm_postgres()?; } + // Set the up metric with Empty status before starting the HTTP server. + // That way on the first metric scrape, an external observer will see us + // as 'up' and 'empty' (unless the compute was started with a spec or + // already configured by control plane). + COMPUTE_CTL_UP + .with_label_values(&[&BUILD_TAG, ComputeStatus::Empty.to_string().as_str()]) + .set(1); + // Launch the external HTTP server first, so that we can serve control plane // requests while configuration is still in progress. 
crate::http::server::Server::External { @@ -878,6 +903,14 @@ impl ComputeNode { info!("Storage auth token not set"); } + config.application_name("compute_ctl"); + if let Some(spec) = &compute_state.pspec { + config.options(&format!( + "-c neon.compute_mode={}", + spec.spec.mode.to_type_str() + )); + } + // Connect to pageserver let mut client = config.connect(NoTls)?; let pageserver_connect_micros = start_time.elapsed().as_micros() as u64; @@ -2024,12 +2057,8 @@ LIMIT 100", let mut download_tasks = Vec::new(); for library in &libs_vec { - let (ext_name, ext_path) = remote_extensions.get_ext( - library, - true, - &self.params.build_tag, - &self.params.pgversion, - )?; + let (ext_name, ext_path) = + remote_extensions.get_ext(library, true, &BUILD_TAG, &self.params.pgversion)?; download_tasks.push(self.download_extension(ext_name, ext_path)); } let results = join_all(download_tasks).await; diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 290632e4cd..614ab076ff 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -117,6 +117,7 @@ pub fn write_postgres_conf( writeln!(file, "lc_numeric='C.UTF-8'")?; } + writeln!(file, "neon.compute_mode={}", spec.mode.to_type_str())?; match spec.mode { ComputeMode::Primary => {} ComputeMode::Static(lsn) => { @@ -158,53 +159,89 @@ pub fn write_postgres_conf( writeln!(file, "# Managed by compute_ctl: end")?; } - // If audit logging is enabled, configure pgaudit. + // If base audit logging is enabled, configure it. + // In this setup, the audit log will be written to the standard postgresql log. + // + // If compliance audit logging is enabled, configure pgaudit. // // Note, that this is called after the settings from spec are written. // This way we always override the settings from the spec // and don't allow the user or the control plane admin to change them. 
- if let ComputeAudit::Hipaa = spec.audit_log_level { - writeln!(file, "# Managed by compute_ctl audit settings: begin")?; - // This log level is very verbose - // but this is necessary for HIPAA compliance. - // Exclude 'misc' category, because it doesn't contain anythig relevant. - writeln!(file, "pgaudit.log='all, -misc'")?; - writeln!(file, "pgaudit.log_parameter=on")?; - // Disable logging of catalog queries - // The catalog doesn't contain sensitive data, so we don't need to audit it. - writeln!(file, "pgaudit.log_catalog=off")?; - // Set log rotation to 5 minutes - // TODO: tune this after performance testing - writeln!(file, "pgaudit.log_rotation_age=5")?; + match spec.audit_log_level { + ComputeAudit::Disabled => {} + ComputeAudit::Log => { + writeln!(file, "# Managed by compute_ctl base audit settings: start")?; + writeln!(file, "pgaudit.log='ddl,role'")?; + // Disable logging of catalog queries to reduce the noise + writeln!(file, "pgaudit.log_catalog=off")?; - // Add audit shared_preload_libraries, if they are not present. - // - // The caller who sets the flag is responsible for ensuring that the necessary - // shared_preload_libraries are present in the compute image, - // otherwise the compute start will fail. 
- if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") { - let mut extra_shared_preload_libraries = String::new(); - if !libs.contains("pgaudit") { - extra_shared_preload_libraries.push_str(",pgaudit"); - } - if !libs.contains("pgauditlogtofile") { - extra_shared_preload_libraries.push_str(",pgauditlogtofile"); + if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") { + let mut extra_shared_preload_libraries = String::new(); + if !libs.contains("pgaudit") { + extra_shared_preload_libraries.push_str(",pgaudit"); + } + writeln!( + file, + "shared_preload_libraries='{}{}'", + libs, extra_shared_preload_libraries + )?; + } else { + // Typically, this should be unreacheable, + // because we always set at least some shared_preload_libraries in the spec + // but let's handle it explicitly anyway. + writeln!(file, "shared_preload_libraries='neon,pgaudit'")?; } + writeln!(file, "# Managed by compute_ctl base audit settings: end")?; + } + ComputeAudit::Hipaa => { writeln!( file, - "shared_preload_libraries='{}{}'", - libs, extra_shared_preload_libraries + "# Managed by compute_ctl compliance audit settings: begin" )?; - } else { - // Typically, this should be unreacheable, - // because we always set at least some shared_preload_libraries in the spec - // but let's handle it explicitly anyway. + // This log level is very verbose + // but this is necessary for HIPAA compliance. + // Exclude 'misc' category, because it doesn't contain anythig relevant. + writeln!(file, "pgaudit.log='all, -misc'")?; + writeln!(file, "pgaudit.log_parameter=on")?; + // Disable logging of catalog queries + // The catalog doesn't contain sensitive data, so we don't need to audit it. + writeln!(file, "pgaudit.log_catalog=off")?; + // Set log rotation to 5 minutes + // TODO: tune this after performance testing + writeln!(file, "pgaudit.log_rotation_age=5")?; + + // Add audit shared_preload_libraries, if they are not present. 
+ // + // The caller who sets the flag is responsible for ensuring that the necessary + // shared_preload_libraries are present in the compute image, + // otherwise the compute start will fail. + if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") { + let mut extra_shared_preload_libraries = String::new(); + if !libs.contains("pgaudit") { + extra_shared_preload_libraries.push_str(",pgaudit"); + } + if !libs.contains("pgauditlogtofile") { + extra_shared_preload_libraries.push_str(",pgauditlogtofile"); + } + writeln!( + file, + "shared_preload_libraries='{}{}'", + libs, extra_shared_preload_libraries + )?; + } else { + // Typically, this should be unreacheable, + // because we always set at least some shared_preload_libraries in the spec + // but let's handle it explicitly anyway. + writeln!( + file, + "shared_preload_libraries='neon,pgaudit,pgauditlogtofile'" + )?; + } writeln!( file, - "shared_preload_libraries='neon,pgaudit,pgauditlogtofile'" + "# Managed by compute_ctl compliance audit settings: end" )?; } - writeln!(file, "# Managed by compute_ctl audit settings: end")?; } writeln!(file, "neon.extension_server_port={}", extension_server_port)?; diff --git a/compute_tools/src/http/middleware/authorize.rs b/compute_tools/src/http/middleware/authorize.rs index 798dd1179b..89d55e1af3 100644 --- a/compute_tools/src/http/middleware/authorize.rs +++ b/compute_tools/src/http/middleware/authorize.rs @@ -59,9 +59,12 @@ impl AsyncAuthorizeRequest for Authorize { Box::pin(async move { let request_id = request.extract_parts::().await.unwrap(); - // TODO: Remove this check after a successful rollout - if jwks.keys.is_empty() { - warn!(%request_id, "Authorization has not been configured"); + // TODO: Remove this stanza after teaching neon_local and the + // regression tests to use a JWT + JWKS. 
+ // + // https://github.com/neondatabase/neon/issues/11316 + if cfg!(feature = "testing") { + warn!(%request_id, "Skipping compute_ctl authorization check"); return Ok(request); } @@ -110,8 +113,6 @@ impl AsyncAuthorizeRequest for Authorize { impl Authorize { /// Verify the token using the JSON Web Key set and return the token data. fn verify(jwks: &JwkSet, token: &str, validation: &Validation) -> Result> { - debug_assert!(!jwks.keys.is_empty()); - for jwk in jwks.keys.iter() { let decoding_key = match DecodingKey::from_jwk(jwk) { Ok(key) => key, diff --git a/compute_tools/src/http/routes/extension_server.rs b/compute_tools/src/http/routes/extension_server.rs index 563b73ae65..6508de6eee 100644 --- a/compute_tools/src/http/routes/extension_server.rs +++ b/compute_tools/src/http/routes/extension_server.rs @@ -5,7 +5,7 @@ use axum::response::{IntoResponse, Response}; use http::StatusCode; use serde::Deserialize; -use crate::compute::ComputeNode; +use crate::compute::{BUILD_TAG, ComputeNode}; use crate::http::JsonResponse; use crate::http::extract::{Path, Query}; @@ -47,7 +47,7 @@ pub(in crate::http) async fn download_extension( remote_extensions.get_ext( &filename, ext_server_params.is_library, - &compute.params.build_tag, + &BUILD_TAG, &compute.params.pgversion, ) }; diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs index 4caa48307e..52f1795703 100644 --- a/compute_tools/src/metrics.rs +++ b/compute_tools/src/metrics.rs @@ -1,7 +1,8 @@ use metrics::core::{AtomicF64, Collector, GenericGauge}; use metrics::proto::MetricFamily; use metrics::{ - IntCounterVec, UIntGaugeVec, register_gauge, register_int_counter_vec, register_uint_gauge_vec, + IntCounterVec, IntGaugeVec, UIntGaugeVec, register_gauge, register_int_counter_vec, + register_int_gauge_vec, register_uint_gauge_vec, }; use once_cell::sync::Lazy; @@ -70,8 +71,19 @@ pub(crate) static AUDIT_LOG_DIR_SIZE: Lazy> = Lazy::new( .expect("failed to define a metric") }); +// Report that 
`compute_ctl` is up and what's the current compute status. +pub(crate) static COMPUTE_CTL_UP: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "compute_ctl_up", + "Whether compute_ctl is running", + &["build_tag", "status"] + ) + .expect("failed to define a metric") +}); + pub fn collect() -> Vec { - let mut metrics = INSTALLED_EXTENSIONS.collect(); + let mut metrics = COMPUTE_CTL_UP.collect(); + metrics.extend(INSTALLED_EXTENSIONS.collect()); metrics.extend(CPLANE_REQUESTS_TOTAL.collect()); metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect()); metrics.extend(DB_MIGRATION_FAILED.collect()); diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index 80506b13cb..e7d67f6ac5 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -75,15 +75,12 @@ impl ComputeNode { if spec.drop_subscriptions_before_start { let timeline_id = self.get_timeline_id().context("timeline_id must be set")?; - let query = format!("select 1 from neon.drop_subscriptions_done where timeline_id = '{}'", timeline_id); info!("Checking if drop subscription operation was already performed for timeline_id: {}", timeline_id); - drop_subscriptions_done = match - client.simple_query(&query).await { - Ok(result) => { - matches!(&result[0], postgres::SimpleQueryMessage::Row(_)) - }, + drop_subscriptions_done = match + client.query("select 1 from neon.drop_subscriptions_done where timeline_id = $1", &[&timeline_id.to_string()]).await { + Ok(result) => !result.is_empty(), Err(e) => { match e.code() { @@ -286,7 +283,10 @@ impl ComputeNode { phases.push(CreatePgauditlogtofileExtension); phases.push(DisablePostgresDBPgAudit); } - ComputeAudit::Log => { /* not implemented yet */ } + ComputeAudit::Log => { + phases.push(CreatePgauditExtension); + phases.push(DisablePostgresDBPgAudit); + } ComputeAudit::Disabled => {} } @@ -419,7 +419,7 @@ impl ComputeNode { .iter() .filter_map(|val| val.parse::().ok()) .map(|val| if val > 1 { val - 1 } else { 1 }) - .last() + 
.next_back() .unwrap_or(3) } } diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index f0a11106bd..3f3794c0ee 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -165,8 +165,11 @@ pub struct NeonStorageControllerConf { /// Database url used when running multiple storage controller instances pub database_url: Option, - /// Threshold for auto-splitting a tenant into shards + /// Thresholds for auto-splitting a tenant into shards. pub split_threshold: Option, + pub max_split_shards: Option, + pub initial_split_threshold: Option, + pub initial_split_shards: Option, pub max_secondary_lag_bytes: Option, @@ -181,6 +184,8 @@ pub struct NeonStorageControllerConf { pub timelines_onto_safekeepers: bool, pub use_https_safekeeper_api: bool, + + pub use_local_compute_notifications: bool, } impl NeonStorageControllerConf { @@ -201,12 +206,16 @@ impl Default for NeonStorageControllerConf { start_as_candidate: false, database_url: None, split_threshold: None, + max_split_shards: None, + initial_split_threshold: None, + initial_split_shards: None, max_secondary_lag_bytes: None, heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL, long_reconcile_threshold: None, use_https_pageserver_api: false, timelines_onto_safekeepers: false, use_https_safekeeper_api: false, + use_local_compute_notifications: true, } } } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index eeaad10d26..591eb3728b 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -51,11 +51,19 @@ impl PageServerNode { parse_host_port(&conf.listen_pg_addr).expect("Unable to parse listen_pg_addr"); let port = port.unwrap_or(5432); - let ssl_ca_cert = env.ssl_ca_cert_path().map(|ssl_ca_file| { + let ssl_ca_certs = env.ssl_ca_cert_path().map(|ssl_ca_file| { let buf = std::fs::read(ssl_ca_file).expect("SSL root CA file should exist"); - Certificate::from_pem(&buf).expect("CA certificate should be valid") + 
Certificate::from_pem_bundle(&buf).expect("SSL CA file should be valid") }); + let mut http_client = reqwest::Client::builder(); + for ssl_ca_cert in ssl_ca_certs.unwrap_or_default() { + http_client = http_client.add_root_certificate(ssl_ca_cert); + } + let http_client = http_client + .build() + .expect("Client constructs with no errors"); + let endpoint = if env.storage_controller.use_https_pageserver_api { format!( "https://{}", @@ -72,6 +80,7 @@ impl PageServerNode { conf: conf.clone(), env: env.clone(), http_client: mgmt_api::Client::new( + http_client, endpoint, { match conf.http_auth_type { @@ -83,9 +92,7 @@ impl PageServerNode { } } .as_deref(), - ssl_ca_cert, - ) - .expect("Client constructs with no errors"), + ), } } @@ -142,6 +149,10 @@ impl PageServerNode { overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned()); } + if let Some(ssl_ca_file) = self.env.ssl_ca_cert_path() { + overrides.push(format!("ssl_ca_file='{}'", ssl_ca_file.to_str().unwrap())); + } + // Apply the user-provided overrides overrides.push({ let mut doc = @@ -417,11 +428,6 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'l0_flush_delay_threshold' as an integer")?, - l0_flush_wait_upload: settings - .remove("l0_flush_wait_upload") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'l0_flush_wait_upload' as a boolean")?, l0_flush_stall_threshold: settings .remove("l0_flush_stall_threshold") .map(|x| x.parse::()) @@ -539,6 +545,11 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'gc_compaction_ratio_percent' as integer")?, + sampling_ratio: settings + .remove("sampling_ratio") + .map(serde_json::from_str) + .transpose() + .context("Falied to parse 'sampling_ratio'")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 0c78f2e18e..8000576e87 
100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -1,6 +1,5 @@ use std::ffi::OsStr; use std::fs; -use std::net::SocketAddr; use std::path::PathBuf; use std::process::ExitStatus; use std::str::FromStr; @@ -18,7 +17,7 @@ use pageserver_api::models::{TenantConfigRequest, TimelineCreateRequest, Timelin use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; -use reqwest::Method; +use reqwest::{Certificate, Method}; use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; use tokio::process::Command; @@ -38,9 +37,9 @@ pub struct StorageController { client: reqwest::Client, config: NeonStorageControllerConf, - // The listen addresses is learned when starting the storage controller, + // The listen port is learned when starting the storage controller, // hence the use of OnceLock to init it at the right time. - listen: OnceLock, + listen_port: OnceLock, } const COMMAND: &str = "storage_controller"; @@ -144,15 +143,26 @@ impl StorageController { } }; + let ssl_ca_certs = env.ssl_ca_cert_path().map(|ssl_ca_file| { + let buf = std::fs::read(ssl_ca_file).expect("SSL CA file should exist"); + Certificate::from_pem_bundle(&buf).expect("SSL CA file should be valid") + }); + + let mut http_client = reqwest::Client::builder(); + for ssl_ca_cert in ssl_ca_certs.unwrap_or_default() { + http_client = http_client.add_root_certificate(ssl_ca_cert); + } + let http_client = http_client + .build() + .expect("HTTP client should construct with no error"); + Self { env: env.clone(), private_key, public_key, - client: reqwest::ClientBuilder::new() - .build() - .expect("Failed to construct http client"), + client: http_client, config: env.storage_controller.clone(), - listen: OnceLock::default(), + listen_port: OnceLock::default(), } } @@ -337,34 +347,34 @@ impl StorageController { } } - let (listen, postgres_port) = { - if let Some(base_port) = 
start_args.base_port { - ( - format!("127.0.0.1:{base_port}"), - self.config - .database_url - .expect("--base-port requires NeonStorageControllerConf::database_url") - .port(), - ) - } else { - let listen_url = self.env.control_plane_api.clone(); + if self.env.generate_local_ssl_certs { + self.env.generate_ssl_cert( + &instance_dir.join("server.crt"), + &instance_dir.join("server.key"), + )?; + } - let listen = format!( - "{}:{}", - listen_url.host_str().unwrap(), - listen_url.port().unwrap() - ); + let listen_url = &self.env.control_plane_api; - (listen, listen_url.port().unwrap() + 1) - } + let scheme = listen_url.scheme(); + let host = listen_url.host_str().unwrap(); + + let (listen_port, postgres_port) = if let Some(base_port) = start_args.base_port { + ( + base_port, + self.config + .database_url + .expect("--base-port requires NeonStorageControllerConf::database_url") + .port(), + ) + } else { + let port = listen_url.port().unwrap(); + (port, port + 1) }; - let socket_addr = listen - .parse() - .expect("listen address is a valid socket address"); - self.listen - .set(socket_addr) - .expect("StorageController::listen is only set here"); + self.listen_port + .set(listen_port) + .expect("StorageController::listen_port is only set here"); // Do we remove the pid file on stop? 
let pg_started = self.is_postgres_running().await?; @@ -500,20 +510,15 @@ impl StorageController { drop(client); conn.await??; - let listen = self - .listen - .get() - .expect("cell is set earlier in this function"); + let addr = format!("{}:{}", host, listen_port); let address_for_peers = Uri::builder() - .scheme("http") - .authority(format!("{}:{}", listen.ip(), listen.port())) + .scheme(scheme) + .authority(addr.clone()) .path_and_query("") .build() .unwrap(); let mut args = vec![ - "-l", - &listen.to_string(), "--dev", "--database-url", &database_url, @@ -530,6 +535,14 @@ impl StorageController { .map(|s| s.to_string()) .collect::>(); + match scheme { + "http" => args.extend(["--listen".to_string(), addr]), + "https" => args.extend(["--listen-https".to_string(), addr]), + _ => { + panic!("Unexpected url scheme in control_plane_api: {scheme}"); + } + } + if self.config.start_as_candidate { args.push("--start-as-candidate".to_string()); } @@ -542,6 +555,10 @@ impl StorageController { args.push("--use-https-safekeeper-api".to_string()); } + if self.config.use_local_compute_notifications { + args.push("--use-local-compute-notifications".to_string()); + } + if let Some(ssl_ca_file) = self.env.ssl_ca_cert_path() { args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap())); } @@ -570,6 +587,20 @@ impl StorageController { args.push(format!("--split-threshold={split_threshold}")) } + if let Some(max_split_shards) = self.config.max_split_shards.as_ref() { + args.push(format!("--max-split-shards={max_split_shards}")) + } + + if let Some(initial_split_threshold) = self.config.initial_split_threshold.as_ref() { + args.push(format!( + "--initial-split-threshold={initial_split_threshold}" + )) + } + + if let Some(initial_split_shards) = self.config.initial_split_shards.as_ref() { + args.push(format!("--initial-split-shards={initial_split_shards}")) + } + if let Some(lag) = self.config.max_secondary_lag_bytes.as_ref() { 
args.push(format!("--max-secondary-lag-bytes={lag}")) } @@ -590,6 +621,8 @@ impl StorageController { args.push("--timelines-onto-safekeepers".to_string()); } + println!("Starting storage controller"); + background_process::start_process( COMMAND, &instance_dir, @@ -716,30 +749,26 @@ impl StorageController { { // In the special case of the `storage_controller start` subcommand, we wish // to use the API endpoint of the newly started storage controller in order - // to pass the readiness check. In this scenario [`Self::listen`] will be set - // (see [`Self::start`]). + // to pass the readiness check. In this scenario [`Self::listen_port`] will + // be set (see [`Self::start`]). // // Otherwise, we infer the storage controller api endpoint from the configured // control plane API. - let url = if let Some(socket_addr) = self.listen.get() { - Url::from_str(&format!( - "http://{}:{}/{path}", - socket_addr.ip().to_canonical(), - socket_addr.port() - )) - .unwrap() + let port = if let Some(port) = self.listen_port.get() { + *port } else { - // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out - // for general purpose API access. - let listen_url = self.env.control_plane_api.clone(); - Url::from_str(&format!( - "http://{}:{}/{path}", - listen_url.host_str().unwrap(), - listen_url.port().unwrap() - )) - .unwrap() + self.env.control_plane_api.port().unwrap() }; + // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out + // for general purpose API access. 
+ let url = Url::from_str(&format!( + "{}://{}:{port}/{path}", + self.env.control_plane_api.scheme(), + self.env.control_plane_api.host_str().unwrap(), + )) + .unwrap(); + let mut builder = self.client.request(method, url); if let Some(body) = body { builder = builder.json(&body) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index ae4bf9a519..b7e479d90c 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -20,7 +20,7 @@ use pageserver_api::models::{ }; use pageserver_api::shard::{ShardStripeSize, TenantShardId}; use pageserver_client::mgmt_api::{self}; -use reqwest::{Method, StatusCode, Url}; +use reqwest::{Certificate, Method, StatusCode, Url}; use storage_controller_client::control_api::Client; use utils::id::{NodeId, TenantId, TimelineId}; @@ -274,7 +274,7 @@ struct Cli { jwt: Option, #[arg(long)] - /// Trusted root CA certificate to use in https APIs. + /// Trusted root CA certificates to use in https APIs. ssl_ca_file: Option, #[command(subcommand)] @@ -385,19 +385,25 @@ where async fn main() -> anyhow::Result<()> { let cli = Cli::parse(); - let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone()); - - let ssl_ca_cert = match &cli.ssl_ca_file { + let ssl_ca_certs = match &cli.ssl_ca_file { Some(ssl_ca_file) => { let buf = tokio::fs::read(ssl_ca_file).await?; - Some(reqwest::Certificate::from_pem(&buf)?) + Certificate::from_pem_bundle(&buf)? 
} - None => None, + None => Vec::new(), }; + let mut http_client = reqwest::Client::builder(); + for ssl_ca_cert in ssl_ca_certs { + http_client = http_client.add_root_certificate(ssl_ca_cert); + } + let http_client = http_client.build()?; + + let storcon_client = Client::new(http_client.clone(), cli.api.clone(), cli.jwt.clone()); + let mut trimmed = cli.api.to_string(); trimmed.pop(); - let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref(), ssl_ca_cert)?; + let vps_client = mgmt_api::Client::new(http_client.clone(), trimmed, cli.jwt.as_deref()); match cli.command { Command::NodeRegister { @@ -1050,7 +1056,7 @@ async fn main() -> anyhow::Result<()> { const DEFAULT_MIGRATE_CONCURRENCY: usize = 8; let mut stream = futures::stream::iter(moves) .map(|mv| { - let client = Client::new(cli.api.clone(), cli.jwt.clone()); + let client = Client::new(http_client.clone(), cli.api.clone(), cli.jwt.clone()); async move { client .dispatch::( diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index 9dbdcce69f..418aaf876d 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -67,6 +67,14 @@ else fi fi +if [[ ${PG_VERSION} -ge 17 ]]; then + ulid_extension=pgx_ulid +else + ulid_extension=ulid +fi +echo "Adding pgx_ulid" +shared_libraries=$(jq -r '.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${SPEC_FILE}) +sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${SPEC_FILE} echo "Overwrite tenant id and timeline id in spec file" sed -i "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE} sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE} diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index 0f03d600a3..9d867d97f6 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -69,7 +69,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do 
cat ../compute/patches/contrib_pg${pg_version}.patch | docker exec -i $TEST_CONTAINER_NAME bash -c "(cd /postgres && patch -p1)" # We are running tests now rm -f testout.txt testout_contrib.txt - docker exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \ + docker exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src,rag_jina_reranker_v1_tiny_en-src,rag_bge_small_en_v15-src \ $TEST_CONTAINER_NAME /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0 docker exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \ $TEST_CONTAINER_NAME /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0 diff --git a/docker-compose/ext-src/pg_tiktoken-src/Makefile b/docker-compose/ext-src/pg_tiktoken-src/Makefile new file mode 100644 index 0000000000..e23166554a --- /dev/null +++ b/docker-compose/ext-src/pg_tiktoken-src/Makefile @@ -0,0 +1,8 @@ +PG_CONFIG ?= pg_config +PG_REGRESS = $(shell dirname $$($(PG_CONFIG) --pgxs))/../../src/test/regress/pg_regress +REGRESS = pg_tiktoken + +installcheck: regression-test + +regression-test: + $(PG_REGRESS) --inputdir=. --outputdir=. 
--dbname=contrib_regression $(REGRESS) \ No newline at end of file diff --git a/docker-compose/ext-src/pg_tiktoken-src/expected/pg_tiktoken.out b/docker-compose/ext-src/pg_tiktoken-src/expected/pg_tiktoken.out new file mode 100644 index 0000000000..0bdcdc60c2 --- /dev/null +++ b/docker-compose/ext-src/pg_tiktoken-src/expected/pg_tiktoken.out @@ -0,0 +1,53 @@ +-- Load the extension +CREATE EXTENSION IF NOT EXISTS pg_tiktoken; +-- Test encoding function +SELECT tiktoken_encode('cl100k_base', 'Hello world!'); + tiktoken_encode +----------------- + {9906,1917,0} +(1 row) + +-- Test token count function +SELECT tiktoken_count('cl100k_base', 'Hello world!'); + tiktoken_count +---------------- + 3 +(1 row) + +-- Test encoding function with a different model +SELECT tiktoken_encode('r50k_base', 'PostgreSQL is amazing!'); + tiktoken_encode +------------------------- + {6307,47701,318,4998,0} +(1 row) + +-- Test token count function with the same model +SELECT tiktoken_count('r50k_base', 'PostgreSQL is amazing!'); + tiktoken_count +---------------- + 5 +(1 row) + +-- Edge cases: Empty string +SELECT tiktoken_encode('cl100k_base', ''); + tiktoken_encode +----------------- + {} +(1 row) + +SELECT tiktoken_count('cl100k_base', ''); + tiktoken_count +---------------- + 0 +(1 row) + +-- Edge cases: Long text +SELECT tiktoken_count('cl100k_base', repeat('word ', 100)); + tiktoken_count +---------------- + 101 +(1 row) + +-- Edge case: Invalid encoding +SELECT tiktoken_encode('invalid_model', 'Test') AS should_fail; +ERROR: 'invalid_model': unknown model or encoder diff --git a/docker-compose/ext-src/pg_tiktoken-src/sql/pg_tiktoken.sql b/docker-compose/ext-src/pg_tiktoken-src/sql/pg_tiktoken.sql new file mode 100644 index 0000000000..626226c82e --- /dev/null +++ b/docker-compose/ext-src/pg_tiktoken-src/sql/pg_tiktoken.sql @@ -0,0 +1,24 @@ +-- Load the extension +CREATE EXTENSION IF NOT EXISTS pg_tiktoken; + +-- Test encoding function +SELECT tiktoken_encode('cl100k_base', 'Hello 
world!'); + +-- Test token count function +SELECT tiktoken_count('cl100k_base', 'Hello world!'); + +-- Test encoding function with a different model +SELECT tiktoken_encode('r50k_base', 'PostgreSQL is amazing!'); + +-- Test token count function with the same model +SELECT tiktoken_count('r50k_base', 'PostgreSQL is amazing!'); + +-- Edge cases: Empty string +SELECT tiktoken_encode('cl100k_base', ''); +SELECT tiktoken_count('cl100k_base', ''); + +-- Edge cases: Long text +SELECT tiktoken_count('cl100k_base', repeat('word ', 100)); + +-- Edge case: Invalid encoding +SELECT tiktoken_encode('invalid_model', 'Test') AS should_fail; \ No newline at end of file diff --git a/docker-compose/ext-src/pgrag-src/Makefile b/docker-compose/ext-src/pgrag-src/Makefile new file mode 100644 index 0000000000..dbf91cf501 --- /dev/null +++ b/docker-compose/ext-src/pgrag-src/Makefile @@ -0,0 +1,10 @@ +EXTENSION = rag +MODULE_big = rag +OBJS = $(patsubst %.rs,%.o,$(wildcard src/*.rs)) + +REGRESS = basic_functions text_processing api_keys chunking_functions document_processing embedding_api_functions voyageai_functions +REGRESS_OPTS = --load-extension=vector --load-extension=rag + +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) diff --git a/docker-compose/ext-src/pgrag-src/expected/api_keys.out b/docker-compose/ext-src/pgrag-src/expected/api_keys.out new file mode 100644 index 0000000000..3da3786f9b --- /dev/null +++ b/docker-compose/ext-src/pgrag-src/expected/api_keys.out @@ -0,0 +1,49 @@ +-- API key function tests +SELECT rag.anthropic_set_api_key('test_key'); + anthropic_set_api_key +----------------------- + +(1 row) + +SELECT rag.anthropic_get_api_key(); + anthropic_get_api_key +----------------------- + test_key +(1 row) + +SELECT rag.openai_set_api_key('test_key'); + openai_set_api_key +-------------------- + +(1 row) + +SELECT rag.openai_get_api_key(); + openai_get_api_key +-------------------- + test_key +(1 row) + +SELECT 
rag.fireworks_set_api_key('test_key'); + fireworks_set_api_key +----------------------- + +(1 row) + +SELECT rag.fireworks_get_api_key(); + fireworks_get_api_key +----------------------- + test_key +(1 row) + +SELECT rag.voyageai_set_api_key('test_key'); + voyageai_set_api_key +---------------------- + +(1 row) + +SELECT rag.voyageai_get_api_key(); + voyageai_get_api_key +---------------------- + test_key +(1 row) + diff --git a/docker-compose/ext-src/pgrag-src/expected/basic_functions.out b/docker-compose/ext-src/pgrag-src/expected/basic_functions.out new file mode 100644 index 0000000000..1e5414686b --- /dev/null +++ b/docker-compose/ext-src/pgrag-src/expected/basic_functions.out @@ -0,0 +1,13 @@ +-- Basic function tests +SELECT rag.markdown_from_html('

Hello

'); + markdown_from_html +-------------------- + Hello +(1 row) + +SELECT array_length(rag.chunks_by_character_count('the cat sat on the mat', 10, 5), 1); + array_length +-------------- + 3 +(1 row) + diff --git a/docker-compose/ext-src/pgrag-src/expected/chunking_functions.out b/docker-compose/ext-src/pgrag-src/expected/chunking_functions.out new file mode 100644 index 0000000000..c0546a1a8e --- /dev/null +++ b/docker-compose/ext-src/pgrag-src/expected/chunking_functions.out @@ -0,0 +1,31 @@ +-- Chunking function tests +SELECT rag.chunks_by_character_count('the cat sat on the mat', 10, 5); + chunks_by_character_count +--------------------------------------- + {"the cat","cat sat on","on the mat"} +(1 row) + +SELECT rag.chunks_by_character_count('Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.', 20, 10); + chunks_by_character_count +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + {"Lorem ipsum dolor","dolor sit amet,","amet, consectetur","adipiscing elit.","Sed do eiusmod","do eiusmod tempor","tempor incididunt ut","ut labore et dolore","et dolore magna","magna aliqua."} +(1 row) + +SELECT (rag.chunks_by_character_count('the cat', 10, 0))[1]; + chunks_by_character_count +--------------------------- + the cat +(1 row) + +SELECT rag.chunks_by_character_count('', 10, 5); + chunks_by_character_count +--------------------------- + {} +(1 row) + +SELECT rag.chunks_by_character_count('a b c d e f g h i j k l m n o p', 5, 2); + chunks_by_character_count +----------------------------------------------------------------- + {"a b c","c d e","e f g","g h i","i j k","k l m","m n o","o p"} +(1 row) + diff --git a/docker-compose/ext-src/pgrag-src/expected/document_processing.out b/docker-compose/ext-src/pgrag-src/expected/document_processing.out new file 
mode 100644 index 0000000000..befb6b3f23 --- /dev/null +++ b/docker-compose/ext-src/pgrag-src/expected/document_processing.out @@ -0,0 +1,56 @@ +-- HTML to Markdown conversion tests +SELECT rag.markdown_from_html('

Hello

'); + markdown_from_html +-------------------- + Hello +(1 row) + +SELECT rag.markdown_from_html('

Hello world

'); + markdown_from_html +-------------------- + Hello _world_ +(1 row) + +SELECT rag.markdown_from_html('

Title

Paragraph

'); + markdown_from_html +-------------------- + # Title + + + + Paragraph +(1 row) + +SELECT rag.markdown_from_html('
  • Item 1
  • Item 2
'); + markdown_from_html +-------------------- + * Item 1 + + * Item 2 +(1 row) + +SELECT rag.markdown_from_html('Link'); + markdown_from_html +----------------------------- + [Link](https://example.com) +(1 row) + +-- Note: text_from_pdf and text_from_docx require binary input which is harder to test in regression tests +-- We'll test that the functions exist and have the right signature +SELECT 'text_from_pdf_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'text_from_pdf' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + test_name | result +----------------------+-------- + text_from_pdf_exists | t +(1 row) + +SELECT 'text_from_docx_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'text_from_docx' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + test_name | result +-----------------------+-------- + text_from_docx_exists | t +(1 row) + diff --git a/docker-compose/ext-src/pgrag-src/expected/embedding_api_functions.out b/docker-compose/ext-src/pgrag-src/expected/embedding_api_functions.out new file mode 100644 index 0000000000..a050914d13 --- /dev/null +++ b/docker-compose/ext-src/pgrag-src/expected/embedding_api_functions.out @@ -0,0 +1,103 @@ +-- Test embedding functions exist with correct signatures +-- OpenAI embedding functions +SELECT 'openai_text_embedding_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'openai_text_embedding' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + test_name | result +------------------------------+-------- + openai_text_embedding_exists | t +(1 row) + +SELECT 'openai_text_embedding_3_small_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'openai_text_embedding_3_small' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + test_name | result +--------------------------------------+-------- + 
openai_text_embedding_3_small_exists | t +(1 row) + +SELECT 'openai_text_embedding_3_large_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'openai_text_embedding_3_large' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + test_name | result +--------------------------------------+-------- + openai_text_embedding_3_large_exists | t +(1 row) + +SELECT 'openai_text_embedding_ada_002_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'openai_text_embedding_ada_002' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + test_name | result +--------------------------------------+-------- + openai_text_embedding_ada_002_exists | t +(1 row) + +-- Fireworks embedding functions +SELECT 'fireworks_nomic_embed_text_v1_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'fireworks_nomic_embed_text_v1' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + test_name | result +--------------------------------------+-------- + fireworks_nomic_embed_text_v1_exists | t +(1 row) + +SELECT 'fireworks_nomic_embed_text_v15_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'fireworks_nomic_embed_text_v15' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + test_name | result +---------------------------------------+-------- + fireworks_nomic_embed_text_v15_exists | t +(1 row) + +SELECT 'fireworks_text_embedding_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'fireworks_text_embedding' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + test_name | result +---------------------------------+-------- + fireworks_text_embedding_exists | t +(1 row) + +SELECT 'fireworks_text_embedding_thenlper_gte_base_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'fireworks_text_embedding_thenlper_gte_base' + AND 
pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + test_name | result +---------------------------------------------------+-------- + fireworks_text_embedding_thenlper_gte_base_exists | t +(1 row) + +SELECT 'fireworks_text_embedding_thenlper_gte_large_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'fireworks_text_embedding_thenlper_gte_large' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + test_name | result +----------------------------------------------------+-------- + fireworks_text_embedding_thenlper_gte_large_exists | t +(1 row) + +SELECT 'fireworks_text_embedding_whereisai_uae_large_v1_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'fireworks_text_embedding_whereisai_uae_large_v1' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + test_name | result +--------------------------------------------------------+-------- + fireworks_text_embedding_whereisai_uae_large_v1_exists | t +(1 row) + diff --git a/docker-compose/ext-src/pgrag-src/expected/embedding_functions.out b/docker-compose/ext-src/pgrag-src/expected/embedding_functions.out new file mode 100644 index 0000000000..ed4c6d7343 --- /dev/null +++ b/docker-compose/ext-src/pgrag-src/expected/embedding_functions.out @@ -0,0 +1,9 @@ +BEGIN +CREATE EXTENSION IF NOT EXISTS vector; +DROP EXTENSION IF EXISTS rag CASCADE; +CREATE EXTENSION rag CASCADE; +test_name|result +openai_embedding_dimensions_test|t +test_name|result +fireworks_embedding_dimensions_test|t +COMMIT diff --git a/docker-compose/ext-src/pgrag-src/expected/text_processing.out b/docker-compose/ext-src/pgrag-src/expected/text_processing.out new file mode 100644 index 0000000000..d844ff4be0 --- /dev/null +++ b/docker-compose/ext-src/pgrag-src/expected/text_processing.out @@ -0,0 +1,13 @@ +-- Text processing function tests +SELECT rag.markdown_from_html('

Hello world

'); + markdown_from_html +-------------------- + Hello _world_ +(1 row) + +SELECT rag.chunks_by_character_count('the cat sat on the mat', 10, 5); + chunks_by_character_count +--------------------------------------- + {"the cat","cat sat on","on the mat"} +(1 row) + diff --git a/docker-compose/ext-src/pgrag-src/expected/voyageai_functions.out b/docker-compose/ext-src/pgrag-src/expected/voyageai_functions.out new file mode 100644 index 0000000000..01f347d610 --- /dev/null +++ b/docker-compose/ext-src/pgrag-src/expected/voyageai_functions.out @@ -0,0 +1,141 @@ +-- Test VoyageAI API key functions +SELECT 'voyageai_api_key_test' AS test_name, + (SELECT rag.voyageai_set_api_key('test_key') IS NULL) AS result; + test_name | result +-----------------------+-------- + voyageai_api_key_test | t +(1 row) + +SELECT 'voyageai_get_api_key_test' AS test_name, + (SELECT rag.voyageai_get_api_key() = 'test_key') AS result; + test_name | result +---------------------------+-------- + voyageai_get_api_key_test | t +(1 row) + +-- Test VoyageAI embedding functions exist +SELECT 'voyageai_embedding_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'voyageai_embedding' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + test_name | result +---------------------------+-------- + voyageai_embedding_exists | t +(1 row) + +SELECT 'voyageai_embedding_3_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'voyageai_embedding_3' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + test_name | result +-----------------------------+-------- + voyageai_embedding_3_exists | t +(1 row) + +SELECT 'voyageai_embedding_3_lite_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'voyageai_embedding_3_lite' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + test_name | result +----------------------------------+-------- + voyageai_embedding_3_lite_exists | 
t +(1 row) + +SELECT 'voyageai_embedding_code_2_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'voyageai_embedding_code_2' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + test_name | result +----------------------------------+-------- + voyageai_embedding_code_2_exists | t +(1 row) + +SELECT 'voyageai_embedding_finance_2_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'voyageai_embedding_finance_2' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + test_name | result +-------------------------------------+-------- + voyageai_embedding_finance_2_exists | t +(1 row) + +SELECT 'voyageai_embedding_law_2_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'voyageai_embedding_law_2' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + test_name | result +---------------------------------+-------- + voyageai_embedding_law_2_exists | t +(1 row) + +SELECT 'voyageai_embedding_multilingual_2_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'voyageai_embedding_multilingual_2' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + test_name | result +------------------------------------------+-------- + voyageai_embedding_multilingual_2_exists | t +(1 row) + +-- Test VoyageAI reranking functions exist +SELECT 'voyageai_rerank_distance_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'voyageai_rerank_distance' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + test_name | result +---------------------------------+-------- + voyageai_rerank_distance_exists | t +(1 row) + +SELECT 'voyageai_rerank_score_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'voyageai_rerank_score' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + test_name | result 
+------------------------------+-------- + voyageai_rerank_score_exists | t +(1 row) + +-- Test VoyageAI function signatures +SELECT 'voyageai_embedding_signature' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'voyageai_embedding' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag') + AND pronargs = 3; + test_name | result +------------------------------+-------- + voyageai_embedding_signature | t +(1 row) + +SELECT 'voyageai_rerank_distance_signature' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'voyageai_rerank_distance' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag') + AND pronargs IN (3, 4); + test_name | result +------------------------------------+-------- + voyageai_rerank_distance_signature | t +(1 row) + +SELECT 'voyageai_rerank_score_signature' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'voyageai_rerank_score' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag') + AND pronargs IN (3, 4); + test_name | result +---------------------------------+-------- + voyageai_rerank_score_signature | t +(1 row) + diff --git a/docker-compose/ext-src/pgrag-src/sql/api_keys.sql b/docker-compose/ext-src/pgrag-src/sql/api_keys.sql new file mode 100644 index 0000000000..36b928bccc --- /dev/null +++ b/docker-compose/ext-src/pgrag-src/sql/api_keys.sql @@ -0,0 +1,16 @@ +-- API key function tests +SELECT rag.anthropic_set_api_key('test_key'); + +SELECT rag.anthropic_get_api_key(); + +SELECT rag.openai_set_api_key('test_key'); + +SELECT rag.openai_get_api_key(); + +SELECT rag.fireworks_set_api_key('test_key'); + +SELECT rag.fireworks_get_api_key(); + +SELECT rag.voyageai_set_api_key('test_key'); + +SELECT rag.voyageai_get_api_key(); diff --git a/docker-compose/ext-src/pgrag-src/sql/basic_functions.sql b/docker-compose/ext-src/pgrag-src/sql/basic_functions.sql new file mode 100644 index 0000000000..5e73bc1639 --- /dev/null +++ 
b/docker-compose/ext-src/pgrag-src/sql/basic_functions.sql @@ -0,0 +1,4 @@ +-- Basic function tests +SELECT rag.markdown_from_html('

Hello

'); + +SELECT array_length(rag.chunks_by_character_count('the cat sat on the mat', 10, 5), 1); diff --git a/docker-compose/ext-src/pgrag-src/sql/chunking_functions.sql b/docker-compose/ext-src/pgrag-src/sql/chunking_functions.sql new file mode 100644 index 0000000000..1a6cea1706 --- /dev/null +++ b/docker-compose/ext-src/pgrag-src/sql/chunking_functions.sql @@ -0,0 +1,11 @@ +-- Chunking function tests +SELECT rag.chunks_by_character_count('the cat sat on the mat', 10, 5); + +SELECT rag.chunks_by_character_count('Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.', 20, 10); + +SELECT (rag.chunks_by_character_count('the cat', 10, 0))[1]; + +SELECT rag.chunks_by_character_count('', 10, 5); + +SELECT rag.chunks_by_character_count('a b c d e f g h i j k l m n o p', 5, 2); + diff --git a/docker-compose/ext-src/pgrag-src/sql/document_processing.sql b/docker-compose/ext-src/pgrag-src/sql/document_processing.sql new file mode 100644 index 0000000000..ed94dd0e1a --- /dev/null +++ b/docker-compose/ext-src/pgrag-src/sql/document_processing.sql @@ -0,0 +1,24 @@ +-- HTML to Markdown conversion tests +SELECT rag.markdown_from_html('

Hello

'); + +SELECT rag.markdown_from_html('

Hello world

'); + +SELECT rag.markdown_from_html('

Title

Paragraph

'); + +SELECT rag.markdown_from_html('
  • Item 1
  • Item 2
'); + +SELECT rag.markdown_from_html('Link'); + +-- Note: text_from_pdf and text_from_docx require binary input which is harder to test in regression tests +-- We'll test that the functions exist and have the right signature +SELECT 'text_from_pdf_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'text_from_pdf' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + +SELECT 'text_from_docx_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'text_from_docx' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); diff --git a/docker-compose/ext-src/pgrag-src/sql/embedding_api_functions.sql b/docker-compose/ext-src/pgrag-src/sql/embedding_api_functions.sql new file mode 100644 index 0000000000..b9616222bc --- /dev/null +++ b/docker-compose/ext-src/pgrag-src/sql/embedding_api_functions.sql @@ -0,0 +1,62 @@ +-- Test embedding functions exist with correct signatures +-- OpenAI embedding functions +SELECT 'openai_text_embedding_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'openai_text_embedding' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + +SELECT 'openai_text_embedding_3_small_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'openai_text_embedding_3_small' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + +SELECT 'openai_text_embedding_3_large_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'openai_text_embedding_3_large' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + +SELECT 'openai_text_embedding_ada_002_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'openai_text_embedding_ada_002' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + +-- Fireworks embedding functions +SELECT 'fireworks_nomic_embed_text_v1_exists' AS test_name, + count(*) > 0 
AS result +FROM pg_proc +WHERE proname = 'fireworks_nomic_embed_text_v1' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + +SELECT 'fireworks_nomic_embed_text_v15_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'fireworks_nomic_embed_text_v15' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + +SELECT 'fireworks_text_embedding_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'fireworks_text_embedding' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + +SELECT 'fireworks_text_embedding_thenlper_gte_base_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'fireworks_text_embedding_thenlper_gte_base' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + +SELECT 'fireworks_text_embedding_thenlper_gte_large_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'fireworks_text_embedding_thenlper_gte_large' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + +SELECT 'fireworks_text_embedding_whereisai_uae_large_v1_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'fireworks_text_embedding_whereisai_uae_large_v1' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); diff --git a/docker-compose/ext-src/pgrag-src/sql/text_processing.sql b/docker-compose/ext-src/pgrag-src/sql/text_processing.sql new file mode 100644 index 0000000000..e871e55d57 --- /dev/null +++ b/docker-compose/ext-src/pgrag-src/sql/text_processing.sql @@ -0,0 +1,4 @@ +-- Text processing function tests +SELECT rag.markdown_from_html('

Hello world

'); + +SELECT rag.chunks_by_character_count('the cat sat on the mat', 10, 5); diff --git a/docker-compose/ext-src/pgrag-src/sql/voyageai_functions.sql b/docker-compose/ext-src/pgrag-src/sql/voyageai_functions.sql new file mode 100644 index 0000000000..73d4241519 --- /dev/null +++ b/docker-compose/ext-src/pgrag-src/sql/voyageai_functions.sql @@ -0,0 +1,84 @@ +-- Test VoyageAI API key functions +SELECT 'voyageai_api_key_test' AS test_name, + (SELECT rag.voyageai_set_api_key('test_key') IS NULL) AS result; + +SELECT 'voyageai_get_api_key_test' AS test_name, + (SELECT rag.voyageai_get_api_key() = 'test_key') AS result; + +-- Test VoyageAI embedding functions exist +SELECT 'voyageai_embedding_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'voyageai_embedding' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + +SELECT 'voyageai_embedding_3_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'voyageai_embedding_3' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + +SELECT 'voyageai_embedding_3_lite_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'voyageai_embedding_3_lite' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + +SELECT 'voyageai_embedding_code_2_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'voyageai_embedding_code_2' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + +SELECT 'voyageai_embedding_finance_2_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'voyageai_embedding_finance_2' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + +SELECT 'voyageai_embedding_law_2_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'voyageai_embedding_law_2' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + +SELECT 
'voyageai_embedding_multilingual_2_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'voyageai_embedding_multilingual_2' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + +-- Test VoyageAI reranking functions exist +SELECT 'voyageai_rerank_distance_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'voyageai_rerank_distance' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + +SELECT 'voyageai_rerank_score_exists' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'voyageai_rerank_score' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); + +-- Test VoyageAI function signatures +SELECT 'voyageai_embedding_signature' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'voyageai_embedding' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag') + AND pronargs = 3; + +SELECT 'voyageai_rerank_distance_signature' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'voyageai_rerank_distance' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag') + AND pronargs IN (3, 4); + +SELECT 'voyageai_rerank_score_signature' AS test_name, + count(*) > 0 AS result +FROM pg_proc +WHERE proname = 'voyageai_rerank_score' + AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag') + AND pronargs IN (3, 4); diff --git a/docker-compose/ext-src/pgx_ulid-src/Makefile b/docker-compose/ext-src/pgx_ulid-src/Makefile new file mode 100644 index 0000000000..91aceef906 --- /dev/null +++ b/docker-compose/ext-src/pgx_ulid-src/Makefile @@ -0,0 +1,16 @@ +EXTENSION = pgx_ulid + +PGFILEDESC = "pgx_ulid - ULID type for PostgreSQL" + +PG_CONFIG ?= pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +PG_MAJOR_VERSION := $(word 2, $(subst ., , $(shell $(PG_CONFIG) --version))) +ifeq ($(shell test $(PG_MAJOR_VERSION) -lt 17; echo $$?),0) + REGRESS_OPTS = --load-extension=ulid + 
REGRESS = 00_ulid_generation 01_ulid_conversions 03_ulid_errors +else + REGRESS_OPTS = --load-extension=pgx_ulid + REGRESS = 00_ulid_generation 01_ulid_conversions 02_ulid_conversions 03_ulid_errors +endif + +include $(PGXS) diff --git a/docker-compose/ext-src/pgx_ulid-src/expected/00_ulid_generation.out b/docker-compose/ext-src/pgx_ulid-src/expected/00_ulid_generation.out new file mode 100644 index 0000000000..a30b620150 --- /dev/null +++ b/docker-compose/ext-src/pgx_ulid-src/expected/00_ulid_generation.out @@ -0,0 +1,60 @@ +-- Test basic ULID generation +-- Test gen_ulid() function +SELECT 'gen_ulid() returns a non-null value' as test_name, + gen_ulid() IS NOT NULL as result; + test_name | result +-------------------------------------+-------- + gen_ulid() returns a non-null value | t +(1 row) + +-- Test that multiple calls to gen_ulid() return different values +SELECT 'gen_ulid() returns unique values' as test_name, + gen_ulid() != gen_ulid() as result; + test_name | result +----------------------------------+-------- + gen_ulid() returns unique values | t +(1 row) + +-- Test that gen_ulid() returns a value with the correct format +SELECT 'gen_ulid() returns correctly formatted value' as test_name, + length(gen_ulid()::text) = 26 as result; + test_name | result +----------------------------------------------+-------- + gen_ulid() returns correctly formatted value | t +(1 row) + +-- Test monotonic ULID generation +SELECT 'gen_monotonic_ulid() returns a non-null value' as test_name, + gen_monotonic_ulid() IS NOT NULL as result; + test_name | result +-----------------------------------------------+-------- + gen_monotonic_ulid() returns a non-null value | t +(1 row) + +-- Test that multiple calls to gen_monotonic_ulid() return different values +SELECT 'gen_monotonic_ulid() returns unique values' as test_name, + gen_monotonic_ulid() != gen_monotonic_ulid() as result; + test_name | result +--------------------------------------------+-------- + gen_monotonic_ulid() 
returns unique values | t +(1 row) + +-- Test that gen_monotonic_ulid() returns a value with the correct format +SELECT 'gen_monotonic_ulid() returns correctly formatted value' as test_name, + length(gen_monotonic_ulid()::text) = 26 as result; + test_name | result +--------------------------------------------------------+-------- + gen_monotonic_ulid() returns correctly formatted value | t +(1 row) + +-- Test that monotonic ULIDs are ordered correctly +SELECT 'gen_monotonic_ulid() returns ordered values' as test_name, + u1 < u2 as result +FROM ( + SELECT gen_monotonic_ulid() as u1, gen_monotonic_ulid() as u2 +) subq; + test_name | result +---------------------------------------------+-------- + gen_monotonic_ulid() returns ordered values | t +(1 row) + diff --git a/docker-compose/ext-src/pgx_ulid-src/expected/01_ulid_conversions.out b/docker-compose/ext-src/pgx_ulid-src/expected/01_ulid_conversions.out new file mode 100644 index 0000000000..19474ccca1 --- /dev/null +++ b/docker-compose/ext-src/pgx_ulid-src/expected/01_ulid_conversions.out @@ -0,0 +1,55 @@ +-- Create a test ULID value +CREATE TEMP TABLE test_ulids AS +SELECT '01GV5PA9EQG7D82Q3Y4PKBZSYV'::ulid as test_ulid; +-- Test conversion to text +SELECT 'ulid to text conversion' as test_name, + test_ulid::text = '01GV5PA9EQG7D82Q3Y4PKBZSYV' as result +FROM test_ulids; + test_name | result +-------------------------+-------- + ulid to text conversion | t +(1 row) + +-- Test conversion to UUID +SELECT 'ulid to UUID conversion' as test_name, + test_ulid::uuid::text = '0186cb65-25d7-81da-815c-7e25a6bfe7db' as result +FROM test_ulids; + test_name | result +-------------------------+-------- + ulid to UUID conversion | t +(1 row) + +-- Test conversion to bytea +SELECT 'ulid to bytea conversion' as test_name, + length(test_ulid::bytea) = 16 as result +FROM test_ulids; + test_name | result +--------------------------+-------- + ulid to bytea conversion | t +(1 row) + +-- Test conversion to timestamp +SELECT 'ulid to 
timestamp conversion' as test_name, + to_char(test_ulid::timestamp, 'YYYY-MM-DD HH24:MI:SS.MS') = '2023-03-10 04:00:49.111' as result +FROM test_ulids; + test_name | result +------------------------------+-------- + ulid to timestamp conversion | t +(1 row) + +-- Test conversion from UUID +SELECT 'UUID to ulid conversion' as test_name, + '0186cb65-25d7-81da-815c-7e25a6bfe7db'::uuid::ulid::text = '01GV5PA9EQG7D82Q3Y4PKBZSYV' as result; + test_name | result +-------------------------+-------- + UUID to ulid conversion | t +(1 row) + +-- Test conversion from timestamp +SELECT 'timestamp to ulid conversion' as test_name, + '2023-03-10 12:00:49.111'::timestamp::ulid::text = '01GV5PA9EQ0000000000000000' as result; + test_name | result +------------------------------+-------- + timestamp to ulid conversion | t +(1 row) + diff --git a/docker-compose/ext-src/pgx_ulid-src/expected/02_ulid_conversions.out b/docker-compose/ext-src/pgx_ulid-src/expected/02_ulid_conversions.out new file mode 100644 index 0000000000..d1480f207c --- /dev/null +++ b/docker-compose/ext-src/pgx_ulid-src/expected/02_ulid_conversions.out @@ -0,0 +1,8 @@ +-- Test conversion from timestamptz +SELECT 'timestamptz to ulid conversion' as test_name, + '2023-03-10 04:00:49.111'::timestamptz::ulid::text = '01GV5PA9EQ0000000000000000' as result; + test_name | result +--------------------------------+-------- + timestamptz to ulid conversion | t +(1 row) + diff --git a/docker-compose/ext-src/pgx_ulid-src/expected/03_ulid_errors.out b/docker-compose/ext-src/pgx_ulid-src/expected/03_ulid_errors.out new file mode 100644 index 0000000000..6d5dd99298 --- /dev/null +++ b/docker-compose/ext-src/pgx_ulid-src/expected/03_ulid_errors.out @@ -0,0 +1,19 @@ +-- Test ULID error handling +-- Test invalid ULID string (too short) +SELECT '01GV5PA9EQG7D82Q3Y4PKBZSY'::ulid; +ERROR: invalid input syntax for type ulid: "01GV5PA9EQG7D82Q3Y4PKBZSY": invalid length +LINE 1: SELECT '01GV5PA9EQG7D82Q3Y4PKBZSY'::ulid; + ^ +-- Test invalid 
ULID string (invalid character) +SELECT '01GV5PA9EQG7D82Q3Y4PKBZSYU'::ulid; +ERROR: invalid input syntax for type ulid: "01GV5PA9EQG7D82Q3Y4PKBZSYU": invalid character +LINE 1: SELECT '01GV5PA9EQG7D82Q3Y4PKBZSYU'::ulid; + ^ +-- Test NULL handling +SELECT 'NULL to ulid conversion returns NULL' as test_name, + NULL::ulid IS NULL as result; + test_name | result +--------------------------------------+-------- + NULL to ulid conversion returns NULL | t +(1 row) + diff --git a/docker-compose/ext-src/pgx_ulid-src/sql/00_ulid_generation.sql b/docker-compose/ext-src/pgx_ulid-src/sql/00_ulid_generation.sql new file mode 100644 index 0000000000..8b110b1cf0 --- /dev/null +++ b/docker-compose/ext-src/pgx_ulid-src/sql/00_ulid_generation.sql @@ -0,0 +1,32 @@ +-- Test basic ULID generation + +-- Test gen_ulid() function +SELECT 'gen_ulid() returns a non-null value' as test_name, + gen_ulid() IS NOT NULL as result; + +-- Test that multiple calls to gen_ulid() return different values +SELECT 'gen_ulid() returns unique values' as test_name, + gen_ulid() != gen_ulid() as result; + +-- Test that gen_ulid() returns a value with the correct format +SELECT 'gen_ulid() returns correctly formatted value' as test_name, + length(gen_ulid()::text) = 26 as result; + +-- Test monotonic ULID generation +SELECT 'gen_monotonic_ulid() returns a non-null value' as test_name, + gen_monotonic_ulid() IS NOT NULL as result; + +-- Test that multiple calls to gen_monotonic_ulid() return different values +SELECT 'gen_monotonic_ulid() returns unique values' as test_name, + gen_monotonic_ulid() != gen_monotonic_ulid() as result; + +-- Test that gen_monotonic_ulid() returns a value with the correct format +SELECT 'gen_monotonic_ulid() returns correctly formatted value' as test_name, + length(gen_monotonic_ulid()::text) = 26 as result; + +-- Test that monotonic ULIDs are ordered correctly +SELECT 'gen_monotonic_ulid() returns ordered values' as test_name, + u1 < u2 as result +FROM ( + SELECT 
gen_monotonic_ulid() as u1, gen_monotonic_ulid() as u2 +) subq; diff --git a/docker-compose/ext-src/pgx_ulid-src/sql/01_ulid_conversions.sql b/docker-compose/ext-src/pgx_ulid-src/sql/01_ulid_conversions.sql new file mode 100644 index 0000000000..1ff2d60372 --- /dev/null +++ b/docker-compose/ext-src/pgx_ulid-src/sql/01_ulid_conversions.sql @@ -0,0 +1,32 @@ +-- Create a test ULID value +CREATE TEMP TABLE test_ulids AS +SELECT '01GV5PA9EQG7D82Q3Y4PKBZSYV'::ulid as test_ulid; + +-- Test conversion to text +SELECT 'ulid to text conversion' as test_name, + test_ulid::text = '01GV5PA9EQG7D82Q3Y4PKBZSYV' as result +FROM test_ulids; + +-- Test conversion to UUID +SELECT 'ulid to UUID conversion' as test_name, + test_ulid::uuid::text = '0186cb65-25d7-81da-815c-7e25a6bfe7db' as result +FROM test_ulids; + +-- Test conversion to bytea +SELECT 'ulid to bytea conversion' as test_name, + length(test_ulid::bytea) = 16 as result +FROM test_ulids; + +-- Test conversion to timestamp +SELECT 'ulid to timestamp conversion' as test_name, + to_char(test_ulid::timestamp, 'YYYY-MM-DD HH24:MI:SS.MS') = '2023-03-10 04:00:49.111' as result +FROM test_ulids; + +-- Test conversion from UUID +SELECT 'UUID to ulid conversion' as test_name, + '0186cb65-25d7-81da-815c-7e25a6bfe7db'::uuid::ulid::text = '01GV5PA9EQG7D82Q3Y4PKBZSYV' as result; + +-- Test conversion from timestamp +SELECT 'timestamp to ulid conversion' as test_name, + '2023-03-10 12:00:49.111'::timestamp::ulid::text = '01GV5PA9EQ0000000000000000' as result; + diff --git a/docker-compose/ext-src/pgx_ulid-src/sql/02_ulid_conversions.sql b/docker-compose/ext-src/pgx_ulid-src/sql/02_ulid_conversions.sql new file mode 100644 index 0000000000..2038512753 --- /dev/null +++ b/docker-compose/ext-src/pgx_ulid-src/sql/02_ulid_conversions.sql @@ -0,0 +1,3 @@ +-- Test conversion from timestamptz +SELECT 'timestamptz to ulid conversion' as test_name, + '2023-03-10 04:00:49.111'::timestamptz::ulid::text = '01GV5PA9EQ0000000000000000' as result; diff 
--git a/docker-compose/ext-src/pgx_ulid-src/sql/03_ulid_errors.sql b/docker-compose/ext-src/pgx_ulid-src/sql/03_ulid_errors.sql new file mode 100644 index 0000000000..44dc07d309 --- /dev/null +++ b/docker-compose/ext-src/pgx_ulid-src/sql/03_ulid_errors.sql @@ -0,0 +1,12 @@ +-- Test ULID error handling + +-- Test invalid ULID string (too short) +SELECT '01GV5PA9EQG7D82Q3Y4PKBZSY'::ulid; + +-- Test invalid ULID string (invalid character) +SELECT '01GV5PA9EQG7D82Q3Y4PKBZSYU'::ulid; + +-- Test NULL handling +SELECT 'NULL to ulid conversion returns NULL' as test_name, + NULL::ulid IS NULL as result; + diff --git a/docker-compose/ext-src/rag_bge_small_en_v15-src/Makefile b/docker-compose/ext-src/rag_bge_small_en_v15-src/Makefile new file mode 100644 index 0000000000..de39cdc367 --- /dev/null +++ b/docker-compose/ext-src/rag_bge_small_en_v15-src/Makefile @@ -0,0 +1,10 @@ +EXTENSION = rag_bge_small_en_v15 +MODULE_big = rag_bge_small_en_v15 +OBJS = $(patsubst %.rs,%.o,$(wildcard src/*.rs)) + +REGRESS = basic_functions embedding_functions basic_functions_enhanced embedding_functions_enhanced +REGRESS_OPTS = --load-extension=vector --load-extension=rag_bge_small_en_v15 + +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) diff --git a/docker-compose/ext-src/rag_bge_small_en_v15-src/expected/basic_functions.out b/docker-compose/ext-src/rag_bge_small_en_v15-src/expected/basic_functions.out new file mode 100644 index 0000000000..17194b79a5 --- /dev/null +++ b/docker-compose/ext-src/rag_bge_small_en_v15-src/expected/basic_functions.out @@ -0,0 +1,7 @@ +-- Basic function tests +SELECT rag_bge_small_en_v15.chunks_by_token_count('the cat sat on the mat', 3, 2); + chunks_by_token_count +-------------------------------------------------------- + {"the cat sat","cat sat on","sat on the","on the mat"} +(1 row) + diff --git a/docker-compose/ext-src/rag_bge_small_en_v15-src/expected/basic_functions_enhanced.out 
b/docker-compose/ext-src/rag_bge_small_en_v15-src/expected/basic_functions_enhanced.out new file mode 100644 index 0000000000..f191aad5db --- /dev/null +++ b/docker-compose/ext-src/rag_bge_small_en_v15-src/expected/basic_functions_enhanced.out @@ -0,0 +1,31 @@ +-- Basic function tests for chunks_by_token_count +SELECT rag_bge_small_en_v15.chunks_by_token_count('the cat sat on the mat', 3, 2); + chunks_by_token_count +-------------------------------------------------------- + {"the cat sat","cat sat on","sat on the","on the mat"} +(1 row) + +SELECT rag_bge_small_en_v15.chunks_by_token_count('Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.', 5, 2); + chunks_by_token_count +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + {"Lorem ipsum","ipsum dolor sit","sit amet,",consectetur,"adipiscing elit",elit.,"Sed do","do eiusmod",tempor,"incididunt ut","ut labore et","et dolore magna","magna aliqua."} +(1 row) + +SELECT (rag_bge_small_en_v15.chunks_by_token_count('the cat', 5, 0))[1]; + chunks_by_token_count +----------------------- + the cat +(1 row) + +SELECT rag_bge_small_en_v15.chunks_by_token_count('', 5, 2); + chunks_by_token_count +----------------------- + {} +(1 row) + +SELECT rag_bge_small_en_v15.chunks_by_token_count('a b c d e f g h i j k l m n o p', 3, 1); + chunks_by_token_count +----------------------------------------------------------------- + {"a b c","c d e","e f g","g h i","i j k","k l m","m n o","o p"} +(1 row) + diff --git a/docker-compose/ext-src/rag_bge_small_en_v15-src/expected/embedding_functions.out b/docker-compose/ext-src/rag_bge_small_en_v15-src/expected/embedding_functions.out new file mode 100644 index 0000000000..034e41bd47 --- /dev/null +++ b/docker-compose/ext-src/rag_bge_small_en_v15-src/expected/embedding_functions.out @@ -0,0 +1,15 @@ 
+-- Embedding function tests +SELECT 'embedding_for_passage_test' AS test_name, + vector_dims(rag_bge_small_en_v15.embedding_for_passage('the cat sat on the mat')) > 0 AS result; + test_name | result +----------------------------+-------- + embedding_for_passage_test | t +(1 row) + +SELECT 'embedding_for_query_test' AS test_name, + vector_dims(rag_bge_small_en_v15.embedding_for_query('the cat sat on the mat')) > 0 AS result; + test_name | result +--------------------------+-------- + embedding_for_query_test | t +(1 row) + diff --git a/docker-compose/ext-src/rag_bge_small_en_v15-src/expected/embedding_functions_enhanced.out b/docker-compose/ext-src/rag_bge_small_en_v15-src/expected/embedding_functions_enhanced.out new file mode 100644 index 0000000000..1fdcdf4e42 --- /dev/null +++ b/docker-compose/ext-src/rag_bge_small_en_v15-src/expected/embedding_functions_enhanced.out @@ -0,0 +1,52 @@ +-- Embedding function tests +SELECT 'embedding_for_passage_test_1' AS test_name, + vector_dims(rag_bge_small_en_v15.embedding_for_passage('the cat sat on the mat')) > 0 AS result; + test_name | result +------------------------------+-------- + embedding_for_passage_test_1 | t +(1 row) + +SELECT 'embedding_for_passage_test_2' AS test_name, + vector_dims(rag_bge_small_en_v15.embedding_for_passage('Lorem ipsum dolor sit amet')) > 0 AS result; + test_name | result +------------------------------+-------- + embedding_for_passage_test_2 | t +(1 row) + +SELECT 'embedding_for_passage_test_3' AS test_name, + vector_dims(rag_bge_small_en_v15.embedding_for_passage('')) > 0 AS result; + test_name | result +------------------------------+-------- + embedding_for_passage_test_3 | t +(1 row) + +SELECT 'embedding_for_query_test_1' AS test_name, + vector_dims(rag_bge_small_en_v15.embedding_for_query('the cat sat on the mat')) > 0 AS result; + test_name | result +----------------------------+-------- + embedding_for_query_test_1 | t +(1 row) + +SELECT 'embedding_for_query_test_2' AS test_name, + 
vector_dims(rag_bge_small_en_v15.embedding_for_query('Lorem ipsum dolor sit amet')) > 0 AS result; + test_name | result +----------------------------+-------- + embedding_for_query_test_2 | t +(1 row) + +SELECT 'embedding_for_query_test_3' AS test_name, + vector_dims(rag_bge_small_en_v15.embedding_for_query('')) > 0 AS result; + test_name | result +----------------------------+-------- + embedding_for_query_test_3 | t +(1 row) + +-- Test that passage and query embeddings have the same dimensions +SELECT 'embedding_dimensions_match' AS test_name, + vector_dims(rag_bge_small_en_v15.embedding_for_passage('test')) = + vector_dims(rag_bge_small_en_v15.embedding_for_query('test')) AS result; + test_name | result +----------------------------+-------- + embedding_dimensions_match | t +(1 row) + diff --git a/docker-compose/ext-src/rag_bge_small_en_v15-src/sql/basic_functions.sql b/docker-compose/ext-src/rag_bge_small_en_v15-src/sql/basic_functions.sql new file mode 100644 index 0000000000..f60207e074 --- /dev/null +++ b/docker-compose/ext-src/rag_bge_small_en_v15-src/sql/basic_functions.sql @@ -0,0 +1,2 @@ +-- Basic function tests +SELECT rag_bge_small_en_v15.chunks_by_token_count('the cat sat on the mat', 3, 2); diff --git a/docker-compose/ext-src/rag_bge_small_en_v15-src/sql/basic_functions_enhanced.sql b/docker-compose/ext-src/rag_bge_small_en_v15-src/sql/basic_functions_enhanced.sql new file mode 100644 index 0000000000..f2089cecec --- /dev/null +++ b/docker-compose/ext-src/rag_bge_small_en_v15-src/sql/basic_functions_enhanced.sql @@ -0,0 +1,10 @@ +-- Basic function tests for chunks_by_token_count +SELECT rag_bge_small_en_v15.chunks_by_token_count('the cat sat on the mat', 3, 2); + +SELECT rag_bge_small_en_v15.chunks_by_token_count('Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.', 5, 2); + +SELECT (rag_bge_small_en_v15.chunks_by_token_count('the cat', 5, 0))[1]; + +SELECT rag_bge_small_en_v15.chunks_by_token_count('', 5, 2); + +SELECT rag_bge_small_en_v15.chunks_by_token_count('a b c d e f g h i j k l m n o p', 3, 1); diff --git a/docker-compose/ext-src/rag_bge_small_en_v15-src/sql/embedding_functions.sql b/docker-compose/ext-src/rag_bge_small_en_v15-src/sql/embedding_functions.sql new file mode 100644 index 0000000000..ef9dedd9d7 --- /dev/null +++ b/docker-compose/ext-src/rag_bge_small_en_v15-src/sql/embedding_functions.sql @@ -0,0 +1,6 @@ +-- Embedding function tests +SELECT 'embedding_for_passage_test' AS test_name, + vector_dims(rag_bge_small_en_v15.embedding_for_passage('the cat sat on the mat')) > 0 AS result; + +SELECT 'embedding_for_query_test' AS test_name, + vector_dims(rag_bge_small_en_v15.embedding_for_query('the cat sat on the mat')) > 0 AS result; diff --git a/docker-compose/ext-src/rag_bge_small_en_v15-src/sql/embedding_functions_enhanced.sql b/docker-compose/ext-src/rag_bge_small_en_v15-src/sql/embedding_functions_enhanced.sql new file mode 100644 index 0000000000..0ca5d28111 --- /dev/null +++ b/docker-compose/ext-src/rag_bge_small_en_v15-src/sql/embedding_functions_enhanced.sql @@ -0,0 +1,23 @@ +-- Embedding function tests +SELECT 'embedding_for_passage_test_1' AS test_name, + vector_dims(rag_bge_small_en_v15.embedding_for_passage('the cat sat on the mat')) > 0 AS result; + +SELECT 'embedding_for_passage_test_2' AS test_name, + vector_dims(rag_bge_small_en_v15.embedding_for_passage('Lorem ipsum dolor sit amet')) > 0 AS result; + +SELECT 'embedding_for_passage_test_3' AS test_name, + vector_dims(rag_bge_small_en_v15.embedding_for_passage('')) > 0 AS result; + +SELECT 'embedding_for_query_test_1' AS test_name, + vector_dims(rag_bge_small_en_v15.embedding_for_query('the cat sat on the mat')) > 0 AS result; + +SELECT 'embedding_for_query_test_2' AS test_name, + 
vector_dims(rag_bge_small_en_v15.embedding_for_query('Lorem ipsum dolor sit amet')) > 0 AS result; + +SELECT 'embedding_for_query_test_3' AS test_name, + vector_dims(rag_bge_small_en_v15.embedding_for_query('')) > 0 AS result; + +-- Test that passage and query embeddings have the same dimensions +SELECT 'embedding_dimensions_match' AS test_name, + vector_dims(rag_bge_small_en_v15.embedding_for_passage('test')) = + vector_dims(rag_bge_small_en_v15.embedding_for_query('test')) AS result; diff --git a/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/Makefile b/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/Makefile new file mode 100644 index 0000000000..6067debf56 --- /dev/null +++ b/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/Makefile @@ -0,0 +1,10 @@ +EXTENSION = rag_jina_reranker_v1_tiny_en +MODULE_big = rag_jina_reranker_v1_tiny_en +OBJS = $(patsubst %.rs,%.o,$(wildcard src/*.rs)) + +REGRESS = reranking_functions reranking_functions_enhanced +REGRESS_OPTS = --load-extension=vector --load-extension=rag_jina_reranker_v1_tiny_en + +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) diff --git a/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/expected/reranking_functions.out b/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/expected/reranking_functions.out new file mode 100644 index 0000000000..475718ea99 --- /dev/null +++ b/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/expected/reranking_functions.out @@ -0,0 +1,25 @@ +-- Reranking function tests +SELECT rag_jina_reranker_v1_tiny_en.rerank_distance('the cat sat on the mat', 'the baboon played with the balloon'); + rerank_distance +----------------- + 0.8989152 +(1 row) + +SELECT rag_jina_reranker_v1_tiny_en.rerank_distance('the cat sat on the mat', ARRAY['the baboon played with the balloon', 'the tanks fired at the buildings']); + rerank_distance +----------------------- + {0.8989152,1.3018152} +(1 row) + +SELECT 
rag_jina_reranker_v1_tiny_en.rerank_score('the cat sat on the mat', 'the baboon played with the balloon'); + rerank_score +-------------- + -0.8989152 +(1 row) + +SELECT rag_jina_reranker_v1_tiny_en.rerank_score('the cat sat on the mat', ARRAY['the baboon played with the balloon', 'the tanks fired at the buildings']); + rerank_score +------------------------- + {-0.8989152,-1.3018152} +(1 row) + diff --git a/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/expected/reranking_functions_enhanced.out b/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/expected/reranking_functions_enhanced.out new file mode 100644 index 0000000000..b610896fa2 --- /dev/null +++ b/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/expected/reranking_functions_enhanced.out @@ -0,0 +1,92 @@ +-- Reranking function tests - single passage +SELECT rag_jina_reranker_v1_tiny_en.rerank_distance('the cat sat on the mat', 'the baboon played with the balloon'); + rerank_distance +----------------- + 0.8989152 +(1 row) + +SELECT rag_jina_reranker_v1_tiny_en.rerank_distance('the cat sat on the mat', 'the tanks fired at the buildings'); + rerank_distance +----------------- + 1.3018152 +(1 row) + +SELECT rag_jina_reranker_v1_tiny_en.rerank_distance('query about cats', 'information about felines'); + rerank_distance +----------------- + 1.3133051 +(1 row) + +SELECT rag_jina_reranker_v1_tiny_en.rerank_distance('', 'empty query test'); + rerank_distance +----------------- + 0.7075559 +(1 row) + +-- Reranking function tests - array of passages +SELECT rag_jina_reranker_v1_tiny_en.rerank_distance('the cat sat on the mat', + ARRAY['the baboon played with the balloon', 'the tanks fired at the buildings']); + rerank_distance +----------------------- + {0.8989152,1.3018152} +(1 row) + +SELECT rag_jina_reranker_v1_tiny_en.rerank_distance('query about programming', + ARRAY['Python is a programming language', 'Java is also a programming language', 'SQL is used for databases']); + rerank_distance 
+------------------------------------ + {0.16591403,0.33475375,0.10132827} +(1 row) + +SELECT rag_jina_reranker_v1_tiny_en.rerank_distance('empty array test', ARRAY[]::text[]); + rerank_distance +----------------- + {} +(1 row) + +-- Reranking score function tests - single passage +SELECT rag_jina_reranker_v1_tiny_en.rerank_score('the cat sat on the mat', 'the baboon played with the balloon'); + rerank_score +-------------- + -0.8989152 +(1 row) + +SELECT rag_jina_reranker_v1_tiny_en.rerank_score('the cat sat on the mat', 'the tanks fired at the buildings'); + rerank_score +-------------- + -1.3018152 +(1 row) + +SELECT rag_jina_reranker_v1_tiny_en.rerank_score('query about cats', 'information about felines'); + rerank_score +-------------- + -1.3133051 +(1 row) + +SELECT rag_jina_reranker_v1_tiny_en.rerank_score('', 'empty query test'); + rerank_score +-------------- + -0.7075559 +(1 row) + +-- Reranking score function tests - array of passages +SELECT rag_jina_reranker_v1_tiny_en.rerank_score('the cat sat on the mat', + ARRAY['the baboon played with the balloon', 'the tanks fired at the buildings']); + rerank_score +------------------------- + {-0.8989152,-1.3018152} +(1 row) + +SELECT rag_jina_reranker_v1_tiny_en.rerank_score('query about programming', + ARRAY['Python is a programming language', 'Java is also a programming language', 'SQL is used for databases']); + rerank_score +--------------------------------------- + {-0.16591403,-0.33475375,-0.10132827} +(1 row) + +SELECT rag_jina_reranker_v1_tiny_en.rerank_score('empty array test', ARRAY[]::text[]); + rerank_score +-------------- + {} +(1 row) + diff --git a/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/sql/reranking_functions.sql b/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/sql/reranking_functions.sql new file mode 100644 index 0000000000..0837b18ffd --- /dev/null +++ b/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/sql/reranking_functions.sql @@ -0,0 +1,8 @@ +-- Reranking 
function tests +SELECT rag_jina_reranker_v1_tiny_en.rerank_distance('the cat sat on the mat', 'the baboon played with the balloon'); + +SELECT rag_jina_reranker_v1_tiny_en.rerank_distance('the cat sat on the mat', ARRAY['the baboon played with the balloon', 'the tanks fired at the buildings']); + +SELECT rag_jina_reranker_v1_tiny_en.rerank_score('the cat sat on the mat', 'the baboon played with the balloon'); + +SELECT rag_jina_reranker_v1_tiny_en.rerank_score('the cat sat on the mat', ARRAY['the baboon played with the balloon', 'the tanks fired at the buildings']); \ No newline at end of file diff --git a/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/sql/reranking_functions_enhanced.sql b/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/sql/reranking_functions_enhanced.sql new file mode 100644 index 0000000000..b967d9e98e --- /dev/null +++ b/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/sql/reranking_functions_enhanced.sql @@ -0,0 +1,35 @@ +-- Reranking function tests - single passage +SELECT rag_jina_reranker_v1_tiny_en.rerank_distance('the cat sat on the mat', 'the baboon played with the balloon'); + +SELECT rag_jina_reranker_v1_tiny_en.rerank_distance('the cat sat on the mat', 'the tanks fired at the buildings'); + +SELECT rag_jina_reranker_v1_tiny_en.rerank_distance('query about cats', 'information about felines'); + +SELECT rag_jina_reranker_v1_tiny_en.rerank_distance('', 'empty query test'); + +-- Reranking function tests - array of passages +SELECT rag_jina_reranker_v1_tiny_en.rerank_distance('the cat sat on the mat', + ARRAY['the baboon played with the balloon', 'the tanks fired at the buildings']); + +SELECT rag_jina_reranker_v1_tiny_en.rerank_distance('query about programming', + ARRAY['Python is a programming language', 'Java is also a programming language', 'SQL is used for databases']); + +SELECT rag_jina_reranker_v1_tiny_en.rerank_distance('empty array test', ARRAY[]::text[]); + +-- Reranking score function tests - single 
passage +SELECT rag_jina_reranker_v1_tiny_en.rerank_score('the cat sat on the mat', 'the baboon played with the balloon'); + +SELECT rag_jina_reranker_v1_tiny_en.rerank_score('the cat sat on the mat', 'the tanks fired at the buildings'); + +SELECT rag_jina_reranker_v1_tiny_en.rerank_score('query about cats', 'information about felines'); + +SELECT rag_jina_reranker_v1_tiny_en.rerank_score('', 'empty query test'); + +-- Reranking score function tests - array of passages +SELECT rag_jina_reranker_v1_tiny_en.rerank_score('the cat sat on the mat', + ARRAY['the baboon played with the balloon', 'the tanks fired at the buildings']); + +SELECT rag_jina_reranker_v1_tiny_en.rerank_score('query about programming', + ARRAY['Python is a programming language', 'Java is also a programming language', 'SQL is used for databases']); + +SELECT rag_jina_reranker_v1_tiny_en.rerank_score('empty array test', ARRAY[]::text[]); diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index 5fd4080c28..a6e2ac0f34 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -21,6 +21,7 @@ in this repository. - [WAL Redo](./pageserver-walredo.md) - [Page cache](./pageserver-pagecache.md) - [Storage](./pageserver-storage.md) + - [Compaction](./pageserver-compaction.md) - [Processing a GetPage request](./pageserver-processing-getpage.md) - [Processing WAL](./pageserver-processing-wal.md) diff --git a/docs/pageserver-compaction.md b/docs/pageserver-compaction.md new file mode 100644 index 0000000000..6cacb10c9c --- /dev/null +++ b/docs/pageserver-compaction.md @@ -0,0 +1,110 @@ +# Pageserver Compaction + +Lifted from . + +Updated 2025-03-26. + +## Pages and WAL + +Postgres stores data in 8 KB pages, identified by a page number. + +The WAL contains a sequence of page writes: either images (complete page contents) or deltas (patches applied to images). Each write is identified by its byte position in the WAL, aka LSN. + +Each page version is thus identified by page@LSN. Postgres may read pages at past LSNs. 
+ +Pageservers ingest WAL by writing WAL records into a key/value store keyed by page@LSN. + +Pageservers materialize pages for Postgres reads by finding the most recent page image and applying all subsequent page deltas, up to the read LSN. + +## Compaction: Why? + +Pageservers store page@LSN keys in a key/value store using a custom variant of an LSM tree. Each timeline on each tenant shard has its own LSM tree. + +When Pageservers write new page@LSN entries, they are appended unordered to an ephemeral layer file. When the ephemeral layer file exceeds `checkpoint_distance` (default 256 MB), the key/value pairs are sorted by key and written out to a layer file (for efficient lookups). + +As WAL writes continue, more layer files accumulate. + +Reads must search through the layer files to find the page’s image and deltas. The more layer files accumulate, the more layer files reads must search through before they find a page image, aka read amplification. + +Compaction’s job is to: + +- Reduce read amplification by reorganizing and combining layer files. +- Remove old garbage from layer files. + +As part of this, it may combine several page deltas into a single page image where possible. + +## Compaction: How? + +Neon uses a non-standard variant of an LSM tree made up of two levels of layer files: L0 and L1. + +Compaction runs in two phases: L0→L1 compaction, and L1 image compaction. + +L0 contains a stack of L0 layers at decreasing LSN ranges. These have been flushed sequentially from ephemeral layers. Each L0 layer covers the entire page space (page 0 to ~infinity) and the LSN range that was ingested into it. L0 layers are therefore particularly bad for read amp, since every read must search all L0 layers below the read LSN.
For example: + +``` +| Page 0-99 @ LSN 0400-04ff | +| Page 0-99 @ LSN 0300-03ff | +| Page 0-99 @ LSN 0200-02ff | +| Page 0-99 @ LSN 0100-01ff | +| Page 0-99 @ LSN 0000-00ff | +``` + +L0→L1 compaction takes the bottom-most chunk of L0 layer files of between `compaction_threshold` (default 10) and `compaction_upper_limit` (default 20) layers. It uses merge-sort to write out sorted L1 delta layers of size `compaction_target_size` (default 128 MB). + +L1 typically consists of a “bed” of image layers with materialized page images at a specific LSN, and then delta layers of various page/LSN ranges above them with page deltas. For example: + +``` +Delta layers: | 30-84@0310-04ff | +Delta layers: | 10-42@0200-02ff | | 65-92@0174-02aa | +Image layers: | 0-39@0100 | 40-79@0100 | 80-99@0100 | +``` + +L1 image compaction scans across the L1 keyspace at some LSN, materializes page images by reading the image and delta layers below the LSN (via vectored reads), and writes out new sorted image layers of roughly size `compaction_target_size` (default 128 MB) at that LSN. + +Layer files below the new image files’ LSN can be garbage collected when they are no longer needed for PITR. + +Even though the old layer files are not immediately garbage collected, the new image layers help with read amp because reads can stop traversing the layer stack as soon as they encounter a page image. + +## Compaction: When? + +Pageservers run a `compaction_loop` background task for each tenant shard. Every `compaction_period` (default 20 seconds) it will wake up and check if any of the shard’s timelines need compaction. Additionally, L0 layer flushes will eagerly wake the compaction loop if the L0 count exceeds `compaction_threshold` (default 10). + +L0 compaction runs if the number of L0 layers exceeds `compaction_threshold` (default 10). + +L1 image compaction runs across sections of the L1 keyspace that have at least `image_creation_threshold` (default 3) delta layers overlapping image layers. 
+ +At most `CONCURRENT_BACKGROUND_TASKS` (default 3 / 4 * CPUs = 6) background tasks can run concurrently on a Pageserver, including compaction. Further compaction tasks must wait. + +Because L0 layers cause the most read amp (they overlap the entire keyspace and only contain page deltas), they are aggressively compacted down: + +- L0 is compacted down across all tenant timelines before L1 compaction is attempted (`compaction_l0_first`). +- L0 compaction uses a separate concurrency limit of `CONCURRENT_L0_COMPACTION_TASKS` (default 3 / 4 * CPUs = 6) to avoid waiting for other tasks (`compaction_l0_semaphore`). +- If L0 compaction is needed on any tenant timeline, L1 image compaction will yield to start an immediate L0 compaction run (except for compaction run via admin APIs). + +## Backpressure + +With sustained heavy write loads, new L0 layers may be flushed faster than they can be compacted down. This can cause an unbounded buildup of read amplification and compaction debt, which can take hours to resolve even after the writes stop. + +To avoid this and allow compaction to keep up, layer flushes will slow writes down to apply backpressure on the workload: + +- At `l0_flush_delay_threshold` (default 30) L0 layers, layer flushes are delayed by the flush duration, such that they take 2x as long. +- At `l0_flush_stall_threshold` (default disabled) L0 layers, layer flushes stall entirely until the L0 count falls back below the threshold. This is currently disabled because we don’t trust L0 compaction to be responsive enough. + +This backpressure is propagated to the compute by waiting for layer flushes when WAL ingestion rolls the ephemeral layer. 
The compute will significantly slow down WAL writes at: + +- `max_replication_write_lag` (default 500 MB), when Pageserver WAL ingestion lags +- `max_replication_flush_lag` (default 10 GB), when Pageserver L0 flushes lag + +Combined, this means that the compute will backpressure when there are 30 L0 layers (30 * 256 MB = 7.7 GB) and the Pageserver WAL ingestion lags the compute by 500 MB, for a total of ~8 GB L0+ephemeral compaction debt on a single shard. + +Since we only delay L0 flushes by 2x when backpressuring, and haven’t enabled stalls, it is still possible for read amp to increase unbounded if compaction is too slow (although we haven’t seen this in practice). But this is considered better than stalling flushes and causing unavailability for as long as it takes L0 compaction to react, since we don’t trust it to be fast enough — at the expense of continually increasing read latency and CPU usage for this tenant. We should either enable stalls when we have enough confidence in L0 compaction, or scale the flush delay by the number of L0 layers to apply increasing backpressure. + +## Circuit Breaker + +Compaction can fail, often repeatedly. This can happen e.g. due to data corruption, faulty hardware, S3 outages, etc. + +If compaction fails, the compaction loop will naïvely try and fail again almost immediately. It may only fail after doing a significant amount of wasted work, while holding onto the background task semaphore. + +To avoid repeatedly doing wasted work and starving out other compaction jobs, each tenant has a compaction circuit breaker. After 5 repeated compaction failures, the circuit breaker trips and disables compaction for the next 24 hours, before resetting the breaker and trying again. This disables compaction across all tenant timelines (faulty or not). + +Disabling compaction for a long time is dangerous, since it can lead to unbounded read amp and compaction debt, and continuous workload backpressure. 
However, continually failing would not help either. Tripped circuit breakers trigger an alert and must be investigated promptly. \ No newline at end of file diff --git a/docs/rfcs/2025-02-14-storage-controller.md b/docs/rfcs/2025-02-14-storage-controller.md new file mode 100644 index 0000000000..a772fee209 --- /dev/null +++ b/docs/rfcs/2025-02-14-storage-controller.md @@ -0,0 +1,196 @@ + +## Summary + +This is a retrospective RFC to document the design of the `storage-controller` service. + +This service manages the physical mapping of Tenants and Timelines to Pageservers and Safekeepers. It +acts as the API for "storage" as an abstract concept: enabling other parts of the system to reason +about things like creating/deleting tenants and timelines without having to understand exactly which +pageserver and safekeeper to communicate with, or any subtle rules about how to orchestrate these things. + +The storage controller was implemented in the first half of 2024 as an essential part +of storage sharding, especially [shard splitting](032-shard-splitting.md). + +It initially managed only pageservers, but has extended in 2025 to also manage safekeepers. In +some places you may see unqualified references to 'nodes' -- those are pageservers. + +## Design Choices + +### Durability + +We rely on an external postgres for all durable state. No local storage is used. + +We avoid any unnecessary I/O to durable storage. For example: +- most tracking of in-flight changes to the system is done in-memory rather than recording progress/steps in a database +- When migrating tenant shards between pageservers we only touch the database to increment generation numbers, + we do not persist the total state of a tenant shard. + +Being frugal with database I/O has two benefits: +- It avoids the database becoming a practical scaling bottleneck (we expect in-memory scale issues to be hit + before we hit e.g.
transactions-per-second issues) +- It reduces cost when using a cloud database service to run the controller's postgres database. + +The trade-off is that there is a "bootstrapping" problem: a controller can't be deployed in isolation, one +must first have some existing database system. In practice, we expect that Neon is deployed in one of the +following ways: +- into a cloud which has a postgres service that can be used to run the controller +- into a mature on-prem environment that has existing facilities for running databases +- into a test/dev environment where a simple one-node vanilla postgres installation is sufficient + +### Consensus + +The controller does _not_ implement any strong consensus mechanism of its own. Instead: +- Where strong consistency is required (for example, for pageserver generation numbers), this + responsibility is delegated to a transaction in our postgres database. +- Highly available deploys are done using a simple in-database record of what controller instances + are available, distinguished by timestamps, rather than having controllers directly negotiate a leader. + +Avoiding strong consensus among controller processes is a cost saving (we avoid running three controllers +all the time), and simplifies implementation (we do not have to phrase all configuration changes as e.g raft +transactions). + +The trade-off is that under some circumstances a controller with partial network isolation can cause availability +issues in the cluster, by making changes to pageserver state that might disagree with what the "true" active +controller is trying to do. The impact of this is bounded by our `controllers` database table, that enables +a rogue node to eventually realise that it is not the leader and step down. If a rogue node can't reach +the database, then it implicitly stops making progress. 
A rogue controller cannot durably damage the system +because pageserver data and safekeeper configs are protected by generation numbers that are only updated +via postgres transactions (i.e. no controller "trusts itself" to independently make decisions about generations). + +### Scale + +We design for high but not unlimited scale. The memory footprint of each tenant shard is small (~8kB), so +it is realistic to scale up to a million attached shards on a server with modest resources. Tenants in +a detached state (i.e. not active on pageservers) do not need to be managed by storage controller, and can +be relegated from memory to the database. + +Typically, a tenant shard is updated about once a week, when we do a deploy. During deploys, we relocate +a few thousand tenants from each pageserver while it is restarted, so it is extremely rare for the controller +to have to do O(N) work (on all shards at once). + +There are places where we do O(N) work: +- On normal startup, when loading from the database into memory +- On unclean startup (with no handover of observed state from a previous controller), where we will + scan all shards on all pageservers. + +It is important that these locations are written efficiently. At high scale we should still expect runtimes +of the order tens of seconds to complete a storage controller start. + +When the practical scale limit of a single storage controller is reached, just deploy another one with its +own pageservers & safekeepers: each controller+its storage servers should be thought of as a logical cluster +or "cell" of storage. + +# High Level Design + +The storage controller is an in-memory system (i.e. state for all attached +tenants is held in memory _as well as_ being represented in durable postgres storage). + +## Infrastructure + +The storage controller is an async rust binary using tokio. + +The storage controller is built around the `Service` type. 
This implements +all the entry points for the outside world's interaction with the controller (HTTP handlers are mostly thin wrappers of service functions), +and holds most in-memory state (e.g. the list of tenant shards). + +The state is held in a `ServiceInner` wrapped in a RwLock. This monolithic +lock is used to simplify reasoning about code that mutates state: each function that takes a write lock may be thought of as a serializable transaction on the in-memory state. This lock is clearly a bottleneck, but +nevertheless is scalable to managing millions of tenants. + +Persistent state is held in a postgres database, and we use the `diesel` crate to provide database client functionality. All database access is wrapped in the `Persistence` type -- this makes it easy to understand which +code is touching the database. The database is only used when necessary, i.e. for state that cannot be recovered another way. For example, we do not store the secondary pageserver locations of tenant shards in the database, rather we learn these at startup from running pageservers, and/or make scheduling decisions to fill in the gaps. This adds some complexity, but massively reduces the load on the database, and enables running the storage controller with a very cheap postgres instance. + +## Pageserver tenant scheduling & reconciliation + +### Intent & observed state + +Each tenant shard is represented by type `TenantShard`, which has an 'intent' and 'observed' state. Setting the +intent state is called _scheduling_, and doing remote I/O to make observed +state match intent state is called _reconciliation_. + +The `Scheduler` type is responsible for making choices about the intent +state, such as choosing a pageserver for a new tenant shard, or assigning +a replacement pageserver when the original one fails. + +The observed state is updated after tenant reconciliation (see below), and +has the concept of a `None` state for a pageserver, indicating unknown state. 
This is used to ensure that we can safely clean up after we start +but do not finish a remote call to a pageserver, or if a pageserver restarts and we are uncertain of its state. + +### Tenant Reconciliation + +The `Reconciler` type is responsible for updating pageservers to achieve +the intent state. It is instantiated when `Service` determines that a shard requires reconciliation, and owned by a background tokio task that +runs it to completion. Reconciler does not have access to the `Service` state: it is populated with a snapshot of relevant information when constructed, and submits its results to a channel that `Service` consumes +to update the tenant shard's observed state. + +The Reconciler does have access to the database, but only uses it for +a single purpose: updating shards' generation numbers immediately before +attaching them to a pageserver. + +Operations that change a tenant's scheduling will spawn a reconciler if +necessary, and there is also a background loop which checks every shard +for the need to reconcile -- this background loop ensures eventual progress +if some earlier reconciliations failed for some reason. + +The reconciler has a general purpose code path which will attach/detach from pageservers as necessary, and a special case path for live migrations. The live migration case is more common in practice, and is taken whenever the current observed state indicates that we have a healthy attached location to migrate from. This implements live migration as described in the earlier [live migration RFC](028-pageserver-migration.md). + +### Scheduling optimisation + +During the periodic background reconciliation loop, the controller also +performs _scheduling optimisation_. This is the process of looking for +shards that are in sub-optimal locations, and moving them. + +Typically, this means: +- Shards attached outside their preferred AZ (e.g.
after a node failure), to migrate them back to their preferred AZ +- Shards attached on the same pageserver as some other shards in the same + tenant, to migrate them elsewhere (e.g. after a shard split) + +Scheduling optimisation is a multi-step process to ensure graceful cutovers, e.g. by creating new secondary location, waiting for it to +warm up, then cutting over. This is not done as an explicit queue +of operations, but rather by iteratively calling the optimisation +function, which will recognise each intervening state as something +that can generate the next optimisation. + +### Pageserver heartbeats and failure + +The `Heartbeater` type is responsible for detecting when a pageserver +becomes unavailable. This is fed back into `Service` for action: when +a pageserver is marked unavailable, tenant shards on that pageserver are +rescheduled and Reconcilers are spawned to cut them over to their new location. + +## Pageserver timeline CRUD operations + +By CRUD operations, we mean creating and deleting timelines. The authoritative storage for which timelines exist on the pageserver +is in S3, and is governed by the pageserver's system of generation +numbers. Because a shard can be attached to multiple pageservers +concurrently, we need to handle this when doing timeline CRUD operations: +- A timeline operation is only persistent if _after_ the ack from a pageserver, that pageserver's generation is still the latest. +- For deletions in particular, they are only persistent if _all_ attached + locations have acked the deletion operation, since if only the latest one + has acked then the timeline could still return from the dead if some old-generation attachment writes an index for it. 
+ +## Zero-downtime controller deployments + +When two storage controllers run at the same time, they coordinate via +the database to establish one leader, and the other controller may proxy +requests to this leader + +See [Storage controller restarts RFC](037-storage-controller-restarts.md). + +Note that this is not a strong consensus mechanism: the controller must also survive split-brain situations. This is respected by code that +e.g. increments version numbers, which uses database transactions that +check the expected value before modifying it. A split-brain situation can +impact availability (e.g. if two controllers are fighting over where to +attach a shard), but it should never impact durability and data integrity. + +## Graceful drain & fill of pageservers during deploys + +The storage controller has functionality for draining + filling pageservers +while deploying new pageserver binaries, so that clients are not actively +using a pageserver while it restarts. + +See [Graceful restarts RFC](033-storage-controller-drain-and-fill.md) + +## Safekeeper timeline scheduling + +This is currently under development, see [Safekeeper dynamic membership change RFC](035-safekeeper-dynamic-membership-change.md). \ No newline at end of file diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 11615b73a1..cff1f4c89a 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -275,6 +275,18 @@ pub enum ComputeMode { Replica, } +impl ComputeMode { + /// Convert the compute mode to a string that can be used to identify the type of compute, + /// which means that if it's a static compute, the LSN will not be included. 
+ pub fn to_type_str(&self) -> &'static str { + match self { + ComputeMode::Primary => "primary", + ComputeMode::Static(_) => "static", + ComputeMode::Replica => "replica", + } + } +} + /// Log level for audit logging /// Disabled, log, hipaa /// Default is Disabled diff --git a/libs/http-utils/Cargo.toml b/libs/http-utils/Cargo.toml index 331ae4a9b8..6d24ee352a 100644 --- a/libs/http-utils/Cargo.toml +++ b/libs/http-utils/Cargo.toml @@ -6,6 +6,7 @@ license.workspace = true [dependencies] anyhow.workspace = true +arc-swap.workspace = true bytes.workspace = true camino.workspace = true fail.workspace = true @@ -18,14 +19,15 @@ pprof.workspace = true regex.workspace = true routerify.workspace = true rustls-pemfile.workspace = true -serde.workspace = true +rustls.workspace = true serde_json.workspace = true serde_path_to_error.workspace = true +serde.workspace = true thiserror.workspace = true -tracing.workspace = true -tokio.workspace = true tokio-rustls.workspace = true tokio-util.workspace = true +tokio.workspace = true +tracing.workspace = true url.workspace = true uuid.workspace = true diff --git a/libs/http-utils/src/server.rs b/libs/http-utils/src/server.rs index 33e4915e99..07fd56ac01 100644 --- a/libs/http-utils/src/server.rs +++ b/libs/http-utils/src/server.rs @@ -91,14 +91,14 @@ impl Server { Ok(tls_stream) => tls_stream, Err(err) => { if !suppress_io_error(&err) { - info!("Failed to accept TLS connection: {err:#}"); + info!(%remote_addr, "Failed to accept TLS connection: {err:#}"); } return; } }; if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await { if !suppress_hyper_error(&err) { - info!("Failed to serve HTTPS connection: {err:#}"); + info!(%remote_addr, "Failed to serve HTTPS connection: {err:#}"); } } } @@ -106,7 +106,7 @@ impl Server { // Handle HTTP connection. 
if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await { if !suppress_hyper_error(&err) { - info!("Failed to serve HTTP connection: {err:#}"); + info!(%remote_addr, "Failed to serve HTTP connection: {err:#}"); } } } diff --git a/libs/http-utils/src/tls_certs.rs b/libs/http-utils/src/tls_certs.rs index db9ec825ed..0c18d84d98 100644 --- a/libs/http-utils/src/tls_certs.rs +++ b/libs/http-utils/src/tls_certs.rs @@ -1,21 +1,124 @@ +use std::{sync::Arc, time::Duration}; + +use anyhow::Context; +use arc_swap::ArcSwap; use camino::Utf8Path; -use tokio_rustls::rustls::pki_types::{CertificateDer, PrivateKeyDer}; +use rustls::{ + pki_types::{CertificateDer, PrivateKeyDer}, + server::{ClientHello, ResolvesServerCert}, + sign::CertifiedKey, +}; -pub fn load_cert_chain(filename: &Utf8Path) -> anyhow::Result>> { - let file = std::fs::File::open(filename)?; - let mut reader = std::io::BufReader::new(file); +pub async fn load_cert_chain(filename: &Utf8Path) -> anyhow::Result>> { + let cert_data = tokio::fs::read(filename) + .await + .context(format!("failed reading certificate file {filename:?}"))?; + let mut reader = std::io::Cursor::new(&cert_data); - Ok(rustls_pemfile::certs(&mut reader).collect::, _>>()?) 
+ let cert_chain = rustls_pemfile::certs(&mut reader) + .collect::, _>>() + .context(format!("failed parsing certificate from file {filename:?}"))?; + + Ok(cert_chain) } -pub fn load_private_key(filename: &Utf8Path) -> anyhow::Result> { - let file = std::fs::File::open(filename)?; - let mut reader = std::io::BufReader::new(file); +pub async fn load_private_key(filename: &Utf8Path) -> anyhow::Result> { + let key_data = tokio::fs::read(filename) + .await + .context(format!("failed reading private key file {filename:?}"))?; + let mut reader = std::io::Cursor::new(&key_data); - let key = rustls_pemfile::private_key(&mut reader)?; + let key = rustls_pemfile::private_key(&mut reader) + .context(format!("failed parsing private key from file {filename:?}"))?; key.ok_or(anyhow::anyhow!( "no private key found in {}", filename.as_str(), )) } + +pub async fn load_certified_key( + key_filename: &Utf8Path, + cert_filename: &Utf8Path, +) -> anyhow::Result { + let cert_chain = load_cert_chain(cert_filename).await?; + let key = load_private_key(key_filename).await?; + + let key = rustls::crypto::ring::default_provider() + .key_provider + .load_private_key(key)?; + + let certified_key = CertifiedKey::new(cert_chain, key); + certified_key.keys_match()?; + Ok(certified_key) +} + +/// Implementation of [`rustls::server::ResolvesServerCert`] which reloads certificates from +/// the disk periodically. +#[derive(Debug)] +pub struct ReloadingCertificateResolver { + certified_key: ArcSwap, +} + +impl ReloadingCertificateResolver { + /// Creates a new Resolver by loading certificate and private key from FS and + /// creating tokio::task to reload them with provided reload_period. 
+ pub async fn new( + key_filename: &Utf8Path, + cert_filename: &Utf8Path, + reload_period: Duration, + ) -> anyhow::Result> { + let this = Arc::new(Self { + certified_key: ArcSwap::from_pointee( + load_certified_key(key_filename, cert_filename).await?, + ), + }); + + tokio::spawn({ + let weak_this = Arc::downgrade(&this); + let key_filename = key_filename.to_owned(); + let cert_filename = cert_filename.to_owned(); + async move { + let start = tokio::time::Instant::now() + reload_period; + let mut interval = tokio::time::interval_at(start, reload_period); + let mut last_reload_failed = false; + loop { + interval.tick().await; + let this = match weak_this.upgrade() { + Some(this) => this, + None => break, // Resolver has been destroyed, exit. + }; + match load_certified_key(&key_filename, &cert_filename).await { + Ok(new_certified_key) => { + if new_certified_key.cert == this.certified_key.load().cert { + tracing::debug!("Certificate has not changed since last reloading"); + } else { + tracing::info!("Certificate has been reloaded"); + this.certified_key.store(Arc::new(new_certified_key)); + } + last_reload_failed = false; + } + Err(err) => { + // Note: Reloading certs may fail if it conflicts with the script updating + // the files at the same time. Warn only if the error is persistent. 
+ if last_reload_failed { + tracing::warn!("Error reloading certificate: {err:#}"); + } else { + tracing::info!("Error reloading certificate: {err:#}"); + } + last_reload_failed = true; + } + } + } + } + }); + + Ok(this) + } +} + +impl ResolvesServerCert for ReloadingCertificateResolver { + fn resolve(&self, _client_hello: ClientHello<'_>) -> Option> { + Some(self.certified_key.load_full()) + } +} diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 87dfdfb5ec..688e9de6e7 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -34,6 +34,7 @@ postgres_backend.workspace = true nix = {workspace = true, optional = true} reqwest.workspace = true rand.workspace = true +tracing-utils.workspace = true [dev-dependencies] bincode.workspace = true diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index b12ef65780..8f56d60a4a 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -51,9 +51,54 @@ pub struct NodeMetadata { /// If there cannot be a static default value because we need to make runtime /// checks to determine the default, make it an `Option` (which defaults to None). /// The runtime check should be done in the consuming crate, i.e., `pageserver`. +/// +/// Unknown fields are silently ignored during deserialization. +/// The alternative, which we used in the past, was to set `deny_unknown_fields`, +/// which fails deserialization, and hence pageserver startup, if there is an unknown field. +/// The reason we don't do that anymore is that it complicates +/// usage of config fields for feature flagging, which we commonly do for +/// region-by-region rollouts. +/// The complications mainly arise because the `pageserver.toml` contents on a +/// prod server have a separate lifecycle from the pageserver binary. 
+/// For instance, `pageserver.toml` contents today are defined in the internal +/// infra repo, and thus introducing a new config field to pageserver and +/// rolling it out to prod servers are separate commits in separate repos +/// that can't be made or rolled back atomically. +/// Rollbacks in particular pose a risk with deny_unknown_fields because +/// the old pageserver binary may reject a new config field, resulting in +/// an outage unless the person doing the pageserver rollback remembers +/// to also revert the commit that added the config field to the +/// `pageserver.toml` templates in the internal infra repo. +/// (A pre-deploy config check would eliminate this risk during rollbacks, +/// cf [here](https://github.com/neondatabase/cloud/issues/24349).) +/// In addition to this compatibility problem during emergency rollbacks, +/// deny_unknown_fields adds further complications when decommissioning a feature +/// flag: with deny_unknown_fields, we can't remove a flag from the [`ConfigToml`] +/// until all prod servers' `pageserver.toml` files have been updated to a version +/// that doesn't specify the flag. Otherwise new software would fail to start up. +/// This adds the requirement for an intermediate step where the new config field +/// is accepted but ignored, prolonging the decommissioning process by an entire +/// release cycle. +/// By contrast with unknown fields silently ignored, decommissioning a feature +/// flag is a one-step process: we can skip the intermediate step and straight +/// remove the field from the [`ConfigToml`]. We leave the field in the +/// `pageserver.toml` files on prod servers until we reach certainty that we +/// will not roll back to old software whose behavior was dependent on config. +/// Then we can remove the field from the templates in the internal infra repo. +/// This process is [documented internally]( +/// https://docs.neon.build/storage/pageserver_configuration.html).
+/// +/// Note that the above relaxed compatibility for the config format does NOT APPLY +/// TO THE STORAGE FORMAT. As general guidance, when introducing storage format +/// changes, ensure that the potential rollback target version will be compatible +/// with the new format. This must hold regardless of what flags are set in the `pageserver.toml`: +/// any format version that exists in an environment must be compatible with the software that runs there. +/// Use a pageserver.toml flag only to gate whether software _writes_ the new format. +/// For more compatibility considerations, refer to [internal docs]( +/// https://docs.neon.build/storage/compat.html?highlight=compat#format-versions--compatibility) #[serde_as] #[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] -#[serde(default, deny_unknown_fields)] +#[serde(default)] pub struct ConfigToml { // types mapped 1:1 into the runtime PageServerConfig type pub listen_pg_addr: String, @@ -61,6 +106,9 @@ pub struct ConfigToml { pub listen_https_addr: Option, pub ssl_key_file: Utf8PathBuf, pub ssl_cert_file: Utf8PathBuf, + #[serde(with = "humantime_serde")] + pub ssl_cert_reload_period: Duration, + pub ssl_ca_file: Option, pub availability_zone: Option, #[serde(with = "humantime_serde")] pub wait_lsn_timeout: Duration, @@ -131,10 +179,10 @@ pub struct ConfigToml { pub load_previous_heatmap: Option, #[serde(skip_serializing_if = "Option::is_none")] pub generate_unarchival_heatmap: Option, + pub tracing: Option, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] -#[serde(deny_unknown_fields)] pub struct DiskUsageEvictionTaskConfig { pub max_usage_pct: utils::serde_percent::Percent, pub min_avail_bytes: u64, @@ -149,13 +197,11 @@ pub struct DiskUsageEvictionTaskConfig { #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[serde(tag = "mode", rename_all = "kebab-case")] -#[serde(deny_unknown_fields)] pub enum PageServicePipeliningConfig { Serial,
Pipelined(PageServicePipeliningConfigPipelined), } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] -#[serde(deny_unknown_fields)] pub struct PageServicePipeliningConfigPipelined { /// Causes runtime errors if larger than max get_vectored batch size. pub max_batch_size: NonZeroUsize, @@ -171,7 +217,6 @@ pub enum PageServiceProtocolPipelinedExecutionStrategy { #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[serde(tag = "mode", rename_all = "kebab-case")] -#[serde(deny_unknown_fields)] pub enum GetVectoredConcurrentIo { /// The read path is fully sequential: layers are visited /// one after the other and IOs are issued and waited upon @@ -188,6 +233,54 @@ pub enum GetVectoredConcurrentIo { SidecarTask, } +#[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct Ratio { + pub numerator: usize, + pub denominator: usize, +} + +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct OtelExporterConfig { + pub endpoint: String, + pub protocol: OtelExporterProtocol, + #[serde(with = "humantime_serde")] + pub timeout: Duration, +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(rename_all = "kebab-case")] +pub enum OtelExporterProtocol { + Grpc, + HttpBinary, + HttpJson, +} + +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct Tracing { + pub sampling_ratio: Ratio, + pub export_config: OtelExporterConfig, +} + +impl From<&OtelExporterConfig> for tracing_utils::ExportConfig { + fn from(val: &OtelExporterConfig) -> Self { + tracing_utils::ExportConfig { + endpoint: Some(val.endpoint.clone()), + protocol: val.protocol.into(), + timeout: val.timeout, + } + } +} + +impl From for tracing_utils::Protocol { + fn from(val: OtelExporterProtocol) -> Self { + match val { + OtelExporterProtocol::Grpc => tracing_utils::Protocol::Grpc, + OtelExporterProtocol::HttpJson => 
tracing_utils::Protocol::HttpJson, + OtelExporterProtocol::HttpBinary => tracing_utils::Protocol::HttpBinary, + } + } +} + pub mod statvfs { pub mod mock { #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -240,13 +333,9 @@ impl Default for EvictionOrder { #[serde(transparent)] pub struct MaxVectoredReadBytes(pub NonZeroUsize); -/// A tenant's calcuated configuration, which is the result of merging a -/// tenant's TenantConfOpt with the global TenantConf from PageServerConf. -/// -/// For storing and transmitting individual tenant's configuration, see -/// TenantConfOpt. +/// Tenant-level configuration values, used for various purposes. #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] -#[serde(deny_unknown_fields, default)] +#[serde(default)] pub struct TenantConfigToml { // Flush out an inmemory layer, if it's holding WAL older than this // This puts a backstop on how much WAL needs to be re-digested if the @@ -286,12 +375,6 @@ pub struct TenantConfigToml { /// Level0 delta layer threshold at which to stall layer flushes. Must be >compaction_threshold /// to avoid deadlock. 0 to disable. Disabled by default. pub l0_flush_stall_threshold: Option, - /// If true, Level0 delta layer flushes will wait for S3 upload before flushing the next - /// layer. This is a temporary backpressure mechanism which should be removed once - /// l0_flush_{delay,stall}_threshold is fully enabled. - /// - /// TODO: this is no longer enabled, remove it when the config option is no longer set. - pub l0_flush_wait_upload: bool, // Determines how much history is retained, to allow // branching and read replicas at an older point in time. // The unit is #of bytes of WAL. @@ -374,6 +457,9 @@ pub struct TenantConfigToml { /// The ratio that triggers the auto gc-compaction. If (the total size of layers between L2 LSN and gc-horizon) / (size below the L2 LSN) /// is above this ratio, gc-compaction will be triggered. 
pub gc_compaction_ratio_percent: u64, + /// Tenant level performance sampling ratio override. Controls the ratio of get page requests + /// that will get perf sampling for the tenant. + pub sampling_ratio: Option, } pub mod defaults { @@ -443,6 +529,8 @@ impl Default for ConfigToml { listen_https_addr: (None), ssl_key_file: Utf8PathBuf::from(DEFAULT_SSL_KEY_FILE), ssl_cert_file: Utf8PathBuf::from(DEFAULT_SSL_CERT_FILE), + ssl_cert_reload_period: Duration::from_secs(60), + ssl_ca_file: None, availability_zone: (None), wait_lsn_timeout: (humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT) .expect("cannot parse default wait lsn timeout")), @@ -542,6 +630,7 @@ impl Default for ConfigToml { validate_wal_contiguity: None, load_previous_heatmap: None, generate_unarchival_heatmap: None, + tracing: None, } } } @@ -578,8 +667,6 @@ pub mod tenant_conf_defaults { pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm = crate::models::CompactionAlgorithm::Legacy; - pub const DEFAULT_L0_FLUSH_WAIT_UPLOAD: bool = false; - pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger. 
@@ -626,7 +713,6 @@ impl Default for TenantConfigToml { compaction_l0_semaphore: DEFAULT_COMPACTION_L0_SEMAPHORE, l0_flush_delay_threshold: None, l0_flush_stall_threshold: None, - l0_flush_wait_upload: DEFAULT_L0_FLUSH_WAIT_UPLOAD, gc_horizon: DEFAULT_GC_HORIZON, gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) .expect("cannot parse default gc period"), @@ -660,6 +746,7 @@ impl Default for TenantConfigToml { gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED, gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB, gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT, + sampling_ratio: None, } } } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 4a8f75413c..bdee46f1b1 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -23,6 +23,7 @@ use utils::lsn::Lsn; use utils::postgres_client::PostgresClientProtocol; use utils::{completion, serde_system_time}; +use crate::config::Ratio; use crate::key::{CompactKey, Key}; use crate::reltag::RelTag; use crate::shard::{ShardCount, ShardStripeSize, TenantShardId}; @@ -523,8 +524,6 @@ pub struct TenantConfigPatch { #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub l0_flush_stall_threshold: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] - pub l0_flush_wait_upload: FieldPatch, - #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub gc_horizon: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub gc_period: FieldPatch, @@ -570,68 +569,131 @@ pub struct TenantConfigPatch { pub gc_compaction_initial_threshold_kb: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub gc_compaction_ratio_percent: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub sampling_ratio: FieldPatch>, } -/// An alternative representation of `pageserver::tenant::TenantConf` with -/// simpler types. 
-#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)] +/// Like [`crate::config::TenantConfigToml`], but preserves the information +/// about which parameters are set and which are not. +/// +/// Used in many places, including durably stored ones. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)] +#[serde(default)] // this maps omitted fields in deserialization to None pub struct TenantConfig { + #[serde(skip_serializing_if = "Option::is_none")] pub checkpoint_distance: Option, - #[serde(default)] + + #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] pub checkpoint_timeout: Option, + + #[serde(skip_serializing_if = "Option::is_none")] pub compaction_target_size: Option, - #[serde(default)] + + #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] pub compaction_period: Option, + + #[serde(skip_serializing_if = "Option::is_none")] pub compaction_threshold: Option, + + #[serde(skip_serializing_if = "Option::is_none")] pub compaction_upper_limit: Option, - // defer parsing compaction_algorithm, like eviction_policy + + #[serde(skip_serializing_if = "Option::is_none")] pub compaction_algorithm: Option, + + #[serde(skip_serializing_if = "Option::is_none")] pub compaction_l0_first: Option, + + #[serde(skip_serializing_if = "Option::is_none")] pub compaction_l0_semaphore: Option, + + #[serde(skip_serializing_if = "Option::is_none")] pub l0_flush_delay_threshold: Option, + + #[serde(skip_serializing_if = "Option::is_none")] pub l0_flush_stall_threshold: Option, - pub l0_flush_wait_upload: Option, + + #[serde(skip_serializing_if = "Option::is_none")] pub gc_horizon: Option, - #[serde(default)] + + #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] pub gc_period: Option, + + #[serde(skip_serializing_if = "Option::is_none")] pub image_creation_threshold: Option, - #[serde(default)] + + #[serde(skip_serializing_if = "Option::is_none")] 
#[serde(with = "humantime_serde")] pub pitr_interval: Option, - #[serde(default)] + + #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] pub walreceiver_connect_timeout: Option, - #[serde(default)] + + #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] pub lagging_wal_timeout: Option, + + #[serde(skip_serializing_if = "Option::is_none")] pub max_lsn_wal_lag: Option, + + #[serde(skip_serializing_if = "Option::is_none")] pub eviction_policy: Option, + + #[serde(skip_serializing_if = "Option::is_none")] pub min_resident_size_override: Option, - #[serde(default)] + + #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] pub evictions_low_residence_duration_metric_threshold: Option, - #[serde(default)] + + #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] pub heatmap_period: Option, + + #[serde(skip_serializing_if = "Option::is_none")] pub lazy_slru_download: Option, + + #[serde(skip_serializing_if = "Option::is_none")] pub timeline_get_throttle: Option, + + #[serde(skip_serializing_if = "Option::is_none")] pub image_layer_creation_check_threshold: Option, + + #[serde(skip_serializing_if = "Option::is_none")] pub image_creation_preempt_threshold: Option, - #[serde(default)] + + #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] pub lsn_lease_length: Option, - #[serde(default)] + + #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] pub lsn_lease_length_for_ts: Option, + + #[serde(skip_serializing_if = "Option::is_none")] pub timeline_offloading: Option, + + #[serde(skip_serializing_if = "Option::is_none")] pub wal_receiver_protocol_override: Option, + + #[serde(skip_serializing_if = "Option::is_none")] pub rel_size_v2_enabled: Option, + + #[serde(skip_serializing_if = "Option::is_none")] pub gc_compaction_enabled: Option, + + #[serde(skip_serializing_if = 
"Option::is_none")] pub gc_compaction_initial_threshold_kb: Option, + + #[serde(skip_serializing_if = "Option::is_none")] pub gc_compaction_ratio_percent: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub sampling_ratio: Option>, } impl TenantConfig { @@ -651,7 +713,6 @@ impl TenantConfig { mut compaction_l0_semaphore, mut l0_flush_delay_threshold, mut l0_flush_stall_threshold, - mut l0_flush_wait_upload, mut gc_horizon, mut gc_period, mut image_creation_threshold, @@ -675,6 +736,7 @@ impl TenantConfig { mut gc_compaction_enabled, mut gc_compaction_initial_threshold_kb, mut gc_compaction_ratio_percent, + mut sampling_ratio, } = self; patch.checkpoint_distance.apply(&mut checkpoint_distance); @@ -704,7 +766,6 @@ impl TenantConfig { patch .l0_flush_stall_threshold .apply(&mut l0_flush_stall_threshold); - patch.l0_flush_wait_upload.apply(&mut l0_flush_wait_upload); patch.gc_horizon.apply(&mut gc_horizon); patch .gc_period @@ -770,6 +831,7 @@ impl TenantConfig { patch .gc_compaction_ratio_percent .apply(&mut gc_compaction_ratio_percent); + patch.sampling_ratio.apply(&mut sampling_ratio); Ok(Self { checkpoint_distance, @@ -783,7 +845,6 @@ impl TenantConfig { compaction_l0_semaphore, l0_flush_delay_threshold, l0_flush_stall_threshold, - l0_flush_wait_upload, gc_horizon, gc_period, image_creation_threshold, @@ -807,8 +868,111 @@ impl TenantConfig { gc_compaction_enabled, gc_compaction_initial_threshold_kb, gc_compaction_ratio_percent, + sampling_ratio, }) } + + pub fn merge( + &self, + global_conf: crate::config::TenantConfigToml, + ) -> crate::config::TenantConfigToml { + crate::config::TenantConfigToml { + checkpoint_distance: self + .checkpoint_distance + .unwrap_or(global_conf.checkpoint_distance), + checkpoint_timeout: self + .checkpoint_timeout + .unwrap_or(global_conf.checkpoint_timeout), + compaction_target_size: self + .compaction_target_size + .unwrap_or(global_conf.compaction_target_size), + compaction_period: self + .compaction_period + 
.unwrap_or(global_conf.compaction_period), + compaction_threshold: self + .compaction_threshold + .unwrap_or(global_conf.compaction_threshold), + compaction_upper_limit: self + .compaction_upper_limit + .unwrap_or(global_conf.compaction_upper_limit), + compaction_algorithm: self + .compaction_algorithm + .as_ref() + .unwrap_or(&global_conf.compaction_algorithm) + .clone(), + compaction_l0_first: self + .compaction_l0_first + .unwrap_or(global_conf.compaction_l0_first), + compaction_l0_semaphore: self + .compaction_l0_semaphore + .unwrap_or(global_conf.compaction_l0_semaphore), + l0_flush_delay_threshold: self + .l0_flush_delay_threshold + .or(global_conf.l0_flush_delay_threshold), + l0_flush_stall_threshold: self + .l0_flush_stall_threshold + .or(global_conf.l0_flush_stall_threshold), + gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon), + gc_period: self.gc_period.unwrap_or(global_conf.gc_period), + image_creation_threshold: self + .image_creation_threshold + .unwrap_or(global_conf.image_creation_threshold), + pitr_interval: self.pitr_interval.unwrap_or(global_conf.pitr_interval), + walreceiver_connect_timeout: self + .walreceiver_connect_timeout + .unwrap_or(global_conf.walreceiver_connect_timeout), + lagging_wal_timeout: self + .lagging_wal_timeout + .unwrap_or(global_conf.lagging_wal_timeout), + max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag), + eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy), + min_resident_size_override: self + .min_resident_size_override + .or(global_conf.min_resident_size_override), + evictions_low_residence_duration_metric_threshold: self + .evictions_low_residence_duration_metric_threshold + .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold), + heatmap_period: self.heatmap_period.unwrap_or(global_conf.heatmap_period), + lazy_slru_download: self + .lazy_slru_download + .unwrap_or(global_conf.lazy_slru_download), + timeline_get_throttle: self + 
.timeline_get_throttle + .clone() + .unwrap_or(global_conf.timeline_get_throttle), + image_layer_creation_check_threshold: self + .image_layer_creation_check_threshold + .unwrap_or(global_conf.image_layer_creation_check_threshold), + image_creation_preempt_threshold: self + .image_creation_preempt_threshold + .unwrap_or(global_conf.image_creation_preempt_threshold), + lsn_lease_length: self + .lsn_lease_length + .unwrap_or(global_conf.lsn_lease_length), + lsn_lease_length_for_ts: self + .lsn_lease_length_for_ts + .unwrap_or(global_conf.lsn_lease_length_for_ts), + timeline_offloading: self + .timeline_offloading + .unwrap_or(global_conf.timeline_offloading), + wal_receiver_protocol_override: self + .wal_receiver_protocol_override + .or(global_conf.wal_receiver_protocol_override), + rel_size_v2_enabled: self + .rel_size_v2_enabled + .unwrap_or(global_conf.rel_size_v2_enabled), + gc_compaction_enabled: self + .gc_compaction_enabled + .unwrap_or(global_conf.gc_compaction_enabled), + gc_compaction_initial_threshold_kb: self + .gc_compaction_initial_threshold_kb + .unwrap_or(global_conf.gc_compaction_initial_threshold_kb), + gc_compaction_ratio_percent: self + .gc_compaction_ratio_percent + .unwrap_or(global_conf.gc_compaction_ratio_percent), + sampling_ratio: self.sampling_ratio.unwrap_or(global_conf.sampling_ratio), + } + } } /// The policy for the aux file storage. 
@@ -940,7 +1104,7 @@ pub struct CompactionAlgorithmSettings { } #[derive(Debug, PartialEq, Eq, Clone, Deserialize, Serialize)] -#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] +#[serde(tag = "mode", rename_all = "kebab-case")] pub enum L0FlushConfig { #[serde(rename_all = "snake_case")] Direct { max_concurrency: NonZeroUsize }, @@ -1199,6 +1363,12 @@ pub enum TimelineArchivalState { Unarchived, } +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)] +pub enum TimelineVisibilityState { + Visible, + Invisible, +} + #[derive(Serialize, Deserialize, PartialEq, Eq, Clone)] pub struct TimelineArchivalConfigRequest { pub state: TimelineArchivalState, @@ -1258,11 +1428,6 @@ pub struct TimelineInfo { pub last_record_lsn: Lsn, pub prev_record_lsn: Option, - /// Legacy field, retained for one version to enable old storage controller to - /// decode (it was a mandatory field). - #[serde(default, rename = "latest_gc_cutoff_lsn")] - pub _unused: Lsn, - /// The LSN up to which GC has advanced: older data may still exist but it is not available for clients. /// This LSN is not suitable for deciding where to create branches etc: use [`TimelineInfo::min_readable_lsn`] instead, /// as it is easier to reason about. @@ -1331,6 +1496,9 @@ pub struct TimelineInfo { /// The status of the rel_size migration. pub rel_size_migration: Option, + + /// Whether the timeline is invisible in synthetic size calculations. + pub is_invisible: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/libs/pageserver_api/src/upcall_api.rs b/libs/pageserver_api/src/upcall_api.rs index 647d01c3c2..285ba06056 100644 --- a/libs/pageserver_api/src/upcall_api.rs +++ b/libs/pageserver_api/src/upcall_api.rs @@ -1,4 +1,4 @@ -//! Types in this file are for pageserver's upward-facing API calls to the control plane, +//! Types in this file are for pageserver's upward-facing API calls to the storage controller, //! 
required for acquiring and validating tenant generation numbers. //! //! See docs/rfcs/025-generation-numbers.md diff --git a/libs/proxy/tokio-postgres2/Cargo.toml b/libs/proxy/tokio-postgres2/Cargo.toml index 540876742f..f529e3e00d 100644 --- a/libs/proxy/tokio-postgres2/Cargo.toml +++ b/libs/proxy/tokio-postgres2/Cargo.toml @@ -8,10 +8,9 @@ license = "MIT/Apache-2.0" bytes.workspace = true fallible-iterator.workspace = true futures-util = { workspace = true, features = ["sink"] } -log = "0.4" +tracing.workspace = true parking_lot.workspace = true pin-project-lite.workspace = true -phf = "0.11" postgres-protocol2 = { path = "../postgres-protocol2" } postgres-types2 = { path = "../postgres-types2" } tokio = { workspace = true, features = ["io-util", "time", "net"] } diff --git a/libs/proxy/tokio-postgres2/src/connection.rs b/libs/proxy/tokio-postgres2/src/connection.rs index 60e39b3b44..99d6f3f8e2 100644 --- a/libs/proxy/tokio-postgres2/src/connection.rs +++ b/libs/proxy/tokio-postgres2/src/connection.rs @@ -6,13 +6,13 @@ use std::task::{Context, Poll}; use bytes::BytesMut; use fallible_iterator::FallibleIterator; use futures_util::{Sink, Stream, ready}; -use log::{info, trace}; use postgres_protocol2::message::backend::Message; use postgres_protocol2::message::frontend; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::sync::mpsc; use tokio_util::codec::Framed; use tokio_util::sync::PollSender; +use tracing::{info, trace}; use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec}; use crate::error::DbError; diff --git a/libs/proxy/tokio-postgres2/src/error/sqlstate.rs b/libs/proxy/tokio-postgres2/src/error/sqlstate.rs index 13a1d75f95..7ae557dea8 100644 --- a/libs/proxy/tokio-postgres2/src/error/sqlstate.rs +++ b/libs/proxy/tokio-postgres2/src/error/sqlstate.rs @@ -1,1670 +1,88 @@ -// Autogenerated file - DO NOT EDIT +//! 
Rust repr for /// A SQLSTATE error code #[derive(PartialEq, Eq, Clone, Debug)] -pub struct SqlState(Inner); +pub struct SqlState([u8; 5]); impl SqlState { /// Creates a `SqlState` from its error code. pub fn from_code(s: &str) -> SqlState { - match SQLSTATE_MAP.get(s) { - Some(state) => state.clone(), - None => SqlState(Inner::Other(s.into())), + let mut code = [b'0'; 5]; + if s.len() == 5 { + code.copy_from_slice(s.as_bytes()); } + SqlState(code) } /// Returns the error code corresponding to the `SqlState`. pub fn code(&self) -> &str { - match &self.0 { - Inner::E00000 => "00000", - Inner::E01000 => "01000", - Inner::E0100C => "0100C", - Inner::E01008 => "01008", - Inner::E01003 => "01003", - Inner::E01007 => "01007", - Inner::E01006 => "01006", - Inner::E01004 => "01004", - Inner::E01P01 => "01P01", - Inner::E02000 => "02000", - Inner::E02001 => "02001", - Inner::E03000 => "03000", - Inner::E08000 => "08000", - Inner::E08003 => "08003", - Inner::E08006 => "08006", - Inner::E08001 => "08001", - Inner::E08004 => "08004", - Inner::E08007 => "08007", - Inner::E08P01 => "08P01", - Inner::E09000 => "09000", - Inner::E0A000 => "0A000", - Inner::E0B000 => "0B000", - Inner::E0F000 => "0F000", - Inner::E0F001 => "0F001", - Inner::E0L000 => "0L000", - Inner::E0LP01 => "0LP01", - Inner::E0P000 => "0P000", - Inner::E0Z000 => "0Z000", - Inner::E0Z002 => "0Z002", - Inner::E20000 => "20000", - Inner::E21000 => "21000", - Inner::E22000 => "22000", - Inner::E2202E => "2202E", - Inner::E22021 => "22021", - Inner::E22008 => "22008", - Inner::E22012 => "22012", - Inner::E22005 => "22005", - Inner::E2200B => "2200B", - Inner::E22022 => "22022", - Inner::E22015 => "22015", - Inner::E2201E => "2201E", - Inner::E22014 => "22014", - Inner::E22016 => "22016", - Inner::E2201F => "2201F", - Inner::E2201G => "2201G", - Inner::E22018 => "22018", - Inner::E22007 => "22007", - Inner::E22019 => "22019", - Inner::E2200D => "2200D", - Inner::E22025 => "22025", - Inner::E22P06 => "22P06", - 
Inner::E22010 => "22010", - Inner::E22023 => "22023", - Inner::E22013 => "22013", - Inner::E2201B => "2201B", - Inner::E2201W => "2201W", - Inner::E2201X => "2201X", - Inner::E2202H => "2202H", - Inner::E2202G => "2202G", - Inner::E22009 => "22009", - Inner::E2200C => "2200C", - Inner::E2200G => "2200G", - Inner::E22004 => "22004", - Inner::E22002 => "22002", - Inner::E22003 => "22003", - Inner::E2200H => "2200H", - Inner::E22026 => "22026", - Inner::E22001 => "22001", - Inner::E22011 => "22011", - Inner::E22027 => "22027", - Inner::E22024 => "22024", - Inner::E2200F => "2200F", - Inner::E22P01 => "22P01", - Inner::E22P02 => "22P02", - Inner::E22P03 => "22P03", - Inner::E22P04 => "22P04", - Inner::E22P05 => "22P05", - Inner::E2200L => "2200L", - Inner::E2200M => "2200M", - Inner::E2200N => "2200N", - Inner::E2200S => "2200S", - Inner::E2200T => "2200T", - Inner::E22030 => "22030", - Inner::E22031 => "22031", - Inner::E22032 => "22032", - Inner::E22033 => "22033", - Inner::E22034 => "22034", - Inner::E22035 => "22035", - Inner::E22036 => "22036", - Inner::E22037 => "22037", - Inner::E22038 => "22038", - Inner::E22039 => "22039", - Inner::E2203A => "2203A", - Inner::E2203B => "2203B", - Inner::E2203C => "2203C", - Inner::E2203D => "2203D", - Inner::E2203E => "2203E", - Inner::E2203F => "2203F", - Inner::E2203G => "2203G", - Inner::E23000 => "23000", - Inner::E23001 => "23001", - Inner::E23502 => "23502", - Inner::E23503 => "23503", - Inner::E23505 => "23505", - Inner::E23514 => "23514", - Inner::E23P01 => "23P01", - Inner::E24000 => "24000", - Inner::E25000 => "25000", - Inner::E25001 => "25001", - Inner::E25002 => "25002", - Inner::E25008 => "25008", - Inner::E25003 => "25003", - Inner::E25004 => "25004", - Inner::E25005 => "25005", - Inner::E25006 => "25006", - Inner::E25007 => "25007", - Inner::E25P01 => "25P01", - Inner::E25P02 => "25P02", - Inner::E25P03 => "25P03", - Inner::E26000 => "26000", - Inner::E27000 => "27000", - Inner::E28000 => "28000", - 
Inner::E28P01 => "28P01", - Inner::E2B000 => "2B000", - Inner::E2BP01 => "2BP01", - Inner::E2D000 => "2D000", - Inner::E2F000 => "2F000", - Inner::E2F005 => "2F005", - Inner::E2F002 => "2F002", - Inner::E2F003 => "2F003", - Inner::E2F004 => "2F004", - Inner::E34000 => "34000", - Inner::E38000 => "38000", - Inner::E38001 => "38001", - Inner::E38002 => "38002", - Inner::E38003 => "38003", - Inner::E38004 => "38004", - Inner::E39000 => "39000", - Inner::E39001 => "39001", - Inner::E39004 => "39004", - Inner::E39P01 => "39P01", - Inner::E39P02 => "39P02", - Inner::E39P03 => "39P03", - Inner::E3B000 => "3B000", - Inner::E3B001 => "3B001", - Inner::E3D000 => "3D000", - Inner::E3F000 => "3F000", - Inner::E40000 => "40000", - Inner::E40002 => "40002", - Inner::E40001 => "40001", - Inner::E40003 => "40003", - Inner::E40P01 => "40P01", - Inner::E42000 => "42000", - Inner::E42601 => "42601", - Inner::E42501 => "42501", - Inner::E42846 => "42846", - Inner::E42803 => "42803", - Inner::E42P20 => "42P20", - Inner::E42P19 => "42P19", - Inner::E42830 => "42830", - Inner::E42602 => "42602", - Inner::E42622 => "42622", - Inner::E42939 => "42939", - Inner::E42804 => "42804", - Inner::E42P18 => "42P18", - Inner::E42P21 => "42P21", - Inner::E42P22 => "42P22", - Inner::E42809 => "42809", - Inner::E428C9 => "428C9", - Inner::E42703 => "42703", - Inner::E42883 => "42883", - Inner::E42P01 => "42P01", - Inner::E42P02 => "42P02", - Inner::E42704 => "42704", - Inner::E42701 => "42701", - Inner::E42P03 => "42P03", - Inner::E42P04 => "42P04", - Inner::E42723 => "42723", - Inner::E42P05 => "42P05", - Inner::E42P06 => "42P06", - Inner::E42P07 => "42P07", - Inner::E42712 => "42712", - Inner::E42710 => "42710", - Inner::E42702 => "42702", - Inner::E42725 => "42725", - Inner::E42P08 => "42P08", - Inner::E42P09 => "42P09", - Inner::E42P10 => "42P10", - Inner::E42611 => "42611", - Inner::E42P11 => "42P11", - Inner::E42P12 => "42P12", - Inner::E42P13 => "42P13", - Inner::E42P14 => "42P14", - 
Inner::E42P15 => "42P15", - Inner::E42P16 => "42P16", - Inner::E42P17 => "42P17", - Inner::E44000 => "44000", - Inner::E53000 => "53000", - Inner::E53100 => "53100", - Inner::E53200 => "53200", - Inner::E53300 => "53300", - Inner::E53400 => "53400", - Inner::E54000 => "54000", - Inner::E54001 => "54001", - Inner::E54011 => "54011", - Inner::E54023 => "54023", - Inner::E55000 => "55000", - Inner::E55006 => "55006", - Inner::E55P02 => "55P02", - Inner::E55P03 => "55P03", - Inner::E55P04 => "55P04", - Inner::E57000 => "57000", - Inner::E57014 => "57014", - Inner::E57P01 => "57P01", - Inner::E57P02 => "57P02", - Inner::E57P03 => "57P03", - Inner::E57P04 => "57P04", - Inner::E57P05 => "57P05", - Inner::E58000 => "58000", - Inner::E58030 => "58030", - Inner::E58P01 => "58P01", - Inner::E58P02 => "58P02", - Inner::E72000 => "72000", - Inner::EF0000 => "F0000", - Inner::EF0001 => "F0001", - Inner::EHV000 => "HV000", - Inner::EHV005 => "HV005", - Inner::EHV002 => "HV002", - Inner::EHV010 => "HV010", - Inner::EHV021 => "HV021", - Inner::EHV024 => "HV024", - Inner::EHV007 => "HV007", - Inner::EHV008 => "HV008", - Inner::EHV004 => "HV004", - Inner::EHV006 => "HV006", - Inner::EHV091 => "HV091", - Inner::EHV00B => "HV00B", - Inner::EHV00C => "HV00C", - Inner::EHV00D => "HV00D", - Inner::EHV090 => "HV090", - Inner::EHV00A => "HV00A", - Inner::EHV009 => "HV009", - Inner::EHV014 => "HV014", - Inner::EHV001 => "HV001", - Inner::EHV00P => "HV00P", - Inner::EHV00J => "HV00J", - Inner::EHV00K => "HV00K", - Inner::EHV00Q => "HV00Q", - Inner::EHV00R => "HV00R", - Inner::EHV00L => "HV00L", - Inner::EHV00M => "HV00M", - Inner::EHV00N => "HV00N", - Inner::EP0000 => "P0000", - Inner::EP0001 => "P0001", - Inner::EP0002 => "P0002", - Inner::EP0003 => "P0003", - Inner::EP0004 => "P0004", - Inner::EXX000 => "XX000", - Inner::EXX001 => "XX001", - Inner::EXX002 => "XX002", - Inner::Other(code) => code, - } + std::str::from_utf8(&self.0).unwrap() } - /// 00000 - pub const SUCCESSFUL_COMPLETION: 
SqlState = SqlState(Inner::E00000); - - /// 01000 - pub const WARNING: SqlState = SqlState(Inner::E01000); - - /// 0100C - pub const WARNING_DYNAMIC_RESULT_SETS_RETURNED: SqlState = SqlState(Inner::E0100C); - - /// 01008 - pub const WARNING_IMPLICIT_ZERO_BIT_PADDING: SqlState = SqlState(Inner::E01008); - - /// 01003 - pub const WARNING_NULL_VALUE_ELIMINATED_IN_SET_FUNCTION: SqlState = SqlState(Inner::E01003); - - /// 01007 - pub const WARNING_PRIVILEGE_NOT_GRANTED: SqlState = SqlState(Inner::E01007); - - /// 01006 - pub const WARNING_PRIVILEGE_NOT_REVOKED: SqlState = SqlState(Inner::E01006); - - /// 01004 - pub const WARNING_STRING_DATA_RIGHT_TRUNCATION: SqlState = SqlState(Inner::E01004); - - /// 01P01 - pub const WARNING_DEPRECATED_FEATURE: SqlState = SqlState(Inner::E01P01); - - /// 02000 - pub const NO_DATA: SqlState = SqlState(Inner::E02000); - - /// 02001 - pub const NO_ADDITIONAL_DYNAMIC_RESULT_SETS_RETURNED: SqlState = SqlState(Inner::E02001); - - /// 03000 - pub const SQL_STATEMENT_NOT_YET_COMPLETE: SqlState = SqlState(Inner::E03000); + // Class 08 - Connection Exception /// 08000 - pub const CONNECTION_EXCEPTION: SqlState = SqlState(Inner::E08000); + pub const CONNECTION_EXCEPTION: SqlState = SqlState(*b"08000"); /// 08003 - pub const CONNECTION_DOES_NOT_EXIST: SqlState = SqlState(Inner::E08003); + pub const CONNECTION_DOES_NOT_EXIST: SqlState = SqlState(*b"08003"); /// 08006 - pub const CONNECTION_FAILURE: SqlState = SqlState(Inner::E08006); + pub const CONNECTION_FAILURE: SqlState = SqlState(*b"08006"); /// 08001 - pub const SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION: SqlState = SqlState(Inner::E08001); - - /// 08004 - pub const SQLSERVER_REJECTED_ESTABLISHMENT_OF_SQLCONNECTION: SqlState = SqlState(Inner::E08004); - - /// 08007 - pub const TRANSACTION_RESOLUTION_UNKNOWN: SqlState = SqlState(Inner::E08007); + pub const SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION: SqlState = SqlState(*b"08001"); /// 08P01 - pub const PROTOCOL_VIOLATION: SqlState = 
SqlState(Inner::E08P01); + pub const PROTOCOL_VIOLATION: SqlState = SqlState(*b"08P01"); - /// 09000 - pub const TRIGGERED_ACTION_EXCEPTION: SqlState = SqlState(Inner::E09000); - - /// 0A000 - pub const FEATURE_NOT_SUPPORTED: SqlState = SqlState(Inner::E0A000); - - /// 0B000 - pub const INVALID_TRANSACTION_INITIATION: SqlState = SqlState(Inner::E0B000); - - /// 0F000 - pub const LOCATOR_EXCEPTION: SqlState = SqlState(Inner::E0F000); - - /// 0F001 - pub const L_E_INVALID_SPECIFICATION: SqlState = SqlState(Inner::E0F001); - - /// 0L000 - pub const INVALID_GRANTOR: SqlState = SqlState(Inner::E0L000); - - /// 0LP01 - pub const INVALID_GRANT_OPERATION: SqlState = SqlState(Inner::E0LP01); - - /// 0P000 - pub const INVALID_ROLE_SPECIFICATION: SqlState = SqlState(Inner::E0P000); - - /// 0Z000 - pub const DIAGNOSTICS_EXCEPTION: SqlState = SqlState(Inner::E0Z000); - - /// 0Z002 - pub const STACKED_DIAGNOSTICS_ACCESSED_WITHOUT_ACTIVE_HANDLER: SqlState = - SqlState(Inner::E0Z002); - - /// 20000 - pub const CASE_NOT_FOUND: SqlState = SqlState(Inner::E20000); - - /// 21000 - pub const CARDINALITY_VIOLATION: SqlState = SqlState(Inner::E21000); - - /// 22000 - pub const DATA_EXCEPTION: SqlState = SqlState(Inner::E22000); - - /// 2202E - pub const ARRAY_ELEMENT_ERROR: SqlState = SqlState(Inner::E2202E); - - /// 2202E - pub const ARRAY_SUBSCRIPT_ERROR: SqlState = SqlState(Inner::E2202E); - - /// 22021 - pub const CHARACTER_NOT_IN_REPERTOIRE: SqlState = SqlState(Inner::E22021); - - /// 22008 - pub const DATETIME_FIELD_OVERFLOW: SqlState = SqlState(Inner::E22008); - - /// 22008 - pub const DATETIME_VALUE_OUT_OF_RANGE: SqlState = SqlState(Inner::E22008); - - /// 22012 - pub const DIVISION_BY_ZERO: SqlState = SqlState(Inner::E22012); - - /// 22005 - pub const ERROR_IN_ASSIGNMENT: SqlState = SqlState(Inner::E22005); - - /// 2200B - pub const ESCAPE_CHARACTER_CONFLICT: SqlState = SqlState(Inner::E2200B); - - /// 22022 - pub const INDICATOR_OVERFLOW: SqlState = SqlState(Inner::E22022); - - 
/// 22015 - pub const INTERVAL_FIELD_OVERFLOW: SqlState = SqlState(Inner::E22015); - - /// 2201E - pub const INVALID_ARGUMENT_FOR_LOG: SqlState = SqlState(Inner::E2201E); - - /// 22014 - pub const INVALID_ARGUMENT_FOR_NTILE: SqlState = SqlState(Inner::E22014); - - /// 22016 - pub const INVALID_ARGUMENT_FOR_NTH_VALUE: SqlState = SqlState(Inner::E22016); - - /// 2201F - pub const INVALID_ARGUMENT_FOR_POWER_FUNCTION: SqlState = SqlState(Inner::E2201F); - - /// 2201G - pub const INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION: SqlState = SqlState(Inner::E2201G); - - /// 22018 - pub const INVALID_CHARACTER_VALUE_FOR_CAST: SqlState = SqlState(Inner::E22018); - - /// 22007 - pub const INVALID_DATETIME_FORMAT: SqlState = SqlState(Inner::E22007); - - /// 22019 - pub const INVALID_ESCAPE_CHARACTER: SqlState = SqlState(Inner::E22019); - - /// 2200D - pub const INVALID_ESCAPE_OCTET: SqlState = SqlState(Inner::E2200D); - - /// 22025 - pub const INVALID_ESCAPE_SEQUENCE: SqlState = SqlState(Inner::E22025); - - /// 22P06 - pub const NONSTANDARD_USE_OF_ESCAPE_CHARACTER: SqlState = SqlState(Inner::E22P06); - - /// 22010 - pub const INVALID_INDICATOR_PARAMETER_VALUE: SqlState = SqlState(Inner::E22010); + // Class 22 - Data Exception /// 22023 - pub const INVALID_PARAMETER_VALUE: SqlState = SqlState(Inner::E22023); + pub const INVALID_PARAMETER_VALUE: SqlState = SqlState(*b"22023"); - /// 22013 - pub const INVALID_PRECEDING_OR_FOLLOWING_SIZE: SqlState = SqlState(Inner::E22013); - - /// 2201B - pub const INVALID_REGULAR_EXPRESSION: SqlState = SqlState(Inner::E2201B); - - /// 2201W - pub const INVALID_ROW_COUNT_IN_LIMIT_CLAUSE: SqlState = SqlState(Inner::E2201W); - - /// 2201X - pub const INVALID_ROW_COUNT_IN_RESULT_OFFSET_CLAUSE: SqlState = SqlState(Inner::E2201X); - - /// 2202H - pub const INVALID_TABLESAMPLE_ARGUMENT: SqlState = SqlState(Inner::E2202H); - - /// 2202G - pub const INVALID_TABLESAMPLE_REPEAT: SqlState = SqlState(Inner::E2202G); - - /// 22009 - pub const 
INVALID_TIME_ZONE_DISPLACEMENT_VALUE: SqlState = SqlState(Inner::E22009); - - /// 2200C - pub const INVALID_USE_OF_ESCAPE_CHARACTER: SqlState = SqlState(Inner::E2200C); - - /// 2200G - pub const MOST_SPECIFIC_TYPE_MISMATCH: SqlState = SqlState(Inner::E2200G); - - /// 22004 - pub const NULL_VALUE_NOT_ALLOWED: SqlState = SqlState(Inner::E22004); - - /// 22002 - pub const NULL_VALUE_NO_INDICATOR_PARAMETER: SqlState = SqlState(Inner::E22002); - - /// 22003 - pub const NUMERIC_VALUE_OUT_OF_RANGE: SqlState = SqlState(Inner::E22003); - - /// 2200H - pub const SEQUENCE_GENERATOR_LIMIT_EXCEEDED: SqlState = SqlState(Inner::E2200H); - - /// 22026 - pub const STRING_DATA_LENGTH_MISMATCH: SqlState = SqlState(Inner::E22026); - - /// 22001 - pub const STRING_DATA_RIGHT_TRUNCATION: SqlState = SqlState(Inner::E22001); - - /// 22011 - pub const SUBSTRING_ERROR: SqlState = SqlState(Inner::E22011); - - /// 22027 - pub const TRIM_ERROR: SqlState = SqlState(Inner::E22027); - - /// 22024 - pub const UNTERMINATED_C_STRING: SqlState = SqlState(Inner::E22024); - - /// 2200F - pub const ZERO_LENGTH_CHARACTER_STRING: SqlState = SqlState(Inner::E2200F); - - /// 22P01 - pub const FLOATING_POINT_EXCEPTION: SqlState = SqlState(Inner::E22P01); - - /// 22P02 - pub const INVALID_TEXT_REPRESENTATION: SqlState = SqlState(Inner::E22P02); - - /// 22P03 - pub const INVALID_BINARY_REPRESENTATION: SqlState = SqlState(Inner::E22P03); - - /// 22P04 - pub const BAD_COPY_FILE_FORMAT: SqlState = SqlState(Inner::E22P04); - - /// 22P05 - pub const UNTRANSLATABLE_CHARACTER: SqlState = SqlState(Inner::E22P05); - - /// 2200L - pub const NOT_AN_XML_DOCUMENT: SqlState = SqlState(Inner::E2200L); - - /// 2200M - pub const INVALID_XML_DOCUMENT: SqlState = SqlState(Inner::E2200M); - - /// 2200N - pub const INVALID_XML_CONTENT: SqlState = SqlState(Inner::E2200N); - - /// 2200S - pub const INVALID_XML_COMMENT: SqlState = SqlState(Inner::E2200S); - - /// 2200T - pub const INVALID_XML_PROCESSING_INSTRUCTION: SqlState = 
SqlState(Inner::E2200T); - - /// 22030 - pub const DUPLICATE_JSON_OBJECT_KEY_VALUE: SqlState = SqlState(Inner::E22030); - - /// 22031 - pub const INVALID_ARGUMENT_FOR_SQL_JSON_DATETIME_FUNCTION: SqlState = SqlState(Inner::E22031); - - /// 22032 - pub const INVALID_JSON_TEXT: SqlState = SqlState(Inner::E22032); - - /// 22033 - pub const INVALID_SQL_JSON_SUBSCRIPT: SqlState = SqlState(Inner::E22033); - - /// 22034 - pub const MORE_THAN_ONE_SQL_JSON_ITEM: SqlState = SqlState(Inner::E22034); - - /// 22035 - pub const NO_SQL_JSON_ITEM: SqlState = SqlState(Inner::E22035); - - /// 22036 - pub const NON_NUMERIC_SQL_JSON_ITEM: SqlState = SqlState(Inner::E22036); - - /// 22037 - pub const NON_UNIQUE_KEYS_IN_A_JSON_OBJECT: SqlState = SqlState(Inner::E22037); - - /// 22038 - pub const SINGLETON_SQL_JSON_ITEM_REQUIRED: SqlState = SqlState(Inner::E22038); - - /// 22039 - pub const SQL_JSON_ARRAY_NOT_FOUND: SqlState = SqlState(Inner::E22039); - - /// 2203A - pub const SQL_JSON_MEMBER_NOT_FOUND: SqlState = SqlState(Inner::E2203A); - - /// 2203B - pub const SQL_JSON_NUMBER_NOT_FOUND: SqlState = SqlState(Inner::E2203B); - - /// 2203C - pub const SQL_JSON_OBJECT_NOT_FOUND: SqlState = SqlState(Inner::E2203C); - - /// 2203D - pub const TOO_MANY_JSON_ARRAY_ELEMENTS: SqlState = SqlState(Inner::E2203D); - - /// 2203E - pub const TOO_MANY_JSON_OBJECT_MEMBERS: SqlState = SqlState(Inner::E2203E); - - /// 2203F - pub const SQL_JSON_SCALAR_REQUIRED: SqlState = SqlState(Inner::E2203F); - - /// 2203G - pub const SQL_JSON_ITEM_CANNOT_BE_CAST_TO_TARGET_TYPE: SqlState = SqlState(Inner::E2203G); - - /// 23000 - pub const INTEGRITY_CONSTRAINT_VIOLATION: SqlState = SqlState(Inner::E23000); - - /// 23001 - pub const RESTRICT_VIOLATION: SqlState = SqlState(Inner::E23001); - - /// 23502 - pub const NOT_NULL_VIOLATION: SqlState = SqlState(Inner::E23502); - - /// 23503 - pub const FOREIGN_KEY_VIOLATION: SqlState = SqlState(Inner::E23503); - - /// 23505 - pub const UNIQUE_VIOLATION: SqlState = 
SqlState(Inner::E23505); - - /// 23514 - pub const CHECK_VIOLATION: SqlState = SqlState(Inner::E23514); - - /// 23P01 - pub const EXCLUSION_VIOLATION: SqlState = SqlState(Inner::E23P01); - - /// 24000 - pub const INVALID_CURSOR_STATE: SqlState = SqlState(Inner::E24000); - - /// 25000 - pub const INVALID_TRANSACTION_STATE: SqlState = SqlState(Inner::E25000); - - /// 25001 - pub const ACTIVE_SQL_TRANSACTION: SqlState = SqlState(Inner::E25001); - - /// 25002 - pub const BRANCH_TRANSACTION_ALREADY_ACTIVE: SqlState = SqlState(Inner::E25002); - - /// 25008 - pub const HELD_CURSOR_REQUIRES_SAME_ISOLATION_LEVEL: SqlState = SqlState(Inner::E25008); - - /// 25003 - pub const INAPPROPRIATE_ACCESS_MODE_FOR_BRANCH_TRANSACTION: SqlState = SqlState(Inner::E25003); - - /// 25004 - pub const INAPPROPRIATE_ISOLATION_LEVEL_FOR_BRANCH_TRANSACTION: SqlState = - SqlState(Inner::E25004); - - /// 25005 - pub const NO_ACTIVE_SQL_TRANSACTION_FOR_BRANCH_TRANSACTION: SqlState = SqlState(Inner::E25005); - - /// 25006 - pub const READ_ONLY_SQL_TRANSACTION: SqlState = SqlState(Inner::E25006); - - /// 25007 - pub const SCHEMA_AND_DATA_STATEMENT_MIXING_NOT_SUPPORTED: SqlState = SqlState(Inner::E25007); - - /// 25P01 - pub const NO_ACTIVE_SQL_TRANSACTION: SqlState = SqlState(Inner::E25P01); - - /// 25P02 - pub const IN_FAILED_SQL_TRANSACTION: SqlState = SqlState(Inner::E25P02); - - /// 25P03 - pub const IDLE_IN_TRANSACTION_SESSION_TIMEOUT: SqlState = SqlState(Inner::E25P03); - - /// 26000 - pub const INVALID_SQL_STATEMENT_NAME: SqlState = SqlState(Inner::E26000); - - /// 26000 - pub const UNDEFINED_PSTATEMENT: SqlState = SqlState(Inner::E26000); - - /// 27000 - pub const TRIGGERED_DATA_CHANGE_VIOLATION: SqlState = SqlState(Inner::E27000); - - /// 28000 - pub const INVALID_AUTHORIZATION_SPECIFICATION: SqlState = SqlState(Inner::E28000); - - /// 28P01 - pub const INVALID_PASSWORD: SqlState = SqlState(Inner::E28P01); - - /// 2B000 - pub const DEPENDENT_PRIVILEGE_DESCRIPTORS_STILL_EXIST: SqlState = 
SqlState(Inner::E2B000); - - /// 2BP01 - pub const DEPENDENT_OBJECTS_STILL_EXIST: SqlState = SqlState(Inner::E2BP01); - - /// 2D000 - pub const INVALID_TRANSACTION_TERMINATION: SqlState = SqlState(Inner::E2D000); - - /// 2F000 - pub const SQL_ROUTINE_EXCEPTION: SqlState = SqlState(Inner::E2F000); - - /// 2F005 - pub const S_R_E_FUNCTION_EXECUTED_NO_RETURN_STATEMENT: SqlState = SqlState(Inner::E2F005); - - /// 2F002 - pub const S_R_E_MODIFYING_SQL_DATA_NOT_PERMITTED: SqlState = SqlState(Inner::E2F002); - - /// 2F003 - pub const S_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED: SqlState = SqlState(Inner::E2F003); - - /// 2F004 - pub const S_R_E_READING_SQL_DATA_NOT_PERMITTED: SqlState = SqlState(Inner::E2F004); - - /// 34000 - pub const INVALID_CURSOR_NAME: SqlState = SqlState(Inner::E34000); - - /// 34000 - pub const UNDEFINED_CURSOR: SqlState = SqlState(Inner::E34000); - - /// 38000 - pub const EXTERNAL_ROUTINE_EXCEPTION: SqlState = SqlState(Inner::E38000); - - /// 38001 - pub const E_R_E_CONTAINING_SQL_NOT_PERMITTED: SqlState = SqlState(Inner::E38001); - - /// 38002 - pub const E_R_E_MODIFYING_SQL_DATA_NOT_PERMITTED: SqlState = SqlState(Inner::E38002); - - /// 38003 - pub const E_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED: SqlState = SqlState(Inner::E38003); - - /// 38004 - pub const E_R_E_READING_SQL_DATA_NOT_PERMITTED: SqlState = SqlState(Inner::E38004); - - /// 39000 - pub const EXTERNAL_ROUTINE_INVOCATION_EXCEPTION: SqlState = SqlState(Inner::E39000); - - /// 39001 - pub const E_R_I_E_INVALID_SQLSTATE_RETURNED: SqlState = SqlState(Inner::E39001); - - /// 39004 - pub const E_R_I_E_NULL_VALUE_NOT_ALLOWED: SqlState = SqlState(Inner::E39004); - - /// 39P01 - pub const E_R_I_E_TRIGGER_PROTOCOL_VIOLATED: SqlState = SqlState(Inner::E39P01); - - /// 39P02 - pub const E_R_I_E_SRF_PROTOCOL_VIOLATED: SqlState = SqlState(Inner::E39P02); - - /// 39P03 - pub const E_R_I_E_EVENT_TRIGGER_PROTOCOL_VIOLATED: SqlState = SqlState(Inner::E39P03); - - /// 3B000 - pub const SAVEPOINT_EXCEPTION: 
SqlState = SqlState(Inner::E3B000); - - /// 3B001 - pub const S_E_INVALID_SPECIFICATION: SqlState = SqlState(Inner::E3B001); + // Class 3D - Invalid Catalog Name /// 3D000 - pub const INVALID_CATALOG_NAME: SqlState = SqlState(Inner::E3D000); + pub const INVALID_CATALOG_NAME: SqlState = SqlState(*b"3D000"); - /// 3D000 - pub const UNDEFINED_DATABASE: SqlState = SqlState(Inner::E3D000); + // Class 3F - Invalid Schema Name /// 3F000 - pub const INVALID_SCHEMA_NAME: SqlState = SqlState(Inner::E3F000); + pub const INVALID_SCHEMA_NAME: SqlState = SqlState(*b"3F000"); - /// 3F000 - pub const UNDEFINED_SCHEMA: SqlState = SqlState(Inner::E3F000); - - /// 40000 - pub const TRANSACTION_ROLLBACK: SqlState = SqlState(Inner::E40000); - - /// 40002 - pub const T_R_INTEGRITY_CONSTRAINT_VIOLATION: SqlState = SqlState(Inner::E40002); + // Class 40 - Transaction Rollback /// 40001 - pub const T_R_SERIALIZATION_FAILURE: SqlState = SqlState(Inner::E40001); + pub const T_R_SERIALIZATION_FAILURE: SqlState = SqlState(*b"40001"); - /// 40003 - pub const T_R_STATEMENT_COMPLETION_UNKNOWN: SqlState = SqlState(Inner::E40003); - - /// 40P01 - pub const T_R_DEADLOCK_DETECTED: SqlState = SqlState(Inner::E40P01); - - /// 42000 - pub const SYNTAX_ERROR_OR_ACCESS_RULE_VIOLATION: SqlState = SqlState(Inner::E42000); + // Class 42 - Syntax Error or Access Rule Violation /// 42601 - pub const SYNTAX_ERROR: SqlState = SqlState(Inner::E42601); + pub const SYNTAX_ERROR: SqlState = SqlState(*b"42601"); - /// 42501 - pub const INSUFFICIENT_PRIVILEGE: SqlState = SqlState(Inner::E42501); - - /// 42846 - pub const CANNOT_COERCE: SqlState = SqlState(Inner::E42846); - - /// 42803 - pub const GROUPING_ERROR: SqlState = SqlState(Inner::E42803); - - /// 42P20 - pub const WINDOWING_ERROR: SqlState = SqlState(Inner::E42P20); - - /// 42P19 - pub const INVALID_RECURSION: SqlState = SqlState(Inner::E42P19); - - /// 42830 - pub const INVALID_FOREIGN_KEY: SqlState = SqlState(Inner::E42830); - - /// 42602 - pub const 
INVALID_NAME: SqlState = SqlState(Inner::E42602); - - /// 42622 - pub const NAME_TOO_LONG: SqlState = SqlState(Inner::E42622); - - /// 42939 - pub const RESERVED_NAME: SqlState = SqlState(Inner::E42939); - - /// 42804 - pub const DATATYPE_MISMATCH: SqlState = SqlState(Inner::E42804); - - /// 42P18 - pub const INDETERMINATE_DATATYPE: SqlState = SqlState(Inner::E42P18); - - /// 42P21 - pub const COLLATION_MISMATCH: SqlState = SqlState(Inner::E42P21); - - /// 42P22 - pub const INDETERMINATE_COLLATION: SqlState = SqlState(Inner::E42P22); - - /// 42809 - pub const WRONG_OBJECT_TYPE: SqlState = SqlState(Inner::E42809); - - /// 428C9 - pub const GENERATED_ALWAYS: SqlState = SqlState(Inner::E428C9); - - /// 42703 - pub const UNDEFINED_COLUMN: SqlState = SqlState(Inner::E42703); - - /// 42883 - pub const UNDEFINED_FUNCTION: SqlState = SqlState(Inner::E42883); - - /// 42P01 - pub const UNDEFINED_TABLE: SqlState = SqlState(Inner::E42P01); - - /// 42P02 - pub const UNDEFINED_PARAMETER: SqlState = SqlState(Inner::E42P02); - - /// 42704 - pub const UNDEFINED_OBJECT: SqlState = SqlState(Inner::E42704); - - /// 42701 - pub const DUPLICATE_COLUMN: SqlState = SqlState(Inner::E42701); - - /// 42P03 - pub const DUPLICATE_CURSOR: SqlState = SqlState(Inner::E42P03); - - /// 42P04 - pub const DUPLICATE_DATABASE: SqlState = SqlState(Inner::E42P04); - - /// 42723 - pub const DUPLICATE_FUNCTION: SqlState = SqlState(Inner::E42723); - - /// 42P05 - pub const DUPLICATE_PSTATEMENT: SqlState = SqlState(Inner::E42P05); - - /// 42P06 - pub const DUPLICATE_SCHEMA: SqlState = SqlState(Inner::E42P06); - - /// 42P07 - pub const DUPLICATE_TABLE: SqlState = SqlState(Inner::E42P07); - - /// 42712 - pub const DUPLICATE_ALIAS: SqlState = SqlState(Inner::E42712); - - /// 42710 - pub const DUPLICATE_OBJECT: SqlState = SqlState(Inner::E42710); - - /// 42702 - pub const AMBIGUOUS_COLUMN: SqlState = SqlState(Inner::E42702); - - /// 42725 - pub const AMBIGUOUS_FUNCTION: SqlState = SqlState(Inner::E42725); - - 
/// 42P08 - pub const AMBIGUOUS_PARAMETER: SqlState = SqlState(Inner::E42P08); - - /// 42P09 - pub const AMBIGUOUS_ALIAS: SqlState = SqlState(Inner::E42P09); - - /// 42P10 - pub const INVALID_COLUMN_REFERENCE: SqlState = SqlState(Inner::E42P10); - - /// 42611 - pub const INVALID_COLUMN_DEFINITION: SqlState = SqlState(Inner::E42611); - - /// 42P11 - pub const INVALID_CURSOR_DEFINITION: SqlState = SqlState(Inner::E42P11); - - /// 42P12 - pub const INVALID_DATABASE_DEFINITION: SqlState = SqlState(Inner::E42P12); - - /// 42P13 - pub const INVALID_FUNCTION_DEFINITION: SqlState = SqlState(Inner::E42P13); - - /// 42P14 - pub const INVALID_PSTATEMENT_DEFINITION: SqlState = SqlState(Inner::E42P14); - - /// 42P15 - pub const INVALID_SCHEMA_DEFINITION: SqlState = SqlState(Inner::E42P15); - - /// 42P16 - pub const INVALID_TABLE_DEFINITION: SqlState = SqlState(Inner::E42P16); - - /// 42P17 - pub const INVALID_OBJECT_DEFINITION: SqlState = SqlState(Inner::E42P17); - - /// 44000 - pub const WITH_CHECK_OPTION_VIOLATION: SqlState = SqlState(Inner::E44000); - - /// 53000 - pub const INSUFFICIENT_RESOURCES: SqlState = SqlState(Inner::E53000); - - /// 53100 - pub const DISK_FULL: SqlState = SqlState(Inner::E53100); + // Class 53 - Insufficient Resources /// 53200 - pub const OUT_OF_MEMORY: SqlState = SqlState(Inner::E53200); + pub const OUT_OF_MEMORY: SqlState = SqlState(*b"53200"); /// 53300 - pub const TOO_MANY_CONNECTIONS: SqlState = SqlState(Inner::E53300); + pub const TOO_MANY_CONNECTIONS: SqlState = SqlState(*b"53300"); - /// 53400 - pub const CONFIGURATION_LIMIT_EXCEEDED: SqlState = SqlState(Inner::E53400); - - /// 54000 - pub const PROGRAM_LIMIT_EXCEEDED: SqlState = SqlState(Inner::E54000); - - /// 54001 - pub const STATEMENT_TOO_COMPLEX: SqlState = SqlState(Inner::E54001); - - /// 54011 - pub const TOO_MANY_COLUMNS: SqlState = SqlState(Inner::E54011); - - /// 54023 - pub const TOO_MANY_ARGUMENTS: SqlState = SqlState(Inner::E54023); - - /// 55000 - pub const 
OBJECT_NOT_IN_PREREQUISITE_STATE: SqlState = SqlState(Inner::E55000); - - /// 55006 - pub const OBJECT_IN_USE: SqlState = SqlState(Inner::E55006); - - /// 55P02 - pub const CANT_CHANGE_RUNTIME_PARAM: SqlState = SqlState(Inner::E55P02); - - /// 55P03 - pub const LOCK_NOT_AVAILABLE: SqlState = SqlState(Inner::E55P03); - - /// 55P04 - pub const UNSAFE_NEW_ENUM_VALUE_USAGE: SqlState = SqlState(Inner::E55P04); - - /// 57000 - pub const OPERATOR_INTERVENTION: SqlState = SqlState(Inner::E57000); + // Class 57 - Operator Intervention /// 57014 - pub const QUERY_CANCELED: SqlState = SqlState(Inner::E57014); - - /// 57P01 - pub const ADMIN_SHUTDOWN: SqlState = SqlState(Inner::E57P01); - - /// 57P02 - pub const CRASH_SHUTDOWN: SqlState = SqlState(Inner::E57P02); - - /// 57P03 - pub const CANNOT_CONNECT_NOW: SqlState = SqlState(Inner::E57P03); - - /// 57P04 - pub const DATABASE_DROPPED: SqlState = SqlState(Inner::E57P04); - - /// 57P05 - pub const IDLE_SESSION_TIMEOUT: SqlState = SqlState(Inner::E57P05); - - /// 58000 - pub const SYSTEM_ERROR: SqlState = SqlState(Inner::E58000); - - /// 58030 - pub const IO_ERROR: SqlState = SqlState(Inner::E58030); - - /// 58P01 - pub const UNDEFINED_FILE: SqlState = SqlState(Inner::E58P01); - - /// 58P02 - pub const DUPLICATE_FILE: SqlState = SqlState(Inner::E58P02); - - /// 72000 - pub const SNAPSHOT_TOO_OLD: SqlState = SqlState(Inner::E72000); - - /// F0000 - pub const CONFIG_FILE_ERROR: SqlState = SqlState(Inner::EF0000); - - /// F0001 - pub const LOCK_FILE_EXISTS: SqlState = SqlState(Inner::EF0001); - - /// HV000 - pub const FDW_ERROR: SqlState = SqlState(Inner::EHV000); - - /// HV005 - pub const FDW_COLUMN_NAME_NOT_FOUND: SqlState = SqlState(Inner::EHV005); - - /// HV002 - pub const FDW_DYNAMIC_PARAMETER_VALUE_NEEDED: SqlState = SqlState(Inner::EHV002); - - /// HV010 - pub const FDW_FUNCTION_SEQUENCE_ERROR: SqlState = SqlState(Inner::EHV010); - - /// HV021 - pub const FDW_INCONSISTENT_DESCRIPTOR_INFORMATION: SqlState = 
SqlState(Inner::EHV021); - - /// HV024 - pub const FDW_INVALID_ATTRIBUTE_VALUE: SqlState = SqlState(Inner::EHV024); - - /// HV007 - pub const FDW_INVALID_COLUMN_NAME: SqlState = SqlState(Inner::EHV007); - - /// HV008 - pub const FDW_INVALID_COLUMN_NUMBER: SqlState = SqlState(Inner::EHV008); - - /// HV004 - pub const FDW_INVALID_DATA_TYPE: SqlState = SqlState(Inner::EHV004); - - /// HV006 - pub const FDW_INVALID_DATA_TYPE_DESCRIPTORS: SqlState = SqlState(Inner::EHV006); - - /// HV091 - pub const FDW_INVALID_DESCRIPTOR_FIELD_IDENTIFIER: SqlState = SqlState(Inner::EHV091); - - /// HV00B - pub const FDW_INVALID_HANDLE: SqlState = SqlState(Inner::EHV00B); - - /// HV00C - pub const FDW_INVALID_OPTION_INDEX: SqlState = SqlState(Inner::EHV00C); - - /// HV00D - pub const FDW_INVALID_OPTION_NAME: SqlState = SqlState(Inner::EHV00D); - - /// HV090 - pub const FDW_INVALID_STRING_LENGTH_OR_BUFFER_LENGTH: SqlState = SqlState(Inner::EHV090); - - /// HV00A - pub const FDW_INVALID_STRING_FORMAT: SqlState = SqlState(Inner::EHV00A); - - /// HV009 - pub const FDW_INVALID_USE_OF_NULL_POINTER: SqlState = SqlState(Inner::EHV009); - - /// HV014 - pub const FDW_TOO_MANY_HANDLES: SqlState = SqlState(Inner::EHV014); - - /// HV001 - pub const FDW_OUT_OF_MEMORY: SqlState = SqlState(Inner::EHV001); - - /// HV00P - pub const FDW_NO_SCHEMAS: SqlState = SqlState(Inner::EHV00P); - - /// HV00J - pub const FDW_OPTION_NAME_NOT_FOUND: SqlState = SqlState(Inner::EHV00J); - - /// HV00K - pub const FDW_REPLY_HANDLE: SqlState = SqlState(Inner::EHV00K); - - /// HV00Q - pub const FDW_SCHEMA_NOT_FOUND: SqlState = SqlState(Inner::EHV00Q); - - /// HV00R - pub const FDW_TABLE_NOT_FOUND: SqlState = SqlState(Inner::EHV00R); - - /// HV00L - pub const FDW_UNABLE_TO_CREATE_EXECUTION: SqlState = SqlState(Inner::EHV00L); - - /// HV00M - pub const FDW_UNABLE_TO_CREATE_REPLY: SqlState = SqlState(Inner::EHV00M); - - /// HV00N - pub const FDW_UNABLE_TO_ESTABLISH_CONNECTION: SqlState = SqlState(Inner::EHV00N); - - /// P0000 
- pub const PLPGSQL_ERROR: SqlState = SqlState(Inner::EP0000); - - /// P0001 - pub const RAISE_EXCEPTION: SqlState = SqlState(Inner::EP0001); - - /// P0002 - pub const NO_DATA_FOUND: SqlState = SqlState(Inner::EP0002); - - /// P0003 - pub const TOO_MANY_ROWS: SqlState = SqlState(Inner::EP0003); - - /// P0004 - pub const ASSERT_FAILURE: SqlState = SqlState(Inner::EP0004); - - /// XX000 - pub const INTERNAL_ERROR: SqlState = SqlState(Inner::EXX000); - - /// XX001 - pub const DATA_CORRUPTED: SqlState = SqlState(Inner::EXX001); - - /// XX002 - pub const INDEX_CORRUPTED: SqlState = SqlState(Inner::EXX002); + pub const QUERY_CANCELED: SqlState = SqlState(*b"57014"); } -#[derive(PartialEq, Eq, Clone, Debug)] -#[allow(clippy::upper_case_acronyms)] -enum Inner { - E00000, - E01000, - E0100C, - E01008, - E01003, - E01007, - E01006, - E01004, - E01P01, - E02000, - E02001, - E03000, - E08000, - E08003, - E08006, - E08001, - E08004, - E08007, - E08P01, - E09000, - E0A000, - E0B000, - E0F000, - E0F001, - E0L000, - E0LP01, - E0P000, - E0Z000, - E0Z002, - E20000, - E21000, - E22000, - E2202E, - E22021, - E22008, - E22012, - E22005, - E2200B, - E22022, - E22015, - E2201E, - E22014, - E22016, - E2201F, - E2201G, - E22018, - E22007, - E22019, - E2200D, - E22025, - E22P06, - E22010, - E22023, - E22013, - E2201B, - E2201W, - E2201X, - E2202H, - E2202G, - E22009, - E2200C, - E2200G, - E22004, - E22002, - E22003, - E2200H, - E22026, - E22001, - E22011, - E22027, - E22024, - E2200F, - E22P01, - E22P02, - E22P03, - E22P04, - E22P05, - E2200L, - E2200M, - E2200N, - E2200S, - E2200T, - E22030, - E22031, - E22032, - E22033, - E22034, - E22035, - E22036, - E22037, - E22038, - E22039, - E2203A, - E2203B, - E2203C, - E2203D, - E2203E, - E2203F, - E2203G, - E23000, - E23001, - E23502, - E23503, - E23505, - E23514, - E23P01, - E24000, - E25000, - E25001, - E25002, - E25008, - E25003, - E25004, - E25005, - E25006, - E25007, - E25P01, - E25P02, - E25P03, - E26000, - E27000, - E28000, - E28P01, - 
E2B000, - E2BP01, - E2D000, - E2F000, - E2F005, - E2F002, - E2F003, - E2F004, - E34000, - E38000, - E38001, - E38002, - E38003, - E38004, - E39000, - E39001, - E39004, - E39P01, - E39P02, - E39P03, - E3B000, - E3B001, - E3D000, - E3F000, - E40000, - E40002, - E40001, - E40003, - E40P01, - E42000, - E42601, - E42501, - E42846, - E42803, - E42P20, - E42P19, - E42830, - E42602, - E42622, - E42939, - E42804, - E42P18, - E42P21, - E42P22, - E42809, - E428C9, - E42703, - E42883, - E42P01, - E42P02, - E42704, - E42701, - E42P03, - E42P04, - E42723, - E42P05, - E42P06, - E42P07, - E42712, - E42710, - E42702, - E42725, - E42P08, - E42P09, - E42P10, - E42611, - E42P11, - E42P12, - E42P13, - E42P14, - E42P15, - E42P16, - E42P17, - E44000, - E53000, - E53100, - E53200, - E53300, - E53400, - E54000, - E54001, - E54011, - E54023, - E55000, - E55006, - E55P02, - E55P03, - E55P04, - E57000, - E57014, - E57P01, - E57P02, - E57P03, - E57P04, - E57P05, - E58000, - E58030, - E58P01, - E58P02, - E72000, - EF0000, - EF0001, - EHV000, - EHV005, - EHV002, - EHV010, - EHV021, - EHV024, - EHV007, - EHV008, - EHV004, - EHV006, - EHV091, - EHV00B, - EHV00C, - EHV00D, - EHV090, - EHV00A, - EHV009, - EHV014, - EHV001, - EHV00P, - EHV00J, - EHV00K, - EHV00Q, - EHV00R, - EHV00L, - EHV00M, - EHV00N, - EP0000, - EP0001, - EP0002, - EP0003, - EP0004, - EXX000, - EXX001, - EXX002, - Other(Box), -} +#[cfg(test)] +mod tests { + use super::SqlState; -#[rustfmt::skip] -static SQLSTATE_MAP: phf::Map<&'static str, SqlState> = -::phf::Map { - key: 12913932095322966823, - disps: &[ - (0, 24), - (0, 12), - (0, 74), - (0, 109), - (0, 11), - (0, 9), - (0, 0), - (4, 38), - (3, 155), - (0, 6), - (1, 242), - (0, 66), - (0, 53), - (5, 180), - (3, 221), - (7, 230), - (0, 125), - (1, 46), - (0, 11), - (1, 2), - (0, 5), - (0, 13), - (0, 171), - (0, 15), - (0, 4), - (0, 22), - (1, 85), - (0, 75), - (2, 0), - (1, 25), - (7, 47), - (0, 45), - (0, 35), - (0, 7), - (7, 124), - (0, 0), - (14, 104), - (1, 183), - (61, 50), - 
(3, 76), - (0, 12), - (0, 7), - (4, 189), - (0, 1), - (64, 102), - (0, 0), - (16, 192), - (24, 19), - (0, 5), - (0, 87), - (0, 89), - (0, 14), - ], - entries: &[ - ("2F000", SqlState::SQL_ROUTINE_EXCEPTION), - ("01008", SqlState::WARNING_IMPLICIT_ZERO_BIT_PADDING), - ("42501", SqlState::INSUFFICIENT_PRIVILEGE), - ("22000", SqlState::DATA_EXCEPTION), - ("0100C", SqlState::WARNING_DYNAMIC_RESULT_SETS_RETURNED), - ("2200N", SqlState::INVALID_XML_CONTENT), - ("40001", SqlState::T_R_SERIALIZATION_FAILURE), - ("28P01", SqlState::INVALID_PASSWORD), - ("38000", SqlState::EXTERNAL_ROUTINE_EXCEPTION), - ("25006", SqlState::READ_ONLY_SQL_TRANSACTION), - ("2203D", SqlState::TOO_MANY_JSON_ARRAY_ELEMENTS), - ("42P09", SqlState::AMBIGUOUS_ALIAS), - ("F0000", SqlState::CONFIG_FILE_ERROR), - ("42P18", SqlState::INDETERMINATE_DATATYPE), - ("40002", SqlState::T_R_INTEGRITY_CONSTRAINT_VIOLATION), - ("22009", SqlState::INVALID_TIME_ZONE_DISPLACEMENT_VALUE), - ("42P08", SqlState::AMBIGUOUS_PARAMETER), - ("08000", SqlState::CONNECTION_EXCEPTION), - ("25P01", SqlState::NO_ACTIVE_SQL_TRANSACTION), - ("22024", SqlState::UNTERMINATED_C_STRING), - ("55000", SqlState::OBJECT_NOT_IN_PREREQUISITE_STATE), - ("25001", SqlState::ACTIVE_SQL_TRANSACTION), - ("03000", SqlState::SQL_STATEMENT_NOT_YET_COMPLETE), - ("42710", SqlState::DUPLICATE_OBJECT), - ("2D000", SqlState::INVALID_TRANSACTION_TERMINATION), - ("2200G", SqlState::MOST_SPECIFIC_TYPE_MISMATCH), - ("22022", SqlState::INDICATOR_OVERFLOW), - ("55006", SqlState::OBJECT_IN_USE), - ("53200", SqlState::OUT_OF_MEMORY), - ("22012", SqlState::DIVISION_BY_ZERO), - ("P0002", SqlState::NO_DATA_FOUND), - ("XX001", SqlState::DATA_CORRUPTED), - ("22P05", SqlState::UNTRANSLATABLE_CHARACTER), - ("40003", SqlState::T_R_STATEMENT_COMPLETION_UNKNOWN), - ("22021", SqlState::CHARACTER_NOT_IN_REPERTOIRE), - ("25000", SqlState::INVALID_TRANSACTION_STATE), - ("42P15", SqlState::INVALID_SCHEMA_DEFINITION), - ("0B000", SqlState::INVALID_TRANSACTION_INITIATION), - 
("22004", SqlState::NULL_VALUE_NOT_ALLOWED), - ("42804", SqlState::DATATYPE_MISMATCH), - ("42803", SqlState::GROUPING_ERROR), - ("02001", SqlState::NO_ADDITIONAL_DYNAMIC_RESULT_SETS_RETURNED), - ("25002", SqlState::BRANCH_TRANSACTION_ALREADY_ACTIVE), - ("28000", SqlState::INVALID_AUTHORIZATION_SPECIFICATION), - ("HV009", SqlState::FDW_INVALID_USE_OF_NULL_POINTER), - ("22P01", SqlState::FLOATING_POINT_EXCEPTION), - ("2B000", SqlState::DEPENDENT_PRIVILEGE_DESCRIPTORS_STILL_EXIST), - ("42723", SqlState::DUPLICATE_FUNCTION), - ("21000", SqlState::CARDINALITY_VIOLATION), - ("0Z002", SqlState::STACKED_DIAGNOSTICS_ACCESSED_WITHOUT_ACTIVE_HANDLER), - ("23505", SqlState::UNIQUE_VIOLATION), - ("HV00J", SqlState::FDW_OPTION_NAME_NOT_FOUND), - ("23P01", SqlState::EXCLUSION_VIOLATION), - ("39P03", SqlState::E_R_I_E_EVENT_TRIGGER_PROTOCOL_VIOLATED), - ("42P10", SqlState::INVALID_COLUMN_REFERENCE), - ("2202H", SqlState::INVALID_TABLESAMPLE_ARGUMENT), - ("55P04", SqlState::UNSAFE_NEW_ENUM_VALUE_USAGE), - ("P0000", SqlState::PLPGSQL_ERROR), - ("2F005", SqlState::S_R_E_FUNCTION_EXECUTED_NO_RETURN_STATEMENT), - ("HV00M", SqlState::FDW_UNABLE_TO_CREATE_REPLY), - ("0A000", SqlState::FEATURE_NOT_SUPPORTED), - ("24000", SqlState::INVALID_CURSOR_STATE), - ("25008", SqlState::HELD_CURSOR_REQUIRES_SAME_ISOLATION_LEVEL), - ("01003", SqlState::WARNING_NULL_VALUE_ELIMINATED_IN_SET_FUNCTION), - ("42712", SqlState::DUPLICATE_ALIAS), - ("HV014", SqlState::FDW_TOO_MANY_HANDLES), - ("58030", SqlState::IO_ERROR), - ("2201W", SqlState::INVALID_ROW_COUNT_IN_LIMIT_CLAUSE), - ("22033", SqlState::INVALID_SQL_JSON_SUBSCRIPT), - ("2BP01", SqlState::DEPENDENT_OBJECTS_STILL_EXIST), - ("HV005", SqlState::FDW_COLUMN_NAME_NOT_FOUND), - ("25004", SqlState::INAPPROPRIATE_ISOLATION_LEVEL_FOR_BRANCH_TRANSACTION), - ("54000", SqlState::PROGRAM_LIMIT_EXCEEDED), - ("20000", SqlState::CASE_NOT_FOUND), - ("2203G", SqlState::SQL_JSON_ITEM_CANNOT_BE_CAST_TO_TARGET_TYPE), - ("22038", 
SqlState::SINGLETON_SQL_JSON_ITEM_REQUIRED), - ("22007", SqlState::INVALID_DATETIME_FORMAT), - ("08004", SqlState::SQLSERVER_REJECTED_ESTABLISHMENT_OF_SQLCONNECTION), - ("2200H", SqlState::SEQUENCE_GENERATOR_LIMIT_EXCEEDED), - ("HV00D", SqlState::FDW_INVALID_OPTION_NAME), - ("P0004", SqlState::ASSERT_FAILURE), - ("22018", SqlState::INVALID_CHARACTER_VALUE_FOR_CAST), - ("0L000", SqlState::INVALID_GRANTOR), - ("22P04", SqlState::BAD_COPY_FILE_FORMAT), - ("22031", SqlState::INVALID_ARGUMENT_FOR_SQL_JSON_DATETIME_FUNCTION), - ("01P01", SqlState::WARNING_DEPRECATED_FEATURE), - ("0LP01", SqlState::INVALID_GRANT_OPERATION), - ("58P02", SqlState::DUPLICATE_FILE), - ("26000", SqlState::INVALID_SQL_STATEMENT_NAME), - ("54001", SqlState::STATEMENT_TOO_COMPLEX), - ("22010", SqlState::INVALID_INDICATOR_PARAMETER_VALUE), - ("HV00C", SqlState::FDW_INVALID_OPTION_INDEX), - ("22008", SqlState::DATETIME_FIELD_OVERFLOW), - ("42P06", SqlState::DUPLICATE_SCHEMA), - ("25007", SqlState::SCHEMA_AND_DATA_STATEMENT_MIXING_NOT_SUPPORTED), - ("42P20", SqlState::WINDOWING_ERROR), - ("HV091", SqlState::FDW_INVALID_DESCRIPTOR_FIELD_IDENTIFIER), - ("HV021", SqlState::FDW_INCONSISTENT_DESCRIPTOR_INFORMATION), - ("42702", SqlState::AMBIGUOUS_COLUMN), - ("02000", SqlState::NO_DATA), - ("54011", SqlState::TOO_MANY_COLUMNS), - ("HV004", SqlState::FDW_INVALID_DATA_TYPE), - ("01006", SqlState::WARNING_PRIVILEGE_NOT_REVOKED), - ("42701", SqlState::DUPLICATE_COLUMN), - ("08P01", SqlState::PROTOCOL_VIOLATION), - ("42622", SqlState::NAME_TOO_LONG), - ("P0003", SqlState::TOO_MANY_ROWS), - ("22003", SqlState::NUMERIC_VALUE_OUT_OF_RANGE), - ("42P03", SqlState::DUPLICATE_CURSOR), - ("23001", SqlState::RESTRICT_VIOLATION), - ("57000", SqlState::OPERATOR_INTERVENTION), - ("22027", SqlState::TRIM_ERROR), - ("42P12", SqlState::INVALID_DATABASE_DEFINITION), - ("3B000", SqlState::SAVEPOINT_EXCEPTION), - ("2201B", SqlState::INVALID_REGULAR_EXPRESSION), - ("22030", SqlState::DUPLICATE_JSON_OBJECT_KEY_VALUE), - 
("2F004", SqlState::S_R_E_READING_SQL_DATA_NOT_PERMITTED), - ("428C9", SqlState::GENERATED_ALWAYS), - ("2200S", SqlState::INVALID_XML_COMMENT), - ("22039", SqlState::SQL_JSON_ARRAY_NOT_FOUND), - ("42809", SqlState::WRONG_OBJECT_TYPE), - ("2201X", SqlState::INVALID_ROW_COUNT_IN_RESULT_OFFSET_CLAUSE), - ("39001", SqlState::E_R_I_E_INVALID_SQLSTATE_RETURNED), - ("25P02", SqlState::IN_FAILED_SQL_TRANSACTION), - ("0P000", SqlState::INVALID_ROLE_SPECIFICATION), - ("HV00N", SqlState::FDW_UNABLE_TO_ESTABLISH_CONNECTION), - ("53100", SqlState::DISK_FULL), - ("42601", SqlState::SYNTAX_ERROR), - ("23000", SqlState::INTEGRITY_CONSTRAINT_VIOLATION), - ("HV006", SqlState::FDW_INVALID_DATA_TYPE_DESCRIPTORS), - ("HV00B", SqlState::FDW_INVALID_HANDLE), - ("HV00Q", SqlState::FDW_SCHEMA_NOT_FOUND), - ("01000", SqlState::WARNING), - ("42883", SqlState::UNDEFINED_FUNCTION), - ("57P01", SqlState::ADMIN_SHUTDOWN), - ("22037", SqlState::NON_UNIQUE_KEYS_IN_A_JSON_OBJECT), - ("00000", SqlState::SUCCESSFUL_COMPLETION), - ("55P03", SqlState::LOCK_NOT_AVAILABLE), - ("42P01", SqlState::UNDEFINED_TABLE), - ("42830", SqlState::INVALID_FOREIGN_KEY), - ("22005", SqlState::ERROR_IN_ASSIGNMENT), - ("22025", SqlState::INVALID_ESCAPE_SEQUENCE), - ("XX002", SqlState::INDEX_CORRUPTED), - ("42P16", SqlState::INVALID_TABLE_DEFINITION), - ("55P02", SqlState::CANT_CHANGE_RUNTIME_PARAM), - ("22019", SqlState::INVALID_ESCAPE_CHARACTER), - ("P0001", SqlState::RAISE_EXCEPTION), - ("72000", SqlState::SNAPSHOT_TOO_OLD), - ("42P11", SqlState::INVALID_CURSOR_DEFINITION), - ("40P01", SqlState::T_R_DEADLOCK_DETECTED), - ("57P02", SqlState::CRASH_SHUTDOWN), - ("HV00A", SqlState::FDW_INVALID_STRING_FORMAT), - ("2F002", SqlState::S_R_E_MODIFYING_SQL_DATA_NOT_PERMITTED), - ("23503", SqlState::FOREIGN_KEY_VIOLATION), - ("40000", SqlState::TRANSACTION_ROLLBACK), - ("22032", SqlState::INVALID_JSON_TEXT), - ("2202E", SqlState::ARRAY_ELEMENT_ERROR), - ("42P19", SqlState::INVALID_RECURSION), - ("42611", 
SqlState::INVALID_COLUMN_DEFINITION), - ("42P13", SqlState::INVALID_FUNCTION_DEFINITION), - ("25003", SqlState::INAPPROPRIATE_ACCESS_MODE_FOR_BRANCH_TRANSACTION), - ("39P02", SqlState::E_R_I_E_SRF_PROTOCOL_VIOLATED), - ("XX000", SqlState::INTERNAL_ERROR), - ("08006", SqlState::CONNECTION_FAILURE), - ("57P04", SqlState::DATABASE_DROPPED), - ("42P07", SqlState::DUPLICATE_TABLE), - ("22P03", SqlState::INVALID_BINARY_REPRESENTATION), - ("22035", SqlState::NO_SQL_JSON_ITEM), - ("42P14", SqlState::INVALID_PSTATEMENT_DEFINITION), - ("01007", SqlState::WARNING_PRIVILEGE_NOT_GRANTED), - ("38004", SqlState::E_R_E_READING_SQL_DATA_NOT_PERMITTED), - ("42P21", SqlState::COLLATION_MISMATCH), - ("0Z000", SqlState::DIAGNOSTICS_EXCEPTION), - ("HV001", SqlState::FDW_OUT_OF_MEMORY), - ("0F000", SqlState::LOCATOR_EXCEPTION), - ("22013", SqlState::INVALID_PRECEDING_OR_FOLLOWING_SIZE), - ("2201E", SqlState::INVALID_ARGUMENT_FOR_LOG), - ("22011", SqlState::SUBSTRING_ERROR), - ("42602", SqlState::INVALID_NAME), - ("01004", SqlState::WARNING_STRING_DATA_RIGHT_TRUNCATION), - ("42P02", SqlState::UNDEFINED_PARAMETER), - ("2203C", SqlState::SQL_JSON_OBJECT_NOT_FOUND), - ("HV002", SqlState::FDW_DYNAMIC_PARAMETER_VALUE_NEEDED), - ("0F001", SqlState::L_E_INVALID_SPECIFICATION), - ("58P01", SqlState::UNDEFINED_FILE), - ("38001", SqlState::E_R_E_CONTAINING_SQL_NOT_PERMITTED), - ("42703", SqlState::UNDEFINED_COLUMN), - ("57P05", SqlState::IDLE_SESSION_TIMEOUT), - ("57P03", SqlState::CANNOT_CONNECT_NOW), - ("HV007", SqlState::FDW_INVALID_COLUMN_NAME), - ("22014", SqlState::INVALID_ARGUMENT_FOR_NTILE), - ("22P06", SqlState::NONSTANDARD_USE_OF_ESCAPE_CHARACTER), - ("2203F", SqlState::SQL_JSON_SCALAR_REQUIRED), - ("2200F", SqlState::ZERO_LENGTH_CHARACTER_STRING), - ("09000", SqlState::TRIGGERED_ACTION_EXCEPTION), - ("2201F", SqlState::INVALID_ARGUMENT_FOR_POWER_FUNCTION), - ("08003", SqlState::CONNECTION_DOES_NOT_EXIST), - ("38002", SqlState::E_R_E_MODIFYING_SQL_DATA_NOT_PERMITTED), - ("F0001", 
SqlState::LOCK_FILE_EXISTS), - ("42P22", SqlState::INDETERMINATE_COLLATION), - ("2200C", SqlState::INVALID_USE_OF_ESCAPE_CHARACTER), - ("2203E", SqlState::TOO_MANY_JSON_OBJECT_MEMBERS), - ("23514", SqlState::CHECK_VIOLATION), - ("22P02", SqlState::INVALID_TEXT_REPRESENTATION), - ("54023", SqlState::TOO_MANY_ARGUMENTS), - ("2200T", SqlState::INVALID_XML_PROCESSING_INSTRUCTION), - ("22016", SqlState::INVALID_ARGUMENT_FOR_NTH_VALUE), - ("25P03", SqlState::IDLE_IN_TRANSACTION_SESSION_TIMEOUT), - ("3B001", SqlState::S_E_INVALID_SPECIFICATION), - ("08001", SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), - ("22036", SqlState::NON_NUMERIC_SQL_JSON_ITEM), - ("3F000", SqlState::INVALID_SCHEMA_NAME), - ("39P01", SqlState::E_R_I_E_TRIGGER_PROTOCOL_VIOLATED), - ("22026", SqlState::STRING_DATA_LENGTH_MISMATCH), - ("42P17", SqlState::INVALID_OBJECT_DEFINITION), - ("22034", SqlState::MORE_THAN_ONE_SQL_JSON_ITEM), - ("HV000", SqlState::FDW_ERROR), - ("2200B", SqlState::ESCAPE_CHARACTER_CONFLICT), - ("HV008", SqlState::FDW_INVALID_COLUMN_NUMBER), - ("34000", SqlState::INVALID_CURSOR_NAME), - ("2201G", SqlState::INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION), - ("44000", SqlState::WITH_CHECK_OPTION_VIOLATION), - ("HV010", SqlState::FDW_FUNCTION_SEQUENCE_ERROR), - ("39004", SqlState::E_R_I_E_NULL_VALUE_NOT_ALLOWED), - ("22001", SqlState::STRING_DATA_RIGHT_TRUNCATION), - ("3D000", SqlState::INVALID_CATALOG_NAME), - ("25005", SqlState::NO_ACTIVE_SQL_TRANSACTION_FOR_BRANCH_TRANSACTION), - ("2200L", SqlState::NOT_AN_XML_DOCUMENT), - ("27000", SqlState::TRIGGERED_DATA_CHANGE_VIOLATION), - ("HV090", SqlState::FDW_INVALID_STRING_LENGTH_OR_BUFFER_LENGTH), - ("42939", SqlState::RESERVED_NAME), - ("58000", SqlState::SYSTEM_ERROR), - ("2200M", SqlState::INVALID_XML_DOCUMENT), - ("HV00L", SqlState::FDW_UNABLE_TO_CREATE_EXECUTION), - ("57014", SqlState::QUERY_CANCELED), - ("23502", SqlState::NOT_NULL_VIOLATION), - ("22002", SqlState::NULL_VALUE_NO_INDICATOR_PARAMETER), - ("HV00R", 
SqlState::FDW_TABLE_NOT_FOUND), - ("HV00P", SqlState::FDW_NO_SCHEMAS), - ("38003", SqlState::E_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED), - ("39000", SqlState::EXTERNAL_ROUTINE_INVOCATION_EXCEPTION), - ("22015", SqlState::INTERVAL_FIELD_OVERFLOW), - ("HV00K", SqlState::FDW_REPLY_HANDLE), - ("HV024", SqlState::FDW_INVALID_ATTRIBUTE_VALUE), - ("2200D", SqlState::INVALID_ESCAPE_OCTET), - ("08007", SqlState::TRANSACTION_RESOLUTION_UNKNOWN), - ("2F003", SqlState::S_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED), - ("42725", SqlState::AMBIGUOUS_FUNCTION), - ("2203A", SqlState::SQL_JSON_MEMBER_NOT_FOUND), - ("42846", SqlState::CANNOT_COERCE), - ("42P04", SqlState::DUPLICATE_DATABASE), - ("42000", SqlState::SYNTAX_ERROR_OR_ACCESS_RULE_VIOLATION), - ("2203B", SqlState::SQL_JSON_NUMBER_NOT_FOUND), - ("42P05", SqlState::DUPLICATE_PSTATEMENT), - ("53300", SqlState::TOO_MANY_CONNECTIONS), - ("53400", SqlState::CONFIGURATION_LIMIT_EXCEEDED), - ("42704", SqlState::UNDEFINED_OBJECT), - ("2202G", SqlState::INVALID_TABLESAMPLE_REPEAT), - ("22023", SqlState::INVALID_PARAMETER_VALUE), - ("53000", SqlState::INSUFFICIENT_RESOURCES), - ], -}; + #[test] + fn round_trip() { + let state = SqlState::from_code("08P01"); + assert_eq!(state, SqlState::PROTOCOL_VIOLATION); + assert_eq!(state.code(), "08P01"); + } +} diff --git a/libs/proxy/tokio-postgres2/src/prepare.rs b/libs/proxy/tokio-postgres2/src/prepare.rs index ba13a528f6..b27eabcb0e 100644 --- a/libs/proxy/tokio-postgres2/src/prepare.rs +++ b/libs/proxy/tokio-postgres2/src/prepare.rs @@ -5,9 +5,9 @@ use std::sync::Arc; use bytes::Bytes; use fallible_iterator::FallibleIterator; use futures_util::{TryStreamExt, pin_mut}; -use log::debug; use postgres_protocol2::message::backend::Message; use postgres_protocol2::message::frontend; +use tracing::debug; use crate::client::{CachedTypeInfo, InnerClient}; use crate::codec::FrontendMessage; diff --git a/libs/proxy/tokio-postgres2/src/query.rs b/libs/proxy/tokio-postgres2/src/query.rs index 
29f05fba79..106bc69d49 100644 --- a/libs/proxy/tokio-postgres2/src/query.rs +++ b/libs/proxy/tokio-postgres2/src/query.rs @@ -7,11 +7,11 @@ use std::task::{Context, Poll}; use bytes::{BufMut, Bytes, BytesMut}; use fallible_iterator::FallibleIterator; use futures_util::{Stream, ready}; -use log::{Level, debug, log_enabled}; use pin_project_lite::pin_project; use postgres_protocol2::message::backend::Message; use postgres_protocol2::message::frontend; use postgres_types2::{Format, ToSql, Type}; +use tracing::debug; use crate::client::{InnerClient, Responses}; use crate::codec::FrontendMessage; @@ -36,7 +36,7 @@ where I: IntoIterator, I::IntoIter: ExactSizeIterator, { - let buf = if log_enabled!(Level::Debug) { + let buf = if tracing::enabled!(tracing::Level::DEBUG) { let params = params.into_iter().collect::>(); debug!( "executing statement {} with parameters: {:?}", diff --git a/libs/proxy/tokio-postgres2/src/simple_query.rs b/libs/proxy/tokio-postgres2/src/simple_query.rs index f13d63983f..2cf17188cf 100644 --- a/libs/proxy/tokio-postgres2/src/simple_query.rs +++ b/libs/proxy/tokio-postgres2/src/simple_query.rs @@ -6,10 +6,10 @@ use std::task::{Context, Poll}; use bytes::Bytes; use fallible_iterator::FallibleIterator; use futures_util::{Stream, ready}; -use log::debug; use pin_project_lite::pin_project; use postgres_protocol2::message::backend::Message; use postgres_protocol2::message::frontend; +use tracing::debug; use crate::client::{InnerClient, Responses}; use crate::codec::FrontendMessage; diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 6996bb27ae..d38e13fd05 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -558,7 +558,7 @@ async fn upload_large_enough_file( ) -> usize { let header = bytes::Bytes::from_static("remote blob data content".as_bytes()); let body = bytes::Bytes::from(vec![0u8; 1024]); - let contents = 
std::iter::once(header).chain(std::iter::repeat(body).take(128)); + let contents = std::iter::once(header).chain(std::iter::repeat_n(body, 128)); let len = contents.clone().fold(0, |acc, next| acc + next.len()); diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 33ff636a79..51f88625da 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -71,6 +71,7 @@ pub struct PeerInfo { pub ts: Instant, pub pg_connstr: String, pub http_connstr: String, + pub https_connstr: Option, } pub type FullTransactionId = u64; @@ -227,6 +228,8 @@ pub struct TimelineDeleteResult { pub dir_existed: bool, } +pub type TenantDeleteResult = std::collections::HashMap; + fn lsn_invalid() -> Lsn { Lsn::INVALID } @@ -259,6 +262,8 @@ pub struct SkTimelineInfo { pub safekeeper_connstr: Option, #[serde(default)] pub http_connstr: Option, + #[serde(default)] + pub https_connstr: Option, // Minimum of all active RO replicas flush LSN #[serde(default = "lsn_invalid")] pub standby_horizon: Lsn, diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml index 60637d5b24..49a6055b1e 100644 --- a/libs/tracing-utils/Cargo.toml +++ b/libs/tracing-utils/Cargo.toml @@ -14,6 +14,7 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tracing.workspace = true tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true +pin-project-lite.workspace = true [dev-dependencies] tracing-subscriber.workspace = true # For examples in docs diff --git a/libs/tracing-utils/src/lib.rs b/libs/tracing-utils/src/lib.rs index 74992a7d03..0893aa173b 100644 --- a/libs/tracing-utils/src/lib.rs +++ b/libs/tracing-utils/src/lib.rs @@ -31,10 +31,10 @@ //! .init(); //! } //! 
``` -#![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] pub mod http; +pub mod perf_span; use opentelemetry::KeyValue; use opentelemetry::trace::TracerProvider; diff --git a/libs/tracing-utils/src/perf_span.rs b/libs/tracing-utils/src/perf_span.rs new file mode 100644 index 0000000000..16f713c67e --- /dev/null +++ b/libs/tracing-utils/src/perf_span.rs @@ -0,0 +1,144 @@ +//! Crutch module to work around tracing infrastructure deficiencies +//! +//! We wish to collect granular request spans without impacting performance +//! by much. Ideally, we should have zero overhead for a sampling rate of 0. +//! +//! The approach taken by the pageserver crate is to use a completely different +//! span hierarchy for the performance spans. Spans are explicitly stored in +//! the request context and use a different [`tracing::Subscriber`] in order +//! to avoid expensive filtering. +//! +//! [`tracing::Span`] instances record their [`tracing::Dispatch`] and, implcitly, +//! their [`tracing::Subscriber`] at creation time. However, upon exiting the span, +//! the global default [`tracing::Dispatch`] is used. This is problematic if one +//! wishes to juggle different subscribers. +//! +//! In order to work around this, this module provides a [`PerfSpan`] type which +//! wraps a [`Span`] and sets the default subscriber when exiting the span. This +//! achieves the correct routing. +//! +//! There's also a modified version of [`tracing::Instrument`] which works with +//! [`PerfSpan`]. 
+ +use core::{ + future::Future, + marker::Sized, + mem::ManuallyDrop, + pin::Pin, + task::{Context, Poll}, +}; +use pin_project_lite::pin_project; +use tracing::{Dispatch, span::Span}; + +#[derive(Debug, Clone)] +pub struct PerfSpan { + inner: ManuallyDrop, + dispatch: Dispatch, +} + +#[must_use = "once a span has been entered, it should be exited"] +pub struct PerfSpanEntered<'a> { + span: &'a PerfSpan, +} + +impl PerfSpan { + pub fn new(span: Span, dispatch: Dispatch) -> Self { + Self { + inner: ManuallyDrop::new(span), + dispatch, + } + } + + pub fn enter(&self) -> PerfSpanEntered { + if let Some(ref id) = self.inner.id() { + self.dispatch.enter(id); + } + + PerfSpanEntered { span: self } + } + + pub fn inner(&self) -> &Span { + &self.inner + } +} + +impl Drop for PerfSpan { + fn drop(&mut self) { + // Bring the desired dispatch into scope before explicitly calling + // the span destructor. This routes the span exit to the correct + // [`tracing::Subscriber`]. + let _dispatch_guard = tracing::dispatcher::set_default(&self.dispatch); + // SAFETY: ManuallyDrop in Drop implementation + unsafe { ManuallyDrop::drop(&mut self.inner) } + } +} + +impl Drop for PerfSpanEntered<'_> { + fn drop(&mut self) { + assert!(self.span.inner.id().is_some()); + + let _dispatch_guard = tracing::dispatcher::set_default(&self.span.dispatch); + self.span.dispatch.exit(&self.span.inner.id().unwrap()); + } +} + +pub trait PerfInstrument: Sized { + fn instrument(self, span: PerfSpan) -> PerfInstrumented { + PerfInstrumented { + inner: ManuallyDrop::new(self), + span, + } + } +} + +pin_project! { + #[project = PerfInstrumentedProj] + #[derive(Debug, Clone)] + #[must_use = "futures do nothing unless you `.await` or poll them"] + pub struct PerfInstrumented { + // `ManuallyDrop` is used here to to enter instrument `Drop` by entering + // `Span` and executing `ManuallyDrop::drop`. 
+ #[pin] + inner: ManuallyDrop, + span: PerfSpan, + } + + impl PinnedDrop for PerfInstrumented { + fn drop(this: Pin<&mut Self>) { + let this = this.project(); + let _enter = this.span.enter(); + // SAFETY: 1. `Pin::get_unchecked_mut()` is safe, because this isn't + // different from wrapping `T` in `Option` and calling + // `Pin::set(&mut this.inner, None)`, except avoiding + // additional memory overhead. + // 2. `ManuallyDrop::drop()` is safe, because + // `PinnedDrop::drop()` is guaranteed to be called only + // once. + unsafe { ManuallyDrop::drop(this.inner.get_unchecked_mut()) } + } + } +} + +impl<'a, T> PerfInstrumentedProj<'a, T> { + /// Get a mutable reference to the [`Span`] a pinned mutable reference to + /// the wrapped type. + fn span_and_inner_pin_mut(self) -> (&'a mut PerfSpan, Pin<&'a mut T>) { + // SAFETY: As long as `ManuallyDrop` does not move, `T` won't move + // and `inner` is valid, because `ManuallyDrop::drop` is called + // only inside `Drop` of the `Instrumented`. 
+ let inner = unsafe { self.inner.map_unchecked_mut(|v| &mut **v) }; + (self.span, inner) + } +} + +impl Future for PerfInstrumented { + type Output = T::Output; + + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let (span, inner) = self.project().span_and_inner_pin_mut(); + let _enter = span.enter(); + inner.poll(cx) + } +} + +impl PerfInstrument for T {} diff --git a/libs/utils/src/elapsed_accum.rs b/libs/utils/src/elapsed_accum.rs new file mode 100644 index 0000000000..efb2a34a95 --- /dev/null +++ b/libs/utils/src/elapsed_accum.rs @@ -0,0 +1,26 @@ +use std::time::{Duration, Instant}; + +#[derive(Default)] +pub struct ElapsedAccum { + accum: Duration, +} + +impl ElapsedAccum { + pub fn get(&self) -> Duration { + self.accum + } + pub fn guard(&mut self) -> impl Drop + '_ { + let start = Instant::now(); + scopeguard::guard(start, |last_wait_at| { + self.accum += Instant::now() - last_wait_at; + }) + } + + pub async fn measure(&mut self, fut: Fut) -> O + where + Fut: Future, + { + let _guard = self.guard(); + fut.await + } +} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 9389a27bf3..206b8bbd8f 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -93,6 +93,8 @@ pub mod try_rcu; pub mod guard_arc_swap; +pub mod elapsed_accum; + #[cfg(target_os = "linux")] pub mod linux_socket_ioctl; diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index 8f8401b35d..5fb4c5b460 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -111,9 +111,17 @@ impl OnceCell { } } + /// Like [`Self::get_or_init_detached_measured`], but without out parameter for time spent waiting. 
+ pub async fn get_or_init_detached(&self) -> Result, InitPermit> { + self.get_or_init_detached_measured(None).await + } + /// Returns a guard to an existing initialized value, or returns an unique initialization /// permit which can be used to initialize this `OnceCell` using `OnceCell::set`. - pub async fn get_or_init_detached(&self) -> Result, InitPermit> { + pub async fn get_or_init_detached_measured( + &self, + mut wait_time: Option<&mut crate::elapsed_accum::ElapsedAccum>, + ) -> Result, InitPermit> { // It looks like OnceCell::get_or_init could be implemented using this method instead of // duplication. However, that makes the future be !Send due to possibly holding on to the // MutexGuard over an await point. @@ -125,12 +133,16 @@ impl OnceCell { } guard.init_semaphore.clone() }; - { let permit = { // increment the count for the duration of queued let _guard = CountWaitingInitializers::start(self); - sem.acquire().await + let fut = sem.acquire(); + if let Some(wait_time) = wait_time.as_mut() { + wait_time.measure(fut).await + } else { + fut.await + } }; let Ok(permit) = permit else { diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 508dac231e..e0cd19817d 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -7,7 +7,7 @@ use http_utils::error::HttpErrorBody; use pageserver_api::models::*; use pageserver_api::shard::TenantShardId; pub use reqwest::Body as ReqwestBody; -use reqwest::{Certificate, IntoUrl, Method, StatusCode, Url}; +use reqwest::{IntoUrl, Method, StatusCode, Url}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; @@ -39,8 +39,8 @@ pub enum Error { #[error("Cancelled")] Cancelled, - #[error("create client: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] - CreateClient(reqwest::Error), + #[error("request timed out: {0}")] + Timeout(String), } pub type Result = std::result::Result; @@ -72,24 +72,7 @@ pub enum ForceAwaitLogicalSize { } impl 
Client { - pub fn new( - mgmt_api_endpoint: String, - jwt: Option<&str>, - ssl_ca_cert: Option, - ) -> Result { - let mut http_client = reqwest::Client::builder(); - if let Some(ssl_ca_cert) = ssl_ca_cert { - http_client = http_client.add_root_certificate(ssl_ca_cert); - } - let http_client = http_client.build().map_err(Error::CreateClient)?; - Ok(Self::from_client(http_client, mgmt_api_endpoint, jwt)) - } - - pub fn from_client( - client: reqwest::Client, - mgmt_api_endpoint: String, - jwt: Option<&str>, - ) -> Self { + pub fn new(client: reqwest::Client, mgmt_api_endpoint: String, jwt: Option<&str>) -> Self { Self { mgmt_api_endpoint, authorization_header: jwt.map(|jwt| format!("Bearer {jwt}")), @@ -103,17 +86,17 @@ impl Client { resp.json().await.map_err(Error::ReceiveBody) } - /// Get an arbitrary path and returning a streaming Response. This function is suitable - /// for pass-through/proxy use cases where we don't care what the response content looks - /// like. + /// Send an HTTP request to an arbitrary path with a desired HTTP method and returning a streaming + /// Response. This function is suitable for pass-through/proxy use cases where we don't care + /// what the response content looks like. /// /// Use/add one of the properly typed methods below if you know aren't proxying, and /// know what kind of response you expect. 
- pub async fn get_raw(&self, path: String) -> Result { + pub async fn op_raw(&self, method: Method, path: String) -> Result { debug_assert!(path.starts_with('/')); let uri = format!("{}{}", self.mgmt_api_endpoint, path); - let mut req = self.client.request(Method::GET, uri); + let mut req = self.client.request(method, uri); if let Some(value) = &self.authorization_header { req = req.header(reqwest::header::AUTHORIZATION, value); } diff --git a/pageserver/pagebench/src/cmd/aux_files.rs b/pageserver/pagebench/src/cmd/aux_files.rs index 394a954c30..6441c047c2 100644 --- a/pageserver/pagebench/src/cmd/aux_files.rs +++ b/pageserver/pagebench/src/cmd/aux_files.rs @@ -34,10 +34,10 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { let args: &'static Args = Box::leak(Box::new(args)); let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + reqwest::Client::new(), // TODO: support ssl_ca_file for https APIs in pagebench. args.mgmt_api_endpoint.clone(), args.pageserver_jwt.as_deref(), - None, // TODO: support ssl_ca_file for https APIs in pagebench. - )?); + )); // discover targets let timelines: Vec = crate::util::cli::targets::discover( diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs index d3013ded70..43ad92980c 100644 --- a/pageserver/pagebench/src/cmd/basebackup.rs +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -75,10 +75,10 @@ async fn main_impl( let args: &'static Args = Box::leak(Box::new(args)); let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + reqwest::Client::new(), // TODO: support ssl_ca_file for https APIs in pagebench. args.mgmt_api_endpoint.clone(), args.pageserver_jwt.as_deref(), - None, // TODO: support ssl_ca_file for https APIs in pagebench. 
- )?); + )); // discover targets let timelines: Vec = crate::util::cli::targets::discover( diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 969cf24b93..6fd1c00eca 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -123,10 +123,10 @@ async fn main_impl( let args: &'static Args = Box::leak(Box::new(args)); let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + reqwest::Client::new(), // TODO: support ssl_ca_file for https APIs in pagebench. args.mgmt_api_endpoint.clone(), args.pageserver_jwt.as_deref(), - None, // TODO: support ssl_ca_file for https APIs in pagebench. - )?); + )); if let Some(engine_str) = &args.set_io_engine { mgmt_api_client.put_io_engine(engine_str).await?; diff --git a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs index a77d3000cc..9ff1e638c4 100644 --- a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs +++ b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs @@ -81,10 +81,10 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { let args: &'static Args = Box::leak(Box::new(args)); let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + reqwest::Client::new(), // TODO: support ssl_ca_file for https APIs in pagebench. args.mgmt_api_endpoint.clone(), args.pageserver_jwt.as_deref(), - None, // TODO: support ssl_ca_file for https APIs in pagebench. 
- )?); + )); if let Some(engine_str) = &args.set_io_engine { mgmt_api_client.put_io_engine(engine_str).await?; diff --git a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs index 2f919ec652..779bacbfd4 100644 --- a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs +++ b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs @@ -38,10 +38,10 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { let args: &'static Args = Box::leak(Box::new(args)); let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + reqwest::Client::new(), // TODO: support ssl_ca_file for https APIs in pagebench. args.mgmt_api_endpoint.clone(), args.pageserver_jwt.as_deref(), - None, // TODO: support ssl_ca_file for https APIs in pagebench. - )?); + )); // discover targets let timelines: Vec = crate::util::cli::targets::discover( diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 3ab6d79546..9a8494292d 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -12,11 +12,12 @@ use std::time::Duration; use anyhow::{Context, anyhow}; use camino::Utf8Path; use clap::{Arg, ArgAction, Command}; +use http_utils::tls_certs::ReloadingCertificateResolver; use metrics::launch_timestamp::{LaunchTimestamp, set_launch_timestamp_metric}; use metrics::set_build_info_metric; use nix::sys::socket::{setsockopt, sockopt}; -use pageserver::config::{PageServerConf, PageserverIdentity}; -use pageserver::controller_upcall_client::ControllerUpcallClient; +use pageserver::config::{PageServerConf, PageserverIdentity, ignored_fields}; +use pageserver::controller_upcall_client::StorageControllerUpcallClient; use pageserver::deletion_queue::DeletionQueue; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; @@ -34,6 +35,7 @@ use 
tokio::signal::unix::SignalKind; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::*; +use tracing_utils::OtelGuard; use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::crashsafe::syncfs; use utils::logging::TracingErrorLayerEnablement; @@ -96,7 +98,7 @@ fn main() -> anyhow::Result<()> { env::set_current_dir(&workdir) .with_context(|| format!("Failed to set application's current dir to '{workdir}'"))?; - let conf = initialize_config(&identity_file_path, &cfg_file_path, &workdir)?; + let (conf, ignored) = initialize_config(&identity_file_path, &cfg_file_path, &workdir)?; // Initialize logging. // @@ -117,6 +119,21 @@ fn main() -> anyhow::Result<()> { logging::Output::Stdout, )?; + let otel_enablement = match &conf.tracing { + Some(cfg) => tracing_utils::OtelEnablement::Enabled { + service_name: "pageserver".to_string(), + export_config: (&cfg.export_config).into(), + runtime: *COMPUTE_REQUEST_RUNTIME, + }, + None => tracing_utils::OtelEnablement::Disabled, + }; + + let otel_guard = tracing_utils::init_performance_tracing(otel_enablement); + + if otel_guard.is_some() { + info!(?conf.tracing, "starting with OTEL tracing enabled"); + } + // mind the order required here: 1. logging, 2. panic_hook, 3. sentry. // disarming this hook on pageserver, because we never tear down tracing. logging::replace_panic_hook_with_tracing_panic_hook().forget(); @@ -127,7 +144,17 @@ fn main() -> anyhow::Result<()> { &[("node_id", &conf.id.to_string())], ); - // after setting up logging, log the effective IO engine choice and read path implementations + // Warn about ignored config items; see pageserver_api::config::ConfigToml + // doc comment for rationale why we prefer this over serde(deny_unknown_fields). 
+ { + let ignored_fields::Paths { paths } = &ignored; + for path in paths { + warn!(?path, "ignoring unknown configuration item"); + } + } + + // Log configuration items for feature-flag-like config + // (maybe we should automate this with a visitor?). info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode"); info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol"); @@ -190,7 +217,7 @@ fn main() -> anyhow::Result<()> { tracing::info!("Initializing page_cache..."); page_cache::init(conf.page_cache_size); - start_pageserver(launch_ts, conf).context("Failed to start pageserver")?; + start_pageserver(launch_ts, conf, ignored, otel_guard).context("Failed to start pageserver")?; scenario.teardown(); Ok(()) @@ -200,7 +227,7 @@ fn initialize_config( identity_file_path: &Utf8Path, cfg_file_path: &Utf8Path, workdir: &Utf8Path, -) -> anyhow::Result<&'static PageServerConf> { +) -> anyhow::Result<(&'static PageServerConf, ignored_fields::Paths)> { // The deployment orchestrator writes out an indentity file containing the node id // for all pageservers. This file is the source of truth for the node id. In order // to allow for rolling back pageserver releases, the node id is also included in @@ -229,15 +256,36 @@ fn initialize_config( let config_file_contents = std::fs::read_to_string(cfg_file_path).context("read config file from filesystem")?; - let config_toml = serde_path_to_error::deserialize( - toml_edit::de::Deserializer::from_str(&config_file_contents) - .context("build toml deserializer")?, - ) - .context("deserialize config toml")?; + + // Deserialize the config file contents into a ConfigToml. 
+ let config_toml: pageserver_api::config::ConfigToml = { + let deserializer = toml_edit::de::Deserializer::from_str(&config_file_contents) + .context("build toml deserializer")?; + let mut path_to_error_track = serde_path_to_error::Track::new(); + let deserializer = + serde_path_to_error::Deserializer::new(deserializer, &mut path_to_error_track); + serde::Deserialize::deserialize(deserializer).context("deserialize config toml")? + }; + + // Find unknown fields by re-serializing the parsed ConfigToml and comparing it to the on-disk file. + // Any fields that are only in the on-disk version are unknown. + // (The assumption here is that the ConfigToml doesn't to skip_serializing_if.) + // (Make sure to read the ConfigToml doc comment on why we only want to warn about, but not fail startup, on unknown fields). + let ignored = { + let ondisk_toml = config_file_contents + .parse::() + .context("parse original config as toml document")?; + let parsed_toml = toml_edit::ser::to_document(&config_toml) + .context("re-serialize config to toml document")?; + pageserver::config::ignored_fields::find(ondisk_toml, parsed_toml) + }; + + // Construct the runtime god object (it's called PageServerConf but actually is just global shared state). 
let conf = PageServerConf::parse_and_validate(identity.id, config_toml, workdir) .context("runtime-validation of config toml")?; + let conf = Box::leak(Box::new(conf)); - Ok(Box::leak(Box::new(conf))) + Ok((conf, ignored)) } struct WaitForPhaseResult { @@ -288,6 +336,8 @@ fn startup_checkpoint(started_at: Instant, phase: &str, human_phase: &str) { fn start_pageserver( launch_ts: &'static LaunchTimestamp, conf: &'static PageServerConf, + ignored: ignored_fields::Paths, + otel_guard: Option, ) -> anyhow::Result<()> { // Monotonic time for later calculating startup duration let started_startup_at = Instant::now(); @@ -310,7 +360,7 @@ fn start_pageserver( pageserver::metrics::tokio_epoll_uring::Collector::new(), )) .unwrap(); - pageserver::preinitialize_metrics(conf); + pageserver::preinitialize_metrics(conf, ignored); // If any failpoints were set from FAILPOINTS environment variable, // print them to the log for debugging purposes @@ -427,7 +477,7 @@ fn start_pageserver( // Set up deletion queue let (deletion_queue, deletion_workers) = DeletionQueue::new( remote_storage.clone(), - ControllerUpcallClient::new(conf, &shutdown_pageserver), + StorageControllerUpcallClient::new(conf, &shutdown_pageserver)?, conf, ); deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle()); @@ -621,12 +671,15 @@ fn start_pageserver( let https_task = match https_listener { Some(https_listener) => { - let certs = http_utils::tls_certs::load_cert_chain(&conf.ssl_cert_file)?; - let key = http_utils::tls_certs::load_private_key(&conf.ssl_key_file)?; + let resolver = MGMT_REQUEST_RUNTIME.block_on(ReloadingCertificateResolver::new( + &conf.ssl_key_file, + &conf.ssl_cert_file, + conf.ssl_cert_reload_period, + ))?; let server_config = rustls::ServerConfig::builder() .with_no_client_auth() - .with_single_cert(certs, key)?; + .with_cert_resolver(resolver); let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config)); @@ -670,13 +723,21 @@ fn start_pageserver( // Spawn a task to listen 
for libpq connections. It will spawn further tasks // for each connection. We created the listener earlier already. - let page_service = page_service::spawn(conf, tenant_manager.clone(), pg_auth, { - let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it - pageserver_listener - .set_nonblocking(true) - .context("set listener to nonblocking")?; - tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")? - }); + let perf_trace_dispatch = otel_guard.as_ref().map(|g| g.dispatch.clone()); + let page_service = page_service::spawn( + conf, + tenant_manager.clone(), + pg_auth, + perf_trace_dispatch, + { + let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it + pageserver_listener + .set_nonblocking(true) + .context("set listener to nonblocking")?; + tokio::net::TcpListener::from_std(pageserver_listener) + .context("create tokio listener")? + }, + ); // All started up! Now just sit and wait for shutdown signal. BACKGROUND_RUNTIME.block_on(async move { diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 562a16a14e..ccc29e59d4 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -4,6 +4,8 @@ //! file, or on the command line. //! See also `settings.md` for better description on every parameter. 
+pub mod ignored_fields; + use std::env; use std::num::NonZeroUsize; use std::sync::Arc; @@ -17,7 +19,7 @@ use pageserver_api::models::ImageCompressionAlgorithm; use pageserver_api::shard::TenantShardId; use postgres_backend::AuthType; use remote_storage::{RemotePath, RemoteStorageConfig}; -use reqwest::Url; +use reqwest::{Certificate, Url}; use storage_broker::Uri; use utils::id::{NodeId, TimelineId}; use utils::logging::{LogFormat, SecretString}; @@ -43,7 +45,7 @@ use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, virtual_file}; /// /// For fields that require additional validation or filling in of defaults at runtime, /// check for examples in the [`PageServerConf::parse_and_validate`] method. -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone)] pub struct PageServerConf { // Identifier of that particular pageserver so e g safekeepers // can safely distinguish different pageservers @@ -56,8 +58,17 @@ pub struct PageServerConf { /// Example: 127.0.0.1:9899 pub listen_https_addr: Option, + /// Path to a file with certificate's private key for https API. + /// Default: server.key pub ssl_key_file: Utf8PathBuf, + /// Path to a file with a X509 certificate for https API. + /// Default: server.crt pub ssl_cert_file: Utf8PathBuf, + /// Period to reload certificate and private key from files. + /// Default: 60s. + pub ssl_cert_reload_period: Duration, + /// Trusted root CA certificates to use in https APIs. + pub ssl_ca_certs: Vec, /// Current availability zone. Used for traffic metrics. pub availability_zone: Option, @@ -94,7 +105,7 @@ pub struct PageServerConf { pub remote_storage_config: Option, - pub default_tenant_conf: crate::tenant::config::TenantConf, + pub default_tenant_conf: pageserver_api::config::TenantConfigToml, /// Storage broker endpoints to connect to. pub broker_endpoint: Uri, @@ -206,6 +217,8 @@ pub struct PageServerConf { /// When set, include visible layers in the next uploaded heatmaps of an unarchived timeline. 
pub generate_unarchival_heatmap: bool, + + pub tracing: Option, } /// Token for authentication to safekeepers @@ -325,6 +338,8 @@ impl PageServerConf { listen_https_addr, ssl_key_file, ssl_cert_file, + ssl_cert_reload_period, + ssl_ca_file, availability_zone, wait_lsn_timeout, wal_redo_timeout, @@ -375,6 +390,7 @@ impl PageServerConf { validate_wal_contiguity, load_previous_heatmap, generate_unarchival_heatmap, + tracing, } = config_toml; let mut conf = PageServerConf { @@ -386,6 +402,7 @@ impl PageServerConf { listen_https_addr, ssl_key_file, ssl_cert_file, + ssl_cert_reload_period, availability_zone, wait_lsn_timeout, wal_redo_timeout, @@ -423,6 +440,7 @@ impl PageServerConf { wal_receiver_protocol, page_service_pipelining, get_vectored_concurrent_io, + tracing, // ------------------------------------------------------------ // fields that require additional validation or custom handling @@ -469,6 +487,13 @@ impl PageServerConf { validate_wal_contiguity: validate_wal_contiguity.unwrap_or(false), load_previous_heatmap: load_previous_heatmap.unwrap_or(true), generate_unarchival_heatmap: generate_unarchival_heatmap.unwrap_or(true), + ssl_ca_certs: match ssl_ca_file { + Some(ssl_ca_file) => { + let buf = std::fs::read(ssl_ca_file)?; + Certificate::from_pem_bundle(&buf)? 
+ } + None => Vec::new(), + }, }; // ------------------------------------------------------------ @@ -487,6 +512,17 @@ impl PageServerConf { ); } + if let Some(tracing_config) = conf.tracing.as_ref() { + let ratio = &tracing_config.sampling_ratio; + ensure!( + ratio.denominator != 0 && ratio.denominator >= ratio.numerator, + format!( + "Invalid sampling ratio: {}/{}", + ratio.numerator, ratio.denominator + ) + ); + } + IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance) .map_err(anyhow::Error::msg) .with_context(|| { @@ -526,7 +562,6 @@ impl PageServerConf { } #[derive(serde::Deserialize, serde::Serialize)] -#[serde(deny_unknown_fields)] pub struct PageserverIdentity { pub id: NodeId, } @@ -598,82 +633,4 @@ mod tests { PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir) .expect("parse_and_validate"); } - - /// If there's a typo in the pageserver config, we'd rather catch that typo - /// and fail pageserver startup than silently ignoring the typo, leaving whoever - /// made it in the believe that their config change is effective. - /// - /// The default in serde is to allow unknown fields, so, we rely - /// on developer+review discipline to add `deny_unknown_fields` when adding - /// new structs to the config, and these tests here as a regression test. - /// - /// The alternative to all of this would be to allow unknown fields in the config. - /// To catch them, we could have a config check tool or mgmt API endpoint that - /// compares the effective config with the TOML on disk and makes sure that - /// the on-disk TOML is a strict subset of the effective config. - mod unknown_fields_handling { - macro_rules! 
test { - ($short_name:ident, $input:expr) => { - #[test] - fn $short_name() { - let input = $input; - let err = toml_edit::de::from_str::(&input) - .expect_err("some_invalid_field is an invalid field"); - dbg!(&err); - assert!(err.to_string().contains("some_invalid_field")); - } - }; - } - use indoc::indoc; - - test!( - toplevel, - indoc! {r#" - some_invalid_field = 23 - "#} - ); - - test!( - toplevel_nested, - indoc! {r#" - [some_invalid_field] - foo = 23 - "#} - ); - - test!( - disk_usage_based_eviction, - indoc! {r#" - [disk_usage_based_eviction] - some_invalid_field = 23 - "#} - ); - - test!( - tenant_config, - indoc! {r#" - [tenant_config] - some_invalid_field = 23 - "#} - ); - - test!( - l0_flush, - indoc! {r#" - [l0_flush] - mode = "direct" - some_invalid_field = 23 - "#} - ); - - // TODO: fix this => https://github.com/neondatabase/neon/issues/8915 - // test!( - // remote_storage_config, - // indoc! {r#" - // [remote_storage_config] - // local_path = "/nonexistent" - // some_invalid_field = 23 - // "#} - // ); - } } diff --git a/pageserver/src/config/ignored_fields.rs b/pageserver/src/config/ignored_fields.rs new file mode 100644 index 0000000000..68d0823604 --- /dev/null +++ b/pageserver/src/config/ignored_fields.rs @@ -0,0 +1,179 @@ +//! Check for fields in the on-disk config file that were ignored when +//! deserializing [`pageserver_api::config::ConfigToml`]. +//! +//! This could have been part of the [`pageserver_api::config`] module, +//! but the way we identify unused fields in this module +//! is specific to the format (TOML) and the implementation of the +//! deserialization for that format ([`toml_edit`]). + +use std::collections::HashSet; + +use itertools::Itertools; + +/// Pass in the user-specified config and the re-serialized [`pageserver_api::config::ConfigToml`]. +/// The returned [`Paths`] contains the paths to the fields that were ignored by deserialization +/// of the [`pageserver_api::config::ConfigToml`]. 
+pub fn find(user_specified: toml_edit::DocumentMut, reserialized: toml_edit::DocumentMut) -> Paths { + let user_specified = paths(user_specified); + let reserialized = paths(reserialized); + fn paths(doc: toml_edit::DocumentMut) -> HashSet { + let mut out = Vec::new(); + let mut visitor = PathsVisitor::new(&mut out); + visitor.visit_table_like(doc.as_table()); + HashSet::from_iter(out) + } + + let mut ignored = HashSet::new(); + + // O(n) because of HashSet + for path in user_specified { + if !reserialized.contains(&path) { + ignored.insert(path); + } + } + + Paths { + paths: ignored + .into_iter() + // sort lexicographically for deterministic output + .sorted() + .collect(), + } +} + +pub struct Paths { + pub paths: Vec, +} + +struct PathsVisitor<'a> { + stack: Vec, + out: &'a mut Vec, +} + +impl<'a> PathsVisitor<'a> { + fn new(out: &'a mut Vec) -> Self { + Self { + stack: Vec::new(), + out, + } + } + + fn visit_table_like(&mut self, table_like: &dyn toml_edit::TableLike) { + for (entry, item) in table_like.iter() { + self.stack.push(entry.to_string()); + self.visit_item(item); + self.stack.pop(); + } + } + + fn visit_item(&mut self, item: &toml_edit::Item) { + match item { + toml_edit::Item::None => (), + toml_edit::Item::Value(value) => self.visit_value(value), + toml_edit::Item::Table(table) => { + self.visit_table_like(table); + } + toml_edit::Item::ArrayOfTables(array_of_tables) => { + for (i, table) in array_of_tables.iter().enumerate() { + self.stack.push(format!("[{i}]")); + self.visit_table_like(table); + self.stack.pop(); + } + } + } + } + + fn visit_value(&mut self, value: &toml_edit::Value) { + match value { + toml_edit::Value::String(_) + | toml_edit::Value::Integer(_) + | toml_edit::Value::Float(_) + | toml_edit::Value::Boolean(_) + | toml_edit::Value::Datetime(_) => self.out.push(self.stack.join(".")), + toml_edit::Value::Array(array) => { + for (i, value) in array.iter().enumerate() { + self.stack.push(format!("[{i}]")); + self.visit_value(value); 
+ self.stack.pop(); + } + } + toml_edit::Value::InlineTable(inline_table) => self.visit_table_like(inline_table), + } + } +} + +#[cfg(test)] +pub(crate) mod tests { + + fn test_impl(original: &str, parsed: &str, expect: [&str; 1]) { + let original: toml_edit::DocumentMut = original.parse().expect("parse original config"); + let parsed: toml_edit::DocumentMut = parsed.parse().expect("parse re-serialized config"); + + let super::Paths { paths: actual } = super::find(original, parsed); + assert_eq!(actual, &expect); + } + + #[test] + fn top_level() { + test_impl( + r#" + [a] + b = 1 + c = 2 + d = 3 + "#, + r#" + [a] + b = 1 + c = 2 + "#, + ["a.d"], + ); + } + + #[test] + fn nested() { + test_impl( + r#" + [a.b.c] + d = 23 + "#, + r#" + [a] + e = 42 + "#, + ["a.b.c.d"], + ); + } + + #[test] + fn array_of_tables() { + test_impl( + r#" + [[a]] + b = 1 + c = 2 + d = 3 + "#, + r#" + [[a]] + b = 1 + c = 2 + "#, + ["a.[0].d"], + ); + } + + #[test] + fn array() { + test_impl( + r#" + foo = [ {bar = 23} ] + "#, + r#" + foo = [ { blup = 42 }] + "#, + ["foo.[0].bar"], + ); + } +} diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index d2caf030df..04dcca4299 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -89,7 +89,7 @@ //! [`RequestContext`] argument. Functions in the middle of the call chain //! only need to pass it on. -use std::sync::Arc; +use std::{sync::Arc, time::Duration}; use once_cell::sync::Lazy; use tracing::warn; @@ -100,6 +100,12 @@ use crate::{ task_mgr::TaskKind, tenant::Timeline, }; +use futures::FutureExt; +use futures::future::BoxFuture; +use std::future::Future; +use tracing_utils::perf_span::{PerfInstrument, PerfSpan}; + +use tracing::{Dispatch, Span}; // The main structure of this module, see module-level comment. 
pub struct RequestContext { @@ -109,6 +115,8 @@ pub struct RequestContext { page_content_kind: PageContentKind, read_path_debug: bool, scope: Scope, + perf_span: Option, + perf_span_dispatch: Option, } #[derive(Clone)] @@ -263,22 +271,15 @@ impl RequestContextBuilder { page_content_kind: PageContentKind::Unknown, read_path_debug: false, scope: Scope::new_global(), + perf_span: None, + perf_span_dispatch: None, }, } } - pub fn extend(original: &RequestContext) -> Self { + pub fn from(original: &RequestContext) -> Self { Self { - // This is like a Copy, but avoid implementing Copy because ordinary users of - // RequestContext should always move or ref it. - inner: RequestContext { - task_kind: original.task_kind, - download_behavior: original.download_behavior, - access_stats_behavior: original.access_stats_behavior, - page_content_kind: original.page_content_kind, - read_path_debug: original.read_path_debug, - scope: original.scope.clone(), - }, + inner: original.clone(), } } @@ -316,12 +317,74 @@ impl RequestContextBuilder { self } - pub fn build(self) -> RequestContext { + pub(crate) fn perf_span_dispatch(mut self, dispatch: Option) -> Self { + self.inner.perf_span_dispatch = dispatch; + self + } + + pub fn root_perf_span(mut self, make_span: Fn) -> Self + where + Fn: FnOnce() -> Span, + { + assert!(self.inner.perf_span.is_none()); + assert!(self.inner.perf_span_dispatch.is_some()); + + let dispatcher = self.inner.perf_span_dispatch.as_ref().unwrap(); + let new_span = tracing::dispatcher::with_default(dispatcher, make_span); + + self.inner.perf_span = Some(PerfSpan::new(new_span, dispatcher.clone())); + + self + } + + pub fn perf_span(mut self, make_span: Fn) -> Self + where + Fn: FnOnce(&Span) -> Span, + { + if let Some(ref perf_span) = self.inner.perf_span { + assert!(self.inner.perf_span_dispatch.is_some()); + let dispatcher = self.inner.perf_span_dispatch.as_ref().unwrap(); + + let new_span = + tracing::dispatcher::with_default(dispatcher, || 
make_span(perf_span.inner())); + + self.inner.perf_span = Some(PerfSpan::new(new_span, dispatcher.clone())); + } + + self + } + + pub fn root(self) -> RequestContext { + self.inner + } + + pub fn attached_child(self) -> RequestContext { + self.inner + } + + pub fn detached_child(self) -> RequestContext { self.inner } } impl RequestContext { + /// Private clone implementation + /// + /// Callers should use the [`RequestContextBuilder`] or child spawning APIs of + /// [`RequestContext`]. + fn clone(&self) -> Self { + Self { + task_kind: self.task_kind, + download_behavior: self.download_behavior, + access_stats_behavior: self.access_stats_behavior, + page_content_kind: self.page_content_kind, + read_path_debug: self.read_path_debug, + scope: self.scope.clone(), + perf_span: self.perf_span.clone(), + perf_span_dispatch: self.perf_span_dispatch.clone(), + } + } + /// Create a new RequestContext that has no parent. /// /// The function is called `new` because, once we add children @@ -337,7 +400,7 @@ impl RequestContext { pub fn new(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { RequestContextBuilder::new(task_kind) .download_behavior(download_behavior) - .build() + .root() } /// Create a detached child context for a task that may outlive `self`. @@ -358,7 +421,10 @@ impl RequestContext { /// /// We could make new calls to this function fail if `self` is already canceled. pub fn detached_child(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { - self.child_impl(task_kind, download_behavior) + RequestContextBuilder::from(self) + .task_kind(task_kind) + .download_behavior(download_behavior) + .detached_child() } /// Create a child of context `self` for a task that shall not outlive `self`. @@ -382,7 +448,7 @@ impl RequestContext { /// The method to wait for child tasks would return an error, indicating /// that the child task was not started because the context was canceled. 
pub fn attached_child(&self) -> Self { - self.child_impl(self.task_kind(), self.download_behavior()) + RequestContextBuilder::from(self).attached_child() } /// Use this function when you should be creating a child context using @@ -397,17 +463,10 @@ impl RequestContext { Self::new(task_kind, download_behavior) } - fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { - RequestContextBuilder::extend(self) - .task_kind(task_kind) - .download_behavior(download_behavior) - .build() - } - pub fn with_scope_timeline(&self, timeline: &Arc) -> Self { - RequestContextBuilder::extend(self) + RequestContextBuilder::from(self) .scope(Scope::new_timeline(timeline)) - .build() + .attached_child() } pub(crate) fn with_scope_page_service_pagestream( @@ -416,9 +475,9 @@ impl RequestContext { crate::page_service::TenantManagerTypes, >, ) -> Self { - RequestContextBuilder::extend(self) + RequestContextBuilder::from(self) .scope(Scope::new_page_service_pagestream(timeline_handle)) - .build() + .attached_child() } pub fn with_scope_secondary_timeline( @@ -426,28 +485,30 @@ impl RequestContext { tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, ) -> Self { - RequestContextBuilder::extend(self) + RequestContextBuilder::from(self) .scope(Scope::new_secondary_timeline(tenant_shard_id, timeline_id)) - .build() + .attached_child() } pub fn with_scope_secondary_tenant(&self, tenant_shard_id: &TenantShardId) -> Self { - RequestContextBuilder::extend(self) + RequestContextBuilder::from(self) .scope(Scope::new_secondary_tenant(tenant_shard_id)) - .build() + .attached_child() } #[cfg(test)] pub fn with_scope_unit_test(&self) -> Self { - RequestContextBuilder::new(TaskKind::UnitTest) + RequestContextBuilder::from(self) + .task_kind(TaskKind::UnitTest) .scope(Scope::new_unit_test()) - .build() + .attached_child() } pub fn with_scope_debug_tools(&self) -> Self { - RequestContextBuilder::new(TaskKind::DebugTool) + RequestContextBuilder::from(self) + 
.task_kind(TaskKind::DebugTool) .scope(Scope::new_debug_tools()) - .build() + .attached_child() } pub fn task_kind(&self) -> TaskKind { @@ -504,4 +565,76 @@ impl RequestContext { Scope::DebugTools { io_size_metrics } => io_size_metrics, } } + + pub(crate) fn ondemand_download_wait_observe(&self, duration: Duration) { + if duration == Duration::ZERO { + return; + } + + match &self.scope { + Scope::Timeline { arc_arc } => arc_arc + .wait_ondemand_download_time + .observe(self.task_kind, duration), + _ => { + use once_cell::sync::Lazy; + use std::sync::Mutex; + use std::time::Duration; + use utils::rate_limit::RateLimit; + static LIMIT: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(1)))); + let mut guard = LIMIT.lock().unwrap(); + guard.call2(|rate_limit_stats| { + warn!( + %rate_limit_stats, + backtrace=%std::backtrace::Backtrace::force_capture(), + "ondemand downloads should always happen within timeline scope", + ); + }); + } + } + } + + pub(crate) fn perf_follows_from(&self, from: &RequestContext) { + if let (Some(span), Some(from_span)) = (&self.perf_span, &from.perf_span) { + span.inner().follows_from(from_span.inner()); + } + } + + pub(crate) fn has_perf_span(&self) -> bool { + self.perf_span.is_some() + } } + +/// [`Future`] extension trait that allow for creating performance +/// spans on sampled requests +pub(crate) trait PerfInstrumentFutureExt<'a>: Future + Send { + /// Instrument this future with a new performance span when the + /// provided request context indicates the originator request + /// was sampled. Otherwise, just box the future and return it as is. 
+ fn maybe_perf_instrument( + self, + ctx: &RequestContext, + make_span: Fn, + ) -> BoxFuture<'a, Self::Output> + where + Self: Sized + 'a, + Fn: FnOnce(&Span) -> Span, + { + match &ctx.perf_span { + Some(perf_span) => { + assert!(ctx.perf_span_dispatch.is_some()); + let dispatcher = ctx.perf_span_dispatch.as_ref().unwrap(); + + let new_span = + tracing::dispatcher::with_default(dispatcher, || make_span(perf_span.inner())); + + let new_perf_span = PerfSpan::new(new_span, dispatcher.clone()); + self.instrument(new_perf_span).boxed() + } + None => self.boxed(), + } + } +} + +// Implement the trait for all types that satisfy the trait bounds +impl<'a, T: Future + Send + 'a> PerfInstrumentFutureExt<'a> for T {} diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index 745d04cf62..fd5fbfcba9 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -21,10 +21,7 @@ use crate::virtual_file::on_fatal_io_error; /// The Pageserver's client for using the storage controller upcall API: this is a small API /// for dealing with generations (see docs/rfcs/025-generation-numbers.md). -/// -/// The server presenting this API may either be the storage controller or some other -/// service (such as the Neon control plane) providing a store of generation numbers. -pub struct ControllerUpcallClient { +pub struct StorageControllerUpcallClient { http_client: reqwest::Client, base_url: Url, node_id: NodeId, @@ -37,7 +34,7 @@ pub enum RetryForeverError { ShuttingDown, } -pub trait ControlPlaneGenerationsApi { +pub trait StorageControllerUpcallApi { fn re_attach( &self, conf: &PageServerConf, @@ -50,13 +47,16 @@ pub trait ControlPlaneGenerationsApi { ) -> impl Future, RetryForeverError>> + Send; } -impl ControllerUpcallClient { +impl StorageControllerUpcallClient { /// A None return value indicates that the input `conf` object does not have control /// plane API enabled. 
- pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option { + pub fn new( + conf: &'static PageServerConf, + cancel: &CancellationToken, + ) -> Result, reqwest::Error> { let mut url = match conf.control_plane_api.as_ref() { Some(u) => u.clone(), - None => return None, + None => return Ok(None), }; if let Ok(mut segs) = url.path_segments_mut() { @@ -76,12 +76,16 @@ impl ControllerUpcallClient { client = client.default_headers(headers); } - Some(Self { - http_client: client.build().expect("Failed to construct HTTP client"), + for ssl_ca_cert in &conf.ssl_ca_certs { + client = client.add_root_certificate(ssl_ca_cert.clone()); + } + + Ok(Some(Self { + http_client: client.build()?, base_url: url, node_id: conf.id, cancel: cancel.clone(), - }) + })) } #[tracing::instrument(skip_all)] @@ -124,7 +128,7 @@ impl ControllerUpcallClient { } } -impl ControlPlaneGenerationsApi for ControllerUpcallClient { +impl StorageControllerUpcallApi for StorageControllerUpcallClient { /// Block until we get a successful response, or error out if we are shut down #[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context async fn re_attach( diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 8118f66252..d9c1c07b10 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -26,7 +26,7 @@ use self::deleter::Deleter; use self::list_writer::{DeletionOp, ListWriter, RecoverOp}; use self::validator::Validator; use crate::config::PageServerConf; -use crate::controller_upcall_client::ControlPlaneGenerationsApi; +use crate::controller_upcall_client::StorageControllerUpcallApi; use crate::metrics; use crate::tenant::remote_timeline_client::{LayerFileMetadata, remote_timeline_path}; use crate::tenant::storage_layer::LayerName; @@ -76,7 +76,7 @@ pub struct DeletionQueue { /// worker objects themselves public pub struct DeletionQueueWorkers where - C: ControlPlaneGenerationsApi + 
Send + Sync, + C: StorageControllerUpcallApi + Send + Sync, { frontend: ListWriter, backend: Validator, @@ -85,7 +85,7 @@ where impl DeletionQueueWorkers where - C: ControlPlaneGenerationsApi + Send + Sync + 'static, + C: StorageControllerUpcallApi + Send + Sync + 'static, { pub fn spawn_with(mut self, runtime: &tokio::runtime::Handle) -> tokio::task::JoinHandle<()> { let jh_frontend = runtime.spawn(async move { @@ -589,7 +589,7 @@ impl DeletionQueue { conf: &'static PageServerConf, ) -> (Self, DeletionQueueWorkers) where - C: ControlPlaneGenerationsApi + Send + Sync, + C: StorageControllerUpcallApi + Send + Sync, { // Unbounded channel: enables non-async functions to submit deletions. The actual length is // constrained by how promptly the ListWriter wakes up and drains it, which should be frequent @@ -691,7 +691,7 @@ mod test { harness: TenantHarness, remote_fs_dir: Utf8PathBuf, storage: GenericRemoteStorage, - mock_control_plane: MockControlPlane, + mock_control_plane: MockStorageController, deletion_queue: DeletionQueue, worker_join: JoinHandle<()>, } @@ -751,11 +751,11 @@ mod test { } #[derive(Debug, Clone)] - struct MockControlPlane { + struct MockStorageController { pub latest_generation: std::sync::Arc>>, } - impl MockControlPlane { + impl MockStorageController { fn new() -> Self { Self { latest_generation: Arc::default(), @@ -763,7 +763,7 @@ mod test { } } - impl ControlPlaneGenerationsApi for MockControlPlane { + impl StorageControllerUpcallApi for MockStorageController { async fn re_attach( &self, _conf: &PageServerConf, @@ -810,7 +810,7 @@ mod test { .await .unwrap(); - let mock_control_plane = MockControlPlane::new(); + let mock_control_plane = MockStorageController::new(); let (deletion_queue, worker) = DeletionQueue::new( storage.clone(), diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs index b0ce2b80b4..4e775f15eb 100644 --- a/pageserver/src/deletion_queue/validator.rs +++ 
b/pageserver/src/deletion_queue/validator.rs @@ -25,7 +25,7 @@ use tracing::{debug, info, warn}; use super::deleter::DeleterMessage; use super::{DeletionHeader, DeletionList, DeletionQueueError, FlushOp, VisibleLsnUpdates}; use crate::config::PageServerConf; -use crate::controller_upcall_client::{ControlPlaneGenerationsApi, RetryForeverError}; +use crate::controller_upcall_client::{RetryForeverError, StorageControllerUpcallApi}; use crate::metrics; use crate::virtual_file::MaybeFatalIo; @@ -46,7 +46,7 @@ pub(super) enum ValidatorQueueMessage { } pub(super) struct Validator where - C: ControlPlaneGenerationsApi, + C: StorageControllerUpcallApi, { conf: &'static PageServerConf, rx: tokio::sync::mpsc::Receiver, @@ -80,7 +80,7 @@ where impl Validator where - C: ControlPlaneGenerationsApi, + C: StorageControllerUpcallApi, { pub(super) fn new( conf: &'static PageServerConf, diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index e799efcce3..566086c527 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -669,6 +669,13 @@ paths: Detach a timeline from its ancestor and reparent all ancestors timelines with lower `ancestor_lsn`. Current implementation might not be retryable across failure cases, but will be enhanced in future. Detaching should be expected to be expensive operation. Timeouts should be retried. 
+ parameters: + - name: detach_behavior + in: query + required: false + schema: + description: Currently valid values are `v1`, `v2` + type: string responses: "200": description: | @@ -1079,6 +1086,7 @@ components: - last_record_lsn - disk_consistent_lsn - state + - min_readable_lsn properties: timeline_id: type: string @@ -1125,6 +1133,40 @@ components: applied_gc_cutoff_lsn: type: string format: hex + safekeepers: + $ref: "#/components/schemas/TimelineSafekeepersInfo" + + TimelineSafekeepersInfo: + type: object + required: + - tenant_id + - timeline_id + - generation + - safekeepers + properties: + tenant_id: + type: string + format: hex + timeline_id: + type: string + format: hex + generation: + type: integer + safekeepers: + type: array + items: + $ref: "#/components/schemas/TimelineSafekeeperInfo" + + TimelineSafekeeperInfo: + type: object + required: + - id + - hostname + properties: + id: + type: integer + hostname: + type: string SyntheticSizeResponse: type: object diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index e8a32ca1ef..cf67dc596a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -37,8 +37,8 @@ use pageserver_api::models::{ TenantShardSplitResponse, TenantSorting, TenantState, TenantWaitLsnRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineCreateRequestMode, TimelineCreateRequestModeImportPgdata, TimelineGcRequest, TimelineInfo, - TimelinePatchIndexPartRequest, TimelinesInfoAndOffloaded, TopTenantShardItem, - TopTenantShardsRequest, TopTenantShardsResponse, + TimelinePatchIndexPartRequest, TimelineVisibilityState, TimelinesInfoAndOffloaded, + TopTenantShardItem, TopTenantShardsRequest, TopTenantShardsResponse, }; use pageserver_api::shard::{ShardCount, TenantShardId}; use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError}; @@ -60,7 +60,7 @@ use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; use 
crate::deletion_queue::DeletionQueueClient; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; -use crate::tenant::config::{LocationConf, TenantConfOpt}; +use crate::tenant::config::LocationConf; use crate::tenant::mgr::{ GetActiveTenantError, GetTenantError, TenantManager, TenantMapError, TenantMapInsertError, TenantSlot, TenantSlotError, TenantSlotUpsertError, TenantStateError, UpsertLocationError, @@ -74,8 +74,8 @@ use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName}; use crate::tenant::timeline::offload::{OffloadError, offload_timeline}; use crate::tenant::timeline::{ - CompactFlags, CompactOptions, CompactRequest, CompactionError, Timeline, WaitLsnTimeout, - WaitLsnWaiter, import_pgdata, + CompactFlags, CompactOptions, CompactRequest, CompactionError, MarkInvisibleRequest, Timeline, + WaitLsnTimeout, WaitLsnWaiter, import_pgdata, }; use crate::tenant::{ GetTimelineError, LogicalSizeCalculationCause, OffloadedTimeline, PageReconstructError, @@ -439,11 +439,15 @@ async fn build_timeline_info_common( let remote_consistent_lsn_visible = timeline .get_remote_consistent_lsn_visible() .unwrap_or(Lsn(0)); + let is_invisible = timeline.remote_client.is_invisible().unwrap_or(false); let walreceiver_status = timeline.walreceiver_status(); let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats(); + // Externally, expose the lowest LSN that can be used to create a branch. + // Internally we distinguish between the planned GC cutoff (PITR point) and the "applied" GC cutoff (where we + // actually trimmed data to), which can pass each other when PITR is changed. 
let min_readable_lsn = std::cmp::max( timeline.get_gc_cutoff_lsn(), *timeline.get_applied_gc_cutoff_lsn(), @@ -460,7 +464,6 @@ async fn build_timeline_info_common( initdb_lsn, last_record_lsn, prev_record_lsn: Some(timeline.get_prev_record_lsn()), - _unused: Default::default(), // Unused, for legacy decode only min_readable_lsn, applied_gc_cutoff_lsn: *timeline.get_applied_gc_cutoff_lsn(), current_logical_size: current_logical_size.size_dont_care_about_accuracy(), @@ -482,6 +485,7 @@ async fn build_timeline_info_common( state, is_archived: Some(is_archived), rel_size_migration: Some(timeline.get_rel_size_v2_status()), + is_invisible: Some(is_invisible), walreceiver_status, }; @@ -1849,8 +1853,7 @@ async fn update_tenant_config_handler( let tenant_id = request_data.tenant_id; check_permission(&request, Some(tenant_id))?; - let new_tenant_conf = - TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?; + let new_tenant_conf = request_data.config; let state = get_state(&request); @@ -1899,7 +1902,10 @@ async fn patch_tenant_config_handler( tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; let updated = tenant - .update_tenant_config(|crnt| crnt.apply_patch(request_data.config.clone())) + .update_tenant_config(|crnt| { + crnt.apply_patch(request_data.config.clone()) + .map_err(anyhow::Error::new) + }) .map_err(ApiError::BadRequest)?; // This is a legacy API that only operates on attached tenants: the preferred @@ -2252,7 +2258,6 @@ async fn timeline_compact_handler( let state = get_state(&request); let mut flags = EnumSet::empty(); - flags |= CompactFlags::NoYield; // run compaction to completion if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? 
{ flags |= CompactFlags::ForceL0Compaction; @@ -2331,6 +2336,38 @@ async fn timeline_compact_handler( .await } +async fn timeline_mark_invisible_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let compact_request = json_request_maybe::>(&mut request).await?; + + let state = get_state(&request); + + let visibility = match compact_request { + Some(req) => match req.is_visible { + Some(true) => TimelineVisibilityState::Visible, + Some(false) | None => TimelineVisibilityState::Invisible, + }, + None => TimelineVisibilityState::Invisible, + }; + + async { + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + let timeline = tenant.get_timeline(timeline_id, true)?; + timeline.remote_client.schedule_index_upload_for_timeline_invisible_state(visibility).map_err(ApiError::InternalServerError)?; + json_response(StatusCode::OK, ()) + } + .instrument(info_span!("manual_timeline_mark_invisible", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) + .await +} + // Run offload immediately on given timeline. async fn timeline_offload_handler( request: Request, @@ -2391,7 +2428,6 @@ async fn timeline_checkpoint_handler( let state = get_state(&request); let mut flags = EnumSet::empty(); - flags |= CompactFlags::NoYield; // run compaction to completion if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? 
{ flags |= CompactFlags::ForceL0Compaction; } @@ -2661,11 +2697,12 @@ async fn getpage_at_lsn_handler_inner( let lsn: Option = parse_query_param(&request, "lsn")?; async { - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - // Enable read path debugging let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; - let ctx = RequestContextBuilder::extend(&ctx).read_path_debug(true) - .scope(context::Scope::new_timeline(&timeline)).build(); + let ctx = RequestContextBuilder::new(TaskKind::MgmtRequest) + .download_behavior(DownloadBehavior::Download) + .scope(context::Scope::new_timeline(&timeline)) + .read_path_debug(true) + .root(); // Use last_record_lsn if no lsn is provided let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn()); @@ -3152,7 +3189,8 @@ async fn list_aux_files( timeline.gate.enter().map_err(|_| ApiError::Cancelled)?, ); - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download) + .with_scope_timeline(&timeline); let files = timeline .list_aux_files(body.lsn, &ctx, io_concurrency) .await?; @@ -3396,14 +3434,15 @@ async fn put_tenant_timeline_import_wal( check_permission(&request, Some(tenant_id))?; - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); - let span = info_span!("import_wal", tenant_id=%tenant_id, timeline_id=%timeline_id, start_lsn=%start_lsn, end_lsn=%end_lsn); async move { let state = get_state(&request); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, TenantShardId::unsharded(tenant_id), timeline_id).await?; - let ctx = RequestContextBuilder::extend(&ctx).scope(context::Scope::new_timeline(&timeline)).build(); + let ctx = RequestContextBuilder::new(TaskKind::MgmtRequest) + .download_behavior(DownloadBehavior::Warn) + .scope(context::Scope::new_timeline(&timeline)) + .root(); let mut body = 
StreamReader::new(request.into_body().map(|res| { res.map_err(|error| { @@ -3748,6 +3787,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/offload", |r| testing_api_handler("attempt timeline offload", r, timeline_offload_handler), ) + .put( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/mark_invisible", + |r| api_handler( r, timeline_mark_invisible_handler), + ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint", |r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler), diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 8373d0bd87..bda218444d 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -55,6 +55,9 @@ pub const DEFAULT_PG_VERSION: u32 = 16; pub const IMAGE_FILE_MAGIC: u16 = 0x5A60; pub const DELTA_FILE_MAGIC: u16 = 0x5A61; +// Target used for performance traces. +pub const PERF_TRACE_TARGET: &str = "P"; + static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); pub use crate::metrics::preinitialize_metrics; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index f3307ed5a9..2e81123384 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,16 +1,14 @@ use std::collections::HashMap; use std::num::NonZeroUsize; use std::os::fd::RawFd; -use std::pin::Pin; use std::sync::atomic::AtomicU64; use std::sync::{Arc, Mutex}; -use std::task::{Context, Poll}; use std::time::{Duration, Instant}; use enum_map::{Enum as _, EnumMap}; use futures::Future; use metrics::{ - Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, + Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, @@ -23,13 +21,13 @@ use 
pageserver_api::config::{ }; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; -use pin_project_lite::pin_project; use postgres_backend::{QueryError, is_expected_io_error}; use pq_proto::framed::ConnectionError; use strum::{EnumCount, IntoEnumIterator as _, VariantNames}; use strum_macros::{IntoStaticStr, VariantNames}; use utils::id::TimelineId; +use crate::config; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext}; use crate::pgdatadir_mapping::DatadirModificationStats; @@ -499,14 +497,99 @@ pub(crate) static WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS: Lazy = Lazy::n .expect("failed to define a metric") }); -static FLUSH_WAIT_UPLOAD_TIME: Lazy = Lazy::new(|| { - register_gauge_vec!( - "pageserver_flush_wait_upload_seconds", - "Time spent waiting for preceding uploads during layer flush", - &["tenant_id", "shard_id", "timeline_id"] - ) - .expect("failed to define a metric") -}); +pub(crate) mod wait_ondemand_download_time { + use super::*; + const WAIT_ONDEMAND_DOWNLOAD_TIME_BUCKETS: &[f64] = &[ + 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, // 10 ms - 100ms + 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, // 100ms to 1s + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, // 1s to 10s + 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, // 10s to 1m + ]; + + /// The task kinds for which we want to track wait times for on-demand downloads. + /// Other task kinds' wait times are accumulated in label value `unknown`. + pub(crate) const WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS: [TaskKind; 2] = [ + TaskKind::PageRequestHandler, + TaskKind::WalReceiverConnectionHandler, + ]; + + pub(crate) static WAIT_ONDEMAND_DOWNLOAD_TIME_GLOBAL: Lazy> = Lazy::new(|| { + let histo = register_histogram_vec!( + "pageserver_wait_ondemand_download_seconds_global", + "Observations are individual tasks' wait times for on-demand downloads. 
\ + If N tasks coalesce on an on-demand download, and it takes 10s, than we observe N * 10s.", + &["task_kind"], + WAIT_ONDEMAND_DOWNLOAD_TIME_BUCKETS.into(), + ) + .expect("failed to define a metric"); + WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS + .iter() + .map(|task_kind| histo.with_label_values(&[task_kind.into()])) + .collect::>() + }); + + pub(crate) static WAIT_ONDEMAND_DOWNLOAD_TIME_SUM: Lazy = Lazy::new(|| { + register_counter_vec!( + // use a name that _could_ be evolved into a per-timeline histogram later + "pageserver_wait_ondemand_download_seconds_sum", + "Like `pageserver_wait_ondemand_download_seconds_global` but per timeline", + &["tenant_id", "shard_id", "timeline_id", "task_kind"], + ) + .unwrap() + }); + + pub struct WaitOndemandDownloadTimeSum { + counters: [Counter; WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS.len()], + } + + impl WaitOndemandDownloadTimeSum { + pub(crate) fn new(tenant_id: &str, shard_id: &str, timeline_id: &str) -> Self { + let counters = WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS + .iter() + .map(|task_kind| { + WAIT_ONDEMAND_DOWNLOAD_TIME_SUM + .get_metric_with_label_values(&[ + tenant_id, + shard_id, + timeline_id, + task_kind.into(), + ]) + .unwrap() + }) + .collect::>(); + Self { + counters: counters.try_into().unwrap(), + } + } + pub(crate) fn observe(&self, task_kind: TaskKind, duration: Duration) { + let maybe = WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS + .iter() + .enumerate() + .find(|(_, kind)| **kind == task_kind); + let Some((idx, _)) = maybe else { + return; + }; + WAIT_ONDEMAND_DOWNLOAD_TIME_GLOBAL[idx].observe(duration.as_secs_f64()); + let counter = &self.counters[idx]; + counter.inc_by(duration.as_secs_f64()); + } + } + + pub(crate) fn shutdown_timeline(tenant_id: &str, shard_id: &str, timeline_id: &str) { + for task_kind in WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS { + let _ = WAIT_ONDEMAND_DOWNLOAD_TIME_SUM.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + task_kind.into(), + ]); + } + } + + 
pub(crate) fn preinitialize_global_metrics() { + Lazy::force(&WAIT_ONDEMAND_DOWNLOAD_TIME_GLOBAL); + } +} static LAST_RECORD_LSN: Lazy = Lazy::new(|| { register_int_gauge_vec!( @@ -1259,13 +1342,13 @@ pub(crate) static STORAGE_IO_TIME_METRIC: Lazy = Lazy::new(Storag #[derive(Clone, Copy)] #[repr(usize)] -enum StorageIoSizeOperation { +pub(crate) enum StorageIoSizeOperation { Read, Write, } impl StorageIoSizeOperation { - const VARIANTS: &'static [&'static str] = &["read", "write"]; + pub(crate) const VARIANTS: &'static [&'static str] = &["read", "write"]; fn as_str(&self) -> &'static str { Self::VARIANTS[*self as usize] @@ -1273,7 +1356,7 @@ impl StorageIoSizeOperation { } // Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1 -static STORAGE_IO_SIZE: Lazy = Lazy::new(|| { +pub(crate) static STORAGE_IO_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_io_operations_bytes_total", "Total amount of bytes read/written in IO operations", @@ -2325,13 +2408,18 @@ impl RemoteOpFileKind { } } -pub(crate) static REMOTE_OPERATION_TIME: Lazy = Lazy::new(|| { +pub(crate) static REMOTE_TIMELINE_CLIENT_COMPLETION_LATENCY: Lazy = Lazy::new(|| { register_histogram_vec!( - "pageserver_remote_operation_seconds", - "Time spent on remote storage operations. \ - Grouped by tenant, timeline, operation_kind and status. \ + "pageserver_remote_timeline_client_seconds_global", + "Time spent on remote timeline client operations. \ + Grouped by task_kind, file_kind, operation_kind and status. \ + The task_kind is \ + - for layer downloads, populated from RequestContext (primary objective of having the label) \ + - for index downloads, set to 'unknown' \ + - for any upload operation, set to 'RemoteUploadTask' \ + This keeps dimensionality at bay. 
\ Does not account for time spent waiting in remote timeline client's queues.", - &["file_kind", "op_kind", "status"] + &["task_kind", "file_kind", "op_kind", "status"] ) .expect("failed to define a metric") }); @@ -2866,7 +2954,6 @@ pub(crate) struct TimelineMetrics { timeline_id: String, pub flush_time_histo: StorageTimeMetrics, pub flush_delay_histo: StorageTimeMetrics, - pub flush_wait_upload_time_gauge: Gauge, pub compact_time_histo: StorageTimeMetrics, pub create_images_time_histo: StorageTimeMetrics, pub logical_size_histo: StorageTimeMetrics, @@ -2894,6 +2981,7 @@ pub(crate) struct TimelineMetrics { pub storage_io_size: StorageIoSizeMetrics, pub wait_lsn_in_progress_micros: GlobalAndPerTenantIntCounter, pub wait_lsn_start_finish_counterpair: IntCounterPair, + pub wait_ondemand_download_time: wait_ondemand_download_time::WaitOndemandDownloadTimeSum, shutdown: std::sync::atomic::AtomicBool, } @@ -2918,9 +3006,6 @@ impl TimelineMetrics { &shard_id, &timeline_id, ); - let flush_wait_upload_time_gauge = FLUSH_WAIT_UPLOAD_TIME - .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) - .unwrap(); let compact_time_histo = StorageTimeMetrics::new( StorageTimeOperation::Compact, &tenant_id, @@ -3042,13 +3127,19 @@ impl TimelineMetrics { .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let wait_ondemand_download_time = + wait_ondemand_download_time::WaitOndemandDownloadTimeSum::new( + &tenant_id, + &shard_id, + &timeline_id, + ); + TimelineMetrics { tenant_id, shard_id, timeline_id, flush_time_histo, flush_delay_histo, - flush_wait_upload_time_gauge, compact_time_histo, create_images_time_histo, logical_size_histo, @@ -3076,6 +3167,7 @@ impl TimelineMetrics { wal_records_received, wait_lsn_in_progress_micros, wait_lsn_start_finish_counterpair, + wait_ondemand_download_time, shutdown: std::sync::atomic::AtomicBool::default(), } } @@ -3098,14 +3190,6 @@ impl TimelineMetrics { self.resident_physical_size_gauge.get() } - 
pub(crate) fn flush_wait_upload_time_gauge_add(&self, duration: f64) { - self.flush_wait_upload_time_gauge.add(duration); - crate::metrics::FLUSH_WAIT_UPLOAD_TIME - .get_metric_with_label_values(&[&self.tenant_id, &self.shard_id, &self.timeline_id]) - .unwrap() - .add(duration); - } - /// Generates TIMELINE_LAYER labels for a persistent layer. fn make_layer_labels(&self, layer_desc: &PersistentLayerDesc) -> [&str; 5] { let level = match LayerMap::is_l0(&layer_desc.key_range, layer_desc.is_delta()) { @@ -3209,7 +3293,6 @@ impl TimelineMetrics { let shard_id = &self.shard_id; let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); - let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]); { RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); @@ -3277,6 +3360,8 @@ impl TimelineMetrics { .remove_label_values(&mut res, &[tenant_id, shard_id, timeline_id]); } + wait_ondemand_download_time::shutdown_timeline(tenant_id, shard_id, timeline_id); + let _ = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE.remove_label_values(&[ SmgrQueryType::GetPageAtLsn.into(), tenant_id, @@ -3398,13 +3483,18 @@ impl RemoteTimelineClientMetrics { pub fn remote_operation_time( &self, + task_kind: Option, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, status: &'static str, ) -> Histogram { - let key = (file_kind.as_str(), op_kind.as_str(), status); - REMOTE_OPERATION_TIME - .get_metric_with_label_values(&[key.0, key.1, key.2]) + REMOTE_TIMELINE_CLIENT_COMPLETION_LATENCY + .get_metric_with_label_values(&[ + task_kind.as_ref().map(|tk| tk.into()).unwrap_or("unknown"), + file_kind.as_str(), + op_kind.as_str(), + status, + ]) .unwrap() } @@ -3649,54 +3739,26 @@ impl Drop for RemoteTimelineClientMetrics { /// Wrapper future that measures the time spent by a 
remote storage operation, /// and records the time and success/failure as a prometheus metric. -pub(crate) trait MeasureRemoteOp: Sized { - fn measure_remote_op( +pub(crate) trait MeasureRemoteOp: Sized + Future> { + async fn measure_remote_op( self, + task_kind: Option, // not all caller contexts have a RequestContext / TaskKind handy file_kind: RemoteOpFileKind, op: RemoteOpKind, metrics: Arc, - ) -> MeasuredRemoteOp { + ) -> Result { let start = Instant::now(); - MeasuredRemoteOp { - inner: self, - file_kind, - op, - start, - metrics, - } + let res = self.await; + let duration = start.elapsed(); + let status = if res.is_ok() { &"success" } else { &"failure" }; + metrics + .remote_operation_time(task_kind, &file_kind, &op, status) + .observe(duration.as_secs_f64()); + res } } -impl MeasureRemoteOp for T {} - -pin_project! { - pub(crate) struct MeasuredRemoteOp - { - #[pin] - inner: F, - file_kind: RemoteOpFileKind, - op: RemoteOpKind, - start: Instant, - metrics: Arc, - } -} - -impl>, O, E> Future for MeasuredRemoteOp { - type Output = Result; - - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { - let this = self.project(); - let poll_result = this.inner.poll(cx); - if let Poll::Ready(ref res) = poll_result { - let duration = this.start.elapsed(); - let status = if res.is_ok() { &"success" } else { &"failure" }; - this.metrics - .remote_operation_time(this.file_kind, this.op, status) - .observe(duration.as_secs_f64()); - } - poll_result - } -} +impl MeasureRemoteOp for Fut where Fut: Sized + Future> {} pub mod tokio_epoll_uring { use std::collections::HashMap; @@ -4132,9 +4194,33 @@ pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) { .set(u64::try_from(num_threads.get()).unwrap()); } -pub fn preinitialize_metrics(conf: &'static PageServerConf) { +static PAGESERVER_CONFIG_IGNORED_ITEMS: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_config_ignored_items", + "TOML items present in the on-disk configuration 
file but ignored by the pageserver config parser.\ + The `item` label is the dot-separated path of the ignored item in the on-disk configuration file.\ + The value for an unknown config item is always 1.\ + There is a special label value \"\", which is 0, so that there is always a metric exposed (simplifies dashboards).", + &["item"] + ) + .unwrap() +}); + +pub fn preinitialize_metrics( + conf: &'static PageServerConf, + ignored: config::ignored_fields::Paths, +) { set_page_service_config_max_batch_size(&conf.page_service_pipelining); + PAGESERVER_CONFIG_IGNORED_ITEMS + .with_label_values(&[""]) + .set(0); + for path in &ignored.paths { + PAGESERVER_CONFIG_IGNORED_ITEMS + .with_label_values(&[path]) + .set(1); + } + // Python tests need these and on some we do alerting. // // FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of @@ -4220,4 +4306,5 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { Lazy::force(&tokio_epoll_uring::THREAD_LOCAL_METRICS_STORAGE); tenant_throttling::preinitialize_global_metrics(); + wait_ondemand_download_time::preinitialize_global_metrics(); } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 94571cbaaa..7e3991dbdc 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -9,6 +9,7 @@ use std::sync::Arc; use std::time::{Duration, Instant, SystemTime}; use std::{io, str}; +use crate::PERF_TRACE_TARGET; use anyhow::{Context, bail}; use async_compression::tokio::write::GzipEncoder; use bytes::Buf; @@ -53,7 +54,9 @@ use utils::sync::spsc_fold; use crate::auth::check_permission; use crate::basebackup::BasebackupError; use crate::config::PageServerConf; -use crate::context::{DownloadBehavior, RequestContext}; +use crate::context::{ + DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder, +}; use crate::metrics::{ self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, LIVE_CONNECTIONS, SmgrOpTimer, 
TimelineMetrics, @@ -100,6 +103,7 @@ pub fn spawn( conf: &'static PageServerConf, tenant_manager: Arc, pg_auth: Option>, + perf_trace_dispatch: Option, tcp_listener: tokio::net::TcpListener, ) -> Listener { let cancel = CancellationToken::new(); @@ -117,6 +121,7 @@ pub fn spawn( conf, tenant_manager, pg_auth, + perf_trace_dispatch, tcp_listener, conf.pg_auth_type, conf.page_service_pipelining.clone(), @@ -173,6 +178,7 @@ pub async fn libpq_listener_main( conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, + perf_trace_dispatch: Option, listener: tokio::net::TcpListener, auth_type: AuthType, pipelining_config: PageServicePipeliningConfig, @@ -205,8 +211,12 @@ pub async fn libpq_listener_main( // Connection established. Spawn a new task to handle it. debug!("accepted connection from {}", peer_addr); let local_auth = auth.clone(); - let connection_ctx = listener_ctx - .detached_child(TaskKind::PageRequestHandler, DownloadBehavior::Download); + let connection_ctx = RequestContextBuilder::from(&listener_ctx) + .task_kind(TaskKind::PageRequestHandler) + .download_behavior(DownloadBehavior::Download) + .perf_span_dispatch(perf_trace_dispatch.clone()) + .detached_child(); + connection_handler_tasks.spawn(page_service_conn_main( conf, tenant_manager.clone(), @@ -237,7 +247,16 @@ pub async fn libpq_listener_main( type ConnectionHandlerResult = anyhow::Result<()>; -#[instrument(skip_all, fields(peer_addr, application_name))] +/// Perf root spans start at the per-request level, after shard routing. +/// This struct carries connection-level information to the root perf span definition. 
+#[derive(Clone)] +struct ConnectionPerfSpanFields { + peer_addr: String, + application_name: Option, + compute_mode: Option, +} + +#[instrument(skip_all, fields(peer_addr, application_name, compute_mode))] #[allow(clippy::too_many_arguments)] async fn page_service_conn_main( conf: &'static PageServerConf, @@ -261,6 +280,12 @@ async fn page_service_conn_main( let socket_fd = socket.as_raw_fd(); let peer_addr = socket.peer_addr().context("get peer address")?; + + let perf_span_fields = ConnectionPerfSpanFields { + peer_addr: peer_addr.to_string(), + application_name: None, // filled in later + compute_mode: None, // filled in later + }; tracing::Span::current().record("peer_addr", field::display(peer_addr)); // setup read timeout of 10 minutes. the timeout is rather arbitrary for requirements: @@ -304,6 +329,7 @@ async fn page_service_conn_main( tenant_manager, auth, pipelining_config, + perf_span_fields, connection_ctx, cancel.clone(), gate_guard, @@ -348,6 +374,8 @@ struct PageServerHandler { /// `process_query` creates a child context from this one. connection_ctx: RequestContext, + perf_span_fields: ConnectionPerfSpanFields, + cancel: CancellationToken, /// None only while pagestream protocol is being processed. 
@@ -607,6 +635,7 @@ impl std::fmt::Display for BatchedPageStreamError { struct BatchedGetPageRequest { req: PagestreamGetPageRequest, timer: SmgrOpTimer, + ctx: RequestContext, } #[cfg(feature = "testing")] @@ -692,11 +721,13 @@ impl BatchedFeMessage { } impl PageServerHandler { + #[allow(clippy::too_many_arguments)] pub fn new( conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, pipelining_config: PageServicePipeliningConfig, + perf_span_fields: ConnectionPerfSpanFields, connection_ctx: RequestContext, cancel: CancellationToken, gate_guard: GateGuard, @@ -706,6 +737,7 @@ impl PageServerHandler { auth, claims: None, connection_ctx, + perf_span_fields, timeline_handles: Some(TimelineHandles::new(tenant_manager)), cancel, pipelining_config, @@ -743,6 +775,7 @@ impl PageServerHandler { tenant_id: TenantId, timeline_id: TimelineId, timeline_handles: &mut TimelineHandles, + conn_perf_span_fields: &ConnectionPerfSpanFields, cancel: &CancellationToken, ctx: &RequestContext, protocol_version: PagestreamProtocolVersion, @@ -902,10 +935,12 @@ impl PageServerHandler { } let key = rel_block_to_key(req.rel, req.blkno); - let shard = match timeline_handles + + let res = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Page(key)) - .await - { + .await; + + let shard = match res { Ok(tl) => tl, Err(e) => { let span = mkspan!(before shard routing); @@ -932,6 +967,41 @@ impl PageServerHandler { } } }; + + let ctx = if shard.is_get_page_request_sampled() { + RequestContextBuilder::from(ctx) + .root_perf_span(|| { + info_span!( + target: PERF_TRACE_TARGET, + "GET_PAGE", + peer_addr = conn_perf_span_fields.peer_addr, + application_name = conn_perf_span_fields.application_name, + compute_mode = conn_perf_span_fields.compute_mode, + tenant_id = %tenant_id, + shard_id = %shard.get_shard_identity().shard_slug(), + timeline_id = %timeline_id, + lsn = %req.hdr.request_lsn, + request_id = %req.hdr.reqid, + key = %key, + ) + }) + .attached_child() + } else { + 
ctx.attached_child() + }; + + // This ctx travels as part of the BatchedFeMessage through + // batching into the request handler. + // The request handler needs to do some per-request work + // (relsize check) before dispatching the batch as a single + // get_vectored call to the Timeline. + // This ctx will be used for the reslize check, whereas the + // get_vectored call will be a different ctx with separate + // perf span. + let ctx = ctx.with_scope_page_service_pagestream(&shard); + + // Similar game for this `span`: we funnel it through so that + // request handler log messages contain the request-specific fields. let span = mkspan!(shard.tenant_shard_id.shard_slug()); let timer = record_op_start_and_throttle( @@ -939,19 +1009,34 @@ impl PageServerHandler { metrics::SmgrQueryType::GetPageAtLsn, received_at, ) + .maybe_perf_instrument(&ctx, |current_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: current_perf_span, + "THROTTLE", + ) + }) .await?; // We're holding the Handle - let effective_request_lsn = match Self::wait_or_get_last_lsn( + // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait + let res = Self::wait_or_get_last_lsn( &shard, req.hdr.request_lsn, req.hdr.not_modified_since, &shard.get_applied_gc_cutoff_lsn(), - ctx, + &ctx, ) - // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait - .await - { + .maybe_perf_instrument(&ctx, |current_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: current_perf_span, + "WAIT_LSN", + ) + }) + .await; + + let effective_request_lsn = match res { Ok(lsn) => lsn, Err(e) => { return respond_error!(span, e); @@ -961,7 +1046,7 @@ impl PageServerHandler { span, shard: shard.downgrade(), effective_request_lsn, - pages: smallvec::smallvec![BatchedGetPageRequest { req, timer }], + pages: smallvec::smallvec![BatchedGetPageRequest { req, timer, ctx }], } } #[cfg(feature = "testing")] @@ -1514,12 
+1599,14 @@ impl PageServerHandler { IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static, { let cancel = self.cancel.clone(); + let err = loop { let msg = Self::pagestream_read_message( &mut pgb_reader, tenant_id, timeline_id, &mut timeline_handles, + &self.perf_span_fields, &cancel, ctx, protocol_version, @@ -1653,6 +1740,8 @@ impl PageServerHandler { // Batcher // + let perf_span_fields = self.perf_span_fields.clone(); + let cancel_batcher = self.cancel.child_token(); let (mut batch_tx, mut batch_rx) = spsc_fold::channel(); let batcher = pipeline_stage!("batcher", cancel_batcher.clone(), move |cancel_batcher| { @@ -1666,6 +1755,7 @@ impl PageServerHandler { tenant_id, timeline_id, &mut timeline_handles, + &perf_span_fields, &cancel_batcher, &ctx, protocol_version, @@ -2004,7 +2094,9 @@ impl PageServerHandler { let results = timeline .get_rel_page_at_lsn_batched( - requests.iter().map(|p| (&p.req.rel, &p.req.blkno)), + requests + .iter() + .map(|p| (&p.req.rel, &p.req.blkno, p.ctx.attached_child())), effective_lsn, io_concurrency, ctx, @@ -2512,6 +2604,58 @@ impl PageServiceCmd { } } +/// Parse the startup options from the postgres wire protocol startup packet. +/// +/// It takes a sequence of `-c option=X` or `-coption=X`. It parses the options string +/// by best effort and returns all the options parsed (key-value pairs) and a bool indicating +/// whether all options are successfully parsed. There could be duplicates in the options +/// if the caller passed such parameters. 
+fn parse_options(options: &str) -> (Vec<(String, String)>, bool) { + let mut parsing_config = false; + let mut has_error = false; + let mut config = Vec::new(); + for item in options.split_whitespace() { + if item == "-c" { + if !parsing_config { + parsing_config = true; + } else { + // "-c" followed with another "-c" + tracing::warn!("failed to parse the startup options: {options}"); + has_error = true; + break; + } + } else if item.starts_with("-c") || parsing_config { + let Some((mut key, value)) = item.split_once('=') else { + // "-c" followed with an invalid option + tracing::warn!("failed to parse the startup options: {options}"); + has_error = true; + break; + }; + if !parsing_config { + // Parse "-coptions=X" + let Some(stripped_key) = key.strip_prefix("-c") else { + tracing::warn!("failed to parse the startup options: {options}"); + has_error = true; + break; + }; + key = stripped_key; + } + config.push((key.to_string(), value.to_string())); + parsing_config = false; + } else { + tracing::warn!("failed to parse the startup options: {options}"); + has_error = true; + break; + } + } + if parsing_config { + // "-c" without the option + tracing::warn!("failed to parse the startup options: {options}"); + has_error = true; + } + (config, has_error) +} + impl postgres_backend::Handler for PageServerHandler where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static, @@ -2554,8 +2698,18 @@ where if let FeStartupPacket::StartupMessage { params, .. 
} = sm { if let Some(app_name) = params.get("application_name") { + self.perf_span_fields.application_name = Some(app_name.to_string()); Span::current().record("application_name", field::display(app_name)); } + if let Some(options) = params.get("options") { + let (config, _) = parse_options(options); + for (key, value) in config { + if key == "neon.compute_mode" { + self.perf_span_fields.compute_mode = Some(value.clone()); + Span::current().record("compute_mode", field::display(value)); + } + } + } }; Ok(()) @@ -2669,6 +2823,7 @@ where PageServiceCmd::Set => { // important because psycopg2 executes "SET datestyle TO 'ISO'" // on connect + // TODO: allow setting options, i.e., application_name/compute_mode via SET commands pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } PageServiceCmd::LeaseLsn(LeaseLsnCmd { @@ -2943,4 +3098,46 @@ mod tests { let cmd = PageServiceCmd::parse(&format!("lease {tenant_id} {timeline_id} gzip 0/16ABCDE")); assert!(cmd.is_err()); } + + #[test] + fn test_parse_options() { + let (config, has_error) = parse_options(" -c neon.compute_mode=primary "); + assert!(!has_error); + assert_eq!( + config, + vec![("neon.compute_mode".to_string(), "primary".to_string())] + ); + + let (config, has_error) = parse_options(" -c neon.compute_mode=primary -c foo=bar "); + assert!(!has_error); + assert_eq!( + config, + vec![ + ("neon.compute_mode".to_string(), "primary".to_string()), + ("foo".to_string(), "bar".to_string()), + ] + ); + + let (config, has_error) = parse_options(" -c neon.compute_mode=primary -cfoo=bar"); + assert!(!has_error); + assert_eq!( + config, + vec![ + ("neon.compute_mode".to_string(), "primary".to_string()), + ("foo".to_string(), "bar".to_string()), + ] + ); + + let (_, has_error) = parse_options("-c"); + assert!(has_error); + + let (_, has_error) = parse_options("-c foo=bar -c -c"); + assert!(has_error); + + let (_, has_error) = parse_options(" "); + assert!(!has_error); + + let (_, has_error) = parse_options(" 
-c neon.compute_mode"); + assert!(has_error); + } } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 4685f9383b..e3e06ab91a 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -9,6 +9,7 @@ use std::collections::{BTreeMap, HashMap, HashSet, hash_map}; use std::ops::{ControlFlow, Range}; +use crate::PERF_TRACE_TARGET; use anyhow::{Context, ensure}; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; @@ -31,7 +32,7 @@ use postgres_ffi::{BLCKSZ, Oid, RepOriginId, TimestampTz, TransactionId}; use serde::{Deserialize, Serialize}; use strum::IntoEnumIterator; use tokio_util::sync::CancellationToken; -use tracing::{debug, info, trace, warn}; +use tracing::{debug, info, info_span, trace, warn}; use utils::bin_ser::{BeSer, DeserializeError}; use utils::lsn::Lsn; use utils::pausable_failpoint; @@ -39,7 +40,7 @@ use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; use super::tenant::{PageReconstructError, Timeline}; use crate::aux_file; -use crate::context::RequestContext; +use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder}; use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::metrics::{ RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD, @@ -209,7 +210,9 @@ impl Timeline { let pages: smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)]; let res = self .get_rel_page_at_lsn_batched( - pages.iter().map(|(tag, blknum)| (tag, blknum)), + pages + .iter() + .map(|(tag, blknum)| (tag, blknum, ctx.attached_child())), effective_lsn, io_concurrency.clone(), ctx, @@ -248,7 +251,7 @@ impl Timeline { /// The ordering of the returned vec corresponds to the ordering of `pages`. 
pub(crate) async fn get_rel_page_at_lsn_batched( &self, - pages: impl ExactSizeIterator, + pages: impl ExactSizeIterator, effective_lsn: Lsn, io_concurrency: IoConcurrency, ctx: &RequestContext, @@ -262,8 +265,11 @@ impl Timeline { let mut result = Vec::with_capacity(pages.len()); let result_slots = result.spare_capacity_mut(); - let mut keys_slots: BTreeMap> = BTreeMap::default(); - for (response_slot_idx, (tag, blknum)) in pages.enumerate() { + let mut keys_slots: BTreeMap> = + BTreeMap::default(); + + let mut perf_instrument = false; + for (response_slot_idx, (tag, blknum, ctx)) in pages.enumerate() { if tag.relnode == 0 { result_slots[response_slot_idx].write(Err(PageReconstructError::Other( RelationError::InvalidRelnode.into(), @@ -274,7 +280,16 @@ impl Timeline { } let nblocks = match self - .get_rel_size(*tag, Version::Lsn(effective_lsn), ctx) + .get_rel_size(*tag, Version::Lsn(effective_lsn), &ctx) + .maybe_perf_instrument(&ctx, |crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "GET_REL_SIZE", + reltag=%tag, + lsn=%effective_lsn, + ) + }) .await { Ok(nblocks) => nblocks, @@ -297,8 +312,12 @@ impl Timeline { let key = rel_block_to_key(*tag, *blknum); + if ctx.has_perf_span() { + perf_instrument = true; + } + let key_slots = keys_slots.entry(key).or_default(); - key_slots.push(response_slot_idx); + key_slots.push((response_slot_idx, ctx)); } let keyspace = { @@ -314,16 +333,34 @@ impl Timeline { acc.to_keyspace() }; - match self - .get_vectored(keyspace, effective_lsn, io_concurrency, ctx) - .await - { + let ctx = match perf_instrument { + true => RequestContextBuilder::from(ctx) + .root_perf_span(|| { + info_span!( + target: PERF_TRACE_TARGET, + "GET_VECTORED", + tenant_id = %self.tenant_shard_id.tenant_id, + timeline_id = %self.timeline_id, + lsn = %effective_lsn, + shard = %self.tenant_shard_id.shard_slug(), + ) + }) + .attached_child(), + false => ctx.attached_child(), + }; + + let res = self + 
.get_vectored(keyspace, effective_lsn, io_concurrency, &ctx) + .maybe_perf_instrument(&ctx, |current_perf_span| current_perf_span.clone()) + .await; + + match res { Ok(results) => { for (key, res) in results { let mut key_slots = keys_slots.remove(&key).unwrap().into_iter(); - let first_slot = key_slots.next().unwrap(); + let (first_slot, first_req_ctx) = key_slots.next().unwrap(); - for slot in key_slots { + for (slot, req_ctx) in key_slots { let clone = match &res { Ok(buf) => Ok(buf.clone()), Err(err) => Err(match err { @@ -341,17 +378,22 @@ impl Timeline { }; result_slots[slot].write(clone); + // There is no standardized way to express that the batched span followed from N request spans. + // So, abuse the system and mark the request contexts as follows_from the batch span, so we get + // some linkage in our trace viewer. It allows us to answer: which GET_VECTORED did this GET_PAGE wait for. + req_ctx.perf_follows_from(&ctx); slots_filled += 1; } result_slots[first_slot].write(res); + first_req_ctx.perf_follows_from(&ctx); slots_filled += 1; } } Err(err) => { // this cannot really happen because get_vectored only errors globally on invalid LSN or too large batch size // (We enforce the max batch size outside of this function, in the code that constructs the batch request.) 
- for slot in keys_slots.values().flatten() { + for (slot, req_ctx) in keys_slots.values().flatten() { // this whole `match` is a lot like `From for PageReconstructError` // but without taking ownership of the GetVectoredError let err = match &err { @@ -383,6 +425,7 @@ impl Timeline { } }; + req_ctx.perf_follows_from(&ctx); result_slots[*slot].write(err); } diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 0b71b2cf5b..d4873e60a1 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -38,6 +38,7 @@ use std::panic::AssertUnwindSafe; use std::str::FromStr; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; +use std::time::Duration; use futures::FutureExt; use once_cell::sync::Lazy; @@ -218,8 +219,7 @@ pageserver_runtime!(MGMT_REQUEST_RUNTIME, "mgmt request worker"); pageserver_runtime!(WALRECEIVER_RUNTIME, "walreceiver worker"); pageserver_runtime!(BACKGROUND_RUNTIME, "background op worker"); // Bump this number when adding a new pageserver_runtime! -// SAFETY: it's obviously correct -const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = unsafe { NonZeroUsize::new_unchecked(4) }; +const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = NonZeroUsize::new(4).unwrap(); #[derive(Debug, Clone, Copy)] pub struct PageserverTaskId(u64); @@ -584,18 +584,25 @@ pub async fn shutdown_tasks( // warn to catch these in tests; there shouldn't be any warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over"); } - if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle) + const INITIAL_COMPLAIN_TIMEOUT: Duration = Duration::from_secs(1); + const PERIODIC_COMPLAIN_TIMEOUT: Duration = Duration::from_secs(60); + if tokio::time::timeout(INITIAL_COMPLAIN_TIMEOUT, &mut join_handle) .await .is_err() { // allow some time to elapse before logging to cut down the number of log // lines. 
info!("waiting for task {} to shut down", task.name); - // we never handled this return value, but: - // - we don't deschedule which would lead to is_cancelled - // - panics are already logged (is_panicked) - // - task errors are already logged in the wrapper - let _ = join_handle.await; + loop { + tokio::select! { + // we never handled this return value, but: + // - we don't deschedule which would lead to is_cancelled + // - panics are already logged (is_panicked) + // - task errors are already logged in the wrapper + _ = &mut join_handle => break, + _ = tokio::time::sleep(PERIODIC_COMPLAIN_TIMEOUT) => info!("still waiting for task {} to shut down", task.name), + } + } info!("task {} completed", task.name); } } else { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 55b5704d67..441597d77f 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -67,7 +67,7 @@ use utils::try_rcu::ArcSwapExt; use utils::zstd::{create_zst_tarball, extract_zst_tarball}; use utils::{backoff, completion, failpoint_support, fs_ext, pausable_failpoint}; -use self::config::{AttachedLocationConfig, AttachmentMode, LocationConf, TenantConf}; +use self::config::{AttachedLocationConfig, AttachmentMode, LocationConf}; use self::metadata::TimelineMetadata; use self::mgr::{GetActiveTenantError, GetTenantError}; use self::remote_timeline_client::upload::{upload_index_part, upload_tenant_manifest}; @@ -88,7 +88,7 @@ use crate::metrics::{ TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics, }; use crate::task_mgr::TaskKind; -use crate::tenant::config::{LocationMode, TenantConfOpt}; +use crate::tenant::config::LocationMode; use crate::tenant::gc_result::GcResult; pub use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::remote_timeline_client::{ @@ -162,7 +162,7 @@ pub struct TenantSharedResources { /// in this struct. 
#[derive(Clone)] pub(super) struct AttachedTenantConf { - tenant_conf: TenantConfOpt, + tenant_conf: pageserver_api::models::TenantConfig, location: AttachedLocationConfig, /// The deadline before which we are blocked from GC so that /// leases have a chance to be renewed. @@ -170,7 +170,10 @@ pub(super) struct AttachedTenantConf { } impl AttachedTenantConf { - fn new(tenant_conf: TenantConfOpt, location: AttachedLocationConfig) -> Self { + fn new( + tenant_conf: pageserver_api::models::TenantConfig, + location: AttachedLocationConfig, + ) -> Self { // Sets a deadline before which we cannot proceed to GC due to lsn lease. // // We do this as the leases mapping are not persisted to disk. By delaying GC by lease @@ -251,7 +254,7 @@ pub struct Tenant { state: watch::Sender, // Overridden tenant-specific config parameters. - // We keep TenantConfOpt sturct here to preserve the information + // We keep pageserver_api::models::TenantConfig sturct here to preserve the information // about parameters that are not set. // This is necessary to allow global config updates. tenant_conf: Arc>, @@ -3077,6 +3080,7 @@ impl Tenant { let mut has_pending_l0 = false; for timeline in compact_l0 { let ctx = &ctx.with_scope_timeline(&timeline); + // NB: don't set CompactFlags::YieldForL0, since this is an L0-only compaction pass. let outcome = timeline .compact(cancel, CompactFlags::OnlyL0Compaction.into(), ctx) .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id)) @@ -3094,14 +3098,9 @@ impl Tenant { } } - // Pass 2: image compaction and timeline offloading. If any timelines have accumulated - // more L0 layers, they may also be compacted here. - // - // NB: image compaction may yield if there is pending L0 compaction. - // - // TODO: it will only yield if there is pending L0 compaction on the same timeline. If a - // different timeline needs compaction, it won't. It should check `l0_compaction_trigger`. - // We leave this for a later PR. 
+ // Pass 2: image compaction and timeline offloading. If any timelines have accumulated more + // L0 layers, they may also be compacted here. Image compaction will yield if there is + // pending L0 compaction on any tenant timeline. // // TODO: consider ordering timelines by some priority, e.g. time since last full compaction, // amount of L1 delta debt or garbage, offload-eligible timelines first, etc. @@ -3112,8 +3111,14 @@ impl Tenant { } let ctx = &ctx.with_scope_timeline(&timeline); + // Yield for L0 if the separate L0 pass is enabled (otherwise there's no point). + let mut flags = EnumSet::default(); + if self.get_compaction_l0_first() { + flags |= CompactFlags::YieldForL0; + } + let mut outcome = timeline - .compact(cancel, EnumSet::default(), ctx) + .compact(cancel, flags, ctx) .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id)) .await .inspect_err(|err| self.maybe_trip_compaction_breaker(err))?; @@ -3243,17 +3248,23 @@ impl Tenant { async fn housekeeping(&self) { // Call through to all timelines to freeze ephemeral layers as needed. This usually happens // during ingest, but we don't want idle timelines to hold open layers for too long. - let timelines = self - .timelines - .lock() - .unwrap() - .values() - .filter(|tli| tli.is_active()) - .cloned() - .collect_vec(); + // + // We don't do this if the tenant can't upload layers (i.e. it's in stale attachment mode). + // We don't run compaction in this case either, and don't want to keep flushing tiny L0 + // layers that won't be compacted down. + if self.tenant_conf.load().location.may_upload_layers_hint() { + let timelines = self + .timelines + .lock() + .unwrap() + .values() + .filter(|tli| tli.is_active()) + .cloned() + .collect_vec(); - for timeline in timelines { - timeline.maybe_freeze_ephemeral_layer().await; + for timeline in timelines { + timeline.maybe_freeze_ephemeral_layer().await; + } } // Shut down walredo if idle. 
@@ -3678,7 +3689,7 @@ impl Tenant { } } } - TenantState::Active { .. } => { + TenantState::Active => { return Ok(()); } TenantState::Broken { reason, .. } => { @@ -3702,17 +3713,14 @@ impl Tenant { /// create a Tenant in the same state. Do not use this in hot paths: it's for relatively /// rare external API calls, like a reconciliation at startup. pub(crate) fn get_location_conf(&self) -> models::LocationConfig { - let conf = self.tenant_conf.load(); + let attached_tenant_conf = self.tenant_conf.load(); - let location_config_mode = match conf.location.attach_mode { + let location_config_mode = match attached_tenant_conf.location.attach_mode { AttachmentMode::Single => models::LocationConfigMode::AttachedSingle, AttachmentMode::Multi => models::LocationConfigMode::AttachedMulti, AttachmentMode::Stale => models::LocationConfigMode::AttachedStale, }; - // We have a pageserver TenantConf, we need the API-facing TenantConfig. - let tenant_config: models::TenantConfig = conf.tenant_conf.clone().into(); - models::LocationConfig { mode: location_config_mode, generation: self.generation.into(), @@ -3720,7 +3728,7 @@ impl Tenant { shard_number: self.shard_identity.number.0, shard_count: self.shard_identity.count.literal(), shard_stripe_size: self.shard_identity.stripe_size.0, - tenant_conf: tenant_config, + tenant_conf: attached_tenant_conf.tenant_conf.clone(), } } @@ -3926,11 +3934,11 @@ enum ActivateTimelineArgs { } impl Tenant { - pub fn tenant_specific_overrides(&self) -> TenantConfOpt { + pub fn tenant_specific_overrides(&self) -> pageserver_api::models::TenantConfig { self.tenant_conf.load().tenant_conf.clone() } - pub fn effective_config(&self) -> TenantConf { + pub fn effective_config(&self) -> pageserver_api::config::TenantConfigToml { self.tenant_specific_overrides() .merge(self.conf.default_tenant_conf.clone()) } @@ -4072,10 +4080,14 @@ impl Tenant { } } - pub fn update_tenant_config anyhow::Result>( + pub fn update_tenant_config< + F: Fn( + 
pageserver_api::models::TenantConfig, + ) -> anyhow::Result, + >( &self, update: F, - ) -> anyhow::Result { + ) -> anyhow::Result { // Use read-copy-update in order to avoid overwriting the location config // state if this races with [`Tenant::set_new_location_config`]. Note that // this race is not possible if both request types come from the storage @@ -4122,7 +4134,7 @@ impl Tenant { fn get_pagestream_throttle_config( psconf: &'static PageServerConf, - overrides: &TenantConfOpt, + overrides: &pageserver_api::models::TenantConfig, ) -> throttle::Config { overrides .timeline_get_throttle @@ -4130,7 +4142,7 @@ impl Tenant { .unwrap_or(psconf.default_tenant_conf.timeline_get_throttle.clone()) } - pub(crate) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) { + pub(crate) fn tenant_conf_updated(&self, new_conf: &pageserver_api::models::TenantConfig) { let conf = Self::get_pagestream_throttle_config(self.conf, new_conf); self.pagestream_throttle.reconfigure(conf) } @@ -4193,9 +4205,9 @@ impl Tenant { self.cancel.child_token(), ); - let timeline_ctx = RequestContextBuilder::extend(ctx) + let timeline_ctx = RequestContextBuilder::from(ctx) .scope(context::Scope::new_timeline(&timeline)) - .build(); + .detached_child(); Ok((timeline, timeline_ctx)) } @@ -5091,14 +5103,17 @@ impl Tenant { fs::remove_dir_all(&pgdata_path).with_context(|| { format!("Failed to remove already existing initdb directory: {pgdata_path}") })?; + tracing::info!("removed previous attempt's temporary initdb directory '{pgdata_path}'"); } // this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it let pgdata_path_deferred = pgdata_path.clone(); scopeguard::defer! 
{ - if let Err(e) = fs::remove_dir_all(&pgdata_path_deferred) { + if let Err(e) = fs::remove_dir_all(&pgdata_path_deferred).or_else(fs_ext::ignore_not_found) { // this is unlikely, but we will remove the directory on pageserver restart or another bootstrap call error!("Failed to remove temporary initdb directory '{pgdata_path_deferred}': {e}"); + } else { + tracing::info!("removed temporary initdb directory '{pgdata_path_deferred}'"); } } if let Some(existing_initdb_timeline_id) = load_existing_initdb { @@ -5492,7 +5507,7 @@ impl Tenant { Ok(()) } - pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt { + pub(crate) fn get_tenant_conf(&self) -> pageserver_api::models::TenantConfig { self.tenant_conf.load().tenant_conf.clone() } @@ -5682,59 +5697,9 @@ pub(crate) mod harness { buf.freeze() } - impl From for TenantConfOpt { - fn from(tenant_conf: TenantConf) -> Self { - Self { - checkpoint_distance: Some(tenant_conf.checkpoint_distance), - checkpoint_timeout: Some(tenant_conf.checkpoint_timeout), - compaction_target_size: Some(tenant_conf.compaction_target_size), - compaction_period: Some(tenant_conf.compaction_period), - compaction_threshold: Some(tenant_conf.compaction_threshold), - compaction_upper_limit: Some(tenant_conf.compaction_upper_limit), - compaction_algorithm: Some(tenant_conf.compaction_algorithm), - compaction_l0_first: Some(tenant_conf.compaction_l0_first), - compaction_l0_semaphore: Some(tenant_conf.compaction_l0_semaphore), - l0_flush_delay_threshold: tenant_conf.l0_flush_delay_threshold, - l0_flush_stall_threshold: tenant_conf.l0_flush_stall_threshold, - l0_flush_wait_upload: Some(tenant_conf.l0_flush_wait_upload), - gc_horizon: Some(tenant_conf.gc_horizon), - gc_period: Some(tenant_conf.gc_period), - image_creation_threshold: Some(tenant_conf.image_creation_threshold), - pitr_interval: Some(tenant_conf.pitr_interval), - walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout), - lagging_wal_timeout: 
Some(tenant_conf.lagging_wal_timeout), - max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag), - eviction_policy: Some(tenant_conf.eviction_policy), - min_resident_size_override: tenant_conf.min_resident_size_override, - evictions_low_residence_duration_metric_threshold: Some( - tenant_conf.evictions_low_residence_duration_metric_threshold, - ), - heatmap_period: Some(tenant_conf.heatmap_period), - lazy_slru_download: Some(tenant_conf.lazy_slru_download), - timeline_get_throttle: Some(tenant_conf.timeline_get_throttle), - image_layer_creation_check_threshold: Some( - tenant_conf.image_layer_creation_check_threshold, - ), - image_creation_preempt_threshold: Some( - tenant_conf.image_creation_preempt_threshold, - ), - lsn_lease_length: Some(tenant_conf.lsn_lease_length), - lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts), - timeline_offloading: Some(tenant_conf.timeline_offloading), - wal_receiver_protocol_override: tenant_conf.wal_receiver_protocol_override, - rel_size_v2_enabled: Some(tenant_conf.rel_size_v2_enabled), - gc_compaction_enabled: Some(tenant_conf.gc_compaction_enabled), - gc_compaction_initial_threshold_kb: Some( - tenant_conf.gc_compaction_initial_threshold_kb, - ), - gc_compaction_ratio_percent: Some(tenant_conf.gc_compaction_ratio_percent), - } - } - } - pub struct TenantHarness { pub conf: &'static PageServerConf, - pub tenant_conf: TenantConf, + pub tenant_conf: pageserver_api::models::TenantConfig, pub tenant_shard_id: TenantShardId, pub generation: Generation, pub shard: ShardIndex, @@ -5761,7 +5726,7 @@ pub(crate) mod harness { impl TenantHarness { pub async fn create_custom( test_name: &'static str, - tenant_conf: TenantConf, + tenant_conf: pageserver_api::models::TenantConfig, tenant_id: TenantId, shard_identity: ShardIdentity, generation: Generation, @@ -5814,10 +5779,10 @@ pub(crate) mod harness { pub async fn create(test_name: &'static str) -> anyhow::Result { // Disable automatic GC and compaction to make the unit tests 
more deterministic. // The tests perform them manually if needed. - let tenant_conf = TenantConf { - gc_period: Duration::ZERO, - compaction_period: Duration::ZERO, - ..TenantConf::default() + let tenant_conf = pageserver_api::models::TenantConfig { + gc_period: Some(Duration::ZERO), + compaction_period: Some(Duration::ZERO), + ..Default::default() }; let tenant_id = TenantId::generate(); let shard = ShardIdentity::unsharded(); @@ -5857,7 +5822,7 @@ pub(crate) mod harness { TenantState::Attaching, self.conf, AttachedTenantConf::try_from(LocationConf::attached_single( - TenantConfOpt::from(self.tenant_conf.clone()), + self.tenant_conf.clone(), self.generation, &ShardParameters::default(), )) @@ -6559,11 +6524,7 @@ mod tests { tline.freeze_and_flush().await?; tline - .compact( - &CancellationToken::new(), - CompactFlags::NoYield.into(), - &ctx, - ) + .compact(&CancellationToken::new(), EnumSet::default(), &ctx) .await?; let mut writer = tline.writer().await; @@ -6580,11 +6541,7 @@ mod tests { tline.freeze_and_flush().await?; tline - .compact( - &CancellationToken::new(), - CompactFlags::NoYield.into(), - &ctx, - ) + .compact(&CancellationToken::new(), EnumSet::default(), &ctx) .await?; let mut writer = tline.writer().await; @@ -6601,11 +6558,7 @@ mod tests { tline.freeze_and_flush().await?; tline - .compact( - &CancellationToken::new(), - CompactFlags::NoYield.into(), - &ctx, - ) + .compact(&CancellationToken::new(), EnumSet::default(), &ctx) .await?; let mut writer = tline.writer().await; @@ -6622,11 +6575,7 @@ mod tests { tline.freeze_and_flush().await?; tline - .compact( - &CancellationToken::new(), - CompactFlags::NoYield.into(), - &ctx, - ) + .compact(&CancellationToken::new(), EnumSet::default(), &ctx) .await?; assert_eq!( @@ -6709,9 +6658,7 @@ mod tests { timeline.freeze_and_flush().await?; if compact { // this requires timeline to be &Arc - timeline - .compact(&cancel, CompactFlags::NoYield.into(), ctx) - .await?; + timeline.compact(&cancel, 
EnumSet::default(), ctx).await?; } // this doesn't really need to use the timeline_id target, but it is closer to what it @@ -6941,14 +6888,14 @@ mod tests { // ``` #[tokio::test] async fn test_get_vectored_key_gap() -> anyhow::Result<()> { - let tenant_conf = TenantConf { + let tenant_conf = pageserver_api::models::TenantConfig { // Make compaction deterministic - gc_period: Duration::ZERO, - compaction_period: Duration::ZERO, + gc_period: Some(Duration::ZERO), + compaction_period: Some(Duration::ZERO), // Encourage creation of L1 layers - checkpoint_distance: 16 * 1024, - compaction_target_size: 8 * 1024, - ..TenantConf::default() + checkpoint_distance: Some(16 * 1024), + compaction_target_size: Some(8 * 1024), + ..Default::default() }; let harness = TenantHarness::create_custom( @@ -7038,7 +6985,6 @@ mod tests { child_timeline.freeze_and_flush().await?; let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceRepartition); - flags.insert(CompactFlags::NoYield); child_timeline .compact(&CancellationToken::new(), flags, &ctx) .await?; @@ -7254,9 +7200,9 @@ mod tests { compaction_algorithm: CompactionAlgorithm, ) -> anyhow::Result<()> { let mut harness = TenantHarness::create(name).await?; - harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings { + harness.tenant_conf.compaction_algorithm = Some(CompactionAlgorithmSettings { kind: compaction_algorithm, - }; + }); let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) @@ -7417,9 +7363,7 @@ mod tests { // Perform a cycle of flush, compact, and GC tline.freeze_and_flush().await?; - tline - .compact(&cancel, CompactFlags::NoYield.into(), &ctx) - .await?; + tline.compact(&cancel, EnumSet::default(), &ctx).await?; tenant .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) .await?; @@ -7623,9 +7567,9 @@ mod tests { compaction_algorithm: CompactionAlgorithm, ) -> anyhow::Result<()> { let mut harness = 
TenantHarness::create(name).await?; - harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings { + harness.tenant_conf.compaction_algorithm = Some(CompactionAlgorithmSettings { kind: compaction_algorithm, - }; + }); let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) @@ -7748,7 +7692,6 @@ mod tests { let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceImageLayerCreation); flags.insert(CompactFlags::ForceRepartition); - flags.insert(CompactFlags::NoYield); flags } else { EnumSet::empty() @@ -7799,9 +7742,7 @@ mod tests { let before_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len(); - tline - .compact(&cancel, CompactFlags::NoYield.into(), &ctx) - .await?; + tline.compact(&cancel, EnumSet::default(), &ctx).await?; let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len(); @@ -7966,7 +7907,6 @@ mod tests { let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceImageLayerCreation); flags.insert(CompactFlags::ForceRepartition); - flags.insert(CompactFlags::NoYield); flags }, &ctx, @@ -8429,7 +8369,6 @@ mod tests { let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceImageLayerCreation); flags.insert(CompactFlags::ForceRepartition); - flags.insert(CompactFlags::NoYield); flags }, &ctx, @@ -8497,7 +8436,6 @@ mod tests { let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceImageLayerCreation); flags.insert(CompactFlags::ForceRepartition); - flags.insert(CompactFlags::NoYield); flags }, &ctx, @@ -11594,4 +11532,255 @@ mod tests { Ok(()) } + + #[cfg(feature = "testing")] + #[tokio::test] + async fn test_synthetic_size_calculation_with_invisible_branches() -> anyhow::Result<()> { + use pageserver_api::models::TimelineVisibilityState; + + use crate::tenant::size::gather_inputs; + + let tenant_conf = pageserver_api::models::TenantConfig { + // Ensure that we don't compute 
gc_cutoffs (which needs reading the layer files) + pitr_interval: Some(Duration::ZERO), + ..Default::default() + }; + let harness = TenantHarness::create_custom( + "test_synthetic_size_calculation_with_invisible_branches", + tenant_conf, + TenantId::generate(), + ShardIdentity::unsharded(), + Generation::new(0xdeadbeef), + ) + .await?; + let (tenant, ctx) = harness.load().await; + let main_tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![], + vec![], + vec![], + Lsn(0x100), + ) + .await?; + + let snapshot1 = TimelineId::from_array(hex!("11223344556677881122334455667790")); + tenant + .branch_timeline_test_with_layers( + &main_tline, + snapshot1, + Some(Lsn(0x20)), + &ctx, + vec![], + vec![], + Lsn(0x50), + ) + .await?; + let snapshot2 = TimelineId::from_array(hex!("11223344556677881122334455667791")); + tenant + .branch_timeline_test_with_layers( + &main_tline, + snapshot2, + Some(Lsn(0x30)), + &ctx, + vec![], + vec![], + Lsn(0x50), + ) + .await?; + let snapshot3 = TimelineId::from_array(hex!("11223344556677881122334455667792")); + tenant + .branch_timeline_test_with_layers( + &main_tline, + snapshot3, + Some(Lsn(0x40)), + &ctx, + vec![], + vec![], + Lsn(0x50), + ) + .await?; + let limit = Arc::new(Semaphore::new(1)); + let max_retention_period = None; + let mut logical_size_cache = HashMap::new(); + let cause = LogicalSizeCalculationCause::EvictionTaskImitation; + let cancel = CancellationToken::new(); + + let inputs = gather_inputs( + &tenant, + &limit, + max_retention_period, + &mut logical_size_cache, + cause, + &cancel, + &ctx, + ) + .instrument(info_span!( + "gather_inputs", + tenant_id = "unknown", + shard_id = "unknown", + )) + .await?; + use crate::tenant::size::{LsnKind, ModelInputs, SegmentMeta}; + use LsnKind::*; + use tenant_size_model::Segment; + let ModelInputs { mut segments, .. 
} = inputs; + segments.retain(|s| s.timeline_id == TIMELINE_ID); + for segment in segments.iter_mut() { + segment.segment.parent = None; // We don't care about the parent for the test + segment.segment.size = None; // We don't care about the size for the test + } + assert_eq!( + segments, + [ + SegmentMeta { + segment: Segment { + parent: None, + lsn: 0x10, + size: None, + needed: false, + }, + timeline_id: TIMELINE_ID, + kind: BranchStart, + }, + SegmentMeta { + segment: Segment { + parent: None, + lsn: 0x20, + size: None, + needed: false, + }, + timeline_id: TIMELINE_ID, + kind: BranchPoint, + }, + SegmentMeta { + segment: Segment { + parent: None, + lsn: 0x30, + size: None, + needed: false, + }, + timeline_id: TIMELINE_ID, + kind: BranchPoint, + }, + SegmentMeta { + segment: Segment { + parent: None, + lsn: 0x40, + size: None, + needed: false, + }, + timeline_id: TIMELINE_ID, + kind: BranchPoint, + }, + SegmentMeta { + segment: Segment { + parent: None, + lsn: 0x100, + size: None, + needed: false, + }, + timeline_id: TIMELINE_ID, + kind: GcCutOff, + }, // we need to retain everything above the last branch point + SegmentMeta { + segment: Segment { + parent: None, + lsn: 0x100, + size: None, + needed: true, + }, + timeline_id: TIMELINE_ID, + kind: BranchEnd, + }, + ] + ); + + main_tline + .remote_client + .schedule_index_upload_for_timeline_invisible_state( + TimelineVisibilityState::Invisible, + )?; + main_tline.remote_client.wait_completion().await?; + let inputs = gather_inputs( + &tenant, + &limit, + max_retention_period, + &mut logical_size_cache, + cause, + &cancel, + &ctx, + ) + .instrument(info_span!( + "gather_inputs", + tenant_id = "unknown", + shard_id = "unknown", + )) + .await?; + let ModelInputs { mut segments, .. 
} = inputs; + segments.retain(|s| s.timeline_id == TIMELINE_ID); + for segment in segments.iter_mut() { + segment.segment.parent = None; // We don't care about the parent for the test + segment.segment.size = None; // We don't care about the size for the test + } + assert_eq!( + segments, + [ + SegmentMeta { + segment: Segment { + parent: None, + lsn: 0x10, + size: None, + needed: false, + }, + timeline_id: TIMELINE_ID, + kind: BranchStart, + }, + SegmentMeta { + segment: Segment { + parent: None, + lsn: 0x20, + size: None, + needed: false, + }, + timeline_id: TIMELINE_ID, + kind: BranchPoint, + }, + SegmentMeta { + segment: Segment { + parent: None, + lsn: 0x30, + size: None, + needed: false, + }, + timeline_id: TIMELINE_ID, + kind: BranchPoint, + }, + SegmentMeta { + segment: Segment { + parent: None, + lsn: 0x40, + size: None, + needed: false, + }, + timeline_id: TIMELINE_ID, + kind: BranchPoint, + }, + SegmentMeta { + segment: Segment { + parent: None, + lsn: 0x40, // Branch end LSN == last branch point LSN + size: None, + needed: true, + }, + timeline_id: TIMELINE_ID, + kind: BranchEnd, + }, + ] + ); + Ok(()) + } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 4308db84e5..bf82fc8df8 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -8,19 +8,11 @@ //! We cannot use global or default config instead, because wrong settings //! may lead to a data loss. //! 
-use std::num::NonZeroU64; -use std::time::Duration; -pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf; -use pageserver_api::models::{ - self, CompactionAlgorithmSettings, EvictionPolicy, TenantConfigPatch, -}; +use pageserver_api::models; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; -use serde::de::IntoDeserializer; use serde::{Deserialize, Serialize}; -use serde_json::Value; use utils::generation::Generation; -use utils::postgres_client::PostgresClientProtocol; #[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)] pub(crate) enum AttachmentMode { @@ -74,7 +66,7 @@ pub(crate) struct LocationConf { pub(crate) shard: ShardIdentity, /// The pan-cluster tenant configuration, the same on all locations - pub(crate) tenant_conf: TenantConfOpt, + pub(crate) tenant_conf: pageserver_api::models::TenantConfig, } impl std::fmt::Debug for LocationConf { @@ -140,7 +132,7 @@ impl LocationConf { /// implies it is in AttachmentMode::Single, which used to be the only /// possible state. This function should eventually be removed. 
pub(crate) fn attached_single( - tenant_conf: TenantConfOpt, + tenant_conf: pageserver_api::models::TenantConfig, generation: Generation, shard_params: &models::ShardParameters, ) -> Self { @@ -174,7 +166,7 @@ impl LocationConf { } pub(crate) fn try_from(conf: &'_ models::LocationConfig) -> anyhow::Result { - let tenant_conf = TenantConfOpt::try_from(&conf.tenant_conf)?; + let tenant_conf = conf.tenant_conf.clone(); fn get_generation(conf: &'_ models::LocationConfig) -> Result { conf.generation @@ -250,509 +242,19 @@ impl Default for LocationConf { generation: Generation::none(), attach_mode: AttachmentMode::Single, }), - tenant_conf: TenantConfOpt::default(), + tenant_conf: pageserver_api::models::TenantConfig::default(), shard: ShardIdentity::unsharded(), } } } -/// Same as TenantConf, but this struct preserves the information about -/// which parameters are set and which are not. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)] -pub struct TenantConfOpt { - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub checkpoint_distance: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(with = "humantime_serde")] - #[serde(default)] - pub checkpoint_timeout: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub compaction_target_size: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(with = "humantime_serde")] - #[serde(default)] - pub compaction_period: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub compaction_threshold: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub compaction_upper_limit: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub compaction_algorithm: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub compaction_l0_first: Option, - - #[serde(skip_serializing_if = "Option::is_none")] 
- #[serde(default)] - pub compaction_l0_semaphore: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub l0_flush_delay_threshold: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub l0_flush_stall_threshold: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub l0_flush_wait_upload: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub gc_horizon: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(with = "humantime_serde")] - #[serde(default)] - pub gc_period: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub image_creation_threshold: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(with = "humantime_serde")] - #[serde(default)] - pub pitr_interval: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(with = "humantime_serde")] - #[serde(default)] - pub walreceiver_connect_timeout: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(with = "humantime_serde")] - #[serde(default)] - pub lagging_wal_timeout: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub max_lsn_wal_lag: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub eviction_policy: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub min_resident_size_override: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(with = "humantime_serde")] - #[serde(default)] - pub evictions_low_residence_duration_metric_threshold: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(with = "humantime_serde")] - #[serde(default)] - pub heatmap_period: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub lazy_slru_download: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - pub 
timeline_get_throttle: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - pub image_layer_creation_check_threshold: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - pub image_creation_preempt_threshold: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(with = "humantime_serde")] - #[serde(default)] - pub lsn_lease_length: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(with = "humantime_serde")] - #[serde(default)] - pub lsn_lease_length_for_ts: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub timeline_offloading: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - pub wal_receiver_protocol_override: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - pub rel_size_v2_enabled: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - pub gc_compaction_enabled: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - pub gc_compaction_initial_threshold_kb: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - pub gc_compaction_ratio_percent: Option, -} - -impl TenantConfOpt { - pub fn merge(&self, global_conf: TenantConf) -> TenantConf { - TenantConf { - checkpoint_distance: self - .checkpoint_distance - .unwrap_or(global_conf.checkpoint_distance), - checkpoint_timeout: self - .checkpoint_timeout - .unwrap_or(global_conf.checkpoint_timeout), - compaction_target_size: self - .compaction_target_size - .unwrap_or(global_conf.compaction_target_size), - compaction_period: self - .compaction_period - .unwrap_or(global_conf.compaction_period), - compaction_threshold: self - .compaction_threshold - .unwrap_or(global_conf.compaction_threshold), - compaction_upper_limit: self - .compaction_upper_limit - .unwrap_or(global_conf.compaction_upper_limit), - compaction_algorithm: self - .compaction_algorithm - .as_ref() - .unwrap_or(&global_conf.compaction_algorithm) - .clone(), - compaction_l0_first: self - 
.compaction_l0_first - .unwrap_or(global_conf.compaction_l0_first), - compaction_l0_semaphore: self - .compaction_l0_semaphore - .unwrap_or(global_conf.compaction_l0_semaphore), - l0_flush_delay_threshold: self - .l0_flush_delay_threshold - .or(global_conf.l0_flush_delay_threshold), - l0_flush_stall_threshold: self - .l0_flush_stall_threshold - .or(global_conf.l0_flush_stall_threshold), - l0_flush_wait_upload: self - .l0_flush_wait_upload - .unwrap_or(global_conf.l0_flush_wait_upload), - gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon), - gc_period: self.gc_period.unwrap_or(global_conf.gc_period), - image_creation_threshold: self - .image_creation_threshold - .unwrap_or(global_conf.image_creation_threshold), - pitr_interval: self.pitr_interval.unwrap_or(global_conf.pitr_interval), - walreceiver_connect_timeout: self - .walreceiver_connect_timeout - .unwrap_or(global_conf.walreceiver_connect_timeout), - lagging_wal_timeout: self - .lagging_wal_timeout - .unwrap_or(global_conf.lagging_wal_timeout), - max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag), - eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy), - min_resident_size_override: self - .min_resident_size_override - .or(global_conf.min_resident_size_override), - evictions_low_residence_duration_metric_threshold: self - .evictions_low_residence_duration_metric_threshold - .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold), - heatmap_period: self.heatmap_period.unwrap_or(global_conf.heatmap_period), - lazy_slru_download: self - .lazy_slru_download - .unwrap_or(global_conf.lazy_slru_download), - timeline_get_throttle: self - .timeline_get_throttle - .clone() - .unwrap_or(global_conf.timeline_get_throttle), - image_layer_creation_check_threshold: self - .image_layer_creation_check_threshold - .unwrap_or(global_conf.image_layer_creation_check_threshold), - image_creation_preempt_threshold: self - .image_creation_preempt_threshold - 
.unwrap_or(global_conf.image_creation_preempt_threshold), - lsn_lease_length: self - .lsn_lease_length - .unwrap_or(global_conf.lsn_lease_length), - lsn_lease_length_for_ts: self - .lsn_lease_length_for_ts - .unwrap_or(global_conf.lsn_lease_length_for_ts), - timeline_offloading: self - .timeline_offloading - .unwrap_or(global_conf.timeline_offloading), - wal_receiver_protocol_override: self - .wal_receiver_protocol_override - .or(global_conf.wal_receiver_protocol_override), - rel_size_v2_enabled: self - .rel_size_v2_enabled - .unwrap_or(global_conf.rel_size_v2_enabled), - gc_compaction_enabled: self - .gc_compaction_enabled - .unwrap_or(global_conf.gc_compaction_enabled), - gc_compaction_initial_threshold_kb: self - .gc_compaction_initial_threshold_kb - .unwrap_or(global_conf.gc_compaction_initial_threshold_kb), - gc_compaction_ratio_percent: self - .gc_compaction_ratio_percent - .unwrap_or(global_conf.gc_compaction_ratio_percent), - } - } - - pub fn apply_patch(self, patch: TenantConfigPatch) -> anyhow::Result { - let Self { - mut checkpoint_distance, - mut checkpoint_timeout, - mut compaction_target_size, - mut compaction_period, - mut compaction_threshold, - mut compaction_upper_limit, - mut compaction_algorithm, - mut compaction_l0_first, - mut compaction_l0_semaphore, - mut l0_flush_delay_threshold, - mut l0_flush_stall_threshold, - mut l0_flush_wait_upload, - mut gc_horizon, - mut gc_period, - mut image_creation_threshold, - mut pitr_interval, - mut walreceiver_connect_timeout, - mut lagging_wal_timeout, - mut max_lsn_wal_lag, - mut eviction_policy, - mut min_resident_size_override, - mut evictions_low_residence_duration_metric_threshold, - mut heatmap_period, - mut lazy_slru_download, - mut timeline_get_throttle, - mut image_layer_creation_check_threshold, - mut image_creation_preempt_threshold, - mut lsn_lease_length, - mut lsn_lease_length_for_ts, - mut timeline_offloading, - mut wal_receiver_protocol_override, - mut rel_size_v2_enabled, - mut 
gc_compaction_enabled, - mut gc_compaction_initial_threshold_kb, - mut gc_compaction_ratio_percent, - } = self; - - patch.checkpoint_distance.apply(&mut checkpoint_distance); - patch - .checkpoint_timeout - .map(|v| humantime::parse_duration(&v))? - .apply(&mut checkpoint_timeout); - patch - .compaction_target_size - .apply(&mut compaction_target_size); - patch - .compaction_period - .map(|v| humantime::parse_duration(&v))? - .apply(&mut compaction_period); - patch.compaction_threshold.apply(&mut compaction_threshold); - patch - .compaction_upper_limit - .apply(&mut compaction_upper_limit); - patch.compaction_algorithm.apply(&mut compaction_algorithm); - patch.compaction_l0_first.apply(&mut compaction_l0_first); - patch - .compaction_l0_semaphore - .apply(&mut compaction_l0_semaphore); - patch - .l0_flush_delay_threshold - .apply(&mut l0_flush_delay_threshold); - patch - .l0_flush_stall_threshold - .apply(&mut l0_flush_stall_threshold); - patch.l0_flush_wait_upload.apply(&mut l0_flush_wait_upload); - patch.gc_horizon.apply(&mut gc_horizon); - patch - .gc_period - .map(|v| humantime::parse_duration(&v))? - .apply(&mut gc_period); - patch - .image_creation_threshold - .apply(&mut image_creation_threshold); - patch - .pitr_interval - .map(|v| humantime::parse_duration(&v))? - .apply(&mut pitr_interval); - patch - .walreceiver_connect_timeout - .map(|v| humantime::parse_duration(&v))? - .apply(&mut walreceiver_connect_timeout); - patch - .lagging_wal_timeout - .map(|v| humantime::parse_duration(&v))? - .apply(&mut lagging_wal_timeout); - patch.max_lsn_wal_lag.apply(&mut max_lsn_wal_lag); - patch.eviction_policy.apply(&mut eviction_policy); - patch - .min_resident_size_override - .apply(&mut min_resident_size_override); - patch - .evictions_low_residence_duration_metric_threshold - .map(|v| humantime::parse_duration(&v))? - .apply(&mut evictions_low_residence_duration_metric_threshold); - patch - .heatmap_period - .map(|v| humantime::parse_duration(&v))? 
- .apply(&mut heatmap_period); - patch.lazy_slru_download.apply(&mut lazy_slru_download); - patch - .timeline_get_throttle - .apply(&mut timeline_get_throttle); - patch - .image_layer_creation_check_threshold - .apply(&mut image_layer_creation_check_threshold); - patch - .image_creation_preempt_threshold - .apply(&mut image_creation_preempt_threshold); - patch - .lsn_lease_length - .map(|v| humantime::parse_duration(&v))? - .apply(&mut lsn_lease_length); - patch - .lsn_lease_length_for_ts - .map(|v| humantime::parse_duration(&v))? - .apply(&mut lsn_lease_length_for_ts); - patch.timeline_offloading.apply(&mut timeline_offloading); - patch - .wal_receiver_protocol_override - .apply(&mut wal_receiver_protocol_override); - patch.rel_size_v2_enabled.apply(&mut rel_size_v2_enabled); - patch - .gc_compaction_enabled - .apply(&mut gc_compaction_enabled); - patch - .gc_compaction_initial_threshold_kb - .apply(&mut gc_compaction_initial_threshold_kb); - patch - .gc_compaction_ratio_percent - .apply(&mut gc_compaction_ratio_percent); - - Ok(Self { - checkpoint_distance, - checkpoint_timeout, - compaction_target_size, - compaction_period, - compaction_threshold, - compaction_upper_limit, - compaction_algorithm, - compaction_l0_first, - compaction_l0_semaphore, - l0_flush_delay_threshold, - l0_flush_stall_threshold, - l0_flush_wait_upload, - gc_horizon, - gc_period, - image_creation_threshold, - pitr_interval, - walreceiver_connect_timeout, - lagging_wal_timeout, - max_lsn_wal_lag, - eviction_policy, - min_resident_size_override, - evictions_low_residence_duration_metric_threshold, - heatmap_period, - lazy_slru_download, - timeline_get_throttle, - image_layer_creation_check_threshold, - image_creation_preempt_threshold, - lsn_lease_length, - lsn_lease_length_for_ts, - timeline_offloading, - wal_receiver_protocol_override, - rel_size_v2_enabled, - gc_compaction_enabled, - gc_compaction_initial_threshold_kb, - gc_compaction_ratio_percent, - }) - } -} - -impl TryFrom<&'_ 
models::TenantConfig> for TenantConfOpt { - type Error = anyhow::Error; - - fn try_from(request_data: &'_ models::TenantConfig) -> Result { - // Convert the request_data to a JSON Value - let json_value: Value = serde_json::to_value(request_data)?; - - // Create a Deserializer from the JSON Value - let deserializer = json_value.into_deserializer(); - - // Use serde_path_to_error to deserialize the JSON Value into TenantConfOpt - let tenant_conf: TenantConfOpt = serde_path_to_error::deserialize(deserializer)?; - - Ok(tenant_conf) - } -} - -/// This is a conversion from our internal tenant config object to the one used -/// in external APIs. -impl From for models::TenantConfig { - // TODO(vlad): These are now the same, but they have different serialization logic. - // Can we merge them? - fn from(value: TenantConfOpt) -> Self { - Self { - checkpoint_distance: value.checkpoint_distance, - checkpoint_timeout: value.checkpoint_timeout, - compaction_algorithm: value.compaction_algorithm, - compaction_target_size: value.compaction_target_size, - compaction_period: value.compaction_period, - compaction_threshold: value.compaction_threshold, - compaction_upper_limit: value.compaction_upper_limit, - compaction_l0_first: value.compaction_l0_first, - compaction_l0_semaphore: value.compaction_l0_semaphore, - l0_flush_delay_threshold: value.l0_flush_delay_threshold, - l0_flush_stall_threshold: value.l0_flush_stall_threshold, - l0_flush_wait_upload: value.l0_flush_wait_upload, - gc_horizon: value.gc_horizon, - gc_period: value.gc_period, - image_creation_threshold: value.image_creation_threshold, - pitr_interval: value.pitr_interval, - walreceiver_connect_timeout: value.walreceiver_connect_timeout, - lagging_wal_timeout: value.lagging_wal_timeout, - max_lsn_wal_lag: value.max_lsn_wal_lag, - eviction_policy: value.eviction_policy, - min_resident_size_override: value.min_resident_size_override, - evictions_low_residence_duration_metric_threshold: value - 
.evictions_low_residence_duration_metric_threshold, - heatmap_period: value.heatmap_period, - lazy_slru_download: value.lazy_slru_download, - timeline_get_throttle: value.timeline_get_throttle, - image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, - image_creation_preempt_threshold: value.image_creation_preempt_threshold, - lsn_lease_length: value.lsn_lease_length, - lsn_lease_length_for_ts: value.lsn_lease_length_for_ts, - timeline_offloading: value.timeline_offloading, - wal_receiver_protocol_override: value.wal_receiver_protocol_override, - rel_size_v2_enabled: value.rel_size_v2_enabled, - gc_compaction_enabled: value.gc_compaction_enabled, - gc_compaction_initial_threshold_kb: value.gc_compaction_initial_threshold_kb, - gc_compaction_ratio_percent: value.gc_compaction_ratio_percent, - } - } -} - #[cfg(test)] mod tests { - use models::TenantConfig; - - use super::*; - #[test] - fn de_serializing_pageserver_config_omits_empty_values() { - let small_conf = TenantConfOpt { + fn serde_roundtrip_tenant_conf_opt() { + let small_conf = pageserver_api::models::TenantConfig { gc_horizon: Some(42), - ..TenantConfOpt::default() + ..Default::default() }; let toml_form = toml_edit::ser::to_string(&small_conf).unwrap(); @@ -763,19 +265,4 @@ mod tests { assert_eq!(json_form, "{\"gc_horizon\":42}"); assert_eq!(small_conf, serde_json::from_str(&json_form).unwrap()); } - - #[test] - fn test_try_from_models_tenant_config_success() { - let tenant_config = models::TenantConfig { - lagging_wal_timeout: Some(Duration::from_secs(5)), - ..TenantConfig::default() - }; - - let tenant_conf_opt = TenantConfOpt::try_from(&tenant_config).unwrap(); - - assert_eq!( - tenant_conf_opt.lagging_wal_timeout, - Some(Duration::from_secs(5)) - ); - } } diff --git a/pageserver/src/tenant/layer_map/layer_coverage.rs b/pageserver/src/tenant/layer_map/layer_coverage.rs index cf0085c071..a42ac92973 100644 --- a/pageserver/src/tenant/layer_map/layer_coverage.rs +++ 
b/pageserver/src/tenant/layer_map/layer_coverage.rs @@ -53,7 +53,7 @@ impl LayerCoverage { /// /// Complexity: O(log N) fn add_node(&mut self, key: i128) { - let value = match self.nodes.range(..=key).last() { + let value = match self.nodes.range(..=key).next_back() { Some((_, Some(v))) => Some(v.clone()), Some((_, None)) => None, None => None, diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index f02247950f..ac81b8e3d7 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -40,7 +40,7 @@ use super::{GlobalShutDown, TenantSharedResources}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::controller_upcall_client::{ - ControlPlaneGenerationsApi, ControllerUpcallClient, RetryForeverError, + RetryForeverError, StorageControllerUpcallApi, StorageControllerUpcallClient, }; use crate::deletion_queue::DeletionQueueClient; use crate::http::routes::ACTIVE_TENANT_TIMEOUT; @@ -58,7 +58,7 @@ use crate::{InitializationOrder, TEMP_FILE_SUFFIX}; /// For a tenant that appears in TenantsMap, it may either be /// - `Attached`: has a full Tenant object, is elegible to service -/// reads and ingest WAL. +/// reads and ingest WAL. /// - `Secondary`: is only keeping a local cache warm. /// /// Secondary is a totally distinct state rather than being a mode of a `Tenant`, because @@ -344,7 +344,7 @@ async fn init_load_generations( "Emergency mode! Tenants will be attached unsafely using their last known generation" ); emergency_generations(tenant_confs) - } else if let Some(client) = ControllerUpcallClient::new(conf, cancel) { + } else if let Some(client) = StorageControllerUpcallClient::new(conf, cancel)? { info!("Calling {} API to re-attach tenants", client.base_url()); // If we are configured to use the control plane API, then it is the source of truth for what tenants to load. 
match client.re_attach(conf).await { diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 891760b499..10a13ef1a2 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -194,7 +194,7 @@ pub(crate) use download::{ }; use index::GcCompactionState; pub(crate) use index::LayerFileMetadata; -use pageserver_api::models::{RelSizeMigration, TimelineArchivalState}; +use pageserver_api::models::{RelSizeMigration, TimelineArchivalState, TimelineVisibilityState}; use pageserver_api::shard::{ShardIndex, TenantShardId}; use regex::Regex; use remote_storage::{ @@ -573,6 +573,16 @@ impl RemoteTimelineClient { .ok() } + /// Returns true if the timeline is invisible in synthetic size calculations. + pub(crate) fn is_invisible(&self) -> Option { + self.upload_queue + .lock() + .unwrap() + .initialized_mut() + .map(|q| q.clean.0.marked_invisible_at.is_some()) + .ok() + } + /// Returns `Ok(Some(timestamp))` if the timeline has been archived, `Ok(None)` if the timeline hasn't been archived. /// /// Return Err(_) if the remote index_part hasn't been downloaded yet, or the timeline hasn't been stopped yet. 
@@ -632,6 +642,7 @@ impl RemoteTimelineClient { cancel, ) .measure_remote_op( + Option::::None, RemoteOpFileKind::Index, RemoteOpKind::Download, Arc::clone(&self.metrics), @@ -729,6 +740,7 @@ impl RemoteTimelineClient { ctx, ) .measure_remote_op( + Some(ctx.task_kind()), RemoteOpFileKind::Layer, RemoteOpKind::Download, Arc::clone(&self.metrics), @@ -845,6 +857,37 @@ impl RemoteTimelineClient { Ok(need_wait) } + pub(crate) fn schedule_index_upload_for_timeline_invisible_state( + self: &Arc, + state: TimelineVisibilityState, + ) -> anyhow::Result<()> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + fn need_change( + marked_invisible_at: &Option, + state: TimelineVisibilityState, + ) -> Option { + match (marked_invisible_at, state) { + (Some(_), TimelineVisibilityState::Invisible) => Some(false), + (None, TimelineVisibilityState::Invisible) => Some(true), + (Some(_), TimelineVisibilityState::Visible) => Some(false), + (None, TimelineVisibilityState::Visible) => Some(true), + } + } + + let need_upload_scheduled = need_change(&upload_queue.dirty.marked_invisible_at, state); + + if let Some(marked_invisible_at_set) = need_upload_scheduled { + let intended_marked_invisible_at = + marked_invisible_at_set.then(|| Utc::now().naive_utc()); + upload_queue.dirty.marked_invisible_at = intended_marked_invisible_at; + self.schedule_index_upload(upload_queue); + } + + Ok(()) + } + /// Shuts the timeline client down, but only if the timeline is archived. /// /// This function and [`Self::schedule_index_upload_for_timeline_archival_state`] use the @@ -1927,9 +1970,7 @@ impl RemoteTimelineClient { /// Pick next tasks from the queue, and start as many of them as possible without violating /// the ordering constraints. /// - /// TODO: consider limiting the number of in-progress tasks, beyond what remote_storage does. - /// This can launch an unbounded number of queued tasks. 
`UploadQueue::next_ready()` also has - /// worst-case quadratic cost in the number of tasks, and may struggle beyond 10,000 tasks. + /// The number of inprogress tasks is limited by `Self::inprogress_tasks`, see `next_ready`. fn launch_queued_tasks(self: &Arc, upload_queue: &mut UploadQueueInitialized) { while let Some((mut next_op, coalesced_ops)) = upload_queue.next_ready() { debug!("starting op: {next_op}"); @@ -2136,6 +2177,7 @@ impl RemoteTimelineClient { &self.cancel, ) .measure_remote_op( + Some(TaskKind::RemoteUploadTask), RemoteOpFileKind::Layer, RemoteOpKind::Upload, Arc::clone(&self.metrics), @@ -2152,6 +2194,7 @@ impl RemoteTimelineClient { &self.cancel, ) .measure_remote_op( + Some(TaskKind::RemoteUploadTask), RemoteOpFileKind::Index, RemoteOpKind::Upload, Arc::clone(&self.metrics), @@ -2177,6 +2220,11 @@ impl RemoteTimelineClient { } res } + // TODO: this should wait for the deletion to be executed by the deletion queue. + // Otherwise, the deletion may race with an upload and wrongfully delete a newer + // file. Some of the above logic attempts to work around this, it should be replaced + // by the upload queue ordering guarantees (see `can_bypass`). See: + // . UploadOp::Delete(delete) => { if self.config.read().unwrap().block_deletions { let mut queue_locked = self.upload_queue.lock().unwrap(); diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 16c38be907..a5cd8989aa 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -110,6 +110,10 @@ pub struct IndexPart { /// just the specific use case here; it needs a new name. #[serde(skip_serializing_if = "Option::is_none", default)] pub(crate) gc_compaction: Option, + + /// The timestamp when the timeline was marked invisible in synthetic size calculations. 
+ #[serde(skip_serializing_if = "Option::is_none", default)] + pub(crate) marked_invisible_at: Option, } #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] @@ -126,7 +130,7 @@ impl IndexPart { /// Version history /// - 2: added `deleted_at` /// - 3: no longer deserialize `timeline_layers` (serialized format is the same, but timeline_layers - /// is always generated from the keys of `layer_metadata`) + /// is always generated from the keys of `layer_metadata`) /// - 4: timeline_layers is fully removed. /// - 5: lineage was added /// - 6: last_aux_file_policy is added. @@ -137,10 +141,11 @@ impl IndexPart { /// - 11: +rel_size_migration /// - 12: +l2_lsn /// - 13: +gc_compaction - const LATEST_VERSION: usize = 13; + /// - 14: +marked_invisible_at + const LATEST_VERSION: usize = 14; // Versions we may see when reading from a bucket. - pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]; pub const FILE_NAME: &'static str = "index_part.json"; @@ -159,6 +164,7 @@ impl IndexPart { rel_size_migration: None, l2_lsn: None, gc_compaction: None, + marked_invisible_at: None, } } @@ -468,6 +474,7 @@ mod tests { rel_size_migration: None, l2_lsn: None, gc_compaction: None, + marked_invisible_at: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -516,6 +523,7 @@ mod tests { rel_size_migration: None, l2_lsn: None, gc_compaction: None, + marked_invisible_at: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -565,6 +573,7 @@ mod tests { rel_size_migration: None, l2_lsn: None, gc_compaction: None, + marked_invisible_at: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -617,6 +626,7 @@ mod tests { rel_size_migration: None, l2_lsn: None, gc_compaction: None, + marked_invisible_at: None, }; let empty_layers_parsed = 
IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap(); @@ -664,6 +674,7 @@ mod tests { rel_size_migration: None, l2_lsn: None, gc_compaction: None, + marked_invisible_at: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -714,6 +725,7 @@ mod tests { rel_size_migration: None, l2_lsn: None, gc_compaction: None, + marked_invisible_at: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -769,6 +781,7 @@ mod tests { rel_size_migration: None, l2_lsn: None, gc_compaction: None, + marked_invisible_at: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -829,6 +842,7 @@ mod tests { rel_size_migration: None, l2_lsn: None, gc_compaction: None, + marked_invisible_at: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -890,6 +904,7 @@ mod tests { rel_size_migration: None, l2_lsn: None, gc_compaction: None, + marked_invisible_at: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -956,6 +971,7 @@ mod tests { rel_size_migration: None, l2_lsn: None, gc_compaction: None, + marked_invisible_at: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -1035,6 +1051,7 @@ mod tests { rel_size_migration: None, l2_lsn: None, gc_compaction: None, + marked_invisible_at: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -1115,6 +1132,7 @@ mod tests { rel_size_migration: Some(RelSizeMigration::Legacy), l2_lsn: None, gc_compaction: None, + marked_invisible_at: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -1124,7 +1142,7 @@ mod tests { #[test] fn v12_v13_l2_gc_ompaction_is_parsed() { let example = r#"{ - "version": 12, + "version": 13, "layer_metadata":{ "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, 
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } @@ -1160,7 +1178,7 @@ mod tests { }"#; let expected = IndexPart { - version: 12, + version: 13, layer_metadata: HashMap::from([ ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { file_size: 25600000, @@ -1201,6 +1219,95 @@ mod tests { gc_compaction: Some(GcCompactionState { last_completed_lsn: "0/16960E8".parse::().unwrap(), }), + marked_invisible_at: None, + }; + + let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + #[test] + fn v14_marked_invisible_at_is_parsed() { + let example = r#"{ + "version": 14, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata": { + "disk_consistent_lsn": "0/16960E8", + "prev_record_lsn": "0/1696070", + "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/1696070", + "initdb_lsn": "0/1696070", + "pg_version": 14 + }, + "gc_blocking": { + "started_at": "2024-07-19T09:00:00.123", + "reasons": ["DetachAncestor"] + }, + "import_pgdata": { + "V1": { + "Done": { + "idempotency_key": "specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5", + "started_at": "2024-11-13T09:23:42.123", + "finished_at": "2024-11-13T09:42:23.123" + } + } + }, + "rel_size_migration": "legacy", + "l2_lsn": "0/16960E8", + "gc_compaction": { + "last_completed_lsn": "0/16960E8" + }, + "marked_invisible_at": "2023-07-31T09:00:00.123" + }"#; + + let expected = IndexPart { + version: 14, + layer_metadata: 
HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { + file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::new( + Lsn::from_str("0/16960E8").unwrap(), + Some(Lsn::from_str("0/1696070").unwrap()), + Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), + Lsn::INVALID, + Lsn::from_str("0/1696070").unwrap(), + Lsn::from_str("0/1696070").unwrap(), + 14, + ).with_recalculated_checksum().unwrap(), + deleted_at: None, + lineage: Default::default(), + gc_blocking: Some(GcBlocking { + started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"), + reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]), + }), + last_aux_file_policy: Default::default(), + archived_at: None, + import_pgdata: Some(import_pgdata::index_part_format::Root::V1(import_pgdata::index_part_format::V1::Done(import_pgdata::index_part_format::Done{ + started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"), + finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"), + idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()), + }))), + rel_size_migration: Some(RelSizeMigration::Legacy), + l2_lsn: Some("0/16960E8".parse::().unwrap()), + gc_compaction: Some(GcCompactionState { + last_completed_lsn: "0/16960E8".parse::().unwrap(), + }), + marked_invisible_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), }; let part = 
IndexPart::from_json_bytes(example.as_bytes()).unwrap(); diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 8f8622c796..2fa0ed9be9 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -20,7 +20,7 @@ use utils::sync::gate::Gate; use self::downloader::{SecondaryDetail, downloader_task}; use self::heatmap_uploader::heatmap_uploader_task; use super::GetTenantError; -use super::config::{SecondaryLocationConfig, TenantConfOpt}; +use super::config::SecondaryLocationConfig; use super::mgr::TenantManager; use super::span::debug_assert_current_span_has_tenant_id; use super::storage_layer::LayerName; @@ -98,11 +98,11 @@ pub(crate) struct SecondaryTenant { pub(crate) gate: Gate, - // Secondary mode does not need the full shard identity or the TenantConfOpt. However, + // Secondary mode does not need the full shard identity or the pageserver_api::models::TenantConfig. However, // storing these enables us to report our full LocationConf, enabling convenient reconciliation // by the control plane (see [`Self::get_location_conf`]) shard_identity: ShardIdentity, - tenant_conf: std::sync::Mutex, + tenant_conf: std::sync::Mutex, // Internal state used by the Downloader. detail: std::sync::Mutex, @@ -121,7 +121,7 @@ impl SecondaryTenant { pub(crate) fn new( tenant_shard_id: TenantShardId, shard_identity: ShardIdentity, - tenant_conf: TenantConfOpt, + tenant_conf: pageserver_api::models::TenantConfig, config: &SecondaryLocationConfig, ) -> Arc { let tenant_id = tenant_shard_id.tenant_id.to_string(); @@ -167,17 +167,24 @@ impl SecondaryTenant { self.validate_metrics(); + // Metrics are subtracted from and/or removed eagerly. + // Deletions are done in the background via [`BackgroundPurges::spawn`]. 
let tenant_id = self.tenant_shard_id.tenant_id.to_string(); let shard_id = format!("{}", self.tenant_shard_id.shard_slug()); let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]); let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]); + + self.detail + .lock() + .unwrap() + .drain_timelines(&self.tenant_shard_id, &self.resident_size_metric); } pub(crate) fn set_config(&self, config: &SecondaryLocationConfig) { self.detail.lock().unwrap().config = config.clone(); } - pub(crate) fn set_tenant_conf(&self, config: &TenantConfOpt) { + pub(crate) fn set_tenant_conf(&self, config: &pageserver_api::models::TenantConfig) { *(self.tenant_conf.lock().unwrap()) = config.clone(); } @@ -197,7 +204,7 @@ impl SecondaryTenant { shard_number: self.tenant_shard_id.shard_number.0, shard_count: self.tenant_shard_id.shard_count.literal(), shard_stripe_size: self.shard_identity.stripe_size.0, - tenant_conf: tenant_conf.into(), + tenant_conf, } } diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 992f94c0e1..f653dbe84c 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -4,6 +4,7 @@ use std::str::FromStr; use std::sync::Arc; use std::time::{Duration, Instant, SystemTime}; +use crate::metrics::{STORAGE_IO_SIZE, StorageIoSizeOperation}; use camino::Utf8PathBuf; use chrono::format::{DelayedFormat, StrftimeItems}; use futures::Future; @@ -124,15 +125,53 @@ impl OnDiskState { } } -#[derive(Debug, Clone, Default)] pub(super) struct SecondaryDetailTimeline { on_disk_layers: HashMap, /// We remember when layers were evicted, to prevent re-downloading them. pub(super) evicted_at: HashMap, + + ctx: RequestContext, +} + +impl Clone for SecondaryDetailTimeline { + fn clone(&self) -> Self { + Self { + on_disk_layers: self.on_disk_layers.clone(), + evicted_at: self.evicted_at.clone(), + // This is a bit awkward. 
The downloader code operates on a snapshot + // of the secondary list to avoid locking it for extended periods of time. + // No particularly strong reason to chose [`RequestContext::detached_child`], + // but makes more sense than [`RequestContext::attached_child`]. + ctx: self + .ctx + .detached_child(self.ctx.task_kind(), self.ctx.download_behavior()), + } + } +} + +impl std::fmt::Debug for SecondaryDetailTimeline { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SecondaryDetailTimeline") + .field("on_disk_layers", &self.on_disk_layers) + .field("evicted_at", &self.evicted_at) + .finish() + } } impl SecondaryDetailTimeline { + pub(super) fn empty(ctx: RequestContext) -> Self { + SecondaryDetailTimeline { + on_disk_layers: Default::default(), + evicted_at: Default::default(), + ctx, + } + } + + pub(super) fn context(&self) -> &RequestContext { + &self.ctx + } + pub(super) fn remove_layer( &mut self, name: &LayerName, @@ -258,18 +297,50 @@ impl SecondaryDetail { pub(super) fn remove_timeline( &mut self, + tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, resident_metric: &UIntGauge, ) { let removed = self.timelines.remove(timeline_id); if let Some(removed) = removed { - resident_metric.sub( - removed - .on_disk_layers - .values() - .map(|l| l.metadata.file_size) - .sum(), - ); + Self::clear_timeline_metrics(tenant_shard_id, timeline_id, removed, resident_metric); + } + } + + pub(super) fn drain_timelines( + &mut self, + tenant_shard_id: &TenantShardId, + resident_metric: &UIntGauge, + ) { + for (timeline_id, removed) in self.timelines.drain() { + Self::clear_timeline_metrics(tenant_shard_id, &timeline_id, removed, resident_metric); + } + } + + fn clear_timeline_metrics( + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + detail: SecondaryDetailTimeline, + resident_metric: &UIntGauge, + ) { + resident_metric.sub( + detail + .on_disk_layers + .values() + .map(|l| l.metadata.file_size) + .sum(), + ); + + 
let shard_id = format!("{}", tenant_shard_id.shard_slug()); + let tenant_id = tenant_shard_id.tenant_id.to_string(); + let timeline_id = timeline_id.to_string(); + for op in StorageIoSizeOperation::VARIANTS { + let _ = STORAGE_IO_SIZE.remove_label_values(&[ + op, + tenant_id.as_str(), + shard_id.as_str(), + timeline_id.as_str(), + ]); } } @@ -727,6 +798,7 @@ impl<'a> TenantDownloader<'a> { last_heatmap, timeline, &self.secondary_state.resident_size_metric, + ctx, ) .await; @@ -774,7 +846,6 @@ impl<'a> TenantDownloader<'a> { // Download the layers in the heatmap for timeline in heatmap.timelines { - let ctx = &ctx.with_scope_secondary_timeline(tenant_shard_id, &timeline.timeline_id); let timeline_state = timeline_states .remove(&timeline.timeline_id) .expect("Just populated above"); @@ -917,7 +988,11 @@ impl<'a> TenantDownloader<'a> { for delete_timeline in &delete_timelines { // We haven't removed from disk yet, but optimistically remove from in-memory state: if removal // from disk fails that will be a fatal error. 
- detail.remove_timeline(delete_timeline, &self.secondary_state.resident_size_metric); + detail.remove_timeline( + self.secondary_state.get_tenant_shard_id(), + delete_timeline, + &self.secondary_state.resident_size_metric, + ); } } @@ -1013,7 +1088,6 @@ impl<'a> TenantDownloader<'a> { timeline: HeatMapTimeline, timeline_state: SecondaryDetailTimeline, deadline: Instant, - ctx: &RequestContext, ) -> (Result<(), UpdateError>, Vec) { // Accumulate updates to the state let mut touched = Vec::new(); @@ -1044,7 +1118,12 @@ impl<'a> TenantDownloader<'a> { } match self - .download_layer(tenant_shard_id, &timeline_id, layer, ctx) + .download_layer( + tenant_shard_id, + &timeline_id, + layer, + timeline_state.context(), + ) .await { Ok(Some(layer)) => touched.push(layer), @@ -1155,13 +1234,16 @@ impl<'a> TenantDownloader<'a> { tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.hot_layers().count()); let (result, touched) = self - .download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline, ctx) + .download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline) .await; // Write updates to state to record layers we just downloaded or touched, irrespective of whether the overall result was successful { let mut detail = self.secondary_state.detail.lock().unwrap(); - let timeline_detail = detail.timelines.entry(timeline_id).or_default(); + let timeline_detail = detail.timelines.entry(timeline_id).or_insert_with(|| { + let ctx = ctx.with_scope_secondary_timeline(tenant_shard_id, &timeline_id); + SecondaryDetailTimeline::empty(ctx) + }); tracing::info!("Wrote timeline_detail for {} touched layers", touched.len()); touched.into_iter().for_each(|t| { @@ -1295,10 +1377,12 @@ async fn init_timeline_state( last_heatmap: Option<&HeatMapTimeline>, heatmap: &HeatMapTimeline, resident_metric: &UIntGauge, + ctx: &RequestContext, ) -> SecondaryDetailTimeline { - let timeline_path = conf.timeline_path(tenant_shard_id, 
&heatmap.timeline_id); - let mut detail = SecondaryDetailTimeline::default(); + let ctx = ctx.with_scope_secondary_timeline(tenant_shard_id, &heatmap.timeline_id); + let mut detail = SecondaryDetailTimeline::empty(ctx); + let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id); let mut dir = match tokio::fs::read_dir(&timeline_path).await { Ok(d) => d, Err(e) => { diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 8cc94b4e4d..c7ac50ca6a 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -33,7 +33,7 @@ pub struct ModelInputs { } /// A [`Segment`], with some extra information for display purposes -#[derive(Debug, serde::Serialize, serde::Deserialize)] +#[derive(Debug, serde::Serialize, serde::Deserialize, PartialEq, Eq)] pub struct SegmentMeta { pub segment: Segment, pub timeline_id: TimelineId, @@ -248,6 +248,8 @@ pub(super) async fn gather_inputs( None }; + let branch_is_invisible = timeline.is_invisible() == Some(true); + let lease_points = gc_info .leases .keys() @@ -271,7 +273,10 @@ pub(super) async fn gather_inputs( .map(|(lsn, _child_id, _is_offloaded)| (lsn, LsnKind::BranchPoint)) .collect::>(); - lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint))); + if !branch_is_invisible { + // Do not count lease points for invisible branches. + lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint))); + } drop(gc_info); @@ -287,7 +292,9 @@ pub(super) async fn gather_inputs( // Add a point for the PITR cutoff let branch_start_needed = next_pitr_cutoff <= branch_start_lsn; - if !branch_start_needed { + if !branch_start_needed && !branch_is_invisible { + // Only add the GcCutOff point when the timeline is visible; otherwise, do not compute the size for the LSN + // range from the last branch point to the latest data. 
lsns.push((next_pitr_cutoff, LsnKind::GcCutOff)); } @@ -373,11 +380,19 @@ pub(super) async fn gather_inputs( } } + let branch_end_lsn = if branch_is_invisible { + // If the branch is invisible, the branch end is the last requested LSN (likely a branch cutoff point). + segments.last().unwrap().segment.lsn + } else { + // Otherwise, the branch end is the last record LSN. + last_record_lsn.0 + }; + // Current end of the timeline segments.push(SegmentMeta { segment: Segment { parent: Some(parent), - lsn: last_record_lsn.0, + lsn: branch_end_lsn, size: None, // Filled in later, if necessary needed: true, }, @@ -609,6 +624,7 @@ async fn calculate_logical_size( Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res)) } +#[cfg(test)] #[test] fn verify_size_for_multiple_branches() { // this is generated from integration test test_tenant_size_with_multiple_branches, but this way @@ -766,6 +782,7 @@ fn verify_size_for_multiple_branches() { assert_eq!(inputs.calculate(), 37_851_408); } +#[cfg(test)] #[test] fn verify_size_for_one_branch() { let doc = r#" diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index ece163b24a..2ea0c1b979 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -13,13 +13,13 @@ pub mod merge_iterator; use std::cmp::Ordering; use std::collections::hash_map::Entry; use std::collections::{BinaryHeap, HashMap}; -use std::future::Future; use std::ops::Range; use std::pin::Pin; use std::sync::Arc; use std::sync::atomic::AtomicUsize; use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use crate::PERF_TRACE_TARGET; pub use batch_split_writer::{BatchLayerWriter, SplitDeltaLayerWriter, SplitImageLayerWriter}; use bytes::Bytes; pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef}; @@ -34,7 +34,7 @@ use pageserver_api::key::Key; use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; use pageserver_api::record::NeonWalRecord; use pageserver_api::value::Value; -use 
tracing::{Instrument, trace}; +use tracing::{Instrument, info_span, trace}; use utils::lsn::Lsn; use utils::sync::gate::GateGuard; @@ -43,7 +43,9 @@ use super::PageReconstructError; use super::layer_map::InMemoryLayerDesc; use super::timeline::{GetVectoredError, ReadPath}; use crate::config::PageServerConf; -use crate::context::{AccessStatsBehavior, RequestContext}; +use crate::context::{ + AccessStatsBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder, +}; pub fn range_overlaps(a: &Range, b: &Range) -> bool where @@ -874,13 +876,37 @@ impl ReadableLayer { ) -> Result<(), GetVectoredError> { match self { ReadableLayer::PersistentLayer(layer) => { + let ctx = RequestContextBuilder::from(ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "PLAN_LAYER", + layer = %layer + ) + }) + .attached_child(); + layer - .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx) + .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, &ctx) + .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) .await } ReadableLayer::InMemoryLayer(layer) => { + let ctx = RequestContextBuilder::from(ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "PLAN_LAYER", + layer = %layer + ) + }) + .attached_child(); + layer - .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx) + .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, &ctx) + .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) .await } } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index f2a34ceec9..12735352a2 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -959,9 +959,9 @@ impl DeltaLayerInner { where Reader: BlockReader + Clone, { - let ctx = 
RequestContextBuilder::extend(ctx) + let ctx = RequestContextBuilder::from(ctx) .page_content_kind(PageContentKind::DeltaLayerBtreeNode) - .build(); + .attached_child(); for range in keyspace.ranges.iter() { let mut range_end_handled = false; @@ -1168,9 +1168,9 @@ impl DeltaLayerInner { all_keys.push(entry); true }, - &RequestContextBuilder::extend(ctx) + &RequestContextBuilder::from(ctx) .page_content_kind(PageContentKind::DeltaLayerBtreeNode) - .build(), + .attached_child(), ) .await?; if let Some(last) = all_keys.last_mut() { diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 1626742771..228048ac9c 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -493,9 +493,9 @@ impl ImageLayerInner { let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); - let ctx = RequestContextBuilder::extend(ctx) + let ctx = RequestContextBuilder::from(ctx) .page_content_kind(PageContentKind::ImageLayerBtreeNode) - .build(); + .attached_child(); for range in keyspace.ranges.iter() { let mut range_end_handled = false; @@ -1225,7 +1225,6 @@ mod test { use super::{ImageLayerIterator, ImageLayerWriter}; use crate::DEFAULT_PG_VERSION; use crate::context::RequestContext; - use crate::tenant::config::TenantConf; use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; use crate::tenant::storage_layer::{Layer, ResidentLayer}; use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; @@ -1233,10 +1232,10 @@ mod test { #[tokio::test] async fn image_layer_rewrite() { - let tenant_conf = TenantConf { - gc_period: Duration::ZERO, - compaction_period: Duration::ZERO, - ..TenantConf::default() + let tenant_conf = pageserver_api::models::TenantConfig { + gc_period: Some(Duration::ZERO), + compaction_period: Some(Duration::ZERO), + ..Default::default() }; let tenant_id = TenantId::generate(); let mut gen_ = 
Generation::new(0xdead0001); diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 5a07389dc6..5d558e66cc 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -421,9 +421,9 @@ impl InMemoryLayer { reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { - let ctx = RequestContextBuilder::extend(ctx) + let ctx = RequestContextBuilder::from(ctx) .page_content_kind(PageContentKind::InMemoryLayer) - .build(); + .attached_child(); let inner = self.inner.read().await; diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 247092bf45..b7f6e5dc77 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -3,12 +3,13 @@ use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::{Arc, Weak}; use std::time::{Duration, SystemTime}; +use crate::PERF_TRACE_TARGET; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::HistoricLayerInfo; use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId}; -use tracing::Instrument; +use tracing::{Instrument, info_span}; use utils::generation::Generation; use utils::id::TimelineId; use utils::lsn::Lsn; @@ -18,7 +19,7 @@ use super::delta_layer::{self}; use super::image_layer::{self}; use super::{ AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName, - LayerVisibilityHint, PersistentLayerDesc, ValuesReconstructState, + LayerVisibilityHint, PerfInstrumentFutureExt, PersistentLayerDesc, ValuesReconstructState, }; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; @@ -324,16 +325,29 @@ impl Layer { reconstruct_data: &mut ValuesReconstructState, ctx: 
&RequestContext, ) -> Result<(), GetVectoredError> { - let downloaded = + let downloaded = { + let ctx = RequestContextBuilder::from(ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "GET_LAYER", + ) + }) + .attached_child(); + self.0 - .get_or_maybe_download(true, ctx) + .get_or_maybe_download(true, &ctx) + .maybe_perf_instrument(&ctx, |crnt_perf_context| crnt_perf_context.clone()) .await .map_err(|err| match err { DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => { GetVectoredError::Cancelled } other => GetVectoredError::Other(anyhow::anyhow!(other)), - })?; + })? + }; + let this = ResidentLayer { downloaded: downloaded.clone(), owner: self.clone(), @@ -341,9 +355,20 @@ impl Layer { self.record_access(ctx); + let ctx = RequestContextBuilder::from(ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "VISIT_LAYER", + ) + }) + .attached_child(); + downloaded - .get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, ctx) + .get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, &ctx) .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self)) + .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) .await .map_err(|err| match err { GetVectoredError::Other(err) => GetVectoredError::Other( @@ -950,6 +975,10 @@ impl LayerInner { allow_download: bool, ctx: &RequestContext, ) -> Result, DownloadError> { + let mut wait_for_download_recorder = + scopeguard::guard(utils::elapsed_accum::ElapsedAccum::default(), |accum| { + ctx.ondemand_download_wait_observe(accum.get()); + }); let (weak, permit) = { // get_or_init_detached can: // - be fast (mutex lock) OR uncontested semaphore permit acquire @@ -958,7 +987,7 @@ impl LayerInner { let locked = self .inner - .get_or_init_detached() + .get_or_init_detached_measured(Some(&mut wait_for_download_recorder)) .await .map(|mut guard| 
guard.get_and_upgrade().ok_or(guard)); @@ -988,6 +1017,7 @@ impl LayerInner { Err(permit) => (None, permit), } }; + let _guard = wait_for_download_recorder.guard(); if let Some(weak) = weak { // only drop the weak after dropping the heavier_once_cell guard @@ -1045,15 +1075,34 @@ impl LayerInner { return Err(DownloadError::DownloadRequired); } - let download_ctx = ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download); + let ctx = if ctx.has_perf_span() { + let dl_ctx = RequestContextBuilder::from(ctx) + .task_kind(TaskKind::LayerDownload) + .download_behavior(DownloadBehavior::Download) + .root_perf_span(|| { + info_span!( + target: PERF_TRACE_TARGET, + "DOWNLOAD_LAYER", + layer = %self, + reason = %reason + ) + }) + .detached_child(); + ctx.perf_follows_from(&dl_ctx); + dl_ctx + } else { + ctx.attached_child() + }; async move { tracing::info!(%reason, "downloading on-demand"); let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); let res = self - .download_init_and_wait(timeline, permit, download_ctx) + .download_init_and_wait(timeline, permit, ctx.attached_child()) + .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) .await?; + scopeguard::ScopeGuard::into_inner(init_cancelled); Ok(res) } @@ -1158,6 +1207,7 @@ impl LayerInner { permit: heavier_once_cell::InitPermit, ctx: &RequestContext, ) -> Result, remote_storage::DownloadError> { + let start = std::time::Instant::now(); let result = timeline .remote_client .download_layer_file( @@ -1169,7 +1219,8 @@ impl LayerInner { ctx, ) .await; - + let latency = start.elapsed(); + let latency_millis = u64::try_from(latency.as_millis()).unwrap(); match result { Ok(size) => { assert_eq!(size, self.desc.file_size); @@ -1185,9 +1236,8 @@ impl LayerInner { Err(e) => { panic!("post-condition failed: needs_download errored: {e:?}"); } - } - - tracing::info!(size=%self.desc.file_size, "on-demand download successful"); + }; + 
tracing::info!(size=%self.desc.file_size, %latency_millis, "on-demand download successful"); timeline .metrics .resident_physical_size_add(self.desc.file_size); @@ -1216,7 +1266,7 @@ impl LayerInner { return Err(e); } - tracing::error!(consecutive_failures, "layer file download failed: {e:#}"); + tracing::error!(consecutive_failures, %latency_millis, "layer file download failed: {e:#}"); let backoff = utils::backoff::exponential_backoff_duration_seconds( consecutive_failures.min(u32::MAX as usize) as u32, @@ -1720,9 +1770,9 @@ impl DownloadedLayer { ); let res = if owner.desc.is_delta { - let ctx = RequestContextBuilder::extend(ctx) + let ctx = RequestContextBuilder::from(ctx) .page_content_kind(crate::context::PageContentKind::DeltaLayerSummary) - .build(); + .attached_child(); let summary = Some(delta_layer::Summary::expected( owner.desc.tenant_shard_id.tenant_id, owner.desc.timeline_id, @@ -1738,9 +1788,9 @@ impl DownloadedLayer { .await .map(LayerKind::Delta) } else { - let ctx = RequestContextBuilder::extend(ctx) + let ctx = RequestContextBuilder::from(ctx) .page_content_kind(crate::context::PageContentKind::ImageLayerSummary) - .build(); + .attached_child(); let lsn = owner.desc.image_layer_lsn(); let summary = Some(image_layer::Summary::expected( owner.desc.tenant_shard_id.tenant_id, diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 7086429bfe..b6fd4678d6 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -119,6 +119,10 @@ async fn smoke_test() { let e = layer.evict_and_wait(FOREVER).await.unwrap_err(); assert!(matches!(e, EvictionError::NotFound)); + let dl_ctx = RequestContextBuilder::from(ctx) + .download_behavior(DownloadBehavior::Download) + .attached_child(); + // on accesses when the layer is evicted, it will automatically be downloaded. 
let img_after = { let mut data = ValuesReconstructState::new(io_concurrency.clone()); @@ -127,7 +131,7 @@ async fn smoke_test() { controlfile_keyspace.clone(), Lsn(0x10)..Lsn(0x11), &mut data, - ctx, + &dl_ctx, ) .instrument(download_span.clone()) .await @@ -177,7 +181,7 @@ async fn smoke_test() { // plain downloading is rarely needed layer - .download_and_keep_resident(ctx) + .download_and_keep_resident(&dl_ctx) .instrument(download_span) .await .unwrap(); @@ -645,9 +649,10 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { let ctx = ctx.with_scope_timeline(&timeline); // This test does downloads - let ctx = RequestContextBuilder::extend(&ctx) + let ctx = RequestContextBuilder::from(&ctx) .download_behavior(DownloadBehavior::Download) - .build(); + .attached_child(); + let layer = { let mut layers = { let layers = timeline.layers.read().await; @@ -730,9 +735,9 @@ async fn evict_and_wait_does_not_wait_for_download() { let ctx = ctx.with_scope_timeline(&timeline); // This test does downloads - let ctx = RequestContextBuilder::extend(&ctx) + let ctx = RequestContextBuilder::from(&ctx) .download_behavior(DownloadBehavior::Download) - .build(); + .attached_child(); let layer = { let mut layers = { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 51b78dcaba..df9a686eac 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -23,6 +23,7 @@ use std::sync::atomic::{AtomicBool, AtomicU64, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; +use crate::PERF_TRACE_TARGET; use anyhow::{Context, Result, anyhow, bail, ensure}; use arc_swap::{ArcSwap, ArcSwapOption}; use bytes::Bytes; @@ -84,9 +85,8 @@ use self::eviction_task::EvictionTaskTimelineState; use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use 
super::config::TenantConf; +use super::remote_timeline_client::RemoteTimelineClient; use super::remote_timeline_client::index::{GcCompactionState, IndexPart}; -use super::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError}; use super::secondary::heatmap::HeatMapLayer; use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer}; use super::tasks::log_compaction_error; @@ -97,7 +97,9 @@ use super::{ }; use crate::aux_file::AuxFileSizeEstimator; use crate::config::PageServerConf; -use crate::context::{DownloadBehavior, RequestContext}; +use crate::context::{ + DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder, +}; use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate, finite_f32}; use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::l0_flush::{self, L0FlushGlobalState}; @@ -111,7 +113,7 @@ use crate::pgdatadir_mapping::{ MAX_AUX_FILE_V2_DELTAS, MetricsUpdate, }; use crate::task_mgr::TaskKind; -use crate::tenant::config::{AttachmentMode, TenantConfOpt}; +use crate::tenant::config::AttachmentMode; use crate::tenant::gc_result::GcResult; use crate::tenant::layer_map::{LayerMap, SearchResult}; use crate::tenant::metadata::TimelineMetadata; @@ -536,11 +538,11 @@ impl GcInfo { /// between time-based and space-based retention for observability and consumption metrics purposes. #[derive(Debug, Clone)] pub(crate) struct GcCutoffs { - /// Calculated from the [`TenantConf::gc_horizon`], this LSN indicates how much + /// Calculated from the [`pageserver_api::models::TenantConfig::gc_horizon`], this LSN indicates how much /// history we must keep to retain a specified number of bytes of WAL. pub(crate) space: Lsn, - /// Calculated from [`TenantConf::pitr_interval`], this LSN indicates how much + /// Calculated from [`pageserver_api::models::TenantConfig::pitr_interval`], this LSN indicates how much /// history we must keep to enable reading back at least the PITR interval duration. 
pub(crate) time: Lsn, } @@ -871,9 +873,14 @@ pub(crate) enum CompactFlags { OnlyL0Compaction, EnhancedGcBottomMostCompaction, DryRun, - /// Disables compaction yielding e.g. due to high L0 count. This is set e.g. when requesting - /// compaction via HTTP API. - NoYield, + /// Makes image compaction yield if there's pending L0 compaction. This should always be used in + /// the background compaction task, since we want to aggressively compact down L0 to bound + /// read amplification. + /// + /// It only makes sense to use this when `compaction_l0_first` is enabled (such that we yield to + /// an L0 compaction pass), and without `OnlyL0Compaction` (L0 compaction shouldn't yield for L0 + /// compaction). + YieldForL0, } #[serde_with::serde_as] @@ -891,6 +898,12 @@ pub(crate) struct CompactRequest { pub sub_compaction_max_job_size_mb: Option, } +#[derive(Debug, Clone, serde::Deserialize)] +pub(crate) struct MarkInvisibleRequest { + #[serde(default)] + pub is_visible: Option, +} + #[derive(Debug, Clone, Default)] pub(crate) struct CompactOptions { pub flags: EnumSet, @@ -1279,9 +1292,22 @@ impl Timeline { }; reconstruct_state.read_path = read_path; - let traversal_res: Result<(), _> = self - .get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx) - .await; + let traversal_res: Result<(), _> = { + let ctx = RequestContextBuilder::from(ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "PLAN_IO", + ) + }) + .attached_child(); + + self.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, &ctx) + .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) + .await + }; + if let Err(err) = traversal_res { // Wait for all the spawned IOs to complete. // See comments on `spawn_io` inside `storage_layer` for more details. 
@@ -1295,14 +1321,46 @@ impl Timeline { let layers_visited = reconstruct_state.get_layers_visited(); + let ctx = RequestContextBuilder::from(ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "RECONSTRUCT", + ) + }) + .attached_child(); + let futs = FuturesUnordered::new(); for (key, state) in std::mem::take(&mut reconstruct_state.keys) { futs.push({ let walredo_self = self.myself.upgrade().expect("&self method holds the arc"); + let ctx = RequestContextBuilder::from(&ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "RECONSTRUCT_KEY", + key = %key, + ) + }) + .attached_child(); + async move { assert_eq!(state.situation, ValueReconstructSituation::Complete); - let converted = match state.collect_pending_ios().await { + let res = state + .collect_pending_ios() + .maybe_perf_instrument(&ctx, |crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "WAIT_FOR_IO_COMPLETIONS", + ) + }) + .await; + + let converted = match res { Ok(ok) => ok, Err(err) => { return (key, Err(err)); @@ -1319,16 +1377,27 @@ impl Timeline { "{converted:?}" ); - ( - key, - walredo_self.reconstruct_value(key, lsn, converted).await, - ) + let walredo_deltas = converted.num_deltas(); + let walredo_res = walredo_self + .reconstruct_value(key, lsn, converted) + .maybe_perf_instrument(&ctx, |crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "WALREDO", + deltas = %walredo_deltas, + ) + }) + .await; + + (key, walredo_res) } }); } let results = futs .collect::>>() + .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) .await; // For aux file keys (v1 or v2) the vectored read path does not return an error @@ -1892,18 +1961,19 @@ impl Timeline { // out by other background tasks (including image compaction). We request this via // `BackgroundLoopKind::L0Compaction`. 
// - // If this is a regular compaction pass, and L0-only compaction is enabled in the config, - // then we should yield for immediate L0 compaction if necessary while we're waiting for the - // background task semaphore. There's no point yielding otherwise, since we'd just end up - // right back here. + // Yield for pending L0 compaction while waiting for the semaphore. let is_l0_only = options.flags.contains(CompactFlags::OnlyL0Compaction); let semaphore_kind = match is_l0_only && self.get_compaction_l0_semaphore() { true => BackgroundLoopKind::L0Compaction, false => BackgroundLoopKind::Compaction, }; - let yield_for_l0 = !is_l0_only - && self.get_compaction_l0_first() - && !options.flags.contains(CompactFlags::NoYield); + let yield_for_l0 = options.flags.contains(CompactFlags::YieldForL0); + if yield_for_l0 { + // If this is an L0 pass, it doesn't make sense to yield for L0. + debug_assert!(!is_l0_only, "YieldForL0 during L0 pass"); + // If `compaction_l0_first` is disabled, there's no point yielding. + debug_assert!(self.get_compaction_l0_first(), "YieldForL0 without L0 pass"); + } let acquire = async move { let guard = self.compaction_lock.lock().await; @@ -2210,6 +2280,10 @@ impl Timeline { self.remote_client.is_archived() } + pub(crate) fn is_invisible(&self) -> Option { + self.remote_client.is_invisible() + } + pub(crate) fn is_stopping(&self) -> bool { self.current_state() == TimelineState::Stopping } @@ -2232,7 +2306,7 @@ impl Timeline { .await .expect("holding a reference to self"); } - TimelineState::Active { .. } => { + TimelineState::Active => { return Ok(()); } TimelineState::Broken { .. } | TimelineState::Stopping => { @@ -2402,6 +2476,31 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.lazy_slru_download) } + /// Checks if a get page request should get perf tracing + /// + /// The configuration priority is: tenant config override, default tenant config, + /// pageserver config. 
+ pub(crate) fn is_get_page_request_sampled(&self) -> bool { + let tenant_conf = self.tenant_conf.load(); + let ratio = tenant_conf + .tenant_conf + .sampling_ratio + .flatten() + .or(self.conf.default_tenant_conf.sampling_ratio) + .or(self.conf.tracing.as_ref().map(|t| t.sampling_ratio)); + + match ratio { + Some(r) => { + if r.numerator == 0 { + false + } else { + rand::thread_rng().gen_range(0..r.denominator) < r.numerator + } + } + None => false, + } + } + fn get_checkpoint_distance(&self) -> u64 { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2563,14 +2662,6 @@ impl Timeline { Some(max(l0_flush_stall_threshold, compaction_threshold)) } - fn get_l0_flush_wait_upload(&self) -> bool { - let tenant_conf = self.tenant_conf.load(); - tenant_conf - .tenant_conf - .l0_flush_wait_upload - .unwrap_or(self.conf.default_tenant_conf.l0_flush_wait_upload) - } - fn get_image_creation_threshold(&self) -> usize { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2598,8 +2689,8 @@ impl Timeline { } fn get_evictions_low_residence_duration_metric_threshold( - tenant_conf: &TenantConfOpt, - default_tenant_conf: &TenantConf, + tenant_conf: &pageserver_api::models::TenantConfig, + default_tenant_conf: &pageserver_api::config::TenantConfigToml, ) -> Duration { tenant_conf .evictions_low_residence_duration_metric_threshold @@ -3868,15 +3959,30 @@ impl Timeline { let TimelineVisitOutcome { completed_keyspace: completed, image_covered_keyspace, - } = Self::get_vectored_reconstruct_data_timeline( - timeline, - keyspace.clone(), - cont_lsn, - reconstruct_state, - &self.cancel, - ctx, - ) - .await?; + } = { + let ctx = RequestContextBuilder::from(ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "PLAN_IO_TIMELINE", + timeline = %timeline.timeline_id, + lsn = %cont_lsn, + ) + }) + .attached_child(); + + Self::get_vectored_reconstruct_data_timeline( + timeline, + keyspace.clone(), + cont_lsn, + reconstruct_state, 
+ &self.cancel, + &ctx, + ) + .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) + .await? + }; keyspace.remove_overlapping_with(&completed); @@ -3920,8 +4026,24 @@ impl Timeline { // Take the min to avoid reconstructing a page with data newer than request Lsn. cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1)); + + let ctx = RequestContextBuilder::from(ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "GET_ANCESTOR", + timeline = %timeline.timeline_id, + lsn = %cont_lsn, + ancestor = %ancestor_timeline.timeline_id, + ancestor_lsn = %timeline.ancestor_lsn + ) + }) + .attached_child(); + timeline_owned = timeline - .get_ready_ancestor_timeline(ancestor_timeline, ctx) + .get_ready_ancestor_timeline(ancestor_timeline, &ctx) + .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) .await?; timeline = &*timeline_owned; }; @@ -4592,27 +4714,6 @@ impl Timeline { // release lock on 'layers' }; - // Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files. - // This makes us refuse ingest until the new layers have been persisted to the remote - // TODO: remove this, and rely on l0_flush_{delay,stall}_threshold instead. 
- if self.get_l0_flush_wait_upload() { - let start = Instant::now(); - self.remote_client - .wait_completion() - .await - .map_err(|e| match e { - WaitCompletionError::UploadQueueShutDownOrStopped - | WaitCompletionError::NotInitialized( - NotInitialized::ShuttingDown | NotInitialized::Stopped, - ) => FlushLayerError::Cancelled, - WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => { - FlushLayerError::Other(anyhow!(e).into()) - } - })?; - let duration = start.elapsed().as_secs_f64(); - self.metrics.flush_wait_upload_time_gauge_add(duration); - } - // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`, // a compaction can delete the file and then it won't be available for uploads any more. // We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this @@ -7285,9 +7386,9 @@ mod tests { eprintln!("Downloading {layer} and re-generating heatmap"); - let ctx = &RequestContextBuilder::extend(ctx) + let ctx = &RequestContextBuilder::from(ctx) .download_behavior(crate::context::DownloadBehavior::Download) - .build(); + .attached_child(); let _resident = layer .download_and_keep_resident(ctx) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 4ef2a1c38d..2da1fb1b25 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -26,7 +26,7 @@ use once_cell::sync::Lazy; use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE; use pageserver_api::key::{KEY_SIZE, Key}; use pageserver_api::keyspace::{KeySpace, ShardedRange}; -use pageserver_api::models::CompactInfoResponse; +use pageserver_api::models::{CompactInfoResponse, CompactKeyRange}; use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; use pageserver_api::value::Value; @@ -61,7 +61,7 @@ use crate::tenant::timeline::{ DeltaLayerWriter, 
ImageLayerCreationOutcome, ImageLayerWriter, IoConcurrency, Layer, ResidentLayer, drop_rlock, }; -use crate::tenant::{DeltaLayer, MaybeOffloaded, gc_block}; +use crate::tenant::{DeltaLayer, MaybeOffloaded}; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; /// Maximum number of deltas before generating an image layer in bottom-most compaction. @@ -123,7 +123,6 @@ impl GcCompactionQueueItem { #[derive(Default)] struct GcCompactionGuardItems { notify: Option>, - gc_guard: Option, permit: Option, } @@ -279,7 +278,7 @@ impl GcCompactionQueue { gc_compaction_ratio_percent: u64, ) -> bool { const AUTO_TRIGGER_LIMIT: u64 = 150 * 1024 * 1024 * 1024; // 150GB - if l1_size >= AUTO_TRIGGER_LIMIT || l2_size >= AUTO_TRIGGER_LIMIT { + if l1_size + l2_size >= AUTO_TRIGGER_LIMIT { // Do not auto-trigger when physical size >= 150GB return false; } @@ -319,7 +318,12 @@ impl GcCompactionQueue { flags }, sub_compaction: true, - compact_key_range: None, + // Only auto-trigger gc-compaction over the data keyspace due to concerns in + // https://github.com/neondatabase/neon/issues/11318. 
+ compact_key_range: Some(CompactKeyRange { + start: Key::MIN, + end: Key::metadata_key_range().start, + }), compact_lsn_range: None, sub_compaction_max_job_size_mb: None, }, @@ -343,44 +347,45 @@ impl GcCompactionQueue { info!("compaction job id={} finished", id); let mut guard = self.inner.lock().unwrap(); if let Some(items) = guard.guards.remove(&id) { - drop(items.gc_guard); if let Some(tx) = items.notify { let _ = tx.send(()); } } } + fn clear_running_job(&self) { + let mut guard = self.inner.lock().unwrap(); + guard.running = None; + } + async fn handle_sub_compaction( &self, id: GcCompactionJobId, options: CompactOptions, timeline: &Arc, - gc_block: &GcBlock, auto: bool, ) -> Result<(), CompactionError> { info!( "running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs" ); - let jobs = timeline + let res = timeline .gc_compaction_split_jobs( GcCompactJob::from_compact_options(options.clone()), options.sub_compaction_max_job_size_mb, ) - .await?; + .await; + let jobs = match res { + Ok(jobs) => jobs, + Err(err) => { + warn!("cannot split gc-compaction jobs: {}, unblocked gc", err); + self.notify_and_unblock(id); + return Err(err); + } + }; if jobs.is_empty() { info!("no jobs to run, skipping scheduled compaction task"); self.notify_and_unblock(id); } else { - let gc_guard = match gc_block.start().await { - Ok(guard) => guard, - Err(e) => { - return Err(CompactionError::Other(anyhow!( - "cannot run gc-compaction because gc is blocked: {}", - e - ))); - } - }; - let jobs_len = jobs.len(); let mut pending_tasks = Vec::new(); // gc-compaction might pick more layers or fewer layers to compact. The L2 LSN does not need to be accurate. 
@@ -394,8 +399,8 @@ impl GcCompactionQueue { if job.dry_run { flags |= CompactFlags::DryRun; } - if options.flags.contains(CompactFlags::NoYield) { - flags |= CompactFlags::NoYield; + if options.flags.contains(CompactFlags::YieldForL0) { + flags |= CompactFlags::YieldForL0; } let options = CompactOptions { flags, @@ -415,7 +420,6 @@ impl GcCompactionQueue { { let mut guard = self.inner.lock().unwrap(); - guard.guards.entry(id).or_default().gc_guard = Some(gc_guard); let mut tasks = Vec::new(); for task in pending_tasks { let id = guard.next_id(); @@ -446,7 +450,18 @@ impl GcCompactionQueue { if let Err(err) = &res { log_compaction_error(err, None, cancel.is_cancelled()); } - res + match res { + Ok(res) => Ok(res), + Err(CompactionError::ShuttingDown) => Err(CompactionError::ShuttingDown), + Err(_) => { + // There are some cases where traditional gc might collect some layer + // files causing gc-compaction cannot read the full history of the key. + // This needs to be resolved in the long-term by improving the compaction + // process. For now, let's simply avoid such errors triggering the + // circuit breaker. + Ok(CompactionOutcome::Skipped) + } + } } async fn iteration_inner( @@ -494,27 +509,32 @@ impl GcCompactionQueue { info!( "running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs" ); - self.handle_sub_compaction(id, options, timeline, gc_block, auto) + self.handle_sub_compaction(id, options, timeline, auto) .await?; } else { // Auto compaction always enables sub-compaction so we don't need to handle update_l2_lsn // in this branch. 
- let gc_guard = match gc_block.start().await { + let _gc_guard = match gc_block.start().await { Ok(guard) => guard, Err(e) => { + self.notify_and_unblock(id); + self.clear_running_job(); return Err(CompactionError::Other(anyhow!( "cannot run gc-compaction because gc is blocked: {}", e ))); } }; - { - let mut guard = self.inner.lock().unwrap(); - guard.guards.entry(id).or_default().gc_guard = Some(gc_guard); - } - let compaction_result = - timeline.compact_with_options(cancel, options, ctx).await?; - self.notify_and_unblock(id); + let res = timeline.compact_with_options(cancel, options, ctx).await; + let compaction_result = match res { + Ok(res) => res, + Err(err) => { + warn!(%err, "failed to run gc-compaction"); + self.notify_and_unblock(id); + self.clear_running_job(); + return Err(err); + } + }; if compaction_result == CompactionOutcome::YieldForL0 { yield_for_l0 = true; } @@ -522,7 +542,25 @@ impl GcCompactionQueue { } GcCompactionQueueItem::SubCompactionJob(options) => { // TODO: error handling, clear the queue if any task fails? - let compaction_result = timeline.compact_with_options(cancel, options, ctx).await?; + let _gc_guard = match gc_block.start().await { + Ok(guard) => guard, + Err(e) => { + self.clear_running_job(); + return Err(CompactionError::Other(anyhow!( + "cannot run gc-compaction because gc is blocked: {}", + e + ))); + } + }; + let res = timeline.compact_with_options(cancel, options, ctx).await; + let compaction_result = match res { + Ok(res) => res, + Err(err) => { + warn!(%err, "failed to run gc-compaction subcompaction job"); + self.clear_running_job(); + return Err(err); + } + }; if compaction_result == CompactionOutcome::YieldForL0 { // We will permenantly give up a task if we yield for L0 compaction: the preempted subcompaction job won't be running // again. This ensures that we don't keep doing duplicated work within gc-compaction. 
Not directly returning here because @@ -553,10 +591,7 @@ impl GcCompactionQueue { } } } - { - let mut guard = self.inner.lock().unwrap(); - guard.running = None; - } + self.clear_running_job(); Ok(if yield_for_l0 { tracing::info!("give up gc-compaction: yield for L0 compaction"); CompactionOutcome::YieldForL0 @@ -984,7 +1019,7 @@ impl Timeline { // Yield if we have pending L0 compaction. The scheduler will do another pass. if (l0_outcome == CompactionOutcome::Pending || l0_outcome == CompactionOutcome::YieldForL0) - && !options.flags.contains(CompactFlags::NoYield) + && options.flags.contains(CompactFlags::YieldForL0) { info!("image/ancestor compaction yielding for L0 compaction"); return Ok(CompactionOutcome::YieldForL0); @@ -1002,9 +1037,9 @@ impl Timeline { { Ok(((dense_partitioning, sparse_partitioning), lsn)) => { // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them - let image_ctx = RequestContextBuilder::extend(ctx) + let image_ctx = RequestContextBuilder::from(ctx) .access_stats_behavior(AccessStatsBehavior::Skip) - .build(); + .attached_child(); let mut partitioning = dense_partitioning; partitioning @@ -1029,7 +1064,7 @@ impl Timeline { .load() .as_ref() .clone(), - !options.flags.contains(CompactFlags::NoYield), + options.flags.contains(CompactFlags::YieldForL0), ) .await .inspect_err(|err| { @@ -1210,6 +1245,10 @@ impl Timeline { let mut replace_image_layers = Vec::new(); for layer in layers_to_rewrite { + if self.cancel.is_cancelled() { + return Err(CompactionError::ShuttingDown); + } + tracing::info!(layer=%layer, "Rewriting layer after shard split..."); let mut image_layer_writer = ImageLayerWriter::new( self.conf, @@ -2640,7 +2679,7 @@ impl Timeline { ) -> Result { let sub_compaction = options.sub_compaction; let job = GcCompactJob::from_compact_options(options.clone()); - let no_yield = options.flags.contains(CompactFlags::NoYield); + let yield_for_l0 = 
options.flags.contains(CompactFlags::YieldForL0); if sub_compaction { info!( "running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs" @@ -2655,7 +2694,7 @@ impl Timeline { idx + 1, jobs_len ); - self.compact_with_gc_inner(cancel, job, ctx, no_yield) + self.compact_with_gc_inner(cancel, job, ctx, yield_for_l0) .await?; } if jobs_len == 0 { @@ -2663,7 +2702,8 @@ impl Timeline { } return Ok(CompactionOutcome::Done); } - self.compact_with_gc_inner(cancel, job, ctx, no_yield).await + self.compact_with_gc_inner(cancel, job, ctx, yield_for_l0) + .await } async fn compact_with_gc_inner( @@ -2671,7 +2711,7 @@ impl Timeline { cancel: &CancellationToken, job: GcCompactJob, ctx: &RequestContext, - no_yield: bool, + yield_for_l0: bool, ) -> Result { // Block other compaction/GC tasks from running for now. GC-compaction could run along // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc. @@ -2941,18 +2981,15 @@ impl Timeline { if cancel.is_cancelled() { return Err(CompactionError::ShuttingDown); } - if !no_yield { - let should_yield = self + let should_yield = yield_for_l0 + && self .l0_compaction_trigger .notified() .now_or_never() .is_some(); - if should_yield { - tracing::info!( - "preempt gc-compaction when downloading layers: too many L0 layers" - ); - return Ok(CompactionOutcome::YieldForL0); - } + if should_yield { + tracing::info!("preempt gc-compaction when downloading layers: too many L0 layers"); + return Ok(CompactionOutcome::YieldForL0); } let resident_layer = layer .download_and_keep_resident(ctx) @@ -3089,21 +3126,17 @@ impl Timeline { return Err(CompactionError::ShuttingDown); } - if !no_yield { - keys_processed += 1; - if keys_processed % 1000 == 0 { - let should_yield = self - .l0_compaction_trigger - .notified() - .now_or_never() - .is_some(); - if should_yield { - tracing::info!( - "preempt gc-compaction in the main loop: too many L0 layers" - ); - return 
Ok(CompactionOutcome::YieldForL0); - } - } + keys_processed += 1; + let should_yield = yield_for_l0 + && keys_processed % 1000 == 0 + && self + .l0_compaction_trigger + .notified() + .now_or_never() + .is_some(); + if should_yield { + tracing::info!("preempt gc-compaction in the main loop: too many L0 layers"); + return Ok(CompactionOutcome::YieldForL0); } if self.shard_identity.is_key_disposable(&key) { // If this shard does not need to store this key, simply skip it. diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 66e6d1e848..5fdb40d7dc 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -2,10 +2,14 @@ use std::collections::HashSet; use std::sync::Arc; use anyhow::Context; +use bytes::Bytes; use http_utils::error::ApiError; +use pageserver_api::key::Key; +use pageserver_api::keyspace::KeySpace; use pageserver_api::models::DetachBehavior; use pageserver_api::models::detach_ancestor::AncestorDetached; use pageserver_api::shard::ShardIdentity; +use pageserver_compaction::helpers::overlaps_with; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; use tracing::Instrument; @@ -22,7 +26,10 @@ use crate::task_mgr::TaskKind; use crate::tenant::Tenant; use crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor; use crate::tenant::storage_layer::layer::local_layer_path; -use crate::tenant::storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer}; +use crate::tenant::storage_layer::{ + AsLayerDesc as _, DeltaLayerWriter, ImageLayerWriter, IoConcurrency, Layer, ResidentLayer, + ValuesReconstructState, +}; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; #[derive(Debug, thiserror::Error)] @@ -170,6 +177,92 @@ impl Attempt { } } +async fn generate_tombstone_image_layer( + detached: &Arc, + ancestor: &Arc, + ancestor_lsn: Lsn, + ctx: &RequestContext, +) -> Result, Error> { + 
tracing::info!( + "removing non-inherited keys by writing an image layer with tombstones at the detach LSN" + ); + let io_concurrency = IoConcurrency::spawn_from_conf( + detached.conf, + detached.gate.enter().map_err(|_| Error::ShuttingDown)?, + ); + let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); + // Directly use `get_vectored_impl` to skip the max_vectored_read_key limit check. Note that the keyspace should + // not contain too many keys, otherwise this takes a lot of memory. Currently we limit it to 10k keys in the compute. + let key_range = Key::sparse_non_inherited_keyspace(); + // avoid generating a "future layer" which will then be removed + let image_lsn = ancestor_lsn; + + { + let layers = detached.layers.read().await; + for layer in layers.all_persistent_layers() { + if !layer.is_delta + && layer.lsn_range.start == image_lsn + && overlaps_with(&key_range, &layer.key_range) + { + tracing::warn!( + layer=%layer, "image layer at the detach LSN already exists, skipping removing aux files" + ); + return Ok(None); + } + } + } + + let data = ancestor + .get_vectored_impl( + KeySpace::single(key_range.clone()), + image_lsn, + &mut reconstruct_state, + ctx, + ) + .await + .context("failed to retrieve aux keys") + .map_err(|e| Error::launder(e, Error::Prepare))?; + if !data.is_empty() { + // TODO: is it possible that we can have an image at `image_lsn`? Unlikely because image layers are only generated + // upon compaction but theoretically possible. 
+ let mut image_layer_writer = ImageLayerWriter::new( + detached.conf, + detached.timeline_id, + detached.tenant_shard_id, + &key_range, + image_lsn, + ctx, + ) + .await + .context("failed to create image layer writer") + .map_err(Error::Prepare)?; + for key in data.keys() { + image_layer_writer + .put_image(*key, Bytes::new(), ctx) + .await + .context("failed to write key") + .map_err(|e| Error::launder(e, Error::Prepare))?; + } + let (desc, path) = image_layer_writer + .finish(ctx) + .await + .context("failed to finish image layer writer for removing the metadata keys") + .map_err(|e| Error::launder(e, Error::Prepare))?; + let generated = Layer::finish_creating(detached.conf, detached, desc, &path) + .map_err(|e| Error::launder(e, Error::Prepare))?; + detached + .remote_client + .upload_layer_file(&generated, &detached.cancel) + .await + .map_err(|e| Error::launder(e, Error::Prepare))?; + tracing::info!(layer=%generated, "wrote image layer"); + Ok(Some(generated)) + } else { + tracing::info!("no aux keys found in ancestor"); + Ok(None) + } +} + /// See [`Timeline::prepare_to_detach_from_ancestor`] pub(super) async fn prepare( detached: &Arc, @@ -235,7 +328,7 @@ pub(super) async fn prepare( return Err(NoAncestor); } - check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn)?; + check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn, behavior)?; if let DetachBehavior::MultiLevelAndNoReparent = behavior { // If the ancestor has an ancestor, we might be able to fast-path detach it if the current ancestor does not have any data written/used by the detaching timeline. @@ -249,7 +342,13 @@ pub(super) async fn prepare( ancestor_lsn = ancestor.ancestor_lsn; // Get the LSN first before resetting the `ancestor` variable ancestor = ancestor_of_ancestor; // TODO: do we still need to check if we don't want to reparent? 
- check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn)?; + check_no_archived_children_of_ancestor( + tenant, + detached, + &ancestor, + ancestor_lsn, + behavior, + )?; } } else if ancestor.ancestor_timeline.is_some() { // non-technical requirement; we could flatten N ancestors just as easily but we chose @@ -346,10 +445,16 @@ pub(super) async fn prepare( // TODO: copying and lsn prefix copying could be done at the same time with a single fsync after let mut new_layers: Vec = - Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len()); + Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len() + 1); + + if let Some(tombstone_layer) = + generate_tombstone_image_layer(detached, &ancestor, ancestor_lsn, ctx).await? + { + new_layers.push(tombstone_layer.into()); + } { - tracing::debug!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers"); + tracing::info!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers"); let mut tasks = tokio::task::JoinSet::new(); @@ -1158,31 +1263,44 @@ fn check_no_archived_children_of_ancestor( detached: &Arc, ancestor: &Arc, ancestor_lsn: Lsn, + detach_behavior: DetachBehavior, ) -> Result<(), Error> { - let timelines = tenant.timelines.lock().unwrap(); - let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap(); - for timeline in reparentable_timelines(timelines.values(), detached, ancestor, ancestor_lsn) { - if timeline.is_archived() == Some(true) { - return Err(Error::Archived(timeline.timeline_id)); - } - } - for timeline_offloaded in timelines_offloaded.values() { - if timeline_offloaded.ancestor_timeline_id != Some(ancestor.timeline_id) { - continue; - } - // This forbids the detach ancestor feature if flattened timelines are present, - // even if the ancestor_lsn is from after the branchpoint of the detached timeline. - // But as per current design, we don't record the ancestor_lsn of flattened timelines. 
- // This is a bit unfortunate, but as of writing this we don't support flattening - // anyway. Maybe we can evolve the data model in the future. - if let Some(retain_lsn) = timeline_offloaded.ancestor_retain_lsn { - let is_earlier = retain_lsn <= ancestor_lsn; - if !is_earlier { - continue; + match detach_behavior { + DetachBehavior::NoAncestorAndReparent => { + let timelines = tenant.timelines.lock().unwrap(); + let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap(); + + for timeline in + reparentable_timelines(timelines.values(), detached, ancestor, ancestor_lsn) + { + if timeline.is_archived() == Some(true) { + return Err(Error::Archived(timeline.timeline_id)); + } + } + + for timeline_offloaded in timelines_offloaded.values() { + if timeline_offloaded.ancestor_timeline_id != Some(ancestor.timeline_id) { + continue; + } + // This forbids the detach ancestor feature if flattened timelines are present, + // even if the ancestor_lsn is from after the branchpoint of the detached timeline. + // But as per current design, we don't record the ancestor_lsn of flattened timelines. + // This is a bit unfortunate, but as of writing this we don't support flattening + // anyway. Maybe we can evolve the data model in the future. + if let Some(retain_lsn) = timeline_offloaded.ancestor_retain_lsn { + let is_earlier = retain_lsn <= ancestor_lsn; + if !is_earlier { + continue; + } + } + return Err(Error::Archived(timeline_offloaded.timeline_id)); } } - return Err(Error::Archived(timeline_offloaded.timeline_id)); + DetachBehavior::MultiLevelAndNoReparent => { + // We don't need to check anything if the user requested to not reparent. 
+ } } + Ok(()) } diff --git a/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs index 7c7a4de2fc..352bbbc4d4 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs @@ -32,9 +32,15 @@ impl Client { let Some(ref base_url) = conf.import_pgdata_upcall_api else { anyhow::bail!("import_pgdata_upcall_api is not configured") }; + let mut http_client = reqwest::Client::builder(); + for cert in &conf.ssl_ca_certs { + http_client = http_client.add_root_certificate(cert.clone()); + } + let http_client = http_client.build()?; + Ok(Self { base_url: base_url.to_string(), - client: reqwest::Client::new(), + client: http_client, cancel, authorization_header: conf .import_pgdata_upcall_api_token diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs index c1b4015ae2..07f949b89e 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs @@ -25,8 +25,8 @@ impl AlignedBufferMut> { /// * `align` must be a power of two, /// /// * `capacity`, when rounded up to the nearest multiple of `align`, - /// must not overflow isize (i.e., the rounded value must be - /// less than or equal to `isize::MAX`). + /// must not overflow isize (i.e., the rounded value must be + /// less than or equal to `isize::MAX`). 
pub fn with_capacity(capacity: usize) -> Self { AlignedBufferMut { raw: RawAlignedBuffer::with_capacity(capacity), diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs index 97a6c4049a..d273772411 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs @@ -37,8 +37,8 @@ impl RawAlignedBuffer> { /// * `align` must be a power of two, /// /// * `capacity`, when rounded up to the nearest multiple of `align`, - /// must not overflow isize (i.e., the rounded value must be - /// less than or equal to `isize::MAX`). + /// must not overflow isize (i.e., the rounded value must be + /// less than or equal to `isize::MAX`). pub fn with_capacity(capacity: usize) -> Self { let align = ConstAlign::; let layout = Layout::from_size_align(capacity, align.align()).expect("Invalid layout"); diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index c87ae59fd6..8259d24359 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -10,6 +10,7 @@ OBJS = \ libpagestore.o \ logical_replication_monitor.o \ neon.o \ + neon_lwlsncache.o \ neon_pgversioncompat.o \ neon_perf_counters.o \ neon_utils.o \ diff --git a/pgxn/neon/bitmap.h b/pgxn/neon/bitmap.h index 0a131816ef..21efd13547 100644 --- a/pgxn/neon/bitmap.h +++ b/pgxn/neon/bitmap.h @@ -9,4 +9,4 @@ #define BITMAP_SET(bm, bit) (bm)[(bit) >> 3] |= (1 << ((bit) & 7)) #define BITMAP_CLR(bm, bit) (bm)[(bit) >> 3] &= ~(1 << ((bit) & 7)) -#endif //NEON_BITMAP_H +#endif /* NEON_BITMAP_H */ diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index 59096a1bc8..47ed37da06 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -13,9 +13,6 @@ * accumulate changes. On subtransaction commit, the top of the stack * is merged with the table below it. 
* - * IDENTIFICATION - * contrib/neon/control_plane_connector.c - * *------------------------------------------------------------------------- */ diff --git a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c index 0331f961b4..00dcb6920e 100644 --- a/pgxn/neon/extension_server.c +++ b/pgxn/neon/extension_server.c @@ -3,9 +3,6 @@ * extension_server.c * Request compute_ctl to download extension files. * - * IDENTIFICATION - * contrib/neon/extension_server.c - * *------------------------------------------------------------------------- */ #include "postgres.h" diff --git a/pgxn/neon/extension_server.h b/pgxn/neon/extension_server.h index 3e67708b85..8356d70959 100644 --- a/pgxn/neon/extension_server.h +++ b/pgxn/neon/extension_server.h @@ -3,9 +3,6 @@ * extension_server.h * Request compute_ctl to download extension files. * - * IDENTIFICATION - * contrib/neon/extension_server.h - * *------------------------------------------------------------------------- */ diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index f13522e55b..97a4c39e49 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -1,4 +1,4 @@ -/* +/*------------------------------------------------------------------------- * * file_cache.c * @@ -6,10 +6,6 @@ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * - * IDENTIFICATION - * pgxn/neon/file_cache.c - * *------------------------------------------------------------------------- */ @@ -48,6 +44,7 @@ #include "hll.h" #include "bitmap.h" #include "neon.h" +#include "neon_lwlsncache.h" #include "neon_perf_counters.h" #define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "LFC: assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0) @@ -646,18 +643,25 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, return found; } +#if PG_MAJORVERSION_NUM >= 16 +static PGIOAlignedBlock voidblock = 
{0}; +#else +static PGAlignedBlock voidblock = {0}; +#endif +#define SCRIBBLEPAGE (&voidblock.data) + /* * Try to read pages from local cache. * Returns the number of pages read from the local cache, and sets bits in - * 'read' for the pages which were read. This may scribble over buffers not - * marked in 'read', so be careful with operation ordering. + * 'mask' for the pages which were read. This may scribble over buffers not + * marked in 'mask', so be careful with operation ordering. * * In case of error local file cache is disabled (lfc->limit is set to zero), - * and -1 is returned. Note that 'read' and the buffers may be touched and in - * an otherwise invalid state. + * and -1 is returned. * - * If the mask argument is supplied, bits will be set at the offsets of pages - * that were present and read from the LFC. + * If the mask argument is supplied, we'll only try to read those pages which + * don't have their bits set on entry. At exit, pages which were successfully + * read from LFC will have their bits set. 
*/ int lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, @@ -692,23 +696,43 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, while (nblocks > 0) { struct iovec iov[PG_IOV_MAX]; - int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); + int8 chunk_mask[BLOCKS_PER_CHUNK / 8] = {0}; + int chunk_offs = (blkno & (BLOCKS_PER_CHUNK - 1)); int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); int iteration_hits = 0; int iteration_misses = 0; uint64 io_time_us = 0; - int n_blocks_to_read = 0; + int n_blocks_to_read = 0; + int iov_last_used = 0; + int first_block_in_chunk_read = -1; ConditionVariable* cv; Assert(blocks_in_chunk > 0); for (int i = 0; i < blocks_in_chunk; i++) { - n_blocks_to_read += (BITMAP_ISSET(mask, buf_offset + i) != 0); - iov[i].iov_base = buffers[buf_offset + i]; iov[i].iov_len = BLCKSZ; - BITMAP_CLR(mask, buf_offset + i); + /* mask not set = we must do work */ + if (!BITMAP_ISSET(mask, buf_offset + i)) + { + iov[i].iov_base = buffers[buf_offset + i]; + n_blocks_to_read++; + iov_last_used = i + 1; + + if (first_block_in_chunk_read == -1) + { + first_block_in_chunk_read = i; + } + } + /* mask set = we must do no work */ + else + { + /* don't scribble on pages we weren't requested to write to */ + iov[i].iov_base = SCRIBBLEPAGE; + } } + + /* shortcut IO */ if (n_blocks_to_read == 0) { buf_offset += blocks_in_chunk; @@ -717,6 +741,12 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, continue; } + /* + * The effective iov size must be >= the number of blocks we're about + * to read. 
+ */ + Assert(iov_last_used - first_block_in_chunk_read >= n_blocks_to_read); + tag.blockNum = blkno - chunk_offs; hash = get_hash_value(lfc_hash, &tag); cv = &lfc_ctl->cv[hash % N_COND_VARS]; @@ -761,10 +791,15 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, generation = lfc_ctl->generation; entry_offset = entry->offset; - for (int i = 0; i < blocks_in_chunk; i++) + for (int i = first_block_in_chunk_read; i < iov_last_used; i++) { FileCacheBlockState state = UNAVAILABLE; bool sleeping = false; + + /* no need to work on something we're not interested in */ + if (BITMAP_ISSET(mask, buf_offset + i)) + continue; + while (lfc_ctl->generation == generation) { state = GET_STATE(entry, chunk_offs + i); @@ -788,7 +823,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, } if (state == AVAILABLE) { - BITMAP_SET(mask, buf_offset + i); + BITMAP_SET(chunk_mask, i); iteration_hits++; } else @@ -800,16 +835,34 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (iteration_hits != 0) { + /* chunk offset (# of pages) into the LFC file */ + off_t first_read_offset = (off_t) entry_offset * BLOCKS_PER_CHUNK; + int nwrite = iov_last_used - first_block_in_chunk_read; + /* offset of first IOV */ + first_read_offset += chunk_offs + first_block_in_chunk_read; + pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_READ); - rc = preadv(lfc_desc, iov, blocks_in_chunk, - ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + + /* Read only the blocks we're interested in, limiting */ + rc = preadv(lfc_desc, &iov[first_block_in_chunk_read], + nwrite, first_read_offset * BLCKSZ); pgstat_report_wait_end(); - if (rc != (BLCKSZ * blocks_in_chunk)) + if (rc != (BLCKSZ * nwrite)) { lfc_disable("read"); return -1; } + + /* + * We successfully read the pages we know were valid when we + * started reading; now mark those pages as read + */ + for (int i = first_block_in_chunk_read; i < iov_last_used; i++) + { + 
if (BITMAP_ISSET(chunk_mask, i)) + BITMAP_SET(mask, buf_offset + i); + } } /* Place entry to the head of LRU list */ @@ -999,7 +1052,9 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, LWLockRelease(lfc_lock); return false; } - lwlsn = GetLastWrittenLSN(rinfo, forknum, blkno); + + lwlsn = neon_get_lwlsn(rinfo, forknum, blkno); + if (lwlsn > lsn) { elog(DEBUG1, "Skip LFC write for %d because LwLSN=%X/%X is greater than not_nodified_since LSN %X/%X", @@ -1010,6 +1065,9 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); + tag.blockNum = blkno; + addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + if (found) { state = GET_STATE(entry, chunk_offs); @@ -1160,6 +1218,13 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); + /* Approximate working set for the blocks assumed in this entry */ + for (int i = 0; i < blocks_in_chunk; i++) + { + tag.blockNum = blkno + i; + addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + } + if (found) { /* diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 637281fe4a..60b2249461 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -6,15 +6,12 @@ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * - * IDENTIFICATION - * contrib/neon/libpqpagestore.c - * *------------------------------------------------------------------------- */ #include "postgres.h" #include +#include #include "libpq-int.h" @@ -33,6 +30,7 @@ #include "storage/lwlock.h" #include "storage/pg_shmem.h" #include "utils/guc.h" +#include "utils/memutils.h" #include "neon.h" #include "neon_perf_counters.h" @@ -50,6 +48,20 @@ #define MIN_RECONNECT_INTERVAL_USEC 
1000 #define MAX_RECONNECT_INTERVAL_USEC 1000000 + +enum NeonComputeMode { + CP_MODE_PRIMARY = 0, + CP_MODE_REPLICA, + CP_MODE_STATIC +}; + +static const struct config_enum_entry neon_compute_modes[] = { + {"primary", CP_MODE_PRIMARY, false}, + {"replica", CP_MODE_REPLICA, false}, + {"static", CP_MODE_STATIC, false}, + {NULL, 0, false} +}; + /* GUCs */ char *neon_timeline; char *neon_tenant; @@ -62,11 +74,13 @@ int flush_every_n_requests = 8; int neon_protocol_version = 2; +static int neon_compute_mode = 0; static int max_reconnect_attempts = 60; static int stripe_size; static int pageserver_response_log_timeout = 10000; -static int pageserver_response_disconnect_timeout = 120000; /* 2 minutes */ +/* 2.5 minutes. A bit higher than highest default TCP retransmission timeout */ +static int pageserver_response_disconnect_timeout = 150000; typedef struct { @@ -390,9 +404,10 @@ pageserver_connect(shardno_t shard_no, int elevel) { case PS_Disconnected: { - const char *keywords[4]; - const char *values[4]; - char pid_str[16]; + const char *keywords[5]; + const char *values[5]; + char pid_str[16] = { 0 }; + char endpoint_str[36] = { 0 }; int n_pgsql_params; TimestampTz now; int64 us_since_last_attempt; @@ -464,6 +479,31 @@ pageserver_connect(shardno_t shard_no, int elevel) n_pgsql_params++; } + { + bool param_set = false; + switch (neon_compute_mode) + { + case CP_MODE_PRIMARY: + strncpy(endpoint_str, "-c neon.compute_mode=primary", sizeof(endpoint_str)); + param_set = true; + break; + case CP_MODE_REPLICA: + strncpy(endpoint_str, "-c neon.compute_mode=replica", sizeof(endpoint_str)); + param_set = true; + break; + case CP_MODE_STATIC: + strncpy(endpoint_str, "-c neon.compute_mode=static", sizeof(endpoint_str)); + param_set = true; + break; + } + if (param_set) + { + keywords[n_pgsql_params] = "options"; + values[n_pgsql_params] = endpoint_str; + n_pgsql_params++; + } + } + keywords[n_pgsql_params] = NULL; values[n_pgsql_params] = NULL; @@ -722,6 +762,24 @@ 
get_socket_stats(int socketfd, int *sndbuf, int *recvbuf) #endif } +/* + * Tries to get the local port of a socket. Sets 'port' to -1 on error. + */ +static void +get_local_port(int socketfd, int *port) +{ + struct sockaddr_in addr; + socklen_t addr_len = sizeof(addr); + + memset(&addr, 0, addr_len); + if (getsockname(socketfd, (struct sockaddr*) &addr, &addr_len) == 0) + { + *port = ntohs(addr.sin_port); + } else { + *port = -1; + } +} + /* * A wrapper around PQgetCopyData that checks for interrupts while sleeping. */ @@ -811,15 +869,17 @@ retry: */ if (INSTR_TIME_GET_MILLISEC(since_last_log) >= pageserver_response_log_timeout) { + int port; int sndbuf; int recvbuf; + get_local_port(PQsocket(pageserver_conn), &port); get_socket_stats(PQsocket(pageserver_conn), &sndbuf, &recvbuf); neon_shard_log(shard_no, LOG, - "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses) (socket sndbuf=%d recvbuf=%d) (conn start=%d end=%d)", + "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses) (socket port=%d sndbuf=%d recvbuf=%d) (conn start=%d end=%d)", INSTR_TIME_GET_DOUBLE(since_start), - shard->nrequests_sent, shard->nresponses_received, sndbuf, recvbuf, + shard->nrequests_sent, shard->nresponses_received, port, sndbuf, recvbuf, pageserver_conn->inStart, pageserver_conn->inEnd); shard->receive_last_log_time = now; shard->receive_logged = true; @@ -841,8 +901,10 @@ retry: */ if (INSTR_TIME_GET_MILLISEC(since_start) >= pageserver_response_disconnect_timeout) { - neon_shard_log(shard_no, LOG, "no response from pageserver for %0.3f s, disconnecting", - INSTR_TIME_GET_DOUBLE(since_start)); + int port; + get_local_port(PQsocket(pageserver_conn), &port); + neon_shard_log(shard_no, LOG, "no response from pageserver for %0.3f s, disconnecting (socket port=%d)", + INSTR_TIME_GET_DOUBLE(since_start), port); 
pageserver_disconnect(shard_no); return -1; } @@ -1077,15 +1139,22 @@ pageserver_try_receive(shardno_t shard_no) NeonResponse *resp; PageServer *shard = &page_servers[shard_no]; PGconn *pageserver_conn = shard->conn; - /* read response */ - int rc; + int rc; if (shard->state != PS_Connected) return NULL; Assert(pageserver_conn); - rc = PQgetCopyData(shard->conn, &resp_buff.data, 1 /* async = true */); + rc = PQgetCopyData(shard->conn, &resp_buff.data, 1 /* async */); + if (rc == 0) + { + if (!PQconsumeInput(shard->conn)) + { + return NULL; + } + rc = PQgetCopyData(shard->conn, &resp_buff.data, 1 /* async */); + } if (rc == 0) return NULL; @@ -1365,11 +1434,22 @@ pg_init_libpagestore(void) "If the pageserver doesn't respond to a request within this timeout, " "disconnect and reconnect.", &pageserver_response_disconnect_timeout, - 120000, 100, INT_MAX, + 150000, 100, INT_MAX, PGC_SUSET, GUC_UNIT_MS, NULL, NULL, NULL); + DefineCustomEnumVariable( + "neon.compute_mode", + "The compute endpoint node type", + NULL, + &neon_compute_mode, + CP_MODE_PRIMARY, + neon_compute_modes, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); + relsize_hash_init(); if (page_server != NULL) diff --git a/pgxn/neon/logical_replication_monitor.c b/pgxn/neon/logical_replication_monitor.c index b94faafdfa..69426c2e83 100644 --- a/pgxn/neon/logical_replication_monitor.c +++ b/pgxn/neon/logical_replication_monitor.c @@ -1,11 +1,11 @@ +#include "postgres.h" + #include #include #include #include #include -#include "postgres.h" - #include "miscadmin.h" #include "postmaster/bgworker.h" #include "postmaster/interrupt.h" diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 0f226cc9e2..081025e2d5 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -1,10 +1,7 @@ /*------------------------------------------------------------------------- * * neon.c - * Utility functions to expose neon specific information to user - * - * IDENTIFICATION - * contrib/neon/neon.c + * Main entry point into the neon extension * 
*------------------------------------------------------------------------- */ @@ -33,6 +30,7 @@ #include "extension_server.h" #include "neon.h" +#include "neon_lwlsncache.h" #include "control_plane_connector.h" #include "logical_replication_monitor.h" #include "unstable_extensions.h" @@ -437,6 +435,8 @@ _PG_init(void) pg_init_libpagestore(); pg_init_walproposer(); + init_lwlsncache(); + pagestore_smgr_init(); Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index c9beb8c318..e2fa136e37 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -3,15 +3,13 @@ * neon.h * Functions used in the initialization of this extension. * - * IDENTIFICATION - * contrib/neon/neon.h - * *------------------------------------------------------------------------- */ #ifndef NEON_H #define NEON_H -#include "access/xlogreader.h" + +#include "access/xlogdefs.h" #include "utils/wait_event.h" /* GUCs */ @@ -58,8 +56,8 @@ extern void SetNeonCurrentClusterSize(uint64 size); extern uint64 GetNeonCurrentClusterSize(void); extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); -extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]); -extern void PGDLLEXPORT WalProposerMain(Datum main_arg); -PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg); +extern PGDLLEXPORT void WalProposerSync(int argc, char *argv[]); +extern PGDLLEXPORT void WalProposerMain(Datum main_arg); +extern PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg); #endif /* NEON_H */ diff --git a/pgxn/neon/neon_lwlsncache.c b/pgxn/neon/neon_lwlsncache.c new file mode 100644 index 0000000000..59222eb855 --- /dev/null +++ b/pgxn/neon/neon_lwlsncache.c @@ -0,0 +1,508 @@ +#include "postgres.h" + +#include "neon_lwlsncache.h" + +#include "miscadmin.h" +#include "access/xlog.h" +#include "storage/ipc.h" +#include "storage/shmem.h" +#include "storage/buf_internals.h" +#include "utils/guc.h" +#include 
"utils/hsearch.h" + + + +typedef struct LastWrittenLsnCacheEntry +{ + BufferTag key; /* hash key; eviction passes the entry pointer itself as the key, which relies on this being the first member */ + XLogRecPtr lsn; /* last written LSN recorded for this block */ + /* double linked list for LRU replacement algorithm */ + dlist_node lru_node; +} LastWrittenLsnCacheEntry; + +typedef struct LwLsnCacheCtl { + int lastWrittenLsnCacheSize; /* copy of the cache-size GUC value, stored by shmeminit() */ + /* + * Maximal last written LSN for pages not present in lastWrittenLsnCache + */ + XLogRecPtr maxLastWrittenLsn; + + /* + * Double linked list to implement LRU replacement policy for last written LSN cache. + * Access to this list as well as to last written LSN cache is protected by 'LastWrittenLsnLock'. + */ + dlist_head lastWrittenLsnLRU; +} LwLsnCacheCtl; + + +/* + * Cache of last written LSN for each relation page. + * To provide a request LSN for smgrnblocks and smgrexists there is also a pseudo-key (block number + * REL_METADATA_PSEUDO_BLOCKNO) which stores the LSN of the last relation metadata update. + * Size of the cache is limited by the GUC neon.last_written_lsn_cache_size (kept in lastWrittenLsnCacheSize), + * entries are replaced using an LRU algorithm, based on a doubly linked list. + * Access to this cache is protected by 'LastWrittenLsnLock'. 
+ */ +static HTAB *lastWrittenLsnCache; + +LwLsnCacheCtl* LwLsnCache; + +static int lwlsn_cache_size = (128 * 1024); + + +static void +lwlc_register_gucs(void) +{ /* defines neon.last_written_lsn_cache_size: default 128*1024, minimum 1024, context PGC_POSTMASTER */ + DefineCustomIntVariable("neon.last_written_lsn_cache_size", + "Size of last written LSN cache used by Neon", + NULL, + &lwlsn_cache_size, + (128*1024), 1024, INT_MAX, + PGC_POSTMASTER, + 0, /* plain units */ + NULL, NULL, NULL); +} + +static XLogRecPtr SetLastWrittenLSNForBlockRangeInternal(XLogRecPtr lsn, + NRelFileInfo rlocator, + ForkNumber forknum, + BlockNumber from, + BlockNumber n_blocks); + +/* All the necessary hooks are defined here */ + + +/* These hold the set_lwlsn_* hooks which were installed before ours, if any. NOTE(review): no neon_set_* function in this file calls them - confirm that not chaining is intended. */ +static set_lwlsn_block_range_hook_type prev_set_lwlsn_block_range_hook = NULL; +static set_lwlsn_block_v_hook_type prev_set_lwlsn_block_v_hook = NULL; +static set_lwlsn_block_hook_type prev_set_lwlsn_block_hook = NULL; +static set_max_lwlsn_hook_type prev_set_max_lwlsn_hook = NULL; +static set_lwlsn_relation_hook_type prev_set_lwlsn_relation_hook = NULL; +static set_lwlsn_db_hook_type prev_set_lwlsn_db_hook = NULL; + +static shmem_startup_hook_type prev_shmem_startup_hook; + +#if PG_VERSION_NUM >= 150000 +static shmem_request_hook_type prev_shmem_request_hook; +#endif + +static void shmemrequest(void); +static void shmeminit(void); +static void neon_set_max_lwlsn(XLogRecPtr lsn); + +void +init_lwlsncache(void) +{ /* raises ERROR unless shared preload libraries are being loaded; then registers the GUC, installs the shmem hooks (on PG < 15 shared memory is requested immediately) and installs the set_lwlsn_* hooks */ + if (!process_shared_preload_libraries_in_progress) + ereport(ERROR, errcode(ERRCODE_INTERNAL_ERROR), errmsg("Loading of shared preload libraries is not in progress. 
Exiting")); + + lwlc_register_gucs(); + + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = shmeminit; + + #if PG_VERSION_NUM >= 150000 + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = shmemrequest; + #else + shmemrequest(); + #endif + + prev_set_lwlsn_block_range_hook = set_lwlsn_block_range_hook; + set_lwlsn_block_range_hook = neon_set_lwlsn_block_range; + prev_set_lwlsn_block_v_hook = set_lwlsn_block_v_hook; + set_lwlsn_block_v_hook = neon_set_lwlsn_block_v; + prev_set_lwlsn_block_hook = set_lwlsn_block_hook; + set_lwlsn_block_hook = neon_set_lwlsn_block; + prev_set_max_lwlsn_hook = set_max_lwlsn_hook; + set_max_lwlsn_hook = neon_set_max_lwlsn; + prev_set_lwlsn_relation_hook = set_lwlsn_relation_hook; + set_lwlsn_relation_hook = neon_set_lwlsn_relation; + prev_set_lwlsn_db_hook = set_lwlsn_db_hook; + set_lwlsn_db_hook = neon_set_lwlsn_db; +} + + +static void shmemrequest(void) { /* reserve shared memory for LwLsnCacheCtl plus the hash table, then (PG15+) chain to the previous request hook */ + Size requested_size = sizeof(LwLsnCacheCtl); + + requested_size += hash_estimate_size(lwlsn_cache_size, sizeof(LastWrittenLsnCacheEntry)); + + RequestAddinShmemSpace(requested_size); + + #if PG_VERSION_NUM >= 150000 + if (prev_shmem_request_hook) + prev_shmem_request_hook(); + #endif +} + +static void shmeminit(void) { /* shmem startup hook: create or attach the shared hash table and the control struct */ + static HASHCTL info; + bool found; + if (lwlsn_cache_size > 0) /* NOTE(review): always true given the GUC minimum of 1024; if it were false, LwLsnCache would be dereferenced below without being set here */ + { + info.keysize = sizeof(BufferTag); + info.entrysize = sizeof(LastWrittenLsnCacheEntry); + lastWrittenLsnCache = ShmemInitHash("last_written_lsn_cache", + lwlsn_cache_size, lwlsn_cache_size, + &info, + HASH_ELEM | HASH_BLOBS); + LwLsnCache = ShmemInitStruct("neon/LwLsnCacheCtl", sizeof(LwLsnCacheCtl), &found); + // Now set the size in the struct + LwLsnCache->lastWrittenLsnCacheSize = lwlsn_cache_size; + if (found) { /* NOTE(review): this return skips the prev_shmem_startup_hook call below - confirm that is intended */ + return; + } + } + dlist_init(&LwLsnCache->lastWrittenLsnLRU); + LwLsnCache->maxLastWrittenLsn = GetRedoRecPtr(); /* initial fallback LSN for blocks that have no cache entry */ + if (prev_shmem_startup_hook) { + prev_shmem_startup_hook(); + } +} + +/* + * neon_get_lwlsn -- Returns maximal LSN of written page. 
+ * It returns an upper bound for the last written LSN of a given page, + * either from a cached last written LSN or a global maximum last written LSN. + * If rnode is InvalidOid then we calculate maximum among all cached LSN and maxLastWrittenLsn. + * If cache is large enough, iterating through all hash items may be rather expensive. + * But neon_get_lwlsn(InvalidOid) is used only by neon_dbsize which is not performance critical. + */ +XLogRecPtr +neon_get_lwlsn(NRelFileInfo rlocator, ForkNumber forknum, BlockNumber blkno) +{ + XLogRecPtr lsn; + LastWrittenLsnCacheEntry* entry; + + Assert(LwLsnCache->lastWrittenLsnCacheSize != 0); + + LWLockAcquire(LastWrittenLsnLock, LW_SHARED); + + /* Maximal last written LSN among all non-cached pages */ + lsn = LwLsnCache->maxLastWrittenLsn; + + if (NInfoGetRelNumber(rlocator) != InvalidOid) + { + BufferTag key; + Oid spcOid = NInfoGetSpcOid(rlocator); + Oid dbOid = NInfoGetDbOid(rlocator); + Oid relNumber = NInfoGetRelNumber(rlocator); + BufTagInit(key, relNumber, forknum, blkno, spcOid, dbOid); + + entry = hash_search(lastWrittenLsnCache, &key, HASH_FIND, NULL); + if (entry != NULL) + lsn = entry->lsn; + else + { /* cache miss: the shared lock is swapped for an exclusive one so the block can be inserted */ + LWLockRelease(LastWrittenLsnLock); + LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); + /* + * In case of statements CREATE TABLE AS SELECT... or INSERT FROM SELECT... we are fetching data from the source table + * and storing it in the destination table. This causes problems with prefetch when the last-written-lsn is not known for the pages of + * the source table (which for example happens after compute restart). In this case we get the global value of + * last-written-lsn, which changes frequently as long as we are writing pages of the destination table. + * As a result the request-lsn for the prefetch and the request-lsn when this page is actually needed are different + * and the prefetched result cannot be reused. So it effectively disarms prefetch. 
+ * To prevent that, we re-insert the page with the latest LSN, so that it's + * less likely the LSN for this page will get evicted from the LwLsnCache + * before the page is read. NOTE(review): lsn still holds the value read under the shared lock, whereas neon_get_lwlsn_v re-reads maxLastWrittenLsn after taking the exclusive lock - confirm the difference is harmless. + */ + lsn = SetLastWrittenLSNForBlockRangeInternal(lsn, rlocator, forknum, blkno, 1); + } + } + else + { + HASH_SEQ_STATUS seq; + /* Find maximum of all cached LSNs */ + hash_seq_init(&seq, lastWrittenLsnCache); + while ((entry = (LastWrittenLsnCacheEntry *) hash_seq_search(&seq)) != NULL) + { + if (entry->lsn > lsn) + lsn = entry->lsn; + } + } + LWLockRelease(LastWrittenLsnLock); + + return lsn; +} + +static void neon_set_max_lwlsn(XLogRecPtr lsn) { /* unconditionally overwrites the global fallback LSN under the exclusive lock */ + LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); + LwLsnCache->maxLastWrittenLsn = lsn; + LWLockRelease(LastWrittenLsnLock); +} + +/* + * GetLastWrittenLSN -- Returns maximal LSN of written page. + * It returns an upper bound for the last written LSN of a given page, + * either from a cached last written LSN or a global maximum last written LSN. + * If rnode is InvalidOid then we calculate maximum among all cached LSN and maxLastWrittenLsn. + * If cache is large enough, iterating through all hash items may be rather expensive. + * But GetLastWrittenLSN(InvalidOid) is used only by neon_dbsize which is not performance critical. 
+ */ +void +neon_get_lwlsn_v(NRelFileInfo relfilenode, ForkNumber forknum, + BlockNumber blkno, int nblocks, XLogRecPtr *lsns) +{ /* fills lsns[0..nblocks-1]; blocks without a cache entry get maxLastWrittenLsn and are inserted into the cache */ + LastWrittenLsnCacheEntry* entry; + XLogRecPtr lsn; + + Assert(LwLsnCache->lastWrittenLsnCacheSize != 0); + Assert(nblocks > 0); + Assert(PointerIsValid(lsns)); + + LWLockAcquire(LastWrittenLsnLock, LW_SHARED); + + if (NInfoGetRelNumber(relfilenode) != InvalidOid) + { + BufferTag key; + bool missed_keys = false; + Oid spcOid = NInfoGetSpcOid(relfilenode); + Oid dbOid = NInfoGetDbOid(relfilenode); + Oid relNumber = NInfoGetRelNumber(relfilenode); + BufTagInit(key, relNumber, forknum, blkno, spcOid, dbOid); + + for (int i = 0; i < nblocks; i++) + { + /* Look up the cached last written LSN of block blkno + i */ + key.blockNum = blkno + i; + + entry = hash_search(lastWrittenLsnCache, &key, HASH_FIND, NULL); + if (entry != NULL) + { + lsns[i] = entry->lsn; + } + else + { + /* Mark this block's LSN as missing - we'll update the LwLSN for missing blocks in bulk later */ + lsns[i] = InvalidXLogRecPtr; + missed_keys = true; + } + } + + /* + * If we had any missing LwLSN entries, we add the missing ones now. + * By doing the insertions in one batch, we decrease lock contention. 
+ */ + if (missed_keys) + { + LWLockRelease(LastWrittenLsnLock); + LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); + + lsn = LwLsnCache->maxLastWrittenLsn; + + for (int i = 0; i < nblocks; i++) + { + if (lsns[i] == InvalidXLogRecPtr) + { + lsns[i] = lsn; + SetLastWrittenLSNForBlockRangeInternal(lsn, relfilenode, forknum, blkno + i, 1); /* NOTE(review): the return value is discarded, so an entry inserted with a larger LSN while the lock was released is not reflected in lsns[i]; neon_get_lwlsn uses the returned value - confirm */ + } + } + } + } + else + { + HASH_SEQ_STATUS seq; + lsn = LwLsnCache->maxLastWrittenLsn; + /* Find maximum of all cached LSNs */ + hash_seq_init(&seq, lastWrittenLsnCache); + while ((entry = (LastWrittenLsnCacheEntry *) hash_seq_search(&seq)) != NULL) + { + if (entry->lsn > lsn) + lsn = entry->lsn; + } + + for (int i = 0; i < nblocks; i++) + lsns[i] = lsn; + } + LWLockRelease(LastWrittenLsnLock); +} + +/* + * Guts for neon_set_lwlsn_block_range (SetLastWrittenLSNForBlockRange). Returns the resulting LSN: the larger of the given lsn and any LSN already recorded (or maxLastWrittenLsn when the relation number is InvalidOid). + * Caller must ensure LastWrittenLsnLock is held in exclusive mode. + */ +static XLogRecPtr +SetLastWrittenLSNForBlockRangeInternal(XLogRecPtr lsn, + NRelFileInfo rlocator, + ForkNumber forknum, + BlockNumber from, + BlockNumber n_blocks) +{ + if (NInfoGetRelNumber(rlocator) == InvalidOid) + { + if (lsn > LwLsnCache->maxLastWrittenLsn) + LwLsnCache->maxLastWrittenLsn = lsn; + else + lsn = LwLsnCache->maxLastWrittenLsn; + } + else + { + LastWrittenLsnCacheEntry* entry; + BufferTag key; + bool found; + BlockNumber i; + + Oid spcOid = NInfoGetSpcOid(rlocator); + Oid dbOid = NInfoGetDbOid(rlocator); + Oid relNumber = NInfoGetRelNumber(rlocator); + BufTagInit(key, relNumber, forknum, from, spcOid, dbOid); + for (i = 0; i < n_blocks; i++) + { + key.blockNum = from + i; + entry = hash_search(lastWrittenLsnCache, &key, HASH_ENTER, &found); + if (found) + { + if (lsn > entry->lsn) + entry->lsn = lsn; + else + lsn = entry->lsn; /* an existing newer LSN wins and is carried over to the following blocks of the range */ + /* Unlink from LRU list */ + dlist_delete(&entry->lru_node); + } + else + { + entry->lsn = lsn; + if (hash_get_num_entries(lastWrittenLsnCache) > LwLsnCache->lastWrittenLsnCacheSize) + { + /* Replace least recently used entry */ + LastWrittenLsnCacheEntry* victim = 
dlist_container(LastWrittenLsnCacheEntry, lru_node, dlist_pop_head_node(&LwLsnCache->lastWrittenLsnLRU)); + /* Adjust max LSN for not cached relations/chunks if needed */ + if (victim->lsn > LwLsnCache->maxLastWrittenLsn) + LwLsnCache->maxLastWrittenLsn = victim->lsn; + + hash_search(lastWrittenLsnCache, victim, HASH_REMOVE, NULL); /* the entry pointer doubles as the key pointer because key is the first member */ + } + } + /* Link to the end of LRU list */ + dlist_push_tail(&LwLsnCache->lastWrittenLsnLRU, &entry->lru_node); + } + } + return lsn; +} + +/* + * neon_set_lwlsn_block_range -- Set maximal LSN of written page range. + * We maintain a cache of last written LSNs with limited size and LRU replacement + * policy. Keeping the last written LSN for each page allows using an old LSN when + * requesting pages of unchanged or appended relations. Also it is critical for + * efficient work of prefetch in case of massive update operations (like vacuum or remove). + * Returns lsn unchanged, without taking the lock, when lsn is invalid, n_blocks is 0 or the cache size is 0. + * rlocator.relNumber can be InvalidOid, in this case maxLastWrittenLsn is updated. + * SetLastWrittenLsn with dummy rlocator is used by createdb and dbase_redo functions. + */ +XLogRecPtr +neon_set_lwlsn_block_range(XLogRecPtr lsn, NRelFileInfo rlocator, ForkNumber forknum, BlockNumber from, BlockNumber n_blocks) +{ + if (lsn == InvalidXLogRecPtr || n_blocks == 0 || LwLsnCache->lastWrittenLsnCacheSize == 0) + return lsn; + + LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); + lsn = SetLastWrittenLSNForBlockRangeInternal(lsn, rlocator, forknum, from, n_blocks); + LWLockRelease(LastWrittenLsnLock); + + return lsn; +} + +/* + * neon_set_lwlsn_block_v -- Set maximal LSN of pages to their respective + * LSNs. + * + * We maintain cache of last written LSNs with limited size and LRU replacement + * policy. Keeping last written LSN for each page allows to use old LSN when + * requesting pages of unchanged or appended relations. Also it is critical for + * efficient work of prefetch in case massive update operations (like vacuum or remove). 
+ * + * Note: This is different from SetLastWrittenLSNForBlockRange[Internal], in that this + * specifies per-block LSNs, rather than only a single LSN. + */ +XLogRecPtr +neon_set_lwlsn_block_v(const XLogRecPtr *lsns, NRelFileInfo relfilenode, + ForkNumber forknum, BlockNumber blockno, + int nblocks) +{ /* returns InvalidXLogRecPtr without touching the cache when lsns is NULL, nblocks is 0, the cache size is 0 or the relation number is InvalidOid; otherwise the largest resulting per-block LSN */ + LastWrittenLsnCacheEntry* entry; + BufferTag key; + bool found; + XLogRecPtr max = InvalidXLogRecPtr; + Oid spcOid = NInfoGetSpcOid(relfilenode); + Oid dbOid = NInfoGetDbOid(relfilenode); + Oid relNumber = NInfoGetRelNumber(relfilenode); + + if (lsns == NULL || nblocks == 0 || LwLsnCache->lastWrittenLsnCacheSize == 0 || + NInfoGetRelNumber(relfilenode) == InvalidOid) + return InvalidXLogRecPtr; + + + BufTagInit(key, relNumber, forknum, blockno, spcOid, dbOid); + + LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); + + for (int i = 0; i < nblocks; i++) + { + XLogRecPtr lsn = lsns[i]; + + key.blockNum = blockno + i; + entry = hash_search(lastWrittenLsnCache, &key, HASH_ENTER, &found); + if (found) + { + if (lsn > entry->lsn) + entry->lsn = lsn; + else + lsn = entry->lsn; + /* Unlink from LRU list */ + dlist_delete(&entry->lru_node); + } + else + { + entry->lsn = lsn; + if (hash_get_num_entries(lastWrittenLsnCache) > LwLsnCache->lastWrittenLsnCacheSize) + { + /* Replace least recently used entry (same eviction steps as in SetLastWrittenLSNForBlockRangeInternal) */ + LastWrittenLsnCacheEntry* victim = dlist_container(LastWrittenLsnCacheEntry, lru_node, dlist_pop_head_node(&LwLsnCache->lastWrittenLsnLRU)); + /* Adjust max LSN for not cached relations/chunks if needed */ + if (victim->lsn > LwLsnCache->maxLastWrittenLsn) + LwLsnCache->maxLastWrittenLsn = victim->lsn; + + hash_search(lastWrittenLsnCache, victim, HASH_REMOVE, NULL); + } + } + /* Link to the end of LRU list */ + dlist_push_tail(&LwLsnCache->lastWrittenLsnLRU, &entry->lru_node); + max = Max(max, lsn); + } + + LWLockRelease(LastWrittenLsnLock); + + return max; +} + +/* + * neon_set_lwlsn_block -- Set maximal LSN for a single block (a block range of length 1) + */ +XLogRecPtr +neon_set_lwlsn_block(XLogRecPtr 
lsn, NRelFileInfo rlocator, ForkNumber forknum, BlockNumber blkno) +{ + return neon_set_lwlsn_block_range(lsn, rlocator, forknum, blkno, 1); +} + +/* + * neon_set_lwlsn_relation -- Set maximal LSN for relation metadata (stored under the pseudo block number REL_METADATA_PSEUDO_BLOCKNO) + */ +XLogRecPtr +neon_set_lwlsn_relation(XLogRecPtr lsn, NRelFileInfo rlocator, ForkNumber forknum) +{ + return neon_set_lwlsn_block(lsn, rlocator, forknum, REL_METADATA_PSEUDO_BLOCKNO); +} + +/* + * neon_set_lwlsn_db -- Set maximal LSN for the whole database (the all-InvalidOid rlocator makes the internal setter update maxLastWrittenLsn instead of a cache entry) + */ +XLogRecPtr +neon_set_lwlsn_db(XLogRecPtr lsn) +{ + NRelFileInfo dummyNode = {InvalidOid, InvalidOid, InvalidOid}; + return neon_set_lwlsn_block(lsn, dummyNode, MAIN_FORKNUM, 0); +} \ No newline at end of file diff --git a/pgxn/neon/neon_lwlsncache.h b/pgxn/neon/neon_lwlsncache.h new file mode 100644 index 0000000000..acb5561c0c --- /dev/null +++ b/pgxn/neon/neon_lwlsncache.h @@ -0,0 +1,17 @@ +#ifndef NEON_LWLSNCACHE_H +#define NEON_LWLSNCACHE_H + +#include "neon_pgversioncompat.h" + +void init_lwlsncache(void); + +/* Last-written-LSN cache API; init_lwlsncache() installs the neon_set_lwlsn_* functions below as hooks */ +XLogRecPtr neon_get_lwlsn(NRelFileInfo rlocator, ForkNumber forknum, BlockNumber blkno); +void neon_get_lwlsn_v(NRelFileInfo relfilenode, ForkNumber forknum, BlockNumber blkno, int nblocks, XLogRecPtr *lsns); +XLogRecPtr neon_set_lwlsn_block_range(XLogRecPtr lsn, NRelFileInfo rlocator, ForkNumber forknum, BlockNumber from, BlockNumber n_blocks); +XLogRecPtr neon_set_lwlsn_block_v(const XLogRecPtr *lsns, NRelFileInfo relfilenode, ForkNumber forknum, BlockNumber blockno, int nblocks); +XLogRecPtr neon_set_lwlsn_block(XLogRecPtr lsn, NRelFileInfo rlocator, ForkNumber forknum, BlockNumber blkno); +XLogRecPtr neon_set_lwlsn_relation(XLogRecPtr lsn, NRelFileInfo rlocator, ForkNumber forknum); +XLogRecPtr neon_set_lwlsn_db(XLogRecPtr lsn); + +#endif /* NEON_LWLSNCACHE_H */ \ No newline at end of file diff --git a/pgxn/neon/neon_perf_counters.h b/pgxn/neon/neon_perf_counters.h index 8edc658a30..5f5330bb69 100644 --- a/pgxn/neon/neon_perf_counters.h +++ 
b/pgxn/neon/neon_perf_counters.h @@ -12,8 +12,8 @@ #include "storage/procnumber.h" #else #include "storage/backendid.h" -#include "storage/proc.h" #endif +#include "storage/proc.h" static const uint64 io_wait_bucket_thresholds[] = { 2, 3, 6, 10, /* 0 us - 10 us */ diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h index 6b4b355672..b3ed0c04e8 100644 --- a/pgxn/neon/neon_pgversioncompat.h +++ b/pgxn/neon/neon_pgversioncompat.h @@ -76,6 +76,14 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode, #define BufTagGetRelNumber(tagp) ((tagp)->rnode.relNode) +#define BufTagInit(tag, relNumber, forknum, blkno, spcOid, dbOid) \ + do { \ + RelFileNode rnode = { .spcNode = spcOid, .dbNode = dbOid, .relNode = relNumber}; \ + (tag).forkNum = forknum; \ + (tag).blockNum = blkno; \ + (tag).rnode = rnode; \ + } while (false) + #define InvalidRelFileNumber InvalidOid #define SMgrRelGetRelInfo(reln) \ @@ -125,6 +133,15 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode, .relNumber = (tag).relNumber, \ }) +#define BufTagInit(tag, relNumber, forknum, blkno, spcOid, dbOid) \ + do { \ + (tag).forkNum = forknum; \ + (tag).blockNum = blkno; \ + (tag).spcOid = spcOid; \ + (tag).dbOid = dbOid; \ + (tag).relNumber = relNumber; \ + } while (false) + #define SMgrRelGetRelInfo(reln) \ ((reln)->smgr_rlocator) diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c index 5854a7ef0f..be2c4ddf79 100644 --- a/pgxn/neon/neon_walreader.c +++ b/pgxn/neon/neon_walreader.c @@ -20,6 +20,7 @@ #include "access/xlogreader.h" #include "libpq/pqformat.h" #include "storage/fd.h" +#include "utils/memutils.h" #include "utils/wait_event.h" #include "libpq-fe.h" diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 475697f9c0..a2e3d57e47 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -8,8 +8,8 @@ * *------------------------------------------------------------------------- */ -#ifndef pageserver_h -#define 
pageserver_h +#ifndef PAGESTORE_CLIENT_h +#define PAGESTORE_CLIENT_h #include "neon_pgversioncompat.h" @@ -17,11 +17,8 @@ #include "access/xlogdefs.h" #include RELFILEINFO_HDR #include "lib/stringinfo.h" -#include "libpq/pqformat.h" #include "storage/block.h" #include "storage/buf_internals.h" -#include "storage/smgr.h" -#include "utils/memutils.h" #define MAX_SHARDS 128 #define MAX_PAGESERVER_CONNSTRING_SIZE 256 @@ -277,13 +274,8 @@ typedef struct XLogRecPtr effective_request_lsn; } neon_request_lsns; -#if PG_MAJORVERSION_NUM < 16 -extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, - neon_request_lsns request_lsns, char *buffer); -#else extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, neon_request_lsns request_lsns, void *buffer); -#endif extern int64 neon_dbsize(Oid dbNode); /* utils for neon relsize cache */ @@ -315,7 +307,7 @@ static inline bool lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, void *buffer) { - bits8 rv = 1; + bits8 rv = 0; return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1; } @@ -326,4 +318,4 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, return lfc_writev(rinfo, forkNum, blkno, &buffer, 1); } -#endif +#endif /* PAGESTORE_CLIENT_H */ diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 78e42191a4..eb8df11923 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -37,10 +37,6 @@ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * - * IDENTIFICATION - * contrib/neon/pagestore_smgr.c - * *------------------------------------------------------------------------- */ #include "postgres.h" @@ -55,6 +51,7 @@ #include "catalog/pg_class.h" #include "common/hashfn.h" #include "executor/instrument.h" +#include "libpq/pqformat.h" #include "pgstat.h" #include 
"postmaster/autovacuum.h" #include "postmaster/interrupt.h" @@ -69,6 +66,7 @@ #include "bitmap.h" #include "neon.h" +#include "neon_lwlsncache.h" #include "neon_perf_counters.h" #include "pagestore_client.h" @@ -98,7 +96,7 @@ static char *hexdump_page(char *page); #define IS_LOCAL_REL(reln) (\ NInfoGetDbOid(InfoFromSMgrRel(reln)) != 0 && \ - NInfoGetRelNumber(InfoFromSMgrRel(reln)) > FirstNormalObjectId \ + NInfoGetRelNumber(InfoFromSMgrRel(reln)) >= FirstNormalObjectId \ ) const int SmgrTrace = DEBUG5; @@ -340,11 +338,6 @@ static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_ static bool prefetch_wait_for(uint64 ring_index); static void prefetch_cleanup_trailing_unused(void); static inline void prefetch_set_unused(uint64 ring_index); -#if PG_MAJORVERSION_NUM < 17 -static void -GetLastWrittenLSNv(NRelFileInfo relfilenode, ForkNumber forknum, - BlockNumber blkno, int nblocks, XLogRecPtr *lsns); -#endif static void neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, @@ -864,7 +857,7 @@ prefetch_on_ps_disconnect(void) /* * We can have gone into retry due to network error, so update stats with - * the latest available + * the latest available */ MyNeonCounters->pageserver_open_requests = MyPState->n_requests_inflight; @@ -1044,6 +1037,16 @@ prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, n continue; } memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ); + + + /* + * With lfc_store_prefetch_result=true prefetch result is stored in LFC in prefetch_pump_state when response is received + * from page server. But if lfc_store_prefetch_result=false then it is not yet stored in LFC and we have to do it here + * under buffer lock. 
+ */ + if (!lfc_store_prefetch_result) + lfc_write(rinfo, forknum, blocknum + i, buffers[i]); + prefetch_set_unused(ring_index); BITMAP_SET(mask, i); @@ -1075,6 +1078,9 @@ prefetch_lookup(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkn, neon_r * pageserver. If NULL, we utilize the lastWrittenLsn -infrastructure * to calculate the LSNs to send. * + * Bits set in *mask (if present) indicate pages already read; i.e. pages we + * can skip in this process. + * * When performing a prefetch rather than a synchronous request, * is_prefetch==true. Currently, it only affects how the request is accounted * in the perf counters. @@ -1105,7 +1111,7 @@ prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, Retry: /* * We can have gone into retry due to network error, so update stats with - * the latest available + * the latest available */ MyNeonCounters->pageserver_open_requests = MyPState->ring_unused - MyPState->ring_receive; @@ -1120,7 +1126,7 @@ Retry: uint64 ring_index; neon_request_lsns *lsns; - if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) + if (PointerIsValid(mask) && BITMAP_ISSET(mask, i)) continue; if (frlsns) @@ -1894,7 +1900,6 @@ neon_wallog_pagev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, log_pages = true; } else if (XLogInsertAllowed() && - !ShutdownRequestPending && (forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM)) { log_pages = true; @@ -1997,7 +2002,7 @@ neon_wallog_pagev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, if (batch_size >= BLOCK_BATCH_SIZE) { - SetLastWrittenLSNForBlockv(lsns, InfoFromSMgrRel(reln), forknum, + neon_set_lwlsn_block_v(lsns, InfoFromSMgrRel(reln), forknum, batch_blockno, batch_size); batch_blockno += batch_size; @@ -2007,7 +2012,7 @@ neon_wallog_pagev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, if (batch_size != 0) { - SetLastWrittenLSNForBlockv(lsns, InfoFromSMgrRel(reln), forknum, + neon_set_lwlsn_block_v(lsns, InfoFromSMgrRel(reln), forknum, 
batch_blockno, batch_size); } @@ -2134,7 +2139,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co * Remember the LSN on this page. When we read the page again, we must * read the same or newer version of it. */ - SetLastWrittenLSNForBlock(lsn, InfoFromSMgrRel(reln), forknum, blocknum); + neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forknum, blocknum); } /* @@ -2217,19 +2222,6 @@ nm_adjust_lsn(XLogRecPtr lsn) } -/* - * Since PG17 we use vetorized version, - * so add compatibility function for older versions - */ -#if PG_MAJORVERSION_NUM < 17 -static void -GetLastWrittenLSNv(NRelFileInfo relfilenode, ForkNumber forknum, - BlockNumber blkno, int nblocks, XLogRecPtr *lsns) -{ - lsns[0] = GetLastWrittenLSN(relfilenode, forknum, blkno); -} -#endif - /* * Return LSN for requesting pages and number of blocks from page server */ @@ -2241,7 +2233,7 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, Assert(nblocks <= PG_IOV_MAX); - GetLastWrittenLSNv(rinfo, forknum, blkno, (int) nblocks, last_written_lsns); + neon_get_lwlsn_v(rinfo, forknum, blkno, (int) nblocks, last_written_lsns); for (int i = 0; i < nblocks; i++) { @@ -2388,7 +2380,6 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, LSN_FORMAT_ARGS(last_written_lsn), LSN_FORMAT_ARGS(flushlsn)); XLogFlush(last_written_lsn); - flushlsn = last_written_lsn; } /* @@ -2404,18 +2395,35 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, * requesting the latest page, by setting request LSN to * UINT64_MAX. * - * Remember the current LSN, however, so that we can later - * correctly determine if the response to the request is still - * valid. The most up-to-date LSN we could use for that purpose - * would be the current insert LSN, but to avoid the overhead of - * looking it up, use 'flushlsn' instead. 
This relies on the - * assumption that if the page was modified since the last WAL - * flush, it should still be in the buffer cache, and we - * wouldn't be requesting it. + * effective_request_lsn is used to check that received response is still valid. + * In case of primary node it is last written LSN. Originally we used flush_lsn here, + * but it is not correct. Consider the following scenario: + * 1. Backend A wants to prefetch block X + * 2. Backend A checks that block X is not present in the shared buffer cache + * 3. Backend A calls prefetch_do_request, which calls neon_get_request_lsns + * 4. neon_get_request_lsns obtains LwLSN=11 for the block + * 5. Backend B downloads block X, updates and wallogs it with LSN=13 + * 6. Block X is once again evicted from shared buffers, its LwLSN is set to LSN=13 + * 7. Backend A is still executing in neon_get_request_lsns(). It calls 'flushlsn = GetFlushRecPtr();'. + * Let's say that it is LSN=14 + * 8. Backend A uses LSN=14 as effective_lsn in the prefetch slot. The request stored in the slot is + * [not_modified_since=11, effective_request_lsn=14] + * 9. Backend A sends the prefetch request, pageserver processes it, and sends response. + * The last LSN that the pageserver had processed was LSN=12, so the page image in the response is valid at LSN=12. + * 10. Backend A calls smgrread() for page X with LwLSN=13 + * 11. Backend A finds in prefetch ring the response for the prefetch request with [not_modified_since=11, effective_lsn=Lsn14], + * so it satisfies neon_prefetch_response_usable condition. + * + * Things go wrong in step 7-8, when [not_modified_since=11, effective_request_lsn=14] is determined for the request. + * That is incorrect, because the page has in fact been modified at LSN=13. The invariant is that for any request, + * there should not be any modifications to a page between its not_modified_since and (effective_)request_lsn values. 
+ * + * The problem can be fixed by callingGetFlushRecPtr() before checking if the page is in the buffer cache. + * But you can't do that within smgrprefetch(), would need to modify the caller. */ result->request_lsn = UINT64_MAX; result->not_modified_since = last_written_lsn; - result->effective_request_lsn = flushlsn; + result->effective_request_lsn = last_written_lsn; } } } @@ -2474,11 +2482,8 @@ neon_prefetch_response_usable(neon_request_lsns *request_lsns, * `not_modified_since` and `request_lsn` are sent to the pageserver, but * in the primary node, we always use UINT64_MAX as the `request_lsn`, so * we remember `effective_request_lsn` separately. In a primary, - * `effective_request_lsn` is the last flush WAL position when the request - * was sent to the pageserver. That's logically the LSN that we are - * requesting the page at, but we send UINT64_MAX to the pageserver so - * that if the GC horizon advances past that position, we still get a - * valid response instead of an error. + * `effective_request_lsn` is the same as `not_modified_since`. + * See comments in neon_get_request_lsns why we can not use last flush WAL position here. 
* * To determine whether a response to a GetPage request issued earlier is * still valid to satisfy a new page read, we look at the @@ -2844,9 +2849,9 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, if (lsn == InvalidXLogRecPtr) { lsn = GetXLogInsertRecPtr(); - SetLastWrittenLSNForBlock(lsn, InfoFromSMgrRel(reln), forkNum, blkno); + neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum, blkno); } - SetLastWrittenLSNForRelation(lsn, InfoFromSMgrRel(reln), forkNum); + neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum); } #if PG_MAJORVERSION_NUM >= 16 @@ -2941,7 +2946,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, for (int i = 0; i < count; i++) { lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data); - SetLastWrittenLSNForBlock(lsn, InfoFromSMgrRel(reln), forkNum, + neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum, blocknum + i); } @@ -2951,7 +2956,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, Assert(lsn != 0); - SetLastWrittenLSNForRelation(lsn, InfoFromSMgrRel(reln), forkNum); + neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum); set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum); } #endif @@ -3033,9 +3038,6 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, tag.blockNum = blocknum; - for (int i = 0; i < PG_IOV_MAX / 8; i++) - lfc_present[i] = ~(lfc_present[i]); - ring_index = prefetch_register_bufferv(tag, NULL, iterblocks, lfc_present, true); @@ -3141,14 +3143,18 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, #endif } +/* + * Read N pages at a specific LSN. + * + * *mask is set for pages read at a previous point in time, and which we + * should not touch, nor overwrite. + * New bits should be set in *mask for the pages we'successfully read. + * + * The offsets in request_lsns, buffers, and mask are linked. 
+ */ static void -#if PG_MAJORVERSION_NUM < 16 -neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, neon_request_lsns *request_lsns, - char **buffers, BlockNumber nblocks, const bits8 *mask) -#else neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, neon_request_lsns *request_lsns, void **buffers, BlockNumber nblocks, const bits8 *mask) -#endif { NeonResponse *resp; uint64 ring_index; @@ -3193,7 +3199,7 @@ neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_block neon_request_lsns *reqlsns = &request_lsns[i]; TimestampTz start_ts, end_ts; - if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) + if (PointerIsValid(mask) && BITMAP_ISSET(mask, i)) continue; start_ts = GetCurrentTimestamp(); @@ -3294,6 +3300,12 @@ Retry: } } memcpy(buffer, getpage_resp->page, BLCKSZ); + + /* + * With lfc_store_prefetch_result=true prefetch result is stored in LFC in prefetch_pump_state when response is received + * from page server. But if lfc_store_prefetch_result=false then it is not yet stored in LFC and we have to do it here + * under buffer lock. + */ if (!lfc_store_prefetch_result) lfc_write(rinfo, forkNum, blockno, buffer); break; @@ -3338,13 +3350,8 @@ Retry: * To avoid breaking tests in the runtime please keep function signature in sync. 
*/ void -#if PG_MAJORVERSION_NUM < 16 -neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - neon_request_lsns request_lsns, char *buffer) -#else neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, neon_request_lsns request_lsns, void *buffer) -#endif { neon_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL); } @@ -3486,9 +3493,7 @@ static void neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks) { - bits8 prefetch_hits[PG_IOV_MAX / 8] = {0}; - bits8 lfc_hits[PG_IOV_MAX / 8]; - bits8 read[PG_IOV_MAX / 8]; + bits8 read_pages[PG_IOV_MAX / 8]; neon_request_lsns request_lsns[PG_IOV_MAX]; int lfc_result; int prefetch_result; @@ -3520,19 +3525,18 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, nblocks); + memset(read_pages, 0, sizeof(read_pages)); - prefetch_result = prefetch_lookupv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, nblocks, buffers, prefetch_hits); + prefetch_result = prefetch_lookupv(InfoFromSMgrRel(reln), forknum, + blocknum, request_lsns, nblocks, + buffers, read_pages); if (prefetch_result == nblocks) return; - /* invert the result: exclude prefetched blocks */ - for (int i = 0; i < PG_IOV_MAX / 8; i++) - lfc_hits[i] = ~prefetch_hits[i]; - /* Try to read from local file cache */ lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers, - nblocks, lfc_hits); + nblocks, read_pages); if (lfc_result > 0) MyNeonCounters->file_cache_hits_total += lfc_result; @@ -3541,21 +3545,8 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, if (prefetch_result + lfc_result == nblocks) return; - if (lfc_result <= 0) - { - /* can't use the LFC result, so read all blocks from PS */ - for (int i = 0; i < PG_IOV_MAX / 8; i++) - read[i] = ~prefetch_hits[i]; - } - else - { - /* invert the result: 
exclude blocks read from lfc */ - for (int i = 0; i < PG_IOV_MAX / 8; i++) - read[i] = ~(prefetch_hits[i] | lfc_hits[i]); - } - neon_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, - buffers, nblocks, read); + buffers, nblocks, read_pages); /* * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. @@ -4052,7 +4043,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, Blo * for the extended pages, so there's no harm in leaving behind obsolete * entries for the truncated chunks. */ - SetLastWrittenLSNForRelation(lsn, InfoFromSMgrRel(reln), forknum); + neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forknum); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -4510,7 +4501,7 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, if (relsize < blkno + 1) { update_cached_relsize(rinfo, forknum, blkno + 1); - SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum); + neon_set_lwlsn_relation(end_recptr, rinfo, forknum); } } else @@ -4543,7 +4534,7 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, relsize = Max(nbresponse->n_blocks, blkno + 1); set_cached_relsize(rinfo, forknum, relsize); - SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum); + neon_set_lwlsn_relation(end_recptr, rinfo, forknum); neon_log(SmgrTrace, "Set length to %d", relsize); } @@ -4674,7 +4665,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) */ if (no_redo_needed) { - SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno); + neon_set_lwlsn_block(end_recptr, rinfo, forknum, blkno); /* * Redo changes if page exists in LFC. 
* We should perform this check after assigning LwLSN to prevent diff --git a/pgxn/neon/relsize_cache.c b/pgxn/neon/relsize_cache.c index 2a4c2dc799..60ca1675d9 100644 --- a/pgxn/neon/relsize_cache.c +++ b/pgxn/neon/relsize_cache.c @@ -6,10 +6,6 @@ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * - * IDENTIFICATION - * contrib/neon/relsize_cache.c - * *------------------------------------------------------------------------- */ #include "postgres.h" diff --git a/pgxn/neon/walproposer_compat.c b/pgxn/neon/walproposer_compat.c index a986160224..b9460feb21 100644 --- a/pgxn/neon/walproposer_compat.c +++ b/pgxn/neon/walproposer_compat.c @@ -7,6 +7,7 @@ #include +#include "libpq/pqformat.h" #include "miscadmin.h" #include "utils/datetime.h" #include "walproposer.h" diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 0b5499ca53..d37412f674 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -50,13 +50,8 @@ PG_FUNCTION_INFO_V1(trigger_segfault); * Linkage to functions in neon module. 
* The signature here would need to be updated whenever function parameters change in pagestore_smgr.c */ -#if PG_MAJORVERSION_NUM < 16 -typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - neon_request_lsns request_lsns, char *buffer); -#else typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, neon_request_lsns request_lsns, void *buffer); -#endif static neon_read_at_lsn_type neon_read_at_lsn_ptr; diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c index 4673de778c..a1be498573 100644 --- a/pgxn/neon_walredo/walredoproc.c +++ b/pgxn/neon_walredo/walredoproc.c @@ -286,9 +286,6 @@ WalRedoMain(int argc, char *argv[]) max_wal_senders = 0; InitializeMaxBackends(); - /* Disable lastWrittenLsnCache */ - lastWrittenLsnCacheSize = 0; - #if PG_VERSION_NUM >= 150000 process_shmem_requests(); InitializeShmemGUCs(); diff --git a/poetry.lock b/poetry.lock index 7c84b2969b..08732fd641 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand. 
[[package]] name = "aiohappyeyeballs" @@ -1286,24 +1286,20 @@ files = [ [[package]] name = "h2" -version = "4.1.0" +version = "4.2.0" description = "Pure-Python HTTP/2 protocol implementation" optional = false python-versions = ">=3.9" groups = ["main"] -files = [] -develop = false +files = [ + {file = "h2-4.2.0-py3-none-any.whl", hash = "sha256:479a53ad425bb29af087f3458a61d30780bc818e4ebcf01f0b536ba916462ed0"}, + {file = "h2-4.2.0.tar.gz", hash = "sha256:c8a52129695e88b1a0578d8d2cc6842bbd79128ac685463b887ee278126ad01f"}, +] [package.dependencies] hpack = ">=4.1,<5" hyperframe = ">=6.1,<7" -[package.source] -type = "git" -url = "https://github.com/python-hyper/h2" -reference = "HEAD" -resolved_reference = "0b98b244b5fd1fe96100ac14905417a3b70a4286" - [[package]] name = "hpack" version = "4.1.0" @@ -3111,30 +3107,30 @@ six = "*" [[package]] name = "ruff" -version = "0.7.0" +version = "0.11.2" description = "An extremely fast Python linter and code formatter, written in Rust." optional = false python-versions = ">=3.7" groups = ["dev"] files = [ - {file = "ruff-0.7.0-py3-none-linux_armv6l.whl", hash = "sha256:0cdf20c2b6ff98e37df47b2b0bd3a34aaa155f59a11182c1303cce79be715628"}, - {file = "ruff-0.7.0-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:496494d350c7fdeb36ca4ef1c9f21d80d182423718782222c29b3e72b3512737"}, - {file = "ruff-0.7.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:214b88498684e20b6b2b8852c01d50f0651f3cc6118dfa113b4def9f14faaf06"}, - {file = "ruff-0.7.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:630fce3fefe9844e91ea5bbf7ceadab4f9981f42b704fae011bb8efcaf5d84be"}, - {file = "ruff-0.7.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:211d877674e9373d4bb0f1c80f97a0201c61bcd1e9d045b6e9726adc42c156aa"}, - {file = "ruff-0.7.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:194d6c46c98c73949a106425ed40a576f52291c12bc21399eb8f13a0f7073495"}, - {file = 
"ruff-0.7.0-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:82c2579b82b9973a110fab281860403b397c08c403de92de19568f32f7178598"}, - {file = "ruff-0.7.0-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9af971fe85dcd5eaed8f585ddbc6bdbe8c217fb8fcf510ea6bca5bdfff56040e"}, - {file = "ruff-0.7.0-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b641c7f16939b7d24b7bfc0be4102c56562a18281f84f635604e8a6989948914"}, - {file = "ruff-0.7.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d71672336e46b34e0c90a790afeac8a31954fd42872c1f6adaea1dff76fd44f9"}, - {file = "ruff-0.7.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:ab7d98c7eed355166f367597e513a6c82408df4181a937628dbec79abb2a1fe4"}, - {file = "ruff-0.7.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:1eb54986f770f49edb14f71d33312d79e00e629a57387382200b1ef12d6a4ef9"}, - {file = "ruff-0.7.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:dc452ba6f2bb9cf8726a84aa877061a2462afe9ae0ea1d411c53d226661c601d"}, - {file = "ruff-0.7.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:4b406c2dce5be9bad59f2de26139a86017a517e6bcd2688da515481c05a2cb11"}, - {file = "ruff-0.7.0-py3-none-win32.whl", hash = "sha256:f6c968509f767776f524a8430426539587d5ec5c662f6addb6aa25bc2e8195ec"}, - {file = "ruff-0.7.0-py3-none-win_amd64.whl", hash = "sha256:ff4aabfbaaba880e85d394603b9e75d32b0693152e16fa659a3064a85df7fce2"}, - {file = "ruff-0.7.0-py3-none-win_arm64.whl", hash = "sha256:10842f69c245e78d6adec7e1db0a7d9ddc2fff0621d730e61657b64fa36f207e"}, - {file = "ruff-0.7.0.tar.gz", hash = "sha256:47a86360cf62d9cd53ebfb0b5eb0e882193fc191c6d717e8bef4462bc3b9ea2b"}, + {file = "ruff-0.11.2-py3-none-linux_armv6l.whl", hash = "sha256:c69e20ea49e973f3afec2c06376eb56045709f0212615c1adb0eda35e8a4e477"}, + {file = "ruff-0.11.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:2c5424cc1c4eb1d8ecabe6d4f1b70470b4f24a0c0171356290b1953ad8f0e272"}, + {file = 
"ruff-0.11.2-py3-none-macosx_11_0_arm64.whl", hash = "sha256:ecf20854cc73f42171eedb66f006a43d0a21bfb98a2523a809931cda569552d9"}, + {file = "ruff-0.11.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0c543bf65d5d27240321604cee0633a70c6c25c9a2f2492efa9f6d4b8e4199bb"}, + {file = "ruff-0.11.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:20967168cc21195db5830b9224be0e964cc9c8ecf3b5a9e3ce19876e8d3a96e3"}, + {file = "ruff-0.11.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:955a9ce63483999d9f0b8f0b4a3ad669e53484232853054cc8b9d51ab4c5de74"}, + {file = "ruff-0.11.2-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:86b3a27c38b8fce73bcd262b0de32e9a6801b76d52cdb3ae4c914515f0cef608"}, + {file = "ruff-0.11.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a3b66a03b248c9fcd9d64d445bafdf1589326bee6fc5c8e92d7562e58883e30f"}, + {file = "ruff-0.11.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0397c2672db015be5aa3d4dac54c69aa012429097ff219392c018e21f5085147"}, + {file = "ruff-0.11.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:869bcf3f9abf6457fbe39b5a37333aa4eecc52a3b99c98827ccc371a8e5b6f1b"}, + {file = "ruff-0.11.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:2a2b50ca35457ba785cd8c93ebbe529467594087b527a08d487cf0ee7b3087e9"}, + {file = "ruff-0.11.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:7c69c74bf53ddcfbc22e6eb2f31211df7f65054bfc1f72288fc71e5f82db3eab"}, + {file = "ruff-0.11.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:6e8fb75e14560f7cf53b15bbc55baf5ecbe373dd5f3aab96ff7aa7777edd7630"}, + {file = "ruff-0.11.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:842a472d7b4d6f5924e9297aa38149e5dcb1e628773b70e6387ae2c97a63c58f"}, + {file = "ruff-0.11.2-py3-none-win32.whl", hash = "sha256:aca01ccd0eb5eb7156b324cfaa088586f06a86d9e5314b0eb330cb48415097cc"}, + {file = 
"ruff-0.11.2-py3-none-win_amd64.whl", hash = "sha256:3170150172a8f994136c0c66f494edf199a0bbea7a409f649e4bc8f4d7084080"}, + {file = "ruff-0.11.2-py3-none-win_arm64.whl", hash = "sha256:52933095158ff328f4c77af3d74f0379e34fd52f175144cefc1b192e7ccd32b4"}, + {file = "ruff-0.11.2.tar.gz", hash = "sha256:ec47591497d5a1050175bdf4e1a4e6272cddff7da88a2ad595e1e326041d8d94"}, ] [[package]] @@ -3844,4 +3840,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "715fc8c896dcfa1b15054deeddcdec557ef93af91b26e1c8e4688fe4dbef5296" +content-hash = "7ab1e7b975af34b3271b7c6018fa22a261d3f73c7c0a0403b6b2bb86b5fbd36e" diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index feca5ccf88..62fdc18207 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -314,9 +314,9 @@ pub async fn run() -> anyhow::Result<()> { None => { bail!("plain auth requires redis_notifications to be set"); } - Some(url) => Some( - ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string()), - ), + Some(url) => { + Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.clone())) + } }, ("irsa", _) => match (&args.redis_host, args.redis_port) { (Some(host), Some(port)) => Some( diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index ee722e839e..d3ab4abd0b 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -1,5 +1,6 @@ //! Mock console backend which relies on a user-provided postgres instance. 
+use std::io; use std::net::{IpAddr, Ipv4Addr}; use std::str::FromStr; use std::sync::Arc; @@ -22,7 +23,6 @@ use crate::control_plane::errors::{ }; use crate::control_plane::messages::MetricsAuxInfo; use crate::control_plane::{AccessBlockerFlags, AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo}; -use crate::error::io_error; use crate::intern::RoleNameInt; use crate::types::{BranchId, EndpointId, ProjectId, RoleName}; use crate::url::ApiUrl; @@ -36,13 +36,13 @@ enum MockApiError { impl From for ControlPlaneError { fn from(e: MockApiError) -> Self { - io_error(e).into() + io::Error::other(e).into() } } impl From for ControlPlaneError { fn from(e: tokio_postgres::Error) -> Self { - io_error(e).into() + io::Error::other(e).into() } } diff --git a/proxy/src/control_plane/errors.rs b/proxy/src/control_plane/errors.rs index bc30cffd27..337ed665cc 100644 --- a/proxy/src/control_plane/errors.rs +++ b/proxy/src/control_plane/errors.rs @@ -1,8 +1,10 @@ +use std::io; + use thiserror::Error; use crate::control_plane::client::ApiLockError; use crate::control_plane::messages::{self, ControlPlaneErrorMessage, Reason}; -use crate::error::{ErrorKind, ReportableError, UserFacingError, io_error}; +use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::proxy::retry::CouldRetry; /// A go-to error message which doesn't leak any detail. 
@@ -79,13 +81,13 @@ impl CouldRetry for ControlPlaneError { impl From for ControlPlaneError { fn from(e: reqwest::Error) -> Self { - io_error(e).into() + io::Error::other(e).into() } } impl From for ControlPlaneError { fn from(e: reqwest_middleware::Error) -> Self { - io_error(e).into() + io::Error::other(e).into() } } diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 6a379499dc..aa02b211d9 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -1,15 +1,9 @@ -use std::error::Error as StdError; -use std::{fmt, io}; +use std::fmt; use anyhow::Context; use measured::FixedCardinalityLabel; use tokio::task::JoinError; -/// Upcast (almost) any error into an opaque [`io::Error`]. -pub(crate) fn io_error(e: impl Into>) -> io::Error { - io::Error::new(io::ErrorKind::Other, e) -} - /// Marks errors that may be safely shown to a client. /// This trait can be seen as a specialized version of [`ToString`]. /// diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 41180fa6c1..b0603da379 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -163,8 +163,7 @@ fn process_proxy_payload( // other values are unassigned and must not be emitted by senders. Receivers // must drop connections presenting unexpected values here. #[rustfmt::skip] // https://github.com/rust-lang/rustfmt/issues/6384 - _ => return Err(io::Error::new( - io::ErrorKind::Other, + _ => return Err(io::Error::other( format!( "invalid proxy protocol command 0x{:02X}. 
expected local (0x20) or proxy (0x21)", header.version_and_command @@ -178,21 +177,20 @@ fn process_proxy_payload( TCP_OVER_IPV4 | UDP_OVER_IPV4 => { let addr = payload .try_get::() - .ok_or_else(|| io::Error::new(io::ErrorKind::Other, size_err))?; + .ok_or_else(|| io::Error::other(size_err))?; SocketAddr::from((addr.src_addr.get(), addr.src_port.get())) } TCP_OVER_IPV6 | UDP_OVER_IPV6 => { let addr = payload .try_get::() - .ok_or_else(|| io::Error::new(io::ErrorKind::Other, size_err))?; + .ok_or_else(|| io::Error::other(size_err))?; SocketAddr::from((addr.src_addr.get(), addr.src_port.get())) } // unspecified or unix stream. ignore the addresses _ => { - return Err(io::Error::new( - io::ErrorKind::Other, + return Err(io::Error::other( "invalid proxy protocol address family/transport protocol.", )); } diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index b5c3d13216..fe656557ac 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -143,6 +143,8 @@ impl ConnectionWithCredentialsProvider { db: 0, username: Some(username), password: Some(password.clone()), + // TODO: switch to RESP3 after testing new client version. 
+ protocol: redis::ProtocolVersion::RESP2, }, }) } diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index fbd12ad9cb..7235fb6079 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -19,7 +19,7 @@ fn json_value_to_pg_text(value: &Value) -> Option { v @ (Value::Bool(_) | Value::Number(_) | Value::Object(_)) => Some(v.to_string()), // avoid escaping here, as we pass this as a parameter - Value::String(s) => Some(s.to_string()), + Value::String(s) => Some(s.clone()), // special care for arrays Value::Array(_) => json_array_to_pg_array(value), diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index c958d077fc..3282c0ebde 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -41,7 +41,7 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; pub(crate) const EXT_NAME: &str = "pg_session_jwt"; -pub(crate) const EXT_VERSION: &str = "0.2.0"; +pub(crate) const EXT_VERSION: &str = "0.3.0"; pub(crate) const EXT_SCHEMA: &str = "auth"; #[derive(Clone)] diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 10e378a18d..972bf58d91 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -866,7 +866,7 @@ impl QueryData { let (inner, mut discard) = client.inner(); let cancel_token = inner.cancel_token(); - let res = match select( + match select( pin!(query_to_json( config, &mut *inner, @@ -889,7 +889,7 @@ impl QueryData { // The query failed with an error Either::Left((Err(e), __not_yet_cancelled)) => { discard.discard(); - return Err(e); + Err(e) } // The query was cancelled. 
Either::Right((_cancelled, query)) => { @@ -930,8 +930,7 @@ impl QueryData { } } } - }; - res + } } } diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index c4baeeb5cc..01d37d0eec 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -15,7 +15,7 @@ use tracing::warn; use crate::cancellation::CancellationHandler; use crate::config::ProxyConfig; use crate::context::RequestContext; -use crate::error::{ReportableError, io_error}; +use crate::error::ReportableError; use crate::metrics::Metrics; use crate::proxy::{ClientMode, ErrorSource, handle_client}; use crate::rate_limiter::EndpointRateLimiter; @@ -50,23 +50,23 @@ impl AsyncWrite for WebSocketRw { let this = self.project(); let mut stream = this.stream; - ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?; + ready!(stream.as_mut().poll_ready(cx).map_err(io::Error::other))?; this.send.put(buf); match stream.as_mut().start_send(Frame::binary(this.send.split())) { Ok(()) => Poll::Ready(Ok(buf.len())), - Err(e) => Poll::Ready(Err(io_error(e))), + Err(e) => Poll::Ready(Err(io::Error::other(e))), } } fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { let stream = self.project().stream; - stream.poll_flush(cx).map_err(io_error) + stream.poll_flush(cx).map_err(io::Error::other) } fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { let stream = self.project().stream; - stream.poll_close(cx).map_err(io_error) + stream.poll_close(cx).map_err(io::Error::other) } } @@ -97,7 +97,7 @@ impl AsyncBufRead for WebSocketRw { } let res = ready!(this.stream.as_mut().poll_next(cx)); - match res.transpose().map_err(io_error)? { + match res.transpose().map_err(io::Error::other)? { Some(message) => match message.opcode { OpCode::Ping => {} OpCode::Pong => {} @@ -105,7 +105,7 @@ impl AsyncBufRead for WebSocketRw { // We expect to see only binary messages. 
let error = "unexpected text message in the websocket"; warn!(length = message.payload.len(), error); - return Poll::Ready(Err(io_error(error))); + return Poll::Ready(Err(io::Error::other(error))); } OpCode::Binary | OpCode::Continuation => { debug_assert!(this.recv.is_empty()); diff --git a/proxy/src/tls/server_config.rs b/proxy/src/tls/server_config.rs index eab9940e7d..5a95e69fde 100644 --- a/proxy/src/tls/server_config.rs +++ b/proxy/src/tls/server_config.rs @@ -173,7 +173,7 @@ impl CertResolver { } pub fn get_common_names(&self) -> HashSet { - self.certs.keys().map(|s| s.to_string()).collect() + self.certs.keys().cloned().collect() } } diff --git a/pyproject.toml b/pyproject.toml index e009b0773e..c6dfdc223c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,7 @@ websockets = "^12.0" clickhouse-connect = "^0.7.16" kafka-python = "^2.0.2" jwcrypto = "^1.5.6" -h2 = {git = "https://github.com/python-hyper/h2"} +h2 = "^4.2.0" types-jwcrypto = "^1.5.0.20240925" pyyaml = "^6.0.2" types-pyyaml = "^6.0.12.20240917" @@ -53,7 +53,7 @@ jsonnet = "^0.21.0-rc2" [tool.poetry.group.dev.dependencies] mypy = "==1.13.0" -ruff = "^0.7.0" +ruff = "^0.11.2" [build-system] requires = ["poetry-core>=1.0.0"] @@ -109,4 +109,5 @@ select = [ "W", # pycodestyle "B", # bugbear "UP", # pyupgrade + "TC", # flake8-type-checking ] diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 591d60ea79..a0d5970bd5 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.85.0" +channel = "1.86.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
# https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index 424cd89221..5849df0343 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -38,9 +38,8 @@ pub enum Error { #[error("Cancelled")] Cancelled, - /// Failed to create client. - #[error("create client: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] - CreateClient(reqwest::Error), + #[error("request timed out: {0}")] + Timeout(String), } pub type Result = std::result::Result; @@ -116,7 +115,17 @@ impl Client { "{}/v1/tenant/{}/timeline/{}", self.mgmt_api_endpoint, tenant_id, timeline_id ); - let resp = self.request(Method::DELETE, &uri, ()).await?; + let resp = self + .request_maybe_body(Method::DELETE, &uri, None::<()>) + .await?; + resp.json().await.map_err(Error::ReceiveBody) + } + + pub async fn delete_tenant(&self, tenant_id: TenantId) -> Result { + let uri = format!("{}/v1/tenant/{}", self.mgmt_api_endpoint, tenant_id); + let resp = self + .request_maybe_body(Method::DELETE, &uri, None::<()>) + .await?; resp.json().await.map_err(Error::ReceiveBody) } @@ -192,6 +201,16 @@ impl Client { method: Method, uri: U, body: B, + ) -> Result { + self.request_maybe_body(method, uri, Some(body)).await + } + + /// Send the request and check that the status code is good, with an optional body. 
+ async fn request_maybe_body( + &self, + method: Method, + uri: U, + body: Option, ) -> Result { let res = self.request_noerror(method, uri, body).await?; let response = res.error_from_body().await?; @@ -203,12 +222,15 @@ impl Client { &self, method: Method, uri: U, - body: B, + body: Option, ) -> Result { let mut req = self.client.request(method, uri); if let Some(value) = &self.authorization_header { req = req.header(reqwest::header::AUTHORIZATION, value.get_contents()) } - req.json(&body).send().await.map_err(Error::ReceiveBody) + if let Some(body) = body { + req = req.json(&body); + } + req.send().await.map_err(Error::ReceiveBody) } } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 9ca79de179..b8c122ea72 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -21,7 +21,7 @@ use safekeeper::defaults::{ DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_EVICTION_MIN_RESIDENT, DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_CONCURRENCY, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_SSL_CERT_FILE, - DEFAULT_SSL_KEY_FILE, + DEFAULT_SSL_CERT_RELOAD_PERIOD, DEFAULT_SSL_KEY_FILE, }; use safekeeper::{ BROKER_RUNTIME, GlobalTimelines, HTTP_RUNTIME, SafeKeeperConf, WAL_SERVICE_RUNTIME, broker, @@ -214,9 +214,18 @@ struct Args { /// Path to a file with a X509 certificate for https API. #[arg(long, default_value = DEFAULT_SSL_CERT_FILE)] ssl_cert_file: Utf8PathBuf, - /// Trusted root CA certificate to use in https APIs. + /// Period to reload certificate and private key from files. + #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_SSL_CERT_RELOAD_PERIOD)] + pub ssl_cert_reload_period: Duration, + /// Trusted root CA certificates to use in https APIs. #[arg(long)] - ssl_ca_file: Option, + pub ssl_ca_file: Option, + /// Flag to use https for requests to peer's safekeeper API. 
+ #[arg(long)] + pub use_https_safekeeper_api: bool, + /// Path to the JWT auth token used to authenticate with other safekeepers. + #[arg(long)] + auth_token_path: Option, } // Like PathBufValueParser, but allows empty string. @@ -335,14 +344,24 @@ async fn main() -> anyhow::Result<()> { }; // Load JWT auth token to connect to other safekeepers for pull_timeline. + // First check if the env var is present, then check the arg with the path. + // We want to deprecate and remove the env var method in the future. let sk_auth_token = match var("SAFEKEEPER_AUTH_TOKEN") { Ok(v) => { info!("loaded JWT token for authentication with safekeepers"); Some(SecretString::from(v)) } Err(VarError::NotPresent) => { - info!("no JWT token for authentication with safekeepers detected"); - None + if let Some(auth_token_path) = args.auth_token_path.as_ref() { + info!( + "loading JWT token for authentication with safekeepers from {auth_token_path}" + ); + let auth_token = tokio::fs::read_to_string(auth_token_path).await?; + Some(SecretString::from(auth_token.trim().to_owned())) + } else { + info!("no JWT token for authentication with safekeepers detected"); + None + } } Err(_) => { warn!("JWT token for authentication with safekeepers is not unicode"); @@ -350,13 +369,13 @@ async fn main() -> anyhow::Result<()> { } }; - let ssl_ca_cert = match args.ssl_ca_file.as_ref() { + let ssl_ca_certs = match args.ssl_ca_file.as_ref() { Some(ssl_ca_file) => { tracing::info!("Using ssl root CA file: {ssl_ca_file:?}"); let buf = tokio::fs::read(ssl_ca_file).await?; - Some(Certificate::from_pem(&buf)?) + Certificate::from_pem_bundle(&buf)? 
} - None => None, + None => Vec::new(), }; let conf = Arc::new(SafeKeeperConf { @@ -394,7 +413,9 @@ async fn main() -> anyhow::Result<()> { max_delta_for_fanout: args.max_delta_for_fanout, ssl_key_file: args.ssl_key_file, ssl_cert_file: args.ssl_cert_file, - ssl_ca_cert, + ssl_cert_reload_period: args.ssl_cert_reload_period, + ssl_ca_certs, + use_https_safekeeper_api: args.use_https_safekeeper_api, }); // initialize sentry if SENTRY_DSN is provided diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs index 4908863a4b..003a75faa6 100644 --- a/safekeeper/src/http/mod.rs +++ b/safekeeper/src/http/mod.rs @@ -1,6 +1,7 @@ pub mod routes; use std::sync::Arc; +use http_utils::tls_certs::ReloadingCertificateResolver; pub use routes::make_router; pub use safekeeper_api::models; use tokio_util::sync::CancellationToken; @@ -29,12 +30,16 @@ pub async fn task_main_https( https_listener: std::net::TcpListener, global_timelines: Arc, ) -> anyhow::Result<()> { - let certs = http_utils::tls_certs::load_cert_chain(&conf.ssl_cert_file)?; - let key = http_utils::tls_certs::load_private_key(&conf.ssl_key_file)?; + let cert_resolver = ReloadingCertificateResolver::new( + &conf.ssl_key_file, + &conf.ssl_cert_file, + conf.ssl_cert_reload_period, + ) + .await?; let server_config = rustls::ServerConfig::builder() .with_no_client_auth() - .with_single_cert(certs, key)?; + .with_cert_resolver(cert_resolver); let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config)); diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 3299d77545..312456e5b2 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -16,9 +16,9 @@ use http_utils::{RequestExt, RouterBuilder}; use hyper::{Body, Request, Response, StatusCode}; use postgres_ffi::WAL_SEGMENT_SIZE; use safekeeper_api::models::{ - AcceptorStateStatus, PullTimelineRequest, SafekeeperStatus, SkTimelineInfo, TermSwitchApiEntry, - TimelineCopyRequest, 
TimelineCreateRequest, TimelineDeleteResult, TimelineStatus, - TimelineTermBumpRequest, + AcceptorStateStatus, PullTimelineRequest, SafekeeperStatus, SkTimelineInfo, TenantDeleteResult, + TermSwitchApiEntry, TimelineCopyRequest, TimelineCreateRequest, TimelineDeleteResult, + TimelineStatus, TimelineTermBumpRequest, }; use safekeeper_api::{ServerInfo, membership, models}; use storage_broker::proto::{SafekeeperTimelineInfo, TenantTimelineId as ProtoTenantTimelineId}; @@ -83,13 +83,11 @@ async fn tenant_delete_handler(mut request: Request) -> Result>(), - ) + let response_body: TenantDeleteResult = delete_info + .iter() + .map(|(ttid, resp)| (format!("{}", ttid.timeline_id), *resp)) + .collect::>(); + json_response(StatusCode::OK, response_body) } async fn timeline_create_handler(mut request: Request) -> Result, ApiError> { @@ -235,7 +233,7 @@ async fn timeline_pull_handler(mut request: Request) -> Result) -> Result, pub ssl_key_file: Utf8PathBuf, pub ssl_cert_file: Utf8PathBuf, - pub ssl_ca_cert: Option, + pub ssl_cert_reload_period: Duration, + pub ssl_ca_certs: Vec, + pub use_https_safekeeper_api: bool, } impl SafeKeeperConf { @@ -166,7 +169,9 @@ impl SafeKeeperConf { max_delta_for_fanout: None, ssl_key_file: Utf8PathBuf::from(defaults::DEFAULT_SSL_KEY_FILE), ssl_cert_file: Utf8PathBuf::from(defaults::DEFAULT_SSL_CERT_FILE), - ssl_ca_cert: None, + ssl_cert_reload_period: Duration::from_secs(60), + ssl_ca_certs: Vec::new(), + use_https_safekeeper_api: false, } } } diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index dab8142dfb..653b084ad8 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -393,7 +393,7 @@ pub struct DebugDumpResponse { pub async fn handle_request( request: PullTimelineRequest, sk_auth_token: Option, - ssl_ca_cert: Option, + ssl_ca_certs: Vec, global_timelines: Arc, ) -> Result { let existing_tli = global_timelines.get(TenantTimelineId::new( @@ -405,7 +405,7 @@ pub async fn 
handle_request( } let mut http_client = reqwest::Client::builder(); - if let Some(ssl_ca_cert) = ssl_ca_cert { + for ssl_ca_cert in ssl_ca_certs { http_client = http_client.add_root_certificate(ssl_ca_cert); } let http_client = http_client.build()?; diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 7967acde3f..9975153f6c 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -94,10 +94,10 @@ impl WalReceivers { /// Get reference to locked slot contents. Slot must exist (registered /// earlier). - fn get_slot<'a>( - self: &'a Arc, + fn get_slot( + self: &Arc, id: WalReceiverId, - ) -> MappedMutexGuard<'a, WalReceiverState> { + ) -> MappedMutexGuard<'_, WalReceiverState> { MutexGuard::map(self.mutex.lock(), |locked| { locked.slots[id] .as_mut() diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index c2760792b8..25b40f5d2e 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -176,6 +176,7 @@ pub struct Donor { pub flush_lsn: Lsn, pub pg_connstr: String, pub http_connstr: String, + pub https_connstr: Option, } impl From<&PeerInfo> for Donor { @@ -186,6 +187,7 @@ impl From<&PeerInfo> for Donor { flush_lsn: p.flush_lsn, pg_connstr: p.pg_connstr.clone(), http_connstr: p.http_connstr.clone(), + https_connstr: p.https_connstr.clone(), } } } @@ -236,11 +238,33 @@ async fn recover( conf: &SafeKeeperConf, ) -> anyhow::Result { // Learn donor term switch history to figure out starting point. 
- let client = reqwest::Client::new(); + + let mut client = reqwest::Client::builder(); + for cert in &conf.ssl_ca_certs { + client = client.add_root_certificate(cert.clone()); + } + let client = client + .build() + .context("Failed to build http client for recover")?; + + let url = if conf.use_https_safekeeper_api { + if let Some(https_connstr) = donor.https_connstr.as_ref() { + format!("https://{https_connstr}") + } else { + anyhow::bail!( + "cannot recover from donor {}: \ + https is enabled, but https_connstr is not specified", + donor.sk_id + ); + } + } else { + format!("http://{}", donor.http_connstr) + }; + let timeline_info: TimelineStatus = client .get(format!( - "http://{}/v1/tenant/{}/timeline/{}", - donor.http_connstr, tli.ttid.tenant_id, tli.ttid.timeline_id + "{}/v1/tenant/{}/timeline/{}", + url, tli.ttid.tenant_id, tli.ttid.timeline_id )) .send() .await? diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index d3c841ec09..e6a7ade9f2 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -50,6 +50,7 @@ fn peer_info_from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> Peer local_start_lsn: Lsn(sk_info.local_start_lsn), pg_connstr: sk_info.safekeeper_connstr.clone(), http_connstr: sk_info.http_connstr.clone(), + https_connstr: sk_info.https_connstr.clone(), ts, } } @@ -363,6 +364,7 @@ impl SharedState { .to_owned() .unwrap_or(conf.listen_pg_addr.clone()), http_connstr: conf.listen_http_addr.to_owned(), + https_connstr: conf.listen_https_addr.to_owned(), backup_lsn: self.sk.state().inmem.backup_lsn.0, local_start_lsn: self.sk.state().local_start_lsn.0, availability_zone: conf.availability_zone.clone(), @@ -699,7 +701,7 @@ impl Timeline { } /// Take a writing mutual exclusive lock on timeline shared_state. 
- pub async fn write_shared_state<'a>(self: &'a Arc) -> WriteGuardSharedState<'a> { + pub async fn write_shared_state(self: &Arc) -> WriteGuardSharedState<'_> { WriteGuardSharedState::new(self.clone(), self.mutex.write().await) } diff --git a/safekeeper/tests/misc_test.rs b/safekeeper/tests/misc_test.rs index 8e54d2bb86..3acf9f72c4 100644 --- a/safekeeper/tests/misc_test.rs +++ b/safekeeper/tests/misc_test.rs @@ -116,7 +116,7 @@ fn test_many_tx() -> anyhow::Result<()> { } None }) - .last() + .next_back() .unwrap(); let initdb_lsn = 21623024; diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 0dfdafcc51..b3f088d31c 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -182,7 +182,9 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { max_delta_for_fanout: None, ssl_key_file: Utf8PathBuf::from(""), ssl_cert_file: Utf8PathBuf::from(""), - ssl_ca_cert: None, + ssl_cert_reload_period: Duration::ZERO, + ssl_ca_certs: Vec::new(), + use_https_safekeeper_api: false, }; let mut global = GlobalMap::new(disk, conf.clone())?; diff --git a/scripts/download_basebackup.py b/scripts/download_basebackup.py index e23e4f99c3..08a7128842 100755 --- a/scripts/download_basebackup.py +++ b/scripts/download_basebackup.py @@ -8,9 +8,12 @@ from __future__ import annotations import argparse +from typing import TYPE_CHECKING import psycopg2 -from psycopg2.extensions import connection as PgConnection + +if TYPE_CHECKING: + from psycopg2.extensions import connection as PgConnection def main(args: argparse.Namespace): diff --git a/scripts/force_layer_download.py b/scripts/force_layer_download.py index 835e28c5d6..154150e922 100644 --- a/scripts/force_layer_download.py +++ b/scripts/force_layer_download.py @@ -7,13 +7,13 @@ import logging import signal import sys from collections import defaultdict -from collections.abc import Awaitable from dataclasses import 
dataclass from typing import TYPE_CHECKING import aiohttp if TYPE_CHECKING: + from collections.abc import Awaitable from typing import Any diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs index 86f2dd9a6c..0fef6a58e0 100644 --- a/storage_broker/benches/rps.rs +++ b/storage_broker/benches/rps.rs @@ -141,6 +141,7 @@ async fn publish(client: Option, n_keys: u64) { peer_horizon_lsn: 5, safekeeper_connstr: "zenith-1-sk-1.local:7676".to_owned(), http_connstr: "zenith-1-sk-1.local:7677".to_owned(), + https_connstr: Some("zenith-1-sk-1.local:7678".to_owned()), local_start_lsn: 0, availability_zone: None, standby_horizon: 0, diff --git a/storage_broker/proto/broker.proto b/storage_broker/proto/broker.proto index a420fd9c66..3891685589 100644 --- a/storage_broker/proto/broker.proto +++ b/storage_broker/proto/broker.proto @@ -45,8 +45,10 @@ message SafekeeperTimelineInfo { uint64 standby_horizon = 14; // A connection string to use for WAL receiving. string safekeeper_connstr = 10; - // HTTP endpoint connection string + // HTTP endpoint connection string. string http_connstr = 13; + // HTTPS endpoint connection string. + optional string https_connstr = 15; // Availability zone of a safekeeper. 
optional string availability_zone = 11; } diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index cc33ec20ff..f1bd7ba708 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -764,6 +764,7 @@ mod tests { peer_horizon_lsn: 5, safekeeper_connstr: "neon-1-sk-1.local:7676".to_owned(), http_connstr: "neon-1-sk-1.local:7677".to_owned(), + https_connstr: Some("neon-1-sk-1.local:7678".to_owned()), local_start_lsn: 0, availability_zone: None, standby_horizon: 0, diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 8211bdce62..c41e174d9d 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -16,10 +16,11 @@ testing = [] [dependencies] anyhow.workspace = true bytes.workspace = true +camino.workspace = true chrono.workspace = true clap.workspace = true -cron.workspace = true clashmap.workspace = true +cron.workspace = true fail.workspace = true futures.workspace = true governor.workspace = true @@ -44,8 +45,9 @@ rustls-native-certs.workspace = true serde.workspace = true serde_json.workspace = true thiserror.workspace = true -tokio.workspace = true +tokio-rustls.workspace = true tokio-util.workspace = true +tokio.workspace = true tracing.workspace = true measured.workspace = true rustls.workspace = true diff --git a/storage_controller/client/src/control_api.rs b/storage_controller/client/src/control_api.rs index 7888b18aa7..7afc835675 100644 --- a/storage_controller/client/src/control_api.rs +++ b/storage_controller/client/src/control_api.rs @@ -10,13 +10,11 @@ pub struct Client { } impl Client { - pub fn new(base_url: Url, jwt_token: Option) -> Self { + pub fn new(http_client: reqwest::Client, base_url: Url, jwt_token: Option) -> Self { Self { base_url, jwt_token, - client: reqwest::ClientBuilder::new() - .build() - .expect("Failed to construct http client"), + client: http_client, } } diff --git 
a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 5ce4d63d77..31ab443ccd 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -4,6 +4,7 @@ use std::error::Error as _; use std::sync::Arc; use std::time::Duration; +use anyhow::Context; use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; use control_plane::local_env::LocalEnv; use futures::StreamExt; @@ -364,25 +365,28 @@ pub(crate) struct ShardUpdate<'a> { } impl ComputeHook { - pub(super) fn new(config: Config) -> Self { + pub(super) fn new(config: Config) -> anyhow::Result { let authorization_header = config .control_plane_jwt_token .clone() .map(|jwt| format!("Bearer {}", jwt)); - let client = reqwest::ClientBuilder::new() - .timeout(NOTIFY_REQUEST_TIMEOUT) + let mut client = reqwest::ClientBuilder::new().timeout(NOTIFY_REQUEST_TIMEOUT); + for cert in &config.ssl_ca_certs { + client = client.add_root_certificate(cert.clone()); + } + let client = client .build() - .expect("Failed to construct HTTP client"); + .context("Failed to build http client for compute hook")?; - Self { + Ok(Self { state: Default::default(), config, authorization_header, neon_local_lock: Default::default(), api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY), client, - } + }) } /// For test environments: use neon_local's LocalEnv to update compute @@ -624,16 +628,19 @@ impl ComputeHook { MaybeSendResult::Transmit((request, lock)) => (request, lock), }; - let compute_hook_url = if let Some(control_plane_url) = &self.config.control_plane_url { - Some(if control_plane_url.ends_with('/') { - format!("{control_plane_url}notify-attach") + let result = if !self.config.use_local_compute_notifications { + let compute_hook_url = if let Some(control_plane_url) = &self.config.control_plane_url { + Some(if control_plane_url.ends_with('/') { + format!("{control_plane_url}notify-attach") + } else { + format!("{control_plane_url}/notify-attach") + 
}) } else { - format!("{control_plane_url}/notify-attach") - }) - } else { - self.config.compute_hook_url.clone() - }; - let result = if let Some(notify_url) = &compute_hook_url { + self.config.compute_hook_url.clone() + }; + + // We validate this at startup + let notify_url = compute_hook_url.as_ref().unwrap(); self.do_notify(notify_url, &request, cancel).await } else { self.do_notify_local(&request).await.map_err(|e| { diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index ee4c9ef9cd..fe916aa36a 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -8,11 +8,11 @@ use futures::StreamExt; use futures::stream::FuturesUnordered; use pageserver_api::controller_api::{NodeAvailability, SkSchedulingPolicy}; use pageserver_api::models::PageserverUtilization; -use reqwest::Certificate; use safekeeper_api::models::SafekeeperUtilization; use safekeeper_client::mgmt_api; use thiserror::Error; use tokio_util::sync::CancellationToken; +use tracing::Instrument; use utils::id::NodeId; use utils::logging::SecretString; @@ -27,8 +27,8 @@ struct HeartbeaterTask { max_offline_interval: Duration, max_warming_up_interval: Duration, + http_client: reqwest::Client, jwt_token: Option, - ssl_ca_cert: Option, } #[derive(Debug, Clone)] @@ -76,8 +76,8 @@ where HeartbeaterTask: HeartBeat, { pub(crate) fn new( + http_client: reqwest::Client, jwt_token: Option, - ssl_ca_cert: Option, max_offline_interval: Duration, max_warming_up_interval: Duration, cancel: CancellationToken, @@ -86,8 +86,8 @@ where tokio::sync::mpsc::unbounded_channel::>(); let mut heartbeater = HeartbeaterTask::new( receiver, + http_client, jwt_token, - ssl_ca_cert, max_offline_interval, max_warming_up_interval, cancel, @@ -122,8 +122,8 @@ where { fn new( receiver: tokio::sync::mpsc::UnboundedReceiver>, + http_client: reqwest::Client, jwt_token: Option, - ssl_ca_cert: Option, max_offline_interval: Duration, max_warming_up_interval: Duration, 
cancel: CancellationToken, @@ -134,8 +134,8 @@ where state: HashMap::new(), max_offline_interval, max_warming_up_interval, + http_client, jwt_token, - ssl_ca_cert, } } async fn run(&mut self) { @@ -178,7 +178,7 @@ impl HeartBeat for HeartbeaterTask let mut heartbeat_futs = FuturesUnordered::new(); for (node_id, node) in &*pageservers { heartbeat_futs.push({ - let ssl_ca_cert = self.ssl_ca_cert.clone(); + let http_client = self.http_client.clone(); let jwt_token = self.jwt_token.clone(); let cancel = self.cancel.clone(); @@ -193,8 +193,8 @@ impl HeartBeat for HeartbeaterTask let response = node_clone .with_client_retries( |client| async move { client.get_utilization().await }, + &http_client, &jwt_token, - &ssl_ca_cert, 3, 3, Duration::from_secs(1), @@ -228,6 +228,7 @@ impl HeartBeat for HeartbeaterTask Some((*node_id, status)) } + .instrument(tracing::info_span!("heartbeat_ps", %node_id)) }); } @@ -254,7 +255,7 @@ impl HeartBeat for HeartbeaterTask PageserverState::WarmingUp { .. } => { warming_up += 1; } - PageserverState::Offline { .. } => offline += 1, + PageserverState::Offline => offline += 1, PageserverState::Available { .. } => {} } } @@ -329,19 +330,19 @@ impl HeartBeat for HeartbeaterTask for HeartbeaterTask for HeartbeaterTask offline += 1, + SafekeeperState::Offline => offline += 1, SafekeeperState::Available { .. 
} => {} } } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 52e3ef5b0a..0d1dc8f8ee 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -24,9 +24,9 @@ use pageserver_api::controller_api::{ ShardsPreferredAzsRequest, TenantCreateRequest, TenantPolicyRequest, TenantShardMigrateRequest, }; use pageserver_api::models::{ - DetachBehavior, TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest, - TenantShardSplitRequest, TenantTimeTravelRequest, TimelineArchivalConfigRequest, - TimelineCreateRequest, + DetachBehavior, LsnLeaseRequest, TenantConfigPatchRequest, TenantConfigRequest, + TenantLocationConfigRequest, TenantShardSplitRequest, TenantTimeTravelRequest, + TimelineArchivalConfigRequest, TimelineCreateRequest, }; use pageserver_api::shard::TenantShardId; use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest}; @@ -582,6 +582,32 @@ async fn handle_tenant_timeline_download_heatmap_layers( json_response(StatusCode::OK, ()) } +async fn handle_tenant_timeline_lsn_lease( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + + check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; + + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let lsn_lease_request = json_request::(&mut req).await?; + + service + .tenant_timeline_lsn_lease(tenant_id, timeline_id, lsn_lease_request.lsn) + .await?; + + json_response(StatusCode::OK, ()) +} + // For metric labels where we would like to include the approximate path, but exclude high-cardinality fields like query parameters // and tenant/timeline IDs. 
Since we are proxying to arbitrary paths, we don't have routing templates to // compare to, so we can just filter out our well known ID format with regexes. @@ -613,6 +639,15 @@ async fn handle_tenant_timeline_passthrough( return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path"))); }; + let method = match *req.method() { + hyper::Method::GET => reqwest::Method::GET, + hyper::Method::POST => reqwest::Method::POST, + hyper::Method::PUT => reqwest::Method::PUT, + hyper::Method::DELETE => reqwest::Method::DELETE, + hyper::Method::PATCH => reqwest::Method::PATCH, + _ => return Err(ApiError::BadRequest(anyhow::anyhow!("Unsupported method"))), + }; + tracing::info!( "Proxying request for tenant {} ({})", tenant_or_shard_id.tenant_id, @@ -656,12 +691,11 @@ async fn handle_tenant_timeline_passthrough( let _timer = latency.start_timer(labels.clone()); let client = mgmt_api::Client::new( + service.get_http_client().clone(), node.base_url(), service.get_config().pageserver_jwt_token.as_deref(), - service.get_config().ssl_ca_cert.clone(), - ) - .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; - let resp = client.get_raw(path).await.map_err(|e| + ); + let resp = client.op_raw(method, path).await.map_err(|e| // We return 503 here because if we can't successfully send a request to the pageserver, // either we aren't available or the pageserver is unavailable. 
ApiError::ResourceUnavailable(format!("Error sending pageserver API request to {node}: {e}").into()))?; @@ -865,7 +899,7 @@ async fn handle_node_status(req: Request) -> Result, ApiErr let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; - let node_status = state.service.get_node(node_id).await?; + let node_status = state.service.get_node(node_id).await?.describe(); json_response(StatusCode::OK, node_status) } @@ -1382,6 +1416,12 @@ async fn handle_upsert_safekeeper(mut req: Request) -> Result { return res; @@ -1693,9 +1733,9 @@ async fn maybe_forward(req: Request) -> ForwardOutcome { }; if *self_addr == leader_addr { - return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!( - "Leader is stepped down instance" - )))); + return ForwardOutcome::Forwarded(Err(ApiError::ResourceUnavailable( + "Leader is stepped down instance".into(), + ))); } } @@ -1704,19 +1744,17 @@ async fn maybe_forward(req: Request) -> ForwardOutcome { // Use [`RECONCILE_TIMEOUT`] as the max amount of time a request should block for and // include some leeway to get the timeout for proxied requests. 
const PROXIED_REQUEST_TIMEOUT: Duration = Duration::from_secs(RECONCILE_TIMEOUT.as_secs() + 10); - let client = reqwest::ClientBuilder::new() - .timeout(PROXIED_REQUEST_TIMEOUT) - .build(); - let client = match client { - Ok(client) => client, - Err(err) => { - return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!( - "Failed to build leader client for forwarding while in stepped down state: {err}" - )))); - } - }; - let request: reqwest::Request = match convert_request(req, &client, leader.address).await { + let client = state.service.get_http_client().clone(); + + let request: reqwest::Request = match convert_request( + req, + &client, + leader.address, + PROXIED_REQUEST_TIMEOUT, + ) + .await + { Ok(r) => r, Err(err) => { return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!( @@ -1774,6 +1812,7 @@ async fn convert_request( req: hyper::Request, client: &reqwest::Client, to_address: String, + timeout: Duration, ) -> Result { use std::str::FromStr; @@ -1828,6 +1867,7 @@ async fn convert_request( .request(method, uri) .headers(headers) .body(body) + .timeout(timeout) .build() .map_err(|err| { ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}")) @@ -2193,6 +2233,17 @@ pub fn make_router( ) }, ) + // LSN lease passthrough to all shards + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/lsn_lease", + |r| { + tenant_service_handler( + r, + handle_tenant_timeline_lsn_lease, + RequestName("v1_tenant_timeline_lsn_lease"), + ) + }, + ) // Tenant detail GET passthrough to shard zero: .get("/v1/tenant/:tenant_id", |r| { tenant_service_handler( @@ -2211,6 +2262,17 @@ pub fn make_router( RequestName("v1_tenant_passthrough"), ) }) + // Tenant timeline mark_invisible passthrough to shard zero + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/mark_invisible", + |r| { + tenant_service_handler( + r, + handle_tenant_timeline_passthrough, + 
RequestName("v1_tenant_timeline_mark_invisible_passthrough"), + ) + }, + ) } #[cfg(test)] diff --git a/storage_controller/src/leadership.rs b/storage_controller/src/leadership.rs index 5e1d6f3ec9..39c28d60a9 100644 --- a/storage_controller/src/leadership.rs +++ b/storage_controller/src/leadership.rs @@ -110,7 +110,20 @@ impl Leadership { ) -> Option { tracing::info!("Sending step down request to {leader:?}"); + let mut http_client = reqwest::Client::builder(); + for cert in &self.config.ssl_ca_certs { + http_client = http_client.add_root_certificate(cert.clone()); + } + let http_client = match http_client.build() { + Ok(http_client) => http_client, + Err(err) => { + tracing::error!("Failed to build client for leader step-down request: {err}"); + return None; + } + }; + let client = PeerClient::new( + http_client, Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"), self.config.peer_jwt_token.clone(), ); diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 6e3c70c42b..1aa9ae10ae 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -4,7 +4,10 @@ use std::sync::Arc; use std::time::Duration; use anyhow::{Context, anyhow}; +use camino::Utf8PathBuf; use clap::Parser; +use futures::future::OptionFuture; +use http_utils::tls_certs::ReloadingCertificateResolver; use hyper0::Uri; use metrics::BuildInfo; use metrics::launch_timestamp::LaunchTimestamp; @@ -39,13 +42,29 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; #[unsafe(export_name = "malloc_conf")] pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; +const DEFAULT_SSL_KEY_FILE: &str = "server.key"; +const DEFAULT_SSL_CERT_FILE: &str = "server.crt"; +const DEFAULT_SSL_CERT_RELOAD_PERIOD: &str = "60s"; + #[derive(Parser)] #[command(author, version, about, long_about = None)] #[command(arg_required_else_help(true))] +#[clap(group( + clap::ArgGroup::new("listen-addresses") + 
.required(true) + .multiple(true) + .args(&["listen", "listen_https"]), +))] struct Cli { - /// Host and port to listen on, like `127.0.0.1:1234` + /// Host and port to listen HTTP on, like `127.0.0.1:1234`. + /// At least one of ["listen", "listen_https"] should be specified. + // TODO: Make this option dev-only when https is out everywhere. #[arg(short, long)] - listen: std::net::SocketAddr, + listen: Option, + /// Host and port to listen HTTPS on, like `127.0.0.1:1234`. + /// At least one of ["listen", "listen_https"] should be specified. + #[arg(long)] + listen_https: Option, /// Public key for JWT authentication of clients #[arg(long)] @@ -96,6 +115,19 @@ struct Cli { #[arg(long)] split_threshold: Option, + /// Maximum number of shards during autosplits. 0 disables autosplits. Defaults + /// to 16 as a safety to avoid too many shards by accident. + #[arg(long, default_value = "16")] + max_split_shards: u8, + + /// Size threshold for initial shard splits of unsharded tenants. 0 disables initial splits. + #[arg(long)] + initial_split_threshold: Option, + + /// Number of target shards for initial splits. 0 or 1 disables initial splits. Defaults to 2. + #[arg(long, default_value = "2")] + initial_split_shards: u8, + /// Maximum number of normal-priority reconcilers that may run in parallel #[arg(long)] reconciler_concurrency: Option, @@ -157,9 +189,23 @@ struct Cli { #[arg(long, default_value = "false")] use_https_safekeeper_api: bool, - /// Trusted root CA certificate to use in https APIs. + /// Path to a file with certificate's private key for https API. + #[arg(long, default_value = DEFAULT_SSL_KEY_FILE)] + ssl_key_file: Utf8PathBuf, + /// Path to a file with a X509 certificate for https API. + #[arg(long, default_value = DEFAULT_SSL_CERT_FILE)] + ssl_cert_file: Utf8PathBuf, + /// Period to reload certificate and private key from files. 
+ #[arg(long, default_value = DEFAULT_SSL_CERT_RELOAD_PERIOD)] + ssl_cert_reload_period: humantime::Duration, + /// Trusted root CA certificates to use in https APIs. #[arg(long)] ssl_ca_file: Option, + + /// Neon local specific flag. When set, ignore [`Cli::control_plane_url`] and deliver + /// the compute notification directly (instead of via control plane). + #[arg(long, default_value = "false")] + use_local_compute_notifications: bool, } enum StrictMode { @@ -237,10 +283,8 @@ impl Secrets { fn load_secret(cli: &Option, env_name: &str) -> Option { if let Some(v) = cli { Some(v.clone()) - } else if let Ok(v) = std::env::var(env_name) { - Some(v) } else { - None + std::env::var(env_name).ok() } } } @@ -283,11 +327,10 @@ async fn async_main() -> anyhow::Result<()> { let args = Cli::parse(); tracing::info!( - "version: {}, launch_timestamp: {}, build_tag {}, listening on {}", + "version: {}, launch_timestamp: {}, build_tag {}", GIT_VERSION, launch_ts.to_string(), BUILD_TAG, - args.listen ); let build_info = BuildInfo { @@ -326,6 +369,9 @@ async fn async_main() -> anyhow::Result<()> { "neither `--compute-hook-url` nor `--control-plane-url` are set: this is only permitted in `--dev` mode" ); } + StrictMode::Strict if args.use_local_compute_notifications => { + anyhow::bail!("`--use-local-compute-notifications` is only permitted in `--dev` mode"); + } StrictMode::Strict => { tracing::info!("Starting in strict mode: configuration is OK.") } @@ -334,13 +380,13 @@ async fn async_main() -> anyhow::Result<()> { } } - let ssl_ca_cert = match args.ssl_ca_file.as_ref() { + let ssl_ca_certs = match args.ssl_ca_file.as_ref() { Some(ssl_ca_file) => { tracing::info!("Using ssl root CA file: {ssl_ca_file:?}"); let buf = tokio::fs::read(ssl_ca_file).await?; - Some(Certificate::from_pem(&buf)?) + Certificate::from_pem_bundle(&buf)? 
} - None => None, + None => Vec::new(), }; let config = Config { @@ -366,6 +412,9 @@ async fn async_main() -> anyhow::Result<()> { .unwrap_or(PRIORITY_RECONCILER_CONCURRENCY_DEFAULT), tenant_rate_limit: args.tenant_rate_limit, split_threshold: args.split_threshold, + max_split_shards: args.max_split_shards, + initial_split_threshold: args.initial_split_threshold, + initial_split_shards: args.initial_split_shards, neon_local_repo_dir: args.neon_local_repo_dir, max_secondary_lag_bytes: args.max_secondary_lag_bytes, heartbeat_interval: args @@ -378,11 +427,11 @@ async fn async_main() -> anyhow::Result<()> { .unwrap_or(LONG_RECONCILE_THRESHOLD_DEFAULT), address_for_peers: args.address_for_peers, start_as_candidate: args.start_as_candidate, - http_service_port: args.listen.port() as i32, use_https_pageserver_api: args.use_https_pageserver_api, use_https_safekeeper_api: args.use_https_safekeeper_api, - ssl_ca_cert, + ssl_ca_certs, timelines_onto_safekeepers: args.timelines_onto_safekeepers, + use_local_compute_notifications: args.use_local_compute_notifications, }; // Validate that we can connect to the database @@ -392,28 +441,57 @@ async fn async_main() -> anyhow::Result<()> { let service = Service::spawn(config, persistence.clone()).await?; - let http_listener = tcp_listener::bind(args.listen)?; - let auth = secrets .public_key .map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth))); let router = make_router(service.clone(), auth, build_info) .build() .map_err(|err| anyhow!(err))?; - let router_service = http_utils::RouterService::new(router).unwrap(); + let http_service = + Arc::new(http_utils::RequestServiceBuilder::new(router).map_err(|err| anyhow!(err))?); + + let api_shutdown = CancellationToken::new(); // Start HTTP server - let server_shutdown = CancellationToken::new(); - let server = hyper0::Server::from_tcp(http_listener)? 
- .serve(router_service) - .with_graceful_shutdown({ - let server_shutdown = server_shutdown.clone(); - async move { - server_shutdown.cancelled().await; - } - }); - tracing::info!("Serving on {0}", args.listen); - let server_task = tokio::task::spawn(server); + let http_server_task: OptionFuture<_> = match args.listen { + Some(http_addr) => { + let http_listener = tcp_listener::bind(http_addr)?; + let http_server = + http_utils::server::Server::new(Arc::clone(&http_service), http_listener, None)?; + + tracing::info!("Serving HTTP on {}", http_addr); + Some(tokio::task::spawn(http_server.serve(api_shutdown.clone()))) + } + None => None, + } + .into(); + + // Start HTTPS server + let https_server_task: OptionFuture<_> = match args.listen_https { + Some(https_addr) => { + let https_listener = tcp_listener::bind(https_addr)?; + + let resolver = ReloadingCertificateResolver::new( + &args.ssl_key_file, + &args.ssl_cert_file, + *args.ssl_cert_reload_period, + ) + .await?; + + let server_config = rustls::ServerConfig::builder() + .with_no_client_auth() + .with_cert_resolver(resolver); + + let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config)); + let https_server = + http_utils::server::Server::new(http_service, https_listener, Some(tls_acceptor))?; + + tracing::info!("Serving HTTPS on {}", https_addr); + Some(tokio::task::spawn(https_server.serve(api_shutdown.clone()))) + } + None => None, + } + .into(); let chaos_task = args.chaos_interval.map(|interval| { let service = service.clone(); @@ -437,28 +515,41 @@ async fn async_main() -> anyhow::Result<()> { let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?; let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?; let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate())?; + tokio::pin!(http_server_task, https_server_task); tokio::select! 
{ _ = sigint.recv() => {}, _ = sigterm.recv() => {}, _ = sigquit.recv() => {}, + Some(err) = &mut http_server_task => { + panic!("HTTP server task failed: {err:#?}"); + } + Some(err) = &mut https_server_task => { + panic!("HTTPS server task failed: {err:#?}"); + } } tracing::info!("Terminating on signal"); - // Stop HTTP server first, so that we don't have to service requests + // Stop HTTP and HTTPS servers first, so that we don't have to service requests // while shutting down Service. - server_shutdown.cancel(); - match tokio::time::timeout(Duration::from_secs(5), server_task).await { - Ok(Ok(_)) => { - tracing::info!("Joined HTTP server task"); - } - Ok(Err(e)) => { - tracing::error!("Error joining HTTP server task: {e}") - } - Err(_) => { - tracing::warn!("Timed out joining HTTP server task"); - // We will fall through and shut down the service anyway, any request handlers - // in flight will experience cancellation & their clients will see a torn connection. - } + api_shutdown.cancel(); + + // If the deadline is exceeded, we will fall through and shut down the service anyway, + // any request handlers in flight will experience cancellation & their clients will + // see a torn connection. + let deadline = tokio::time::Instant::now() + Duration::from_secs(5); + + match tokio::time::timeout_at(deadline, http_server_task).await { + Ok(Some(Ok(_))) => tracing::info!("Joined HTTP server task"), + Ok(Some(Err(e))) => tracing::error!("Error joining HTTP server task: {e}"), + Ok(None) => {} // HTTP is disabled. + Err(_) => tracing::warn!("Timed out joining HTTP server task"), + } + + match tokio::time::timeout_at(deadline, https_server_task).await { + Ok(Some(Ok(_))) => tracing::info!("Joined HTTPS server task"), + Ok(Some(Err(e))) => tracing::error!("Error joining HTTPS server task: {e}"), + Ok(None) => {} // HTTPS is disabled. 
+ Err(_) => tracing::warn!("Timed out joining HTTPS server task"), } // If we were injecting chaos, stop that so that we're not calling into Service while it shuts down diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 40f3c7c58e..f667514517 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -7,7 +7,7 @@ use pageserver_api::controller_api::{ }; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; -use reqwest::{Certificate, StatusCode}; +use reqwest::StatusCode; use serde::Serialize; use tokio_util::sync::CancellationToken; use utils::backoff; @@ -280,8 +280,8 @@ impl Node { pub(crate) async fn with_client_retries( &self, mut op: O, + http_client: &reqwest::Client, jwt: &Option, - ssl_ca_cert: &Option, warn_threshold: u32, max_retries: u32, timeout: Duration, @@ -300,24 +300,13 @@ impl Node { | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false, ApiError(_, _) => true, Cancelled => true, - CreateClient(_) => true, + Timeout(_) => false, } } - // TODO: refactor PageserverClient and with_client_retires (#11113). - let mut http_client = reqwest::ClientBuilder::new().timeout(timeout); - if let Some(ssl_ca_cert) = ssl_ca_cert.as_ref() { - http_client = http_client.add_root_certificate(ssl_ca_cert.clone()) - } - - let http_client = match http_client.build() { - Ok(http_client) => http_client, - Err(err) => return Some(Err(mgmt_api::Error::CreateClient(err))), - }; - backoff::retry( || { - let client = PageserverClient::from_client( + let client = PageserverClient::new( self.get_id(), http_client.clone(), self.base_url(), @@ -326,11 +315,14 @@ impl Node { let node_cancel_fut = self.cancel.cancelled(); - let op_fut = op(client); + let op_fut = tokio::time::timeout(timeout, op(client)); async { tokio::select! 
{ - r = op_fut=> {r}, + r = op_fut => match r { + Ok(r) => r, + Err(e) => Err(mgmt_api::Error::Timeout(format!("{e}"))), + }, _ = node_cancel_fut => { Err(mgmt_api::Error::Cancelled) }} diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 05e7aa88c6..d14fc35b39 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -1,6 +1,6 @@ use pageserver_api::models::detach_ancestor::AncestorDetached; use pageserver_api::models::{ - DetachBehavior, LocationConfig, LocationConfigListResponse, PageserverUtilization, + DetachBehavior, LocationConfig, LocationConfigListResponse, LsnLease, PageserverUtilization, SecondaryProgress, TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse, TenantWaitLsnRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse, @@ -8,8 +8,9 @@ use pageserver_api::models::{ use pageserver_api::shard::TenantShardId; use pageserver_client::BlockUnblock; use pageserver_client::mgmt_api::{Client, Result}; -use reqwest::{Certificate, StatusCode}; +use reqwest::StatusCode; use utils::id::{NodeId, TenantId, TimelineId}; +use utils::lsn::Lsn; /// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage /// controller to collect metrics in a non-intrusive manner. @@ -47,25 +48,13 @@ macro_rules! 
measured_request { impl PageserverClient { pub(crate) fn new( - node_id: NodeId, - mgmt_api_endpoint: String, - jwt: Option<&str>, - ssl_ca_cert: Option, - ) -> Result { - Ok(Self { - inner: Client::new(mgmt_api_endpoint, jwt, ssl_ca_cert)?, - node_id_label: node_id.0.to_string(), - }) - } - - pub(crate) fn from_client( node_id: NodeId, raw_client: reqwest::Client, mgmt_api_endpoint: String, jwt: Option<&str>, ) -> Self { Self { - inner: Client::from_client(raw_client, mgmt_api_endpoint, jwt), + inner: Client::new(raw_client, mgmt_api_endpoint, jwt), node_id_label: node_id.0.to_string(), } } @@ -207,6 +196,22 @@ impl PageserverClient { ) } + pub(crate) async fn timeline_lease_lsn( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + lsn: Lsn, + ) -> Result { + measured_request!( + "timeline_lease_lsn", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner + .timeline_init_lsn_lease(tenant_shard_id, timeline_id, lsn) + .await + ) + } + pub(crate) async fn tenant_shard_split( &self, tenant_shard_id: TenantShardId, diff --git a/storage_controller/src/peer_client.rs b/storage_controller/src/peer_client.rs index f3f275dee0..604d1024ba 100644 --- a/storage_controller/src/peer_client.rs +++ b/storage_controller/src/peer_client.rs @@ -59,11 +59,11 @@ impl ResponseErrorMessageExt for reqwest::Response { pub(crate) struct GlobalObservedState(pub(crate) HashMap); impl PeerClient { - pub(crate) fn new(uri: Uri, jwt: Option) -> Self { + pub(crate) fn new(http_client: reqwest::Client, uri: Uri, jwt: Option) -> Self { Self { uri, jwt, - client: reqwest::Client::new(), + client: http_client, } } diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 85d9c574a1..d25448718f 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -997,10 +997,11 @@ impl Persistence { // Clear sharding flag let updated = diesel::update(tenant_shards) 
.filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(new_shard_count.literal() as i32)) .set((splitting.eq(0),)) .execute(conn) .await?; - debug_assert!(updated > 0); + assert!(updated == new_shard_count.count() as usize); Ok(()) }) @@ -1367,6 +1368,93 @@ impl Persistence { Ok(timeline_from_db) } + + /// Set `deleted_at` for the given timeline + pub(crate) async fn timeline_set_deleted_at( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> DatabaseResult<()> { + use crate::schema::timelines; + + let deletion_time = chrono::Local::now().to_utc(); + self.with_measured_conn(DatabaseOperation::InsertTimeline, move |conn| { + Box::pin(async move { + let updated = diesel::update(timelines::table) + .filter(timelines::tenant_id.eq(tenant_id.to_string())) + .filter(timelines::timeline_id.eq(timeline_id.to_string())) + .set(timelines::deleted_at.eq(Some(deletion_time))) + .execute(conn) + .await?; + + match updated { + 0 => Ok(()), + 1 => Ok(()), + _ => Err(DatabaseError::Logical(format!( + "unexpected number of rows ({})", + updated + ))), + } + }) + }) + .await + } + + /// Delete timeline from db. Does nothing if no matching row is present. + /// + /// Only works if `deleted_at` is set, so you should call [`Self::timeline_set_deleted_at`] before. + pub(crate) async fn delete_timeline( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> DatabaseResult<()> { + use crate::schema::timelines::dsl; + + let tenant_id = &tenant_id; + let timeline_id = &timeline_id; + self.with_measured_conn(DatabaseOperation::GetTimeline, move |conn| { + Box::pin(async move { + diesel::delete(dsl::timelines) + .filter(dsl::tenant_id.eq(&tenant_id.to_string())) + .filter(dsl::timeline_id.eq(&timeline_id.to_string())) + .filter(dsl::deleted_at.is_not_null()) + .execute(conn) + .await?; + Ok(()) + }) + }) + .await?; + + Ok(()) + } + + /// Loads a list of all timelines of the given tenant from database.
+ pub(crate) async fn list_timelines_for_tenant( + &self, + tenant_id: TenantId, + ) -> DatabaseResult> { + use crate::schema::timelines::dsl; + + let tenant_id = &tenant_id; + let timelines = self + .with_measured_conn(DatabaseOperation::GetTimeline, move |conn| { + Box::pin(async move { + let timelines: Vec = dsl::timelines + .filter(dsl::tenant_id.eq(&tenant_id.to_string())) + .load(conn) + .await?; + Ok(timelines) + }) + }) + .await?; + + let timelines = timelines + .into_iter() + .map(TimelineFromDb::into_persistence) + .collect(); + Ok(timelines) + } + /// Persist pending op. Returns if it was newly inserted. If it wasn't, we haven't done any writes. pub(crate) async fn insert_pending_op( &self, @@ -1409,7 +1497,7 @@ impl Persistence { pub(crate) async fn remove_pending_op( &self, tenant_id: TenantId, - timeline_id: TimelineId, + timeline_id: Option, sk_id: NodeId, generation: u32, ) -> DatabaseResult<()> { @@ -1418,10 +1506,11 @@ impl Persistence { let tenant_id = &tenant_id; let timeline_id = &timeline_id; self.with_measured_conn(DatabaseOperation::RemoveTimelineReconcile, move |conn| { + let timeline_id_str = timeline_id.map(|tid| tid.to_string()).unwrap_or_default(); Box::pin(async move { diesel::delete(dsl::safekeeper_timeline_pending_ops) .filter(dsl::tenant_id.eq(tenant_id.to_string())) - .filter(dsl::timeline_id.eq(timeline_id.to_string())) + .filter(dsl::timeline_id.eq(timeline_id_str)) .filter(dsl::sk_id.eq(sk_id.0 as i64)) .filter(dsl::generation.eq(generation as i32)) .execute(conn) @@ -1435,22 +1524,39 @@ impl Persistence { /// Load pending operations from db. 
pub(crate) async fn list_pending_ops( &self, - filter_for_sk: Option, ) -> DatabaseResult> { use crate::schema::safekeeper_timeline_pending_ops::dsl; - const FILTER_VAL_1: i64 = 1; - const FILTER_VAL_2: i64 = 2; - let filter_opt = filter_for_sk.map(|id| id.0 as i64); let timeline_from_db = self + .with_measured_conn(DatabaseOperation::ListTimelineReconcile, move |conn| { + Box::pin(async move { + let from_db: Vec = + dsl::safekeeper_timeline_pending_ops.load(conn).await?; + Ok(from_db) + }) + }) + .await?; + + Ok(timeline_from_db) + } + /// List pending operations for a given timeline (including tenant-global ones) + pub(crate) async fn list_pending_ops_for_timeline( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> DatabaseResult> { + use crate::schema::safekeeper_timeline_pending_ops::dsl; + + let timelines_from_db = self .with_measured_conn(DatabaseOperation::ListTimelineReconcile, move |conn| { Box::pin(async move { let from_db: Vec = dsl::safekeeper_timeline_pending_ops + .filter(dsl::tenant_id.eq(tenant_id.to_string())) .filter( - dsl::sk_id - .eq(filter_opt.unwrap_or(FILTER_VAL_1)) - .and(dsl::sk_id.eq(filter_opt.unwrap_or(FILTER_VAL_2))), + dsl::timeline_id + .eq(timeline_id.to_string()) + .or(dsl::timeline_id.eq("")), ) .load(conn) .await?; @@ -1459,7 +1565,35 @@ impl Persistence { }) .await?; - Ok(timeline_from_db) + Ok(timelines_from_db) + } + + /// Delete all pending ops for the given timeline. 
+ /// + /// Use this only at timeline deletion, otherwise use generation based APIs + pub(crate) async fn remove_pending_ops_for_timeline( + &self, + tenant_id: TenantId, + timeline_id: Option, + ) -> DatabaseResult<()> { + use crate::schema::safekeeper_timeline_pending_ops::dsl; + + let tenant_id = &tenant_id; + let timeline_id = &timeline_id; + self.with_measured_conn(DatabaseOperation::ListTimelineReconcile, move |conn| { + let timeline_id_str = timeline_id.map(|tid| tid.to_string()).unwrap_or_default(); + Box::pin(async move { + diesel::delete(dsl::safekeeper_timeline_pending_ops) + .filter(dsl::tenant_id.eq(tenant_id.to_string())) + .filter(dsl::timeline_id.eq(timeline_id_str)) + .execute(conn) + .await?; + Ok(()) + }) + }) + .await?; + + Ok(()) } } @@ -1916,7 +2050,7 @@ impl ToSql for LsnWrapper { } } -#[derive(Insertable, AsChangeset, Queryable, Selectable, Clone)] +#[derive(Insertable, AsChangeset, Clone)] #[diesel(table_name = crate::schema::timelines)] pub(crate) struct TimelinePersistence { pub(crate) tenant_id: String, diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 9f0b789f19..b03a6dae04 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -86,6 +86,9 @@ pub(super) struct Reconciler { /// Access to persistent storage for updating generation numbers pub(crate) persistence: Arc, + + /// HTTP client with proper CA certs. 
+ pub(crate) http_client: reqwest::Client, } pub(crate) struct ReconcilerConfigBuilder { @@ -298,8 +301,8 @@ impl Reconciler { .location_config(tenant_shard_id, config.clone(), flush_ms, lazy) .await }, + &self.http_client, &self.service_config.pageserver_jwt_token, - &self.service_config.ssl_ca_cert, 1, 3, timeout, @@ -419,10 +422,10 @@ impl Reconciler { let client = PageserverClient::new( node.get_id(), + self.http_client.clone(), node.base_url(), self.service_config.pageserver_jwt_token.as_deref(), - self.service_config.ssl_ca_cert.clone(), - )?; + ); client .wait_lsn( @@ -443,10 +446,10 @@ impl Reconciler { ) -> anyhow::Result> { let client = PageserverClient::new( node.get_id(), + self.http_client.clone(), node.base_url(), self.service_config.pageserver_jwt_token.as_deref(), - self.service_config.ssl_ca_cert.clone(), - )?; + ); let timelines = client.timeline_list(&tenant_shard_id).await?; Ok(timelines @@ -483,8 +486,8 @@ impl Reconciler { ) .await }, + &self.http_client, &self.service_config.pageserver_jwt_token, - &self.service_config.ssl_ca_cert, 1, 3, request_download_timeout * 2, @@ -683,6 +686,8 @@ impl Reconciler { .await?, ); + pausable_failpoint!("reconciler-live-migrate-post-generation-inc"); + let dest_conf = build_location_config( &self.shard, &self.config, @@ -757,7 +762,9 @@ impl Reconciler { Ok(()) } - async fn maybe_refresh_observed(&mut self) -> Result<(), ReconcileError> { + /// Returns true if the observed state of the attached location was refreshed + /// and false otherwise. + async fn maybe_refresh_observed(&mut self) -> Result { // If the attached node has uncertain state, read it from the pageserver before proceeding: this // is important to avoid spurious generation increments. 
// @@ -767,7 +774,7 @@ impl Reconciler { let Some(attached_node) = self.intent.attached.as_ref() else { // Nothing to do - return Ok(()); + return Ok(false); }; if matches!( @@ -778,8 +785,8 @@ impl Reconciler { let observed_conf = match attached_node .with_client_retries( |client| async move { client.get_location_config(tenant_shard_id).await }, + &self.http_client, &self.service_config.pageserver_jwt_token, - &self.service_config.ssl_ca_cert, 1, 1, Duration::from_secs(5), @@ -812,7 +819,7 @@ impl Reconciler { } } - Ok(()) + Ok(true) } /// Reconciling a tenant makes API calls to pageservers until the observed state @@ -828,7 +835,7 @@ impl Reconciler { /// state where it still requires later reconciliation. pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> { // Prepare: if we have uncertain `observed` state for our would-be attachement location, then refresh it - self.maybe_refresh_observed().await?; + let refreshed = self.maybe_refresh_observed().await?; // Special case: live migration self.maybe_live_migrate().await?; @@ -852,8 +859,14 @@ impl Reconciler { ); match self.observed.locations.get(&node.get_id()) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { - // Nothing to do - tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.") + if refreshed { + tracing::info!( + node_id=%node.get_id(), "Observed configuration correct after refresh. 
Notifying compute."); + self.compute_notify().await?; + } else { + // Nothing to do + tracing::info!(node_id=%node.get_id(), "Observed configuration already correct."); + } } observed => { // In all cases other than a matching observed configuration, we will @@ -1127,8 +1140,8 @@ impl Reconciler { match origin .with_client_retries( |client| async move { client.get_location_config(tenant_shard_id).await }, + &self.http_client, &self.service_config.pageserver_jwt_token, - &self.service_config.ssl_ca_cert, 1, 3, Duration::from_secs(5), diff --git a/storage_controller/src/safekeeper.rs b/storage_controller/src/safekeeper.rs index 2bd28f29af..3b731acf7e 100644 --- a/storage_controller/src/safekeeper.rs +++ b/storage_controller/src/safekeeper.rs @@ -1,7 +1,7 @@ use std::time::Duration; use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy}; -use reqwest::{Certificate, StatusCode}; +use reqwest::StatusCode; use safekeeper_client::mgmt_api; use tokio_util::sync::CancellationToken; use utils::backoff; @@ -94,8 +94,8 @@ impl Safekeeper { pub(crate) async fn with_client_retries( &self, mut op: O, + http_client: &reqwest::Client, jwt: &Option, - ssl_ca_cert: &Option, warn_threshold: u32, max_retries: u32, timeout: Duration, @@ -114,17 +114,10 @@ impl Safekeeper { | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false, ApiError(_, _) => true, Cancelled => true, - CreateClient(_) => true, + Timeout(_) => false, } } - // TODO: refactor SafekeeperClient and with_client_retires (#11113). 
- let mut http_client = reqwest::Client::builder().timeout(timeout); - if let Some(ssl_ca_cert) = ssl_ca_cert.as_ref() { - http_client = http_client.add_root_certificate(ssl_ca_cert.clone()); - } - let http_client = http_client.build().map_err(mgmt_api::Error::CreateClient)?; - backoff::retry( || { let client = SafekeeperClient::new( @@ -136,11 +129,14 @@ impl Safekeeper { let node_cancel_fut = self.cancel.cancelled(); - let op_fut = op(client); + let op_fut = tokio::time::timeout(timeout, op(client)); async { tokio::select! { - r = op_fut=> {r}, + r = op_fut => match r { + Ok(r) => r, + Err(e) => Err(mgmt_api::Error::Timeout(format!("{e}"))), + }, _ = node_cancel_fut => { Err(mgmt_api::Error::Cancelled) }} diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs index b30237e404..988159af4a 100644 --- a/storage_controller/src/safekeeper_client.rs +++ b/storage_controller/src/safekeeper_client.rs @@ -98,6 +98,18 @@ impl SafekeeperClient { ) } + pub(crate) async fn delete_tenant( + &self, + tenant_id: TenantId, + ) -> Result { + measured_request!( + "delete_tenant", + crate::metrics::Method::Delete, + &self.node_id_label, + self.inner.delete_tenant(tenant_id).await + ) + } + pub(crate) async fn pull_timeline( &self, req: &PullTimelineRequest, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 61a6c12f47..50f642deaf 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -12,7 +12,7 @@ use std::ops::{Deref, DerefMut}; use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; -use std::time::{Duration, Instant}; +use std::time::{Duration, Instant, SystemTime}; use anyhow::Context; use context_iterator::TenantShardContextIterator; @@ -34,7 +34,7 @@ use pageserver_api::controller_api::{ TenantShardMigrateRequest, TenantShardMigrateResponse, }; use pageserver_api::models::{ - self, DetachBehavior, LocationConfig, LocationConfigListResponse, 
LocationConfigMode, + self, DetachBehavior, LocationConfig, LocationConfigListResponse, LocationConfigMode, LsnLease, PageserverUtilization, SecondaryProgress, ShardParameters, TenantConfig, TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest, @@ -60,6 +60,7 @@ use tracing::{Instrument, debug, error, info, info_span, instrument, warn}; use utils::completion::Barrier; use utils::generation::Generation; use utils::id::{NodeId, TenantId, TimelineId}; +use utils::lsn::Lsn; use utils::sync::gate::Gate; use utils::{failpoint_support, pausable_failpoint}; @@ -152,6 +153,7 @@ enum TenantOperations { TimelineGcBlockUnblock, DropDetached, DownloadHeatmapLayers, + TimelineLsnLease, } #[derive(Clone, strum_macros::Display)] @@ -267,7 +269,7 @@ fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError { ApiError::Conflict(format!("{node} {status}: {status} {msg}")) } mgmt_api::Error::Cancelled => ApiError::ShuttingDown, - mgmt_api::Error::CreateClient(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), + mgmt_api::Error::Timeout(e) => ApiError::Timeout(e.into()), } } @@ -389,14 +391,41 @@ pub struct Config { /// tenant-scoped API endpoints. Further API requests queue until ready. pub tenant_rate_limit: NonZeroU32, - /// The size at which an unsharded tenant should be split (into 8 shards). This uses the logical - /// size of the largest timeline in the shard (i.e. max_logical_size). + /// If a tenant shard's largest timeline (max_logical_size) exceeds this value, all tenant + /// shards will be split in 2 until they fall below split_threshold (up to max_split_shards). + /// + /// This will greedily split into as many shards as necessary to fall below split_threshold, as + /// powers of 2: if a tenant shard is 7 times larger than split_threshold, it will split into 8 + /// immediately, rather than first 2 then 4 then 8. /// /// None or 0 disables auto-splitting. 
/// /// TODO: consider using total logical size of all timelines instead. pub split_threshold: Option, + /// The maximum number of shards a tenant can be split into during autosplits. Does not affect + /// manual split requests. 0 or 1 disables autosplits, as we already have 1 shard. + pub max_split_shards: u8, + + /// The size at which an unsharded tenant should initially split. Ingestion is significantly + /// faster with multiple shards, so eagerly splitting below split_threshold will typically speed + /// up initial ingestion of large tenants. + /// + /// This should be below split_threshold, but it is not required. If both split_threshold and + /// initial_split_threshold qualify, the largest number of target shards will be used. + /// + /// Does not apply to already sharded tenants: changing initial_split_threshold or + /// initial_split_shards is not retroactive for already-sharded tenants. + /// + /// None or 0 disables initial splits. + pub initial_split_threshold: Option, + + /// The number of shards to split into when reaching initial_split_threshold. Will + /// be clamped to max_split_shards. + /// + /// 0 or 1 disables initial splits. Has no effect if initial_split_threshold is disabled. + pub initial_split_shards: u8, + // TODO: make this cfg(feature = "testing") pub neon_local_repo_dir: Option, @@ -412,17 +441,17 @@ pub struct Config { pub start_as_candidate: bool, - pub http_service_port: i32, - pub long_reconcile_threshold: Duration, pub use_https_pageserver_api: bool, pub use_https_safekeeper_api: bool, - pub ssl_ca_cert: Option, + pub ssl_ca_certs: Vec, pub timelines_onto_safekeepers: bool, + + pub use_local_compute_notifications: bool, } impl From for ApiError { @@ -499,6 +528,9 @@ pub struct Service { /// This waits for initial reconciliation with pageservers to complete. Until this barrier /// passes, it isn't safe to do any actions that mutate tenants. pub(crate) startup_complete: Barrier, + + /// HTTP client with proper CA certs. 
+ http_client: reqwest::Client, } impl From for ApiError { @@ -574,6 +606,22 @@ enum TenantShardSplitAbortError { Unavailable, } +/// Inputs for computing a target shard count for a tenant. +struct ShardSplitInputs { + /// Current shard count. + shard_count: ShardCount, + /// Total size of largest timeline summed across all shards. + max_logical_size: u64, + /// Size-based split threshold. Zero if size-based splits are disabled. + split_threshold: u64, + /// Upper bound on target shards. 0 or 1 disables splits. + max_split_shards: u8, + /// Initial split threshold. Zero if initial splits are disabled. + initial_split_threshold: u64, + /// Number of shards for initial splits. 0 or 1 disables initial splits. + initial_split_shards: u8, +} + struct ShardUpdate { tenant_shard_id: TenantShardId, placement_policy: PlacementPolicy, @@ -626,6 +674,10 @@ impl Service { &self.config } + pub fn get_http_client(&self) -> &reqwest::Client { + &self.http_client + } + /// Called once on startup, this function attempts to contact all pageservers to build an up-to-date /// view of the world, and determine which pageservers are responsive. 
#[instrument(skip_all)] @@ -924,8 +976,8 @@ impl Service { let response = node .with_client_retries( |client| async move { client.list_location_config().await }, + &self.http_client, &self.config.pageserver_jwt_token, - &self.config.ssl_ca_cert, 1, 5, timeout, @@ -1023,20 +1075,12 @@ impl Service { break; } - let client = match PageserverClient::new( + let client = PageserverClient::new( node.get_id(), + self.http_client.clone(), node.base_url(), self.config.pageserver_jwt_token.as_deref(), - self.config.ssl_ca_cert.clone(), - ) { - Ok(client) => client, - Err(e) => { - tracing::error!( - "Failed to create client to detach unknown shard {tenant_shard_id} on pageserver {node_id}: {e}" - ); - continue; - } - }; + ); match client .location_config( tenant_shard_id, @@ -1614,17 +1658,36 @@ impl Service { let cancel = CancellationToken::new(); let reconcilers_cancel = cancel.child_token(); + let mut http_client = reqwest::Client::builder(); + // We intentionally disable the connection pool, so every request will create its own TCP connection. + // It's especially important for heartbeaters to notice more network problems. + // + // TODO: It makes sense to use this client only in heartbeaters and create a second one with + // connection pooling for everything else. But reqwest::Client may create a connection without + // ever using it (it uses hyper's Client under the hood): + // https://github.com/hyperium/hyper-util/blob/d51318df3461d40e5f5e5ca163cb3905ac960209/src/client/legacy/client.rs#L415 + // + // Because of a bug in hyper0::Connection::graceful_shutdown such connections hang during + // graceful server shutdown: https://github.com/hyperium/hyper/issues/2730 + // + // The bug has been fixed in hyper v1, so keep alive may be enabled only after we migrate to hyper1. 
+ http_client = http_client.pool_max_idle_per_host(0); + for ssl_ca_cert in &config.ssl_ca_certs { + http_client = http_client.add_root_certificate(ssl_ca_cert.clone()); + } + let http_client = http_client.build()?; + let heartbeater_ps = Heartbeater::new( + http_client.clone(), config.pageserver_jwt_token.clone(), - config.ssl_ca_cert.clone(), config.max_offline_interval, config.max_warming_up_interval, cancel.clone(), ); let heartbeater_sk = Heartbeater::new( + http_client.clone(), config.safekeeper_jwt_token.clone(), - config.ssl_ca_cert.clone(), config.max_offline_interval, config.max_warming_up_interval, cancel.clone(), @@ -1648,7 +1711,7 @@ impl Service { ))), config: config.clone(), persistence, - compute_hook: Arc::new(ComputeHook::new(config.clone())), + compute_hook: Arc::new(ComputeHook::new(config.clone())?), result_tx, heartbeater_ps, heartbeater_sk, @@ -1667,6 +1730,7 @@ impl Service { reconcilers_gate: Gate::default(), tenant_op_locks: Default::default(), node_op_locks: Default::default(), + http_client, }); let result_task_this = this.clone(); @@ -1972,8 +2036,8 @@ impl Service { let configs = match node .with_client_retries( |client| async move { client.list_location_config().await }, + &self.http_client, &self.config.pageserver_jwt_token, - &self.config.ssl_ca_cert, 1, 5, SHORT_RECONCILE_TIMEOUT, @@ -2051,8 +2115,8 @@ impl Service { .location_config(tenant_shard_id, config, None, false) .await }, + &self.http_client, &self.config.pageserver_jwt_token, - &self.config.ssl_ca_cert, 1, 5, SHORT_RECONCILE_TIMEOUT, @@ -3194,11 +3258,10 @@ impl Service { for tenant_shard_id in shard_ids { let client = PageserverClient::new( node.get_id(), + self.http_client.clone(), node.base_url(), self.config.pageserver_jwt_token.as_deref(), - self.config.ssl_ca_cert.clone(), - ) - .map_err(|e| passthrough_api_error(&node, e))?; + ); tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",); @@ -3257,11 +3320,10 @@ impl Service { for (tenant_shard_id, 
node) in targets { let client = PageserverClient::new( node.get_id(), + self.http_client.clone(), node.base_url(), self.config.pageserver_jwt_token.as_deref(), - self.config.ssl_ca_cert.clone(), - ) - .map_err(|e| passthrough_api_error(&node, e))?; + ); futs.push(async move { let result = client .tenant_secondary_download(tenant_shard_id, wait) @@ -3325,7 +3387,10 @@ impl Service { } } - pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result { + pub(crate) async fn tenant_delete( + self: &Arc, + tenant_id: TenantId, + ) -> Result { let _tenant_lock = trace_exclusive_lock(&self.tenant_op_locks, tenant_id, TenantOperations::Delete).await; @@ -3383,8 +3448,8 @@ impl Service { .tenant_delete(TenantShardId::unsharded(tenant_id)) .await }, + &self.http_client, &self.config.pageserver_jwt_token, - &self.config.ssl_ca_cert, 1, 3, RECONCILE_TIMEOUT, @@ -3433,6 +3498,11 @@ impl Service { ); }; + // Delete the tenant from safekeepers (if needed) + self.tenant_delete_safekeepers(tenant_id) + .instrument(tracing::info_span!("tenant_delete_safekeepers", %tenant_id)) + .await?; + // Success is represented as 404, to imitate the existing pageserver deletion API Ok(StatusCode::NOT_FOUND) } @@ -3531,8 +3601,8 @@ impl Service { async fn create_one( tenant_shard_id: TenantShardId, locations: ShardMutationLocations, + http_client: reqwest::Client, jwt: Option, - ssl_ca_cert: Option, create_req: TimelineCreateRequest, ) -> Result { let latest = locations.latest.node; @@ -3545,8 +3615,7 @@ impl Service { ); let client = - PageserverClient::new(latest.get_id(), latest.base_url(), jwt.as_deref(), ssl_ca_cert.clone()) - .map_err(|e| passthrough_api_error(&latest, e))?; + PageserverClient::new(latest.get_id(), http_client.clone(), latest.base_url(), jwt.as_deref()); let timeline_info = client .timeline_create(tenant_shard_id, &create_req) @@ -3567,11 +3636,10 @@ impl Service { let client = PageserverClient::new( location.node.get_id(), + http_client.clone(), 
location.node.base_url(), jwt.as_deref(), - ssl_ca_cert.clone(), - ) - .map_err(|e| passthrough_api_error(&location.node, e))?; + ); let res = client .timeline_create(tenant_shard_id, &create_req) @@ -3599,8 +3667,8 @@ impl Service { let timeline_info = create_one( shard_zero_tid, shard_zero_locations, + self.http_client.clone(), self.config.pageserver_jwt_token.clone(), - self.config.ssl_ca_cert.clone(), create_req.clone(), ) .await?; @@ -3629,8 +3697,8 @@ impl Service { Box::pin(create_one( tenant_shard_id, mutation_locations, + self.http_client.clone(), jwt.clone(), - self.config.ssl_ca_cert.clone(), create_req, )) }, @@ -3713,16 +3781,15 @@ impl Service { tenant_shard_id: TenantShardId, timeline_id: TimelineId, node: Node, + http_client: reqwest::Client, jwt: Option, - ssl_ca_cert: Option, req: TimelineArchivalConfigRequest, ) -> Result<(), ApiError> { tracing::info!( "Setting archival config of timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", ); - let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref(), ssl_ca_cert) - .map_err(|e| passthrough_api_error(&node, e))?; + let client = PageserverClient::new(node.get_id(), http_client, node.base_url(), jwt.as_deref()); client .timeline_archival_config(tenant_shard_id, timeline_id, &req) @@ -3744,8 +3811,8 @@ impl Service { tenant_shard_id, timeline_id, node, + self.http_client.clone(), self.config.pageserver_jwt_token.clone(), - self.config.ssl_ca_cert.clone(), req.clone(), )) }) @@ -3782,16 +3849,15 @@ impl Service { tenant_shard_id: TenantShardId, timeline_id: TimelineId, node: Node, + http_client: reqwest::Client, jwt: Option, - ssl_ca_cert: Option, behavior: Option, ) -> Result<(ShardNumber, models::detach_ancestor::AncestorDetached), ApiError> { tracing::info!( "Detaching timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", ); - let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref(), ssl_ca_cert) - 
.map_err(|e| passthrough_api_error(&node, e))?; + let client = PageserverClient::new(node.get_id(), http_client, node.base_url(), jwt.as_deref()); client .timeline_detach_ancestor(tenant_shard_id, timeline_id, behavior) @@ -3830,8 +3896,8 @@ impl Service { tenant_shard_id, timeline_id, node, + self.http_client.clone(), self.config.pageserver_jwt_token.clone(), - self.config.ssl_ca_cert.clone(), behavior, )) }) @@ -3884,17 +3950,16 @@ impl Service { tenant_shard_id: TenantShardId, timeline_id: TimelineId, node: Node, + http_client: reqwest::Client, jwt: Option, - ssl_ca_cert: Option, dir: BlockUnblock, ) -> Result<(), ApiError> { let client = PageserverClient::new( node.get_id(), + http_client, node.base_url(), jwt.as_deref(), - ssl_ca_cert, - ) - .map_err(|e| passthrough_api_error(&node, e))?; + ); client .timeline_block_unblock_gc(tenant_shard_id, timeline_id, dir) @@ -3913,8 +3978,8 @@ impl Service { tenant_shard_id, timeline_id, node, + self.http_client.clone(), self.config.pageserver_jwt_token.clone(), - self.config.ssl_ca_cert.clone(), dir, )) }) @@ -3924,6 +3989,75 @@ impl Service { Ok(()) } + pub(crate) async fn tenant_timeline_lsn_lease( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + lsn: Lsn, + ) -> Result { + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineLsnLease, + ) + .await; + + let targets = { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + + // If the request got an unsharded tenant id, then apply + // the operation to all shards. Otherwise, apply it to a specific shard. 
+ let shards_range = TenantShardId::tenant_range(tenant_id); + + for (tenant_shard_id, shard) in locked.tenants.range(shards_range) { + if let Some(node_id) = shard.intent.get_attached() { + let node = locked + .nodes + .get(node_id) + .expect("Pageservers may not be deleted while referenced"); + + targets.push((*tenant_shard_id, node.clone())); + } + } + targets + }; + + let res = self + .tenant_for_shards_api( + targets, + |tenant_shard_id, client| async move { + client + .timeline_lease_lsn(tenant_shard_id, timeline_id, lsn) + .await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + + let mut valid_until = None; + for r in res { + match r { + Ok(lease) => { + if let Some(ref mut valid_until) = valid_until { + *valid_until = std::cmp::min(*valid_until, lease.valid_until); + } else { + valid_until = Some(lease.valid_until); + } + } + Err(e) => { + return Err(ApiError::InternalServerError(anyhow::anyhow!(e))); + } + } + } + Ok(LsnLease { + valid_until: valid_until.unwrap_or_else(SystemTime::now), + }) + } + pub(crate) async fn tenant_timeline_download_heatmap_layers( &self, tenant_shard_id: TenantShardId, @@ -4042,8 +4176,8 @@ impl Service { let r = node .with_client_retries( |client| op(tenant_shard_id, client), + &self.http_client, &self.config.pageserver_jwt_token, - &self.config.ssl_ca_cert, warn_threshold, max_retries, timeout, @@ -4267,15 +4401,14 @@ impl Service { tenant_shard_id: TenantShardId, timeline_id: TimelineId, node: Node, + http_client: reqwest::Client, jwt: Option, - ssl_ca_cert: Option, ) -> Result { tracing::info!( "Deleting timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", ); - let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref(), ssl_ca_cert) - .map_err(|e| passthrough_api_error(&node, e))?; + let client = PageserverClient::new(node.get_id(), http_client, node.base_url(), jwt.as_deref()); let res = client .timeline_delete(tenant_shard_id, timeline_id) .await; @@ 
-4301,8 +4434,8 @@ impl Service { tenant_shard_id, timeline_id, node, + self.http_client.clone(), self.config.pageserver_jwt_token.clone(), - self.config.ssl_ca_cert.clone(), )) }) .await?; @@ -4324,8 +4457,8 @@ impl Service { shard_zero_tid, timeline_id, shard_zero_locations.latest.node, + self.http_client.clone(), self.config.pageserver_jwt_token.clone(), - self.config.ssl_ca_cert.clone(), ) .await?; Ok(shard_zero_status) @@ -4760,8 +4893,8 @@ impl Service { client.location_config(child_id, config, None, false).await }, + &self.http_client, &self.config.pageserver_jwt_token, - &self.config.ssl_ca_cert, 1, 10, Duration::from_secs(5), @@ -5363,11 +5496,10 @@ impl Service { } = target; let client = PageserverClient::new( node.get_id(), + self.http_client.clone(), node.base_url(), self.config.pageserver_jwt_token.as_deref(), - self.config.ssl_ca_cert.clone(), - ) - .map_err(|e| passthrough_api_error(node, e))?; + ); let response = client .tenant_shard_split( *parent_id, @@ -5407,6 +5539,8 @@ impl Service { } } + pausable_failpoint!("shard-split-pre-complete"); + // TODO: if the pageserver restarted concurrently with our split API call, // the actual generation of the child shard might differ from the generation // we expect it to have. 
In order for our in-database generation to end up @@ -5849,11 +5983,10 @@ impl Service { let client = PageserverClient::new( node.get_id(), + self.http_client.clone(), node.base_url(), self.config.pageserver_jwt_token.as_deref(), - self.config.ssl_ca_cert.clone(), - ) - .map_err(|e| passthrough_api_error(&node, e))?; + ); let scan_result = client .tenant_scan_remote_storage(tenant_id) @@ -7087,6 +7220,7 @@ impl Service { units, gate_guard, &self.reconcilers_cancel, + self.http_client.clone(), ) } @@ -7494,8 +7628,8 @@ impl Service { match attached_node .with_client_retries( |client| async move { client.tenant_heatmap_upload(tenant_shard_id).await }, + &self.http_client, &self.config.pageserver_jwt_token, - &self.config.ssl_ca_cert, 3, 10, SHORT_RECONCILE_TIMEOUT, @@ -7531,8 +7665,8 @@ impl Service { ) .await }, + &self.http_client, &self.config.pageserver_jwt_token, - &self.config.ssl_ca_cert, 3, 10, SHORT_RECONCILE_TIMEOUT, @@ -7559,99 +7693,232 @@ impl Service { } } - /// Asynchronously split a tenant that's eligible for automatic splits: + /// Asynchronously split a tenant that's eligible for automatic splits. At most one tenant will + /// be split per call. /// - /// * The tenant is unsharded. - /// * The logical size of its largest timeline exceeds split_threshold. - /// * The tenant's scheduling policy is active. + /// Two sets of criteria are used: initial splits and size-based splits (in that order). + /// Initial splits are used to eagerly split unsharded tenants that may be performing initial + /// ingestion, since sharded tenants have significantly better ingestion throughput. Size-based + /// splits are used to bound the maximum shard size and balance out load. /// - /// At most one tenant will be split per call: the one with the largest max logical size. It - /// will split 1 → 8 shards. + /// Splits are based on max_logical_size, i.e. the logical size of the largest timeline in a + /// tenant. 
We use this instead of the total logical size because branches will duplicate + /// logical size without actually using more storage. We could also use visible physical size, + /// but this might overestimate tenants that frequently churn branches. + /// + /// Initial splits (initial_split_threshold): + /// * Applies to tenants with 1 shard. + /// * The largest timeline (max_logical_size) exceeds initial_split_threshold. + /// * Splits into initial_split_shards. + /// + /// Size-based splits (split_threshold): + /// * Applies to all tenants. + /// * The largest timeline (max_logical_size) divided by shard count exceeds split_threshold. + /// * Splits such that max_logical_size / shard_count <= split_threshold, in powers of 2. + /// + /// Tenant shards are ordered by descending max_logical_size, first initial split candidates + /// then size-based split candidates. The first matching candidate is split. + /// + /// The shard count is clamped to max_split_shards. If a candidate is eligible for both initial + /// and size-based splits, the largest shard count will be used. /// /// An unsharded tenant will get DEFAULT_STRIPE_SIZE, regardless of what its ShardIdentity says. /// A sharded tenant will retain its stripe size, as splits do not allow changing it. /// - /// TODO: consider splitting based on total logical size rather than max logical size. - /// /// TODO: consider spawning multiple splits in parallel: this is only called once every 20 /// seconds, so a large backlog can take a long time, and if a tenant fails to split it will /// block all other splits. async fn autosplit_tenants(self: &Arc) { - let Some(split_threshold) = self.config.split_threshold else { - return; // auto-splits are disabled - }; - if split_threshold == 0 { + // If max_split_shards is set to 0 or 1, we can't split. + let max_split_shards = self.config.max_split_shards; + if max_split_shards <= 1 { return; } - // Fetch the largest eligible shards by logical size. 
- const MAX_SHARDS: ShardCount = ShardCount::new(8); + // If initial_split_shards is set to 0 or 1, disable initial splits. + let mut initial_split_threshold = self.config.initial_split_threshold.unwrap_or(0); + let initial_split_shards = self.config.initial_split_shards; + if initial_split_shards <= 1 { + initial_split_threshold = 0; + } - let mut top_n = self - .get_top_tenant_shards(&TopTenantShardsRequest { - order_by: TenantSorting::MaxLogicalSize, - limit: 10, - where_shards_lt: Some(MAX_SHARDS), - where_gt: Some(split_threshold), - }) - .await; + // If no split_threshold nor initial_split_threshold, disable autosplits. + let split_threshold = self.config.split_threshold.unwrap_or(0); + if split_threshold == 0 && initial_split_threshold == 0 { + return; + } + + // Fetch split candidates in prioritized order. + // + // If initial splits are enabled, fetch eligible tenants first. We prioritize initial splits + // over size-based splits, since these are often performing initial ingestion and rely on + // splits to improve ingest throughput. + let mut candidates = Vec::new(); + + if initial_split_threshold > 0 { + // Initial splits: fetch tenants with 1 shard where the logical size of the largest + // timeline exceeds the initial split threshold. + let initial_candidates = self + .get_top_tenant_shards(&TopTenantShardsRequest { + order_by: TenantSorting::MaxLogicalSize, + limit: 10, + where_shards_lt: Some(ShardCount(2)), + where_gt: Some(initial_split_threshold), + }) + .await; + candidates.extend(initial_candidates); + } + + if split_threshold > 0 { + // Size-based splits: fetch tenants where the logical size of the largest timeline + // divided by shard count exceeds the split threshold. + // + // max_logical_size is only tracked on shard 0, and contains the total logical size + // across all shards. We have to order and filter by MaxLogicalSizePerShard, i.e. 
+ // max_logical_size / shard_count, such that we only receive tenants that are actually + // eligible for splits. But we still use max_logical_size for later split calculations. + let size_candidates = self + .get_top_tenant_shards(&TopTenantShardsRequest { + order_by: TenantSorting::MaxLogicalSizePerShard, + limit: 10, + where_shards_lt: Some(ShardCount(max_split_shards)), + where_gt: Some(split_threshold), + }) + .await; + #[cfg(feature = "testing")] + assert!( + size_candidates.iter().all(|c| c.id.is_shard_zero()), + "MaxLogicalSizePerShard returned non-zero shard: {size_candidates:?}", + ); + candidates.extend(size_candidates); + } // Filter out tenants in a prohibiting scheduling mode. { let state = self.inner.read().unwrap(); - top_n.retain(|i| { + candidates.retain(|i| { let policy = state.tenants.get(&i.id).map(|s| s.get_scheduling_policy()); policy == Some(ShardSchedulingPolicy::Active) }); } - let Some(split_candidate) = top_n.into_iter().next() else { - debug!("No split-elegible shards found"); + // Pick the first candidate to split. This will generally always be the first one in + // candidates, but we defensively skip candidates that end up not actually splitting. + let Some((candidate, new_shard_count)) = candidates + .into_iter() + .filter_map(|candidate| { + let new_shard_count = Self::compute_split_shards(ShardSplitInputs { + shard_count: candidate.id.shard_count, + max_logical_size: candidate.max_logical_size, + split_threshold, + max_split_shards, + initial_split_threshold, + initial_split_shards, + }); + new_shard_count.map(|shards| (candidate, shards.count())) + }) + .next() + else { + debug!("no split-eligible tenants found"); return; }; - // We spawn a task to run this, so it's exactly like some external API client requesting it. - // We don't want to block the background reconcile loop on this. 
- info!( - "Auto-splitting tenant for size threshold {split_threshold}: current size {split_candidate:?}" - ); - // Retain the stripe size of sharded tenants, as splits don't allow changing it. Otherwise, // use DEFAULT_STRIPE_SIZE for unsharded tenants -- their stripe size doesn't really matter, // and if we change the default stripe size we want to use the new default rather than an // old, persisted stripe size. - let new_stripe_size = match split_candidate.id.shard_count.count() { + let new_stripe_size = match candidate.id.shard_count.count() { 0 => panic!("invalid shard count 0"), 1 => Some(ShardParameters::DEFAULT_STRIPE_SIZE), 2.. => None, }; + // We spawn a task to run this, so it's exactly like some external API client requesting + // it. We don't want to block the background reconcile loop on this. + let old_shard_count = candidate.id.shard_count.count(); + info!( + "auto-splitting tenant {old_shard_count} → {new_shard_count} shards, \ + current size {candidate:?} (split_threshold={split_threshold} \ + initial_split_threshold={initial_split_threshold})" + ); + let this = self.clone(); tokio::spawn( async move { match this .tenant_shard_split( - split_candidate.id.tenant_id, + candidate.id.tenant_id, TenantShardSplitRequest { - // Always split to the max number of shards: this avoids stepping - // through intervening shard counts and encountering the overhead of a - // split+cleanup each time as a tenant grows, and is not too expensive - // because our max shard count is relatively low anyway. This policy - // will be adjusted in future once we support higher shard count. 
- new_shard_count: MAX_SHARDS.literal(), + new_shard_count, new_stripe_size, }, ) .await { - Ok(_) => info!("Successful auto-split"), - Err(err) => error!("Auto-split failed: {err}"), + Ok(_) => { + info!("successful auto-split {old_shard_count} → {new_shard_count} shards") + } + Err(err) => error!("auto-split failed: {err}"), } } - .instrument(info_span!("auto_split", tenant_id=%split_candidate.id.tenant_id)), + .instrument(info_span!("auto_split", tenant_id=%candidate.id.tenant_id)), ); } + /// Returns the number of shards to split a tenant into, or None if the tenant shouldn't split, + /// based on the total logical size of the largest timeline summed across all shards. Uses the + /// larger of size-based and initial splits, clamped to max_split_shards. + /// + /// NB: the thresholds are exclusive, since TopTenantShardsRequest uses where_gt. + fn compute_split_shards(inputs: ShardSplitInputs) -> Option { + let ShardSplitInputs { + shard_count, + max_logical_size, + split_threshold, + max_split_shards, + initial_split_threshold, + initial_split_shards, + } = inputs; + + let mut new_shard_count: u8 = shard_count.count(); + + // Size-based splits. Ensures max_logical_size / new_shard_count <= split_threshold, using + // power-of-two shard counts. + // + // If the current shard count is not a power of two, and does not exceed split_threshold, + // then we leave it alone rather than forcing a power-of-two split. + if split_threshold > 0 + && max_logical_size.div_ceil(split_threshold) > shard_count.count() as u64 + { + new_shard_count = max_logical_size + .div_ceil(split_threshold) + .checked_next_power_of_two() + .unwrap_or(u8::MAX as u64) + .try_into() + .unwrap_or(u8::MAX); + } + + // Initial splits. Use the larger of size-based and initial split shard counts. This only + // applies to unsharded tenants, i.e. changes to initial_split_threshold or + // initial_split_shards are not retroactive for sharded tenants. 
+ if initial_split_threshold > 0 + && shard_count.count() <= 1 + && max_logical_size > initial_split_threshold + { + new_shard_count = new_shard_count.max(initial_split_shards); + } + + // Clamp to max shards. + new_shard_count = new_shard_count.min(max_split_shards); + + // Don't split if we're not increasing the shard count. + if new_shard_count <= shard_count.count() { + return None; + } + + Some(ShardCount(new_shard_count)) + } + /// Fetches the top tenant shards from every node, in descending order of /// max logical size. Any node errors will be logged and ignored. async fn get_top_tenant_shards( @@ -7672,8 +7939,8 @@ impl Service { futures.push(async move { node.with_client_retries( |client| async move { client.top_tenant_shards(request.clone()).await }, + &self.http_client, &self.config.pageserver_jwt_token, - &self.config.ssl_ca_cert, 3, 3, Duration::from_secs(5), @@ -7792,8 +8059,8 @@ impl Service { match node .with_client_retries( |client| async move { client.tenant_secondary_status(tenant_shard_id).await }, + &self.http_client, &self.config.pageserver_jwt_token, - &self.config.ssl_ca_cert, 1, 3, Duration::from_millis(250), @@ -8414,3 +8681,329 @@ impl Service { }) } } + +#[cfg(test)] +mod tests { + use super::*; + + /// Tests Service::compute_split_shards. For readability, this specifies sizes in GBs rather + /// than bytes. Note that max_logical_size is the total logical size of the largest timeline + /// summed across all shards. + #[test] + fn compute_split_shards() { + // Size-based split: two shards have a 500 GB timeline, which need to split into 8 shards + // that are <= 64 GB, + assert_eq!( + Service::compute_split_shards(ShardSplitInputs { + shard_count: ShardCount(2), + max_logical_size: 500, + split_threshold: 64, + max_split_shards: 16, + initial_split_threshold: 0, + initial_split_shards: 0, + }), + Some(ShardCount(8)) + ); + + // Size-based split: noop at or below threshold, fires above. 
+ assert_eq!( + Service::compute_split_shards(ShardSplitInputs { + shard_count: ShardCount(2), + max_logical_size: 127, + split_threshold: 64, + max_split_shards: 16, + initial_split_threshold: 0, + initial_split_shards: 0, + }), + None, + ); + assert_eq!( + Service::compute_split_shards(ShardSplitInputs { + shard_count: ShardCount(2), + max_logical_size: 128, + split_threshold: 64, + max_split_shards: 16, + initial_split_threshold: 0, + initial_split_shards: 0, + }), + None, + ); + assert_eq!( + Service::compute_split_shards(ShardSplitInputs { + shard_count: ShardCount(2), + max_logical_size: 129, + split_threshold: 64, + max_split_shards: 16, + initial_split_threshold: 0, + initial_split_shards: 0, + }), + Some(ShardCount(4)), + ); + + // Size-based split: clamped to max_split_shards. + assert_eq!( + Service::compute_split_shards(ShardSplitInputs { + shard_count: ShardCount(2), + max_logical_size: 10000, + split_threshold: 64, + max_split_shards: 16, + initial_split_threshold: 0, + initial_split_shards: 0, + }), + Some(ShardCount(16)) + ); + + // Size-based split: tenant already at or beyond max_split_shards is not split. + assert_eq!( + Service::compute_split_shards(ShardSplitInputs { + shard_count: ShardCount(16), + max_logical_size: 10000, + split_threshold: 64, + max_split_shards: 16, + initial_split_threshold: 0, + initial_split_shards: 0, + }), + None + ); + + assert_eq!( + Service::compute_split_shards(ShardSplitInputs { + shard_count: ShardCount(32), + max_logical_size: 10000, + split_threshold: 64, + max_split_shards: 16, + initial_split_threshold: 0, + initial_split_shards: 0, + }), + None + ); + + // Size-based split: a non-power-of-2 shard count is normalized to power-of-2 if it + // exceeds split_threshold (i.e. a 3-shard tenant splits into 8, not 6). 
+ assert_eq!( + Service::compute_split_shards(ShardSplitInputs { + shard_count: ShardCount(3), + max_logical_size: 320, + split_threshold: 64, + max_split_shards: 16, + initial_split_threshold: 0, + initial_split_shards: 0, + }), + Some(ShardCount(8)) + ); + + // Size-based split: a non-power-of-2 shard count is not normalized to power-of-2 if the + // existing shards are below or at split_threshold, but splits into 4 if it exceeds it. + assert_eq!( + Service::compute_split_shards(ShardSplitInputs { + shard_count: ShardCount(3), + max_logical_size: 191, + split_threshold: 64, + max_split_shards: 16, + initial_split_threshold: 0, + initial_split_shards: 0, + }), + None + ); + assert_eq!( + Service::compute_split_shards(ShardSplitInputs { + shard_count: ShardCount(3), + max_logical_size: 192, + split_threshold: 64, + max_split_shards: 16, + initial_split_threshold: 0, + initial_split_shards: 0, + }), + None + ); + assert_eq!( + Service::compute_split_shards(ShardSplitInputs { + shard_count: ShardCount(3), + max_logical_size: 193, + split_threshold: 64, + max_split_shards: 16, + initial_split_threshold: 0, + initial_split_shards: 0, + }), + Some(ShardCount(4)) + ); + + // Initial split: tenant has a 10 GB timeline, split into 4 shards. + assert_eq!( + Service::compute_split_shards(ShardSplitInputs { + shard_count: ShardCount(1), + max_logical_size: 10, + split_threshold: 0, + max_split_shards: 16, + initial_split_threshold: 8, + initial_split_shards: 4, + }), + Some(ShardCount(4)) + ); + + // Initial split: 0 ShardCount is equivalent to 1. + assert_eq!( + Service::compute_split_shards(ShardSplitInputs { + shard_count: ShardCount(0), + max_logical_size: 10, + split_threshold: 0, + max_split_shards: 16, + initial_split_threshold: 8, + initial_split_shards: 4, + }), + Some(ShardCount(4)) + ); + + // Initial split: at or below threshold is noop. 
+ assert_eq!( + Service::compute_split_shards(ShardSplitInputs { + shard_count: ShardCount(1), + max_logical_size: 7, + split_threshold: 0, + max_split_shards: 16, + initial_split_threshold: 8, + initial_split_shards: 4, + }), + None, + ); + assert_eq!( + Service::compute_split_shards(ShardSplitInputs { + shard_count: ShardCount(1), + max_logical_size: 8, + split_threshold: 0, + max_split_shards: 16, + initial_split_threshold: 8, + initial_split_shards: 4, + }), + None, + ); + assert_eq!( + Service::compute_split_shards(ShardSplitInputs { + shard_count: ShardCount(1), + max_logical_size: 9, + split_threshold: 0, + max_split_shards: 16, + initial_split_threshold: 8, + initial_split_shards: 4, + }), + Some(ShardCount(4)) + ); + + // Initial split: already sharded tenant is not affected, even if above threshold and below + // shard count. + assert_eq!( + Service::compute_split_shards(ShardSplitInputs { + shard_count: ShardCount(2), + max_logical_size: 20, + split_threshold: 0, + max_split_shards: 16, + initial_split_threshold: 8, + initial_split_shards: 4, + }), + None, + ); + + // Initial split: clamped to max_shards. + assert_eq!( + Service::compute_split_shards(ShardSplitInputs { + shard_count: ShardCount(1), + max_logical_size: 10, + split_threshold: 0, + max_split_shards: 3, + initial_split_threshold: 8, + initial_split_shards: 4, + }), + Some(ShardCount(3)), + ); + + // Initial+size split: tenant eligible for both will use the larger shard count. 
+ assert_eq!( + Service::compute_split_shards(ShardSplitInputs { + shard_count: ShardCount(1), + max_logical_size: 10, + split_threshold: 64, + max_split_shards: 16, + initial_split_threshold: 8, + initial_split_shards: 4, + }), + Some(ShardCount(4)), + ); + assert_eq!( + Service::compute_split_shards(ShardSplitInputs { + shard_count: ShardCount(1), + max_logical_size: 500, + split_threshold: 64, + max_split_shards: 16, + initial_split_threshold: 8, + initial_split_shards: 4, + }), + Some(ShardCount(8)), + ); + + // Initial+size split: sharded tenant is only eligible for size-based split. + assert_eq!( + Service::compute_split_shards(ShardSplitInputs { + shard_count: ShardCount(2), + max_logical_size: 200, + split_threshold: 64, + max_split_shards: 16, + initial_split_threshold: 8, + initial_split_shards: 8, + }), + Some(ShardCount(4)), + ); + + // Initial+size split: uses the larger shard count even with initial_split_threshold above + // split_threshold. + assert_eq!( + Service::compute_split_shards(ShardSplitInputs { + shard_count: ShardCount(1), + max_logical_size: 10, + split_threshold: 4, + max_split_shards: 16, + initial_split_threshold: 8, + initial_split_shards: 8, + }), + Some(ShardCount(8)), + ); + + // Test backwards compatibility with production settings when initial/size-based splits were + // rolled out: a single split into 8 shards at 64 GB. Any already sharded tenants with <8 + // shards will split according to split_threshold. 
+ assert_eq!( + Service::compute_split_shards(ShardSplitInputs { + shard_count: ShardCount(1), + max_logical_size: 65, + split_threshold: 64, + max_split_shards: 8, + initial_split_threshold: 64, + initial_split_shards: 8, + }), + Some(ShardCount(8)), + ); + + assert_eq!( + Service::compute_split_shards(ShardSplitInputs { + shard_count: ShardCount(1), + max_logical_size: 64, + split_threshold: 64, + max_split_shards: 8, + initial_split_threshold: 64, + initial_split_shards: 8, + }), + None, + ); + + assert_eq!( + Service::compute_split_shards(ShardSplitInputs { + shard_count: ShardCount(2), + max_logical_size: 129, + split_threshold: 64, + max_split_shards: 8, + initial_split_threshold: 64, + initial_split_shards: 8, + }), + Some(ShardCount(4)), + ); + } +} diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs index a0419e0205..9c7a9e3798 100644 --- a/storage_controller/src/service/chaos_injector.rs +++ b/storage_controller/src/service/chaos_injector.rs @@ -4,7 +4,7 @@ use std::time::Duration; use pageserver_api::controller_api::ShardSchedulingPolicy; use rand::seq::SliceRandom; -use rand::thread_rng; +use rand::{Rng, thread_rng}; use tokio_util::sync::CancellationToken; use utils::id::NodeId; use utils::shard::TenantShardId; @@ -64,17 +64,22 @@ impl ChaosInjector { let mut interval = tokio::time::interval(self.interval); #[derive(Debug)] enum ChaosEvent { - ShuffleTenant, - ForceKill, + MigrationsToSecondary, + ForceKillController, + GracefulMigrationsAnywhere, } loop { let cron_interval = self.get_cron_interval_sleep_future(); let chaos_type = tokio::select! 
{ _ = interval.tick() => { - ChaosEvent::ShuffleTenant + if thread_rng().gen_bool(0.5) { + ChaosEvent::MigrationsToSecondary + } else { + ChaosEvent::GracefulMigrationsAnywhere + } } Some(_) = maybe_sleep(cron_interval) => { - ChaosEvent::ForceKill + ChaosEvent::ForceKillController } _ = cancel.cancelled() => { tracing::info!("Shutting down"); @@ -83,16 +88,29 @@ impl ChaosInjector { }; tracing::info!("Chaos iteration: {chaos_type:?}..."); match chaos_type { - ChaosEvent::ShuffleTenant => { - self.inject_chaos().await; + ChaosEvent::MigrationsToSecondary => { + self.inject_migrations_to_secondary(); } - ChaosEvent::ForceKill => { + ChaosEvent::GracefulMigrationsAnywhere => { + self.inject_graceful_migrations_anywhere(); + } + ChaosEvent::ForceKillController => { self.force_kill().await; } } } } + fn is_shard_eligible_for_chaos(&self, shard: &TenantShard) -> bool { + // - Skip non-active scheduling policies, so that a shard with a policy like Pause can + // be pinned without being disrupted by us. + // - Skip shards doing a graceful migration already, so that we allow these to run to + // completion rather than only exercising the first part and then cancelling with + // some other chaos. + matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active) + && shard.get_preferred_node().is_none() + } + /// If a shard has a secondary and attached location, then re-assign the secondary to be /// attached and the attached to be secondary. /// @@ -108,13 +126,7 @@ impl ChaosInjector { .get_mut(&tenant_shard_id) .expect("Held lock between choosing ID and this get"); - if !matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active) { - // Skip non-active scheduling policies, so that a shard with a policy like Pause can - // be pinned without being disrupted by us. 
- tracing::info!( - "Skipping shard {tenant_shard_id}: scheduling policy is {:?}", - shard.get_scheduling_policy() - ); + if !self.is_shard_eligible_for_chaos(shard) { return; } @@ -152,7 +164,77 @@ impl ChaosInjector { std::process::exit(1); } - async fn inject_chaos(&mut self) { + // Unlike [`Self::inject_migrations_to_secondary`], this function will not only cut over to secondary, it + // will migrate a tenant to a random node in its home AZ using a graceful migration of the same type + // that may be initiated by an API caller using prewarm=true. + // + // This is a much more expensive operation in terms of I/O and time, as we will fully warm up + // some new location in order to migrate the tenant there. For that reason we do far fewer of these. + fn inject_graceful_migrations_anywhere(&mut self) { + let batch_size = 1; + let mut inner = self.service.inner.write().unwrap(); + let (nodes, tenants, _scheduler) = inner.parts_mut(); + + let mut candidates = tenants + .values_mut() + .filter(|shard| self.is_shard_eligible_for_chaos(shard)) + .collect::>(); + + tracing::info!( + "Injecting chaos: found {} candidates for graceful migrations anywhere", + candidates.len() + ); + + let mut victims: Vec<&mut TenantShard> = Vec::new(); + + // Pick our victims: use a hand-rolled loop rather than choose_multiple() because we want + // to take the mutable refs from our candidates rather than ref'ing them. + while !candidates.is_empty() && victims.len() < batch_size { + let i = thread_rng().gen_range(0..candidates.len()); + victims.push(candidates.swap_remove(i)); + } + + for victim in victims.into_iter() { + // Find a node in the same AZ as the shard, or if the shard has no AZ preference, which + // is not where they are currently attached. 
+ let candidate_nodes = nodes + .values() + .filter(|node| { + if let Some(preferred_az) = victim.preferred_az() { + node.get_availability_zone_id() == preferred_az + } else if let Some(attached) = *victim.intent.get_attached() { + node.get_id() != attached + } else { + true + } + }) + .collect::>(); + + let Some(victim_node) = candidate_nodes.choose(&mut thread_rng()) else { + // This can happen if e.g. we are in a small region with only one pageserver per AZ. + tracing::info!( + "no candidate nodes found for migrating shard {tenant_shard_id} within its home AZ", + tenant_shard_id = victim.tenant_shard_id + ); + continue; + }; + + // This doesn't change intent immediately: next iteration of Service::optimize_all should do that. We avoid + // doing it here because applying optimizations requires dropping lock to do some async work to check the optimisation + // is valid given remote state, and it would be a shame to duplicate that dance here. + tracing::info!( + "Injecting chaos: migrate {} to {}", + victim.tenant_shard_id, + victim_node + ); + victim.set_preferred_node(Some(victim_node.get_id())); + } + } + + /// Migrations of attached locations to their secondary location. This exercises reconciliation in general, + /// live migration in particular, and the pageserver code for cleanly shutting down and starting up tenants + /// during such migrations. 
+ fn inject_migrations_to_secondary(&mut self) { // Pick some shards to interfere with let batch_size = 128; let mut inner = self.service.inner.write().unwrap(); diff --git a/storage_controller/src/service/safekeeper_reconciler.rs b/storage_controller/src/service/safekeeper_reconciler.rs index 4fa465c307..76e3162617 100644 --- a/storage_controller/src/service/safekeeper_reconciler.rs +++ b/storage_controller/src/service/safekeeper_reconciler.rs @@ -35,6 +35,10 @@ impl SafekeeperReconcilers { service: &Arc, reqs: Vec, ) { + tracing::info!( + "Scheduling {} pending safekeeper ops loaded from db", + reqs.len() + ); for req in reqs { self.schedule_request(service, req); } @@ -51,6 +55,22 @@ impl SafekeeperReconcilers { handle.cancel.cancel(); } } + /// Cancel ongoing reconciles for the given timeline + /// + /// Specifying `None` here only removes reconciles for the tenant-global reconciliation, + /// instead of doing this for all timelines of the tenant. + /// + /// Callers must remove the reconciles from the db manually + pub(crate) fn cancel_reconciles_for_timeline( + &mut self, + node_id: NodeId, + tenant_id: TenantId, + timeline_id: Option, + ) { + if let Some(handle) = self.reconcilers.get(&node_id) { + handle.cancel_reconciliation(tenant_id, timeline_id); + } + } } /// Initial load of the pending operations from the db @@ -58,7 +78,7 @@ pub(crate) async fn load_schedule_requests( service: &Arc, safekeepers: &HashMap, ) -> anyhow::Result> { - let pending_ops = service.persistence.list_pending_ops(None).await?; + let pending_ops = service.persistence.list_pending_ops().await?; let mut res = Vec::with_capacity(pending_ops.len()); for op_persist in pending_ops { let node_id = NodeId(op_persist.sk_id as u64); @@ -73,12 +93,21 @@ pub(crate) async fn load_schedule_requests( }; let sk = Box::new(sk.clone()); let tenant_id = TenantId::from_str(&op_persist.tenant_id)?; - let timeline_id = TimelineId::from_str(&op_persist.timeline_id)?; + let timeline_id = if 
!op_persist.timeline_id.is_empty() { + Some(TimelineId::from_str(&op_persist.timeline_id)?) + } else { + None + }; let host_list = match op_persist.op_kind { SafekeeperTimelineOpKind::Delete => Vec::new(), SafekeeperTimelineOpKind::Exclude => Vec::new(), SafekeeperTimelineOpKind::Pull => { // TODO this code is super hacky, it doesn't take migrations into account + let Some(timeline_id) = timeline_id else { + anyhow::bail!( + "timeline_id is empty for `pull` schedule request for {tenant_id}" + ); + }; let timeline_persist = service .persistence .get_timeline(tenant_id, timeline_id) @@ -129,14 +158,14 @@ pub(crate) struct ScheduleRequest { pub(crate) safekeeper: Box, pub(crate) host_list: Vec<(NodeId, String)>, pub(crate) tenant_id: TenantId, - pub(crate) timeline_id: TimelineId, + pub(crate) timeline_id: Option, pub(crate) generation: u32, pub(crate) kind: SafekeeperTimelineOpKind, } struct ReconcilerHandle { - tx: UnboundedSender<(ScheduleRequest, Arc)>, - ongoing_tokens: Arc>>, + tx: UnboundedSender<(ScheduleRequest, CancellationToken)>, + ongoing_tokens: Arc), CancellationToken>>, cancel: CancellationToken, } @@ -145,14 +174,20 @@ impl ReconcilerHandle { fn new_token_slot( &self, tenant_id: TenantId, - timeline_id: TimelineId, - ) -> Arc { + timeline_id: Option, + ) -> CancellationToken { let entry = self.ongoing_tokens.entry((tenant_id, timeline_id)); if let Entry::Occupied(entry) = &entry { let cancel: &CancellationToken = entry.get(); cancel.cancel(); } - entry.insert(Arc::new(self.cancel.child_token())).clone() + entry.insert(self.cancel.child_token()).clone() + } + /// Cancel an ongoing reconciliation + fn cancel_reconciliation(&self, tenant_id: TenantId, timeline_id: Option) { + if let Some((_, cancel)) = self.ongoing_tokens.remove(&(tenant_id, timeline_id)) { + cancel.cancel(); + } } fn schedule_reconcile(&self, req: ScheduleRequest) { let cancel = self.new_token_slot(req.tenant_id, req.timeline_id); @@ -165,7 +200,7 @@ impl ReconcilerHandle { pub(crate) 
struct SafekeeperReconciler { service: Arc, - rx: UnboundedReceiver<(ScheduleRequest, Arc)>, + rx: UnboundedReceiver<(ScheduleRequest, CancellationToken)>, cancel: CancellationToken, } @@ -201,20 +236,28 @@ impl SafekeeperReconciler { let kind = req.kind; let tenant_id = req.tenant_id; let timeline_id = req.timeline_id; + let node_id = req.safekeeper.skp.id; self.reconcile_one(req, req_cancel) .instrument(tracing::info_span!( "reconcile_one", ?kind, %tenant_id, - %timeline_id + ?timeline_id, + %node_id, )) .await; } } - async fn reconcile_one(&self, req: ScheduleRequest, req_cancel: Arc) { + async fn reconcile_one(&self, req: ScheduleRequest, req_cancel: CancellationToken) { let req_host = req.safekeeper.skp.host.clone(); match req.kind { SafekeeperTimelineOpKind::Pull => { + let Some(timeline_id) = req.timeline_id else { + tracing::warn!( + "ignoring invalid schedule request: timeline_id is empty for `pull`" + ); + return; + }; let our_id = req.safekeeper.get_id(); let http_hosts = req .host_list @@ -225,7 +268,7 @@ impl SafekeeperReconciler { let pull_req = PullTimelineRequest { http_hosts, tenant_id: req.tenant_id, - timeline_id: req.timeline_id, + timeline_id, }; self.reconcile_inner( req, @@ -243,7 +286,12 @@ impl SafekeeperReconciler { SafekeeperTimelineOpKind::Exclude => { // TODO actually exclude instead of delete here let tenant_id = req.tenant_id; - let timeline_id = req.timeline_id; + let Some(timeline_id) = req.timeline_id else { + tracing::warn!( + "ignoring invalid schedule request: timeline_id is empty for `exclude`" + ); + return; + }; self.reconcile_inner( req, async |client| client.delete_timeline(tenant_id, timeline_id).await, @@ -256,26 +304,97 @@ impl SafekeeperReconciler { } SafekeeperTimelineOpKind::Delete => { let tenant_id = req.tenant_id; - let timeline_id = req.timeline_id; - self.reconcile_inner( - req, - async |client| client.delete_timeline(tenant_id, timeline_id).await, - |_resp| { - tracing::info!("deleted timeline from {req_host}"); 
- }, - req_cancel, - ) - .await; + if let Some(timeline_id) = req.timeline_id { + let deleted = self.reconcile_inner( + req, + async |client| client.delete_timeline(tenant_id, timeline_id).await, + |_resp| { + tracing::info!(%tenant_id, %timeline_id, "deleted timeline from {req_host}"); + }, + req_cancel, + ) + .await; + if deleted { + self.delete_timeline_from_db(tenant_id, timeline_id).await; + } + } else { + let deleted = self + .reconcile_inner( + req, + async |client| client.delete_tenant(tenant_id).await, + |_resp| { + tracing::info!(%tenant_id, "deleted tenant from {req_host}"); + }, + req_cancel, + ) + .await; + if deleted { + self.delete_tenant_timelines_from_db(tenant_id).await; + } + } } } } + async fn delete_timeline_from_db(&self, tenant_id: TenantId, timeline_id: TimelineId) { + match self + .service + .persistence + .list_pending_ops_for_timeline(tenant_id, timeline_id) + .await + { + Ok(list) => { + if !list.is_empty() { + tracing::info!(%tenant_id, %timeline_id, "not deleting timeline from db as there is {} open reconciles", list.len()); + return; + } + } + Err(e) => { + tracing::warn!(%tenant_id, %timeline_id, "couldn't query pending ops: {e}"); + return; + } + } + tracing::info!(%tenant_id, %timeline_id, "deleting timeline from db after all reconciles succeeded"); + // In theory we could crash right after deleting the op from the db and right before reaching this, + // but then we'll boot up with a timeline that has deleted_at set, so hopefully we'll issue deletion ops for it again. 
+ if let Err(err) = self + .service + .persistence + .delete_timeline(tenant_id, timeline_id) + .await + { + tracing::warn!(%tenant_id, %timeline_id, "couldn't delete timeline from db: {err}"); + } + } + async fn delete_tenant_timelines_from_db(&self, tenant_id: TenantId) { + let timeline_list = match self + .service + .persistence + .list_timelines_for_tenant(tenant_id) + .await + { + Ok(timeline_list) => timeline_list, + Err(e) => { + tracing::warn!(%tenant_id, "couldn't query timelines: {e}"); + return; + } + }; + for timeline in timeline_list { + let Ok(timeline_id) = TimelineId::from_str(&timeline.timeline_id) else { + tracing::warn!("Invalid timeline ID in database {}", timeline.timeline_id); + continue; + }; + self.delete_timeline_from_db(tenant_id, timeline_id).await; + } + } + /// Returns whether the reconciliation happened successfully async fn reconcile_inner( &self, req: ScheduleRequest, closure: impl Fn(SafekeeperClient) -> F, log_success: impl FnOnce(T) -> U, - req_cancel: Arc, - ) where + req_cancel: CancellationToken, + ) -> bool + where F: Future>, { let jwt = self @@ -284,7 +403,6 @@ impl SafekeeperReconciler { .safekeeper_jwt_token .clone() .map(SecretString::from); - let ssl_ca_cert = self.service.config.ssl_ca_cert.clone(); loop { let res = req .safekeeper @@ -293,8 +411,8 @@ impl SafekeeperReconciler { let closure = &closure; async move { closure(client).await } }, + self.service.get_http_client(), &jwt, - &ssl_ca_cert, 3, 10, Duration::from_secs(10), @@ -320,11 +438,11 @@ impl SafekeeperReconciler { req.safekeeper.skp.host ); } - return; + return true; } Err(mgmt_api::Error::Cancelled) => { // On cancellation, the code that issued it will take care of removing db entries (if needed) - return; + return false; } Err(e) => { tracing::info!( diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index b5fb00a469..7f2c63b9af 100644 --- 
a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -1,4 +1,5 @@ -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; +use std::str::FromStr; use std::sync::Arc; use std::time::Duration; @@ -8,6 +9,7 @@ use crate::persistence::{ DatabaseError, SafekeeperTimelineOpKind, TimelinePendingOpPersistence, TimelinePersistence, }; use crate::safekeeper::Safekeeper; +use anyhow::Context; use http_utils::error::ApiError; use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy}; use pageserver_api::models::{self, SafekeeperInfo, SafekeepersInfo, TimelineInfo}; @@ -76,8 +78,8 @@ impl Service { for sk in timeline_persistence.sk_set.iter() { let sk_id = NodeId(*sk as u64); let safekeepers = safekeepers.clone(); + let http_client = self.http_client.clone(); let jwt = jwt.clone(); - let ssl_ca_cert = self.config.ssl_ca_cert.clone(); let req = req.clone(); joinset.spawn(async move { // Unwrap is fine as we already would have returned error above @@ -88,8 +90,8 @@ impl Service { let req = req.clone(); async move { client.create_timeline(&req).await } }, + &http_client, &jwt, - &ssl_ca_cert, 3, 3, SK_CREATE_TIMELINE_RECONCILE_TIMEOUT, @@ -265,7 +267,8 @@ impl Service { .get(&sk.id) .ok_or_else(|| { ApiError::InternalServerError(anyhow::anyhow!( - "Couldn't find safekeeper with id {remaining_id} to pull from" + "Couldn't find safekeeper with id {} to pull from", + sk.id )) })? 
.base_url(), @@ -279,7 +282,7 @@ impl Service { safekeeper: Box::new(sk.clone()), host_list, tenant_id, - timeline_id, + timeline_id: Some(timeline_id), generation: timeline_persist.generation as u32, kind: crate::persistence::SafekeeperTimelineOpKind::Pull, }; @@ -310,25 +313,32 @@ impl Service { ); return Ok(()); }; + self.persistence + .timeline_set_deleted_at(tenant_id, timeline_id) + .await?; let all_sks = tl .new_sk_set .iter() - .flat_map(|sks| { - sks.iter() - .map(|sk| (*sk, SafekeeperTimelineOpKind::Exclude)) - }) - .chain( - tl.sk_set - .iter() - .map(|v| (*v, SafekeeperTimelineOpKind::Delete)), - ) - .collect::>(); + .flatten() + .chain(tl.sk_set.iter()) + .collect::>(); // Schedule reconciliations + for &sk_id in all_sks.iter() { + let pending_op = TimelinePendingOpPersistence { + tenant_id: tenant_id.to_string(), + timeline_id: timeline_id.to_string(), + generation: tl.generation, + op_kind: SafekeeperTimelineOpKind::Delete, + sk_id: *sk_id, + }; + tracing::info!("writing pending op for sk id {sk_id}"); + self.persistence.insert_pending_op(pending_op).await?; + } { let mut locked = self.inner.write().unwrap(); - for (sk_id, kind) in all_sks { - let sk_id = NodeId(sk_id as u64); + for sk_id in all_sks { + let sk_id = NodeId(*sk_id as u64); let Some(sk) = locked.safekeepers.get(&sk_id) else { return Err(ApiError::InternalServerError(anyhow::anyhow!( "Couldn't find safekeeper with id {sk_id}" @@ -340,9 +350,9 @@ impl Service { // we don't use this for this kind, put a dummy value host_list: Vec::new(), tenant_id, - timeline_id, + timeline_id: Some(timeline_id), generation: tl.generation as u32, - kind, + kind: SafekeeperTimelineOpKind::Delete, }; locked.safekeeper_reconcilers.schedule_request(self, req); } @@ -350,6 +360,104 @@ impl Service { Ok(()) } + /// Perform tenant deletion on safekeepers. 
+ pub(super) async fn tenant_delete_safekeepers( + self: &Arc, + tenant_id: TenantId, + ) -> Result<(), ApiError> { + let timeline_list = self + .persistence + .list_timelines_for_tenant(tenant_id) + .await?; + + if timeline_list.is_empty() { + // Early exit: the tenant is either empty or not migrated to the storcon yet + tracing::info!("Skipping tenant delete as the timeline doesn't exist in db"); + return Ok(()); + } + + let timeline_list = timeline_list + .into_iter() + .map(|timeline| { + let timeline_id = TimelineId::from_str(&timeline.timeline_id) + .context("timeline id loaded from db") + .map_err(ApiError::InternalServerError)?; + Ok((timeline_id, timeline)) + }) + .collect::, ApiError>>()?; + + // Remove pending ops from db, and set `deleted_at`. + // We cancel them in a later iteration once we hold the state lock. + for (timeline_id, _timeline) in timeline_list.iter() { + self.persistence + .remove_pending_ops_for_timeline(tenant_id, Some(*timeline_id)) + .await?; + self.persistence + .timeline_set_deleted_at(tenant_id, *timeline_id) + .await?; + } + + // The list of safekeepers that have any of the timelines + let mut sk_list = HashSet::new(); + + // List all pending ops for all timelines, cancel them + for (_timeline_id, timeline) in timeline_list.iter() { + let sk_iter = timeline + .sk_set + .iter() + .chain(timeline.new_sk_set.iter().flatten()) + .map(|id| NodeId(*id as u64)); + sk_list.extend(sk_iter); + } + + for &sk_id in sk_list.iter() { + let pending_op = TimelinePendingOpPersistence { + tenant_id: tenant_id.to_string(), + timeline_id: String::new(), + generation: i32::MAX, + op_kind: SafekeeperTimelineOpKind::Delete, + sk_id: sk_id.0 as i64, + }; + tracing::info!("writing pending op for sk id {sk_id}"); + self.persistence.insert_pending_op(pending_op).await?; + } + + let mut locked = self.inner.write().unwrap(); + + for (timeline_id, _timeline) in timeline_list.iter() { + for sk_id in sk_list.iter() { + locked + .safekeeper_reconcilers + 
.cancel_reconciles_for_timeline(*sk_id, tenant_id, Some(*timeline_id)); + } + } + + // unwrap is safe: we return above for an empty timeline list + let max_generation = timeline_list + .iter() + .map(|(_tl_id, tl)| tl.generation as u32) + .max() + .unwrap(); + + for sk_id in sk_list { + let Some(safekeeper) = locked.safekeepers.get(&sk_id) else { + tracing::warn!("Couldn't find safekeeper with id {sk_id}"); + continue; + }; + // Add pending op for tenant deletion + let req = ScheduleRequest { + generation: max_generation, + host_list: Vec::new(), + kind: SafekeeperTimelineOpKind::Delete, + safekeeper: Box::new(safekeeper.clone()), + tenant_id, + timeline_id: None, + }; + locked.safekeeper_reconcilers.schedule_request(self, req); + } + Ok(()) + } + /// Choose safekeepers for the new timeline: 3 in different azs. pub(crate) async fn safekeepers_for_new_timeline( &self, diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 80f42e04a9..8424c65aba 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -622,7 +622,7 @@ impl TenantShard { .collect::>(); attached_locs.sort_by_key(|i| i.1); - if let Some((node_id, _gen)) = attached_locs.into_iter().last() { + if let Some((node_id, _gen)) = attached_locs.into_iter().next_back() { self.intent.set_attached(scheduler, Some(*node_id)); } @@ -1588,6 +1588,7 @@ impl TenantShard { units: ReconcileUnits, gate_guard: GateGuard, cancel: &CancellationToken, + http_client: reqwest::Client, ) -> Option { // Reconcile in flight for a stale sequence? Our sequence's task will wait for it before // doing our sequence's work. 
@@ -1633,6 +1634,7 @@ impl TenantShard { cancel: reconciler_cancel.clone(), persistence: persistence.clone(), compute_notify_failure: false, + http_client, }; let reconcile_seq = self.sequence; diff --git a/storage_scrubber/src/find_large_objects.rs b/storage_scrubber/src/find_large_objects.rs index efb05fb55e..a4ca68d378 100644 --- a/storage_scrubber/src/find_large_objects.rs +++ b/storage_scrubber/src/find_large_objects.rs @@ -18,7 +18,7 @@ enum LargeObjectKind { impl LargeObjectKind { fn from_key(key: &str) -> Self { - let fname = key.split('/').last().unwrap(); + let fname = key.split('/').next_back().unwrap(); let Ok((layer_name, _generation)) = parse_layer_object_name(fname) else { return LargeObjectKind::Other; diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 34e43fcc0b..071f0b9756 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -295,8 +295,8 @@ pub struct ControllerClientConfig { } impl ControllerClientConfig { - pub fn build_client(self) -> control_api::Client { - control_api::Client::new(self.controller_api, Some(self.controller_jwt)) + pub fn build_client(self, http_client: reqwest::Client) -> control_api::Client { + control_api::Client::new(http_client, self.controller_api, Some(self.controller_jwt)) } } diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index fb2ab02565..4823c43e10 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -3,7 +3,7 @@ use camino::Utf8PathBuf; use clap::{Parser, Subcommand}; use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse}; use pageserver_api::shard::TenantShardId; -use reqwest::{Method, Url}; +use reqwest::{Certificate, Method, Url}; use storage_controller_client::control_api; use storage_scrubber::garbage::{PurgeMode, find_garbage, purge_garbage}; use storage_scrubber::pageserver_physical_gc::{GcMode, pageserver_physical_gc}; @@ -41,6 +41,10 @@ struct Cli { /// If set to 
true, the scrubber will exit with error code on fatal error. #[arg(long, default_value_t = false)] exit_code: bool, + + /// Trusted root CA certificates to use in https APIs. + #[arg(long)] + ssl_ca_file: Option, } #[derive(Subcommand, Debug)] @@ -146,13 +150,28 @@ async fn main() -> anyhow::Result<()> { tracing::info!("version: {}, build_tag {}", GIT_VERSION, BUILD_TAG); + let ssl_ca_certs = match cli.ssl_ca_file.as_ref() { + Some(ssl_ca_file) => { + tracing::info!("Using ssl root CA file: {ssl_ca_file:?}"); + let buf = tokio::fs::read(ssl_ca_file).await?; + Certificate::from_pem_bundle(&buf)? + } + None => Vec::new(), + }; + + let mut http_client = reqwest::Client::builder(); + for cert in ssl_ca_certs { + http_client = http_client.add_root_certificate(cert); + } + let http_client = http_client.build()?; + let controller_client = cli.controller_api.map(|controller_api| { ControllerClientConfig { controller_api, // Default to no key: this is a convenience when working in a development environment controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()), } - .build_client() + .build_client(http_client) }); match cli.command { diff --git a/test_runner/cloud_regress/test_cloud_regress.py b/test_runner/cloud_regress/test_cloud_regress.py index 63427c1912..b90f5c5afc 100644 --- a/test_runner/cloud_regress/test_cloud_regress.py +++ b/test_runner/cloud_regress/test_cloud_regress.py @@ -4,11 +4,15 @@ Run the regression tests on the cloud instance of Neon from __future__ import annotations -from pathlib import Path +from typing import TYPE_CHECKING import pytest -from fixtures.neon_fixtures import RemotePostgres -from fixtures.pg_version import PgVersion + +if TYPE_CHECKING: + from pathlib import Path + + from fixtures.neon_fixtures import RemotePostgres + from fixtures.pg_version import PgVersion @pytest.mark.timeout(7200) diff --git a/test_runner/fixtures/auth_tokens.py b/test_runner/fixtures/auth_tokens.py index 8382ce20b3..7d994b6cc0 100644 --- 
a/test_runner/fixtures/auth_tokens.py +++ b/test_runner/fixtures/auth_tokens.py @@ -2,11 +2,12 @@ from __future__ import annotations from dataclasses import dataclass from enum import StrEnum -from typing import Any +from typing import TYPE_CHECKING, Any import jwt -from fixtures.common_types import TenantId +if TYPE_CHECKING: + from fixtures.common_types import TenantId @dataclass diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index fa3747c08f..00e415cc98 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -15,18 +15,20 @@ from typing import TYPE_CHECKING import allure import pytest -from _pytest.config import Config -from _pytest.config.argparsing import Parser -from _pytest.fixtures import FixtureRequest -from _pytest.terminal import TerminalReporter -from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonPageserver if TYPE_CHECKING: from collections.abc import Callable, Iterator, Mapping + from _pytest.config import Config + from _pytest.config.argparsing import Parser + from _pytest.fixtures import FixtureRequest + from _pytest.terminal import TerminalReporter + + from fixtures.common_types import TenantId, TimelineId + from fixtures.neon_fixtures import NeonPageserver + """ This file contains fixtures for micro-benchmarks. 
diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index c0892399bd..150046b99a 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -11,7 +11,6 @@ from pathlib import Path from typing import TYPE_CHECKING, final import pytest -from _pytest.fixtures import FixtureRequest from typing_extensions import override from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker @@ -24,11 +23,14 @@ from fixtures.neon_fixtures import ( VanillaPostgres, wait_for_last_flush_lsn, ) -from fixtures.pg_stats import PgStatTable if TYPE_CHECKING: from collections.abc import Iterator + from _pytest.fixtures import FixtureRequest + + from fixtures.pg_stats import PgStatTable + class PgCompare(ABC): """Common interface of all postgres implementations, useful for benchmarks. diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py index 205b9141e0..d49c3f5601 100644 --- a/test_runner/fixtures/compute_reconfigure.py +++ b/test_runner/fixtures/compute_reconfigure.py @@ -4,8 +4,6 @@ import concurrent.futures from typing import TYPE_CHECKING import pytest -from pytest_httpserver import HTTPServer -from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response from fixtures.common_types import TenantId @@ -15,6 +13,9 @@ if TYPE_CHECKING: from collections.abc import Callable from typing import Any + from pytest_httpserver import HTTPServer + from werkzeug.wrappers.request import Request + class ComputeReconfigure: def __init__(self, server: HTTPServer): diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index 9b28246f58..4073ebc3b9 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -5,6 +5,8 @@ import urllib.parse import requests from requests.adapters import HTTPAdapter +from fixtures.log_helper import log + class 
EndpointHttpClient(requests.Session): def __init__( @@ -51,6 +53,7 @@ class EndpointHttpClient(requests.Session): def metrics(self) -> str: res = self.get(f"http://localhost:{self.external_port}/metrics") res.raise_for_status() + log.debug("raw compute metrics: %s", res.text) return res.text # Current compute status. diff --git a/test_runner/fixtures/fast_import.py b/test_runner/fixtures/fast_import.py index d674be99de..f9e5f9c1db 100644 --- a/test_runner/fixtures/fast_import.py +++ b/test_runner/fixtures/fast_import.py @@ -12,6 +12,7 @@ from _pytest.config import Config from fixtures.log_helper import log from fixtures.neon_cli import AbstractNeonCli from fixtures.pg_version import PgVersion +from fixtures.remote_storage import MockS3Server class FastImport(AbstractNeonCli): @@ -111,6 +112,18 @@ class FastImport(AbstractNeonCli): self.cmd = self.raw_cli(args) return self.cmd + def set_aws_creds(self, mock_s3_server: MockS3Server, extra_env: dict[str, str] | None = None): + if self.extra_env is None: + self.extra_env = {} + self.extra_env["AWS_ACCESS_KEY_ID"] = mock_s3_server.access_key() + self.extra_env["AWS_SECRET_ACCESS_KEY"] = mock_s3_server.secret_key() + self.extra_env["AWS_SESSION_TOKEN"] = mock_s3_server.session_token() + self.extra_env["AWS_REGION"] = mock_s3_server.region() + self.extra_env["AWS_ENDPOINT_URL"] = mock_s3_server.endpoint() + + if extra_env is not None: + self.extra_env.update(extra_env) + def __enter__(self): return self @@ -134,7 +147,7 @@ def fast_import( pg_distrib_dir, pg_version, workdir, - cleanup=not cast(bool, pytestconfig.getoption("--preserve-database-files")), + cleanup=not cast("bool", pytestconfig.getoption("--preserve-database-files")), ) as fi: yield fi diff --git a/test_runner/fixtures/h2server.py b/test_runner/fixtures/h2server.py index 3e35af3b5b..d6a5fe57a6 100644 --- a/test_runner/fixtures/h2server.py +++ b/test_runner/fixtures/h2server.py @@ -10,7 +10,6 @@ import asyncio import collections import io import json -from 
collections.abc import AsyncIterable from typing import TYPE_CHECKING, final import pytest_asyncio @@ -31,6 +30,7 @@ from h2.settings import SettingCodes from typing_extensions import override if TYPE_CHECKING: + from collections.abc import AsyncIterable from typing import Any diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 54e6458ac6..df500544dc 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -1,12 +1,15 @@ from __future__ import annotations from collections import defaultdict +from typing import TYPE_CHECKING, Literal from prometheus_client.parser import text_string_to_metric_families -from prometheus_client.samples import Sample from fixtures.log_helper import log +if TYPE_CHECKING: + from prometheus_client.samples import Sample + class Metrics: metrics: dict[str, list[Sample]] @@ -43,14 +46,26 @@ class MetricsGetter: def get_metrics(self) -> Metrics: raise NotImplementedError() - def get_metric_value(self, name: str, filter: dict[str, str] | None = None) -> float | None: + def get_metric_value( + self, + name: str, + filter: dict[str, str] | None = None, + aggregate: Literal["sum"] | None = None, + ) -> float | None: metrics = self.get_metrics() results = metrics.query_all(name, filter=filter) if not results: log.info(f'could not find metric "{name}"') return None - assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}" - return results[0].value + if aggregate is None: + assert len(results) == 1, ( + f"metric {name} with given filters is not unique, got: {results}" + ) + return results[0].value + elif aggregate == "sum": + return sum(sample.value for sample in results) + else: + raise RuntimeError(f"unknown aggregate function {aggregate}") def get_metrics_values( self, names: list[str], filter: dict[str, str] | None = None, absence_ok: bool = False @@ -129,7 +144,7 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] 
= ( *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]], *histogram("pageserver_smgr_query_seconds_global"), *histogram("pageserver_wait_lsn_seconds"), - *histogram("pageserver_remote_operation_seconds"), + *histogram("pageserver_remote_timeline_client_seconds_global"), *histogram("pageserver_io_operations_seconds"), "pageserver_smgr_query_started_global_count_total", "pageserver_tenant_states_count", @@ -140,6 +155,7 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] = ( counter("pageserver_tenant_throttling_wait_usecs_sum_global"), counter("pageserver_tenant_throttling_count_global"), *histogram("pageserver_tokio_epoll_uring_slots_submission_queue_depth"), + *histogram("pageserver_wait_ondemand_download_seconds_global"), ) PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = ( @@ -168,7 +184,6 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = ( "pageserver_evictions_with_low_residence_duration_total", "pageserver_aux_file_estimated_size", "pageserver_valid_lsn_lease_count", - "pageserver_flush_wait_upload_seconds", counter("pageserver_tenant_throttling_count_accounted_start"), counter("pageserver_tenant_throttling_count_accounted_finish"), counter("pageserver_tenant_throttling_wait_usecs_sum"), @@ -178,6 +193,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] 
= ( counter("pageserver_wait_lsn_in_progress_micros"), counter("pageserver_wait_lsn_started_count"), counter("pageserver_wait_lsn_finished_count"), + counter("pageserver_wait_ondemand_download_seconds_sum"), *histogram("pageserver_page_service_batch_size"), *histogram("pageserver_page_service_pagestream_batch_wait_time_seconds"), *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 6e53987e7c..d555ee2989 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -7,7 +7,6 @@ import subprocess import tempfile import textwrap from itertools import chain, product -from pathlib import Path from typing import TYPE_CHECKING, cast import toml @@ -15,14 +14,15 @@ import toml from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.pageserver.common_types import IndexPartDump -from fixtures.pg_version import PgVersion if TYPE_CHECKING: + from pathlib import Path from typing import ( Any, - cast, ) + from fixtures.pg_version import PgVersion + # Used to be an ABC. abc.ABC removed due to linter without name change. class AbstractNeonCli: @@ -36,7 +36,7 @@ class AbstractNeonCli: self.extra_env = extra_env self.binpath = binpath - COMMAND: str = cast(str, None) # To be overwritten by the derived class. + COMMAND: str = cast("str", None) # To be overwritten by the derived class. 
def raw_cli( self, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index aba8e04977..5694bf170e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -14,14 +14,12 @@ import threading import time import uuid from collections import defaultdict -from collections.abc import Iterable, Iterator from contextlib import closing, contextmanager from dataclasses import dataclass from datetime import datetime from enum import StrEnum from functools import cached_property from pathlib import Path -from types import TracebackType from typing import TYPE_CHECKING, cast from urllib.parse import quote, urlparse @@ -34,19 +32,12 @@ import psycopg2.sql import pytest import requests import toml -from _pytest.config import Config -from _pytest.config.argparsing import Parser -from _pytest.fixtures import FixtureRequest from jwcrypto import jwk -from mypy_boto3_kms import KMSClient -from mypy_boto3_s3 import S3Client # Type-related stuff from psycopg2.extensions import connection as PgConnection from psycopg2.extensions import cursor as PgCursor from psycopg2.extensions import make_dsn, parse_dsn -from pytest_httpserver import HTTPServer -from urllib3.util.retry import Retry from fixtures import overlayfs from fixtures.auth_tokens import AuthKeys, TokenScope @@ -60,7 +51,6 @@ from fixtures.common_types import ( ) from fixtures.compute_migrations import NUM_COMPUTE_MIGRATIONS from fixtures.endpoint.http import EndpointHttpClient -from fixtures.h2server import H2Server from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.neon_cli import NeonLocalCli, Pagectl @@ -78,7 +68,6 @@ from fixtures.pageserver.utils import ( wait_for_last_record_lsn, ) from fixtures.paths import get_test_repo_dir, shared_snapshot_dir -from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import 
( LocalFsStorage, @@ -108,10 +97,21 @@ from fixtures.utils import ( from .neon_api import NeonAPI, NeonApiEndpoint if TYPE_CHECKING: - from collections.abc import Callable + from collections.abc import Callable, Iterable, Iterator + from types import TracebackType from typing import Any, Self, TypeVar + from _pytest.config import Config + from _pytest.config.argparsing import Parser + from _pytest.fixtures import FixtureRequest + from mypy_boto3_kms import KMSClient + from mypy_boto3_s3 import S3Client + from pytest_httpserver import HTTPServer + from urllib3.util.retry import Retry + + from fixtures.h2server import H2Server from fixtures.paths import SnapshotDirLocked + from fixtures.pg_version import PgVersion T = TypeVar("T") @@ -376,6 +376,28 @@ class PageserverWalReceiverProtocol(StrEnum): raise ValueError(f"Unknown protocol type: {proto}") +@dataclass +class PageserverTracingConfig: + sampling_ratio: tuple[int, int] + endpoint: str + protocol: str + timeout: str + + def to_config_key_value(self) -> tuple[str, dict[str, Any]]: + value = { + "sampling_ratio": { + "numerator": self.sampling_ratio[0], + "denominator": self.sampling_ratio[1], + }, + "export_config": { + "endpoint": self.endpoint, + "protocol": self.protocol, + "timeout": self.timeout, + }, + } + return ("tracing", value) + + class NeonEnvBuilder: """ Builder object to create a Neon runtime environment @@ -425,6 +447,7 @@ class NeonEnvBuilder: pageserver_virtual_file_io_mode: str | None = None, pageserver_wal_receiver_protocol: PageserverWalReceiverProtocol | None = None, pageserver_get_vectored_concurrent_io: str | None = None, + pageserver_tracing_config: PageserverTracingConfig | None = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -469,12 +492,17 @@ class NeonEnvBuilder: # Flag to enable https listener in safekeeper, generate local ssl certs, # and force storage controller to use https for safekeeper api. 
self.use_https_safekeeper_api: bool = False + # Flag to use https listener in storage controller, generate local ssl certs, + # and force pageservers and neon_local to use https for storage controller api. + self.use_https_storage_controller_api: bool = False self.pageserver_virtual_file_io_engine: str | None = pageserver_virtual_file_io_engine self.pageserver_get_vectored_concurrent_io: str | None = ( pageserver_get_vectored_concurrent_io ) + self.pageserver_tracing_config = pageserver_tracing_config + self.pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None = ( pageserver_default_tenant_config_compaction_algorithm ) @@ -494,9 +522,9 @@ class NeonEnvBuilder: else: self.pageserver_wal_receiver_protocol = PageserverWalReceiverProtocol.INTERPRETED - assert test_name.startswith( - "test_" - ), "Unexpectedly instantiated from outside a test function" + assert test_name.startswith("test_"), ( + "Unexpectedly instantiated from outside a test function" + ) self.test_name = test_name self.compatibility_neon_binpath = compatibility_neon_binpath self.compatibility_pg_distrib_dir = compatibility_pg_distrib_dir @@ -505,12 +533,12 @@ class NeonEnvBuilder: self.mixdir = self.test_output_dir / "mixdir_neon" if self.version_combination is not None: - assert ( - self.compatibility_neon_binpath is not None - ), "the environment variable COMPATIBILITY_NEON_BIN is required when using mixed versions" - assert ( - self.compatibility_pg_distrib_dir is not None - ), "the environment variable COMPATIBILITY_POSTGRES_DISTRIB_DIR is required when using mixed versions" + assert self.compatibility_neon_binpath is not None, ( + "the environment variable COMPATIBILITY_NEON_BIN is required when using mixed versions" + ) + assert self.compatibility_pg_distrib_dir is not None, ( + "the environment variable COMPATIBILITY_POSTGRES_DISTRIB_DIR is required when using mixed versions" + ) self.mixdir.mkdir(mode=0o755, exist_ok=True) self._mix_versions() 
self.test_may_use_compatibility_snapshot_binaries = True @@ -792,9 +820,9 @@ class NeonEnvBuilder: work = ident_state_dir / "work" assert upper.is_dir() assert work.is_dir() - assert ( - self.test_overlay_dir not in dst.parents - ), "otherwise workdir cleanup below wouldn't work" + assert self.test_overlay_dir not in dst.parents, ( + "otherwise workdir cleanup below wouldn't work" + ) # find index, still not mutating state idxmap = { existing_ident: idx @@ -860,9 +888,9 @@ class NeonEnvBuilder: self.pageserver_remote_storage = ret def enable_safekeeper_remote_storage(self, kind: RemoteStorageKind): - assert ( - self.safekeepers_remote_storage is None - ), "safekeepers_remote_storage already configured" + assert self.safekeepers_remote_storage is None, ( + "safekeepers_remote_storage already configured" + ) self.safekeepers_remote_storage = self._configure_and_create_remote_storage( kind, RemoteStorageUser.SAFEKEEPER @@ -1067,7 +1095,9 @@ class NeonEnv: self.initial_timeline = config.initial_timeline self.generate_local_ssl_certs = ( - config.use_https_pageserver_api or config.use_https_safekeeper_api + config.use_https_pageserver_api + or config.use_https_safekeeper_api + or config.use_https_storage_controller_api ) self.ssl_ca_file = ( self.repo_dir.joinpath("rootCA.crt") if self.generate_local_ssl_certs else None @@ -1096,7 +1126,10 @@ class NeonEnv: self.storage_controller_port = config.storage_controller_port_override self.storage_controller = NeonProxiedStorageController( - self, config.storage_controller_port_override, config.auth_enabled + self, + config.storage_controller_port_override, + config.auth_enabled, + config.use_https_storage_controller_api, ) else: # Find two adjacent ports for storage controller and its postgres DB. 
This @@ -1110,7 +1143,10 @@ class NeonEnv: self.storage_controller_port = storage_controller_port self.storage_controller = NeonStorageController( - self, storage_controller_port, config.auth_enabled + self, + storage_controller_port, + config.auth_enabled, + config.use_https_storage_controller_api, ) log.info( @@ -1127,6 +1163,7 @@ class NeonEnv: self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode self.pageserver_wal_receiver_protocol = config.pageserver_wal_receiver_protocol self.pageserver_get_vectored_concurrent_io = config.pageserver_get_vectored_concurrent_io + self.pageserver_tracing_config = config.pageserver_tracing_config # Create the neon_local's `NeonLocalInitConf` cfg: dict[str, Any] = { @@ -1158,6 +1195,12 @@ class NeonEnv: if storage_controller_config is not None: cfg["storage_controller"] = storage_controller_config + if config.test_may_use_compatibility_snapshot_binaries: + if "storage_controller" in cfg: + cfg["storage_controller"]["use_local_compute_notifications"] = False + else: + cfg["storage_controller"] = {"use_local_compute_notifications": False} + # Create config for pageserver http_auth_type = "NeonJWT" if config.auth_enabled else "Trust" pg_auth_type = "NeonJWT" if config.auth_enabled else "Trust" @@ -1245,10 +1288,29 @@ class NeonEnv: if key not in ps_cfg: ps_cfg[key] = value + if self.pageserver_tracing_config is not None: + key, value = self.pageserver_tracing_config.to_config_key_value() + + if key not in ps_cfg: + ps_cfg[key] = value + + ps_cfg[key] = value + # Create a corresponding NeonPageserver object - self.pageservers.append( - NeonPageserver(self, ps_id, port=pageserver_port, az_id=ps_cfg["availability_zone"]) + ps = NeonPageserver( + self, ps_id, port=pageserver_port, az_id=ps_cfg["availability_zone"] ) + + if config.test_may_use_compatibility_snapshot_binaries: + # New features gated by pageserver config usually get rolled out in the + # test suite first, by enabling it in the `ps_cfg` above.
+ # Compatibility tests run with old binaries that predate feature code & config. + # So, old binaries will warn about the flag's presence. + # Silence those warnings categorically. + log.info("test may use old binaries, ignoring warnings about unknown config items") + ps.allowed_errors.append(".*ignoring unknown configuration item.*") + + self.pageservers.append(ps) cfg["pageservers"].append(ps_cfg) # Create config and a Safekeeper object for each safekeeper @@ -1267,6 +1329,7 @@ class NeonEnv: "http_port": port.http, "https_port": port.https, "sync": config.safekeepers_enable_fsync, + "use_https_safekeeper_api": config.use_https_safekeeper_api, } if config.auth_enabled: sk_cfg["auth_enabled"] = True @@ -1327,6 +1390,8 @@ class NeonEnv: and self.storage_controller_config.get("timelines_onto_safekeepers") is True ): for sk_id, sk in enumerate(self.safekeepers): + # 0 is an invalid safekeeper id + sk_id = sk_id + 1 body = { "id": sk_id, "created_at": "2023-10-25T09:11:25Z", @@ -1404,9 +1469,9 @@ class NeonEnv: assert that there is only one. Tests with multiple pageservers should always use get_pageserver with an explicit ID. 
""" - assert ( - len(self.pageservers) == 1 - ), "env.pageserver must only be used with single pageserver NeonEnv" + assert len(self.pageservers) == 1, ( + "env.pageserver must only be used with single pageserver NeonEnv" + ) return self.pageservers[0] def get_pageserver(self, id: int | None) -> NeonPageserver: @@ -1597,7 +1662,7 @@ def neon_simple_env( compatibility_pg_distrib_dir=compatibility_pg_distrib_dir, pg_version=pg_version, run_id=run_id, - preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")), + preserve_database_files=cast("bool", pytestconfig.getoption("--preserve-database-files")), test_name=request.node.name, test_output_dir=test_output_dir, pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, @@ -1666,7 +1731,7 @@ def neon_env_builder( combination=combination, pg_version=pg_version, run_id=run_id, - preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")), + preserve_database_files=cast("bool", pytestconfig.getoption("--preserve-database-files")), pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, test_name=request.node.name, test_output_dir=test_output_dir, @@ -1714,6 +1779,8 @@ class LogUtils: log.warning(f"Skipping log check: {logfile} does not exist") return None + log.info(f"Checking log {logfile} for pattern '{pattern}'") + contains_re = re.compile(pattern) # XXX: Our rust logging machinery buffers the messages, so if you @@ -1777,14 +1844,16 @@ class StorageControllerMigrationConfig: class NeonStorageController(MetricsGetter, LogUtils): - def __init__(self, env: NeonEnv, port: int, auth_enabled: bool): + def __init__(self, env: NeonEnv, port: int, auth_enabled: bool, use_https: bool): self.env = env self.port: int = port - self.api: str = f"http://127.0.0.1:{port}" + scheme = "https" if use_https else "http" + self.api: str = f"{scheme}://127.0.0.1:{port}" self.running = False self.auth_enabled = auth_enabled self.allowed_errors: list[str] = 
DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS self.logfile = self.env.repo_dir / "storage_controller_1" / "storage_controller.log" + self.ssl_ca_file = env.ssl_ca_file def start( self, @@ -1854,6 +1923,8 @@ class NeonStorageController(MetricsGetter, LogUtils): return PageserverHttpClient(self.port, lambda: True, auth_token, *args, **kwargs) def request(self, method, *args, **kwargs) -> requests.Response: + if self.ssl_ca_file is not None: + kwargs["verify"] = self.ssl_ca_file resp = requests.request(method, *args, **kwargs) NeonStorageController.raise_api_exception(resp) @@ -2565,8 +2636,8 @@ class NeonStorageController(MetricsGetter, LogUtils): class NeonProxiedStorageController(NeonStorageController): - def __init__(self, env: NeonEnv, proxy_port: int, auth_enabled: bool): - super().__init__(env, proxy_port, auth_enabled) + def __init__(self, env: NeonEnv, proxy_port: int, auth_enabled: bool, use_https: bool): + super().__init__(env, proxy_port, auth_enabled, use_https) self.instances: dict[int, dict[str, Any]] = {} def start( @@ -2603,10 +2674,13 @@ class NeonProxiedStorageController(NeonStorageController): self.running = False return self + def instance_log_path(self, instance_id: int) -> Path: + return self.env.repo_dir / f"storage_controller_{instance_id}" / "storage_controller.log" + def assert_no_errors(self): for instance_id in self.instances.keys(): assert_no_errors( - self.env.repo_dir / f"storage_controller_{instance_id}" / "storage_controller.log", + self.instance_log_path(instance_id), "storage_controller", self.allowed_errors, ) @@ -2614,7 +2688,14 @@ class NeonProxiedStorageController(NeonStorageController): def log_contains( self, pattern: str, offset: None | LogCursor = None ) -> tuple[str, LogCursor] | None: - raise NotImplementedError() + for instance_id in self.instances.keys(): + log_path = self.instance_log_path(instance_id) + checker = LogUtils(log_path) + found = checker.log_contains(pattern, offset) + if found is not None: + return found + + 
return None @dataclass @@ -3544,9 +3625,9 @@ class NeonProxy(PgProtocol): @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10) def _wait_until_ready(self): - assert ( - self._popen and self._popen.poll() is None - ), "Proxy exited unexpectedly. Check test log." + assert self._popen and self._popen.poll() is None, ( + "Proxy exited unexpectedly. Check test log." + ) requests.get(f"http://{self.host}:{self.http_port}/v1/status") def http_query(self, query, args, **kwargs): @@ -3754,9 +3835,9 @@ class NeonAuthBroker: @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10) def _wait_until_ready(self): - assert ( - self._popen and self._popen.poll() is None - ), "Proxy exited unexpectedly. Check test log." + assert self._popen and self._popen.poll() is None, ( + "Proxy exited unexpectedly. Check test log." + ) requests.get(f"http://{self.host}:{self.http_port}/v1/status") async def query(self, query, args, **kwargs): @@ -4036,9 +4117,9 @@ class Endpoint(PgProtocol, LogUtils): m = re.search(r"=\s*(\S+)", line) assert m is not None, f"malformed config line {line}" size = m.group(1) - assert size_to_bytes(size) >= size_to_bytes( - "1MB" - ), "LFC size cannot be set less than 1MB" + assert size_to_bytes(size) >= size_to_bytes("1MB"), ( + "LFC size cannot be set less than 1MB" + ) lfc_path_escaped = str(lfc_path).replace("'", "''") config_lines = [ f"neon.file_cache_path = '{lfc_path_escaped}'", @@ -4049,12 +4130,12 @@ class Endpoint(PgProtocol, LogUtils): ] + config_lines else: for line in config_lines: - assert ( - line.find("neon.max_file_cache_size") == -1 - ), "Setting LFC parameters is not allowed when LFC is disabled" - assert ( - line.find("neon.file_cache_size_limit") == -1 - ), "Setting LFC parameters is not allowed when LFC is disabled" + assert line.find("neon.max_file_cache_size") == -1, ( + "Setting LFC parameters is not allowed when LFC is disabled" + ) + assert 
line.find("neon.file_cache_size_limit") == -1, ( + "Setting LFC parameters is not allowed when LFC is disabled" + ) self.config(config_lines) @@ -4176,7 +4257,7 @@ class Endpoint(PgProtocol, LogUtils): # Write it back updated with open(config_path, "w") as file: - log.info(json.dumps(dict(data_dict, **kwargs))) + log.debug(json.dumps(dict(data_dict, **kwargs))) json.dump(dict(data_dict, **kwargs), file, indent=4) def respec_deep(self, **kwargs: Any) -> None: @@ -4193,7 +4274,7 @@ class Endpoint(PgProtocol, LogUtils): with open(config_path) as f: data_dict: dict[str, Any] = json.load(f) - log.info("Current compute spec: %s", json.dumps(data_dict, indent=4)) + log.debug("Current compute spec: %s", json.dumps(data_dict, indent=4)) for key, value in kwargs.items(): if isinstance(value, dict): @@ -4205,7 +4286,7 @@ class Endpoint(PgProtocol, LogUtils): data_dict[key] = value with open(config_path, "w") as file: - log.info("Updating compute spec to: %s", json.dumps(data_dict, indent=4)) + log.debug("Updating compute spec to: %s", json.dumps(data_dict, indent=4)) json.dump(data_dict, file, indent=4) def wait_for_migrations(self, wait_for: int = NUM_COMPUTE_MIGRATIONS) -> None: @@ -4892,9 +4973,9 @@ class StorageScrubber: healthy = False else: for _, warnings in with_warnings.items(): - assert ( - len(warnings) > 0 - ), "with_warnings value should not be empty, running without verbose mode?" + assert len(warnings) > 0, ( + "with_warnings value should not be empty, running without verbose mode?" + ) if not self._check_line_list_allowed(warnings): healthy = False break @@ -4908,9 +4989,9 @@ class StorageScrubber: healthy = False else: for _, errors in with_errors.items(): - assert ( - len(errors) > 0 - ), "with_errors value should not be empty, running without verbose mode?" + assert len(errors) > 0, ( + "with_errors value should not be empty, running without verbose mode?" 
+ ) if not self._check_line_list_allowed(errors): healthy = False break diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index abddfa2768..24c856e279 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -5,7 +5,10 @@ from __future__ import annotations import argparse import re import sys -from collections.abc import Iterable +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Iterable def scan_pageserver_log_for_errors( @@ -101,11 +104,13 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*WARN.*path=/v1/utilization .*request was dropped before completing", # Can happen during shutdown ".*scheduling deletion on drop failed: queue is in state Stopped.*", + ".*scheduling deletion on drop failed: queue is shutting down.*", # L0 flush backpressure delays are expected under heavy ingest load. We want to exercise # this backpressure in tests. ".*delaying layer flush by \\S+ for compaction backpressure.*", ".*stalling layer flushes for compaction backpressure.*", ".*layer roll waiting for flush due to compaction backpressure.*", + ".*BatchSpanProcessor.*", ) @@ -114,7 +119,9 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ # failing to connect to them. 
".*Call to node.*management API.*failed.*receive body.*", ".*Call to node.*management API.*failed.*ReceiveBody.*", + ".*Call to node.*management API.*failed.*Timeout.*", ".*Failed to update node .+ after heartbeat round.*error sending request for url.*", + ".*background_reconcile: failed to fetch top tenants:.*client error \\(Connect\\).*", # Many tests will start up with a node offline ".*startup_reconcile: Could not scan node.*", # Tests run in dev mode diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 13cab448f3..c2d176bf5a 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -7,8 +7,7 @@ import string import time from collections import defaultdict from dataclasses import dataclass -from datetime import datetime -from typing import Any +from typing import TYPE_CHECKING, Any import requests from requests.adapters import HTTPAdapter @@ -26,6 +25,9 @@ from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pg_version import PgVersion from fixtures.utils import EnhancedJSONEncoder, Fn +if TYPE_CHECKING: + from datetime import datetime + class PageserverApiException(Exception): def __init__(self, message, status_code: int): @@ -851,6 +853,25 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res_json = res.json() return res_json + def timeline_mark_invisible( + self, + tenant_id: TenantId | TenantShardId, + timeline_id: TimelineId, + is_visible: bool | None = None, + ): + data = { + "is_visible": is_visible, + } + + log.info( + f"Requesting marking timeline invisible for {is_visible=}, {tenant_id=}, {timeline_id=}" + ) + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/mark_invisible", + json=data, + ) + self.verbose_error(res) + def timeline_get_timestamp_of_lsn( self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn ): @@ -1171,3 +1192,28 @@ class 
PageserverHttpClient(requests.Session, MetricsGetter): log.info(f"Got perf info response code: {res.status_code}") self.verbose_error(res) return res.json() + + def ingest_aux_files( + self, + tenant_id: TenantId | TenantShardId, + timeline_id: TimelineId, + aux_files: dict[str, bytes], + ): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/ingest_aux_files", + json={ + "aux_files": aux_files, + }, + ) + self.verbose_error(res) + return res.json() + + def list_aux_files( + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn + ) -> Any: + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/list_aux_files", + json={"lsn": str(lsn)}, + ) + self.verbose_error(res) + return res.json() diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index b6d19af84c..eedb693e3d 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -4,18 +4,19 @@ import concurrent.futures from typing import TYPE_CHECKING import fixtures.pageserver.remote_storage -from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - NeonEnv, - NeonEnvBuilder, -) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind if TYPE_CHECKING: from collections.abc import Callable from typing import Any + from fixtures.common_types import TenantId, TimelineId + from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + ) + def single_timeline( neon_env_builder: NeonEnvBuilder, @@ -42,7 +43,7 @@ def single_timeline( f"template tenant is template_tenant={template_tenant} template_timeline={template_timeline}" ) - log.info("detach template tenant form pageserver") + log.info("detach template tenant from pageserver") env.pageserver.tenant_detach(template_tenant) log.info(f"duplicating template tenant 
{ncopies} times in remote storage") @@ -64,11 +65,13 @@ def single_timeline( assert ps_http.tenant_list() == [] def attach(tenant): - env.pageserver.tenant_attach( - tenant, - config=template_config.copy(), - generation=100, - override_storage_controller_generation=True, + # NB: create the new tenant in the storage controller with the correct tenant config. This + # will pick up the existing tenant data from remote storage. If we just attach it to the + # Pageserver, the storage controller will reset the tenant config to the default. + env.create_tenant( + tenant_id=tenant, + timeline_id=template_timeline, + conf=template_config, ) with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor: diff --git a/test_runner/fixtures/pageserver/remote_storage.py b/test_runner/fixtures/pageserver/remote_storage.py index 54acb9ce50..1fc473b633 100644 --- a/test_runner/fixtures/pageserver/remote_storage.py +++ b/test_runner/fixtures/pageserver/remote_storage.py @@ -5,11 +5,9 @@ import os import queue import shutil import threading -from pathlib import Path from typing import TYPE_CHECKING from fixtures.common_types import TenantId, TimelineId -from fixtures.neon_fixtures import NeonEnv from fixtures.pageserver.common_types import ( InvalidFileName, parse_layer_file_name, @@ -17,8 +15,11 @@ from fixtures.pageserver.common_types import ( from fixtures.remote_storage import LocalFsStorage if TYPE_CHECKING: + from pathlib import Path from typing import Any + from fixtures.neon_fixtures import NeonEnv + def duplicate_one_tenant(env: NeonEnv, template_tenant: TenantId, new_tenant: TenantId): remote_storage = env.pageserver_remote_storage diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 66f61f9b4c..bc5076758d 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -3,13 +3,6 @@ from __future__ import annotations import time from typing import TYPE_CHECKING -from 
mypy_boto3_s3.type_defs import ( - DeleteObjectOutputTypeDef, - EmptyResponseMetadataTypeDef, - ListObjectsV2OutputTypeDef, - ObjectTypeDef, -) - from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.log_helper import log from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient @@ -19,6 +12,13 @@ from fixtures.utils import wait_until if TYPE_CHECKING: from typing import Any + from mypy_boto3_s3.type_defs import ( + DeleteObjectOutputTypeDef, + EmptyResponseMetadataTypeDef, + ListObjectsV2OutputTypeDef, + ObjectTypeDef, + ) + def assert_tenant_state( pageserver_http: PageserverHttpClient, @@ -241,9 +241,9 @@ def wait_for_upload_queue_empty( found = False for f in finished: if all([s.labels[label] == f.labels[label] for label in remaining_labels]): - assert ( - not found - ), "duplicate match, remaining_labels don't uniquely identify sample" + assert not found, ( + "duplicate match, remaining_labels don't uniquely identify sample" + ) tl.append((s.labels, int(s.value) - int(f.value))) found = True if not found: diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index c33342c89e..e5deb50d46 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -6,13 +6,14 @@ from typing import TYPE_CHECKING import allure import pytest import toml -from _pytest.python import Metafunc from fixtures.pg_version import PgVersion if TYPE_CHECKING: from typing import Any + from _pytest.python import Metafunc + """ Dynamically parametrize tests by different parameters diff --git a/test_runner/fixtures/paths.py b/test_runner/fixtures/paths.py index fc4fb3629b..ddf2e8f4f0 100644 --- a/test_runner/fixtures/paths.py +++ b/test_runner/fixtures/paths.py @@ -6,7 +6,6 @@ import subprocess import threading from fcntl import LOCK_EX, LOCK_UN, flock from pathlib import Path -from types import TracebackType from typing import TYPE_CHECKING import pytest @@ -18,6 +17,7 @@ 
from fixtures.utils import allure_attach_from_dir if TYPE_CHECKING: from collections.abc import Iterator + from types import TracebackType BASE_DIR = Path(__file__).parents[2] @@ -101,9 +101,9 @@ def compatibility_snapshot_dir() -> Iterator[Path]: if os.getenv("REMOTE_ENV"): return compatibility_snapshot_dir_env = os.environ.get("COMPATIBILITY_SNAPSHOT_DIR") - assert ( - compatibility_snapshot_dir_env is not None - ), "COMPATIBILITY_SNAPSHOT_DIR is not set. It should be set to `compatibility_snapshot_pg(PG_VERSION)` path generateted by test_create_snapshot (ideally generated by the previous version of Neon)" + assert compatibility_snapshot_dir_env is not None, ( + "COMPATIBILITY_SNAPSHOT_DIR is not set. It should be set to `compatibility_snapshot_pg(PG_VERSION)` path generateted by test_create_snapshot (ideally generated by the previous version of Neon)" + ) compatibility_snapshot_dir = Path(compatibility_snapshot_dir_env).resolve() yield compatibility_snapshot_dir diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index cac84c07e7..f98ac4b92e 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -7,22 +7,24 @@ import os import re from dataclasses import dataclass from enum import StrEnum -from pathlib import Path from typing import TYPE_CHECKING import boto3 import toml from moto.server import ThreadedMotoServer -from mypy_boto3_s3 import S3Client from typing_extensions import override -from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.log_helper import log from fixtures.pageserver.common_types import IndexPartDump if TYPE_CHECKING: + from pathlib import Path from typing import Any + from mypy_boto3_s3 import S3Client + + from fixtures.common_types import TenantId, TenantShardId, TimelineId + TIMELINE_INDEX_PART_FILE_NAME = "index_part.json" TENANT_HEATMAP_FILE_NAME = "heatmap-v1.json" @@ -448,9 +450,9 @@ class RemoteStorageKind(StrEnum): 
env_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY") env_access_token = os.getenv("AWS_SESSION_TOKEN") env_profile = os.getenv("AWS_PROFILE") - assert ( - env_access_key and env_secret_key and env_access_token - ) or env_profile, "need to specify either access key and secret access key or profile" + assert (env_access_key and env_secret_key and env_access_token) or env_profile, ( + "need to specify either access key and secret access key or profile" + ) bucket_name = bucket_name or os.getenv("REMOTE_STORAGE_S3_BUCKET") assert bucket_name is not None, "no remote storage bucket name provided" diff --git a/test_runner/fixtures/reruns.py b/test_runner/fixtures/reruns.py index f2a25ae8f6..a68b287f8a 100644 --- a/test_runner/fixtures/reruns.py +++ b/test_runner/fixtures/reruns.py @@ -3,12 +3,11 @@ from __future__ import annotations from collections.abc import MutableMapping from typing import TYPE_CHECKING, cast -import pytest - if TYPE_CHECKING: from collections.abc import MutableMapping from typing import Any + import pytest from _pytest.config import Config diff --git a/test_runner/fixtures/safekeeper/utils.py b/test_runner/fixtures/safekeeper/utils.py index 922cdedccc..5608b8504e 100644 --- a/test_runner/fixtures/safekeeper/utils.py +++ b/test_runner/fixtures/safekeeper/utils.py @@ -1,10 +1,14 @@ from __future__ import annotations -from fixtures.common_types import TenantId, TimelineId +from typing import TYPE_CHECKING + from fixtures.log_helper import log -from fixtures.safekeeper.http import SafekeeperHttpClient from fixtures.utils import wait_until +if TYPE_CHECKING: + from fixtures.common_types import TenantId, TimelineId + from fixtures.safekeeper.http import SafekeeperHttpClient + def wait_walreceivers_absent( sk_http_cli: SafekeeperHttpClient, tenant_id: TenantId, timeline_id: TimelineId diff --git a/test_runner/fixtures/slow.py b/test_runner/fixtures/slow.py index 4c6372d515..d6f74b2b7f 100644 --- a/test_runner/fixtures/slow.py +++ 
b/test_runner/fixtures/slow.py @@ -3,12 +3,13 @@ from __future__ import annotations from typing import TYPE_CHECKING import pytest -from _pytest.config import Config -from _pytest.config.argparsing import Parser if TYPE_CHECKING: from typing import Any + from _pytest.config import Config + from _pytest.config.argparsing import Parser + """ This plugin allows tests to be marked as slow using pytest.mark.slow. By default slow diff --git a/test_runner/fixtures/storage_controller_proxy.py b/test_runner/fixtures/storage_controller_proxy.py index be95a98ff9..ca3ad43774 100644 --- a/test_runner/fixtures/storage_controller_proxy.py +++ b/test_runner/fixtures/storage_controller_proxy.py @@ -5,9 +5,7 @@ from typing import TYPE_CHECKING import pytest import requests -from pytest_httpserver import HTTPServer from werkzeug.datastructures import Headers -from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response from fixtures.log_helper import log @@ -15,6 +13,9 @@ from fixtures.log_helper import log if TYPE_CHECKING: from typing import Any + from pytest_httpserver import HTTPServer + from werkzeug.wrappers.request import Request + class StorageControllerProxy: def __init__(self, server: HTTPServer): diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index d1b2a5a400..13c2d320d1 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -19,7 +19,6 @@ from urllib.parse import urlencode import allure import pytest import zstandard -from psycopg2.extensions import cursor from typing_extensions import override from fixtures.common_types import Id, Lsn @@ -34,6 +33,8 @@ if TYPE_CHECKING: from collections.abc import Iterable from typing import IO + from psycopg2.extensions import cursor + from fixtures.common_types import TimelineId from fixtures.neon_fixtures import PgBin @@ -512,7 +513,9 @@ def assert_no_errors(log_file: Path, service: str, allowed_errors: list[str]): for _lineno, error in errors: 
log.info(f"not allowed {service} error: {error.strip()}") - assert not errors, f"First log error on {service}: {errors[0]}\nHint: use scripts/check_allowed_errors.sh to test any new allowed_error you add" + assert not errors, ( + f"First log error on {service}: {errors[0]}\nHint: use scripts/check_allowed_errors.sh to test any new allowed_error you add" + ) def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: set[str]): @@ -550,18 +553,18 @@ def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: set[str left_list, right_list = map(build_hash_list, [left, right]) - assert len(left_list) == len( - right_list - ), f"unexpected number of files on tar files, {len(left_list)} != {len(right_list)}" + assert len(left_list) == len(right_list), ( + f"unexpected number of files on tar files, {len(left_list)} != {len(right_list)}" + ) mismatching: set[str] = set() for left_tuple, right_tuple in zip(left_list, right_list, strict=False): left_path, left_hash = left_tuple right_path, right_hash = right_tuple - assert ( - left_path == right_path - ), f"file count matched, expected these to be same paths: {left_path}, {right_path}" + assert left_path == right_path, ( + f"file count matched, expected these to be same paths: {left_path}, {right_path}" + ) if left_hash != right_hash: mismatching.add(left_path) @@ -721,3 +724,20 @@ def skip_on_ci(reason: str): os.getenv("CI", "false") == "true", reason=reason, ) + + +def shared_buffers_for_max_cu(max_cu: float) -> str: + """ + Returns the string value of shared_buffers for the given max CU. + Use shared_buffers size like in production for max CU compute. + See https://github.com/neondatabase/cloud/blob/877e33b4289a471b8f0a35c84009846358f3e5a3/goapp/controlplane/internal/pkg/compute/computespec/pg_settings.go#L405 + + e.g. 
// 2 CU: 225mb; 4 CU: 450mb; 8 CU: 900mb + """ + ramBytes = int(4096 * max_cu * 1024 * 1024) + maxConnections = max(100, min(int(ramBytes / 9531392), 4000)) + maxWorkerProcesses = 12 + int(max_cu * 2) + maxBackends = 1 + maxConnections + maxWorkerProcesses + sharedBuffersMb = int(max(128, (1023 + maxBackends * 256) / 1024)) + sharedBuffers = int(sharedBuffersMb * 1024 / 8) + return str(sharedBuffers) diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index 1947a9c3fb..e17a8e989b 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -3,7 +3,6 @@ from __future__ import annotations import threading from typing import TYPE_CHECKING -from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -17,6 +16,8 @@ from fixtures.pageserver.utils import wait_for_last_record_lsn if TYPE_CHECKING: from typing import Any + from fixtures.common_types import TenantId, TimelineId + # neon_local doesn't handle creating/modifying endpoints concurrently, so we use a mutex # to ensure we don't do that: this enables running lots of Workloads in parallel safely. 
ENDPOINT_LOCK = threading.Lock() diff --git a/test_runner/logical_repl/test_clickhouse.py b/test_runner/logical_repl/test_clickhouse.py index 6b522fa46d..c05684baf9 100644 --- a/test_runner/logical_repl/test_clickhouse.py +++ b/test_runner/logical_repl/test_clickhouse.py @@ -7,14 +7,17 @@ from __future__ import annotations import hashlib import os import time +from typing import TYPE_CHECKING import clickhouse_connect import psycopg2 import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import RemotePostgres from fixtures.utils import wait_until +if TYPE_CHECKING: + from fixtures.neon_fixtures import RemotePostgres + def query_clickhouse( client, diff --git a/test_runner/logical_repl/test_debezium.py b/test_runner/logical_repl/test_debezium.py index 8023d64d3d..a53e6cef92 100644 --- a/test_runner/logical_repl/test_debezium.py +++ b/test_runner/logical_repl/test_debezium.py @@ -7,14 +7,17 @@ from __future__ import annotations import json import os import time +from typing import TYPE_CHECKING import psycopg2 import pytest import requests from fixtures.log_helper import log -from fixtures.neon_fixtures import RemotePostgres from fixtures.utils import wait_until +if TYPE_CHECKING: + from fixtures.neon_fixtures import RemotePostgres + class DebeziumAPI: """ diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 3dbbb197f4..8874fe663b 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -10,14 +10,13 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + PageserverTracingConfig, PgBin, wait_for_last_flush_lsn, ) from fixtures.utils import get_scale_for_db, 
humantime_to_ms, skip_on_ci -from performance.pageserver.util import ( - setup_pageserver_with_tenants, -) +from performance.pageserver.util import setup_pageserver_with_tenants if TYPE_CHECKING: from typing import Any @@ -113,6 +112,15 @@ def setup_and_run_pagebench_benchmark( neon_env_builder.pageserver_config_override = ( f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}" ) + + tracing_config = PageserverTracingConfig( + sampling_ratio=(0, 1000), + endpoint="http://localhost:4318/v1/traces", + protocol="http-binary", + timeout="10s", + ) + neon_env_builder.pageserver_tracing_config = tracing_config + ratio = tracing_config.sampling_ratio[0] / tracing_config.sampling_ratio[1] params.update( { "pageserver_config_override.page_cache_size": ( @@ -120,20 +128,18 @@ def setup_and_run_pagebench_benchmark( {"unit": "byte"}, ), "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}), + "pageserver_config_override.sampling_ratio": (ratio, {"unit": ""}), } ) for param, (value, kwargs) in params.items(): record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs) - def setup_wrapper(env: NeonEnv): - return setup_tenant_template(env, pg_bin, pgbench_scale) - env = setup_pageserver_with_tenants( neon_env_builder, f"max_throughput_latest_lsn-{n_tenants}-{pgbench_scale}", n_tenants, - setup_wrapper, + lambda env: setup_tenant_template(env, pg_bin, pgbench_scale), # https://github.com/neondatabase/neon/issues/8070 timeout_in_seconds=60, ) @@ -160,14 +166,8 @@ def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int): "gc_period": "0s", # disable periodic gc "checkpoint_timeout": "10 years", "compaction_period": "0s", # disable periodic compaction - "compaction_threshold": 10, - "compaction_target_size": 134217728, - "checkpoint_distance": 268435456, - "image_creation_threshold": 3, } - template_tenant, template_timeline = env.create_tenant(set_default=True) - 
env.pageserver.tenant_detach(template_tenant) - env.pageserver.tenant_attach(template_tenant, config) + template_tenant, template_timeline = env.create_tenant(set_default=True, conf=config) ps_http = env.pageserver.http_client() with env.endpoints.create_start("main", tenant_id=template_tenant) as ep: pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", "-I", "dtGvp", ep.connstr()]) diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py index bcc3db69f0..7a6d88f79c 100644 --- a/test_runner/performance/pageserver/util.py +++ b/test_runner/performance/pageserver/util.py @@ -7,18 +7,19 @@ from __future__ import annotations from typing import TYPE_CHECKING import fixtures.pageserver.many_tenants as many_tenants -from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - NeonEnv, - NeonEnvBuilder, -) from fixtures.pageserver.utils import wait_until_all_tenants_state if TYPE_CHECKING: from collections.abc import Callable from typing import Any + from fixtures.common_types import TenantId, TimelineId + from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + ) + def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int): """ diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index cf2212d447..b2bd94fae7 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -7,16 +7,19 @@ import threading import time import timeit from contextlib import closing +from typing import TYPE_CHECKING import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.common_types import Lsn -from fixtures.compare_fixtures import NeonCompare from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonPageserver from fixtures.pageserver.utils import wait_for_last_record_lsn from 
fixtures.utils import wait_until -from prometheus_client.samples import Sample + +if TYPE_CHECKING: + from fixtures.compare_fixtures import NeonCompare + from fixtures.neon_fixtures import NeonPageserver + from prometheus_client.samples import Sample def _record_branch_creation_durations(neon_compare: NeonCompare, durs: list[float]): @@ -45,9 +48,9 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int) tenant, _ = env.create_tenant( conf={ "gc_period": "5 s", - "gc_horizon": f"{4 * 1024 ** 2}", - "checkpoint_distance": f"{2 * 1024 ** 2}", - "compaction_target_size": f"{1024 ** 2}", + "gc_horizon": f"{4 * 1024**2}", + "checkpoint_distance": f"{2 * 1024**2}", + "compaction_target_size": f"{1024**2}", "compaction_threshold": "2", # set PITR interval to be small, so we can do GC "pitr_interval": "5 s", @@ -82,10 +85,10 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int) env.create_branch(f"b{i + 1}", ancestor_branch_name=f"b{p}", tenant_id=tenant) dur = timeit.default_timer() - timer - log.info(f"Creating branch b{i+1} took {dur}s") + log.info(f"Creating branch b{i + 1} took {dur}s") branch_creation_durations.append(dur) - threads.append(threading.Thread(target=run_pgbench, args=(f"b{i+1}",), daemon=True)) + threads.append(threading.Thread(target=run_pgbench, args=(f"b{i + 1}",), daemon=True)) threads[-1].start() for thread in threads: diff --git a/test_runner/performance/test_branching.py b/test_runner/performance/test_branching.py index dbff116360..1b29dab288 100644 --- a/test_runner/performance/test_branching.py +++ b/test_runner/performance/test_branching.py @@ -2,13 +2,16 @@ from __future__ import annotations import timeit from pathlib import Path +from typing import TYPE_CHECKING from fixtures.benchmark_fixture import PgBenchRunResult -from fixtures.compare_fixtures import NeonCompare from fixtures.neon_fixtures import fork_at_current_lsn from performance.test_perf_pgbench import utc_now_timestamp +if 
TYPE_CHECKING: + from fixtures.compare_fixtures import NeonCompare + # ----------------------------------------------------------------------- # Start of `test_compare_child_and_root_*` tests # ----------------------------------------------------------------------- diff --git a/test_runner/performance/test_bulk_tenant_create.py b/test_runner/performance/test_bulk_tenant_create.py index 15a03ba456..4307e815d2 100644 --- a/test_runner/performance/test_bulk_tenant_create.py +++ b/test_runner/performance/test_bulk_tenant_create.py @@ -1,10 +1,13 @@ from __future__ import annotations import timeit +from typing import TYPE_CHECKING import pytest from fixtures.benchmark_fixture import MetricReport -from fixtures.neon_fixtures import NeonEnvBuilder + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder # Run bulk tenant creation test. # diff --git a/test_runner/performance/test_bulk_update.py b/test_runner/performance/test_bulk_update.py index 6946bc66f2..16606268f4 100644 --- a/test_runner/performance/test_bulk_update.py +++ b/test_runner/performance/test_bulk_update.py @@ -2,6 +2,7 @@ from __future__ import annotations import pytest from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn +from fixtures.utils import shared_buffers_for_max_cu # @@ -20,7 +21,10 @@ def test_bulk_update(neon_env_builder: NeonEnvBuilder, zenbenchmark, fillfactor) timeline_id = env.create_branch("test_bulk_update") tenant_id = env.initial_tenant - endpoint = env.endpoints.create_start("test_bulk_update") + # use shared_buffers size like in production for 8 CU compute + endpoint = env.endpoints.create_start( + "test_bulk_update", config_lines=[f"shared_buffers={shared_buffers_for_max_cu(8.0)}"] + ) cur = endpoint.connect().cursor() cur.execute("set statement_timeout=0") diff --git a/test_runner/performance/test_compaction.py b/test_runner/performance/test_compaction.py index eaa89ae754..3df7710494 100644 --- a/test_runner/performance/test_compaction.py +++ 
b/test_runner/performance/test_compaction.py @@ -1,12 +1,15 @@ from __future__ import annotations from contextlib import closing +from typing import TYPE_CHECKING import pytest -from fixtures.compare_fixtures import NeonCompare from fixtures.log_helper import log from fixtures.neon_fixtures import wait_for_last_flush_lsn +if TYPE_CHECKING: + from fixtures.compare_fixtures import NeonCompare + # # Test compaction and image layer creation performance. diff --git a/test_runner/performance/test_compare_pg_stats.py b/test_runner/performance/test_compare_pg_stats.py index a86995d6d3..c6289e39e1 100644 --- a/test_runner/performance/test_compare_pg_stats.py +++ b/test_runner/performance/test_compare_pg_stats.py @@ -3,13 +3,16 @@ from __future__ import annotations import os import threading import time +from typing import TYPE_CHECKING import pytest -from fixtures.compare_fixtures import PgCompare -from fixtures.pg_stats import PgStatTable from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix +if TYPE_CHECKING: + from fixtures.compare_fixtures import PgCompare + from fixtures.pg_stats import PgStatTable + def get_seeds_matrix(default: int = 100): seeds = os.getenv("TEST_PG_BENCH_SEEDS_MATRIX", default=str(default)) diff --git a/test_runner/performance/test_compute_ctl_api.py b/test_runner/performance/test_compute_ctl_api.py index d6d0a84e8e..fc65cb969d 100644 --- a/test_runner/performance/test_compute_ctl_api.py +++ b/test_runner/performance/test_compute_ctl_api.py @@ -1,10 +1,13 @@ from __future__ import annotations import datetime +from typing import TYPE_CHECKING import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker -from fixtures.neon_fixtures import NeonEnv + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv @pytest.mark.timeout(120) diff --git a/test_runner/performance/test_compute_startup.py b/test_runner/performance/test_compute_startup.py new file mode 100644 index 0000000000..abedb4be27 --- 
/dev/null +++ b/test_runner/performance/test_compute_startup.py @@ -0,0 +1,244 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest +import requests +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder, PgBin + + +# Just start and measure duration. +# +# This test runs pretty quickly and can be informative when used in combination +# with emulated network delay. Some useful delay commands: +# +# 1. Add 2msec delay to all localhost traffic +# `sudo tc qdisc add dev lo root handle 1:0 netem delay 2msec` +# +# 2. Test that it works (you should see 4ms ping) +# `ping localhost` +# +# 3. Revert back to normal +# `sudo tc qdisc del dev lo root netem` +# +# NOTE this test might not represent the real startup time because the basebackup +# for a large database might be larger if there's a lot of transaction metadata, +# or safekeepers might need more syncing, or there might be more operations to +# apply during config step, like more users, databases, or extensions. By default +# we load extensions 'neon,pg_stat_statements,timescaledb,pg_cron', but in this +# test we only load neon. +def test_compute_startup_simple( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, +): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + env.create_branch("test_startup") + + endpoint = None + + # We do two iterations so we can see if the second startup is faster. It should + # be because the compute node should already be configured with roles, databases, + # extensions, etc from the first run. + for i in range(2): + # Start + with zenbenchmark.record_duration(f"{i}_start_and_select"): + if endpoint: + endpoint.start() + else: + endpoint = env.endpoints.create( + "test_startup", + # Shared buffers need to be allocated during startup, so they + # impact startup time.
This is the default value we use for + # 1CPU pods (maybe different for VMs). + # + # TODO extensions also contribute to shared memory allocation, + # and this test doesn't include all default extensions we + # load. + config_lines=["shared_buffers=262144"], + ) + # Do not skip pg_catalog updates at first start, i.e. + # imitate 'the first start after project creation'. + endpoint.respec(skip_pg_catalog_updates=False) + endpoint.start() + endpoint.safe_psql("select 1;") + + # Get metrics + metrics = requests.get( + f"http://localhost:{endpoint.external_http_port}/metrics.json" + ).json() + durations = { + "wait_for_spec_ms": f"{i}_wait_for_spec", + "sync_safekeepers_ms": f"{i}_sync_safekeepers", + "sync_sk_check_ms": f"{i}_sync_sk_check", + "basebackup_ms": f"{i}_basebackup", + "start_postgres_ms": f"{i}_start_postgres", + "config_ms": f"{i}_config", + "total_startup_ms": f"{i}_total_startup", + } + for key, name in durations.items(): + value = metrics[key] + zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER) + + # Check basebackup size makes sense + basebackup_bytes = metrics["basebackup_bytes"] + if i > 0: + assert basebackup_bytes < 100 * 1024 + + # Stop so we can restart + endpoint.stop() + + # Imitate optimizations that console would do for the second start + endpoint.respec(skip_pg_catalog_updates=True) + + +# Start and measure duration with huge SLRU segments. +# This test is similar to test_compute_startup_simple, but it creates a huge number of transactions +# and records containing these XIDs. Autovacuum is disabled for the table to prevent CLOG truncation. +# TODO: this is a very suspicious test, I doubt that it does what it's supposed to do, +# e.g. these two starts do not make much sense. Looks like it's just copy-paste.
+# To be fixed within https://github.com/neondatabase/cloud/issues/8673 +@pytest.mark.timeout(1800) +@pytest.mark.parametrize("slru", ["lazy", "eager"]) +def test_compute_ondemand_slru_startup( + slru: str, neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker +): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + lazy_slru_download = "true" if slru == "lazy" else "false" + tenant, _ = env.create_tenant( + conf={ + "lazy_slru_download": lazy_slru_download, + } + ) + + endpoint = env.endpoints.create_start("main", tenant_id=tenant) + with endpoint.cursor() as cur: + cur.execute("CREATE TABLE t (pk integer PRIMARY KEY, x integer)") + cur.execute("ALTER TABLE t SET (autovacuum_enabled = false)") + cur.execute("INSERT INTO t VALUES (1, 0)") + cur.execute( + """ + CREATE PROCEDURE updating() as + $$ + DECLARE + i integer; + BEGIN + FOR i IN 1..1000000 LOOP + UPDATE t SET x = x + 1 WHERE pk=1; + COMMIT; + END LOOP; + END + $$ LANGUAGE plpgsql + """ + ) + cur.execute("SET statement_timeout=0") + cur.execute("call updating()") + + endpoint.stop() + + # We do two iterations so we can see if the second startup is faster. It should + # be because the compute node should already be configured with roles, databases, + # extensions, etc from the first run. 
+ for i in range(2): + # Start + with zenbenchmark.record_duration(f"{slru}_{i}_start"): + endpoint.start() + + with zenbenchmark.record_duration(f"{slru}_{i}_select"): + sum = endpoint.safe_psql("select sum(x) from t")[0][0] + assert sum == 1000000 + + # Get metrics + metrics = requests.get( + f"http://localhost:{endpoint.external_http_port}/metrics.json" + ).json() + durations = { + "wait_for_spec_ms": f"{slru}_{i}_wait_for_spec", + "sync_safekeepers_ms": f"{slru}_{i}_sync_safekeepers", + "sync_sk_check_ms": f"{slru}_{i}_sync_sk_check", + "basebackup_ms": f"{slru}_{i}_basebackup", + "start_postgres_ms": f"{slru}_{i}_start_postgres", + "config_ms": f"{slru}_{i}_config", + "total_startup_ms": f"{slru}_{i}_total_startup", + } + for key, name in durations.items(): + value = metrics[key] + zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER) + + basebackup_bytes = metrics["basebackup_bytes"] + zenbenchmark.record( + f"{slru}_{i}_basebackup_bytes", + basebackup_bytes, + "bytes", + report=MetricReport.LOWER_IS_BETTER, + ) + + # Stop so we can restart + endpoint.stop() + + # Imitate optimizations that console would do for the second start + endpoint.respec(skip_pg_catalog_updates=True) + + +@pytest.mark.timeout(240) +def test_compute_startup_latency( + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, + zenbenchmark: NeonBenchmarker, +): + """ + Do NUM_STARTS 'optimized' starts, i.e. with pg_catalog updates skipped, + and measure the duration of each step. Report p50, p90, p99 latencies. 
+ """ + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + endpoint = env.endpoints.create_start("main") + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s4", endpoint.connstr()]) + endpoint.stop() + + NUM_STARTS = 100 + + durations: dict[str, list[int]] = { + "sync_sk_check_ms": [], + "sync_safekeepers_ms": [], + "basebackup_ms": [], + "start_postgres_ms": [], + "total_startup_ms": [], + } + + for _i in range(NUM_STARTS): + endpoint.start() + client = endpoint.http_client() + metrics = client.metrics_json() + for key in durations.keys(): + value = metrics[key] + durations[key].append(value) + endpoint.stop() + + for key in durations.keys(): + durations[key] = sorted(durations[key]) + zenbenchmark.record( + f"{key}_p50", + durations[key][len(durations[key]) // 2], + "ms", + report=MetricReport.LOWER_IS_BETTER, + ) + zenbenchmark.record( + f"{key}_p90", + durations[key][len(durations[key]) * 9 // 10], + "ms", + report=MetricReport.LOWER_IS_BETTER, + ) + zenbenchmark.record( + f"{key}_p99", + durations[key][len(durations[key]) * 99 // 100], + "ms", + report=MetricReport.LOWER_IS_BETTER, + ) diff --git a/test_runner/performance/test_copy.py b/test_runner/performance/test_copy.py index 0e56fdc96f..8535e6843d 100644 --- a/test_runner/performance/test_copy.py +++ b/test_runner/performance/test_copy.py @@ -2,11 +2,13 @@ from __future__ import annotations from contextlib import closing from io import BufferedReader, RawIOBase -from typing import final +from typing import TYPE_CHECKING, final -from fixtures.compare_fixtures import PgCompare from typing_extensions import override +if TYPE_CHECKING: + from fixtures.compare_fixtures import PgCompare + @final class CopyTestData(RawIOBase): diff --git a/test_runner/performance/test_dup_key.py b/test_runner/performance/test_dup_key.py index f7e4a629d6..34a10499e7 100644 --- a/test_runner/performance/test_dup_key.py +++ b/test_runner/performance/test_dup_key.py @@ -1,11 +1,14 @@ from __future__ 
import annotations from contextlib import closing +from typing import TYPE_CHECKING import pytest -from fixtures.compare_fixtures import PgCompare from pytest_lazyfixture import lazy_fixture +if TYPE_CHECKING: + from fixtures.compare_fixtures import PgCompare + @pytest.mark.parametrize( "env", diff --git a/test_runner/performance/test_gc_feedback.py b/test_runner/performance/test_gc_feedback.py index 7c9e9f47c8..804933d3a5 100644 --- a/test_runner/performance/test_gc_feedback.py +++ b/test_runner/performance/test_gc_feedback.py @@ -1,11 +1,14 @@ from __future__ import annotations import json +from typing import TYPE_CHECKING import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, mode: str): @@ -18,9 +21,9 @@ def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma # disable default GC and compaction "gc_period": "1000 m", "compaction_period": "0 s", - "gc_horizon": f"{1024 ** 2}", - "checkpoint_distance": f"{1024 ** 2}", - "compaction_target_size": f"{1024 ** 2}", + "gc_horizon": f"{1024**2}", + "checkpoint_distance": f"{1024**2}", + "compaction_target_size": f"{1024**2}", # set PITR interval to be small, so we can do GC "pitr_interval": "10 s", # "compaction_threshold": "3", diff --git a/test_runner/performance/test_gist_build.py b/test_runner/performance/test_gist_build.py index e8ef59722d..b82f3ba9d4 100644 --- a/test_runner/performance/test_gist_build.py +++ b/test_runner/performance/test_gist_build.py @@ -1,8 +1,10 @@ from __future__ import annotations from contextlib import closing +from typing import TYPE_CHECKING -from fixtures.compare_fixtures import PgCompare +if TYPE_CHECKING: + from fixtures.compare_fixtures import PgCompare # diff --git 
a/test_runner/performance/test_hot_page.py b/test_runner/performance/test_hot_page.py index d025566919..a2b109bc59 100644 --- a/test_runner/performance/test_hot_page.py +++ b/test_runner/performance/test_hot_page.py @@ -1,11 +1,14 @@ from __future__ import annotations from contextlib import closing +from typing import TYPE_CHECKING import pytest -from fixtures.compare_fixtures import PgCompare from pytest_lazyfixture import lazy_fixture +if TYPE_CHECKING: + from fixtures.compare_fixtures import PgCompare + @pytest.mark.parametrize( "env", diff --git a/test_runner/performance/test_hot_table.py b/test_runner/performance/test_hot_table.py index 792d35321d..f4ea52dedc 100644 --- a/test_runner/performance/test_hot_table.py +++ b/test_runner/performance/test_hot_table.py @@ -1,11 +1,14 @@ from __future__ import annotations from contextlib import closing +from typing import TYPE_CHECKING import pytest -from fixtures.compare_fixtures import PgCompare from pytest_lazyfixture import lazy_fixture +if TYPE_CHECKING: + from fixtures.compare_fixtures import PgCompare + @pytest.mark.parametrize( "env", diff --git a/test_runner/performance/test_ingest_insert_bulk.py b/test_runner/performance/test_ingest_insert_bulk.py index 283bcada31..01836b82e9 100644 --- a/test_runner/performance/test_ingest_insert_bulk.py +++ b/test_runner/performance/test_ingest_insert_bulk.py @@ -17,9 +17,10 @@ from fixtures.pageserver.utils import ( wait_for_upload_queue_empty, ) from fixtures.remote_storage import s3_storage +from fixtures.utils import shared_buffers_for_max_cu -@pytest.mark.timeout(900) +@pytest.mark.timeout(1800) @pytest.mark.parametrize("size", [8, 1024, 8192]) @pytest.mark.parametrize("s3", [True, False], ids=["s3", "local"]) @pytest.mark.parametrize("backpressure", [True, False], ids=["backpressure", "nobackpressure"]) @@ -60,6 +61,8 @@ def test_ingest_insert_bulk( f"fsync = {fsync}", "max_replication_apply_lag = 0", f"max_replication_flush_lag = {'10GB' if backpressure else '0'}", + 
# use shared_buffers size like in production for 8 CU compute + f"shared_buffers={shared_buffers_for_max_cu(8.0)}", # NB: neon_local defaults to 15MB, which is too slow -- production uses 500MB. f"max_replication_write_lag = {'500MB' if backpressure else '0'}", ], diff --git a/test_runner/performance/test_ingest_logical_message.py b/test_runner/performance/test_ingest_logical_message.py index b55cb68b64..bc16e3964d 100644 --- a/test_runner/performance/test_ingest_logical_message.py +++ b/test_runner/performance/test_ingest_logical_message.py @@ -12,7 +12,7 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.utils import wait_for_last_record_lsn -@pytest.mark.timeout(600) +@pytest.mark.timeout(1200) @pytest.mark.parametrize("size", [1024, 8192, 131072]) @pytest.mark.parametrize("fsync", [True, False], ids=["fsync", "nofsync"]) def test_ingest_logical_message( diff --git a/test_runner/performance/test_latency.py b/test_runner/performance/test_latency.py index 133a2cfd8a..0431f0bf42 100644 --- a/test_runner/performance/test_latency.py +++ b/test_runner/performance/test_latency.py @@ -1,18 +1,21 @@ from __future__ import annotations import threading +from typing import TYPE_CHECKING import pytest -from fixtures.compare_fixtures import PgCompare -from fixtures.neon_fixtures import PgProtocol from performance.test_perf_pgbench import get_scales_matrix from performance.test_wal_backpressure import record_read_latency +if TYPE_CHECKING: + from fixtures.compare_fixtures import PgCompare + from fixtures.neon_fixtures import PgProtocol + def start_write_workload(pg: PgProtocol, scale: int = 10): with pg.connect().cursor() as cur: - cur.execute(f"create table big as select generate_series(1,{scale*100_000})") + cur.execute(f"create table big as select generate_series(1,{scale * 100_000})") # Measure latency of reads on one table, while lots of writes are happening on another table. 
@@ -24,7 +27,7 @@ def test_measure_read_latency_heavy_write_workload(neon_with_baseline: PgCompare pg = env.pg with pg.connect().cursor() as cur: - cur.execute(f"create table small as select generate_series(1,{scale*100_000})") + cur.execute(f"create table small as select generate_series(1,{scale * 100_000})") write_thread = threading.Thread(target=start_write_workload, args=(pg, scale * 100)) write_thread.start() diff --git a/test_runner/performance/test_lazy_startup.py b/test_runner/performance/test_lazy_startup.py deleted file mode 100644 index 3bf3ef890f..0000000000 --- a/test_runner/performance/test_lazy_startup.py +++ /dev/null @@ -1,110 +0,0 @@ -from __future__ import annotations - -import pytest -import requests -from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker -from fixtures.neon_fixtures import NeonEnvBuilder - - -# Start and measure duration with huge SLRU segments. -# This test is similar to test_startup_simple, but it creates huge number of transactions -# and records containing this XIDs. Autovacuum is disable for the table to prevent CLOG truncation. -# -# This test runs pretty quickly and can be informative when used in combination -# with emulated network delay. Some useful delay commands: -# -# 1. Add 2msec delay to all localhost traffic -# `sudo tc qdisc add dev lo root handle 1:0 netem delay 2msec` -# -# 2. Test that it works (you should see 4ms ping) -# `ping localhost` -# -# 3. Revert back to normal -# `sudo tc qdisc del dev lo root netem` -# -# NOTE this test might not represent the real startup time because the basebackup -# for a large database might be larger if there's a lof of transaction metadata, -# or safekeepers might need more syncing, or there might be more operations to -# apply during config step, like more users, databases, or extensions. By default -# we load extensions 'neon,pg_stat_statements,timescaledb,pg_cron', but in this -# test we only load neon. 
-@pytest.mark.timeout(1800) -@pytest.mark.parametrize("slru", ["lazy", "eager"]) -def test_lazy_startup(slru: str, neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): - neon_env_builder.num_safekeepers = 3 - env = neon_env_builder.init_start() - - lazy_slru_download = "true" if slru == "lazy" else "false" - tenant, _ = env.create_tenant( - conf={ - "lazy_slru_download": lazy_slru_download, - } - ) - - endpoint = env.endpoints.create_start("main", tenant_id=tenant) - with endpoint.cursor() as cur: - cur.execute("CREATE TABLE t (pk integer PRIMARY KEY, x integer)") - cur.execute("ALTER TABLE t SET (autovacuum_enabled = false)") - cur.execute("INSERT INTO t VALUES (1, 0)") - cur.execute( - """ - CREATE PROCEDURE updating() as - $$ - DECLARE - i integer; - BEGIN - FOR i IN 1..1000000 LOOP - UPDATE t SET x = x + 1 WHERE pk=1; - COMMIT; - END LOOP; - END - $$ LANGUAGE plpgsql - """ - ) - cur.execute("SET statement_timeout=0") - cur.execute("call updating()") - - endpoint.stop() - - # We do two iterations so we can see if the second startup is faster. It should - # be because the compute node should already be configured with roles, databases, - # extensions, etc from the first run. 
- for i in range(2): - # Start - with zenbenchmark.record_duration(f"{slru}_{i}_start"): - endpoint.start() - - with zenbenchmark.record_duration(f"{slru}_{i}_select"): - sum = endpoint.safe_psql("select sum(x) from t")[0][0] - assert sum == 1000000 - - # Get metrics - metrics = requests.get( - f"http://localhost:{endpoint.external_http_port}/metrics.json" - ).json() - durations = { - "wait_for_spec_ms": f"{slru}_{i}_wait_for_spec", - "sync_safekeepers_ms": f"{slru}_{i}_sync_safekeepers", - "sync_sk_check_ms": f"{slru}_{i}_sync_sk_check", - "basebackup_ms": f"{slru}_{i}_basebackup", - "start_postgres_ms": f"{slru}_{i}_start_postgres", - "config_ms": f"{slru}_{i}_config", - "total_startup_ms": f"{slru}_{i}_total_startup", - } - for key, name in durations.items(): - value = metrics[key] - zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER) - - basebackup_bytes = metrics["basebackup_bytes"] - zenbenchmark.record( - f"{slru}_{i}_basebackup_bytes", - basebackup_bytes, - "bytes", - report=MetricReport.LOWER_IS_BETTER, - ) - - # Stop so we can restart - endpoint.stop() - - # Imitate optimizations that console would do for the second start - endpoint.respec(skip_pg_catalog_updates=True) diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 807ed522e1..361dedc12f 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -1,7 +1,6 @@ from __future__ import annotations import time -from collections.abc import Iterator from contextlib import contextmanager from typing import TYPE_CHECKING, cast @@ -14,6 +13,7 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import logical_replication_sync if TYPE_CHECKING: + from collections.abc import Iterator from subprocess import Popen from typing import AnyStr diff --git a/test_runner/performance/test_parallel_copy.py 
b/test_runner/performance/test_parallel_copy.py index 1689755b6f..f7f20bd33e 100644 --- a/test_runner/performance/test_parallel_copy.py +++ b/test_runner/performance/test_parallel_copy.py @@ -2,8 +2,12 @@ from __future__ import annotations import asyncio from io import BytesIO +from typing import TYPE_CHECKING -from fixtures.neon_fixtures import Endpoint, NeonEnv +if TYPE_CHECKING: + from fixtures.neon_fixtures import Endpoint, NeonEnv + +from fixtures.utils import shared_buffers_for_max_cu async def repeat_bytes(buf, repetitions: int): @@ -43,7 +47,10 @@ async def parallel_load_same_table(endpoint: Endpoint, n_parallel: int): # Load data into one table with COPY TO from 5 parallel connections def test_parallel_copy(neon_simple_env: NeonEnv, n_parallel=5): env = neon_simple_env - endpoint = env.endpoints.create_start("main") + # use shared_buffers size like in production for 8 CU compute + endpoint = env.endpoints.create_start( + "main", config_lines=[f"shared_buffers={shared_buffers_for_max_cu(8.0)}"] + ) # Create test table conn = endpoint.connect() diff --git a/test_runner/performance/test_parallel_copy_to.py b/test_runner/performance/test_parallel_copy_to.py index ddee0ebcd1..0427ecaf0a 100644 --- a/test_runner/performance/test_parallel_copy_to.py +++ b/test_runner/performance/test_parallel_copy_to.py @@ -2,9 +2,11 @@ from __future__ import annotations import asyncio from io import BytesIO +from typing import TYPE_CHECKING -from fixtures.compare_fixtures import PgCompare -from fixtures.neon_fixtures import PgProtocol +if TYPE_CHECKING: + from fixtures.compare_fixtures import PgCompare + from fixtures.neon_fixtures import PgProtocol async def repeat_bytes(buf, repetitions: int): diff --git a/test_runner/performance/test_perf_ingest_using_pgcopydb.py b/test_runner/performance/test_perf_ingest_using_pgcopydb.py index da62422fca..1aff58bcc6 100644 --- a/test_runner/performance/test_perf_ingest_using_pgcopydb.py +++ 
b/test_runner/performance/test_perf_ingest_using_pgcopydb.py @@ -134,8 +134,8 @@ def run_command_and_log_output(command, log_file_path: Path): # Define a list of necessary environment variables for pgcopydb custom_env_vars = { "LD_LIBRARY_PATH": f"{os.getenv('PGCOPYDB_LIB_PATH')}:{os.getenv('PG_16_LIB_PATH')}", - "PGCOPYDB_SOURCE_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_SOURCE_CONNSTR")), - "PGCOPYDB_TARGET_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_TARGET_CONNSTR")), + "PGCOPYDB_SOURCE_PGURI": cast("str", os.getenv("BENCHMARK_INGEST_SOURCE_CONNSTR")), + "PGCOPYDB_TARGET_PGURI": cast("str", os.getenv("BENCHMARK_INGEST_TARGET_CONNSTR")), "PGOPTIONS": "-c idle_in_transaction_session_timeout=0 -c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7", } # Combine the current environment with custom variables diff --git a/test_runner/performance/test_perf_many_relations.py b/test_runner/performance/test_perf_many_relations.py index e2f0a79018..81dae53759 100644 --- a/test_runner/performance/test_perf_many_relations.py +++ b/test_runner/performance/test_perf_many_relations.py @@ -6,6 +6,7 @@ from fixtures.benchmark_fixture import NeonBenchmarker from fixtures.compare_fixtures import RemoteCompare from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.utils import shared_buffers_for_max_cu def get_num_relations(default: int = 1000) -> list[int]: @@ -78,7 +79,8 @@ def test_perf_simple_many_relations_reldir_v2( ep = env.endpoints.create_start( "main", config_lines=[ - "shared_buffers=1000MB", + # use shared_buffers size like in production for 8 CU compute + f"shared_buffers={shared_buffers_for_max_cu(8.0)}", "max_locks_per_transaction=16384", ], ) diff --git a/test_runner/performance/test_perf_olap.py b/test_runner/performance/test_perf_olap.py index bc4ab64105..90e69565ec 100644 --- a/test_runner/performance/test_perf_olap.py +++ b/test_runner/performance/test_perf_olap.py @@ -3,12 +3,15 @@ from __future__ 
import annotations import os from dataclasses import dataclass from pathlib import Path +from typing import TYPE_CHECKING import pytest -from _pytest.mark import ParameterSet -from fixtures.compare_fixtures import RemoteCompare from fixtures.log_helper import log +if TYPE_CHECKING: + from _pytest.mark import ParameterSet + from fixtures.compare_fixtures import RemoteCompare + @dataclass class LabelledQuery: diff --git a/test_runner/performance/test_perf_oltp_large_tenant.py b/test_runner/performance/test_perf_oltp_large_tenant.py index 842e6a904b..957a4ec796 100644 --- a/test_runner/performance/test_perf_oltp_large_tenant.py +++ b/test_runner/performance/test_perf_oltp_large_tenant.py @@ -4,14 +4,17 @@ import os import timeit from contextlib import closing from pathlib import Path +from typing import TYPE_CHECKING import pytest from fixtures.benchmark_fixture import PgBenchRunResult -from fixtures.compare_fixtures import PgCompare from fixtures.log_helper import log from performance.test_perf_pgbench import get_durations_matrix, utc_now_timestamp +if TYPE_CHECKING: + from fixtures.compare_fixtures import PgCompare + def get_custom_scripts( default: str = "insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4", diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index 24ff3d23fa..57889ceadf 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -6,12 +6,15 @@ import os import timeit from datetime import datetime from pathlib import Path +from typing import TYPE_CHECKING import pytest from fixtures.benchmark_fixture import MetricReport, PgBenchInitResult, PgBenchRunResult -from fixtures.compare_fixtures import PgCompare from fixtures.utils import get_scale_for_db +if TYPE_CHECKING: + from fixtures.compare_fixtures import PgCompare + @enum.unique class PgBenchLoadType(enum.Enum): diff --git 
a/test_runner/performance/test_perf_pgvector_queries.py b/test_runner/performance/test_perf_pgvector_queries.py index 4a5ea94c4b..372aab276c 100644 --- a/test_runner/performance/test_perf_pgvector_queries.py +++ b/test_runner/performance/test_perf_pgvector_queries.py @@ -1,10 +1,14 @@ from __future__ import annotations +from typing import TYPE_CHECKING + import pytest -from fixtures.compare_fixtures import PgCompare from performance.test_perf_pgbench import PgBenchLoadType, get_durations_matrix, run_test_pgbench +if TYPE_CHECKING: + from fixtures.compare_fixtures import PgCompare + # The following test runs on an existing database that has pgvector extension installed # and a table with 1 million embedding vectors loaded and indexed with HNSW. diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py index 38b04b9114..df5419f292 100644 --- a/test_runner/performance/test_physical_replication.py +++ b/test_runner/performance/test_physical_replication.py @@ -2,31 +2,34 @@ from __future__ import annotations import csv import os -import subprocess import time import traceback -from pathlib import Path from typing import TYPE_CHECKING import psycopg2 -import psycopg2.extras import pytest from fixtures.benchmark_fixture import MetricReport from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_api import connection_parameters_to_env -from fixtures.pg_version import PgVersion if TYPE_CHECKING: + import subprocess + from pathlib import Path from typing import Any from fixtures.benchmark_fixture import NeonBenchmarker from fixtures.neon_api import NeonAPI from fixtures.neon_fixtures import PgBin + from fixtures.pg_version import PgVersion # Granularity of ~0.5 sec -def measure_replication_lag(master, replica, timeout_sec=600): +def measure_replication_lag( + master: psycopg2.extensions.cursor, + replica: psycopg2.extensions.cursor, + timeout_sec: int = 600, +): start = 
time.time() master.execute("SELECT pg_current_wal_flush_lsn()") master_lsn = Lsn(master.fetchall()[0][0]) @@ -40,7 +43,7 @@ def measure_replication_lag(master, replica, timeout_sec=600): raise TimeoutError(f"Replication sync took more than {timeout_sec} sec") -def check_pgbench_still_running(pgbench): +def check_pgbench_still_running(pgbench: subprocess.Popen[str]): rc = pgbench.poll() if rc is not None: raise RuntimeError(f"Pgbench terminated early with return code {rc}") @@ -61,6 +64,8 @@ def test_ro_replica_lag( project = neon_api.create_project(pg_version) project_id = project["project"]["id"] + log.info("Project ID: {}", project_id) + log.info("Primary endpoint ID: {}", project["project"]["endpoints"][0]["id"]) neon_api.wait_for_operation_to_finish(project_id) error_occurred = False try: @@ -76,6 +81,7 @@ def test_ro_replica_lag( endpoint_type="read_only", settings={"pg_settings": {"hot_standby_feedback": "on"}}, ) + log.info("Replica endpoint ID: {}", replica["endpoint"]["id"]) replica_env = master_env.copy() replica_env["PGHOST"] = replica["endpoint"]["host"] neon_api.wait_for_operation_to_finish(project_id) @@ -186,11 +192,13 @@ def test_replication_start_stop( prefix = "pgbench_agg" num_replicas = 2 configuration_test_time_sec = 10 * 60 - pgbench_duration = f"-T{2 ** num_replicas * configuration_test_time_sec}" + pgbench_duration = f"-T{2**num_replicas * configuration_test_time_sec}" error_occurred = False project = neon_api.create_project(pg_version) project_id = project["project"]["id"] + log.info("Project ID: {}", project_id) + log.info("Primary endpoint ID: {}", project["project"]["endpoints"][0]["id"]) neon_api.wait_for_operation_to_finish(project_id) try: branch_id = project["branch"]["id"] @@ -200,15 +208,15 @@ def test_replication_start_stop( ) replicas = [] - for _ in range(num_replicas): - replicas.append( - neon_api.create_endpoint( - project_id, - branch_id, - endpoint_type="read_only", - settings={"pg_settings": {"hot_standby_feedback": 
"on"}}, - ) + for i in range(num_replicas): + replica = neon_api.create_endpoint( + project_id, + branch_id, + endpoint_type="read_only", + settings={"pg_settings": {"hot_standby_feedback": "on"}}, ) + log.info("Replica {} endpoint ID: {}", i + 1, replica["endpoint"]["id"]) + replicas.append(replica) neon_api.wait_for_operation_to_finish(project_id) replica_connstr = [ diff --git a/test_runner/performance/test_random_writes.py b/test_runner/performance/test_random_writes.py index 46848a8af8..bb8048e97d 100644 --- a/test_runner/performance/test_random_writes.py +++ b/test_runner/performance/test_random_writes.py @@ -2,11 +2,14 @@ from __future__ import annotations import random from contextlib import closing +from typing import TYPE_CHECKING from fixtures.benchmark_fixture import MetricReport -from fixtures.compare_fixtures import PgCompare from fixtures.utils import query_scalar +if TYPE_CHECKING: + from fixtures.compare_fixtures import PgCompare + # This is a clear-box test that demonstrates the worst case scenario for the # "1 segment per layer" implementation of the pageserver. 
It writes to random diff --git a/test_runner/performance/test_seqscans.py b/test_runner/performance/test_seqscans.py index 36ee4eb201..37854df1fa 100644 --- a/test_runner/performance/test_seqscans.py +++ b/test_runner/performance/test_seqscans.py @@ -4,13 +4,16 @@ from __future__ import annotations from contextlib import closing +from typing import TYPE_CHECKING import pytest from fixtures.benchmark_fixture import MetricReport -from fixtures.compare_fixtures import PgCompare from fixtures.log_helper import log from pytest_lazyfixture import lazy_fixture +if TYPE_CHECKING: + from fixtures.compare_fixtures import PgCompare + @pytest.mark.parametrize( "rows,iters,workers", diff --git a/test_runner/performance/test_sharding_autosplit.py b/test_runner/performance/test_sharding_autosplit.py index e5a9f17da8..04bebae92f 100644 --- a/test_runner/performance/test_sharding_autosplit.py +++ b/test_runner/performance/test_sharding_autosplit.py @@ -34,10 +34,13 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): neon_env_builder.num_pageservers = 8 neon_env_builder.storage_controller_config = { - # Split tenants at 500MB: it's up to the storage controller how it interprets this (logical - # sizes, physical sizes, etc). We will write this much data logically, therefore other sizes - # will reliably be greater. - "split_threshold": 1024 * 1024 * 500 + # Initial splits at 64 MB, then repeated splits at 192 MB shard sizes, which typically ends + # up with a mix of 4 and 8 shards. Often, but not always, the relation is fully extended + # to the final size before splitting. 
+ "initial_split_threshold": 64 * 1024 * 1024, + "initial_split_shards": 4, + "split_threshold": 192 * 1024 * 1024, + "max_split_shards": 16, } tenant_conf = { @@ -229,13 +232,13 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): def assert_all_split(): for tenant_id in tenants.keys(): shards = tenant_get_shards(env, tenant_id) - assert len(shards) == 8 + assert len(shards) >= 4 # This is not a wait_until, because we wanted the splits to happen _while_ pgbench is running: otherwise # this test is not properly doing its job of validating that splits work nicely under load. assert_all_split() - env.storage_controller.assert_log_contains(".*Successful auto-split.*") + env.storage_controller.assert_log_contains(".*successful auto-split .*") # Log timeline sizes, useful for debug, and implicitly validates that the shards # are available in the places the controller thinks they should be. diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py deleted file mode 100644 index 60d8b5be30..0000000000 --- a/test_runner/performance/test_startup.py +++ /dev/null @@ -1,84 +0,0 @@ -from __future__ import annotations - -import requests -from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker -from fixtures.neon_fixtures import NeonEnvBuilder - - -# Just start and measure duration. -# -# This test runs pretty quickly and can be informative when used in combination -# with emulated network delay. Some useful delay commands: -# -# 1. Add 2msec delay to all localhost traffic -# `sudo tc qdisc add dev lo root handle 1:0 netem delay 2msec` -# -# 2. Test that it works (you should see 4ms ping) -# `ping localhost` -# -# 3. 
Revert back to normal -# `sudo tc qdisc del dev lo root netem` -# -# NOTE this test might not represent the real startup time because the basebackup -# for a large database might be larger if there's a lof of transaction metadata, -# or safekeepers might need more syncing, or there might be more operations to -# apply during config step, like more users, databases, or extensions. By default -# we load extensions 'neon,pg_stat_statements,timescaledb,pg_cron', but in this -# test we only load neon. -def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): - neon_env_builder.num_safekeepers = 3 - env = neon_env_builder.init_start() - - env.create_branch("test_startup") - - endpoint = None - - # We do two iterations so we can see if the second startup is faster. It should - # be because the compute node should already be configured with roles, databases, - # extensions, etc from the first run. - for i in range(2): - # Start - with zenbenchmark.record_duration(f"{i}_start_and_select"): - if endpoint: - endpoint.start() - else: - endpoint = env.endpoints.create_start( - "test_startup", - # Shared buffers need to be allocated during startup, so they - # impact startup time. This is the default value we use for - # 1CPU pods (maybe different for VMs). - # - # TODO extensions also contribute to shared memory allocation, - # and this test doesn't include all default extensions we - # load. 
- config_lines=["shared_buffers=262144"], - ) - endpoint.safe_psql("select 1;") - - # Get metrics - metrics = requests.get( - f"http://localhost:{endpoint.external_http_port}/metrics.json" - ).json() - durations = { - "wait_for_spec_ms": f"{i}_wait_for_spec", - "sync_safekeepers_ms": f"{i}_sync_safekeepers", - "sync_sk_check_ms": f"{i}_sync_sk_check", - "basebackup_ms": f"{i}_basebackup", - "start_postgres_ms": f"{i}_start_postgres", - "config_ms": f"{i}_config", - "total_startup_ms": f"{i}_total_startup", - } - for key, name in durations.items(): - value = metrics[key] - zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER) - - # Check basebackup size makes sense - basebackup_bytes = metrics["basebackup_bytes"] - if i > 0: - assert basebackup_bytes < 100 * 1024 - - # Stop so we can restart - endpoint.stop() - - # Imitate optimizations that console would do for the second start - endpoint.respec(skip_pg_catalog_updates=True) diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index e897d53cc8..58436c4739 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -5,10 +5,10 @@ import random import time from collections import defaultdict from enum import StrEnum +from typing import TYPE_CHECKING import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineArchivalState, TimelineId -from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -22,6 +22,9 @@ from fixtures.pageserver.http import PageserverApiException, PageserverHttpClien from fixtures.pg_version import PgVersion from fixtures.utils import wait_until +if TYPE_CHECKING: + from fixtures.compute_reconfigure import ComputeReconfigure + def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[str, int]: """ 
@@ -82,6 +85,7 @@ def test_storage_controller_many_tenants( # guard against regressions in restart time. "max_offline": "30s", "max_warming_up": "300s", + "use_local_compute_notifications": False, } neon_env_builder.control_plane_hooks_api = compute_reconfigure_listener.control_plane_hooks_api @@ -170,7 +174,7 @@ def test_storage_controller_many_tenants( rss = env.storage_controller.get_metric_value("process_resident_memory_bytes") assert rss is not None - log.info(f"Resident memory: {rss} ({ rss / total_shards} per shard)") + log.info(f"Resident memory: {rss} ({rss / total_shards} per shard)") assert rss < expect_memory_per_shard * total_shards def assert_all_tenants_scheduled_in_home_az(): @@ -185,15 +189,15 @@ def test_storage_controller_many_tenants( assert preferred_az == shard["preferred_az_id"] # Attachment should be in the preferred AZ - assert shard["preferred_az_id"] == az_selector( - shard["node_attached"] - ), f"Shard {shard['tenant_shard_id']} not in {shard['preferred_az_id']}" + assert shard["preferred_az_id"] == az_selector(shard["node_attached"]), ( + f"Shard {shard['tenant_shard_id']} not in {shard['preferred_az_id']}" + ) # Secondary locations should not be in the preferred AZ for node_secondary in shard["node_secondary"]: - assert ( - shard["preferred_az_id"] != az_selector(node_secondary) - ), f"Shard {shard['tenant_shard_id']} secondary should be in {shard['preferred_az_id']}" + assert shard["preferred_az_id"] != az_selector(node_secondary), ( + f"Shard {shard['tenant_shard_id']} secondary should be in {shard['preferred_az_id']}" + ) # There should only be one secondary location (i.e. 
no migrations in flight) assert len(shard["node_secondary"]) == 1 @@ -530,9 +534,9 @@ def test_storage_controller_many_tenants( for node in nodes: if node["id"] in node_ids: checked_any = True - assert ( - node["availability"] == expected_availability - ), f"Node {node['id']} is not {expected_availability} yet: {node['availability']}" + assert node["availability"] == expected_availability, ( + f"Node {node['id']} is not {expected_availability} yet: {node['availability']}" + ) assert checked_any @@ -549,9 +553,9 @@ def test_storage_controller_many_tenants( desc = env.storage_controller.tenant_describe(tenant_id) for shard in desc["shards"]: # Attachment should be outside the AZ where we killed the pageservers - assert ( - az_selector(shard["node_attached"]) != victim_az - ), f"Shard {shard['tenant_shard_id']} still in {victim_az} (node {shard['node_attached']})" + assert az_selector(shard["node_attached"]) != victim_az, ( + f"Shard {shard['tenant_shard_id']} still in {victim_az} (node {shard['node_attached']})" + ) # Bring back the pageservers for ps in killed_pageservers: diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py index c6d795ce4d..4824fa1ba8 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -4,7 +4,6 @@ import statistics import threading import time import timeit -from collections.abc import Generator from typing import TYPE_CHECKING import pytest @@ -17,7 +16,7 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, flush_ep_to_pageserver from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix if TYPE_CHECKING: - from collections.abc import Callable + from collections.abc import Callable, Generator from typing import Any @@ -37,9 +36,9 @@ def pg_compare(request) -> Generator[PgCompare, None, None]: yield fixture else: - assert ( - len(x) == 2 - ), f"request param ({request.param}) should have a 
format of `neon_{{safekeepers_enable_fsync}}`" + assert len(x) == 2, ( + f"request param ({request.param}) should have a format of `neon_{{safekeepers_enable_fsync}}`" + ) # `NeonCompare` interface neon_env_builder = request.getfixturevalue("neon_env_builder") @@ -278,7 +277,7 @@ def record_read_latency( t2 = timeit.default_timer() log.info( - f"Executed read query {read_query}, got {cur.fetchall()}, read time {t2-t1:.2f}s" + f"Executed read query {read_query}, got {cur.fetchall()}, read time {t2 - t1:.2f}s" ) read_latencies.append(t2 - t1) except Exception as err: diff --git a/test_runner/performance/test_write_amplification.py b/test_runner/performance/test_write_amplification.py index 87824604f8..55ebd1cd23 100644 --- a/test_runner/performance/test_write_amplification.py +++ b/test_runner/performance/test_write_amplification.py @@ -14,8 +14,10 @@ from __future__ import annotations from contextlib import closing +from typing import TYPE_CHECKING -from fixtures.compare_fixtures import PgCompare +if TYPE_CHECKING: + from fixtures.compare_fixtures import PgCompare def test_write_amplification(neon_with_baseline: PgCompare): diff --git a/test_runner/pg_clients/csharp/npgsql/Dockerfile b/test_runner/pg_clients/csharp/npgsql/Dockerfile index 71717a6006..d5bb3d8685 100644 --- a/test_runner/pg_clients/csharp/npgsql/Dockerfile +++ b/test_runner/pg_clients/csharp/npgsql/Dockerfile @@ -1,4 +1,4 @@ -FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build +FROM mcr.microsoft.com/dotnet/sdk:8.0@sha256:2d7f935b8c7fe032cd3d36b5ce9c82c24413881e6dad1b4fbdf36cf369e4244f AS build WORKDIR /source COPY *.csproj . @@ -7,7 +7,7 @@ RUN dotnet restore COPY . . RUN dotnet publish -c release -o /app --no-restore -FROM mcr.microsoft.com/dotnet/runtime:8.0 +FROM mcr.microsoft.com/dotnet/runtime:8.0@sha256:a6fc92280fbf2149cd6846d39c5bf7b9b535184e470aa68ef2847b9a02f6b99e WORKDIR /app COPY --from=build /app . 
diff --git a/test_runner/pg_clients/java/jdbc/Dockerfile b/test_runner/pg_clients/java/jdbc/Dockerfile index 7c2b1b40e0..af7c833f33 100644 --- a/test_runner/pg_clients/java/jdbc/Dockerfile +++ b/test_runner/pg_clients/java/jdbc/Dockerfile @@ -1,4 +1,5 @@ -FROM openjdk:22 +# use base image openjdk:22 +FROM openjdk:22@sha256:b7d44427f4622d3f6b9a60583e5218ecfa8b4e44f3e01dfd0d9b7d7abba31c9a WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/python/asyncpg/Dockerfile b/test_runner/pg_clients/python/asyncpg/Dockerfile index f2cc37a7bb..200611189b 100644 --- a/test_runner/pg_clients/python/asyncpg/Dockerfile +++ b/test_runner/pg_clients/python/asyncpg/Dockerfile @@ -1,4 +1,5 @@ -FROM python:3.12 +# use base image python:3.12 +FROM python:3.12@sha256:4e7024df2f2099e87d0a41893c299230d2a974c3474e681b0996f141951f9817 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/python/pg8000/Dockerfile b/test_runner/pg_clients/python/pg8000/Dockerfile index ee1de20da5..97e54829aa 100644 --- a/test_runner/pg_clients/python/pg8000/Dockerfile +++ b/test_runner/pg_clients/python/pg8000/Dockerfile @@ -1,4 +1,5 @@ -FROM python:3.12 +# use base image python:3.12 +FROM python:3.12@sha256:4e7024df2f2099e87d0a41893c299230d2a974c3474e681b0996f141951f9817 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock index 0b138bf167..027be03707 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. 
-version = 3 +version = 4 [[package]] name = "addr2line" @@ -421,9 +421,9 @@ checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "openssl" -version = "0.10.70" +version = "0.10.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61cfb4e166a8bb8c9b55c500bc2308550148ece889be90f609377e58140f42c6" +checksum = "fedfea7d58a1f73118430a55da6a286e7b044961736ce96a16a17068ea25e5da" dependencies = [ "bitflags 2.6.0", "cfg-if", @@ -453,9 +453,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.105" +version = "0.9.107" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b22d5b84be05a8d6947c7cb71f7c849aa0f112acd4bf51c2a7c1c988ac0a9dc" +checksum = "8288979acd84749c744a9014b4382d42b8f7b2592847b5afb2ed29e5d16ede07" dependencies = [ "cc", "libc", diff --git a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile index 3e214de785..add7d35de5 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile +++ b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile @@ -1,4 +1,5 @@ -FROM rust:1.79 +# use base image rust:1.79 +FROM rust:1.79@sha256:9b2689d6f99ff381f178fa4361db745c8c355faecde73aa5b18b0efa84f03e62 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile index 6006e61ee2..19cca43ae2 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile @@ -1,11 +1,13 @@ -FROM swift:5.10 AS build +# use base image swift:5.10 +FROM swift:5.10@sha256:69828d795abc6aa3f7b7f7a02054ab308420f54dcc806fcbd89c54944e61a089 AS build RUN apt-get -q update && apt-get -q install -y libssl-dev WORKDIR /source COPY . . 
RUN swift build --configuration release -FROM swift:5.10 +# use base image swift:5.10 +FROM swift:5.10@sha256:69828d795abc6aa3f7b7f7a02054ab308420f54dcc806fcbd89c54944e61a089 WORKDIR /app COPY --from=build /source/.build/release . CMD ["/app/PostgresClientKitExample"] diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile index d6815fbb5f..89b91a1225 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile @@ -1,10 +1,12 @@ -FROM swift:5.10 AS build +# use base image swift:5.10 +FROM swift:5.10@sha256:69828d795abc6aa3f7b7f7a02054ab308420f54dcc806fcbd89c54944e61a089 AS build WORKDIR /source COPY . . RUN swift build --configuration release -FROM swift:5.10 +# use base image swift:5.10 +FROM swift:5.10@sha256:69828d795abc6aa3f7b7f7a02054ab308420f54dcc806fcbd89c54944e61a089 WORKDIR /app COPY --from=build /source/.build/release . CMD ["/app/PostgresNIOExample"] diff --git a/test_runner/pg_clients/test_pg_clients.py b/test_runner/pg_clients/test_pg_clients.py index ffa710da06..d402a9ffa8 100644 --- a/test_runner/pg_clients/test_pg_clients.py +++ b/test_runner/pg_clients/test_pg_clients.py @@ -3,11 +3,14 @@ from __future__ import annotations import shutil from pathlib import Path from tempfile import NamedTemporaryFile +from typing import TYPE_CHECKING import pytest -from fixtures.neon_fixtures import RemotePostgres from fixtures.utils import subprocess_capture +if TYPE_CHECKING: + from fixtures.neon_fixtures import RemotePostgres + @pytest.mark.remote_cluster @pytest.mark.parametrize( diff --git a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile index 45e8753f7e..5134ed1bfc 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile +++ b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile @@ -1,4 +1,5 @@ -FROM 
node:22 +# use base image node:22 +FROM node:22@sha256:c7fd844945a76eeaa83cb372e4d289b4a30b478a1c80e16c685b62c54156285b WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile index 45e8753f7e..5134ed1bfc 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile +++ b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile @@ -1,4 +1,5 @@ -FROM node:22 +# use base image node:22 +FROM node:22@sha256:c7fd844945a76eeaa83cb372e4d289b4a30b478a1c80e16c685b62c54156285b WORKDIR /source COPY . . diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py index 8cd49d480f..92c8ae9456 100644 --- a/test_runner/regress/test_ancestor_branch.py +++ b/test_runner/regress/test_ancestor_branch.py @@ -1,10 +1,14 @@ from __future__ import annotations +from typing import TYPE_CHECKING + from fixtures.common_types import TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.utils import query_scalar +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder + # # Create ancestor branches off the main branch. 
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index b56fcd3500..5021cc4b17 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -1,18 +1,23 @@ from __future__ import annotations -from collections.abc import Generator from dataclasses import dataclass +from typing import TYPE_CHECKING import pytest from fixtures.common_types import TenantId -from fixtures.neon_fixtures import ( - NeonEnv, - NeonEnvBuilder, -) -from fixtures.pageserver.http import TenantConfig +from fixtures.log_helper import log from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.utils import wait_until +if TYPE_CHECKING: + from collections.abc import Generator + + from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + ) + from fixtures.pageserver.http import PageserverHttpClient, TenantConfig + @pytest.fixture def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv: @@ -52,9 +57,9 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N yield NegativeTests(env, tenant_id, config_pre_detach) - assert tenant_id not in [ - TenantId(t["id"]) for t in ps_http.tenant_list() - ], "tenant should not be attached after negative test" + assert tenant_id not in [TenantId(t["id"]) for t in ps_http.tenant_list()], ( + "tenant should not be attached after negative test" + ) env.pageserver.allowed_errors.extend( [ @@ -144,7 +149,6 @@ def test_fully_custom_config(positive_env: NeonEnv): "compaction_l0_semaphore": False, "l0_flush_delay_threshold": 25, "l0_flush_stall_threshold": 42, - "l0_flush_wait_upload": True, "compaction_target_size": 1048576, "checkpoint_distance": 10000, "checkpoint_timeout": "13m", @@ -159,7 +163,6 @@ def test_fully_custom_config(positive_env: NeonEnv): "evictions_low_residence_duration_metric_threshold": "2days", "gc_horizon": 23 * (1024 * 1024), "gc_period": "2h 13m", - 
"heatmap_period": "10m", "image_creation_threshold": 7, "pitr_interval": "1m", "lagging_wal_timeout": "23m", @@ -182,44 +185,76 @@ def test_fully_custom_config(positive_env: NeonEnv): "type": "interpreted", "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}}, }, - "rel_size_v2_enabled": True, + "rel_size_v2_enabled": False, # test suite enables it by default as of https://github.com/neondatabase/neon/issues/11081, so, custom config means disabling it "gc_compaction_enabled": True, "gc_compaction_initial_threshold_kb": 1024000, "gc_compaction_ratio_percent": 200, "image_creation_preempt_threshold": 5, + "sampling_ratio": { + "numerator": 0, + "denominator": 10, + }, } vps_http = env.storage_controller.pageserver_api() + ps_http = env.pageserver.http_client() - initial_tenant_config = vps_http.tenant_config(env.initial_tenant) - assert [ - (key, val) - for key, val in initial_tenant_config.tenant_specific_overrides.items() - if val is not None - ] == [] - assert set(initial_tenant_config.effective_config.keys()) == set( - fully_custom_config.keys() - ), "ensure we cover all config options" + def get_config(client: PageserverHttpClient, tenant_id): + ignored_fields = [ + # storcon overrides this during reconciles, and + # this test triggers reconciles when we change the + # tenant config via vps_http + "heatmap_period" + ] + config = client.tenant_config(tenant_id) + for field in ignored_fields: + config.effective_config.pop(field, None) + config.tenant_specific_overrides.pop(field, None) + return config + # storcon returns its db state in GET tenant_config in both fields + # https://github.com/neondatabase/neon/issues/9621 + initial_tenant_config = get_config(vps_http, env.initial_tenant) + assert initial_tenant_config.tenant_specific_overrides == {} + assert initial_tenant_config.tenant_specific_overrides == initial_tenant_config.effective_config + + # pageserver has built-in defaults for all config options + # also self-test that our 
fully_custom_config covers all of them + initial_tenant_config = get_config(ps_http, env.initial_tenant) + assert initial_tenant_config.tenant_specific_overrides == {} + assert set(initial_tenant_config.effective_config.keys()) == set(fully_custom_config.keys()), ( + "ensure we cover all config options" + ) + + # create a new tenant to test overrides (tenant_id, _) = env.create_tenant() vps_http.set_tenant_config(tenant_id, fully_custom_config) - our_tenant_config = vps_http.tenant_config(tenant_id) - assert our_tenant_config.tenant_specific_overrides == fully_custom_config - assert set(our_tenant_config.effective_config.keys()) == set( - fully_custom_config.keys() - ), "ensure we cover all config options" - assert ( - { + + for iteration in ["first", "after-reattach"]: + log.info(f"iteration: {iteration}") + + # validate that overrides for all fields are returned by storcon + our_tenant_config = get_config(vps_http, tenant_id) + assert our_tenant_config.tenant_specific_overrides == fully_custom_config + assert our_tenant_config.tenant_specific_overrides == our_tenant_config.effective_config + + # validate that overrides for all fields reached pageserver + our_tenant_config = get_config(ps_http, tenant_id) + assert our_tenant_config.tenant_specific_overrides == fully_custom_config + assert our_tenant_config.tenant_specific_overrides == our_tenant_config.effective_config + + # some more self-validation: assert that none of the values in our + # fully custom config are the same as the default values + assert set(our_tenant_config.effective_config.keys()) == set(fully_custom_config.keys()), ( + "ensure we cover all config options" + ) + assert { k: initial_tenant_config.effective_config[k] != our_tenant_config.effective_config[k] for k in fully_custom_config.keys() - } - == {k: True for k in fully_custom_config.keys()} - ), "ensure our custom config has different values than the default config for all config options, so we know we overrode everything" + } == {k: True 
for k in fully_custom_config.keys()}, ( + "ensure our custom config has different values than the default config for all config options, so we know we overrode everything" + ) - env.pageserver.tenant_detach(tenant_id) - env.pageserver.tenant_attach(tenant_id, config=fully_custom_config) - - assert vps_http.tenant_config(tenant_id).tenant_specific_overrides == fully_custom_config - assert set(vps_http.tenant_config(tenant_id).effective_config.keys()) == set( - fully_custom_config.keys() - ), "ensure we cover all config options" + # ensure customizations survive reattach + env.pageserver.tenant_detach(tenant_id) + env.pageserver.tenant_attach(tenant_id, config=fully_custom_config) diff --git a/test_runner/regress/test_auth_broker.py b/test_runner/regress/test_auth_broker.py index 11dc7d56b5..bc3f220011 100644 --- a/test_runner/regress/test_auth_broker.py +++ b/test_runner/regress/test_auth_broker.py @@ -23,13 +23,13 @@ async def test_auth_broker_happy( # local proxy mock just echos back the request # check that we forward the correct data - assert ( - res["headers"]["authorization"] == f"Bearer {token.serialize()}" - ), "JWT should be forwarded" + assert res["headers"]["authorization"] == f"Bearer {token.serialize()}", ( + "JWT should be forwarded" + ) - assert ( - "anonymous" in res["headers"]["neon-connection-string"] - ), "conn string should be forwarded" + assert "anonymous" in res["headers"]["neon-connection-string"], ( + "conn string should be forwarded" + ) assert json.loads(res["body"]) == { "query": "foo", diff --git a/test_runner/regress/test_backpressure.py b/test_runner/regress/test_backpressure.py index c75419b786..da548721cf 100644 --- a/test_runner/regress/test_backpressure.py +++ b/test_runner/regress/test_backpressure.py @@ -3,11 +3,14 @@ from __future__ import annotations import threading import time from contextlib import closing, contextmanager +from typing import TYPE_CHECKING import psycopg2.extras import pytest from fixtures.log_helper import 
log -from fixtures.neon_fixtures import Endpoint, NeonEnvBuilder + +if TYPE_CHECKING: + from fixtures.neon_fixtures import Endpoint, NeonEnvBuilder pytest_plugins = "fixtures.neon_fixtures" diff --git a/test_runner/regress/test_bad_connection.py b/test_runner/regress/test_bad_connection.py index bfc5cb174e..d31c0c95d3 100644 --- a/test_runner/regress/test_bad_connection.py +++ b/test_runner/regress/test_bad_connection.py @@ -2,13 +2,16 @@ from __future__ import annotations import random import time +from typing import TYPE_CHECKING import psycopg2.errors import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.utils import USE_LFC +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder + @pytest.mark.timeout(600) def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): diff --git a/test_runner/regress/test_basebackup_error.py b/test_runner/regress/test_basebackup_error.py index 2dd1a88ad7..7a21712ef9 100644 --- a/test_runner/regress/test_basebackup_error.py +++ b/test_runner/regress/test_basebackup_error.py @@ -1,7 +1,11 @@ from __future__ import annotations +from typing import TYPE_CHECKING + import pytest -from fixtures.neon_fixtures import NeonEnv + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv # diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index 0e28231a86..8447c9bf2d 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -2,14 +2,17 @@ from __future__ import annotations import threading import time +from typing import TYPE_CHECKING import pytest from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv from fixtures.pageserver.http import TimelineCreate406 from fixtures.utils import query_scalar, skip_in_debug_build +if TYPE_CHECKING: + from fixtures.neon_fixtures import 
NeonEnv + # Test the GC implementation when running with branching. # This test reproduces the issue https://github.com/neondatabase/neon/issues/707. @@ -58,9 +61,9 @@ def test_branch_and_gc(neon_simple_env: NeonEnv): # disable background GC "gc_period": "0s", # small checkpoint distance to create more delta layer files - "checkpoint_distance": f"{1024 ** 2}", + "checkpoint_distance": f"{1024**2}", # set the target size to be large to allow the image layer to cover the whole key space - "compaction_target_size": f"{1024 ** 3}", + "compaction_target_size": f"{1024**3}", # tweak the default settings to allow quickly create image layers and L1 layers "compaction_period": "1 s", "compaction_threshold": "2", @@ -134,9 +137,9 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): # disable background GC "gc_period": "0s", # small checkpoint distance to create more delta layer files - "checkpoint_distance": f"{1024 ** 2}", + "checkpoint_distance": f"{1024**2}", # set the target size to be large to allow the image layer to cover the whole key space - "compaction_target_size": f"{1024 ** 3}", + "compaction_target_size": f"{1024**3}", # tweak the default settings to allow quickly create image layers and L1 layers "compaction_period": "1 s", "compaction_threshold": "2", diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index 619fc15aa3..34680cffce 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -1,12 +1,16 @@ from __future__ import annotations +from typing import TYPE_CHECKING + import pytest from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.pageserver.http import TimelineCreate406 from fixtures.utils import print_gc_result, query_scalar +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder + # # Create a couple of branches off the main branch, 
at a historical point in time. diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 85d0cfbf1d..9ce618b2ad 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -4,16 +4,11 @@ import random import threading import time from concurrent.futures import ThreadPoolExecutor +from typing import TYPE_CHECKING import pytest from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - Endpoint, - NeonEnv, - NeonEnvBuilder, - PgBin, -) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import wait_until_tenant_active from fixtures.utils import query_scalar @@ -21,6 +16,14 @@ from performance.test_perf_pgbench import get_scales_matrix from requests import RequestException from requests.exceptions import RetryError +if TYPE_CHECKING: + from fixtures.neon_fixtures import ( + Endpoint, + NeonEnv, + NeonEnvBuilder, + PgBin, + ) + # Test branch creation # @@ -43,9 +46,9 @@ def test_branching_with_pgbench( tenant, _ = env.create_tenant( conf={ "gc_period": "5 s", - "gc_horizon": f"{1024 ** 2}", - "checkpoint_distance": f"{1024 ** 2}", - "compaction_target_size": f"{1024 ** 2}", + "gc_horizon": f"{1024**2}", + "checkpoint_distance": f"{1024**2}", + "compaction_target_size": f"{1024**2}", # set PITR interval to be small, so we can do GC "pitr_interval": "5 s", } diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index d49686b57c..1209b3a818 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -124,14 +124,14 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder) # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally. 
new_tenant_timelines = env.neon_cli.timeline_list(tenant_id) - assert ( - new_tenant_timelines == old_tenant_timelines - ), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}" + assert new_tenant_timelines == old_tenant_timelines, ( + f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}" + ) timeline_dirs = [d for d in timelines_dir.iterdir()] - assert ( - timeline_dirs == initial_timeline_dirs - ), "pageserver should clean its temp timeline files on timeline creation failure" + assert timeline_dirs == initial_timeline_dirs, ( + "pageserver should clean its temp timeline files on timeline creation failure" + ) # The "exit" case is for a reproducer of issue 6007: an unclean shutdown where we can't do local fs cleanups @@ -176,14 +176,14 @@ def test_timeline_init_break_before_checkpoint_recreate( # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally. new_tenant_timelines = env.neon_cli.timeline_list(tenant_id) - assert ( - new_tenant_timelines == old_tenant_timelines - ), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}" + assert new_tenant_timelines == old_tenant_timelines, ( + f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}" + ) timeline_dirs = [d for d in timelines_dir.iterdir()] - assert ( - timeline_dirs == initial_timeline_dirs - ), "pageserver should clean its temp timeline files on timeline creation failure" + assert timeline_dirs == initial_timeline_dirs, ( + "pageserver should clean its temp timeline files on timeline creation failure" + ) # creating the branch should have worked now new_timeline_id = TimelineId( @@ -211,11 +211,11 @@ def test_timeline_create_break_after_dir_creation(neon_env_builder: NeonEnvBuild # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally. 
# "New" timeline is not present in the list, allowing pageserver to retry the same request new_tenant_timelines = env.neon_cli.timeline_list(tenant_id) - assert ( - new_tenant_timelines == old_tenant_timelines - ), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}" + assert new_tenant_timelines == old_tenant_timelines, ( + f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}" + ) timeline_dirs = [d for d in timelines_dir.iterdir()] - assert ( - timeline_dirs == initial_timeline_dirs - ), "pageserver should clean its temp timeline files on timeline creation failure" + assert timeline_dirs == initial_timeline_dirs, ( + "pageserver should clean its temp timeline files on timeline creation failure" + ) diff --git a/test_runner/regress/test_build_info_metric.py b/test_runner/regress/test_build_info_metric.py index 9a8744571a..200662efd2 100644 --- a/test_runner/regress/test_build_info_metric.py +++ b/test_runner/regress/test_build_info_metric.py @@ -1,7 +1,11 @@ from __future__ import annotations +from typing import TYPE_CHECKING + from fixtures.metrics import parse_metrics -from fixtures.neon_fixtures import NeonEnvBuilder, NeonProxy + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder, NeonProxy def test_build_info_metric(neon_env_builder: NeonEnvBuilder, link_proxy: NeonProxy): diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py index 5526b783d5..b004db310c 100644 --- a/test_runner/regress/test_change_pageserver.py +++ b/test_runner/regress/test_change_pageserver.py @@ -1,15 +1,16 @@ from __future__ import annotations import asyncio +from typing import TYPE_CHECKING from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.remote_storage import RemoteStorageKind -from werkzeug.wrappers.request import Request -from werkzeug.wrappers.response import Response + +if 
TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder -def test_change_pageserver(neon_env_builder: NeonEnvBuilder, make_httpserver): +def test_change_pageserver(neon_env_builder: NeonEnvBuilder): """ A relatively low level test of reconfiguring a compute's pageserver at runtime. Usually this is all done via the storage controller, but this test will disable the storage controller's compute @@ -23,19 +24,6 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder, make_httpserver): ) env = neon_env_builder.init_start() - neon_env_builder.control_plane_hooks_api = ( - f"http://{make_httpserver.host}:{make_httpserver.port}/" - ) - - def ignore_notify(request: Request): - # This test does direct updates to compute configuration: disable the storage controller's notification - log.info(f"Ignoring storage controller compute notification: {request.json}") - return Response(status=200) - - make_httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler( - ignore_notify - ) - env.create_branch("test_change_pageserver") endpoint = env.endpoints.create_start("test_change_pageserver") diff --git a/test_runner/regress/test_clog_truncate.py b/test_runner/regress/test_clog_truncate.py index 2ae38e6d88..1780ceb203 100644 --- a/test_runner/regress/test_clog_truncate.py +++ b/test_runner/regress/test_clog_truncate.py @@ -1,11 +1,14 @@ from __future__ import annotations import os +from typing import TYPE_CHECKING from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.utils import query_scalar, wait_until +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder + # # Test compute node start after clog truncation diff --git a/test_runner/regress/test_close_fds.py b/test_runner/regress/test_close_fds.py index c0bf7d2462..6911fbf5df 100644 --- a/test_runner/regress/test_close_fds.py +++ b/test_runner/regress/test_close_fds.py @@ -6,9 +6,12 @@ import subprocess import threading import time 
from contextlib import closing +from typing import TYPE_CHECKING from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv def lsof_path() -> str: diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index c8cce7a4e7..6789939e0c 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -166,6 +166,7 @@ def test_pageserver_compaction_preempt( @skip_in_debug_build("only run with release build") +@pytest.mark.timeout(900) # This test is slow with sanitizers enabled, especially on ARM @pytest.mark.parametrize( "with_branches", ["with_branches", "no_branches"], @@ -177,7 +178,7 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_b "compaction_period": "5s", # No PiTR interval and small GC horizon "pitr_interval": "0s", - "gc_horizon": f"{1024 ** 2}", + "gc_horizon": f"{1024**2}", "lsn_lease_length": "0s", } @@ -867,7 +868,7 @@ def test_image_layer_compression(neon_env_builder: NeonEnvBuilder, enabled: bool ) assert bytes_in is not None assert bytes_out is not None - log.info(f"Compression ratio: {bytes_out/bytes_in} ({bytes_out} in, {bytes_out} out)") + log.info(f"Compression ratio: {bytes_out / bytes_in} ({bytes_out} in, {bytes_out} out)") if enabled: # We are writing high compressible repetitive plain text, expect excellent compression diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 823f2185e4..ee96daca33 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -7,6 +7,7 @@ import subprocess import tempfile from dataclasses import dataclass from pathlib import Path +from typing import TYPE_CHECKING import fixtures.utils import pytest @@ -27,6 +28,9 @@ from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage 
from fixtures.workload import Workload +if TYPE_CHECKING: + from fixtures.compute_reconfigure import ComputeReconfigure + # # A test suite that help to prevent unintentionally breaking backward or forward compatibility between Neon releases. # - `test_create_snapshot` a script wrapped in a test that creates a data snapshot. @@ -97,7 +101,7 @@ from fixtures.workload import Workload # export CHECK_ONDISK_DATA_COMPATIBILITY=true # export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE} # export COMPATIBILITY_POSTGRES_DISTRIB_DIR=neon_previous/pg_install -# export NEON_BIN=target/release +# export NEON_BIN=target/${BUILD_TYPE} # export POSTGRES_DISTRIB_DIR=pg_install # # # Build previous version of binaries and store them somewhere: @@ -231,7 +235,9 @@ def test_backward_compatibility( else: raise - assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" + assert not breaking_changes_allowed, ( + "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" + ) @check_ondisk_data_compatibility_if_enabled @@ -243,6 +249,7 @@ def test_forward_compatibility( top_output_dir: Path, pg_version: PgVersion, compatibility_snapshot_dir: Path, + compute_reconfigure_listener: ComputeReconfigure, ): """ Test that the old binaries can read new data @@ -251,6 +258,7 @@ def test_forward_compatibility( os.environ.get("ALLOW_FORWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true" ) + neon_env_builder.control_plane_hooks_api = compute_reconfigure_listener.control_plane_hooks_api neon_env_builder.test_may_use_compatibility_snapshot_binaries = True try: @@ -259,12 +267,12 @@ def test_forward_compatibility( # Use previous version's production binaries (pageserver, safekeeper, pg_distrib_dir, etc.). # But always use the current version's neon_local binary. 
# This is because we want to test the compatibility of the data format, not the compatibility of the neon_local CLI. - assert ( - neon_env_builder.compatibility_neon_binpath is not None - ), "the environment variable COMPATIBILITY_NEON_BIN is required" - assert ( - neon_env_builder.compatibility_pg_distrib_dir is not None - ), "the environment variable COMPATIBILITY_POSTGRES_DISTRIB_DIR is required" + assert neon_env_builder.compatibility_neon_binpath is not None, ( + "the environment variable COMPATIBILITY_NEON_BIN is required" + ) + assert neon_env_builder.compatibility_pg_distrib_dir is not None, ( + "the environment variable COMPATIBILITY_POSTGRES_DISTRIB_DIR is required" + ) neon_env_builder.neon_binpath = neon_env_builder.compatibility_neon_binpath neon_env_builder.pg_distrib_dir = neon_env_builder.compatibility_pg_distrib_dir @@ -310,7 +318,9 @@ def test_forward_compatibility( else: raise - assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" + assert not breaking_changes_allowed, ( + "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" + ) def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, repo_dir: Path): @@ -592,17 +602,22 @@ def test_historic_storage_formats( @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") -@pytest.mark.parametrize(**fixtures.utils.allpairs_versions()) +@pytest.mark.parametrize( + **fixtures.utils.allpairs_versions(), +) def test_versions_mismatch( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_version: PgVersion, compatibility_snapshot_dir, + compute_reconfigure_listener: ComputeReconfigure, combination, ): """ Checks compatibility of different combinations of versions of the components """ + neon_env_builder.control_plane_hooks_api = compute_reconfigure_listener.control_plane_hooks_api + 
neon_env_builder.num_safekeepers = 3 env = neon_env_builder.from_repo_dir( compatibility_snapshot_dir / "repo", diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index 0d3618d1b8..c1f05830b7 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -103,22 +103,22 @@ def test_compute_catalog(neon_simple_env: NeonEnv): objects = client.dbs_and_roles() # Assert that 'cloud_admin' role exists in the 'roles' list - assert any( - role["name"] == "cloud_admin" for role in objects["roles"] - ), "The 'cloud_admin' role is missing" + assert any(role["name"] == "cloud_admin" for role in objects["roles"]), ( + "The 'cloud_admin' role is missing" + ) # Assert that 'postgres' database exists in the 'databases' list - assert any( - db["name"] == "postgres" for db in objects["databases"] - ), "The 'postgres' database is missing" + assert any(db["name"] == "postgres" for db in objects["databases"]), ( + "The 'postgres' database is missing" + ) # Check other databases for test_db in TEST_DB_NAMES: db = next((db for db in objects["databases"] if db["name"] == test_db["name"]), None) assert db is not None, f"The '{test_db['name']}' database is missing" - assert ( - db["owner"] == test_db["owner"] - ), f"The '{test_db['name']}' database has incorrect owner" + assert db["owner"] == test_db["owner"], ( + f"The '{test_db['name']}' database has incorrect owner" + ) ddl = client.database_schema(database=test_db["name"]) @@ -135,9 +135,9 @@ def test_compute_catalog(neon_simple_env: NeonEnv): client.database_schema(database="nonexistentdb") raise AssertionError("Expected HTTPError was not raised") except requests.exceptions.HTTPError as e: - assert ( - e.response.status_code == 404 - ), f"Expected 404 status code, but got {e.response.status_code}" + assert e.response.status_code == 404, ( + f"Expected 404 status code, but got {e.response.status_code}" + ) def 
test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv): diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py index 85cd065a2f..5e3f8671a2 100644 --- a/test_runner/regress/test_compute_metrics.py +++ b/test_runner/regress/test_compute_metrics.py @@ -13,21 +13,21 @@ import _jsonnet import pytest import requests import yaml -from fixtures.endpoint.http import EndpointHttpClient from fixtures.log_helper import log from fixtures.metrics import parse_metrics from fixtures.paths import BASE_DIR, COMPUTE_CONFIG_DIR from fixtures.utils import wait_until -from prometheus_client.samples import Sample if TYPE_CHECKING: from collections.abc import Callable from types import TracebackType from typing import Self, TypedDict + from fixtures.endpoint.http import EndpointHttpClient from fixtures.neon_fixtures import NeonEnv from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor + from prometheus_client.samples import Sample class Metric(TypedDict): metric_name: str diff --git a/test_runner/regress/test_compute_migrations.py b/test_runner/regress/test_compute_migrations.py index 0dbb187c39..dc555417b4 100644 --- a/test_runner/regress/test_compute_migrations.py +++ b/test_runner/regress/test_compute_migrations.py @@ -1,6 +1,5 @@ from __future__ import annotations -from pathlib import Path from typing import TYPE_CHECKING, cast import pytest @@ -9,6 +8,8 @@ from fixtures.metrics import parse_metrics from fixtures.utils import wait_until if TYPE_CHECKING: + from pathlib import Path + from fixtures.neon_fixtures import NeonEnv diff --git a/test_runner/regress/test_compute_reconfigure.py b/test_runner/regress/test_compute_reconfigure.py index 6619548811..6396ba67a1 100644 --- a/test_runner/regress/test_compute_reconfigure.py +++ b/test_runner/regress/test_compute_reconfigure.py @@ -1,8 +1,14 @@ from __future__ import annotations -from fixtures.neon_fixtures import NeonEnv +import os +from 
typing import TYPE_CHECKING + +from fixtures.metrics import parse_metrics from fixtures.utils import wait_until +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv + def test_compute_reconfigure(neon_simple_env: NeonEnv): """ @@ -60,3 +66,20 @@ def test_compute_reconfigure(neon_simple_env: NeonEnv): row = cursor.fetchone() assert row is not None assert row[0] == TEST_LOG_LINE_PREFIX + + # Check that even after reconfigure and state transitions we still report + # only the current status. + client = endpoint.http_client() + raw_metrics = client.metrics() + metrics = parse_metrics(raw_metrics) + samples = metrics.query_all("compute_ctl_up") + assert len(samples) == 1 + assert samples[0].value == 1 + samples = metrics.query_all("compute_ctl_up", {"status": "running"}) + assert len(samples) == 1 + assert samples[0].value == 1 + # Check that build tag is reported + build_tag = os.environ.get("BUILD_TAG", "latest") + samples = metrics.query_all("compute_ctl_up", {"build_tag": build_tag}) + assert len(samples) == 1 + assert samples[0].value == 1 diff --git a/test_runner/regress/test_config.py b/test_runner/regress/test_config.py index d48fd01fcb..7a0e4cb3d2 100644 --- a/test_runner/regress/test_config.py +++ b/test_runner/regress/test_config.py @@ -2,8 +2,10 @@ from __future__ import annotations import os from contextlib import closing +from typing import TYPE_CHECKING -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder # diff --git a/test_runner/regress/test_createuser.py b/test_runner/regress/test_createuser.py index 236f4eb2fe..f136ac256f 100644 --- a/test_runner/regress/test_createuser.py +++ b/test_runner/regress/test_createuser.py @@ -1,8 +1,12 @@ from __future__ import annotations -from fixtures.neon_fixtures import NeonEnv +from typing import TYPE_CHECKING + from fixtures.utils import query_scalar +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv + # # 
Test CREATE USER to check shared catalog restore diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py index b10e38885e..9c924e9503 100644 --- a/test_runner/regress/test_ddl_forwarding.py +++ b/test_runner/regress/test_ddl_forwarding.py @@ -1,21 +1,21 @@ from __future__ import annotations -from types import TracebackType from typing import TYPE_CHECKING import psycopg2 import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, VanillaPostgres from psycopg2.errors import UndefinedObject -from pytest_httpserver import HTTPServer -from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response if TYPE_CHECKING: + from types import TracebackType from typing import Any, Self from fixtures.httpserver import ListenAddress + from fixtures.neon_fixtures import NeonEnv, VanillaPostgres + from pytest_httpserver import HTTPServer + from werkzeug.wrappers.request import Request def handle_db(dbs, roles, operation): diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 7abcdb3838..b29610e021 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -3,7 +3,6 @@ from __future__ import annotations import enum import time from collections import Counter -from collections.abc import Iterable from dataclasses import dataclass from enum import StrEnum from typing import TYPE_CHECKING @@ -18,14 +17,16 @@ from fixtures.neon_fixtures import ( PgBin, wait_for_last_flush_lsn, ) -from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_upload_queue_empty from fixtures.remote_storage import RemoteStorageKind from fixtures.utils import human_bytes, wait_until if TYPE_CHECKING: + from collections.abc import Iterable from typing import Any + from fixtures.pageserver.http import PageserverHttpClient + 
GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy" @@ -323,9 +324,9 @@ def finish_tenant_creation( layers = pageserver_http.layer_map_info(tenant_id, timeline_id) # log.info(f"{layers}") - assert ( - len(layers.historic_layers) >= min_expected_layers - ), "evictions happen at layer granularity, but we often assert at byte-granularity" + assert len(layers.historic_layers) >= min_expected_layers, ( + "evictions happen at layer granularity, but we often assert at byte-granularity" + ) return pgbench_init_lsn @@ -421,9 +422,9 @@ def test_pageserver_evicts_until_pressure_is_relieved( assert 0 <= actual_change, "nothing can load layers during this test" assert actual_change >= target, "must evict more than half" - assert ( - response["Finished"]["assumed"]["projected_after"]["freed_bytes"] >= actual_change - ), "report accurately evicted bytes" + assert response["Finished"]["assumed"]["projected_after"]["freed_bytes"] >= actual_change, ( + "report accurately evicted bytes" + ) assert response["Finished"]["assumed"]["failed"]["count"] == 0, "zero failures expected" @@ -448,18 +449,18 @@ def test_pageserver_respects_overridden_resident_size( large_tenant = max(du_by_timeline, key=du_by_timeline.__getitem__) small_tenant = min(du_by_timeline, key=du_by_timeline.__getitem__) assert du_by_timeline[large_tenant] > du_by_timeline[small_tenant] - assert ( - du_by_timeline[large_tenant] - du_by_timeline[small_tenant] > 5 * env.layer_size - ), "ensure this test will do more than 1 eviction" + assert du_by_timeline[large_tenant] - du_by_timeline[small_tenant] > 5 * env.layer_size, ( + "ensure this test will do more than 1 eviction" + ) # Give the larger tenant a haircut while preventing the smaller tenant from getting one. # To prevent the smaller from getting a haircut, we set min_resident_size to its current size. # To ensure the larger tenant is getting a haircut, any non-zero `target` will do. 
min_resident_size = du_by_timeline[small_tenant] target = 1 - assert ( - du_by_timeline[large_tenant] > min_resident_size - ), "ensure the larger tenant will get a haircut" + assert du_by_timeline[large_tenant] > min_resident_size, ( + "ensure the larger tenant will get a haircut" + ) env.neon_env.storage_controller.pageserver_api().update_tenant_config( small_tenant[0], {"min_resident_size_override": min_resident_size} ) @@ -490,17 +491,17 @@ def test_pageserver_respects_overridden_resident_size( actual_change = total_on_disk - later_total_on_disk assert 0 <= actual_change, "nothing can load layers during this test" assert actual_change >= target, "eviction must always evict more than target" - assert ( - response["Finished"]["assumed"]["projected_after"]["freed_bytes"] >= actual_change - ), "report accurately evicted bytes" + assert response["Finished"]["assumed"]["projected_after"]["freed_bytes"] >= actual_change, ( + "report accurately evicted bytes" + ) assert response["Finished"]["assumed"]["failed"]["count"] == 0, "zero failures expected" - assert ( - later_du_by_timeline[small_tenant] == du_by_timeline[small_tenant] - ), "small tenant sees no haircut" - assert ( - later_du_by_timeline[large_tenant] < du_by_timeline[large_tenant] - ), "large tenant gets a haircut" + assert later_du_by_timeline[small_tenant] == du_by_timeline[small_tenant], ( + "small tenant sees no haircut" + ) + assert later_du_by_timeline[large_tenant] < du_by_timeline[large_tenant], ( + "large tenant gets a haircut" + ) assert du_by_timeline[large_tenant] - later_du_by_timeline[large_tenant] >= target @@ -579,9 +580,9 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder): later_du_by_timeline = env.du_by_timeline(env.pageserver) for tenant, later_tenant_usage in later_du_by_timeline.items(): - assert ( - later_tenant_usage < du_by_timeline[tenant] - ), "all tenants should have lost some layers" + assert later_tenant_usage < du_by_timeline[tenant], ( + "all 
tenants should have lost some layers" + ) # with relative order what matters is the amount of layers, with a # fudge factor of whether the eviction bothers tenants with highest @@ -665,9 +666,9 @@ def test_fast_growing_tenant(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, or ratio = after / originally ratios.append(ratio) - assert ( - len(ratios) == 4 - ), "rest of the assertions expect 3 + 1 timelines, ratios, scales, all in order" + assert len(ratios) == 4, ( + "rest of the assertions expect 3 + 1 timelines, ratios, scales, all in order" + ) log.info(f"{ratios}") if order == EvictionOrder.RELATIVE_ORDER_EQUAL: @@ -829,9 +830,9 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): def more_than_min_avail_bytes_freed(): post_eviction_total_size, _, _ = env.timelines_du(env.pageserver) - assert ( - total_size - post_eviction_total_size >= min_avail_bytes - ), f"we requested at least {min_avail_bytes} worth of free space" + assert total_size - post_eviction_total_size >= min_avail_bytes, ( + f"we requested at least {min_avail_bytes} worth of free space" + ) wait_until(more_than_min_avail_bytes_freed, timeout=5) @@ -878,6 +879,6 @@ def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv): post_eviction_total_size, _, _ = env.timelines_du(ps_secondary) - assert ( - total_size - post_eviction_total_size >= evict_bytes - ), "we requested at least evict_bytes worth of free space" + assert total_size - post_eviction_total_size >= evict_bytes, ( + "we requested at least evict_bytes worth of free space" + ) diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index 30f8c65cbd..77babe12cd 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -3,17 +3,16 @@ from __future__ import annotations import os import shutil import tarfile -from pathlib import Path from typing import TYPE_CHECKING import pytest import zstandard from 
fixtures.log_helper import log from fixtures.metrics import parse_metrics -from pytest_httpserver import HTTPServer from werkzeug.wrappers.response import Response if TYPE_CHECKING: + from pathlib import Path from typing import Any from fixtures.httpserver import ListenAddress @@ -21,6 +20,7 @@ if TYPE_CHECKING: NeonEnvBuilder, ) from fixtures.pg_version import PgVersion + from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request diff --git a/test_runner/regress/test_endpoint_crash.py b/test_runner/regress/test_endpoint_crash.py index 03bfd1cb8d..feeb4c8f90 100644 --- a/test_runner/regress/test_endpoint_crash.py +++ b/test_runner/regress/test_endpoint_crash.py @@ -1,10 +1,14 @@ from __future__ import annotations +from typing import TYPE_CHECKING + import pytest -from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.pg_version import PgVersion from fixtures.utils import WITH_SANITIZERS, run_only_on_postgres +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder + @pytest.mark.parametrize( "sql_func", diff --git a/test_runner/regress/test_explain_with_lfc_stats.py b/test_runner/regress/test_explain_with_lfc_stats.py index 382556fd7e..a44ffcc4b0 100644 --- a/test_runner/regress/test_explain_with_lfc_stats.py +++ b/test_runner/regress/test_explain_with_lfc_stats.py @@ -1,12 +1,15 @@ from __future__ import annotations from pathlib import Path +from typing import TYPE_CHECKING import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv from fixtures.utils import USE_LFC +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv + @pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") def test_explain_with_lfc_stats(neon_simple_env: NeonEnv): diff --git a/test_runner/regress/test_fsm_truncate.py b/test_runner/regress/test_fsm_truncate.py index 55a010f26a..c6a43bafe1 100644 --- a/test_runner/regress/test_fsm_truncate.py +++ b/test_runner/regress/test_fsm_truncate.py 
@@ -1,6 +1,9 @@ from __future__ import annotations -from fixtures.neon_fixtures import NeonEnvBuilder +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder def test_fsm_truncate(neon_env_builder: NeonEnvBuilder): diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py index 62d59528cf..2625ae212a 100644 --- a/test_runner/regress/test_fullbackup.py +++ b/test_runner/regress/test_fullbackup.py @@ -1,7 +1,7 @@ from __future__ import annotations import os -from pathlib import Path +from typing import TYPE_CHECKING from fixtures.common_types import Lsn from fixtures.log_helper import log @@ -10,9 +10,13 @@ from fixtures.neon_fixtures import ( PgBin, VanillaPostgres, ) -from fixtures.port_distributor import PortDistributor from fixtures.utils import query_scalar, subprocess_capture +if TYPE_CHECKING: + from pathlib import Path + + from fixtures.port_distributor import PortDistributor + num_rows = 1000 diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index 97c38cf658..c83004583a 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -3,8 +3,8 @@ from __future__ import annotations import asyncio import concurrent.futures import random +from typing import TYPE_CHECKING -from fixtures.common_types import TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -14,6 +14,9 @@ from fixtures.neon_fixtures import ( ) from fixtures.remote_storage import RemoteStorageKind +if TYPE_CHECKING: + from fixtures.common_types import TimelineId + # Test configuration # # Create a table with {NUM_ROWS} rows, and perform {UPDATES_TO_PERFORM} random @@ -123,7 +126,7 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder): ps_metrics = env.pageserver.http_client().get_metrics() total = 0.0 for sample in ps_metrics.query_all( - 
name="pageserver_remote_operation_seconds_count", + name="pageserver_remote_timeline_client_seconds_global_count", filter={ "file_kind": str(file_kind), "op_kind": str(op_kind), diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 743fa72aba..55737c35f0 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -6,17 +6,11 @@ import re import shutil import tarfile from contextlib import closing -from pathlib import Path +from typing import TYPE_CHECKING import pytest from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - Endpoint, - NeonEnv, - NeonEnvBuilder, - PgBin, -) from fixtures.pageserver.utils import ( timeline_delete_wait_completed, wait_for_last_record_lsn, @@ -24,6 +18,16 @@ from fixtures.pageserver.utils import ( from fixtures.remote_storage import RemoteStorageKind from fixtures.utils import assert_pageserver_backups_equal, subprocess_capture +if TYPE_CHECKING: + from pathlib import Path + + from fixtures.neon_fixtures import ( + Endpoint, + NeonEnv, + NeonEnvBuilder, + PgBin, + ) + def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_builder): # Put data in vanilla pg @@ -179,7 +183,7 @@ def test_import_from_pageserver_multisegment( logical_size = env.pageserver.http_client().timeline_detail(env.initial_tenant, timeline)[ "current_logical_size" ] - log.info(f"timeline logical size = {logical_size / (1024 ** 2)}MB") + log.info(f"timeline logical size = {logical_size / (1024**2)}MB") assert logical_size > 1024**3 # = 1GB tar_output_file = _import(num_rows, lsn, env, pg_bin, timeline, test_output_dir) diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index 71e0d16edd..6b3b71f29c 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -19,6 +19,7 @@ from fixtures.pageserver.http import ( 
from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import MockS3Server, RemoteStorageKind +from fixtures.utils import shared_buffers_for_max_cu from mypy_boto3_kms import KMSClient from mypy_boto3_kms.type_defs import EncryptResponseTypeDef from mypy_boto3_s3 import S3Client @@ -80,7 +81,8 @@ def test_pgdata_import_smoke( # doesn't allow any prefetching on v17 and above, where the new streaming # read machinery keeps buffers pinned while prefetching them. Use a higher # setting to enable prefetching and speed up the tests - ep_config = ["shared_buffers=64MB"] + # use shared_buffers size like in production for 8 CU compute + ep_config = [f"shared_buffers={shared_buffers_for_max_cu(8.0)}"] # # Put data in vanilla pg @@ -158,6 +160,7 @@ def test_pgdata_import_smoke( statusdir = importbucket / "status" statusdir.mkdir() (statusdir / "pgdata").write_text(json.dumps({"done": True})) + (statusdir / "fast_import").write_text(json.dumps({"command": "pgdata", "done": True})) # # Do the import @@ -286,9 +289,9 @@ def test_pgdata_import_smoke( shard_ps = env.get_pageserver(shard["node_id"]) result = shard_ps.timeline_scan_no_disposable_keys(shard["shard_id"], timeline_id) assert result.tally.disposable_count == 0 - assert ( - result.tally.not_disposable_count > 0 - ), "sanity check, each shard should have some data" + assert result.tally.not_disposable_count > 0, ( + "sanity check, each shard should have some data" + ) # # validate that we can write @@ -439,16 +442,23 @@ def test_fast_import_with_pageserver_ingest( env.neon_cli.mappings_map_branch(import_branch_name, tenant_id, timeline_id) # Run fast_import - if fast_import.extra_env is None: - fast_import.extra_env = {} - fast_import.extra_env["AWS_ACCESS_KEY_ID"] = mock_s3_server.access_key() - fast_import.extra_env["AWS_SECRET_ACCESS_KEY"] = mock_s3_server.secret_key() - fast_import.extra_env["AWS_SESSION_TOKEN"] = mock_s3_server.session_token() - 
fast_import.extra_env["AWS_REGION"] = mock_s3_server.region() - fast_import.extra_env["AWS_ENDPOINT_URL"] = mock_s3_server.endpoint() - fast_import.extra_env["RUST_LOG"] = "aws_config=debug,aws_sdk_kms=debug" + fast_import.set_aws_creds(mock_s3_server, {"RUST_LOG": "aws_config=debug,aws_sdk_kms=debug"}) pg_port = port_distributor.get_port() fast_import.run_pgdata(pg_port=pg_port, s3prefix=f"s3://{bucket}/{key_prefix}") + + pgdata_status_obj = mock_s3_client.get_object(Bucket=bucket, Key=f"{key_prefix}/status/pgdata") + pgdata_status = pgdata_status_obj["Body"].read().decode("utf-8") + assert json.loads(pgdata_status) == {"done": True}, f"got status: {pgdata_status}" + + job_status_obj = mock_s3_client.get_object( + Bucket=bucket, Key=f"{key_prefix}/status/fast_import" + ) + job_status = job_status_obj["Body"].read().decode("utf-8") + assert json.loads(job_status) == { + "command": "pgdata", + "done": True, + }, f"got status: {job_status}" + vanilla_pg.stop() def validate_vanilla_equivalence(ep): @@ -674,21 +684,27 @@ def test_fast_import_restore_to_connstring_from_s3_spec( ).decode("utf-8"), } - mock_s3_client.create_bucket(Bucket="test-bucket") + bucket = "test-bucket" + key_prefix = "test-prefix" + mock_s3_client.create_bucket(Bucket=bucket) mock_s3_client.put_object( - Bucket="test-bucket", Key="test-prefix/spec.json", Body=json.dumps(spec) + Bucket=bucket, Key=f"{key_prefix}/spec.json", Body=json.dumps(spec) ) # Run fast_import - if fast_import.extra_env is None: - fast_import.extra_env = {} - fast_import.extra_env["AWS_ACCESS_KEY_ID"] = mock_s3_server.access_key() - fast_import.extra_env["AWS_SECRET_ACCESS_KEY"] = mock_s3_server.secret_key() - fast_import.extra_env["AWS_SESSION_TOKEN"] = mock_s3_server.session_token() - fast_import.extra_env["AWS_REGION"] = mock_s3_server.region() - fast_import.extra_env["AWS_ENDPOINT_URL"] = mock_s3_server.endpoint() - fast_import.extra_env["RUST_LOG"] = "aws_config=debug,aws_sdk_kms=debug" - 
fast_import.run_dump_restore(s3prefix="s3://test-bucket/test-prefix") + fast_import.set_aws_creds( + mock_s3_server, {"RUST_LOG": "aws_config=debug,aws_sdk_kms=debug"} + ) + fast_import.run_dump_restore(s3prefix=f"s3://{bucket}/{key_prefix}") + + job_status_obj = mock_s3_client.get_object( + Bucket=bucket, Key=f"{key_prefix}/status/fast_import" + ) + job_status = job_status_obj["Body"].read().decode("utf-8") + assert json.loads(job_status) == { + "done": True, + "command": "dump-restore", + }, f"got status: {job_status}" vanilla_pg.stop() res = destination_vanilla_pg.safe_psql("SELECT count(*) FROM foo;") @@ -696,9 +712,110 @@ def test_fast_import_restore_to_connstring_from_s3_spec( assert res[0][0] == 10 -# TODO: Maybe test with pageserver? -# 1. run whole neon env -# 2. create timeline with some s3 path??? -# 3. run fast_import with s3 prefix -# 4. ??? mock http where pageserver will report progress -# 5. run compute on this timeline and check if data is there +def test_fast_import_restore_to_connstring_error_to_s3_bad_destination( + test_output_dir, + vanilla_pg: VanillaPostgres, + port_distributor: PortDistributor, + fast_import: FastImport, + pg_distrib_dir: Path, + pg_version: PgVersion, + mock_s3_server: MockS3Server, + mock_kms: KMSClient, + mock_s3_client: S3Client, +): + # Prepare KMS and S3 + key_response = mock_kms.create_key( + Description="Test key", + KeyUsage="ENCRYPT_DECRYPT", + Origin="AWS_KMS", + ) + key_id = key_response["KeyMetadata"]["KeyId"] + + def encrypt(x: str) -> EncryptResponseTypeDef: + return mock_kms.encrypt(KeyId=key_id, Plaintext=x) + + # Start source postgres and ingest data + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);") + + # Encrypt connstrings and put spec into S3 + source_connstring_encrypted = encrypt(vanilla_pg.connstr()) + destination_connstring_encrypted = encrypt("postgres://random:connection@string:5432/neondb") + spec = { + "encryption_secret": 
{"KMS": {"key_id": key_id}}, + "source_connstring_ciphertext_base64": base64.b64encode( + source_connstring_encrypted["CiphertextBlob"] + ).decode("utf-8"), + "destination_connstring_ciphertext_base64": base64.b64encode( + destination_connstring_encrypted["CiphertextBlob"] + ).decode("utf-8"), + } + + bucket = "test-bucket" + key_prefix = "test-prefix" + mock_s3_client.create_bucket(Bucket=bucket) + mock_s3_client.put_object(Bucket=bucket, Key=f"{key_prefix}/spec.json", Body=json.dumps(spec)) + + # Run fast_import + fast_import.set_aws_creds(mock_s3_server, {"RUST_LOG": "aws_config=debug,aws_sdk_kms=debug"}) + fast_import.run_dump_restore(s3prefix=f"s3://{bucket}/{key_prefix}") + + job_status_obj = mock_s3_client.get_object( + Bucket=bucket, Key=f"{key_prefix}/status/fast_import" + ) + job_status = job_status_obj["Body"].read().decode("utf-8") + assert json.loads(job_status) == { + "command": "dump-restore", + "done": False, + "error": "pg_restore failed", + }, f"got status: {job_status}" + vanilla_pg.stop() + + +def test_fast_import_restore_to_connstring_error_to_s3_kms_error( + test_output_dir, + port_distributor: PortDistributor, + fast_import: FastImport, + pg_distrib_dir: Path, + pg_version: PgVersion, + mock_s3_server: MockS3Server, + mock_kms: KMSClient, + mock_s3_client: S3Client, +): + # Prepare KMS and S3 + key_response = mock_kms.create_key( + Description="Test key", + KeyUsage="ENCRYPT_DECRYPT", + Origin="AWS_KMS", + ) + key_id = key_response["KeyMetadata"]["KeyId"] + + def encrypt(x: str) -> EncryptResponseTypeDef: + return mock_kms.encrypt(KeyId=key_id, Plaintext=x) + + # Encrypt connstrings and put spec into S3 + spec = { + "encryption_secret": {"KMS": {"key_id": key_id}}, + "source_connstring_ciphertext_base64": base64.b64encode(b"invalid encrypted string").decode( + "utf-8" + ), + } + + bucket = "test-bucket" + key_prefix = "test-prefix" + mock_s3_client.create_bucket(Bucket=bucket) + mock_s3_client.put_object(Bucket=bucket, 
Key=f"{key_prefix}/spec.json", Body=json.dumps(spec)) + + # Run fast_import + fast_import.set_aws_creds(mock_s3_server, {"RUST_LOG": "aws_config=debug,aws_sdk_kms=debug"}) + fast_import.run_dump_restore(s3prefix=f"s3://{bucket}/{key_prefix}") + + job_status_obj = mock_s3_client.get_object( + Bucket=bucket, Key=f"{key_prefix}/status/fast_import" + ) + job_status = job_status_obj["Body"].read().decode("utf-8") + assert json.loads(job_status) == { + "command": "dump-restore", + "done": False, + "error": "decrypt source connection string", + }, f"got status: {job_status}" diff --git a/test_runner/regress/test_ingestion_layer_size.py b/test_runner/regress/test_ingestion_layer_size.py index 7e99d4b2f2..7ba044d5f3 100644 --- a/test_runner/regress/test_ingestion_layer_size.py +++ b/test_runner/regress/test_ingestion_layer_size.py @@ -1,13 +1,17 @@ from __future__ import annotations -from collections.abc import Iterable from dataclasses import dataclass +from typing import TYPE_CHECKING from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn -from fixtures.pageserver.http import HistoricLayerInfo, LayerMapInfo from fixtures.utils import human_bytes, skip_in_debug_build +if TYPE_CHECKING: + from collections.abc import Iterable + + from fixtures.pageserver.http import HistoricLayerInfo, LayerMapInfo + @skip_in_debug_build("debug run is unnecessarily slow") def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder): @@ -27,9 +31,9 @@ def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder): # bucket lower limits buckets = [0, minimum_initdb_size, minimum_good_layer_size, minimum_too_large_layer_size] - assert ( - minimum_initdb_size < minimum_good_layer_size - ), "keep checkpoint_distance higher than the initdb size (find it by experimenting)" + assert minimum_initdb_size < minimum_good_layer_size, ( + "keep checkpoint_distance higher than the initdb size (find it by experimenting)" + 
) env = neon_env_builder.init_start( initial_tenant_conf={ @@ -57,9 +61,9 @@ def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder): assert size is not None assert isinstance(size[0], int) log.info(f"gin index size: {human_bytes(size[0])}") - assert ( - size[0] > checkpoint_distance * 3 - ), f"gin index is not large enough: {human_bytes(size[0])}" + assert size[0] > checkpoint_distance * 3, ( + f"gin index is not large enough: {human_bytes(size[0])}" + ) wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) ps_http = env.pageserver.http_client() @@ -91,13 +95,13 @@ def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder): log.info("non-cumulative layer size distribution after compaction:") print_layer_size_histogram(post_compact) - assert ( - post_ingest.counts[3] == 0 - ), f"there should be no layers larger than 2*checkpoint_distance ({human_bytes(2*checkpoint_distance)})" + assert post_ingest.counts[3] == 0, ( + f"there should be no layers larger than 2*checkpoint_distance ({human_bytes(2 * checkpoint_distance)})" + ) assert post_ingest.counts[1] == 1, "expect one smaller layer for initdb" - assert ( - post_ingest.counts[0] <= 1 - ), "expect at most one tiny layer from shutting down the endpoint" + assert post_ingest.counts[0] <= 1, ( + "expect at most one tiny layer from shutting down the endpoint" + ) # just make sure we don't have trouble splitting the layers apart assert post_compact.counts[3] == 0 diff --git a/test_runner/regress/test_large_schema.py b/test_runner/regress/test_large_schema.py index ae5113ed45..7f56ef498b 100644 --- a/test_runner/regress/test_large_schema.py +++ b/test_runner/regress/test_large_schema.py @@ -2,9 +2,12 @@ from __future__ import annotations import os import time +from typing import TYPE_CHECKING from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder # This test 
creates large number of tables which cause large catalog. diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index 2eb38c49b2..2abe03a2e0 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -59,70 +59,72 @@ def test_basic_eviction(neon_env_builder: NeonEnvBuilder): (parse_layer_file_name(path.name), path) for path in env.pageserver.list_layers(tenant_id, timeline_id) ) - assert ( - len(initial_local_layers) > 1 - ), f"Should create multiple layers for timeline, but got {initial_local_layers}" + assert len(initial_local_layers) > 1, ( + f"Should create multiple layers for timeline, but got {initial_local_layers}" + ) # Compare layer map dump with the local layers, ensure everything's present locally and matches initial_layer_map_info = client.layer_map_info(tenant_id=tenant_id, timeline_id=timeline_id) - assert ( - not initial_layer_map_info.in_memory_layers - ), "Should have no in memory layers after flushing" - assert len(initial_local_layers) == len( - initial_layer_map_info.historic_layers - ), "Should have the same layers in memory and on disk" + assert not initial_layer_map_info.in_memory_layers, ( + "Should have no in memory layers after flushing" + ) + assert len(initial_local_layers) == len(initial_layer_map_info.historic_layers), ( + "Should have the same layers in memory and on disk" + ) for returned_layer in initial_layer_map_info.historic_layers: - assert ( - returned_layer.kind == "Delta" - ), f"Did not create and expect image layers, but got {returned_layer}" - assert ( - not returned_layer.remote - ), f"All created layers should be present locally, but got {returned_layer}" + assert returned_layer.kind == "Delta", ( + f"Did not create and expect image layers, but got {returned_layer}" + ) + assert not returned_layer.remote, ( + f"All created layers should be present locally, but got {returned_layer}" + ) returned_layer_name = 
parse_layer_file_name(returned_layer.layer_file_name) - assert ( - returned_layer_name in initial_local_layers - ), f"Did not find returned layer {returned_layer_name} in local layers {list(initial_local_layers.keys())}" + assert returned_layer_name in initial_local_layers, ( + f"Did not find returned layer {returned_layer_name} in local layers {list(initial_local_layers.keys())}" + ) local_layer_path = ( env.pageserver.timeline_dir(tenant_id, timeline_id) / initial_local_layers[returned_layer_name] ) - assert ( - returned_layer.layer_file_size == local_layer_path.stat().st_size - ), f"Returned layer {returned_layer} has a different file size than local layer {local_layer_path}" + assert returned_layer.layer_file_size == local_layer_path.stat().st_size, ( + f"Returned layer {returned_layer} has a different file size than local layer {local_layer_path}" + ) # Detach all layers, ensre they are not in the local FS, but are still dumped as part of the layer map for local_layer_name, local_layer_path in initial_local_layers.items(): client.evict_layer( tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer_path.name ) - assert not env.pageserver.layer_exists( - tenant_id, timeline_id, local_layer_name - ), f"Did not expect to find {local_layer_name} layer after evicting" + assert not env.pageserver.layer_exists(tenant_id, timeline_id, local_layer_name), ( + f"Did not expect to find {local_layer_name} layer after evicting" + ) empty_layers = env.pageserver.list_layers(tenant_id, timeline_id) - assert not empty_layers, f"After evicting all layers, timeline {tenant_id}/{timeline_id} should have no layers locally, but got: {empty_layers}" + assert not empty_layers, ( + f"After evicting all layers, timeline {tenant_id}/{timeline_id} should have no layers locally, but got: {empty_layers}" + ) evicted_layer_map_info = client.layer_map_info(tenant_id=tenant_id, timeline_id=timeline_id) - assert ( - not evicted_layer_map_info.in_memory_layers - ), "Should have no 
in memory layers after flushing and evicting" - assert len(initial_local_layers) == len( - evicted_layer_map_info.historic_layers - ), "Should have the same layers in memory and on disk initially" + assert not evicted_layer_map_info.in_memory_layers, ( + "Should have no in memory layers after flushing and evicting" + ) + assert len(initial_local_layers) == len(evicted_layer_map_info.historic_layers), ( + "Should have the same layers in memory and on disk initially" + ) for returned_layer in evicted_layer_map_info.historic_layers: - assert ( - returned_layer.kind == "Delta" - ), f"Did not create and expect image layers, but got {returned_layer}" - assert ( - returned_layer.remote - ), f"All layers should be evicted and not present locally, but got {returned_layer}" + assert returned_layer.kind == "Delta", ( + f"Did not create and expect image layers, but got {returned_layer}" + ) + assert returned_layer.remote, ( + f"All layers should be evicted and not present locally, but got {returned_layer}" + ) returned_layer_name = parse_layer_file_name(returned_layer.layer_file_name) - assert ( - returned_layer_name in initial_local_layers - ), f"Did not find returned layer {returned_layer} in local layers {initial_local_layers}" + assert returned_layer_name in initial_local_layers, ( + f"Did not find returned layer {returned_layer} in local layers {initial_local_layers}" + ) # redownload all evicted layers and ensure the initial state is restored for local_layer_name, _local_layer_path in initial_local_layers.items(): @@ -142,15 +144,15 @@ def test_basic_eviction(neon_env_builder: NeonEnvBuilder): (parse_layer_file_name(path.name), path) for path in env.pageserver.list_layers(tenant_id, timeline_id) ) - assert ( - redownloaded_layers == initial_local_layers - ), "Should have the same layers locally after redownloading the evicted layers" + assert redownloaded_layers == initial_local_layers, ( + "Should have the same layers locally after redownloading the evicted layers" + ) 
redownloaded_layer_map_info = client.layer_map_info( tenant_id=tenant_id, timeline_id=timeline_id ) - assert ( - redownloaded_layer_map_info == initial_layer_map_info - ), "Should have the same layer map after redownloading the evicted layers" + assert redownloaded_layer_map_info == initial_layer_map_info, ( + "Should have the same layer map after redownloading the evicted layers" + ) def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): @@ -266,9 +268,9 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): resident_physical_size_metric = ps_http.get_timeline_metric( tenant_id, timeline_id, "pageserver_resident_physical_size" ) - assert ( - resident_physical_size_metric == 0 - ), "ensure that resident_physical_size metric is zero" + assert resident_physical_size_metric == 0, ( + "ensure that resident_physical_size metric is zero" + ) assert resident_physical_size_metric == sum( layer.layer_file_size for layer in info.historic_layers if not layer.remote ), "ensure that resident_physical_size metric corresponds to layer map dump" diff --git a/test_runner/regress/test_layer_writers_fail.py b/test_runner/regress/test_layer_writers_fail.py index dd31e2725b..e07321e0ab 100644 --- a/test_runner/regress/test_layer_writers_fail.py +++ b/test_runner/regress/test_layer_writers_fail.py @@ -13,9 +13,9 @@ def test_image_layer_writer_fail_before_finish(neon_simple_env: NeonEnv): tenant_id, timeline_id = env.create_tenant( conf={ # small checkpoint distance to create more delta layer files - "checkpoint_distance": f"{1024 ** 2}", + "checkpoint_distance": f"{1024**2}", # set the target size to be large to allow the image layer to cover the whole key space - "compaction_target_size": f"{1024 ** 3}", + "compaction_target_size": f"{1024**3}", # tweak the default settings to allow quickly create image layers and L1 layers "compaction_period": "1 s", "compaction_threshold": "2", @@ -44,9 +44,9 @@ def test_image_layer_writer_fail_before_finish(neon_simple_env: 
NeonEnv): ) ) - assert ( - len(new_temp_layer_files) == 0 - ), "pageserver should clean its temporary new image layer files on failure" + assert len(new_temp_layer_files) == 0, ( + "pageserver should clean its temporary new image layer files on failure" + ) @pytest.mark.skip("See https://github.com/neondatabase/neon/issues/2703") @@ -57,9 +57,9 @@ def test_delta_layer_writer_fail_before_finish(neon_simple_env: NeonEnv): tenant_id, timeline_id = env.create_tenant( conf={ # small checkpoint distance to create more delta layer files - "checkpoint_distance": f"{1024 ** 2}", + "checkpoint_distance": f"{1024**2}", # set the target size to be large to allow the image layer to cover the whole key space - "compaction_target_size": f"{1024 ** 3}", + "compaction_target_size": f"{1024**3}", # tweak the default settings to allow quickly create image layers and L1 layers "compaction_period": "1 s", "compaction_threshold": "2", @@ -90,6 +90,6 @@ def test_delta_layer_writer_fail_before_finish(neon_simple_env: NeonEnv): ) ) - assert ( - len(new_temp_layer_files) == 0 - ), "pageserver should clean its temporary new delta layer files on failure" + assert len(new_temp_layer_files) == 0, ( + "pageserver should clean its temporary new delta layer files on failure" + ) diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 872d3dc4cf..b4eba2779d 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -127,9 +127,9 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder, attach_mode: str): ip = get_index_part() assert len(ip.layer_metadata.keys()) - assert ( - ip.disk_consistent_lsn < last_record_lsn - ), "sanity check for what above loop is supposed to do" + assert ip.disk_consistent_lsn < last_record_lsn, ( + "sanity check for what above loop is supposed to do" + ) # create the image layer from the future env.storage_controller.pageserver_api().update_tenant_config( @@ 
-233,9 +233,9 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder, attach_mode: str): start = time.monotonic() while True: post_stat = future_layer_path.stat() - assert ( - pre_stat.st_mtime == post_stat.st_mtime - ), "observed PUT overtake the stucked DELETE => bug isn't fixed yet" + assert pre_stat.st_mtime == post_stat.st_mtime, ( + "observed PUT overtake the stucked DELETE => bug isn't fixed yet" + ) if time.monotonic() - start > max_race_opportunity_window: log.info( "a correct implementation would never let the later PUT overtake the earlier DELETE" diff --git a/test_runner/regress/test_lfc_prefetch.py b/test_runner/regress/test_lfc_prefetch.py index dd422d996e..2885c0e17b 100644 --- a/test_runner/regress/test_lfc_prefetch.py +++ b/test_runner/regress/test_lfc_prefetch.py @@ -1,12 +1,15 @@ from __future__ import annotations import time +from typing import TYPE_CHECKING import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv from fixtures.utils import USE_LFC +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv + @pytest.mark.timeout(600) @pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") @@ -97,5 +100,5 @@ def test_lfc_prefetch(neon_simple_env: NeonEnv): prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"] log.info(f"Unused prefetches: {prefetch_expired}") - # No redundant prefethc requrests if prefetch results are stored in LFC + # No redundant prefetch requests if prefetch results are stored in LFC assert prefetch_expired == 0 diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py index ea7d38a3d9..51074751e0 100644 --- a/test_runner/regress/test_lfc_resize.py +++ b/test_runner/regress/test_lfc_resize.py @@ -5,12 +5,15 @@ import re import subprocess import threading import time +from typing import TYPE_CHECKING import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, PgBin from 
fixtures.utils import USE_LFC +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv, PgBin + @pytest.mark.timeout(600) @pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py index 17068849d4..e422622167 100644 --- a/test_runner/regress/test_lfc_working_set_approximation.py +++ b/test_runner/regress/test_lfc_working_set_approximation.py @@ -2,12 +2,15 @@ from __future__ import annotations import time from pathlib import Path +from typing import TYPE_CHECKING import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv from fixtures.utils import USE_LFC, query_scalar +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv + @pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") def test_lfc_working_set_approximation(neon_simple_env: NeonEnv): @@ -19,10 +22,7 @@ def test_lfc_working_set_approximation(neon_simple_env: NeonEnv): log.info("Creating endpoint with 1MB shared_buffers and 64 MB LFC") endpoint = env.endpoints.create_start( "main", - config_lines=[ - "neon.max_file_cache_size='128MB'", - "neon.file_cache_size_limit='64MB'", - ], + config_lines=["neon.max_file_cache_size='128MB'", "neon.file_cache_size_limit='64MB'"], ) cur = endpoint.connect().cursor() @@ -116,4 +116,4 @@ def test_sliding_working_set_approximation(neon_simple_env: NeonEnv): log.info(f"Table size {size} blocks") assert estimation_1k >= 20 and estimation_1k <= 40 - assert estimation_10k >= 200 and estimation_10k <= 400 + assert estimation_10k >= 200 and estimation_10k <= 440 diff --git a/test_runner/regress/test_local_file_cache.py b/test_runner/regress/test_local_file_cache.py index 52ee2f32a2..49d5c1916c 100644 --- a/test_runner/regress/test_local_file_cache.py +++ b/test_runner/regress/test_local_file_cache.py @@ -5,11 +5,14 @@ import queue import random import threading import time 
+from typing import TYPE_CHECKING import pytest -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder from fixtures.utils import USE_LFC, query_scalar +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder + """ Test whether LFC doesn't error out when the LRU is empty, but the LFC is already at its maximum size. diff --git a/test_runner/regress/test_logging.py b/test_runner/regress/test_logging.py index d94c786f49..9eafcf24bc 100644 --- a/test_runner/regress/test_logging.py +++ b/test_runner/regress/test_logging.py @@ -1,12 +1,15 @@ from __future__ import annotations import uuid +from typing import TYPE_CHECKING import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.utils import run_only_on_default_postgres, wait_until +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder + @pytest.mark.parametrize("level", ["trace", "debug", "info", "warn", "error"]) @run_only_on_default_postgres("it does not use any postgres functionality") diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index e42e71646d..7280a91a12 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -39,9 +39,9 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder, with_lease: bool): # disable default GC and compaction "gc_period": "1000 m", "compaction_period": "0 s", - "gc_horizon": f"{1024 ** 2}", - "checkpoint_distance": f"{1024 ** 2}", - "compaction_target_size": f"{1024 ** 2}", + "gc_horizon": f"{1024**2}", + "checkpoint_distance": f"{1024**2}", + "compaction_target_size": f"{1024**2}", } ) diff --git a/test_runner/regress/test_nbtree_pagesplit_cycleid.py b/test_runner/regress/test_nbtree_pagesplit_cycleid.py index 32ec6fcb92..f8e9a953bd 100644 --- a/test_runner/regress/test_nbtree_pagesplit_cycleid.py +++ b/test_runner/regress/test_nbtree_pagesplit_cycleid.py @@ -48,9 +48,9 @@ def 
test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv): ses1.execute(BTREE_NUM_CYCLEID_PAGES) pages = ses1.fetchall() - assert ( - len(pages) == 0 - ), f"0 back splits with cycle ID expected, real {len(pages)} first {pages[0]}" + assert len(pages) == 0, ( + f"0 back splits with cycle ID expected, real {len(pages)} first {pages[0]}" + ) # Delete enough tuples to clear the first index page. # (there are up to 407 rows per 8KiB page; 406 for non-rightmost leafs. ses1.execute("DELETE FROM t WHERE id <= 406;") @@ -119,9 +119,9 @@ def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv): # check that our expectations are correct ses1.execute(BTREE_NUM_CYCLEID_PAGES) pages = ses1.fetchall() - assert ( - len(pages) == 1 and pages[0][0] == 3 - ), f"3 page splits with cycle ID expected; actual {pages}" + assert len(pages) == 1 and pages[0][0] == 3, ( + f"3 page splits with cycle ID expected; actual {pages}" + ) # final cleanup ses3t.join() diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index 72db72f2b9..8bd0662ef8 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -1,8 +1,7 @@ from __future__ import annotations import subprocess -from pathlib import Path -from typing import cast +from typing import TYPE_CHECKING, cast import pytest import requests @@ -13,9 +12,13 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, parse_project_git_version_output, ) -from fixtures.pageserver.http import PageserverHttpClient from fixtures.utils import run_only_on_default_postgres, skip_in_debug_build +if TYPE_CHECKING: + from pathlib import Path + + from fixtures.pageserver.http import PageserverHttpClient + def helper_compare_timeline_list( pageserver_http_client: PageserverHttpClient, env: NeonEnv, initial_tenant: TenantId @@ -65,7 +68,7 @@ def test_cli_timeline_list(neon_simple_env: NeonEnv): def helper_compare_tenant_list(pageserver_http_client: PageserverHttpClient, env: NeonEnv): tenants = 
pageserver_http_client.tenant_list() - tenants_api = sorted(map(lambda t: cast(str, t["id"]), tenants)) + tenants_api = sorted(map(lambda t: cast("str", t["id"]), tenants)) res = env.neon_cli.tenant_list() tenants_cli = sorted(map(lambda t: t.split()[0], res.stdout.splitlines())) diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index 4035398a5f..e79ab458ca 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -2,9 +2,12 @@ from __future__ import annotations import time from contextlib import closing +from typing import TYPE_CHECKING from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder # Verify that the neon extension is installed and has the correct version. diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py index 8d9aab6848..00aeb6c4fe 100644 --- a/test_runner/regress/test_neon_local_cli.py +++ b/test_runner/regress/test_neon_local_cli.py @@ -1,9 +1,13 @@ from __future__ import annotations +from typing import TYPE_CHECKING + import pytest from fixtures.common_types import TimelineId -from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.port_distributor import PortDistributor + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder + from fixtures.port_distributor import PortDistributor # Test that neon cli is able to start and stop all processes with the user defaults. 
diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py index 49cd91906f..f99d79e138 100644 --- a/test_runner/regress/test_neon_superuser.py +++ b/test_runner/regress/test_neon_superuser.py @@ -1,10 +1,14 @@ from __future__ import annotations +from typing import TYPE_CHECKING + from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv from fixtures.pg_version import PgVersion from fixtures.utils import wait_until +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv + def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): env = neon_simple_env diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py index db8da51125..693dd628d7 100644 --- a/test_runner/regress/test_next_xid.py +++ b/test_runner/regress/test_next_xid.py @@ -2,7 +2,7 @@ from __future__ import annotations import os import time -from pathlib import Path +from typing import TYPE_CHECKING from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log @@ -16,6 +16,9 @@ from fixtures.neon_fixtures import ( from fixtures.remote_storage import RemoteStorageKind from fixtures.utils import query_scalar +if TYPE_CHECKING: + from pathlib import Path + def test_next_xid(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_normal_work.py b/test_runner/regress/test_normal_work.py index c8458b963e..44590ea4b9 100644 --- a/test_runner/regress/test_normal_work.py +++ b/test_runner/regress/test_normal_work.py @@ -1,9 +1,13 @@ from __future__ import annotations +from typing import TYPE_CHECKING + import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder -from fixtures.pageserver.http import PageserverHttpClient + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder + from fixtures.pageserver.http import PageserverHttpClient def 
check_tenant( diff --git a/test_runner/regress/test_oid_overflow.py b/test_runner/regress/test_oid_overflow.py index e2bde8be6f..76766a0754 100644 --- a/test_runner/regress/test_oid_overflow.py +++ b/test_runner/regress/test_oid_overflow.py @@ -1,7 +1,11 @@ from __future__ import annotations +from typing import TYPE_CHECKING + from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder def test_oid_overflow(neon_env_builder: NeonEnvBuilder): @@ -32,7 +36,7 @@ def test_oid_overflow(neon_env_builder: NeonEnvBuilder): log.info(f"t1.relfilenode={oid}") cur.execute("set statement_timeout=0") - cur.execute(f"select test_consume_oids({oid-1})") + cur.execute(f"select test_consume_oids({oid - 1})") cur.execute("VACUUM FULL t2") cur.execute("SELECT relfilenode FROM pg_class where relname='t2'") diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py index a615464582..8090077729 100644 --- a/test_runner/regress/test_old_request_lsn.py +++ b/test_runner/regress/test_old_request_lsn.py @@ -1,10 +1,14 @@ from __future__ import annotations +from typing import TYPE_CHECKING + from fixtures.common_types import TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.utils import print_gc_result, query_scalar +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder + # # Test where Postgres generates a lot of WAL, and it's garbage collected away, but diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index c344f30f4d..2590a3fe9d 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -38,12 +38,13 @@ def get_num_downloaded_layers(client: PageserverHttpClient): This assumes that the pageserver only has a single tenant. 
""" value = client.get_metric_value( - "pageserver_remote_operation_seconds_count", + "pageserver_remote_timeline_client_seconds_global_count", { "file_kind": "layer", "op_kind": "download", "status": "success", }, + "sum", ) if value is None: return 0 @@ -61,12 +62,12 @@ def test_ondemand_download_large_rel(neon_env_builder: NeonEnvBuilder): initial_tenant_conf={ # disable background GC "gc_period": "0s", - "gc_horizon": f"{10 * 1024 ** 3}", # 10 GB + "gc_horizon": f"{10 * 1024**3}", # 10 GB # small checkpoint distance to create more delta layer files - "checkpoint_distance": f"{10 * 1024 ** 2}", # 10 MB + "checkpoint_distance": f"{10 * 1024**2}", # 10 MB # allow compaction with the checkpoint "compaction_threshold": "3", - "compaction_target_size": f"{10 * 1024 ** 2}", # 10 MB + "compaction_target_size": f"{10 * 1024**2}", # 10 MB # but don't run compaction in background or on restart "compaction_period": "0s", } @@ -160,10 +161,10 @@ def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder): "gc_period": "0s", "compaction_period": "0s", # small checkpoint distance to create more delta layer files - "checkpoint_distance": f"{1 * 1024 ** 2}", # 1 MB + "checkpoint_distance": f"{1 * 1024**2}", # 1 MB "compaction_threshold": "1", "image_creation_threshold": "1", - "compaction_target_size": f"{1 * 1024 ** 2}", # 1 MB + "compaction_target_size": f"{1 * 1024**2}", # 1 MB } ) pageserver_http = env.pageserver.http_client() @@ -334,10 +335,10 @@ def test_download_remote_layers_api( "gc_period": "0s", "compaction_period": "0s", # small checkpoint distance to create more delta layer files - "checkpoint_distance": f"{1 * 1024 ** 2}", # 1 MB + "checkpoint_distance": f"{1 * 1024**2}", # 1 MB "compaction_threshold": "999999", "image_creation_threshold": "999999", - "compaction_target_size": f"{1 * 1024 ** 2}", # 1 MB + "compaction_target_size": f"{1 * 1024**2}", # 1 MB } ) @@ -419,15 +420,15 @@ def test_download_remote_layers_api( ###### Phase 1: exercise download 
error code path this_time = get_api_current_physical_size() - assert ( - filled_current_physical == this_time - ), "current_physical_size is sum of loaded layer sizes, independent of whether local or remote" + assert filled_current_physical == this_time, ( + "current_physical_size is sum of loaded layer sizes, independent of whether local or remote" + ) post_unlink_size = get_resident_physical_size() log.info(f"post_unlink_size: {post_unlink_size}") - assert ( - post_unlink_size < filled_size - ), "we just deleted layers and didn't cause anything to re-download them yet" + assert post_unlink_size < filled_size, ( + "we just deleted layers and didn't cause anything to re-download them yet" + ) # issue downloads that we know will fail info = client.timeline_download_remote_layers( @@ -449,9 +450,9 @@ def test_download_remote_layers_api( == info["successful_download_count"] + info["failed_download_count"] ) assert get_api_current_physical_size() == filled_current_physical - assert ( - get_resident_physical_size() == post_unlink_size - ), "didn't download anything new due to failpoint" + assert get_resident_physical_size() == post_unlink_size, ( + "didn't download anything new due to failpoint" + ) ##### Retry, this time without failpoints client.configure_failpoints(("remote-storage-download-pre-rename", "off")) @@ -515,9 +516,9 @@ def test_compaction_downloads_on_demand_without_image_creation(neon_env_builder: m = pageserver_http.get_metrics() # these are global counters total_bytes = m.query_one("pageserver_remote_ondemand_downloaded_bytes_total").value - assert ( - total_bytes < 2**53 and total_bytes.is_integer() - ), "bytes should still be safe integer-in-f64" + assert total_bytes < 2**53 and total_bytes.is_integer(), ( + "bytes should still be safe integer-in-f64" + ) count = m.query_one("pageserver_remote_ondemand_downloaded_layers_total").value assert count < 2**53 and count.is_integer(), "count should still be safe integer-in-f64" return (int(total_bytes), 
int(count)) diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 17ffeca23b..7f9207047e 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -1,14 +1,18 @@ from __future__ import annotations +from typing import TYPE_CHECKING + from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, NeonEnvBuilder, ) -from fixtures.pageserver.http import PageserverHttpClient from fixtures.utils import run_only_on_default_postgres, wait_until +if TYPE_CHECKING: + from fixtures.pageserver.http import PageserverHttpClient + def check_client(env: NeonEnv, client: PageserverHttpClient): pg_version = env.pg_version @@ -65,15 +69,15 @@ def test_pageserver_http_get_wal_receiver_not_found(neon_simple_env: NeonEnv): tenant_id=tenant_id, timeline_id=timeline_id, include_non_incremental_logical_size=True ) - assert ( - timeline_details.get("wal_source_connstr") is None - ), "Should not be able to connect to WAL streaming without PG compute node running" - assert ( - timeline_details.get("last_received_msg_lsn") is None - ), "Should not be able to connect to WAL streaming without PG compute node running" - assert ( - timeline_details.get("last_received_msg_ts") is None - ), "Should not be able to connect to WAL streaming without PG compute node running" + assert timeline_details.get("wal_source_connstr") is None, ( + "Should not be able to connect to WAL streaming without PG compute node running" + ) + assert timeline_details.get("last_received_msg_lsn") is None, ( + "Should not be able to connect to WAL streaming without PG compute node running" + ) + assert timeline_details.get("last_received_msg_ts") is None, ( + "Should not be able to connect to WAL streaming without PG compute node running" + ) def expect_updated_msg_lsn( @@ -89,14 +93,14 @@ def expect_updated_msg_lsn( assert "last_received_msg_lsn" in 
timeline_details.keys() assert "last_received_msg_ts" in timeline_details.keys() - assert ( - timeline_details["last_received_msg_lsn"] is not None - ), "the last received message's LSN is empty" + assert timeline_details["last_received_msg_lsn"] is not None, ( + "the last received message's LSN is empty" + ) last_msg_lsn = Lsn(timeline_details["last_received_msg_lsn"]) - assert ( - prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn - ), f"the last received message's LSN {last_msg_lsn} hasn't been updated compared to the previous message's LSN {prev_msg_lsn}" + assert prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn, ( + f"the last received message's LSN {last_msg_lsn} hasn't been updated compared to the previous message's LSN {prev_msg_lsn}" + ) return last_msg_lsn diff --git a/test_runner/regress/test_pageserver_catchup.py b/test_runner/regress/test_pageserver_catchup.py index 3567e05f81..f3d7cd5bdb 100644 --- a/test_runner/regress/test_pageserver_catchup.py +++ b/test_runner/regress/test_pageserver_catchup.py @@ -1,6 +1,9 @@ from __future__ import annotations -from fixtures.neon_fixtures import NeonEnvBuilder +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder # Test safekeeper sync and pageserver catch up diff --git a/test_runner/regress/test_pageserver_config.py b/test_runner/regress/test_pageserver_config.py new file mode 100644 index 0000000000..4035afd9aa --- /dev/null +++ b/test_runner/regress/test_pageserver_config.py @@ -0,0 +1,56 @@ +import re + +import pytest +from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import run_only_on_default_postgres + + +@pytest.mark.parametrize("what", ["default", "top_level", "nested"]) +@run_only_on_default_postgres(reason="does not use postgres") +def test_unknown_config_items_handling(neon_simple_env: NeonEnv, what: str): + """ + Ensure we log unknown config fields and expose a metric for alerting. 
+ There are more unit tests in the Rust code for other TOML items. + """ + env = neon_simple_env + + def edit_fn(config) -> str | None: + if what == "default": + return None + elif what == "top_level": + config["unknown_top_level_config_item"] = 23 + return r"unknown_top_level_config_item" + elif what == "nested": + config["remote_storage"]["unknown_config_item"] = 23 + return r"remote_storage.unknown_config_item" + else: + raise ValueError(f"Unknown what: {what}") + + def get_metric(): + metrics = env.pageserver.http_client().get_metrics() + samples = metrics.query_all("pageserver_config_ignored_items") + by_item = {sample.labels["item"]: sample.value for sample in samples} + assert by_item[""] == 0, "must always contain the empty item with value 0" + del by_item[""] + return by_item + + expected_ignored_item = env.pageserver.edit_config_toml(edit_fn) + + if expected_ignored_item is not None: + expected_ignored_item_log_line_re = r".*ignoring unknown configuration item.*" + re.escape( + expected_ignored_item + ) + env.pageserver.allowed_errors.append(expected_ignored_item_log_line_re) + + if expected_ignored_item is not None: + assert not env.pageserver.log_contains(expected_ignored_item_log_line_re) + assert get_metric() == {} + + # in any way, unknown config items should not fail pageserver to start + # TODO: extend this test with the config validator mode once we introduce it + # https://github.com/neondatabase/cloud/issues/24349 + env.pageserver.restart() + + if expected_ignored_item is not None: + assert env.pageserver.log_contains(expected_ignored_item_log_line_re) + assert get_metric() == {expected_ignored_item: 1} diff --git a/test_runner/regress/test_pageserver_crash_consistency.py b/test_runner/regress/test_pageserver_crash_consistency.py index e9eee2760e..2e943e5bd8 100644 --- a/test_runner/regress/test_pageserver_crash_consistency.py +++ b/test_runner/regress/test_pageserver_crash_consistency.py @@ -92,9 +92,9 @@ def 
test_local_only_layers_after_crash(neon_env_builder: NeonEnvBuilder, pg_bin: env.pageserver.start() wait_until_tenant_active(pageserver_http, tenant_id) - assert not env.pageserver.layer_exists( - tenant_id, timeline_id, l1_found - ), "partial compaction result should had been removed during startup" + assert not env.pageserver.layer_exists(tenant_id, timeline_id, l1_found), ( + "partial compaction result should had been removed during startup" + ) # wait for us to catch up again wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn) diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py index 9644ebe3e2..3d7204d883 100644 --- a/test_runner/regress/test_pageserver_getpage_throttle.py +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -3,16 +3,20 @@ from __future__ import annotations import copy import json import uuid +from typing import TYPE_CHECKING import pytest from anyio import Path from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, PgBin from fixtures.pg_version import PgVersion from fixtures.utils import wait_until +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder, PgBin + +@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/11395") def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): env = neon_env_builder.init_start() @@ -101,12 +105,12 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P expect_ncompleted = duration_secs * rate_limit_rps delta_abs = abs(expect_ncompleted - actual_ncompleted) threshold = 0.05 * expect_ncompleted - assert ( - threshold / rate_limit_rps < 0.1 * duration_secs - ), "test self-test: unrealistic expecations regarding precision in this test" - assert ( - delta_abs < 0.05 * expect_ncompleted - ), "the throttling deviates more than 5percent 
from the expectation" + assert threshold / rate_limit_rps < 0.1 * duration_secs, ( + "test self-test: unrealistic expecations regarding precision in this test" + ) + assert delta_abs < 0.05 * expect_ncompleted, ( + "the throttling deviates more than 5percent from the expectation" + ) log.info("validate that we logged the throttling") @@ -127,14 +131,14 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P actual_throttled_secs = actual_throttled_usecs / 1_000_000 log.info("validate that the metric doesn't include throttle wait time") - assert ( - duration_secs >= 10 * actual_smgr_query_seconds - ), "smgr metrics should not include throttle wait time" + assert duration_secs >= 10 * actual_smgr_query_seconds, ( + "smgr metrics should not include throttle wait time" + ) log.info("validate that the throttling wait time metrics is correct") - assert ( - pytest.approx(actual_throttled_secs + actual_smgr_query_seconds, 0.1) == duration_secs - ), "most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates" + assert pytest.approx(actual_throttled_secs + actual_smgr_query_seconds, 0.1) == duration_secs, ( + "most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates" + ) throttle_config_with_field_fair_set = { @@ -191,3 +195,7 @@ def test_throttle_fair_config_is_settable_but_ignored_in_config_toml( ps_http = env.pageserver.http_client() conf = ps_http.tenant_config(env.initial_tenant) assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"]) + + env.pageserver.allowed_errors.append( + r'.*ignoring unknown configuration item path="tenant_config\.timeline_get_throttle\.fair"*' + ) diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index fcc465f90a..91c4ef521c 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ 
b/test_runner/regress/test_pageserver_layer_rolling.py @@ -2,6 +2,7 @@ from __future__ import annotations import asyncio import time +from typing import TYPE_CHECKING import psutil import pytest @@ -12,10 +13,12 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, tenant_get_shards, ) -from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload from fixtures.utils import skip_in_debug_build, wait_until +if TYPE_CHECKING: + from fixtures.pageserver.http import PageserverHttpClient + TIMELINE_COUNT = 10 ENTRIES_PER_TIMELINE = 10_000 CHECKPOINT_TIMEOUT_SECONDS = 60 @@ -130,7 +133,7 @@ def test_pageserver_small_inmemory_layers( tenant_conf = { # Large `checkpoint_distance` effectively disables size # based checkpointing. - "checkpoint_distance": f"{2 * 1024 ** 3}", + "checkpoint_distance": f"{2 * 1024**3}", "checkpoint_timeout": f"{CHECKPOINT_TIMEOUT_SECONDS}s", "compaction_period": "1s", } @@ -179,7 +182,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): tenant_conf = { # Large `checkpoint_distance` effectively disables size # based checkpointing. 
- "checkpoint_distance": f"{2 * 1024 ** 3}", + "checkpoint_distance": f"{2 * 1024**3}", "checkpoint_timeout": f"{CHECKPOINT_TIMEOUT_SECONDS}s", "compaction_period": "1s", } @@ -279,9 +282,9 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder): http_client = env.pageserver.http_client() initdb_lsn = Lsn(http_client.timeline_detail(tenant, timeline)["initdb_lsn"]) this_timeline_ingested = last_flush_lsn - initdb_lsn - assert ( - this_timeline_ingested < checkpoint_distance * 0.8 - ), "this test is supposed to fill InMemoryLayer" + assert this_timeline_ingested < checkpoint_distance * 0.8, ( + "this test is supposed to fill InMemoryLayer" + ) total_bytes_ingested += this_timeline_ingested log.info(f"Ingested {total_bytes_ingested} bytes since initdb (vs max dirty {max_dirty_data})") diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index aedfdbd210..acec0ba44a 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -5,7 +5,6 @@ import json import os import time from dataclasses import dataclass -from pathlib import Path from queue import SimpleQueue from typing import TYPE_CHECKING @@ -20,14 +19,15 @@ from fixtures.remote_storage import ( RemoteStorageKind, remote_storage_to_toml_inline_table, ) -from pytest_httpserver import HTTPServer -from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response if TYPE_CHECKING: + from pathlib import Path from typing import Any from fixtures.httpserver import ListenAddress + from pytest_httpserver import HTTPServer + from werkzeug.wrappers.request import Request # TODO: collect all of the env setup *AFTER* removal of RemoteStorageKind.NOOP @@ -107,7 +107,7 @@ def test_metric_collection( ps_metrics = env.pageserver.http_client().get_metrics() total = 0.0 for sample in ps_metrics.query_all( - 
name="pageserver_remote_operation_seconds_count", + name="pageserver_remote_timeline_client_seconds_global_count", filter={ "file_kind": str(file_kind), "op_kind": str(op_kind), @@ -287,9 +287,9 @@ def test_metric_collection_cleans_up_tempfile( initially = iterate_pageserver_workdir(env.pageserver.workdir, "last_consumption_metrics.json") - assert ( - len(initially.matching) == 2 - ), f"expecting actual file and tempfile, but not found: {initially.matching}" + assert len(initially.matching) == 2, ( + f"expecting actual file and tempfile, but not found: {initially.matching}" + ) uploads.put("ready") env.pageserver.start() @@ -308,9 +308,9 @@ def test_metric_collection_cleans_up_tempfile( # it is possible we shutdown the pageserver right at the correct time, so the old tempfile # is gone, but we also have a new one. only = set(["last_consumption_metrics.json"]) - assert ( - initially.matching.intersection(later.matching) == only - ), "only initial tempfile should had been removed" + assert initially.matching.intersection(later.matching) == only, ( + "only initial tempfile should had been removed" + ) assert initially.other.issuperset(later.other), "no other files should had been removed" @@ -497,9 +497,9 @@ class SyntheticSizeVerifier: def post_batch(self, parent): if self.prev is not None: # this is assuming no one goes and deletes the cache file - assert ( - self.value is not None - ), "after calculating first synthetic size, cached or more recent should be sent" + assert self.value is not None, ( + "after calculating first synthetic size, cached or more recent should be sent" + ) self.prev = self.value self.value = None diff --git a/test_runner/regress/test_pageserver_reconnect.py b/test_runner/regress/test_pageserver_reconnect.py index be63208428..775ab37021 100644 --- a/test_runner/regress/test_pageserver_reconnect.py +++ b/test_runner/regress/test_pageserver_reconnect.py @@ -3,10 +3,13 @@ from __future__ import annotations import threading import time from 
contextlib import closing +from typing import TYPE_CHECKING import psycopg2.errors from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, PgBin + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv, PgBin # Test updating neon.pageserver_connstring setting on the fly. @@ -25,7 +28,7 @@ def test_pageserver_reconnect(neon_simple_env: NeonEnv, pg_bin: PgBin): def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", connstr]) - pg_bin.run_capture(["pgbench", f"-T{int(n_reconnects*timeout)}", connstr]) + pg_bin.run_capture(["pgbench", f"-T{int(n_reconnects * timeout)}", connstr]) thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True) thread.start() diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 21cb780c06..ee5efd9398 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -2,14 +2,17 @@ from __future__ import annotations import random from contextlib import closing +from typing import TYPE_CHECKING import psycopg2.errors as pgerr import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.remote_storage import s3_storage from fixtures.utils import skip_in_debug_build, wait_until +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder + # Test restarting page server, while safekeeper and compute node keep # running. 
@@ -135,9 +138,9 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): for phase, expectation in expectations: assert phase in values, f"No data for phase {phase}" sample = values[phase] - assert expectation( - sample.value, prev_value - ), f"Unexpected value for {phase}: {sample.value}" + assert expectation(sample.value, prev_value), ( + f"Unexpected value for {phase}: {sample.value}" + ) prev_value = sample.value # Startup is complete, this metric should exist but be zero diff --git a/test_runner/regress/test_pageserver_restarts_under_workload.py b/test_runner/regress/test_pageserver_restarts_under_workload.py index ec74e03f89..9f19c887a4 100644 --- a/test_runner/regress/test_pageserver_restarts_under_workload.py +++ b/test_runner/regress/test_pageserver_restarts_under_workload.py @@ -6,9 +6,12 @@ from __future__ import annotations import threading import time +from typing import TYPE_CHECKING from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, PgBin + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv, PgBin # Test restarting page server, while safekeeper and compute node keep diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 9f2aa5df8c..c73a592d98 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -4,7 +4,6 @@ import json import os import random import time -from pathlib import Path from typing import TYPE_CHECKING import pytest @@ -24,12 +23,14 @@ from fixtures.pageserver.utils import ( from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage, s3_storage from fixtures.utils import run_only_on_default_postgres, skip_in_debug_build, wait_until from fixtures.workload import Workload -from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response if TYPE_CHECKING: + from pathlib import Path from typing import Any + from 
werkzeug.wrappers.request import Request + # A tenant configuration that is convenient for generating uploads and deletions # without a large amount of postgres traffic. @@ -91,6 +92,8 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver, f"http://{make_httpserver.host}:{make_httpserver.port}/" ) + neon_env_builder.storage_controller_config = {"use_local_compute_notifications": False} + def ignore_notify(request: Request): # This test does all its own compute configuration (by passing explicit pageserver ID to Workload functions), # so we send controller notifications to /dev/null to prevent it fighting the test for control of the compute. @@ -626,7 +629,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): except: # On assertion failures, log some details to help with debugging heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id) - log.warn(f"heatmap contents: {json.dumps(heatmap,indent=2)}") + log.warn(f"heatmap contents: {json.dumps(heatmap, indent=2)}") raise # Scrub the remote storage @@ -1096,3 +1099,70 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): # Warm up the current secondary. 
ps_attached.http_client().tenant_secondary_download(tenant_id, wait_ms=100) wait_until(lambda: all_layers_downloaded(ps_secondary, expected_locally)) + + +@run_only_on_default_postgres("PG version is not interesting here") +@pytest.mark.parametrize("action", ["delete_timeline", "detach"]) +def test_io_metrics_match_secondary_timeline_lifecycle( + neon_env_builder: NeonEnvBuilder, action: str +): + """ + Check that IO metrics for secondary timelines are de-registered when the timeline + is removed + """ + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + parent_timeline_id = TimelineId.generate() + + # We do heatmap uploads and pulls manually + tenant_conf = {"heatmap_period": "0s"} + env.create_tenant( + tenant_id, parent_timeline_id, conf=tenant_conf, placement_policy='{"Attached":1}' + ) + + child_timeline_id = env.create_branch("foo", tenant_id) + + attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] + ps_attached = env.get_pageserver(attached_to_id) + ps_secondary = next(p for p in env.pageservers if p != ps_attached) + + ps_attached.http_client().tenant_heatmap_upload(tenant_id) + status, _ = ps_secondary.http_client().tenant_secondary_download(tenant_id, wait_ms=5000) + assert status == 200 + + labels = { + "operation": "write", + "tenant_id": str(tenant_id), + "timeline_id": str(child_timeline_id), + } + bytes_written = ( + ps_secondary.http_client() + .get_metrics() + .query_one("pageserver_io_operations_bytes_total", labels) + .value + ) + + assert bytes_written == 0 + + if action == "delete_timeline": + env.storage_controller.pageserver_api().timeline_delete(tenant_id, child_timeline_id) + ps_attached.http_client().tenant_heatmap_upload(tenant_id) + status, _ = ps_secondary.http_client().tenant_secondary_download(tenant_id, wait_ms=5000) + assert status == 200 + elif action == "detach": + env.storage_controller.tenant_policy_update(tenant_id, {"placement": 
{"Attached": 0}}) + env.storage_controller.reconcile_until_idle() + else: + raise Exception("Unexpected action") + + assert ( + len( + ps_secondary.http_client() + .get_metrics() + .query_all("pageserver_io_operations_bytes_total", labels) + ) + == 0 + ) diff --git a/test_runner/regress/test_pg_query_cancellation.py b/test_runner/regress/test_pg_query_cancellation.py index d4ed7230fa..782595fad0 100644 --- a/test_runner/regress/test_pg_query_cancellation.py +++ b/test_runner/regress/test_pg_query_cancellation.py @@ -1,13 +1,16 @@ from __future__ import annotations from contextlib import closing +from typing import TYPE_CHECKING import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonPageserver -from fixtures.pageserver.http import PageserverHttpClient from psycopg2.errors import QueryCanceled +if TYPE_CHECKING: + from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonPageserver + from fixtures.pageserver.http import PageserverHttpClient + CRITICAL_PG_PS_WAIT_FAILPOINTS: set[str] = { "ps::connection-start::pre-login", "ps::connection-start::startup-packet", diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 1d9f385358..a3fae97327 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -4,7 +4,6 @@ from __future__ import annotations from concurrent.futures import ThreadPoolExecutor -from pathlib import Path from typing import TYPE_CHECKING, Any, cast import pytest @@ -21,6 +20,8 @@ from fixtures.remote_storage import s3_storage from fixtures.utils import skip_in_debug_build if TYPE_CHECKING: + from pathlib import Path + from fixtures.neon_fixtures import PgBin from pytest import CaptureFixture diff --git a/test_runner/regress/test_pg_waldump.py b/test_runner/regress/test_pg_waldump.py index c98d395451..3ece555a72 100644 --- a/test_runner/regress/test_pg_waldump.py +++ b/test_runner/regress/test_pg_waldump.py @@ -2,10 
+2,13 @@ from __future__ import annotations import os import shutil +from typing import TYPE_CHECKING -from fixtures.neon_fixtures import NeonEnv, PgBin from fixtures.utils import subprocess_capture +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv, PgBin + def check_wal_segment(pg_waldump_path: str, segment_path: str, test_output_dir): # use special --ignore option to ignore the validation checks in pg_waldump diff --git a/test_runner/regress/test_physical_replication.py b/test_runner/regress/test_physical_replication.py index 17819fd367..1ebf70dbf2 100644 --- a/test_runner/regress/test_physical_replication.py +++ b/test_runner/regress/test_physical_replication.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import wait_replica_caughtup +from fixtures.utils import shared_buffers_for_max_cu if TYPE_CHECKING: from fixtures.neon_fixtures import NeonEnv @@ -180,7 +181,8 @@ def test_physical_replication_config_mismatch_too_many_known_xids(neon_simple_en endpoint_id="primary", config_lines=[ "max_connections=1000", - "shared_buffers=128MB", # prevent "no unpinned buffers available" error + # use shared_buffers size like in production for 2 CU compute + f"shared_buffers={shared_buffers_for_max_cu(2.0)}", # prevent "no unpinned buffers available" error ], ) secondary = env.endpoints.new_replica_start( diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py index d983d77e72..0e3e667844 100644 --- a/test_runner/regress/test_pitr_gc.py +++ b/test_runner/regress/test_pitr_gc.py @@ -1,10 +1,14 @@ from __future__ import annotations +from typing import TYPE_CHECKING + from fixtures.common_types import TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.utils import print_gc_result, query_scalar +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder + # # Check pitr_interval GC 
behavior. diff --git a/test_runner/regress/test_postgres_version.py b/test_runner/regress/test_postgres_version.py index 5eb743809f..da5b3993a4 100644 --- a/test_runner/regress/test_postgres_version.py +++ b/test_runner/regress/test_postgres_version.py @@ -3,9 +3,11 @@ from __future__ import annotations import json import re from pathlib import Path +from typing import TYPE_CHECKING -from fixtures.neon_fixtures import PgBin -from fixtures.pg_version import PgVersion +if TYPE_CHECKING: + from fixtures.neon_fixtures import PgBin + from fixtures.pg_version import PgVersion def test_postgres_version(base_dir: Path, pg_bin: PgBin, pg_version: PgVersion): @@ -32,8 +34,8 @@ def test_postgres_version(base_dir: Path, pg_bin: PgBin, pg_version: PgVersion): version = match.group("version") commit = match.group("commit") - assert ( - pg_version.v_prefixed in expected_revisions - ), f"Released PostgreSQL version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional" + assert pg_version.v_prefixed in expected_revisions, ( + f"Released PostgreSQL version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional" + ) msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional" assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg diff --git a/test_runner/regress/test_prefetch_buffer_resize.py b/test_runner/regress/test_prefetch_buffer_resize.py index 99fe80e621..299320e770 100644 --- a/test_runner/regress/test_prefetch_buffer_resize.py +++ b/test_runner/regress/test_prefetch_buffer_resize.py @@ -1,9 +1,12 @@ from __future__ import annotations import random +from typing import TYPE_CHECKING import pytest -from fixtures.neon_fixtures import NeonEnvBuilder + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder @pytest.mark.parametrize("shard_count", 
[None, 4]) diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 3c7fd0b897..9860658ba5 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -648,6 +648,6 @@ def test_sql_over_http_connection_cancel(static_proxy: NeonProxy): assert res["rowCount"] == 1, "HTTP query should insert" res = static_proxy.http_query(query, [0, 1], user="http", password="http", expected_code=400) - assert ( - "duplicate key value violates unique constraint" in res["message"] - ), "HTTP query should conflict" + assert "duplicate key value violates unique constraint" in res["message"], ( + "HTTP query should conflict" + ) diff --git a/test_runner/regress/test_proxy_allowed_ips.py b/test_runner/regress/test_proxy_allowed_ips.py index c59da8c6b0..7384326385 100644 --- a/test_runner/regress/test_proxy_allowed_ips.py +++ b/test_runner/regress/test_proxy_allowed_ips.py @@ -1,11 +1,15 @@ from __future__ import annotations +from typing import TYPE_CHECKING + import psycopg2 import pytest -from fixtures.neon_fixtures import ( - NeonProxy, - VanillaPostgres, -) + +if TYPE_CHECKING: + from fixtures.neon_fixtures import ( + NeonProxy, + VanillaPostgres, + ) TABLE_NAME = "neon_control_plane.endpoints" diff --git a/test_runner/regress/test_proxy_metric_collection.py b/test_runner/regress/test_proxy_metric_collection.py index 5ff4a99c51..85d8a6daaa 100644 --- a/test_runner/regress/test_proxy_metric_collection.py +++ b/test_runner/regress/test_proxy_metric_collection.py @@ -1,7 +1,5 @@ from __future__ import annotations -from collections.abc import Iterator -from pathlib import Path from typing import TYPE_CHECKING import pytest @@ -11,13 +9,16 @@ from fixtures.neon_fixtures import ( NeonProxy, VanillaPostgres, ) -from fixtures.port_distributor import PortDistributor -from pytest_httpserver import HTTPServer -from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response if TYPE_CHECKING: + from 
collections.abc import Iterator + from pathlib import Path + from fixtures.httpserver import ListenAddress + from fixtures.port_distributor import PortDistributor + from pytest_httpserver import HTTPServer + from werkzeug.wrappers.request import Request def proxy_metrics_handler(request: Request) -> Response: diff --git a/test_runner/regress/test_proxy_websockets.py b/test_runner/regress/test_proxy_websockets.py index f14317a39f..8a7c0288e0 100644 --- a/test_runner/regress/test_proxy_websockets.py +++ b/test_runner/regress/test_proxy_websockets.py @@ -2,14 +2,17 @@ from __future__ import annotations import asyncio import ssl +from typing import TYPE_CHECKING import asyncpg import pytest import websocket_tunnel import websockets from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonProxy -from fixtures.port_distributor import PortDistributor + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonProxy + from fixtures.port_distributor import PortDistributor @pytest.mark.asyncio @@ -84,9 +87,9 @@ async def test_websockets(static_proxy: NeonProxy): assert query_response[0:1] == b"D", "should be data row message" data_row_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 data_row, query_response = query_response[:data_row_len], query_response[data_row_len:] - assert ( - data_row == b"D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011" - ), "should contain 1 column with text value 1" + assert data_row == b"D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011", ( + "should contain 1 column with text value 1" + ) assert query_response[0:1] == b"C", "should be command complete message" command_complete_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 @@ -184,9 +187,9 @@ async def test_websockets_pipelined(static_proxy: NeonProxy): assert query_response[0:1] == b"D", "should be data row message" data_row_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 data_row, query_response = query_response[:data_row_len], 
query_response[data_row_len:] - assert ( - data_row == b"D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011" - ), "should contain 1 column with text value 1" + assert data_row == b"D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011", ( + "should contain 1 column with text value 1" + ) assert query_response[0:1] == b"C", "should be command complete message" command_complete_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py index 70a7a675df..88655c1d41 100644 --- a/test_runner/regress/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -1,12 +1,15 @@ from __future__ import annotations from contextlib import closing +from typing import TYPE_CHECKING from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv from fixtures.utils import query_scalar from psycopg2.errors import IoError, UndefinedTable +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv + pytest_plugins = "fixtures.neon_fixtures" extensions = ["pageinspect", "neon_test_utils", "pg_buffercache"] diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index fe970a868c..ee934a900d 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -1,6 +1,7 @@ from __future__ import annotations import time +from typing import TYPE_CHECKING import pytest from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId @@ -13,10 +14,12 @@ from fixtures.neon_fixtures import ( last_flush_lsn_upload, tenant_get_shards, ) -from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_last_record_lsn from fixtures.utils import query_scalar, wait_until +if TYPE_CHECKING: + from fixtures.pageserver.http import PageserverHttpClient + # # Create read-only compute nodes, anchored at historical points in time. 
@@ -207,9 +210,9 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): # Note: cannot assert on `layers_removed` here because it could be layers # not guarded by the lease. Instead, use layer map dump. - assert layers_guarded_before_gc.issubset( - layers_guarded_after_gc - ), "Layers guarded by lease before GC should not be removed" + assert layers_guarded_before_gc.issubset(layers_guarded_after_gc), ( + "Layers guarded by lease before GC should not be removed" + ) log.info(f"{gc_result=}") diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index dab01fcd1a..5e081a4fda 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -2,9 +2,12 @@ from __future__ import annotations import time from contextlib import closing +from typing import TYPE_CHECKING from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder # diff --git a/test_runner/regress/test_relations.py b/test_runner/regress/test_relations.py index 07eacfc775..b2ddcb1c2e 100644 --- a/test_runner/regress/test_relations.py +++ b/test_runner/regress/test_relations.py @@ -1,8 +1,11 @@ from __future__ import annotations -from fixtures.neon_fixtures import ( - NeonEnvBuilder, -) +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from fixtures.neon_fixtures import ( + NeonEnvBuilder, + ) def test_pageserver_reldir_v2( diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index e8721f1ea0..298aae39ee 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -180,9 +180,9 @@ def test_remote_storage_backup_and_restore( # The initiated attach operation should survive the restart, and continue from where it was. 
env.pageserver.stop() layer_download_failed_regex = r"Failed to download a remote file: simulated failure of remote operation Download.*[0-9A-F]+-[0-9A-F]+" - assert not env.pageserver.log_contains( - layer_download_failed_regex - ), "we shouldn't have tried any layer downloads yet since list remote timelines has a failpoint" + assert not env.pageserver.log_contains(layer_download_failed_regex), ( + "we shouldn't have tried any layer downloads yet since list remote timelines has a failpoint" + ) env.pageserver.start() # The attach should have got far enough that it recovers on restart (i.e. tenant's @@ -197,9 +197,9 @@ def test_remote_storage_backup_and_restore( detail = client.timeline_detail(tenant_id, timeline_id) log.info("Timeline detail after attach completed: %s", detail) - assert ( - Lsn(detail["last_record_lsn"]) >= current_lsn - ), "current db Lsn should should not be less than the one stored on remote storage" + assert Lsn(detail["last_record_lsn"]) >= current_lsn, ( + "current db Lsn should should not be less than the one stored on remote storage" + ) log.info("select some data, this will cause layers to be downloaded") endpoint = env.endpoints.create_start("main") @@ -456,9 +456,9 @@ def test_remote_timeline_client_calls_started_metric( def ensure_calls_started_grew(): for (file_kind, op_kind), observations in calls_started.items(): log.info(f"ensure_calls_started_grew: {file_kind} {op_kind}: {observations}") - assert all( - x < y for x, y in zip(observations, observations[1:], strict=False) - ), f"observations for {file_kind} {op_kind} did not grow monotonically: {observations}" + assert all(x < y for x, y in zip(observations, observations[1:], strict=False)), ( + f"observations for {file_kind} {op_kind} did not grow monotonically: {observations}" + ) def churn(data_pass1, data_pass2): # overwrite the same data in place, vacuum inbetween, and @@ -540,7 +540,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( "l0_flush_stall_threshold": 
"0", "compaction_target_size": f"{64 * 1024}", # large horizon to avoid automatic GC (our assert on gc_result below relies on that) - "gc_horizon": f"{1024 ** 4}", + "gc_horizon": f"{1024**4}", "gc_period": "1h", # disable PITR so that GC considers just gc_horizon "pitr_interval": "0s", @@ -574,9 +574,9 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( try: client.timeline_checkpoint(tenant_id, timeline_id) except PageserverApiException: - assert ( - checkpoint_allowed_to_fail.is_set() - ), "checkpoint op should only fail in response to timeline deletion" + assert checkpoint_allowed_to_fail.is_set(), ( + "checkpoint op should only fail in response to timeline deletion" + ) checkpoint_thread = threading.Thread(target=checkpoint_thread_fn) checkpoint_thread.start() @@ -662,9 +662,9 @@ def test_empty_branch_remote_storage_upload(neon_env_builder: NeonEnvBuilder): ) ) expected_timelines = set([env.initial_timeline, new_branch_timeline_id]) - assert ( - timelines_before_detach == expected_timelines - ), f"Expected to have an initial timeline and the branch timeline only, but got {timelines_before_detach}" + assert timelines_before_detach == expected_timelines, ( + f"Expected to have an initial timeline and the branch timeline only, but got {timelines_before_detach}" + ) client.tenant_detach(env.initial_tenant) env.pageserver.tenant_attach(env.initial_tenant) @@ -677,9 +677,9 @@ def test_empty_branch_remote_storage_upload(neon_env_builder: NeonEnvBuilder): ) ) - assert ( - timelines_before_detach == timelines_after_detach - ), f"Expected to have same timelines after reattach, but got {timelines_after_detach}" + assert timelines_before_detach == timelines_after_detach, ( + f"Expected to have same timelines after reattach, but got {timelines_after_detach}" + ) def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnvBuilder): @@ -724,9 +724,9 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv 
new_branch_on_remote_storage = env.pageserver_remote_storage.timeline_path( env.initial_tenant, new_branch_timeline_id ) - assert ( - not new_branch_on_remote_storage.exists() - ), "failpoint should had prohibited index_part.json upload" + assert not new_branch_on_remote_storage.exists(), ( + "failpoint should had prohibited index_part.json upload" + ) # during reconciliation we should had scheduled the uploads and on the # retried create_timeline, we will await for those to complete on next @@ -768,9 +768,9 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv client.configure_failpoints(("before-upload-index", "off")) exception = q.get() - assert ( - exception is None - ), "create_timeline should have succeeded, because we deleted unuploaded local state" + assert exception is None, ( + "create_timeline should have succeeded, because we deleted unuploaded local state" + ) # this is because creating a timeline always awaits for the uploads to complete assert_nothing_to_upload(client, env.initial_tenant, new_branch_timeline_id) diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py index 8764da3c2f..082808f9ff 100644 --- a/test_runner/regress/test_s3_restore.py +++ b/test_runner/regress/test_s3_restore.py @@ -2,13 +2,10 @@ from __future__ import annotations import time from datetime import UTC, datetime +from typing import TYPE_CHECKING from fixtures.common_types import Lsn from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - NeonEnvBuilder, - PgBin, -) from fixtures.pageserver.utils import ( assert_prefix_empty, enable_remote_storage_versioning, @@ -18,6 +15,12 @@ from fixtures.pageserver.utils import ( from fixtures.remote_storage import RemoteStorageKind, s3_storage from fixtures.utils import run_pg_bench_small +if TYPE_CHECKING: + from fixtures.neon_fixtures import ( + NeonEnvBuilder, + PgBin, + ) + def test_tenant_s3_restore( neon_env_builder: NeonEnvBuilder, @@ -80,14 
+83,14 @@ def test_tenant_s3_restore( ts_before_deletion = datetime.now(tz=UTC).replace(tzinfo=None) time.sleep(4) - assert ( - ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 - ), "tenant removed before we deletion was issued" + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1, ( + "tenant removed before we deletion was issued" + ) ps_http.tenant_delete(tenant_id) ps_http.deletion_queue_flush(execute=True) - assert ( - ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0 - ), "tenant removed before we deletion was issued" + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0, ( + "tenant removed before we deletion was issued" + ) env.storage_controller.attach_hook_drop(tenant_id) tenant_path = env.pageserver.tenant_dir(tenant_id) diff --git a/test_runner/regress/test_safekeeper_deletion.py b/test_runner/regress/test_safekeeper_deletion.py index b46095d583..b681a86103 100644 --- a/test_runner/regress/test_safekeeper_deletion.py +++ b/test_runner/regress/test_safekeeper_deletion.py @@ -4,19 +4,22 @@ import threading import time from contextlib import closing from enum import StrEnum +from typing import TYPE_CHECKING import pytest import requests from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - Endpoint, - NeonEnvBuilder, -) from fixtures.remote_storage import S3Storage, s3_storage from fixtures.safekeeper_utils import is_segment_offloaded from fixtures.utils import wait_until +if TYPE_CHECKING: + from fixtures.neon_fixtures import ( + Endpoint, + NeonEnvBuilder, + ) + @pytest.mark.parametrize("auth_enabled", [False, True]) def test_safekeeper_delete_timeline(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): diff --git a/test_runner/regress/test_setup.py b/test_runner/regress/test_setup.py index dfbbd575b7..fd0f5705c8 100644 --- 
a/test_runner/regress/test_setup.py +++ b/test_runner/regress/test_setup.py @@ -2,7 +2,10 @@ from __future__ import annotations -from fixtures.neon_fixtures import NeonEnvBuilder +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder # Test that pageserver and safekeeper can restart quickly. diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index b98ac8e50a..0bfc4b1d8c 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -8,7 +8,6 @@ from typing import TYPE_CHECKING, Any import pytest import requests from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineArchivalState, TimelineId -from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( DEFAULT_AZ_ID, @@ -23,13 +22,14 @@ from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_emp from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, s3_storage from fixtures.utils import skip_in_debug_build, wait_until from fixtures.workload import Workload -from pytest_httpserver import HTTPServer from typing_extensions import override -from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response if TYPE_CHECKING: + from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.httpserver import ListenAddress + from pytest_httpserver import HTTPServer + from werkzeug.wrappers.request import Request def test_sharding_smoke( @@ -334,9 +334,9 @@ def test_sharding_split_compaction( result = ps.timeline_scan_no_disposable_keys(shard, timeline_id) tally = result.tally raw_page_count = tally.not_disposable_count + tally.disposable_count - assert tally.not_disposable_count > ( - raw_page_count // 2 - ), "compaction doesn't rewrite layers that are >=50pct local" + assert tally.not_disposable_count > (raw_page_count // 2), ( + "compaction 
doesn't rewrite layers that are >=50pct local" + ) log.info("check sizes") timeline_info = ps.http_client().timeline_detail(shard, timeline_id) @@ -808,6 +808,8 @@ def test_sharding_split_stripe_size( httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) + neon_env_builder.storage_controller_config = {"use_local_compute_notifications": False} + env = neon_env_builder.init_start( initial_tenant_shard_count=1, initial_tenant_shard_stripe_size=initial_stripe_size ) @@ -1316,6 +1318,11 @@ def test_sharding_split_failures( initial_shard_count = 2 split_shard_count = 4 + neon_env_builder.storage_controller_config = { + # Route to `compute_reconfigure_listener` instead + "use_local_compute_notifications": False, + } + env = neon_env_builder.init_configs() env.start() @@ -1594,7 +1601,7 @@ def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder): delta_bytes = lsn - last_flush_lsn avg_speed = delta_bytes / delta / 1024 / 1024 log.info( - f"flush_lsn {lsn}, written {delta_bytes/1024}kb for {delta:.3f}s, avg_speed {avg_speed:.3f} MiB/s" + f"flush_lsn {lsn}, written {delta_bytes / 1024}kb for {delta:.3f}s, avg_speed {avg_speed:.3f} MiB/s" ) last_flush_lsn = lsn diff --git a/test_runner/regress/test_sni_router.py b/test_runner/regress/test_sni_router.py index 3487542d6e..19952fc71b 100644 --- a/test_runner/regress/test_sni_router.py +++ b/test_runner/regress/test_sni_router.py @@ -2,18 +2,19 @@ from __future__ import annotations import socket import subprocess -from pathlib import Path from typing import TYPE_CHECKING import backoff from fixtures.log_helper import log from fixtures.neon_fixtures import PgProtocol, VanillaPostgres -from fixtures.port_distributor import PortDistributor if TYPE_CHECKING: + from pathlib import Path from types import TracebackType from typing import Self + from fixtures.port_distributor import PortDistributor + def generate_tls_cert(cn, certout, keyout): subprocess.run( diff --git 
a/test_runner/regress/test_ssl.py b/test_runner/regress/test_ssl.py index 7db4a16f49..9a7204ca17 100644 --- a/test_runner/regress/test_ssl.py +++ b/test_runner/regress/test_ssl.py @@ -1,3 +1,6 @@ +import os +import ssl + import pytest import requests from fixtures.neon_fixtures import NeonEnvBuilder, StorageControllerApiException @@ -66,3 +69,85 @@ def test_safekeeper_https_api(neon_env_builder: NeonEnvBuilder): ) wait_until(storcon_heartbeat) + + +def test_storage_controller_https_api(neon_env_builder: NeonEnvBuilder): + """ + Test HTTPS storage controller API. + If NeonEnv starts with use_https_storage_controller_api with no errors, it's already a success. + Make /status request to HTTPS API to ensure it's appropriately configured. + """ + neon_env_builder.use_https_storage_controller_api = True + env = neon_env_builder.init_start() + + addr = f"https://localhost:{env.storage_controller.port}/status" + requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status() + + +def test_certificate_rotation(neon_env_builder: NeonEnvBuilder): + """ + Test that pageserver reloads certificates when they are updated on the disk. + Safekeepers and storage controller use the same server implementation, so + testing only pageserver is fine. + 1. Simple check that HTTPS API works. + 2. Check that the cert returned by the server matches the cert in file. + 3. Replace ps's cert (but not the key). + 4. Check that ps uses the old cert (because the new one doesn't match the key). + 5. Replace ps's key. + 6. Check that ps reloaded the cert and key and returns the new one. + """ + neon_env_builder.use_https_pageserver_api = True + # Speed up the test :) + neon_env_builder.pageserver_config_override = "ssl_cert_reload_period='100 ms'" + env = neon_env_builder.init_start() + + # We intentionally set an incorrect key/cert pair during the test to test this error. 
+ env.pageserver.allowed_errors.append(".*Error reloading certificate.*") + + port = env.pageserver.service_port.https + assert port is not None + + # 1. Check if https works. + addr = f"https://localhost:{port}/v1/status" + requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status() + + ps_cert_path = env.pageserver.workdir / "server.crt" + ps_key_path = env.pageserver.workdir / "server.key" + ps_cert = open(ps_cert_path).read() + # We need another valid certificate to update to. + # Let's steal it from safekeeper. + sk_cert_path = env.safekeepers[0].data_dir / "server.crt" + sk_key_path = env.safekeepers[0].data_dir / "server.key" + sk_cert = open(sk_cert_path).read() + + # 2. Check that server's certificate match the cert in the file. + cur_cert = ssl.get_server_certificate(("localhost", port)) + assert cur_cert == ps_cert + + # 3. Replace ps's cert with sk's one. + os.rename(sk_cert_path, ps_cert_path) + + # Cert shouldn't be reloaded because it doesn't match private key. + def error_reloading_cert(): + assert env.pageserver.log_contains("Error reloading certificate: .* KeyMismatch") + + wait_until(error_reloading_cert) + + # 4. Check that it uses old cert. + requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status() + cur_cert = ssl.get_server_certificate(("localhost", port)) + assert cur_cert == ps_cert + + # 5. Replace ps's private key with sk's one. + os.rename(sk_key_path, ps_key_path) + + # Wait till ps reloads certificate. + def cert_reloaded(): + assert env.pageserver.log_contains("Certificate has been reloaded") + + wait_until(cert_reloaded) + + # 6. Check that server returns new cert. 
+ requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status() + cur_cert = ssl.get_server_certificate(("localhost", port)) + assert cur_cert == sk_cert diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 05eb4301b0..097c187699 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -13,7 +13,6 @@ import fixtures.utils import pytest from fixtures.auth_tokens import TokenScope from fixtures.common_types import TenantId, TenantShardId, TimelineId -from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( DEFAULT_AZ_ID, @@ -40,9 +39,7 @@ from fixtures.pageserver.utils import ( timeline_delete_wait_completed, ) from fixtures.pg_version import PgVersion -from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import RemoteStorageKind, s3_storage -from fixtures.storage_controller_proxy import StorageControllerProxy from fixtures.utils import ( run_only_on_default_postgres, run_pg_bench_small, @@ -50,18 +47,21 @@ from fixtures.utils import ( wait_until, ) from fixtures.workload import Workload -from mypy_boto3_s3.type_defs import ( - ObjectTypeDef, -) -from pytest_httpserver import HTTPServer from urllib3 import Retry -from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response if TYPE_CHECKING: from typing import Any + from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.httpserver import ListenAddress + from fixtures.port_distributor import PortDistributor + from fixtures.storage_controller_proxy import StorageControllerProxy + from mypy_boto3_s3.type_defs import ( + ObjectTypeDef, + ) + from pytest_httpserver import HTTPServer + from werkzeug.wrappers.request import Request def get_node_shard_counts(env: NeonEnv, tenant_ids): @@ -73,7 +73,9 @@ def get_node_shard_counts(env: NeonEnv, 
tenant_ids): @pytest.mark.parametrize(**fixtures.utils.allpairs_versions()) -def test_storage_controller_smoke(neon_env_builder: NeonEnvBuilder, combination): +def test_storage_controller_smoke( + neon_env_builder: NeonEnvBuilder, compute_reconfigure_listener: ComputeReconfigure, combination +): """ Test the basic lifecycle of a storage controller: - Restarting @@ -83,6 +85,7 @@ def test_storage_controller_smoke(neon_env_builder: NeonEnvBuilder, combination) """ neon_env_builder.num_pageservers = 3 + neon_env_builder.control_plane_hooks_api = compute_reconfigure_listener.control_plane_hooks_api env = neon_env_builder.init_configs() # Start services by hand so that we can skip a pageserver (this will start + register later) @@ -144,9 +147,9 @@ def test_storage_controller_smoke(neon_env_builder: NeonEnvBuilder, combination) for node_id, count in get_node_shard_counts(env, tenant_ids).items(): # we used a multiple of pagservers for the total shard count, # so expect equal number on all pageservers - assert count == tenant_shard_count / len( - env.pageservers - ), f"Node {node_id} has bad count {count}" + assert count == tenant_shard_count / len(env.pageservers), ( + f"Node {node_id} has bad count {count}" + ) # Creating and deleting timelines should work, using identical API to pageserver timeline_crud_tenant = next(iter(tenant_ids)) @@ -620,6 +623,8 @@ def test_storage_controller_compute_hook( httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) + neon_env_builder.storage_controller_config = {"use_local_compute_notifications": False} + # Start running env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) @@ -738,6 +743,8 @@ def test_storage_controller_stuck_compute_hook( httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) + neon_env_builder.storage_controller_config = {"use_local_compute_notifications": False} + # Start running env = 
neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) @@ -885,6 +892,8 @@ def test_storage_controller_compute_hook_retry( httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) + neon_env_builder.storage_controller_config = {"use_local_compute_notifications": False} + # Start running env = neon_env_builder.init_configs() env.start() @@ -1008,6 +1017,8 @@ def test_storage_controller_compute_hook_revert( httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) + neon_env_builder.storage_controller_config = {"use_local_compute_notifications": False} + # Start running env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) tenant_id = env.initial_tenant @@ -1374,8 +1385,9 @@ def test_storage_controller_tenant_conf(neon_env_builder: NeonEnvBuilder): # vs. pageserver API calls, because pageserver has defaults. http.set_tenant_config(tenant_id, {}) readback_controller = http.tenant_config(tenant_id) - assert readback_controller.effective_config["pitr_interval"] is None - assert readback_controller.tenant_specific_overrides["pitr_interval"] is None + + assert "pitr_interval" not in readback_controller.effective_config.keys() + assert "pitr_interval" not in readback_controller.tenant_specific_overrides.keys() readback_ps = env.pageservers[0].http_client().tenant_config(tenant_id) assert readback_ps.effective_config["pitr_interval"] == default_value assert "pitr_interval" not in readback_ps.tenant_specific_overrides @@ -1397,6 +1409,11 @@ def test_storage_controller_tenant_deletion( neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.control_plane_hooks_api = compute_reconfigure_listener.control_plane_hooks_api + neon_env_builder.storage_controller_config = { + # Route to `compute_reconfigure_listener` instead + "use_local_compute_notifications": False, + } + env = neon_env_builder.init_configs() env.start() @@ -1598,6 +1615,12 @@ def 
test_storage_controller_heartbeats( env.storage_controller.allowed_errors.append( ".*Call to node.*management API.*failed.*failpoint.*" ) + # The server starts listening to the socket before sending re-attach request, + # but it starts serving HTTP only when re-attach is completed. + # If re-attach is slow (last scenario), storcon's heartbeat requests will time out. + env.storage_controller.allowed_errors.append( + ".*Call to node.*management API.*failed.* Timeout.*" + ) # Initially we have two online pageservers nodes = env.storage_controller.node_list() @@ -2169,7 +2192,12 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto @pytest.mark.parametrize(**fixtures.utils.allpairs_versions()) @pytest.mark.parametrize("num_azs", [1, 2]) -def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder, num_azs: int, combination): +def test_graceful_cluster_restart( + neon_env_builder: NeonEnvBuilder, + num_azs: int, + compute_reconfigure_listener: ComputeReconfigure, + combination, +): """ Graceful reststart of storage controller clusters use the drain and fill hooks in order to migrate attachments away from pageservers before @@ -2181,6 +2209,7 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder, num_azs: int """ neon_env_builder.num_azs = num_azs neon_env_builder.num_pageservers = 2 + neon_env_builder.control_plane_hooks_api = compute_reconfigure_listener.control_plane_hooks_api env = neon_env_builder.init_configs() env.start() @@ -2436,7 +2465,6 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): @pytest.mark.parametrize("while_offline", [True, False]) def test_storage_controller_node_deletion( neon_env_builder: NeonEnvBuilder, - compute_reconfigure_listener: ComputeReconfigure, while_offline: bool, ): """ @@ -2862,6 +2890,143 @@ def test_storage_controller_leadership_transfer( ) +def test_storage_controller_leadership_transfer_during_split( + neon_env_builder: NeonEnvBuilder, + 
storage_controller_proxy: StorageControllerProxy, + port_distributor: PortDistributor, +): + """ + Exercise a race between shard splitting and graceful leadership transfer. This is + a reproducer for https://github.com/neondatabase/neon/issues/11254 + """ + neon_env_builder.auth_enabled = True + + neon_env_builder.num_pageservers = 3 + + neon_env_builder.storage_controller_config = { + "database_url": f"127.0.0.1:{port_distributor.get_port()}", + "start_as_candidate": True, + } + + neon_env_builder.storage_controller_port_override = storage_controller_proxy.port() + + storage_controller_1_port = port_distributor.get_port() + storage_controller_2_port = port_distributor.get_port() + + storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}") + + env = neon_env_builder.init_configs() + start_env(env, storage_controller_1_port) + + assert ( + env.storage_controller.get_leadership_status() == StorageControllerLeadershipStatus.LEADER + ) + leader = env.storage_controller.get_leader() + assert leader["address"] == f"http://127.0.0.1:{storage_controller_1_port}/" + + tenant_count = 2 + shard_count = 4 + tenants = set(TenantId.generate() for _ in range(0, tenant_count)) + + for tid in tenants: + env.storage_controller.tenant_create( + tid, shard_count=shard_count, placement_policy={"Attached": 1} + ) + env.storage_controller.reconcile_until_idle() + + with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + # Start a shard split + env.storage_controller.allowed_errors.extend( + [".*Unexpected child shard count.*", ".*Enqueuing background abort.*"] + ) + pause_failpoint = "shard-split-pre-complete" + env.storage_controller.configure_failpoints((pause_failpoint, "pause")) + split_fut = executor.submit( + env.storage_controller.tenant_shard_split, list(tenants)[0], shard_count * 2 + ) + + def hit_failpoint(): + log.info("Checking log for pattern...") + try: + assert env.storage_controller.log_contains(f".*at failpoint 
{pause_failpoint}.*") + except Exception: + log.exception("Failed to find pattern in log") + raise + + wait_until(hit_failpoint, interval=0.1, status_interval=1.0) + + env.storage_controller.start( + timeout_in_seconds=30, instance_id=2, base_port=storage_controller_2_port + ) + + def passed_split_abort(): + try: + log.info("Checking log for pattern...") + assert env.storage_controller.log_contains( + ".*Using observed state received from leader.*" + ) + except Exception: + log.exception("Failed to find pattern in log") + raise + + log.info("Awaiting split abort") + wait_until(passed_split_abort, interval=0.1, status_interval=1.0) + assert env.storage_controller.log_contains(".*Aborting shard split.*") + + # Proxy is still talking to original controller here: disable its pause failpoint so + # that its shard split can run to completion. + log.info("Disabling failpoint") + # Bypass the proxy: the python test HTTPServer is single threaded and still blocked + # on handling the shard split request. 
+ env.storage_controller.request( + "PUT", + f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints", + json=[{"name": "shard-split-pre-complete", "actions": "off"}], + headers=env.storage_controller.headers(TokenScope.ADMIN), + ) + + def previous_stepped_down(): + assert ( + env.storage_controller.get_leadership_status() + == StorageControllerLeadershipStatus.STEPPED_DOWN + ) + + log.info("Awaiting step down") + wait_until(previous_stepped_down) + + # Let the shard split complete: this may happen _after_ the replacement has come up + # and tried to clean up the databases + log.info("Unblocking & awaiting shard split") + with pytest.raises(Exception, match="Unexpected child shard count"): + # This split fails when it tries to persist results, because it encounters + # changes already made by the new controller's abort-on-startup + split_fut.result() + + log.info("Routing to new leader") + storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}") + + def new_becomes_leader(): + assert ( + env.storage_controller.get_leadership_status() + == StorageControllerLeadershipStatus.LEADER + ) + + wait_until(new_becomes_leader) + leader = env.storage_controller.get_leader() + assert leader["address"] == f"http://127.0.0.1:{storage_controller_2_port}/" + + env.storage_controller.wait_until_ready() + env.storage_controller.consistency_check() + + # Check that the stepped down instance forwards requests + # to the new leader while it's still running. 
+ storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}") + env.storage_controller.tenant_shard_dump() + env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"}) + status = env.storage_controller.node_status(env.pageservers[0].id) + assert status["scheduling"] == "Pause" + + def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder): # single unsharded tenant, two locations neon_env_builder.num_pageservers = 2 @@ -3908,6 +4073,101 @@ def test_storage_controller_location_conf_equivalence(neon_env_builder: NeonEnvB assert reconciles_after_restart == 0 +@run_only_on_default_postgres("PG version is not interesting here") +@pytest.mark.parametrize("restart_storcon", [True, False]) +def test_storcon_create_delete_sk_down(neon_env_builder: NeonEnvBuilder, restart_storcon: bool): + """ + Test that the storcon can create and delete tenants and timelines with a safekeeper being down. + - restart_storcon: tests whether the pending ops are persisted. + if we don't restart, we test that we don't require it to come from the db. 
+ """ + + neon_env_builder.num_safekeepers = 3 + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": True, + } + env = neon_env_builder.init_start() + + env.safekeepers[0].stop() + + # Wait for heartbeater to pick up that the safekeeper is gone + # This isn't really necessary + def logged_offline(): + env.storage_controller.assert_log_contains( + "Heartbeat round complete for 3 safekeepers, 1 offline" + ) + + wait_until(logged_offline) + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.create_tenant(tenant_id, timeline_id) + + env.safekeepers[1].assert_log_contains(f"creating new timeline {tenant_id}/{timeline_id}") + env.safekeepers[2].assert_log_contains(f"creating new timeline {tenant_id}/{timeline_id}") + + env.storage_controller.allowed_errors.extend( + [ + ".*Call to safekeeper.* management API still failed after.*", + ".*reconcile_one.*tenant_id={tenant_id}.*Call to safekeeper.* management API still failed after.*", + ] + ) + + if restart_storcon: + # Restart the storcon to check that we persist operations + env.storage_controller.stop() + env.storage_controller.start() + + config_lines = [ + "neon.safekeeper_proto_version = 3", + ] + with env.endpoints.create("main", tenant_id=tenant_id, config_lines=config_lines) as ep: + # endpoint should start. 
+ ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3]) + ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)") + + env.storage_controller.assert_log_contains("writing pending op for sk id 1") + env.safekeepers[0].start() + + # ensure that we applied the operation also for the safekeeper we just brought down + def logged_contains_on_sk(): + env.safekeepers[0].assert_log_contains( + f"pulling timeline {tenant_id}/{timeline_id} from safekeeper" + ) + + wait_until(logged_contains_on_sk) + + env.safekeepers[1].stop() + + env.storage_controller.pageserver_api().tenant_delete(tenant_id) + + # ensure the safekeeper deleted the timeline + def timeline_deleted_on_active_sks(): + env.safekeepers[0].assert_log_contains( + f"deleting timeline {tenant_id}/{timeline_id} from disk" + ) + env.safekeepers[2].assert_log_contains( + f"deleting timeline {tenant_id}/{timeline_id} from disk" + ) + + wait_until(timeline_deleted_on_active_sks) + + if restart_storcon: + # Restart the storcon to check that we persist operations + env.storage_controller.stop() + env.storage_controller.start() + + env.safekeepers[1].start() + + # ensure that there is log msgs for the third safekeeper too + def timeline_deleted_on_sk(): + env.safekeepers[1].assert_log_contains( + f"deleting timeline {tenant_id}/{timeline_id} from disk" + ) + + wait_until(timeline_deleted_on_sk) + + @pytest.mark.parametrize("wrong_az", [True, False]) def test_storage_controller_graceful_migration(neon_env_builder: NeonEnvBuilder, wrong_az: bool): """ @@ -4011,3 +4271,121 @@ def test_storage_controller_graceful_migration(neon_env_builder: NeonEnvBuilder, ) else: assert initial_ps.http_client().tenant_list_locations()["tenant_shards"] == [] + + +@run_only_on_default_postgres("this is like a 'unit test' against storcon db") +def test_storage_controller_migrate_with_pageserver_restart( + neon_env_builder: NeonEnvBuilder, make_httpserver +): + """ + Test that live migrations which fail right after incrementing the 
generation + due to the destination going offline eventually send a compute notification + after the destination re-attaches. + """ + neon_env_builder.num_pageservers = 2 + + neon_env_builder.storage_controller_config = { + # Disable transitions to offline + "max_offline": "600s", + "use_local_compute_notifications": False, + } + + neon_env_builder.control_plane_hooks_api = ( + f"http://{make_httpserver.host}:{make_httpserver.port}/" + ) + + notifications = [] + + def notify(request: Request): + log.info(f"Received notify-attach: {request}") + notifications.append(request.json) + + make_httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(notify) + + env = neon_env_builder.init_start() + + env.storage_controller.allowed_errors.extend( + [ + ".*Call to node.*management API failed.*", + ".*Call to node.*management API still failed.*", + ".*Reconcile error.*", + ".*request.*PUT.*migrate.*", + ] + ) + + env.storage_controller.tenant_policy_update(env.initial_tenant, {"placement": {"Attached": 1}}) + env.storage_controller.reconcile_until_idle() + + initial_desc = env.storage_controller.tenant_describe(env.initial_tenant)["shards"][0] + log.info(f"{initial_desc=}") + primary = env.get_pageserver(initial_desc["node_attached"]) + secondary = env.get_pageserver(initial_desc["node_secondary"][0]) + + # Pause the migration after incrementing the generation in the database + env.storage_controller.configure_failpoints( + ("reconciler-live-migrate-post-generation-inc", "pause") + ) + + tenant_shard_id = TenantShardId(env.initial_tenant, 0, 0) + + try: + with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + migrate_fut = executor.submit( + env.storage_controller.tenant_shard_migrate, + tenant_shard_id, + secondary.id, + config=StorageControllerMigrationConfig(prewarm=False, override_scheduler=True), + ) + + def has_hit_migration_failpoint(): + expr = "at failpoint reconciler-live-migrate-post-generation-inc" + log.info(expr) + assert 
env.storage_controller.log_contains(expr) + + wait_until(has_hit_migration_failpoint) + + secondary.stop() + + # Eventually migration completes + env.storage_controller.configure_failpoints( + ("reconciler-live-migrate-post-generation-inc", "off") + ) + try: + migrate_fut.result() + except StorageControllerApiException as err: + log.info(f"Migration failed: {err}") + except: + env.storage_controller.configure_failpoints( + ("reconciler-live-migrate-post-generation-inc", "off") + ) + raise + + def process_migration_result(): + dump = env.storage_controller.tenant_shard_dump() + observed = dump[0]["observed"]["locations"] + + log.info(f"{observed=} primary={primary.id} secondary={secondary.id}") + + assert observed[str(primary.id)]["conf"]["mode"] == "AttachedStale" + assert observed[str(secondary.id)]["conf"] is None + + wait_until(process_migration_result) + + # Start and wait for re-attach to be processed + secondary.start() + env.storage_controller.poll_node_status( + secondary.id, + desired_availability=PageserverAvailability.ACTIVE, + desired_scheduling_policy=None, + max_attempts=10, + backoff=1, + ) + + env.storage_controller.reconcile_until_idle() + + assert notifications[-1] == { + "tenant_id": str(env.initial_tenant), + "stripe_size": None, + "shards": [{"node_id": int(secondary.id), "shard_number": 0}], + "preferred_az": DEFAULT_AZ_ID, + } diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 0f4e5688a9..70af299de3 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -6,19 +6,22 @@ import shutil import threading import time from concurrent.futures import ThreadPoolExecutor +from typing import TYPE_CHECKING import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - NeonEnv, - NeonEnvBuilder, -) from fixtures.pg_version import PgVersion from 
fixtures.remote_storage import S3Storage, s3_storage from fixtures.utils import wait_until from fixtures.workload import Workload +if TYPE_CHECKING: + from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + ) + @pytest.mark.parametrize("shard_count", [None, 4]) def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: int | None): diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py index 8ad7282ea2..f084911fcc 100644 --- a/test_runner/regress/test_subscriber_restart.py +++ b/test_runner/regress/test_subscriber_restart.py @@ -2,11 +2,14 @@ from __future__ import annotations import threading import time +from typing import TYPE_CHECKING import pytest -from fixtures.neon_fixtures import NeonEnv from fixtures.utils import wait_until +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv + # This test checks of logical replication subscriber is able to correctly restart replication without receiving duplicates. 
# It requires tracking information about replication origins at page server side diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 0c2d535af4..de6bdc0aec 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -5,9 +5,6 @@ from typing import TYPE_CHECKING import pytest from fixtures.common_types import Lsn -from fixtures.neon_fixtures import ( - NeonEnvBuilder, -) from fixtures.pageserver.utils import assert_tenant_state, wait_for_upload from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.utils import run_only_on_default_postgres, wait_until @@ -16,6 +13,10 @@ from fixtures.workload import Workload if TYPE_CHECKING: from typing import Any + from fixtures.neon_fixtures import ( + NeonEnvBuilder, + ) + def test_tenant_config(neon_env_builder: NeonEnvBuilder): """Test per tenant configuration""" @@ -68,9 +69,9 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): # check the configuration of the default tenant # it should match global configuration default_tenant_config = http_client.tenant_config(tenant_id=env.initial_tenant) - assert ( - not default_tenant_config.tenant_specific_overrides - ), "Should have no specific settings yet" + assert not default_tenant_config.tenant_specific_overrides, ( + "Should have no specific settings yet" + ) effective_config = default_tenant_config.effective_config assert effective_config["checkpoint_distance"] == 10000 assert effective_config["compaction_target_size"] == 1048576 @@ -92,22 +93,22 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): new_specific_config = new_tenant_config.tenant_specific_overrides assert new_specific_config["checkpoint_distance"] == 20000 assert new_specific_config["gc_period"] == "30s" - assert len(new_specific_config) == len( - new_conf - ), f"No more specific properties were expected, but got: {new_specific_config}" + assert len(new_specific_config) == 
len(new_conf), ( + f"No more specific properties were expected, but got: {new_specific_config}" + ) new_effective_config = new_tenant_config.effective_config - assert ( - new_effective_config["checkpoint_distance"] == 20000 - ), "Specific 'checkpoint_distance' config should override the default value" - assert ( - new_effective_config["gc_period"] == "30s" - ), "Specific 'gc_period' config should override the default value" - assert ( - new_effective_config["evictions_low_residence_duration_metric_threshold"] == "42s" - ), "Should override default value" - assert new_effective_config["eviction_policy"] == { - "kind": "NoEviction" - }, "Specific 'eviction_policy' config should override the default value" + assert new_effective_config["checkpoint_distance"] == 20000, ( + "Specific 'checkpoint_distance' config should override the default value" + ) + assert new_effective_config["gc_period"] == "30s", ( + "Specific 'gc_period' config should override the default value" + ) + assert new_effective_config["evictions_low_residence_duration_metric_threshold"] == "42s", ( + "Should override default value" + ) + assert new_effective_config["eviction_policy"] == {"kind": "NoEviction"}, ( + "Specific 'eviction_policy' config should override the default value" + ) assert new_effective_config["compaction_target_size"] == 1048576 assert new_effective_config["compaction_period"] == "20s" assert new_effective_config["compaction_threshold"] == 10 @@ -134,22 +135,22 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): assert updated_specific_config["checkpoint_distance"] == 15000 assert updated_specific_config["gc_period"] == "1m 20s" assert updated_specific_config["compaction_period"] == "1m 20s" - assert len(updated_specific_config) == len( - conf_update - ), f"No more specific properties were expected, but got: {updated_specific_config}" + assert len(updated_specific_config) == len(conf_update), ( + f"No more specific properties were expected, but got: 
{updated_specific_config}" + ) updated_effective_config = updated_tenant_config.effective_config - assert ( - updated_effective_config["checkpoint_distance"] == 15000 - ), "Specific 'checkpoint_distance' config should override the default value" - assert ( - updated_effective_config["gc_period"] == "1m 20s" - ), "Specific 'gc_period' config should override the default value" - assert ( - updated_effective_config["compaction_period"] == "1m 20s" - ), "Specific 'compaction_period' config should override the default value" - assert ( - updated_effective_config["evictions_low_residence_duration_metric_threshold"] == "23h" - ), "Should override default value" + assert updated_effective_config["checkpoint_distance"] == 15000, ( + "Specific 'checkpoint_distance' config should override the default value" + ) + assert updated_effective_config["gc_period"] == "1m 20s", ( + "Specific 'gc_period' config should override the default value" + ) + assert updated_effective_config["compaction_period"] == "1m 20s", ( + "Specific 'compaction_period' config should override the default value" + ) + assert updated_effective_config["evictions_low_residence_duration_metric_threshold"] == "23h", ( + "Should override default value" + ) assert updated_effective_config["eviction_policy"] == { "kind": "LayerAccessThreshold", "period": "1m 20s", @@ -167,9 +168,9 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): env.pageserver.start() restarted_tenant_config = http_client.tenant_config(tenant_id=tenant) - assert ( - restarted_tenant_config == updated_tenant_config - ), "Updated config should not change after the restart" + assert restarted_tenant_config == updated_tenant_config, ( + "Updated config should not change after the restart" + ) # update the config with very short config and make sure no trailing chars are left from previous config final_conf = { @@ -180,13 +181,13 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): final_tenant_config = 
http_client.tenant_config(tenant_id=tenant) final_specific_config = final_tenant_config.tenant_specific_overrides assert final_specific_config["pitr_interval"] == "1m" - assert len(final_specific_config) == len( - final_conf - ), f"No more specific properties were expected, but got: {final_specific_config}" + assert len(final_specific_config) == len(final_conf), ( + f"No more specific properties were expected, but got: {final_specific_config}" + ) final_effective_config = final_tenant_config.effective_config - assert ( - final_effective_config["pitr_interval"] == "1m" - ), "Specific 'pitr_interval' config should override the default value" + assert final_effective_config["pitr_interval"] == "1m", ( + "Specific 'pitr_interval' config should override the default value" + ) assert final_effective_config["checkpoint_distance"] == 10000 assert final_effective_config["compaction_target_size"] == 1048576 assert final_effective_config["compaction_period"] == "20s" @@ -207,9 +208,9 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): env.pageserver.start() restarted_final_tenant_config = http_client.tenant_config(tenant_id=tenant) - assert ( - restarted_final_tenant_config == final_tenant_config - ), "Updated config should not change after the restart" + assert restarted_final_tenant_config == final_tenant_config, ( + "Updated config should not change after the restart" + ) def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): @@ -299,9 +300,9 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( tenant_id, {"evictions_low_residence_duration_metric_threshold": default_value} ) updated_metric = get_metric() - assert int(updated_metric.value) == int( - metric.value - ), "metric is unchanged when setting same value" + assert int(updated_metric.value) == int(metric.value), ( + "metric is unchanged when setting same value" + ) env.config_tenant(tenant_id, {"evictions_low_residence_duration_metric_threshold": "2day"}) 
metric = get_metric() diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 3720f653c5..8379908631 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -3,6 +3,7 @@ from __future__ import annotations import json from concurrent.futures import ThreadPoolExecutor from threading import Thread +from typing import TYPE_CHECKING import pytest from fixtures.common_types import Lsn, TenantId, TimelineId @@ -23,9 +24,11 @@ from fixtures.remote_storage import RemoteStorageKind, s3_storage from fixtures.utils import run_pg_bench_small, wait_until from fixtures.workload import Workload from requests.exceptions import ReadTimeout -from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response +if TYPE_CHECKING: + from werkzeug.wrappers.request import Request + def error_tolerant_delete(ps_http, tenant_id): """ diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 3f21dc895a..7f80a9bcfd 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -5,16 +5,12 @@ import random import time from enum import StrEnum from threading import Thread +from typing import TYPE_CHECKING import asyncpg import pytest from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - Endpoint, - NeonEnv, - NeonEnvBuilder, -) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pageserver.utils import ( wait_for_last_record_lsn, @@ -25,7 +21,14 @@ from fixtures.remote_storage import ( RemoteStorageKind, ) from fixtures.utils import query_scalar, wait_until -from prometheus_client.samples import Sample + +if TYPE_CHECKING: + from fixtures.neon_fixtures import ( + Endpoint, + NeonEnv, + NeonEnvBuilder, + ) + from prometheus_client.samples import Sample # In 
tests that overlap endpoint activity with tenant attach/detach, there are # a variety of warnings that the page service may emit when it cannot acquire @@ -434,9 +437,9 @@ def test_detach_while_activating( tenants_after_detach = [tenant["id"] for tenant in pageserver_http.tenant_list()] assert tenant_id not in tenants_after_detach, "Detached tenant should be missing" - assert len(tenants_after_detach) + 1 == len( - tenants_before_detach - ), "Only ignored tenant should be missing" + assert len(tenants_after_detach) + 1 == len(tenants_before_detach), ( + "Only ignored tenant should be missing" + ) # Subsequently attaching it again should still work pageserver_http.configure_failpoints([("attach-before-activate-sleep", "off")]) @@ -478,9 +481,9 @@ def insert_test_data( def ensure_test_data(data_id: int, data: str, endpoint: Endpoint): with endpoint.cursor() as cur: - assert ( - query_scalar(cur, f"SELECT secret FROM test WHERE id = {data_id};") == data - ), "Should have timeline data back" + assert query_scalar(cur, f"SELECT secret FROM test WHERE id = {data_id};") == data, ( + "Should have timeline data back" + ) def test_metrics_while_ignoring_broken_tenant_and_reloading( diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index df53a98e92..f395aa665d 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -5,14 +5,11 @@ import shutil import threading import time from contextlib import closing, contextmanager -from pathlib import Path from typing import TYPE_CHECKING import pytest from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import Endpoint, NeonEnvBuilder, NeonPageserver -from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( assert_tenant_state, wait_for_last_record_lsn, @@ -28,8 +25,12 @@ from fixtures.utils import ( ) if 
TYPE_CHECKING: + from pathlib import Path from typing import Any + from fixtures.neon_fixtures import Endpoint, NeonEnvBuilder, NeonPageserver + from fixtures.pageserver.http import PageserverHttpClient + def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float): assert abs(a - b) / a < margin_ratio, abs(a - b) / a @@ -158,9 +159,9 @@ def switch_pg_to_new_pageserver( timeline_to_detach_local_path = origin_ps.timeline_dir(tenant_id, timeline_id) files_before_detach = os.listdir(timeline_to_detach_local_path) - assert ( - len(files_before_detach) >= 1 - ), f"Regular timeline {timeline_to_detach_local_path} should have at least one layer file, but got {files_before_detach}" + assert len(files_before_detach) >= 1, ( + f"Regular timeline {timeline_to_detach_local_path} should have at least one layer file, but got {files_before_detach}" + ) return timeline_to_detach_local_path @@ -175,9 +176,9 @@ def post_migration_check(endpoint: Endpoint, sum_before_migration: int, old_loca cur.execute("SELECT sum(key) FROM t") assert cur.fetchone() == (sum_before_migration + 1500500,) - assert not os.path.exists( - old_local_path - ), f"After detach, local timeline dir {old_local_path} should be removed" + assert not os.path.exists(old_local_path), ( + f"After detach, local timeline dir {old_local_path} should be removed" + ) @pytest.mark.parametrize( diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 81e727a3aa..190dd914ee 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -1,10 +1,9 @@ from __future__ import annotations from concurrent.futures import ThreadPoolExecutor -from pathlib import Path +from typing import TYPE_CHECKING import pytest -from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -19,9 +18,14 @@ from fixtures.pageserver.utils import ( timeline_delete_wait_completed, 
wait_until_tenant_active, ) -from fixtures.pg_version import PgVersion from fixtures.utils import skip_in_debug_build, wait_until +if TYPE_CHECKING: + from pathlib import Path + + from fixtures.common_types import Lsn, TenantId, TimelineId + from fixtures.pg_version import PgVersion + def test_empty_tenant_size(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_configs() @@ -578,9 +582,9 @@ def test_get_tenant_size_with_multiple_branches( wait_for_last_flush_lsn(env, second_branch_endpoint, tenant_id, second_branch_timeline_id) size_after_thinning_branch = http_client.tenant_size(tenant_id) - assert ( - size_after_thinning_branch > size_after_growing_second_branch - ), "tenant_size should grow with dropped tables and full vacuum" + assert size_after_thinning_branch > size_after_growing_second_branch, ( + "tenant_size should grow with dropped tables and full vacuum" + ) first_branch_endpoint.stop_and_destroy() second_branch_endpoint.stop_and_destroy() @@ -753,6 +757,47 @@ def test_lsn_lease_size(neon_env_builder: NeonEnvBuilder, test_output_dir: Path, env.stop(immediate=True) +def test_lsn_lease_storcon(neon_env_builder: NeonEnvBuilder): + conf = { + "pitr_interval": "0s", + "gc_period": "0s", + "compaction_period": "0s", + } + env = neon_env_builder.init_start(initial_tenant_conf=conf) + with env.endpoints.create_start( + "main", + ) as ep: + with ep.cursor() as cur: + cur.execute( + "CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + last_flush_lsn = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + env.storage_controller.pageserver_api().timeline_lsn_lease( + env.initial_tenant, env.initial_timeline, last_flush_lsn + ) + env.storage_controller.tenant_shard_split(env.initial_tenant, 8) + env.storage_controller.reconcile_until_idle(timeout_secs=120) + # TODO: do we preserve LSN leases across shard splits? 
+ env.storage_controller.pageserver_api().timeline_lsn_lease( + env.initial_tenant, env.initial_timeline, last_flush_lsn + ) + + +def test_mark_invisible_storcon(neon_env_builder: NeonEnvBuilder): + conf = { + "pitr_interval": "0s", + "gc_period": "0s", + "compaction_period": "0s", + } + env = neon_env_builder.init_start(initial_tenant_conf=conf) + env.storage_controller.pageserver_api().timeline_mark_invisible( + env.initial_tenant, env.initial_timeline + ) + env.storage_controller.pageserver_api().timeline_mark_invisible( + env.initial_tenant, env.initial_timeline, True + ) + + def insert_with_action( env: NeonEnv, tenant: TenantId, diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index 4c26b64d22..814ebc14f5 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -1,8 +1,9 @@ from __future__ import annotations +from typing import TYPE_CHECKING + from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.pageserver.utils import ( assert_tenant_state, timeline_delete_wait_completed, @@ -10,6 +11,9 @@ from fixtures.pageserver.utils import ( ) from fixtures.utils import wait_until +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder + def get_only_element(l): # noqa: E741 assert len(l) == 1 diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index afe444f227..c613a79374 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -8,6 +8,7 @@ from contextlib import closing from datetime import datetime from itertools import chain from pathlib import Path +from typing import TYPE_CHECKING import pytest import requests @@ -29,7 +30,9 @@ from fixtures.pageserver.utils import timeline_delete_wait_completed, wait_until from fixtures.pg_version import PgVersion from fixtures.remote_storage import 
RemoteStorageKind from fixtures.utils import wait_until -from prometheus_client.samples import Sample + +if TYPE_CHECKING: + from prometheus_client.samples import Sample def test_tenant_creation_fails(neon_simple_env: NeonEnv): @@ -313,9 +316,9 @@ def test_pageserver_with_empty_tenants(neon_env_builder: NeonEnvBuilder): files_in_timelines_dir = sum( 1 for _p in Path.iterdir(env.pageserver.timeline_dir(tenant_with_empty_timelines)) ) - assert ( - files_in_timelines_dir == 0 - ), f"Tenant {tenant_with_empty_timelines} should have an empty timelines/ directory" + assert files_in_timelines_dir == 0, ( + f"Tenant {tenant_with_empty_timelines} should have an empty timelines/ directory" + ) # Trigger timeline re-initialization after pageserver restart env.endpoints.stop_all() @@ -335,14 +338,14 @@ def test_pageserver_with_empty_tenants(neon_env_builder: NeonEnvBuilder): tenants = client.tenant_list() [loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines)] - assert ( - loaded_tenant["state"]["slug"] == "Active" - ), "Tenant {tenant_with_empty_timelines} with empty timelines dir should be active and ready for timeline creation" + assert loaded_tenant["state"]["slug"] == "Active", ( + "Tenant {tenant_with_empty_timelines} with empty timelines dir should be active and ready for timeline creation" + ) loaded_tenant_status = client.tenant_status(tenant_with_empty_timelines) - assert ( - loaded_tenant_status["state"]["slug"] == "Active" - ), f"Tenant {tenant_with_empty_timelines} without timelines dir should be active" + assert loaded_tenant_status["state"]["slug"] == "Active", ( + f"Tenant {tenant_with_empty_timelines} without timelines dir should be active" + ) time.sleep(1) # to allow metrics propagation @@ -357,9 +360,9 @@ def test_pageserver_with_empty_tenants(neon_env_builder: NeonEnvBuilder): ).value ) - assert ( - tenant_active_count == 1 - ), f"Tenant {tenant_with_empty_timelines} should have metric as active" + assert tenant_active_count 
== 1, ( + f"Tenant {tenant_with_empty_timelines} should have metric as active" + ) def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 6b27c41d1c..47056e2786 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -181,13 +181,13 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder): wait_until(lambda: assert_tenant_state(client, tenant_id, "Active")) restored_timelines = client.timeline_list(tenant_id) - assert ( - len(restored_timelines) == 1 - ), f"Tenant {tenant_id} should have its timeline reattached after its layer is downloaded from the remote storage" + assert len(restored_timelines) == 1, ( + f"Tenant {tenant_id} should have its timeline reattached after its layer is downloaded from the remote storage" + ) restored_timeline = restored_timelines[0] - assert ( - restored_timeline["timeline_id"] == str(timeline_id) - ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage" + assert restored_timeline["timeline_id"] == str(timeline_id), ( + f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage" + ) # Check that we had to retry the downloads assert env.pageserver.log_contains(".*download .* succeeded after 1 retries.*") @@ -235,9 +235,9 @@ def test_tenant_redownloads_truncated_file_on_startup( os.truncate(path, 0) local_layer_truncated = (path, correct_size) break - assert ( - local_layer_truncated is not None - ), f"Found no local layer files to delete in directory {timeline_dir}" + assert local_layer_truncated is not None, ( + f"Found no local layer files to delete in directory {timeline_dir}" + ) (path, expected_size) = local_layer_truncated @@ -256,13 +256,13 @@ def test_tenant_redownloads_truncated_file_on_startup( 
wait_until(lambda: assert_tenant_state(client, tenant_id, "Active")) restored_timelines = client.timeline_list(tenant_id) - assert ( - len(restored_timelines) == 1 - ), f"Tenant {tenant_id} should have its timeline reattached after its layer is downloaded from the remote storage" + assert len(restored_timelines) == 1, ( + f"Tenant {tenant_id} should have its timeline reattached after its layer is downloaded from the remote storage" + ) retored_timeline = restored_timelines[0] - assert ( - retored_timeline["timeline_id"] == str(timeline_id) - ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage" + assert retored_timeline["timeline_id"] == str(timeline_id), ( + f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage" + ) # Request non-incremental logical size. Calculating it needs the layer file that # we corrupted, forcing it to be redownloaded. @@ -277,9 +277,9 @@ def test_tenant_redownloads_truncated_file_on_startup( # if the upload ever was ongoing, this check would be racy, but at least one # extra http request has been made in between so assume it's enough delay - assert ( - os.stat(remote_layer_path).st_size == expected_size - ), "truncated file should not had been uploaded around re-download" + assert os.stat(remote_layer_path).st_size == expected_size, ( + "truncated file should not had been uploaded around re-download" + ) endpoint = env.endpoints.create_start("main") @@ -295,6 +295,6 @@ def test_tenant_redownloads_truncated_file_on_startup( # re-uploaded truncated. this is a rather bogus check given the current # implementation, but it's critical it doesn't happen so wasting a few # lines of python to do this. 
- assert ( - os.stat(remote_layer_path).st_size == expected_size - ), "truncated file should not had been uploaded after next checkpoint" + assert os.stat(remote_layer_path).st_size == expected_size, ( + "truncated file should not had been uploaded after next checkpoint" + ) diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index c87b520366..8d3d7b623c 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -10,12 +10,12 @@ from fixtures.neon_fixtures import ( PgBin, last_flush_lsn_upload, ) -from fixtures.pageserver.http import LayerMapInfo from fixtures.remote_storage import RemoteStorageKind -from pytest_httpserver import HTTPServer if TYPE_CHECKING: from fixtures.httpserver import ListenAddress + from fixtures.pageserver.http import LayerMapInfo + from pytest_httpserver import HTTPServer # NB: basic config change tests are in test_tenant_conf.py @@ -54,7 +54,12 @@ def test_threshold_based_eviction( ps_http = env.pageserver.http_client() vps_http = env.storage_controller.pageserver_api() - assert vps_http.tenant_config(tenant_id).effective_config["eviction_policy"] is None + # check config on pageserver, set via storcon; https://github.com/neondatabase/neon/issues/9621 + + assert ps_http.tenant_config(tenant_id).tenant_specific_overrides == {} + assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { + "kind": "NoEviction" + } eviction_threshold = 10 eviction_period = 2 @@ -68,7 +73,7 @@ def test_threshold_based_eviction( }, }, ) - assert vps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { + assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { "kind": "LayerAccessThreshold", "threshold": f"{eviction_threshold}s", "period": f"{eviction_period}s", @@ -77,7 +82,7 @@ def test_threshold_based_eviction( # restart because changing tenant config is not 
instant env.pageserver.restart() - assert vps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { + assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { "kind": "LayerAccessThreshold", "threshold": f"{eviction_threshold}s", "period": f"{eviction_period}s", @@ -176,14 +181,14 @@ def test_threshold_based_eviction( # TODO: can we be more precise here? E.g., require we're stable _within_ X*threshold, # instead of what we do here, i.e., stable _for at least_ X*threshold toward the end of the observation window - assert ( - stable_for > consider_stable_when_no_change_for_seconds - ), "layer residencies did not become stable within the observation window" + assert stable_for > consider_stable_when_no_change_for_seconds, ( + "layer residencies did not become stable within the observation window" + ) post = map_info_changes[-1][1].by_local_and_remote() assert len(post.remote_layers) > 0, "some layers should be evicted once it's stabilized" assert len(post.local_layers) > 0, "the imitate accesses should keep some layers resident" - assert ( - env.pageserver.log_contains(metrics_refused_log_line) is not None - ), "ensure the metrics collection worker ran" + assert env.pageserver.log_contains(metrics_refused_log_line) is not None, ( + "ensure the metrics collection worker ran" + ) diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 2bad0bb671..17abe1ea75 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -4,6 +4,7 @@ import json import random import threading import time +from typing import TYPE_CHECKING import pytest import requests @@ -23,11 +24,13 @@ from fixtures.pageserver.utils import ( from fixtures.pg_version import PgVersion from fixtures.remote_storage import S3Storage, s3_storage from fixtures.utils import run_only_on_default_postgres, skip_in_debug_build, wait_until -from mypy_boto3_s3.type_defs 
import ( - ObjectTypeDef, -) from psycopg2.errors import IoError, UndefinedTable +if TYPE_CHECKING: + from mypy_boto3_s3.type_defs import ( + ObjectTypeDef, + ) + @pytest.mark.parametrize("shard_count", [0, 4]) def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int): @@ -292,7 +295,7 @@ def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder, delete_timel conf={ "gc_period": "0s", "compaction_period": "0s", - "checkpoint_distance": f"{1024 ** 2}", + "checkpoint_distance": f"{1024**2}", } ) @@ -898,7 +901,7 @@ def test_timeline_offload_generations(neon_env_builder: NeonEnvBuilder): conf={ "gc_period": "0s", "compaction_period": "0s", - "checkpoint_distance": f"{1024 ** 2}", + "checkpoint_distance": f"{1024**2}", } ) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index fbece68367..f7629edf7a 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -167,7 +167,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints( initial_tenant_conf={ "gc_period": "0s", "compaction_period": "0s", - "checkpoint_distance": f"{1024 ** 2}", + "checkpoint_distance": f"{1024**2}", "image_creation_threshold": "100", } ) @@ -363,9 +363,9 @@ def test_timeline_resurrection_on_attach( wait_until_tenant_active(ps_http, tenant_id=tenant_id) timelines = ps_http.timeline_list(tenant_id=tenant_id) - assert {TimelineId(tl["timeline_id"]) for tl in timelines} == { - main_timeline_id - }, "the deleted timeline should not have been resurrected" + assert {TimelineId(tl["timeline_id"]) for tl in timelines} == {main_timeline_id}, ( + "the deleted timeline should not have been resurrected" + ) assert all([tl["state"] == "Active" for tl in timelines]) @@ -423,9 +423,9 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild wait_timeline_detail_404(ps_http, env.initial_tenant, leaf_timeline_id) - assert ( - not 
leaf_timeline_path.exists() - ), "timeline load procedure should have resumed the deletion interrupted by the failpoint" + assert not leaf_timeline_path.exists(), ( + "timeline load procedure should have resumed the deletion interrupted by the failpoint" + ) timelines = ps_http.timeline_list(env.initial_tenant) assert {TimelineId(tl["timeline_id"]) for tl in timelines} == { intermediate_timeline_id, @@ -705,7 +705,7 @@ def test_delete_orphaned_objects( initial_tenant_conf={ "gc_period": "0s", "compaction_period": "0s", - "checkpoint_distance": f"{1024 ** 2}", + "checkpoint_distance": f"{1024**2}", "image_creation_threshold": "100", } ) diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 96664f2b8d..a71652af8a 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -343,7 +343,8 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline) -def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("snapshots_archived", ["archived", "normal"]) +def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder, snapshots_archived: str): """ Test the v2 behavior of ancestor detach. 
@@ -385,6 +386,11 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder): ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(0, 8191) g(i);") + branchpoint_y = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + client.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + + ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(0, 8191) g(i);") + branchpoint_x = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) client.timeline_checkpoint(env.initial_tenant, env.initial_timeline) @@ -395,6 +401,10 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder): "earlier", ancestor_branch_name="main", ancestor_start_lsn=branchpoint_pipe ) + snapshot_branchpoint_old = env.create_branch( + "snapshot_branchpoint_old", ancestor_branch_name="main", ancestor_start_lsn=branchpoint_y + ) + snapshot_branchpoint = env.create_branch( "snapshot_branchpoint", ancestor_branch_name="main", ancestor_start_lsn=branchpoint_x ) @@ -407,19 +417,32 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder): after = env.create_branch("after", ancestor_branch_name="main", ancestor_start_lsn=None) + if snapshots_archived == "archived": + # archive the previous snapshot branchpoint + client.timeline_archival_config( + env.initial_tenant, snapshot_branchpoint_old, TimelineArchivalState.ARCHIVED + ) + all_reparented = client.detach_ancestor( env.initial_tenant, branch_to_detach, detach_behavior="v2" ) assert set(all_reparented) == set() + if snapshots_archived == "archived": + # restore the branchpoint so that we can query from the endpoint + client.timeline_archival_config( + env.initial_tenant, snapshot_branchpoint_old, TimelineArchivalState.UNARCHIVED + ) + env.pageserver.quiesce_tenants() # checking the ancestor after is much faster than waiting for the endpoint not start expected_result = [ - ("main", env.initial_timeline, None, 16384, 1), - ("after", 
after, env.initial_timeline, 16384, 1), - ("snapshot_branchpoint", snapshot_branchpoint, env.initial_timeline, 8192, 1), - ("branch_to_detach", branch_to_detach, None, 8192, 1), + ("main", env.initial_timeline, None, 24576, 1), + ("after", after, env.initial_timeline, 24576, 1), + ("snapshot_branchpoint_old", snapshot_branchpoint_old, env.initial_timeline, 8192, 1), + ("snapshot_branchpoint", snapshot_branchpoint, env.initial_timeline, 16384, 1), + ("branch_to_detach", branch_to_detach, None, 16384, 1), ("earlier", earlier, env.initial_timeline, 0, 1), ] @@ -431,9 +454,9 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder): if expected_ancestor is None: assert ancestor_timeline_id is None else: - assert ( - TimelineId(ancestor_timeline_id) == expected_ancestor - ), f"when checking branch {branch_name}, mapping={expected_result}" + assert TimelineId(ancestor_timeline_id) == expected_ancestor, ( + f"when checking branch {branch_name}, mapping={expected_result}" + ) index_part = env.pageserver_remote_storage.index_content( env.initial_tenant, queried_timeline @@ -1301,9 +1324,9 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv offset, ) if mode == "delete_reparentable_timeline": - assert ( - retried is None - ), "detaching should had converged after both nodes saw the deletion" + assert retried is None, ( + "detaching should had converged after both nodes saw the deletion" + ) elif mode == "create_reparentable_timeline": assert retried is not None, "detaching should not have converged" _, offset = retried @@ -1531,9 +1554,9 @@ def test_retried_detach_ancestor_after_failed_reparenting(neon_env_builder: Neon # first round -- do more checking to make sure the gc gets paused try_detach() - assert ( - http.timeline_detail(env.initial_tenant, detached)["ancestor_timeline_id"] is None - ), "first round should had detached 'detached'" + assert http.timeline_detail(env.initial_tenant, detached)["ancestor_timeline_id"] is 
None, ( + "first round should had detached 'detached'" + ) reparented, not_reparented = reparenting_progress(timelines) assert reparented == 1 @@ -1569,9 +1592,9 @@ def test_retried_detach_ancestor_after_failed_reparenting(neon_env_builder: Neon for _ in range(2): try_detach() - assert ( - http.timeline_detail(env.initial_tenant, detached)["ancestor_timeline_id"] is None - ), "first round should had detached 'detached'" + assert http.timeline_detail(env.initial_tenant, detached)["ancestor_timeline_id"] is None, ( + "first round should had detached 'detached'" + ) reparented, not_reparented = reparenting_progress(timelines) assert reparented == reparented_before + 1 @@ -1611,9 +1634,9 @@ def test_retried_detach_ancestor_after_failed_reparenting(neon_env_builder: Neon assert reparented == len(timelines) time.sleep(2) - assert ( - env.pageserver.log_contains(".*: attach finished, activating", offset) is None - ), "there should be no restart with the final detach_ancestor as it only completed" + assert env.pageserver.log_contains(".*: attach finished, activating", offset) is None, ( + "there should be no restart with the final detach_ancestor as it only completed" + ) # gc is unblocked env.pageserver.assert_log_contains(".* gc_loop.*: 5 timelines need GC", offset) @@ -1702,7 +1725,7 @@ def test_pageserver_compaction_detach_ancestor_smoke(neon_env_builder: NeonEnvBu "compaction_period": "5s", # No PiTR interval and small GC horizon "pitr_interval": "0s", - "gc_horizon": f"{1024 ** 2}", + "gc_horizon": f"{1024**2}", "lsn_lease_length": "0s", # Small checkpoint distance to create many layers "checkpoint_distance": 1024**2, @@ -1745,6 +1768,87 @@ def test_pageserver_compaction_detach_ancestor_smoke(neon_env_builder: NeonEnvBu workload_child.validate(env.pageserver.id) +def test_timeline_detach_with_aux_files_with_detach_v1( + neon_env_builder: NeonEnvBuilder, +): + """ + Validate that "branches do not inherit their parent" is invariant over detach_ancestor. 
+ + Branches hide parent branch aux files etc by stopping lookup of non-inherited keyspace at the parent-child boundary. + We had a bug where detach_ancestor running on a child branch would copy aux files key range from child to parent, + thereby making parent aux files reappear. + """ + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "1s", + "lsn_lease_length": "0s", + } + ) + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + http = env.pageserver.http_client() + + endpoint = env.endpoints.create_start("main", tenant_id=env.initial_tenant) + lsn0 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) + endpoint.safe_psql( + "SELECT pg_create_logical_replication_slot('test_slot_parent_1', 'pgoutput')" + ) + lsn1 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) + endpoint.safe_psql( + "SELECT pg_create_logical_replication_slot('test_slot_parent_2', 'pgoutput')" + ) + lsn2 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) + assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn0).keys()) == set( + [] + ) + assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn1).keys()) == set( + ["pg_replslot/test_slot_parent_1/state"] + ) + assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn2).keys()) == set( + ["pg_replslot/test_slot_parent_1/state", "pg_replslot/test_slot_parent_2/state"] + ) + + # Restore at LSN1 + branch_timeline_id = env.create_branch("restore", env.initial_tenant, "main", lsn1) + endpoint2 = env.endpoints.create_start("restore", tenant_id=env.initial_tenant) + assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([]) + + # Add a new slot file to the restore branch (This won't happen in reality because cplane immediately detaches the branch on restore, + # but we want to ensure that aux files on the detached branch are 
NOT inherited during ancestor detach. We could change the behavior + # in the future. + # TL;DR we should NEVER automatically detach a branch as a background optimization for those tenants that already used the restore + # feature before branch detach was introduced because it will clean up the aux files and stop logical replication. + endpoint2.safe_psql( + "SELECT pg_create_logical_replication_slot('test_slot_restore', 'pgoutput')" + ) + lsn3 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id) + assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([]) + assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn3).keys()) == set( + ["pg_replslot/test_slot_restore/state"] + ) + + print("lsn0=", lsn0) + print("lsn1=", lsn1) + print("lsn2=", lsn2) + print("lsn3=", lsn3) + # Detach the restore branch so that main doesn't have any child branches. + all_reparented = http.detach_ancestor( + env.initial_tenant, branch_timeline_id, detach_behavior="v1" + ) + assert all_reparented == set([]) + + # We need to ensure all safekeeper data are ingested before checking aux files: the API does not wait for LSN. + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id) + assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn2).keys()) == set( + ["pg_replslot/test_slot_parent_1/state", "pg_replslot/test_slot_parent_2/state"] + ), "main branch unaffected" + assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn3).keys()) == set( + ["pg_replslot/test_slot_restore/state"] + ) + assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([]) + + # TODO: # - branch near existing L1 boundary, image layers? # - investigate: why are layers started at uneven lsn? not just after branching, but in general. 
diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py index 7605e1f758..9a710f5b80 100644 --- a/test_runner/regress/test_timeline_gc_blocking.py +++ b/test_runner/regress/test_timeline_gc_blocking.py @@ -3,16 +3,19 @@ from __future__ import annotations import time from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass +from typing import TYPE_CHECKING import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - LogCursor, - NeonEnvBuilder, - NeonPageserver, -) from fixtures.pageserver.utils import wait_timeline_detail_404 +if TYPE_CHECKING: + from fixtures.neon_fixtures import ( + LogCursor, + NeonEnvBuilder, + NeonPageserver, + ) + @pytest.mark.parametrize("sharded", [True, False]) def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool): diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index e2fdacdbfc..7247027165 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -6,7 +6,7 @@ import random import time from collections import defaultdict from contextlib import closing -from pathlib import Path +from typing import TYPE_CHECKING import psycopg2.errors import psycopg2.extras @@ -22,18 +22,22 @@ from fixtures.neon_fixtures import ( VanillaPostgres, wait_for_last_flush_lsn, ) -from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( assert_tenant_state, timeline_delete_wait_completed, wait_for_upload_queue_empty, wait_until_tenant_active, ) -from fixtures.pg_version import PgVersion -from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import RemoteStorageKind from fixtures.utils import get_timeline_dir_size, wait_until +if TYPE_CHECKING: + from pathlib import Path + + from fixtures.pageserver.http import PageserverHttpClient + from fixtures.pg_version 
import PgVersion + from fixtures.port_distributor import PortDistributor + def test_timeline_size(neon_simple_env: NeonEnv): env = neon_simple_env @@ -310,9 +314,9 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): new_res = client.timeline_detail( env.initial_tenant, new_timeline_id, include_non_incremental_logical_size=True ) - assert ( - new_res["current_logical_size"] == new_res["current_logical_size_non_incremental"] - ), "after the WAL is streamed, current_logical_size is expected to be calculated and to be equal its non-incremental value" + assert new_res["current_logical_size"] == new_res["current_logical_size_non_incremental"], ( + "after the WAL is streamed, current_logical_size is expected to be calculated and to be equal its non-incremental value" + ) @pytest.mark.parametrize("deletion_method", ["tenant_detach", "timeline_delete"]) diff --git a/test_runner/regress/test_truncate.py b/test_runner/regress/test_truncate.py index 946dab2676..5c494a1368 100644 --- a/test_runner/regress/test_truncate.py +++ b/test_runner/regress/test_truncate.py @@ -1,8 +1,10 @@ from __future__ import annotations import time +from typing import TYPE_CHECKING -from fixtures.neon_fixtures import NeonEnvBuilder +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder # @@ -34,10 +36,10 @@ def test_truncate(neon_env_builder: NeonEnvBuilder, zenbenchmark): cur.execute(f"insert into t1 values (generate_series(1,{n_records}))") cur.execute("vacuum t1") for _ in range(n_iter): - cur.execute(f"delete from t1 where x>{n_records//2}") + cur.execute(f"delete from t1 where x>{n_records // 2}") cur.execute("vacuum t1") time.sleep(1) # let pageserver a chance to create image layers - cur.execute(f"insert into t1 values (generate_series({n_records//2+1}, {n_records}))") + cur.execute(f"insert into t1 values (generate_series({n_records // 2 + 1}, {n_records}))") cur.execute("vacuum t1") time.sleep(1) # let pageserver a chance to create image layers diff --git 
a/test_runner/regress/test_twophase.py b/test_runner/regress/test_twophase.py index e37e8dd3e8..169c966fa5 100644 --- a/test_runner/regress/test_twophase.py +++ b/test_runner/regress/test_twophase.py @@ -1,7 +1,7 @@ from __future__ import annotations import os -from pathlib import Path +from typing import TYPE_CHECKING from fixtures.common_types import TimelineId from fixtures.log_helper import log @@ -13,6 +13,9 @@ from fixtures.neon_fixtures import ( wait_for_wal_insert_lsn, ) +if TYPE_CHECKING: + from pathlib import Path + # # Test branching, when a transaction is in prepared state diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index b30c02e0e4..85fa4c821e 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -163,9 +163,9 @@ def test_vm_bit_clear_on_heap_lock_whitebox(neon_env_builder: NeonEnvBuilder): relfrozenxid = int( query_scalar(cur, "SELECT relfrozenxid FROM pg_class WHERE relname='vmtest_lock'") ) - assert ( - relfrozenxid > xid - ), f"Inserted rows were not frozen. This can be caused by concurrent activity in the database. (XID {xid}, relfrozenxid {relfrozenxid}" + assert relfrozenxid > xid, ( + f"Inserted rows were not frozen. This can be caused by concurrent activity in the database. (XID {xid}, relfrozenxid {relfrozenxid}" + ) # Lock a row. This clears the all-frozen VM bit for that page. cur.execute("BEGIN") @@ -324,7 +324,7 @@ def test_check_visibility_map(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): # Run pgbench in 4 different databases, to exercise different shards. dbnames = [f"pgbench{i}" for i in range(PGBENCH_RUNS)] for i, dbname in enumerate(dbnames): - log.info(f"pgbench run {i+1}/{PGBENCH_RUNS}") + log.info(f"pgbench run {i + 1}/{PGBENCH_RUNS}") endpoint.safe_psql(f"create database {dbname}") connstr = endpoint.connstr(dbname=dbname) # Initialize the data set, but don't vacuum yet. 
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 89c4a96499..e3d39f9315 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -39,7 +39,6 @@ from fixtures.pageserver.utils import ( timeline_delete_wait_completed, ) from fixtures.pg_version import PgVersion -from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import ( RemoteStorageKind, default_remote_storage, @@ -72,6 +71,8 @@ from fixtures.utils import ( if TYPE_CHECKING: from typing import Any, Self + from fixtures.port_distributor import PortDistributor + @dataclass class TimelineMetrics: @@ -138,20 +139,24 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns, strict=False): # Invariant. May be < when transaction is in progress. - assert ( - commit_lsn <= flush_lsn - ), f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" + assert commit_lsn <= flush_lsn, ( + f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" + ) # We only call collect_metrics() after a transaction is confirmed by # the compute node, which only happens after a consensus of safekeepers # has confirmed the transaction. We assume majority consensus here. 
assert ( 2 * sum(m.last_record_lsn <= lsn for lsn in m.flush_lsns) > neon_env_builder.num_safekeepers - ), f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" + ), ( + f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" + ) assert ( 2 * sum(m.last_record_lsn <= lsn for lsn in m.commit_lsns) > neon_env_builder.num_safekeepers - ), f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" + ), ( + f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" + ) timeline_metrics.append(m) log.info(f"{message}: {timeline_metrics}") return timeline_metrics @@ -1112,16 +1117,16 @@ def cmp_sk_wal(sks: list[Safekeeper], tenant_id: TenantId, timeline_id: Timeline statuses = [sk_http_cli.timeline_status(tenant_id, timeline_id) for sk_http_cli in sk_http_clis] term_flush_lsns = [(s.last_log_term, s.flush_lsn) for s in statuses] for tfl, sk in zip(term_flush_lsns[1:], sks[1:], strict=False): - assert ( - term_flush_lsns[0] == tfl - ), f"(last_log_term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}" + assert term_flush_lsns[0] == tfl, ( + f"(last_log_term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}" + ) # check that WALs are identic. 
segs = [sk.list_segments(tenant_id, timeline_id) for sk in sks] for cmp_segs, sk in zip(segs[1:], sks[1:], strict=False): - assert ( - segs[0] == cmp_segs - ), f"lists of segments on sks {sks[0].id} and {sk.id} are not identic: {segs[0]} and {cmp_segs}" + assert segs[0] == cmp_segs, ( + f"lists of segments on sks {sks[0].id} and {sk.id} are not identic: {segs[0]} and {cmp_segs}" + ) log.info(f"comparing segs {segs[0]}") sk0 = sks[0] @@ -2418,7 +2423,7 @@ def test_s3_eviction( for j in range(n_timelines): detail = ps_client.timeline_detail(env.initial_tenant, timelines[j]) log.debug( - f'{branch_names[j]}: RCL={detail["remote_consistent_lsn"]}, LRL={detail["last_record_lsn"]}' + f"{branch_names[j]}: RCL={detail['remote_consistent_lsn']}, LRL={detail['last_record_lsn']}" ) i = random.randint(0, n_timelines - 1) diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 56539a0a08..b7c7478e78 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -4,9 +4,8 @@ import asyncio import random import time from dataclasses import dataclass -from pathlib import Path +from typing import TYPE_CHECKING -import asyncpg import pytest import toml from fixtures.common_types import Lsn, TenantId, TimelineId @@ -21,6 +20,11 @@ from fixtures.neon_fixtures import ( from fixtures.remote_storage import RemoteStorageKind from fixtures.utils import skip_in_debug_build +if TYPE_CHECKING: + from pathlib import Path + + import asyncpg + log = getLogger("root.safekeeper_async") @@ -692,7 +696,7 @@ async def run_race_conditions(env: NeonEnv, endpoint: Endpoint): expected_sum += i i += 1 - log.info(f"Executed {i-1} queries") + log.info(f"Executed {i - 1} queries") res = await conn.fetchval("SELECT sum(key) FROM t") assert res == expected_sum @@ -766,7 +770,7 @@ async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint, test_output_dir: Pat endpoint.start() conn = await 
endpoint.connect_async() - log.info(f"Executed {i-1} queries") + log.info(f"Executed {i - 1} queries") res = await conn.fetchval("SELECT sum(key) FROM t") assert res == expected_sum diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py index d22a900c59..0252b590cc 100644 --- a/test_runner/regress/test_wal_receiver.py +++ b/test_runner/regress/test_wal_receiver.py @@ -5,12 +5,13 @@ from typing import TYPE_CHECKING from fixtures.common_types import Lsn, TenantId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder from fixtures.utils import wait_until if TYPE_CHECKING: from typing import Any + from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder + # Checks that pageserver's walreceiver state is printed in the logs during WAL wait timeout. # Ensures that walreceiver does not run without any data inserted and only starts after the insertion. @@ -33,9 +34,9 @@ def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder): except Exception as e: exception_string = str(e) assert expected_timeout_error in exception_string, "Should time out during waiting for WAL" - assert ( - "WalReceiver status: Not active" in exception_string - ), "Walreceiver should not be active before any data writes" + assert "WalReceiver status: Not active" in exception_string, ( + "Walreceiver should not be active before any data writes" + ) insert_test_elements(env, tenant_id, start=0, count=1_000) try: @@ -43,9 +44,9 @@ def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder): except Exception as e: exception_string = str(e) assert expected_timeout_error in exception_string, "Should time out during waiting for WAL" - assert ( - "WalReceiver status: Not active" not in exception_string - ), "Should not be inactive anymore after INSERTs are made" + assert "WalReceiver status: Not active" not in exception_string, ( + "Should not be inactive anymore after INSERTs are made" + ) 
assert "WalReceiver status" in exception_string, "But still should have some other status" @@ -88,14 +89,14 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil trigger_wait_lsn_timeout(env, tenant_id) except Exception as e: exception_string = str(e) - assert ( - expected_timeout_error in exception_string - ), "Should time out during waiting for WAL" + assert expected_timeout_error in exception_string, ( + "Should time out during waiting for WAL" + ) for safekeeper in env.safekeepers: - assert ( - str(safekeeper.id) in exception_string - ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after WAL wait timeout" + assert str(safekeeper.id) in exception_string, ( + f"Should have safekeeper {safekeeper.id} printed in walreceiver state after WAL wait timeout" + ) wait_until(all_sks_in_wareceiver_state, timeout=30) @@ -110,19 +111,19 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil except Exception as e: # Strip out the part before stdout, as it contains full command with the list of all safekeepers exception_string = str(e).split("stdout", 1)[-1] - assert ( - expected_timeout_error in exception_string - ), "Should time out during waiting for WAL" + assert expected_timeout_error in exception_string, ( + "Should time out during waiting for WAL" + ) for safekeeper in env.safekeepers: if safekeeper.id == stopped_safekeeper_id: - assert ( - str(safekeeper.id) not in exception_string - ), f"Should not have stopped safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout" + assert str(safekeeper.id) not in exception_string, ( + f"Should not have stopped safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout" + ) else: - assert ( - str(safekeeper.id) in exception_string - ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout" + assert str(safekeeper.id) in exception_string, ( + f"Should have 
safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout" + ) wait_until(all_but_stopped_sks_in_wareceiver_state, timeout=30) diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index c8e51fde13..0bb63308bb 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -3,7 +3,7 @@ from __future__ import annotations import sys import tarfile import tempfile -from pathlib import Path +from typing import TYPE_CHECKING import pytest import zstandard @@ -19,11 +19,15 @@ from fixtures.pageserver.utils import ( remote_storage_delete_key, timeline_delete_wait_completed, ) -from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import LocalFsStorage, S3Storage, s3_storage -from mypy_boto3_s3.type_defs import ( - ObjectTypeDef, -) + +if TYPE_CHECKING: + from pathlib import Path + + from fixtures.port_distributor import PortDistributor + from mypy_boto3_s3.type_defs import ( + ObjectTypeDef, + ) @pytest.mark.skipif( diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py index 182e57b8a4..7e9e9481a8 100644 --- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py +++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py @@ -1,14 +1,17 @@ from __future__ import annotations import time +from typing import TYPE_CHECKING import psutil import pytest from fixtures.common_types import TenantId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.pageserver.http import PageserverApiException +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder + def assert_child_processes(pageserver_pid, wal_redo_present=False, defunct_present=False): children = psutil.Process(pageserver_pid).children() diff --git a/test_runner/test_broken.py b/test_runner/test_broken.py index 112e699395..8d04a16f1a 
100644 --- a/test_runner/test_broken.py +++ b/test_runner/test_broken.py @@ -1,10 +1,13 @@ from __future__ import annotations import os +from typing import TYPE_CHECKING import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv """ Use this test to see what happens when tests fail. diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 7b7592e740..a0391901a2 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 7b7592e74059f795b64f06860cea97673418f35e +Subproject commit a0391901a2af13aa029b905272a5b2024133c926 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index ee794ba767..aeb292eeac 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit ee794ba767eef9b10260ef67d3a58084f1dabd6f +Subproject commit aeb292eeace9072e07071254b6ffc7a74007d4d2 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 512856aaa8..d56e79cd5d 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 512856aaa8bedbaa8f06811449518dcb0c2e5d8f +Subproject commit d56e79cd5d6136c159b1d8d98acb7981d4b69364 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index e5e87b9f52..66114c23bc 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit e5e87b9f52d0eaeb83f3e2517bb9727aac37729b +Subproject commit 66114c23bc61205b0e3fb1e77ee76a4abc1eb4b8 diff --git a/vendor/revisions.json b/vendor/revisions.json index 1d76e1da01..d7eddf42b7 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.4", - "e5e87b9f52d0eaeb83f3e2517bb9727aac37729b" + "66114c23bc61205b0e3fb1e77ee76a4abc1eb4b8" ], "v16": [ "16.8", - "512856aaa8bedbaa8f06811449518dcb0c2e5d8f" + "d56e79cd5d6136c159b1d8d98acb7981d4b69364" ], "v15": [ "15.12", - "ee794ba767eef9b10260ef67d3a58084f1dabd6f" + "aeb292eeace9072e07071254b6ffc7a74007d4d2" ], "v14": [ 
"14.17", - "7b7592e74059f795b64f06860cea97673418f35e" + "a0391901a2af13aa029b905272a5b2024133c926" ] } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index e9eaf4b35e..b548a2a88a 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -61,7 +61,7 @@ memchr = { version = "2" } nix = { version = "0.26" } nom = { version = "7" } num = { version = "0.4" } -num-bigint = { version = "0.4", default-features = false, features = ["std"] } +num-bigint = { version = "0.4" } num-complex = { version = "0.4", default-features = false, features = ["std"] } num-integer = { version = "0.1", features = ["i128"] } num-iter = { version = "0.1", default-features = false, features = ["i128", "std"] } @@ -127,7 +127,7 @@ log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } num = { version = "0.4" } -num-bigint = { version = "0.4", default-features = false, features = ["std"] } +num-bigint = { version = "0.4" } num-complex = { version = "0.4", default-features = false, features = ["std"] } num-integer = { version = "0.1", features = ["i128"] } num-iter = { version = "0.1", default-features = false, features = ["i128", "std"] }