diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 2b96ce95da..1e6c2d0aa2 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -28,3 +28,7 @@ config-variables: - DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN - SLACK_ON_CALL_STORAGE_STAGING_STREAM - SLACK_CICD_CHANNEL_ID + - SLACK_STORAGE_CHANNEL_ID + - NEON_DEV_AWS_ACCOUNT_ID + - NEON_PROD_AWS_ACCOUNT_ID + - AWS_ECR_REGION diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index d07e3e32e8..b85ca7874d 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -38,9 +38,11 @@ runs: # - name: Set variables shell: bash -euxo pipefail {0} + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BUCKET: neon-github-public-dev run: | - PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) - if [ "${PR_NUMBER}" != "null" ]; then + if [ -n "${PR_NUMBER}" ]; then BRANCH_OR_PR=pr-${PR_NUMBER} elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || \ [ "${GITHUB_REF_NAME}" = "release-proxy" ] || [ "${GITHUB_REF_NAME}" = "release-compute" ]; then @@ -59,8 +61,6 @@ runs: echo "LOCK_FILE=${LOCK_FILE}" >> $GITHUB_ENV echo "WORKDIR=${WORKDIR}" >> $GITHUB_ENV echo "BUCKET=${BUCKET}" >> $GITHUB_ENV - env: - BUCKET: neon-github-public-dev # TODO: We can replace with a special docker image with Java and Allure pre-installed - uses: actions/setup-java@v4 @@ -80,8 +80,8 @@ runs: rm -f ${ALLURE_ZIP} fi env: - ALLURE_VERSION: 2.27.0 - ALLURE_ZIP_SHA256: b071858fb2fa542c65d8f152c5c40d26267b2dfb74df1f1608a589ecca38e777 + ALLURE_VERSION: 2.32.2 + ALLURE_ZIP_SHA256: 3f28885e2118f6317c92f667eaddcc6491400af1fb9773c1f3797a5fa5174953 - uses: aws-actions/configure-aws-credentials@v4 if: ${{ !cancelled() }} diff --git a/.github/actions/allure-report-store/action.yml b/.github/actions/allure-report-store/action.yml index 8548a886cf..687bfd49af 100644 --- a/.github/actions/allure-report-store/action.yml +++ b/.github/actions/allure-report-store/action.yml @@ -18,9 +18,11 @@ runs: steps: - name: Set variables shell: bash -euxo pipefail {0} + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + REPORT_DIR: ${{ inputs.report-dir }} run: | - PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) - if [ "${PR_NUMBER}" != "null" ]; then + if [ -n "${PR_NUMBER}" ]; then BRANCH_OR_PR=pr-${PR_NUMBER} elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || \ [ "${GITHUB_REF_NAME}" = "release-proxy" ] || [ "${GITHUB_REF_NAME}" = "release-compute" ]; then @@ -32,8 +34,6 @@ runs: echo "BRANCH_OR_PR=${BRANCH_OR_PR}" >> $GITHUB_ENV echo "REPORT_DIR=${REPORT_DIR}" >> $GITHUB_ENV - env: - REPORT_DIR: ${{ inputs.report-dir }} - uses: aws-actions/configure-aws-credentials@v4 if: ${{ !cancelled() }} diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index c9f6b0832e..a393aa6106 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -19,7 +19,11 @@ inputs: default: '[1, 1]' # settings below only needed if you want the project to be sharded from the beginning shard_split_project: - description: 'by default new projects are not shard-split, specify true to shard-split' + description: 'by default new projects are not shard-split initially, but only when shard-split threshold is reached, specify true to explicitly 
shard-split initially' + required: false + default: 'false' + disable_sharding: + description: 'by default new projects use storage controller default policy to shard-split when shard-split threshold is reached, specify true to explicitly disable sharding' required: false default: 'false' admin_api_key: @@ -107,6 +111,21 @@ runs: -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \ -d "{\"new_shard_count\": $SHARD_COUNT, \"new_stripe_size\": $STRIPE_SIZE}" fi + if [ "${DISABLE_SHARDING}" = "true" ]; then + # determine tenant ID + TENANT_ID=`${PSQL} ${dsn} -t -A -c "SHOW neon.tenant_id"` + + echo "Explicitly disabling shard-splitting for project ${project_id} with tenant_id ${TENANT_ID}" + + echo "Sending PUT request to https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/policy" + echo "with body {\"scheduling\": \"Essential\"}" + + # we need an ADMIN API KEY to invoke storage controller API for shard splitting (bash -u above checks that the variable is set) + curl -X PUT \ + "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/policy" \ + -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \ + -d "{\"scheduling\": \"Essential\"}" + fi env: API_HOST: ${{ inputs.api_host }} @@ -116,6 +135,7 @@ runs: MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }} MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }} SHARD_SPLIT_PROJECT: ${{ inputs.shard_split_project }} + DISABLE_SHARDING: ${{ inputs.disable_sharding }} ADMIN_API_KEY: ${{ inputs.admin_api_key }} SHARD_COUNT: ${{ inputs.shard_count }} STRIPE_SIZE: ${{ inputs.stripe_size }} diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 0eddfe5da6..122fe48b68 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -236,5 +236,5 @@ runs: uses: ./.github/actions/allure-report-store with: report-dir: /tmp/test_output/allure/results - unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }} + unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }}-${{ runner.arch }} aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} diff --git a/.github/workflows/_push-to-container-registry.yml b/.github/workflows/_push-to-container-registry.yml index 3c97c8a67a..c938f62ad5 100644 --- a/.github/workflows/_push-to-container-registry.yml +++ b/.github/workflows/_push-to-container-registry.yml @@ -2,7 +2,7 @@ name: Push images to Container Registry on: workflow_call: inputs: - # Example: {"docker.io/neondatabase/neon:13196061314":["369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:13196061314","neoneastus2.azurecr.io/neondatabase/neon:13196061314"]} + # Example: {"docker.io/neondatabase/neon:13196061314":["${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/neon:13196061314","neoneastus2.azurecr.io/neondatabase/neon:13196061314"]} image-map: description: JSON map of images, mapping from a source image to an array of target images that should be pushed. 
required: true diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index bc773600ea..8f3392ceea 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -68,7 +68,7 @@ jobs: tag: needs: [ check-permissions ] runs-on: [ self-hosted, small ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned + container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned outputs: build-tag: ${{steps.build-tag.outputs.tag}} @@ -859,14 +859,17 @@ jobs: BRANCH: "${{ github.ref_name }}" DEV_ACR: "${{ vars.AZURE_DEV_REGISTRY_NAME }}" PROD_ACR: "${{ vars.AZURE_PROD_REGISTRY_NAME }}" + DEV_AWS: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + PROD_AWS: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" + AWS_REGION: "${{ vars.AWS_ECR_REGION }}" push-neon-image-dev: needs: [ generate-image-maps, neon-image ] uses: ./.github/workflows/_push-to-container-registry.yml with: image-map: '${{ needs.generate-image-maps.outputs.neon-dev }}' - aws-region: eu-central-1 - aws-account-ids: "369495373322" + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} @@ -881,8 +884,8 @@ jobs: uses: ./.github/workflows/_push-to-container-registry.yml with: image-map: '${{ needs.generate-image-maps.outputs.compute-dev }}' - aws-region: eu-central-1 - aws-account-ids: "369495373322" + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} @@ -898,8 +901,8 @@ jobs: uses: ./.github/workflows/_push-to-container-registry.yml with: image-map: '${{ needs.generate-image-maps.outputs.neon-prod }}' - aws-region: eu-central-1 - aws-account-ids: "093970136003" + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-ids: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} @@ -915,8 +918,8 @@ jobs: uses: ./.github/workflows/_push-to-container-registry.yml with: image-map: '${{ needs.generate-image-maps.outputs.compute-prod }}' - aws-region: eu-central-1 - aws-account-ids: "093970136003" + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-ids: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} @@ -1029,7 +1032,7 @@ jobs: statuses: write contents: write runs-on: [ self-hosted, small ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/ansible:latest steps: - uses: actions/checkout@v4 @@ -1178,6 +1181,22 @@ jobs: exit 1 fi + notify-storage-release-deploy-failure: + needs: [ deploy ] + # We want this to run even if (transitive) dependencies are skipped, because deploy should really be successful on release branch workflow runs. 
+ if: github.ref_name == 'release' && needs.deploy.result != 'success' && always() + runs-on: ubuntu-22.04 + steps: + - name: Post release-deploy failure to team-storage slack channel + uses: slackapi/slack-github-action@v2 + with: + method: chat.postMessage + token: ${{ secrets.SLACK_BOT_TOKEN }} + payload: | + channel: ${{ vars.SLACK_STORAGE_CHANNEL_ID }} + text: | + 🔴 @oncall-storage: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>. + # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory promote-compatibility-data: needs: [ deploy ] @@ -1274,7 +1293,7 @@ jobs: done pin-build-tools-image: - needs: [ build-build-tools-image, push-compute-image-prod, push-neon-image-prod, build-and-test-locally ] + needs: [ build-build-tools-image, test-images, build-and-test-locally ] if: github.ref_name == 'main' uses: ./.github/workflows/pin-build-tools-image.yml with: diff --git a/.github/workflows/build_and_test_with_sanitizers.yml b/.github/workflows/build_and_test_with_sanitizers.yml index 2bc938509f..e40b02b5d2 100644 --- a/.github/workflows/build_and_test_with_sanitizers.yml +++ b/.github/workflows/build_and_test_with_sanitizers.yml @@ -27,7 +27,7 @@ env: jobs: tag: runs-on: [ self-hosted, small ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned + container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned outputs: build-tag: ${{steps.build-tag.outputs.tag}} diff --git a/.github/workflows/ingest_benchmark.yml b/.github/workflows/ingest_benchmark.yml index 7b303fa37a..c20c5890f9 100644 --- a/.github/workflows/ingest_benchmark.yml +++ b/.github/workflows/ingest_benchmark.yml @@ -32,18 +32,27 @@ jobs: - target_project: new_empty_project_stripe_size_2048 stripe_size: 2048 # 16 MiB postgres_version: 16 + disable_sharding: false - target_project: new_empty_project_stripe_size_32768 stripe_size: 32768 # 256 MiB # note that this is different from null because using null will shard_split the project only if it reaches the threshold # while here it is sharded from the beginning with a shard size of 256 MiB + disable_sharding: false postgres_version: 16 - target_project: new_empty_project stripe_size: null # run with neon defaults which will shard split only when reaching the threshold + disable_sharding: false postgres_version: 16 - target_project: new_empty_project stripe_size: null # run with neon defaults which will shard split only when reaching the threshold + disable_sharding: false postgres_version: 17 - target_project: large_existing_project stripe_size: null # cannot re-shared or choose different stripe size for existing, already sharded project + disable_sharding: false + postgres_version: 16 + - target_project: new_empty_project_unsharded + stripe_size: null # run with neon defaults which will shard split only when reaching the threshold + disable_sharding: true postgres_version: 16 max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results permissions: @@ -96,6 +105,7 @@ jobs: admin_api_key: ${{ secrets.NEON_STAGING_ADMIN_API_KEY }} shard_count: 8 stripe_size: ${{ matrix.stripe_size }} + disable_sharding: ${{ matrix.disable_sharding }} - name: Initialize Neon project if: ${{ startsWith(matrix.target_project, 'new_empty_project') }} diff --git 
a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index 626de2b0e0..b305b662ee 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -33,10 +33,6 @@ concurrency: # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} -env: - FROM_TAG: ${{ inputs.from-tag }} - TO_TAG: pinned - jobs: check-manifests: runs-on: ubuntu-22.04 @@ -46,11 +42,14 @@ jobs: steps: - name: Check if we really need to pin the image id: check-manifests + env: + FROM_TAG: ${{ inputs.from-tag }} + TO_TAG: pinned run: | - docker manifest inspect neondatabase/build-tools:${FROM_TAG} > ${FROM_TAG}.json - docker manifest inspect neondatabase/build-tools:${TO_TAG} > ${TO_TAG}.json + docker manifest inspect "docker.io/neondatabase/build-tools:${FROM_TAG}" > "${FROM_TAG}.json" + docker manifest inspect "docker.io/neondatabase/build-tools:${TO_TAG}" > "${TO_TAG}.json" - if diff ${FROM_TAG}.json ${TO_TAG}.json; then + if diff "${FROM_TAG}.json" "${TO_TAG}.json"; then skip=true else skip=false @@ -64,55 +63,34 @@ jobs: # use format(..) to catch both inputs.force = true AND inputs.force = 'true' if: needs.check-manifests.outputs.skip == 'false' || format('{0}', inputs.force) == 'true' - runs-on: ubuntu-22.04 - permissions: - id-token: write # for `azure/login` and aws auth + id-token: write # Required for aws/azure login - steps: - - uses: docker/login-action@v3 - with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 3600 - - - name: Login to Amazon Dev ECR - uses: aws-actions/amazon-ecr-login@v2 - - - name: Azure login - uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 - with: - client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }} - tenant-id: ${{ secrets.AZURE_TENANT_ID }} - subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }} - - - name: Login to ACR - run: | - az acr login --name=neoneastus2 - - - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR - env: - DEFAULT_DEBIAN_VERSION: bookworm - run: | - for debian_version in bullseye bookworm; do - tags=() - - tags+=("-t" "neondatabase/build-tools:${TO_TAG}-${debian_version}") - tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}-${debian_version}") - tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}-${debian_version}") - - if [ "${debian_version}" == "${DEFAULT_DEBIAN_VERSION}" ]; then - tags+=("-t" "neondatabase/build-tools:${TO_TAG}") - tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}") - tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}") - fi - - docker buildx imagetools create "${tags[@]}" \ - neondatabase/build-tools:${FROM_TAG}-${debian_version} - done + uses: ./.github/workflows/_push-to-container-registry.yml + with: + image-map: | + { + "docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bullseye": [ + "docker.io/neondatabase/build-tools:pinned-bullseye", + "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bullseye", + "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bullseye" + ], + 
"docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bookworm": [ + "docker.io/neondatabase/build-tools:pinned-bookworm", + "docker.io/neondatabase/build-tools:pinned", + "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bookworm", + "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned", + "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bookworm", + "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned" + ] + } + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} + azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} + azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} + acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} + secrets: + aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}" + docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} diff --git a/Cargo.lock b/Cargo.lock index 12c12bc771..038727f1a8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1316,7 +1316,6 @@ dependencies = [ "flate2", "futures", "http 1.1.0", - "jsonwebtoken", "metrics", "nix 0.27.1", "notify", @@ -1326,7 +1325,6 @@ dependencies = [ "opentelemetry_sdk", "postgres", "postgres_initdb", - "prometheus", "regex", "remote_storage", "reqwest", @@ -1345,7 +1343,6 @@ dependencies = [ "tower 0.5.2", "tower-http", "tracing", - "tracing-opentelemetry", "tracing-subscriber", "tracing-utils", "url", @@ -1877,6 +1874,12 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "difflib" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" + [[package]] name = "digest" version = "0.10.7" @@ -3334,6 +3337,17 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "json-structural-diff" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e878e36a8a44c158505c2c818abdc1350413ad83dcb774a0459f6a7ef2b65cbf" +dependencies = [ + "difflib", + "regex", + "serde_json", +] + [[package]] name = "jsonwebtoken" version = "9.2.0" @@ -4158,7 +4172,6 @@ dependencies = [ "pageserver_client", "pageserver_compaction", "pin-project-lite", - "postgres", "postgres-protocol", "postgres-types", "postgres_backend", @@ -4245,7 +4258,6 @@ dependencies = [ "futures", "http-utils", "pageserver_api", - "postgres", "reqwest", "serde", "thiserror 1.0.69", @@ -4660,7 +4672,6 @@ dependencies = [ "anyhow", "itertools 0.10.5", "once_cell", - "postgres", "tokio-postgres", "url", ] @@ -5802,7 +5813,6 @@ dependencies = [ "once_cell", "pageserver_api", "parking_lot 0.12.1", - "postgres", "postgres-protocol", "postgres_backend", "postgres_ffi", @@ -6446,6 +6456,7 @@ dependencies = [ "humantime", "hyper 0.14.30", "itertools 0.10.5", + "json-structural-diff", "lasso", "measured", "metrics", @@ -6468,6 +6479,7 @@ dependencies = [ "strum", "strum_macros", "thiserror 1.0.69", + "tikv-jemallocator", "tokio", "tokio-postgres", "tokio-postgres-rustls", @@ -7021,14 +7033,11 @@ dependencies = [ name = "tokio-postgres2" version = "0.1.0" dependencies = [ - "async-trait", - "byteorder", "bytes", "fallible-iterator", "futures-util", "log", "parking_lot 0.12.1", - "percent-encoding", "phf", "pin-project-lite", "postgres-protocol2", @@ -7615,13 +7624,13 @@ dependencies = [ "hex", "hex-literal", 
"humantime", - "inferno 0.12.0", "jsonwebtoken", "metrics", "nix 0.27.1", "once_cell", "pin-project-lite", "postgres_connection", + "pprof", "pq_proto", "rand 0.8.5", "regex", diff --git a/Cargo.toml b/Cargo.toml index 7228623c6b..21310ce6ec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -210,6 +210,7 @@ rustls-native-certs = "0.8" x509-parser = "0.16" whoami = "1.5.1" zerocopy = { version = "0.7", features = ["derive"] } +json-structural-diff = { version = "0.2.0" } ## TODO replace this with tracing env_logger = "0.10" diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 317eded26e..c103ceaea5 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -292,7 +292,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.84.1 +ENV RUSTC_VERSION=1.85.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index e78c26c0a6..ef4c22612d 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -395,15 +395,22 @@ RUN case "${PG_VERSION:?}" in \ cd plv8-src && \ if [[ "${PG_VERSION:?}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi -FROM pg-build AS plv8-build +# Step 1: Build the vendored V8 engine. It doesn't depend on PostgreSQL, so use +# 'build-deps' as the base. This enables caching and avoids unnecessary rebuilds. +# (The V8 engine takes a very long time to build) +FROM build-deps AS plv8-build ARG PG_VERSION +WORKDIR /ext-src/plv8-src RUN apt update && \ apt install --no-install-recommends --no-install-suggests -y \ ninja-build python3-dev libncurses5 binutils clang \ && apt clean && rm -rf /var/lib/apt/lists/* - COPY --from=plv8-src /ext-src/ /ext-src/ -WORKDIR /ext-src/plv8-src +RUN make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) v8 + +# Step 2: Build the PostgreSQL-dependent parts +COPY --from=pg-build /usr/local/pgsql /usr/local/pgsql +ENV PATH="/usr/local/pgsql/bin:$PATH" RUN \ # generate and copy upgrade scripts make generate_upgrades && \ @@ -1848,14 +1855,20 @@ COPY --from=pg_semver-src /ext-src/ /ext-src/ COPY --from=pg_ivm-src /ext-src/ /ext-src/ COPY --from=pg_partman-src /ext-src/ /ext-src/ #COPY --from=pg_mooncake-src /ext-src/ /ext-src/ -#COPY --from=pg_repack-src /ext-src/ /ext-src/ +COPY --from=pg_repack-src /ext-src/ /ext-src/ +COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY compute/patches/pg_repack.patch /ext-src +RUN cd /ext-src/pg_repack-src && patch -p1 OK + \! pg_repack --dbname=contrib_regression --table=tbl_cluster --no-superuser-check + INFO: repacking table "public.tbl_cluster" + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper + ERROR: pg_repack failed with error: You must be a superuser to use pg_repack + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check ++\! 
PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check + ERROR: pg_repack failed with error: ERROR: permission denied for schema repack + LINE 1: select repack.version(), repack.version_sql() + ^ + GRANT ALL ON ALL TABLES IN SCHEMA repack TO nosuper; + GRANT USAGE ON SCHEMA repack TO nosuper; + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check + INFO: repacking table "public.tbl_cluster" + ERROR: query failed: ERROR: current transaction is aborted, commands ignored until end of transaction block + DETAIL: query was: RESET lock_timeout +diff --git a/regress/sql/nosuper.sql b/regress/sql/nosuper.sql +index 072f0fa..dbe60f8 100644 +--- a/regress/sql/nosuper.sql ++++ b/regress/sql/nosuper.sql +@@ -4,19 +4,19 @@ + SET client_min_messages = error; + DROP ROLE IF EXISTS nosuper; + SET client_min_messages = warning; +-CREATE ROLE nosuper WITH LOGIN; ++CREATE ROLE nosuper WITH LOGIN PASSWORD 'NoSuPeRpAsSwOrD'; + -- => OK + \! pg_repack --dbname=contrib_regression --table=tbl_cluster --no-superuser-check + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check + + GRANT ALL ON ALL TABLES IN SCHEMA repack TO nosuper; + GRANT USAGE ON SCHEMA repack TO nosuper; + + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check + + REVOKE ALL ON ALL TABLES IN SCHEMA repack FROM nosuper; + REVOKE USAGE ON SCHEMA repack FROM nosuper; diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 81dcf99560..c276996df5 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -25,7 +25,6 @@ fail.workspace = true flate2.workspace = true futures.workspace = true http.workspace = true -jsonwebtoken.workspace = true metrics.workspace = true nix.workspace = true notify.workspace = true @@ -48,13 +47,11 @@ tokio-postgres.workspace = true tokio-util.workspace = true tokio-stream.workspace = true tracing.workspace = true -tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true thiserror.workspace = true url.workspace = true uuid.workspace = true -prometheus.workspace = true walkdir.workspace = true postgres_initdb.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index a8803ec793..1cdae718fe 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -41,7 +41,6 @@ use std::process::exit; use std::str::FromStr; use std::sync::atomic::Ordering; use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock}; -use std::time::SystemTime; use std::{thread, time::Duration}; use anyhow::{Context, Result}; @@ -86,19 +85,6 @@ fn parse_remote_ext_config(arg: &str) -> Result { } } -/// Generate a compute ID if one is not supplied. 
This exists to keep forward -/// compatibility tests working, but will be removed in a future iteration. -fn generate_compute_id() -> String { - let now = SystemTime::now(); - - format!( - "compute-{}", - now.duration_since(SystemTime::UNIX_EPOCH) - .unwrap() - .as_secs() - ) -} - #[derive(Parser)] #[command(rename_all = "kebab-case")] struct Cli { @@ -112,16 +98,13 @@ struct Cli { /// outside the compute will talk to the compute through this port. Keep /// the previous name for this argument around for a smoother release /// with the control plane. - /// - /// TODO: Remove the alias after the control plane release which teaches the - /// control plane about the renamed argument. - #[arg(long, alias = "http-port", default_value_t = 3080)] + #[arg(long, default_value_t = 3080)] pub external_http_port: u16, - /// The port to bind the internal listening HTTP server to. Clients like + /// The port to bind the internal listening HTTP server to. Clients include /// the neon extension (for installing remote extensions) and local_proxy. - #[arg(long)] - pub internal_http_port: Option, + #[arg(long, default_value_t = 3081)] + pub internal_http_port: u16, #[arg(short = 'D', long, value_name = "DATADIR")] pub pgdata: String, @@ -156,7 +139,7 @@ struct Cli { #[arg(short = 'S', long, group = "spec-path")] pub spec_path: Option, - #[arg(short = 'i', long, group = "compute-id", default_value = generate_compute_id())] + #[arg(short = 'i', long, group = "compute-id")] pub compute_id: String, #[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], value_name = "CONTROL_PLANE_API_BASE_URL")] @@ -359,7 +342,7 @@ fn wait_spec( pgbin: cli.pgbin.clone(), pgversion: get_pg_version_string(&cli.pgbin), external_http_port: cli.external_http_port, - internal_http_port: cli.internal_http_port.unwrap_or(cli.external_http_port + 1), + internal_http_port: cli.internal_http_port, live_config_allowed, state: Mutex::new(new_state), state_changed: Condvar::new(), @@ -383,7 +366,7 @@ fn wait_spec( // The internal HTTP server could be launched later, but there isn't much // sense in waiting. - Server::Internal(cli.internal_http_port.unwrap_or(cli.external_http_port + 1)).launch(&compute); + Server::Internal(cli.internal_http_port).launch(&compute); if !spec_set { // No spec provided, hang waiting for it. diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index 5ee9c5fbd8..c4416480d8 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -7,12 +7,12 @@ use std::sync::Arc; use crate::compute::construct_superuser_query; use crate::pg_helpers::{escape_literal, DatabaseExt, Escaping, GenericOptionsSearch, RoleExt}; -use anyhow::{bail, Result}; +use anyhow::Result; use compute_api::spec::{ComputeFeature, ComputeSpec, Database, PgIdent, Role}; use futures::future::join_all; use tokio::sync::RwLock; use tokio_postgres::Client; -use tracing::{debug, info_span, Instrument}; +use tracing::{debug, info_span, warn, Instrument}; #[derive(Clone)] pub enum DB { @@ -47,6 +47,11 @@ pub enum PerDatabasePhase { DeleteDBRoleReferences, ChangeSchemaPerms, HandleAnonExtension, + /// This is a shared phase, used for both i) dropping dangling LR subscriptions + /// before dropping the DB, and ii) dropping all subscriptions after creating + /// a fresh branch. + /// N.B. we will skip all DBs that are not present in Postgres, invalid, or + /// have `datallowconn = false` (`restrict_conn`). 
DropLogicalSubscriptions, } @@ -168,7 +173,7 @@ where /// /// In the future we may generate a single stream of changes and then /// sort/merge/batch execution, but for now this is a nice way to improve -/// batching behaviour of the commands. +/// batching behavior of the commands. async fn get_operations<'a>( spec: &'a ComputeSpec, ctx: &'a RwLock, @@ -451,6 +456,38 @@ async fn get_operations<'a>( )), }))), ApplySpecPhase::RunInEachDatabase { db, subphase } => { + // Do some checks that user DB exists and we can access it. + // + // During the phases like DropLogicalSubscriptions, DeleteDBRoleReferences, + // which happen before dropping the DB, the current run could be a retry, + // so it's a valid case when DB is absent already. The case of + // `pg_database.datallowconn = false`/`restrict_conn` is a bit tricky, as + // in theory user can have some dangling objects there, so we will fail at + // the actual drop later. Yet, to fix that in the current code we would need + // to ALTER DATABASE, and then check back, but that even more invasive, so + // that's not what we really want to do here. + // + // For ChangeSchemaPerms, skipping DBs we cannot access is totally fine. + if let DB::UserDB(db) = db { + let databases = &ctx.read().await.dbs; + + let edb = match databases.get(&db.name) { + Some(edb) => edb, + None => { + warn!("skipping RunInEachDatabase phase {:?}, database {} doesn't exist in PostgreSQL", subphase, db.name); + return Ok(Box::new(empty())); + } + }; + + if edb.restrict_conn || edb.invalid { + warn!( + "skipping RunInEachDatabase phase {:?}, database {} is (restrict_conn={}, invalid={})", + subphase, db.name, edb.restrict_conn, edb.invalid + ); + return Ok(Box::new(empty())); + } + } + match subphase { PerDatabasePhase::DropLogicalSubscriptions => { match &db { @@ -530,25 +567,12 @@ async fn get_operations<'a>( Ok(Box::new(operations)) } PerDatabasePhase::ChangeSchemaPerms => { - let ctx = ctx.read().await; - let databases = &ctx.dbs; - let db = match &db { // ignore schema permissions on the system database DB::SystemDB => return Ok(Box::new(empty())), DB::UserDB(db) => db, }; - if databases.get(&db.name).is_none() { - bail!("database {} doesn't exist in PostgreSQL", db.name); - } - - let edb = databases.get(&db.name).unwrap(); - - if edb.restrict_conn || edb.invalid { - return Ok(Box::new(empty())); - } - let operations = vec![ Operation { query: format!( @@ -566,6 +590,7 @@ async fn get_operations<'a>( Ok(Box::new(operations)) } + // TODO: remove this completely https://github.com/neondatabase/cloud/issues/22663 PerDatabasePhase::HandleAnonExtension => { // Only install Anon into user databases let db = match &db { diff --git a/compute_tools/src/sql/drop_subscriptions.sql b/compute_tools/src/sql/drop_subscriptions.sql index dfb925e48e..03e8e158fa 100644 --- a/compute_tools/src/sql/drop_subscriptions.sql +++ b/compute_tools/src/sql/drop_subscriptions.sql @@ -2,6 +2,7 @@ DO $$ DECLARE subname TEXT; BEGIN + LOCK TABLE pg_subscription IN ACCESS EXCLUSIVE MODE; FOR subname IN SELECT pg_subscription.subname FROM pg_subscription WHERE subdbid = (SELECT oid FROM pg_database WHERE datname = {datname_str}) LOOP EXECUTE format('ALTER SUBSCRIPTION %I DISABLE;', subname); EXECUTE format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname); diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index c3c8229c38..407578abb8 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -46,6 +46,8 @@ use std::process::Command; use 
std::str::FromStr; use std::sync::Arc; use std::time::Duration; +use std::time::SystemTime; +use std::time::UNIX_EPOCH; use anyhow::{anyhow, bail, Context, Result}; use compute_api::requests::ConfigurationRequest; @@ -59,6 +61,7 @@ use nix::sys::signal::Signal; use pageserver_api::shard::ShardStripeSize; use reqwest::header::CONTENT_TYPE; use serde::{Deserialize, Serialize}; +use tracing::debug; use url::Host; use utils::id::{NodeId, TenantId, TimelineId}; @@ -81,8 +84,10 @@ pub struct EndpointConf { internal_http_port: u16, pg_version: u32, skip_pg_catalog_updates: bool, + reconfigure_concurrency: usize, drop_subscriptions_before_start: bool, features: Vec, + cluster: Option, } // @@ -179,7 +184,9 @@ impl ComputeControlPlane { // we also skip catalog updates in the cloud. skip_pg_catalog_updates, drop_subscriptions_before_start, + reconfigure_concurrency: 1, features: vec![], + cluster: None, }); ep.create_endpoint_dir()?; @@ -196,7 +203,9 @@ impl ComputeControlPlane { pg_version, skip_pg_catalog_updates, drop_subscriptions_before_start, + reconfigure_concurrency: 1, features: vec![], + cluster: None, })?, )?; std::fs::write( @@ -261,8 +270,11 @@ pub struct Endpoint { skip_pg_catalog_updates: bool, drop_subscriptions_before_start: bool, + reconfigure_concurrency: usize, // Feature flags features: Vec, + // Cluster settings + cluster: Option, } #[derive(PartialEq, Eq)] @@ -302,6 +314,8 @@ impl Endpoint { let conf: EndpointConf = serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?; + debug!("serialized endpoint conf: {:?}", conf); + Ok(Endpoint { pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), conf.pg_port), external_http_address: SocketAddr::new( @@ -319,8 +333,10 @@ impl Endpoint { tenant_id: conf.tenant_id, pg_version: conf.pg_version, skip_pg_catalog_updates: conf.skip_pg_catalog_updates, + reconfigure_concurrency: conf.reconfigure_concurrency, drop_subscriptions_before_start: conf.drop_subscriptions_before_start, features: conf.features, + cluster: conf.cluster, }) } @@ -607,7 +623,7 @@ impl Endpoint { }; // Create spec file - let spec = ComputeSpec { + let mut spec = ComputeSpec { skip_pg_catalog_updates: self.skip_pg_catalog_updates, format_version: 1.0, operation_uuid: None, @@ -640,7 +656,7 @@ impl Endpoint { Vec::new() }, settings: None, - postgresql_conf: Some(postgresql_conf), + postgresql_conf: Some(postgresql_conf.clone()), }, delta_operations: None, tenant_id: Some(self.tenant_id), @@ -653,9 +669,35 @@ impl Endpoint { pgbouncer_settings: None, shard_stripe_size: Some(shard_stripe_size), local_proxy_config: None, - reconfigure_concurrency: 1, + reconfigure_concurrency: self.reconfigure_concurrency, drop_subscriptions_before_start: self.drop_subscriptions_before_start, }; + + // this strange code is needed to support respec() in tests + if self.cluster.is_some() { + debug!("Cluster is already set in the endpoint spec, using it"); + spec.cluster = self.cluster.clone().unwrap(); + + debug!("spec.cluster {:?}", spec.cluster); + + // fill missing fields again + if create_test_user { + spec.cluster.roles.push(Role { + name: PgIdent::from_str("test").unwrap(), + encrypted_password: None, + options: None, + }); + spec.cluster.databases.push(Database { + name: PgIdent::from_str("neondb").unwrap(), + owner: PgIdent::from_str("test").unwrap(), + options: None, + restrict_conn: false, + invalid: false, + }); + } + spec.cluster.postgresql_conf = Some(postgresql_conf); + } + let spec_path = self.endpoint_path().join("spec.json"); 
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; @@ -673,18 +715,14 @@ impl Endpoint { println!("Also at '{}'", conn_str); } let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl")); - //cmd.args([ - // "--external-http-port", - // &self.external_http_address.port().to_string(), - //]) - //.args([ - // "--internal-http-port", - // &self.internal_http_address.port().to_string(), - //]) cmd.args([ - "--http-port", + "--external-http-port", &self.external_http_address.port().to_string(), ]) + .args([ + "--internal-http-port", + &self.internal_http_address.port().to_string(), + ]) .args(["--pgdata", self.pgdata().to_str().unwrap()]) .args(["--connstr", &conn_str]) .args([ @@ -701,20 +739,16 @@ impl Endpoint { ]) // TODO: It would be nice if we generated compute IDs with the same // algorithm as the real control plane. - // - // TODO: Add this back when - // https://github.com/neondatabase/neon/pull/10747 is merged. - // - //.args([ - // "--compute-id", - // &format!( - // "compute-{}", - // SystemTime::now() - // .duration_since(UNIX_EPOCH) - // .unwrap() - // .as_secs() - // ), - //]) + .args([ + "--compute-id", + &format!( + "compute-{}", + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_secs() + ), + ]) .stdin(std::process::Stdio::null()) .stderr(logfile.try_clone()?) .stdout(logfile); diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 28d130d9e0..2bf89b7bfa 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -335,13 +335,21 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'checkpoint_distance' as an integer")?, - checkpoint_timeout: settings.remove("checkpoint_timeout").map(|x| x.to_string()), + checkpoint_timeout: settings + .remove("checkpoint_timeout") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'checkpoint_timeout' as duration")?, compaction_target_size: settings .remove("compaction_target_size") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'compaction_target_size' as an integer")?, - compaction_period: settings.remove("compaction_period").map(|x| x.to_string()), + compaction_period: settings + .remove("compaction_period") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'compaction_period' as duration")?, compaction_threshold: settings .remove("compaction_threshold") .map(|x| x.parse::()) @@ -387,7 +395,10 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'gc_horizon' as an integer")?, - gc_period: settings.remove("gc_period").map(|x| x.to_string()), + gc_period: settings.remove("gc_period") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'gc_period' as duration")?, image_creation_threshold: settings .remove("image_creation_threshold") .map(|x| x.parse::()) @@ -403,13 +414,20 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'image_creation_preempt_threshold' as integer")?, - pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), + pitr_interval: settings.remove("pitr_interval") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'pitr_interval' as duration")?, walreceiver_connect_timeout: settings .remove("walreceiver_connect_timeout") - .map(|x| x.to_string()), + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'walreceiver_connect_timeout' as duration")?, lagging_wal_timeout: settings 
.remove("lagging_wal_timeout") - .map(|x| x.to_string()), + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'lagging_wal_timeout' as duration")?, max_lsn_wal_lag: settings .remove("max_lsn_wal_lag") .map(|x| x.parse::()) @@ -427,8 +445,14 @@ impl PageServerNode { .context("Failed to parse 'min_resident_size_override' as integer")?, evictions_low_residence_duration_metric_threshold: settings .remove("evictions_low_residence_duration_metric_threshold") - .map(|x| x.to_string()), - heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()), + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'evictions_low_residence_duration_metric_threshold' as duration")?, + heatmap_period: settings + .remove("heatmap_period") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'heatmap_period' as duration")?, lazy_slru_download: settings .remove("lazy_slru_download") .map(|x| x.parse::()) @@ -439,10 +463,15 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `timeline_get_throttle` from json")?, - lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()), + lsn_lease_length: settings.remove("lsn_lease_length") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'lsn_lease_length' as duration")?, lsn_lease_length_for_ts: settings .remove("lsn_lease_length_for_ts") - .map(|x| x.to_string()), + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'lsn_lease_length_for_ts' as duration")?, timeline_offloading: settings .remove("timeline_offloading") .map(|x| x.parse::()) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 3c574efc63..40b86e4110 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -47,6 +47,9 @@ enum Command { listen_http_addr: String, #[arg(long)] listen_http_port: u16, + #[arg(long)] + listen_https_port: Option, + #[arg(long)] availability_zone_id: String, }, @@ -394,6 +397,7 @@ async fn main() -> anyhow::Result<()> { listen_pg_port, listen_http_addr, listen_http_port, + listen_https_port, availability_zone_id, } => { storcon_client @@ -406,6 +410,7 @@ async fn main() -> anyhow::Result<()> { listen_pg_port, listen_http_addr, listen_http_port, + listen_https_port, availability_zone_id: AvailabilityZone(availability_zone_id), }), ) @@ -954,7 +959,7 @@ async fn main() -> anyhow::Result<()> { threshold: threshold.into(), }, )), - heatmap_period: Some("300s".to_string()), + heatmap_period: Some(Duration::from_secs(300)), ..Default::default() }, }) diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index b4f8d3d66a..9dbdcce69f 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -77,4 +77,5 @@ echo "Start compute node" /usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \ -C "postgresql://cloud_admin@localhost:55433/postgres" \ -b /usr/local/bin/postgres \ + --compute-id "compute-$RANDOM" \ -S ${SPEC_FILE} diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index dd520d4986..5b3cfc74eb 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -81,15 +81,8 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do [ $EXT_SUCCESS -eq 0 ] && FAILED=$(tail -1 testout.txt | awk '{for(i=1;i<=NF;i++){print "/ext-src/"$i;}}') [ 
$CONTRIB_SUCCESS -eq 0 ] && CONTRIB_FAILED=$(tail -1 testout_contrib.txt | awk '{for(i=0;i<=NF;i++){print "/postgres/contrib/"$i;}}') for d in $FAILED $CONTRIB_FAILED; do - dn="$(basename $d)" - rm -rf $dn - mkdir $dn - docker cp $TEST_CONTAINER_NAME:$d/regression.diffs $dn || [ $? -eq 1 ] - docker cp $TEST_CONTAINER_NAME:$d/regression.out $dn || [ $? -eq 1 ] - cat $dn/regression.out $dn/regression.diffs || true - rm -rf $dn + docker exec $TEST_CONTAINER_NAME bash -c 'for file in $(find '"$d"' -name regression.diffs -o -name regression.out); do cat $file; done' || [ $? -eq 1 ] done - rm -rf $FAILED exit 1 fi fi diff --git a/docker-compose/ext-src/pg_repack-src/test-upgrade.sh b/docker-compose/ext-src/pg_repack-src/test-upgrade.sh new file mode 100755 index 0000000000..5021eb4027 --- /dev/null +++ b/docker-compose/ext-src/pg_repack-src/test-upgrade.sh @@ -0,0 +1,5 @@ +#!/bin/sh +set -ex +cd "$(dirname ${0})" +PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress +${PG_REGRESS} --use-existing --inputdir=./regress --bindir='/usr/local/pgsql/bin' --dbname=contrib_regression repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper get_order_by trigger diff --git a/docker-compose/ext-src/pg_semver-src/test-upgrade.patch b/docker-compose/ext-src/pg_semver-src/test-upgrade-16.patch similarity index 100% rename from docker-compose/ext-src/pg_semver-src/test-upgrade.patch rename to docker-compose/ext-src/pg_semver-src/test-upgrade-16.patch diff --git a/docker-compose/ext-src/pg_semver-src/test-upgrade-17.patch b/docker-compose/ext-src/pg_semver-src/test-upgrade-17.patch new file mode 100644 index 0000000000..2d0bf280db --- /dev/null +++ b/docker-compose/ext-src/pg_semver-src/test-upgrade-17.patch @@ -0,0 +1,24 @@ +diff --git a/test/sql/base.sql b/test/sql/base.sql +index 53adb30..2eed91b 100644 +--- a/test/sql/base.sql ++++ b/test/sql/base.sql +@@ -2,7 +2,6 @@ + BEGIN; + + \i test/pgtap-core.sql +-CREATE EXTENSION semver; + + SELECT plan(334); + --SELECT * FROM no_plan(); +diff --git a/test/sql/corpus.sql b/test/sql/corpus.sql +index c0fe98e..39cdd2e 100644 +--- a/test/sql/corpus.sql ++++ b/test/sql/corpus.sql +@@ -4,7 +4,6 @@ BEGIN; + -- Test the SemVer corpus from https://regex101.com/r/Ly7O1x/3/. + + \i test/pgtap-core.sql +-CREATE EXTENSION semver; + + SELECT plan(76); + --SELECT * FROM no_plan(); diff --git a/docker-compose/ext-src/pg_semver-src/test-upgrade.sh b/docker-compose/ext-src/pg_semver-src/test-upgrade.sh index e1541f272a..18b2848fd1 100755 --- a/docker-compose/ext-src/pg_semver-src/test-upgrade.sh +++ b/docker-compose/ext-src/pg_semver-src/test-upgrade.sh @@ -1,6 +1,7 @@ #!/bin/sh set -ex cd "$(dirname ${0})" -patch -p1 , pub name: Option, @@ -283,7 +283,7 @@ pub struct DeltaOp { /// Rust representation of Postgres role info with only those fields /// that matter for us. -#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] pub struct Role { pub name: PgIdent, pub encrypted_password: Option, @@ -292,7 +292,7 @@ pub struct Role { /// Rust representation of Postgres database info with only those fields /// that matter for us. 
-#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] pub struct Database { pub name: PgIdent, pub owner: PgIdent, @@ -308,7 +308,7 @@ pub struct Database { /// Common type representing both SQL statement params with or without value, /// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config /// options like `wal_level = logical`. -#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] pub struct GenericOption { pub name: String, pub value: Option, diff --git a/libs/http-utils/src/pprof.rs b/libs/http-utils/src/pprof.rs index dd57f9ed4b..fe1cc10838 100644 --- a/libs/http-utils/src/pprof.rs +++ b/libs/http-utils/src/pprof.rs @@ -2,7 +2,6 @@ use anyhow::bail; use flate2::write::{GzDecoder, GzEncoder}; use flate2::Compression; use itertools::Itertools as _; -use once_cell::sync::Lazy; use pprof::protos::{Function, Line, Location, Message as _, Profile}; use regex::Regex; @@ -58,38 +57,30 @@ pub fn symbolize(mut profile: Profile) -> anyhow::Result { // Resolve the line and function for each location. backtrace::resolve(loc.address as *mut c_void, |symbol| { - let Some(symname) = symbol.name() else { + let Some(symbol_name) = symbol.name() else { return; }; - let mut name = symname.to_string(); - // Strip the Rust monomorphization suffix from the symbol name. - static SUFFIX_REGEX: Lazy = - Lazy::new(|| Regex::new("::h[0-9a-f]{16}$").expect("invalid regex")); - if let Some(m) = SUFFIX_REGEX.find(&name) { - name.truncate(m.start()); - } - - let function_id = match functions.get(&name) { - Some(function) => function.id, - None => { - let id = functions.len() as u64 + 1; - let system_name = String::from_utf8_lossy(symname.as_bytes()); + let function_name = format!("{symbol_name:#}"); + let functions_len = functions.len(); + let function_id = functions + .entry(function_name) + .or_insert_with_key(|function_name| { + let function_id = functions_len as u64 + 1; + let system_name = String::from_utf8_lossy(symbol_name.as_bytes()); let filename = symbol .filename() .map(|path| path.to_string_lossy()) .unwrap_or(Cow::Borrowed("")); - let function = Function { - id, - name: string_id(&name), + Function { + id: function_id, + name: string_id(function_name), system_name: string_id(&system_name), filename: string_id(&filename), ..Default::default() - }; - functions.insert(name, function); - id - } - }; + } + }) + .id; loc.line.push(Line { function_id, line: symbol.lineno().unwrap_or(0) as i64, diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index e64052c73d..1aff5a7012 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -122,6 +122,8 @@ pub struct ConfigToml { pub page_service_pipelining: PageServicePipeliningConfig, pub get_vectored_concurrent_io: GetVectoredConcurrentIo, pub enable_read_path_debugging: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub validate_wal_contiguity: Option, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -521,6 +523,7 @@ impl Default for ConfigToml { } else { None }, + validate_wal_contiguity: None, } } } @@ -544,10 +547,11 @@ pub mod tenant_conf_defaults { pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s"; pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; - // This value needs to be tuned to avoid OOM. We have 3/4 of the total CPU threads to do background works, that's 16*3/4=9 on - // most of our pageservers. 
Compaction ~50 layers requires about 2GB memory (could be reduced later by optimizing L0 hole - // calculation to avoid loading all keys into the memory). So with this config, we can get a maximum peak compaction usage of 18GB. - pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 50; + // This value needs to be tuned to avoid OOM. We have 3/4*CPUs threads for L0 compaction, that's + // 3/4*16=9 on most of our pageservers. Compacting 20 layers requires about 1 GB memory (could + // be reduced later by optimizing L0 hole calculation to avoid loading all keys into memory). So + // with this config, we can get a maximum peak compaction usage of 9 GB. + pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 20; pub const DEFAULT_COMPACTION_L0_FIRST: bool = false; pub const DEFAULT_COMPACTION_L0_SEMAPHORE: bool = true; diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 42f6e47e63..f94bfab581 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -57,6 +57,7 @@ pub struct NodeRegisterRequest { pub listen_http_addr: String, pub listen_http_port: u16, + pub listen_https_port: Option, pub availability_zone_id: AvailabilityZone, } @@ -105,6 +106,7 @@ pub struct TenantLocateResponseShard { pub listen_http_addr: String, pub listen_http_port: u16, + pub listen_https_port: Option, } #[derive(Serialize, Deserialize)] @@ -148,6 +150,7 @@ pub struct NodeDescribeResponse { pub listen_http_addr: String, pub listen_http_port: u16, + pub listen_https_port: Option, pub listen_pg_addr: String, pub listen_pg_port: u16, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index dd7bea2916..1164048229 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -526,9 +526,13 @@ pub struct TenantConfigPatch { #[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)] pub struct TenantConfig { pub checkpoint_distance: Option, - pub checkpoint_timeout: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub checkpoint_timeout: Option, pub compaction_target_size: Option, - pub compaction_period: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub compaction_period: Option, pub compaction_threshold: Option, pub compaction_upper_limit: Option, // defer parsing compaction_algorithm, like eviction_policy @@ -539,22 +543,38 @@ pub struct TenantConfig { pub l0_flush_stall_threshold: Option, pub l0_flush_wait_upload: Option, pub gc_horizon: Option, - pub gc_period: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub gc_period: Option, pub image_creation_threshold: Option, - pub pitr_interval: Option, - pub walreceiver_connect_timeout: Option, - pub lagging_wal_timeout: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub pitr_interval: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub walreceiver_connect_timeout: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub lagging_wal_timeout: Option, pub max_lsn_wal_lag: Option, pub eviction_policy: Option, pub min_resident_size_override: Option, - pub evictions_low_residence_duration_metric_threshold: Option, - pub heatmap_period: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub evictions_low_residence_duration_metric_threshold: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub heatmap_period: Option, pub lazy_slru_download: Option, pub timeline_get_throttle: 
Option, pub image_layer_creation_check_threshold: Option, pub image_creation_preempt_threshold: Option, - pub lsn_lease_length: Option, - pub lsn_lease_length_for_ts: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub lsn_lease_length: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub lsn_lease_length_for_ts: Option, pub timeline_offloading: Option, pub wal_receiver_protocol_override: Option, pub rel_size_v2_enabled: Option, @@ -564,7 +584,10 @@ pub struct TenantConfig { } impl TenantConfig { - pub fn apply_patch(self, patch: TenantConfigPatch) -> TenantConfig { + pub fn apply_patch( + self, + patch: TenantConfigPatch, + ) -> Result { let Self { mut checkpoint_distance, mut checkpoint_timeout, @@ -604,11 +627,17 @@ impl TenantConfig { } = self; patch.checkpoint_distance.apply(&mut checkpoint_distance); - patch.checkpoint_timeout.apply(&mut checkpoint_timeout); + patch + .checkpoint_timeout + .map(|v| humantime::parse_duration(&v))? + .apply(&mut checkpoint_timeout); patch .compaction_target_size .apply(&mut compaction_target_size); - patch.compaction_period.apply(&mut compaction_period); + patch + .compaction_period + .map(|v| humantime::parse_duration(&v))? + .apply(&mut compaction_period); patch.compaction_threshold.apply(&mut compaction_threshold); patch .compaction_upper_limit @@ -626,15 +655,25 @@ impl TenantConfig { .apply(&mut l0_flush_stall_threshold); patch.l0_flush_wait_upload.apply(&mut l0_flush_wait_upload); patch.gc_horizon.apply(&mut gc_horizon); - patch.gc_period.apply(&mut gc_period); + patch + .gc_period + .map(|v| humantime::parse_duration(&v))? + .apply(&mut gc_period); patch .image_creation_threshold .apply(&mut image_creation_threshold); - patch.pitr_interval.apply(&mut pitr_interval); + patch + .pitr_interval + .map(|v| humantime::parse_duration(&v))? + .apply(&mut pitr_interval); patch .walreceiver_connect_timeout + .map(|v| humantime::parse_duration(&v))? .apply(&mut walreceiver_connect_timeout); - patch.lagging_wal_timeout.apply(&mut lagging_wal_timeout); + patch + .lagging_wal_timeout + .map(|v| humantime::parse_duration(&v))? + .apply(&mut lagging_wal_timeout); patch.max_lsn_wal_lag.apply(&mut max_lsn_wal_lag); patch.eviction_policy.apply(&mut eviction_policy); patch @@ -642,8 +681,12 @@ impl TenantConfig { .apply(&mut min_resident_size_override); patch .evictions_low_residence_duration_metric_threshold + .map(|v| humantime::parse_duration(&v))? .apply(&mut evictions_low_residence_duration_metric_threshold); - patch.heatmap_period.apply(&mut heatmap_period); + patch + .heatmap_period + .map(|v| humantime::parse_duration(&v))? + .apply(&mut heatmap_period); patch.lazy_slru_download.apply(&mut lazy_slru_download); patch .timeline_get_throttle @@ -654,9 +697,13 @@ impl TenantConfig { patch .image_creation_preempt_threshold .apply(&mut image_creation_preempt_threshold); - patch.lsn_lease_length.apply(&mut lsn_lease_length); + patch + .lsn_lease_length + .map(|v| humantime::parse_duration(&v))? + .apply(&mut lsn_lease_length); patch .lsn_lease_length_for_ts + .map(|v| humantime::parse_duration(&v))? 
.apply(&mut lsn_lease_length_for_ts); patch.timeline_offloading.apply(&mut timeline_offloading); patch @@ -673,7 +720,7 @@ impl TenantConfig { .gc_compaction_ratio_percent .apply(&mut gc_compaction_ratio_percent); - Self { + Ok(Self { checkpoint_distance, checkpoint_timeout, compaction_target_size, @@ -709,7 +756,7 @@ impl TenantConfig { gc_compaction_enabled, gc_compaction_initial_threshold_kb, gc_compaction_ratio_percent, - } + }) } } @@ -2503,7 +2550,7 @@ mod tests { ..base.clone() }; - let patched = base.apply_patch(decoded.config); + let patched = base.apply_patch(decoded.config).unwrap(); assert_eq!(patched, expected); } diff --git a/libs/postgres_connection/Cargo.toml b/libs/postgres_connection/Cargo.toml index 19027d13ff..462fb4a533 100644 --- a/libs/postgres_connection/Cargo.toml +++ b/libs/postgres_connection/Cargo.toml @@ -7,7 +7,6 @@ license.workspace = true [dependencies] anyhow.workspace = true itertools.workspace = true -postgres.workspace = true tokio-postgres.workspace = true url.workspace = true diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs index ddf9f7b610..e3d31c6cfc 100644 --- a/libs/postgres_connection/src/lib.rs +++ b/libs/postgres_connection/src/lib.rs @@ -171,10 +171,10 @@ impl PgConnectionConfig { tokio_postgres::Client, tokio_postgres::Connection, ), - postgres::Error, + tokio_postgres::Error, > { self.to_tokio_postgres_config() - .connect(postgres::NoTls) + .connect(tokio_postgres::NoTls) .await } } diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 0239b56d9c..301bc2f16e 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -278,7 +278,7 @@ pub fn generate_pg_control( checkpoint_bytes: &[u8], lsn: Lsn, pg_version: u32, -) -> anyhow::Result<(Bytes, u64)> { +) -> anyhow::Result<(Bytes, u64, bool)> { dispatch_pgversion!( pg_version, pgv::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn), diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 852b20eace..14fb1f2a1f 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -124,23 +124,59 @@ pub fn normalize_lsn(lsn: Lsn, seg_sz: usize) -> Lsn { } } +/// Generate a pg_control file, for a basebackup for starting up Postgres at the given LSN +/// +/// 'pg_control_bytes' and 'checkpoint_bytes' are the contents of those keys persisted in +/// the pageserver. They use the same format as the PostgreSQL control file and the +/// checkpoint record, but see walingest.rs for how exactly they are kept up to date. +/// 'lsn' is the LSN at which we're starting up. +/// +/// Returns: +/// - pg_control file contents +/// - system_identifier, extracted from the persisted information +/// - true, if we're starting up from a "clean shutdown", i.e. if there was a shutdown +/// checkpoint at the given LSN pub fn generate_pg_control( pg_control_bytes: &[u8], checkpoint_bytes: &[u8], lsn: Lsn, -) -> anyhow::Result<(Bytes, u64)> { +) -> anyhow::Result<(Bytes, u64, bool)> { let mut pg_control = ControlFileData::decode(pg_control_bytes)?; let mut checkpoint = CheckPoint::decode(checkpoint_bytes)?; // Generate new pg_control needed for bootstrap + // + // NB: In the checkpoint struct that we persist in the pageserver, we have a different + // convention for the 'redo' field than in PostgreSQL: On a shutdown checkpoint, + // 'redo' points the *end* of the checkpoint WAL record. On PostgreSQL, it points to + // the beginning. 
Furthermore, on an online checkpoint, 'redo' is set to 0. + // + // We didn't always have this convention however, and old persisted records will have + // old REDO values that point to some old LSN. + // + // The upshot is that if 'redo' is equal to the "current" LSN, there was a shutdown + // checkpoint record at that point in WAL, with no new WAL records after it. That case + // can be treated as starting from a clean shutdown. All other cases are treated as + // non-clean shutdown. In Neon, we don't do WAL replay at startup in either case, so + // that distinction doesn't matter very much. As of this writing, it only affects + // whether the persisted pg_stats information can be used or not. + // + // In the Checkpoint struct in the returned pg_control file, the redo pointer is + // always set to the LSN we're starting at, to hint that no WAL replay is required. + // (There's some neon-specific code in Postgres startup to make that work, though. + // Just setting the redo pointer is not sufficient.) + let was_shutdown = Lsn(checkpoint.redo) == lsn; checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0; - //save new values in pg_control + // We use DBState_DB_SHUTDOWNED even if it was not a clean shutdown. The + // neon-specific code at postgres startup ignores the state stored in the control + // file, similar to archive recovery in standalone PostgreSQL. Similarly, the + // checkPoint pointer is ignored, so just set it to 0. pg_control.checkPoint = 0; pg_control.checkPointCopy = checkpoint; pg_control.state = DBState_DB_SHUTDOWNED; - Ok((pg_control.encode(), pg_control.system_identifier)) + Ok((pg_control.encode(), pg_control.system_identifier, was_shutdown)) } pub fn get_current_timestamp() -> TimestampTz { diff --git a/libs/proxy/postgres-protocol2/Cargo.toml b/libs/proxy/postgres-protocol2/Cargo.toml index f66a292d5e..7ebb05eec1 100644 --- a/libs/proxy/postgres-protocol2/Cargo.toml +++ b/libs/proxy/postgres-protocol2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "postgres-protocol2" version = "0.1.0" -edition = "2021" +edition = "2024" license = "MIT/Apache-2.0" [dependencies] diff --git a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs index f2200a40ce..27e05e24ec 100644 --- a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs +++ b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs @@ -1,14 +1,12 @@ //! SASL-based authentication support. 
+use std::fmt::Write; +use std::{io, iter, mem, str}; + use hmac::{Hmac, Mac}; use rand::{self, Rng}; use sha2::digest::FixedOutput; use sha2::{Digest, Sha256}; -use std::fmt::Write; -use std::io; -use std::iter; -use std::mem; -use std::str; use tokio::task::yield_now; const NONCE_LENGTH: usize = 24; @@ -493,11 +491,9 @@ mod test { let nonce = "9IZ2O01zb9IgiIZ1WJ/zgpJB"; let client_first = "n,,n=,r=9IZ2O01zb9IgiIZ1WJ/zgpJB"; - let server_first = - "r=9IZ2O01zb9IgiIZ1WJ/zgpJBjx/oIRLs02gGSHcw1KEty3eY,s=fs3IXBy7U7+IvVjZ,i\ + let server_first = "r=9IZ2O01zb9IgiIZ1WJ/zgpJBjx/oIRLs02gGSHcw1KEty3eY,s=fs3IXBy7U7+IvVjZ,i\ =4096"; - let client_final = - "c=biws,r=9IZ2O01zb9IgiIZ1WJ/zgpJBjx/oIRLs02gGSHcw1KEty3eY,p=AmNKosjJzS3\ + let client_final = "c=biws,r=9IZ2O01zb9IgiIZ1WJ/zgpJBjx/oIRLs02gGSHcw1KEty3eY,p=AmNKosjJzS3\ 1NTlQYNs5BTeQjdHdk7lOflDo5re2an8="; let server_final = "v=U+ppxD5XUKtradnv8e2MkeupiA8FU87Sg8CXzXHDAzw="; diff --git a/libs/proxy/postgres-protocol2/src/lib.rs b/libs/proxy/postgres-protocol2/src/lib.rs index 6032440f9a..afbd1e92bd 100644 --- a/libs/proxy/postgres-protocol2/src/lib.rs +++ b/libs/proxy/postgres-protocol2/src/lib.rs @@ -11,9 +11,10 @@ //! set to `UTF8`. It will most likely not behave properly if that is not the case. #![warn(missing_docs, clippy::all)] +use std::io; + use byteorder::{BigEndian, ByteOrder}; use bytes::{BufMut, BytesMut}; -use std::io; pub mod authentication; pub mod escape; diff --git a/libs/proxy/postgres-protocol2/src/message/backend.rs b/libs/proxy/postgres-protocol2/src/message/backend.rs index 097964f9c1..d7eaef9509 100644 --- a/libs/proxy/postgres-protocol2/src/message/backend.rs +++ b/libs/proxy/postgres-protocol2/src/message/backend.rs @@ -1,13 +1,13 @@ #![allow(missing_docs)] +use std::io::{self, Read}; +use std::ops::Range; +use std::{cmp, str}; + use byteorder::{BigEndian, ByteOrder, ReadBytesExt}; use bytes::{Bytes, BytesMut}; use fallible_iterator::FallibleIterator; use memchr::memchr; -use std::cmp; -use std::io::{self, Read}; -use std::ops::Range; -use std::str; use crate::Oid; diff --git a/libs/proxy/postgres-protocol2/src/message/frontend.rs b/libs/proxy/postgres-protocol2/src/message/frontend.rs index 640f35ada3..b447290ea8 100644 --- a/libs/proxy/postgres-protocol2/src/message/frontend.rs +++ b/libs/proxy/postgres-protocol2/src/message/frontend.rs @@ -1,13 +1,13 @@ //! Frontend message serialization. #![allow(missing_docs)] +use std::error::Error; +use std::{io, marker}; + use byteorder::{BigEndian, ByteOrder}; use bytes::{Buf, BufMut, BytesMut}; -use std::error::Error; -use std::io; -use std::marker; -use crate::{write_nullable, FromUsize, IsNull, Oid}; +use crate::{FromUsize, IsNull, Oid, write_nullable}; #[inline] fn write_body(buf: &mut BytesMut, f: F) -> Result<(), E> diff --git a/libs/proxy/postgres-protocol2/src/password/mod.rs b/libs/proxy/postgres-protocol2/src/password/mod.rs index 38eb31dfcf..4cd9bfb060 100644 --- a/libs/proxy/postgres-protocol2/src/password/mod.rs +++ b/libs/proxy/postgres-protocol2/src/password/mod.rs @@ -6,12 +6,13 @@ //! side. This is good because it ensures the cleartext password won't //! end up in logs pg_stat displays, etc. 
-use crate::authentication::sasl; use hmac::{Hmac, Mac}; use rand::RngCore; use sha2::digest::FixedOutput; use sha2::{Digest, Sha256}; +use crate::authentication::sasl; + #[cfg(test)] mod test; diff --git a/libs/proxy/postgres-protocol2/src/types/mod.rs b/libs/proxy/postgres-protocol2/src/types/mod.rs index 78131c05bf..6a9b334bcb 100644 --- a/libs/proxy/postgres-protocol2/src/types/mod.rs +++ b/libs/proxy/postgres-protocol2/src/types/mod.rs @@ -1,11 +1,12 @@ //! Conversions to and from Postgres's binary format for various types. -use byteorder::{BigEndian, ReadBytesExt}; -use bytes::{BufMut, BytesMut}; -use fallible_iterator::FallibleIterator; use std::boxed::Box as StdBox; use std::error::Error; use std::str; +use byteorder::{BigEndian, ReadBytesExt}; +use bytes::{BufMut, BytesMut}; +use fallible_iterator::FallibleIterator; + use crate::Oid; #[cfg(test)] diff --git a/libs/proxy/postgres-types2/Cargo.toml b/libs/proxy/postgres-types2/Cargo.toml index 57efd94cd3..25ad23ba35 100644 --- a/libs/proxy/postgres-types2/Cargo.toml +++ b/libs/proxy/postgres-types2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "postgres-types2" version = "0.1.0" -edition = "2021" +edition = "2024" license = "MIT/Apache-2.0" [dependencies] diff --git a/libs/proxy/postgres-types2/src/lib.rs b/libs/proxy/postgres-types2/src/lib.rs index d4f3afdfd4..0ccd8c295f 100644 --- a/libs/proxy/postgres-types2/src/lib.rs +++ b/libs/proxy/postgres-types2/src/lib.rs @@ -4,19 +4,18 @@ //! unless you want to define your own `ToSql` or `FromSql` definitions. #![warn(clippy::all, missing_docs)] -use fallible_iterator::FallibleIterator; -use postgres_protocol2::types; use std::any::type_name; use std::error::Error; use std::fmt; use std::sync::Arc; -use crate::type_gen::{Inner, Other}; - +use bytes::BytesMut; +use fallible_iterator::FallibleIterator; #[doc(inline)] pub use postgres_protocol2::Oid; +use postgres_protocol2::types; -use bytes::BytesMut; +use crate::type_gen::{Inner, Other}; /// Generates a simple implementation of `ToSql::accepts` which accepts the /// types passed to it. 
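For context on the `TenantConfig` changes in `libs/pageserver_api/src/models.rs` earlier in this diff: the duration-typed fields now use `humantime_serde`, and the patch path goes through `humantime::parse_duration`. Below is a minimal, self-contained sketch of that round-trip behavior. The struct and field names are illustrative only (not the real `TenantConfig`), and it assumes `serde`, `serde_json`, `humantime`, `humantime_serde`, and `anyhow` are available as dependencies.

```rust
use std::time::Duration;

use serde::{Deserialize, Serialize};

// Illustrative struct; the real TenantConfig has many more fields.
#[derive(Serialize, Deserialize, Debug)]
struct DurationsDemo {
    #[serde(default)]
    #[serde(with = "humantime_serde")]
    compaction_period: Option<Duration>,
}

fn main() -> anyhow::Result<()> {
    // Deserialization accepts humantime strings such as "20s" or "10m 30s".
    let parsed: DurationsDemo = serde_json::from_str(r#"{"compaction_period":"20s"}"#)?;
    assert_eq!(parsed.compaction_period, Some(Duration::from_secs(20)));

    // A missing field falls back to None thanks to #[serde(default)].
    let empty: DurationsDemo = serde_json::from_str("{}")?;
    assert_eq!(empty.compaction_period, None);

    // The apply_patch path parses raw strings with humantime::parse_duration.
    let d = humantime::parse_duration("1h 30m")?;
    assert_eq!(d, Duration::from_secs(5400));
    Ok(())
}
```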
diff --git a/libs/proxy/postgres-types2/src/private.rs b/libs/proxy/postgres-types2/src/private.rs index 774f9a301c..188b982812 100644 --- a/libs/proxy/postgres-types2/src/private.rs +++ b/libs/proxy/postgres-types2/src/private.rs @@ -1,7 +1,9 @@ -use crate::{FromSql, Type}; -pub use bytes::BytesMut; use std::error::Error; +pub use bytes::BytesMut; + +use crate::{FromSql, Type}; + pub fn read_be_i32(buf: &mut &[u8]) -> Result> { if buf.len() < 4 { return Err("invalid buffer size".into()); diff --git a/libs/proxy/tokio-postgres2/Cargo.toml b/libs/proxy/tokio-postgres2/Cargo.toml index ade0ffc9f6..540876742f 100644 --- a/libs/proxy/tokio-postgres2/Cargo.toml +++ b/libs/proxy/tokio-postgres2/Cargo.toml @@ -1,22 +1,19 @@ [package] name = "tokio-postgres2" version = "0.1.0" -edition = "2021" +edition = "2024" license = "MIT/Apache-2.0" [dependencies] -async-trait.workspace = true bytes.workspace = true -byteorder.workspace = true fallible-iterator.workspace = true futures-util = { workspace = true, features = ["sink"] } log = "0.4" parking_lot.workspace = true -percent-encoding = "2.0" pin-project-lite.workspace = true phf = "0.11" postgres-protocol2 = { path = "../postgres-protocol2" } postgres-types2 = { path = "../postgres-types2" } tokio = { workspace = true, features = ["io-util", "time", "net"] } tokio-util = { workspace = true, features = ["codec"] } -serde = { workspace = true, features = ["derive"] } \ No newline at end of file +serde = { workspace = true, features = ["derive"] } diff --git a/libs/proxy/tokio-postgres2/src/cancel_query.rs b/libs/proxy/tokio-postgres2/src/cancel_query.rs index cddbf16336..b65fb571e6 100644 --- a/libs/proxy/tokio-postgres2/src/cancel_query.rs +++ b/libs/proxy/tokio-postgres2/src/cancel_query.rs @@ -1,10 +1,11 @@ +use std::io; + use tokio::net::TcpStream; use crate::client::SocketConfig; use crate::config::{Host, SslMode}; use crate::tls::MakeTlsConnect; -use crate::{cancel_query_raw, connect_socket, Error}; -use std::io; +use crate::{Error, cancel_query_raw, connect_socket}; pub(crate) async fn cancel_query( config: Option, @@ -22,7 +23,7 @@ where return Err(Error::connect(io::Error::new( io::ErrorKind::InvalidInput, "unknown host", - ))) + ))); } }; diff --git a/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs b/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs index 8c08296435..c720214e9b 100644 --- a/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs +++ b/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs @@ -1,10 +1,11 @@ -use crate::config::SslMode; -use crate::tls::TlsConnect; -use crate::{connect_tls, Error}; use bytes::BytesMut; use postgres_protocol2::message::frontend; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; +use crate::config::SslMode; +use crate::tls::TlsConnect; +use crate::{Error, connect_tls}; + pub async fn cancel_query_raw( stream: S, mode: SslMode, diff --git a/libs/proxy/tokio-postgres2/src/cancel_token.rs b/libs/proxy/tokio-postgres2/src/cancel_token.rs index 718f903a92..f6526395ee 100644 --- a/libs/proxy/tokio-postgres2/src/cancel_token.rs +++ b/libs/proxy/tokio-postgres2/src/cancel_token.rs @@ -1,12 +1,12 @@ -use crate::config::SslMode; -use crate::tls::TlsConnect; - -use crate::{cancel_query, client::SocketConfig, tls::MakeTlsConnect}; -use crate::{cancel_query_raw, Error}; use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::TcpStream; +use crate::client::SocketConfig; +use crate::config::SslMode; +use crate::tls::{MakeTlsConnect, TlsConnect}; +use crate::{Error, 
cancel_query, cancel_query_raw}; + /// The capability to request cancellation of in-progress queries on a /// connection. #[derive(Clone, Serialize, Deserialize)] diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs index 46151ab924..39b1db75da 100644 --- a/libs/proxy/tokio-postgres2/src/client.rs +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -1,31 +1,28 @@ -use crate::codec::{BackendMessages, FrontendMessage}; - -use crate::config::Host; -use crate::config::SslMode; -use crate::connection::{Request, RequestMessages}; - -use crate::query::RowStream; -use crate::simple_query::SimpleQueryStream; - -use crate::types::{Oid, ToSql, Type}; - -use crate::{ - query, simple_query, slice_iter, CancelToken, Error, ReadyForQueryStatus, Row, - SimpleQueryMessage, Statement, Transaction, TransactionBuilder, -}; -use bytes::BytesMut; -use fallible_iterator::FallibleIterator; -use futures_util::{future, ready, TryStreamExt}; -use parking_lot::Mutex; -use postgres_protocol2::message::{backend::Message, frontend}; -use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::fmt; use std::sync::Arc; use std::task::{Context, Poll}; +use std::time::Duration; + +use bytes::BytesMut; +use fallible_iterator::FallibleIterator; +use futures_util::{TryStreamExt, future, ready}; +use parking_lot::Mutex; +use postgres_protocol2::message::backend::Message; +use postgres_protocol2::message::frontend; +use serde::{Deserialize, Serialize}; use tokio::sync::mpsc; -use std::time::Duration; +use crate::codec::{BackendMessages, FrontendMessage}; +use crate::config::{Host, SslMode}; +use crate::connection::{Request, RequestMessages}; +use crate::query::RowStream; +use crate::simple_query::SimpleQueryStream; +use crate::types::{Oid, ToSql, Type}; +use crate::{ + CancelToken, Error, ReadyForQueryStatus, Row, SimpleQueryMessage, Statement, Transaction, + TransactionBuilder, query, simple_query, slice_iter, +}; pub struct Responses { receiver: mpsc::Receiver, diff --git a/libs/proxy/tokio-postgres2/src/codec.rs b/libs/proxy/tokio-postgres2/src/codec.rs index 0ec46198ce..f1fd9b47b3 100644 --- a/libs/proxy/tokio-postgres2/src/codec.rs +++ b/libs/proxy/tokio-postgres2/src/codec.rs @@ -1,8 +1,9 @@ +use std::io; + use bytes::{Buf, Bytes, BytesMut}; use fallible_iterator::FallibleIterator; use postgres_protocol2::message::backend; use postgres_protocol2::message::frontend::CopyData; -use std::io; use tokio_util::codec::{Decoder, Encoder}; pub enum FrontendMessage { diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs index 47cc45ac80..4c25491b67 100644 --- a/libs/proxy/tokio-postgres2/src/config.rs +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -1,21 +1,19 @@ //! Connection configuration. 
-use crate::connect::connect; -use crate::connect_raw::connect_raw; -use crate::connect_raw::RawConnection; -use crate::tls::MakeTlsConnect; -use crate::tls::TlsConnect; -use crate::{Client, Connection, Error}; -use postgres_protocol2::message::frontend::StartupMessageParams; -use serde::{Deserialize, Serialize}; -use std::fmt; -use std::str; use std::time::Duration; -use tokio::io::{AsyncRead, AsyncWrite}; +use std::{fmt, str}; pub use postgres_protocol2::authentication::sasl::ScramKeys; +use postgres_protocol2::message::frontend::StartupMessageParams; +use serde::{Deserialize, Serialize}; +use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::TcpStream; +use crate::connect::connect; +use crate::connect_raw::{RawConnection, connect_raw}; +use crate::tls::{MakeTlsConnect, TlsConnect}; +use crate::{Client, Connection, Error}; + /// TLS configuration. #[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)] #[non_exhaustive] diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs index e0cb69748d..d2bd0dfbcd 100644 --- a/libs/proxy/tokio-postgres2/src/connect.rs +++ b/libs/proxy/tokio-postgres2/src/connect.rs @@ -1,3 +1,7 @@ +use postgres_protocol2::message::backend::Message; +use tokio::net::TcpStream; +use tokio::sync::mpsc; + use crate::client::SocketConfig; use crate::codec::BackendMessage; use crate::config::Host; @@ -5,9 +9,6 @@ use crate::connect_raw::connect_raw; use crate::connect_socket::connect_socket; use crate::tls::{MakeTlsConnect, TlsConnect}; use crate::{Client, Config, Connection, Error, RawConnection}; -use postgres_protocol2::message::backend::Message; -use tokio::net::TcpStream; -use tokio::sync::mpsc; pub async fn connect( mut tls: T, diff --git a/libs/proxy/tokio-postgres2/src/connect_raw.rs b/libs/proxy/tokio-postgres2/src/connect_raw.rs index 66db85e07d..20dc538cf2 100644 --- a/libs/proxy/tokio-postgres2/src/connect_raw.rs +++ b/libs/proxy/tokio-postgres2/src/connect_raw.rs @@ -1,22 +1,24 @@ +use std::collections::HashMap; +use std::io; +use std::pin::Pin; +use std::task::{Context, Poll}; + +use bytes::BytesMut; +use fallible_iterator::FallibleIterator; +use futures_util::{Sink, SinkExt, Stream, TryStreamExt, ready}; +use postgres_protocol2::authentication::sasl; +use postgres_protocol2::authentication::sasl::ScramSha256; +use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message, NoticeResponseBody}; +use postgres_protocol2::message::frontend; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio_util::codec::Framed; + +use crate::Error; use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec}; use crate::config::{self, AuthKeys, Config}; use crate::connect_tls::connect_tls; use crate::maybe_tls_stream::MaybeTlsStream; use crate::tls::{TlsConnect, TlsStream}; -use crate::Error; -use bytes::BytesMut; -use fallible_iterator::FallibleIterator; -use futures_util::{ready, Sink, SinkExt, Stream, TryStreamExt}; -use postgres_protocol2::authentication::sasl; -use postgres_protocol2::authentication::sasl::ScramSha256; -use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message, NoticeResponseBody}; -use postgres_protocol2::message::frontend; -use std::collections::HashMap; -use std::io; -use std::pin::Pin; -use std::task::{Context, Poll}; -use tokio::io::{AsyncRead, AsyncWrite}; -use tokio_util::codec::Framed; pub struct StartupStream { inner: Framed, PostgresCodec>, @@ -158,7 +160,7 @@ where | Some(Message::AuthenticationSspi) => { return 
Err(Error::authentication( "unsupported authentication method".into(), - )) + )); } Some(Message::ErrorResponse(body)) => return Err(Error::db(body)), Some(_) => return Err(Error::unexpected_message()), diff --git a/libs/proxy/tokio-postgres2/src/connect_socket.rs b/libs/proxy/tokio-postgres2/src/connect_socket.rs index 336a13317f..15411f7ef3 100644 --- a/libs/proxy/tokio-postgres2/src/connect_socket.rs +++ b/libs/proxy/tokio-postgres2/src/connect_socket.rs @@ -1,11 +1,13 @@ -use crate::config::Host; -use crate::Error; use std::future::Future; use std::io; use std::time::Duration; + use tokio::net::{self, TcpStream}; use tokio::time; +use crate::Error; +use crate::config::Host; + pub(crate) async fn connect_socket( host: &Host, port: u16, diff --git a/libs/proxy/tokio-postgres2/src/connect_tls.rs b/libs/proxy/tokio-postgres2/src/connect_tls.rs index 64b0b68abc..4dc929a9e2 100644 --- a/libs/proxy/tokio-postgres2/src/connect_tls.rs +++ b/libs/proxy/tokio-postgres2/src/connect_tls.rs @@ -1,12 +1,13 @@ -use crate::config::SslMode; -use crate::maybe_tls_stream::MaybeTlsStream; -use crate::tls::private::ForcePrivateApi; -use crate::tls::TlsConnect; -use crate::Error; use bytes::BytesMut; use postgres_protocol2::message::frontend; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; +use crate::Error; +use crate::config::SslMode; +use crate::maybe_tls_stream::MaybeTlsStream; +use crate::tls::TlsConnect; +use crate::tls::private::ForcePrivateApi; + pub async fn connect_tls( mut stream: S, mode: SslMode, @@ -19,7 +20,7 @@ where match mode { SslMode::Disable => return Ok(MaybeTlsStream::Raw(stream)), SslMode::Prefer if !tls.can_connect(ForcePrivateApi) => { - return Ok(MaybeTlsStream::Raw(stream)) + return Ok(MaybeTlsStream::Raw(stream)); } SslMode::Prefer | SslMode::Require => {} } diff --git a/libs/proxy/tokio-postgres2/src/connection.rs b/libs/proxy/tokio-postgres2/src/connection.rs index f478717e0d..60e39b3b44 100644 --- a/libs/proxy/tokio-postgres2/src/connection.rs +++ b/libs/proxy/tokio-postgres2/src/connection.rs @@ -1,22 +1,24 @@ -use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec}; -use crate::error::DbError; -use crate::maybe_tls_stream::MaybeTlsStream; -use crate::{AsyncMessage, Error, Notification}; -use bytes::BytesMut; -use fallible_iterator::FallibleIterator; -use futures_util::{ready, Sink, Stream}; -use log::{info, trace}; -use postgres_protocol2::message::backend::Message; -use postgres_protocol2::message::frontend; use std::collections::{HashMap, VecDeque}; use std::future::Future; use std::pin::Pin; use std::task::{Context, Poll}; + +use bytes::BytesMut; +use fallible_iterator::FallibleIterator; +use futures_util::{Sink, Stream, ready}; +use log::{info, trace}; +use postgres_protocol2::message::backend::Message; +use postgres_protocol2::message::frontend; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::sync::mpsc; use tokio_util::codec::Framed; use tokio_util::sync::PollSender; +use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec}; +use crate::error::DbError; +use crate::maybe_tls_stream::MaybeTlsStream; +use crate::{AsyncMessage, Error, Notification}; + pub enum RequestMessages { Single(FrontendMessage), } @@ -139,7 +141,7 @@ where Some(response) => response, None => match messages.next().map_err(Error::parse)? 
{ Some(Message::ErrorResponse(error)) => { - return Poll::Ready(Err(Error::db(error))) + return Poll::Ready(Err(Error::db(error))); } _ => return Poll::Ready(Err(Error::unexpected_message())), }, diff --git a/libs/proxy/tokio-postgres2/src/error/mod.rs b/libs/proxy/tokio-postgres2/src/error/mod.rs index 922c348525..b12e76e5bf 100644 --- a/libs/proxy/tokio-postgres2/src/error/mod.rs +++ b/libs/proxy/tokio-postgres2/src/error/mod.rs @@ -1,10 +1,10 @@ //! Errors. +use std::error::{self, Error as _Error}; +use std::{fmt, io}; + use fallible_iterator::FallibleIterator; use postgres_protocol2::message::backend::{ErrorFields, ErrorResponseBody}; -use std::error::{self, Error as _Error}; -use std::fmt; -use std::io; pub use self::sqlstate::*; diff --git a/libs/proxy/tokio-postgres2/src/generic_client.rs b/libs/proxy/tokio-postgres2/src/generic_client.rs index 042b5a675e..31c3d8fa3e 100644 --- a/libs/proxy/tokio-postgres2/src/generic_client.rs +++ b/libs/proxy/tokio-postgres2/src/generic_client.rs @@ -1,9 +1,10 @@ #![allow(async_fn_in_trait)] +use postgres_protocol2::Oid; + use crate::query::RowStream; use crate::types::Type; use crate::{Client, Error, Transaction}; -use postgres_protocol2::Oid; mod private { pub trait Sealed {} diff --git a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs index 7426279167..c8ebba5487 100644 --- a/libs/proxy/tokio-postgres2/src/lib.rs +++ b/libs/proxy/tokio-postgres2/src/lib.rs @@ -1,6 +1,8 @@ //! An asynchronous, pipelined, PostgreSQL client. #![warn(clippy::all)] +use postgres_protocol2::message::backend::ReadyForQueryBody; + pub use crate::cancel_token::CancelToken; pub use crate::client::{Client, SocketConfig}; pub use crate::config::Config; @@ -17,7 +19,6 @@ pub use crate::tls::NoTls; pub use crate::transaction::Transaction; pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder}; use crate::types::ToSql; -use postgres_protocol2::message::backend::ReadyForQueryBody; /// After executing a query, the connection will be in one of these states #[derive(Clone, Copy, Debug, PartialEq)] diff --git a/libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs b/libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs index 9a7e248997..4aa838613e 100644 --- a/libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs +++ b/libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs @@ -1,12 +1,14 @@ //! MaybeTlsStream. //! //! Represents a stream that may or may not be encrypted with TLS. -use crate::tls::{ChannelBinding, TlsStream}; use std::io; use std::pin::Pin; use std::task::{Context, Poll}; + use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; +use crate::tls::{ChannelBinding, TlsStream}; + /// A stream that may or may not be encrypted with TLS. pub enum MaybeTlsStream { /// An unencrypted stream. 
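As a side note on the `connect_tls.rs` hunk above: the early-return cases reduce to a small decision on the SSL mode. The snippet below is a standalone mirror of that match with hypothetical helper types, not the crate's API; the real function operates on the I/O stream and a `TlsConnect` implementation, and the handling of the server's response to the TLS request is outside this sketch.

```rust
// Hypothetical stand-ins for illustration; see connect_tls.rs for the real types.
#[derive(Debug, Clone, Copy)]
enum SslMode {
    Disable,
    Prefer,
    Require,
}

#[derive(Debug, PartialEq)]
enum Negotiated {
    Raw,
    Tls,
}

fn negotiate(mode: SslMode, tls_available: bool) -> Negotiated {
    match mode {
        // Never attempt TLS.
        SslMode::Disable => Negotiated::Raw,
        // Prefer falls back to plaintext if the connector cannot do TLS at all.
        SslMode::Prefer if !tls_available => Negotiated::Raw,
        // Both remaining modes go on to request TLS from the server; the real code
        // then errors or falls back depending on the server's reply (not shown here).
        SslMode::Prefer | SslMode::Require => Negotiated::Tls,
    }
}

fn main() {
    assert_eq!(negotiate(SslMode::Disable, true), Negotiated::Raw);
    assert_eq!(negotiate(SslMode::Prefer, false), Negotiated::Raw);
    assert_eq!(negotiate(SslMode::Prefer, true), Negotiated::Tls);
    assert_eq!(negotiate(SslMode::Require, false), Negotiated::Tls);
}
```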
diff --git a/libs/proxy/tokio-postgres2/src/prepare.rs b/libs/proxy/tokio-postgres2/src/prepare.rs index 58bbb26cbc..b36d2e5f74 100644 --- a/libs/proxy/tokio-postgres2/src/prepare.rs +++ b/libs/proxy/tokio-postgres2/src/prepare.rs @@ -1,18 +1,19 @@ +use std::future::Future; +use std::pin::Pin; +use std::sync::Arc; + +use bytes::Bytes; +use fallible_iterator::FallibleIterator; +use futures_util::{TryStreamExt, pin_mut}; +use log::debug; +use postgres_protocol2::message::backend::Message; +use postgres_protocol2::message::frontend; + use crate::client::InnerClient; use crate::codec::FrontendMessage; use crate::connection::RequestMessages; use crate::types::{Field, Kind, Oid, Type}; -use crate::{query, slice_iter}; -use crate::{Column, Error, Statement}; -use bytes::Bytes; -use fallible_iterator::FallibleIterator; -use futures_util::{pin_mut, TryStreamExt}; -use log::debug; -use postgres_protocol2::message::backend::Message; -use postgres_protocol2::message::frontend; -use std::future::Future; -use std::pin::Pin; -use std::sync::Arc; +use crate::{Column, Error, Statement, query, slice_iter}; pub(crate) const TYPEINFO_QUERY: &str = "\ SELECT t.typname, t.typtype, t.typelem, r.rngsubtype, t.typbasetype, n.nspname, t.typrelid diff --git a/libs/proxy/tokio-postgres2/src/query.rs b/libs/proxy/tokio-postgres2/src/query.rs index e21631c85d..29f05fba79 100644 --- a/libs/proxy/tokio-postgres2/src/query.rs +++ b/libs/proxy/tokio-postgres2/src/query.rs @@ -1,22 +1,24 @@ -use crate::client::{InnerClient, Responses}; -use crate::codec::FrontendMessage; -use crate::connection::RequestMessages; -use crate::types::IsNull; -use crate::{Column, Error, ReadyForQueryStatus, Row, Statement}; -use bytes::{BufMut, Bytes, BytesMut}; -use fallible_iterator::FallibleIterator; -use futures_util::{ready, Stream}; -use log::{debug, log_enabled, Level}; -use pin_project_lite::pin_project; -use postgres_protocol2::message::backend::Message; -use postgres_protocol2::message::frontend; -use postgres_types2::{Format, ToSql, Type}; use std::fmt; use std::marker::PhantomPinned; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; +use bytes::{BufMut, Bytes, BytesMut}; +use fallible_iterator::FallibleIterator; +use futures_util::{Stream, ready}; +use log::{Level, debug, log_enabled}; +use pin_project_lite::pin_project; +use postgres_protocol2::message::backend::Message; +use postgres_protocol2::message::frontend; +use postgres_types2::{Format, ToSql, Type}; + +use crate::client::{InnerClient, Responses}; +use crate::codec::FrontendMessage; +use crate::connection::RequestMessages; +use crate::types::IsNull; +use crate::{Column, Error, ReadyForQueryStatus, Row, Statement}; + struct BorrowToSqlParamsDebug<'a>(&'a [&'a (dyn ToSql + Sync)]); impl fmt::Debug for BorrowToSqlParamsDebug<'_> { @@ -257,7 +259,7 @@ impl Stream for RowStream { this.statement.clone(), body, *this.output_format, - )?))) + )?))); } Message::EmptyQueryResponse | Message::PortalSuspended => {} Message::CommandComplete(body) => { diff --git a/libs/proxy/tokio-postgres2/src/row.rs b/libs/proxy/tokio-postgres2/src/row.rs index 10e130707d..5fc955eef4 100644 --- a/libs/proxy/tokio-postgres2/src/row.rs +++ b/libs/proxy/tokio-postgres2/src/row.rs @@ -1,17 +1,18 @@ //! Rows. 
+use std::ops::Range; +use std::sync::Arc; +use std::{fmt, str}; + +use fallible_iterator::FallibleIterator; +use postgres_protocol2::message::backend::DataRowBody; +use postgres_types2::{Format, WrongFormat}; + use crate::row::sealed::{AsName, Sealed}; use crate::simple_query::SimpleColumn; use crate::statement::Column; use crate::types::{FromSql, Type, WrongType}; use crate::{Error, Statement}; -use fallible_iterator::FallibleIterator; -use postgres_protocol2::message::backend::DataRowBody; -use postgres_types2::{Format, WrongFormat}; -use std::fmt; -use std::ops::Range; -use std::str; -use std::sync::Arc; mod sealed { pub trait Sealed {} diff --git a/libs/proxy/tokio-postgres2/src/simple_query.rs b/libs/proxy/tokio-postgres2/src/simple_query.rs index fb2550377b..f13d63983f 100644 --- a/libs/proxy/tokio-postgres2/src/simple_query.rs +++ b/libs/proxy/tokio-postgres2/src/simple_query.rs @@ -1,19 +1,21 @@ -use crate::client::{InnerClient, Responses}; -use crate::codec::FrontendMessage; -use crate::connection::RequestMessages; -use crate::{Error, ReadyForQueryStatus, SimpleQueryMessage, SimpleQueryRow}; -use bytes::Bytes; -use fallible_iterator::FallibleIterator; -use futures_util::{ready, Stream}; -use log::debug; -use pin_project_lite::pin_project; -use postgres_protocol2::message::backend::Message; -use postgres_protocol2::message::frontend; use std::marker::PhantomPinned; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; +use bytes::Bytes; +use fallible_iterator::FallibleIterator; +use futures_util::{Stream, ready}; +use log::debug; +use pin_project_lite::pin_project; +use postgres_protocol2::message::backend::Message; +use postgres_protocol2::message::frontend; + +use crate::client::{InnerClient, Responses}; +use crate::codec::FrontendMessage; +use crate::connection::RequestMessages; +use crate::{Error, ReadyForQueryStatus, SimpleQueryMessage, SimpleQueryRow}; + /// Information about a column of a single query row. 
#[derive(Debug)] pub struct SimpleColumn { diff --git a/libs/proxy/tokio-postgres2/src/statement.rs b/libs/proxy/tokio-postgres2/src/statement.rs index 591872fbc5..e4828db712 100644 --- a/libs/proxy/tokio-postgres2/src/statement.rs +++ b/libs/proxy/tokio-postgres2/src/statement.rs @@ -1,15 +1,14 @@ +use std::fmt; +use std::sync::{Arc, Weak}; + +use postgres_protocol2::Oid; +use postgres_protocol2::message::backend::Field; +use postgres_protocol2::message::frontend; + use crate::client::InnerClient; use crate::codec::FrontendMessage; use crate::connection::RequestMessages; use crate::types::Type; -use postgres_protocol2::{ - message::{backend::Field, frontend}, - Oid, -}; -use std::{ - fmt, - sync::{Arc, Weak}, -}; struct StatementInner { client: Weak, diff --git a/libs/proxy/tokio-postgres2/src/tls.rs b/libs/proxy/tokio-postgres2/src/tls.rs index dc8140719f..41b51368ff 100644 --- a/libs/proxy/tokio-postgres2/src/tls.rs +++ b/libs/proxy/tokio-postgres2/src/tls.rs @@ -5,6 +5,7 @@ use std::future::Future; use std::pin::Pin; use std::task::{Context, Poll}; use std::{fmt, io}; + use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; pub(crate) mod private { diff --git a/libs/proxy/tokio-postgres2/src/transaction.rs b/libs/proxy/tokio-postgres2/src/transaction.rs index 03a57e4947..eecbfc5873 100644 --- a/libs/proxy/tokio-postgres2/src/transaction.rs +++ b/libs/proxy/tokio-postgres2/src/transaction.rs @@ -1,8 +1,9 @@ +use postgres_protocol2::message::frontend; + use crate::codec::FrontendMessage; use crate::connection::RequestMessages; use crate::query::RowStream; use crate::{CancelToken, Client, Error, ReadyForQueryStatus}; -use postgres_protocol2::message::frontend; /// A representation of a PostgreSQL database transaction. /// diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index e9611a0f12..5020d82adf 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -24,11 +24,10 @@ diatomic-waker.workspace = true git-version.workspace = true hex = { workspace = true, features = ["serde"] } humantime.workspace = true -inferno.workspace = true fail.workspace = true futures = { workspace = true } jsonwebtoken.workspace = true -nix = {workspace = true, features = [ "ioctl" ] } +nix = { workspace = true, features = ["ioctl"] } once_cell.workspace = true pin-project-lite.workspace = true regex.workspace = true @@ -62,6 +61,7 @@ bytes.workspace = true criterion.workspace = true hex-literal.workspace = true camino-tempfile.workspace = true +pprof.workspace = true serde_assert.workspace = true tokio = { workspace = true, features = ["test-util"] } diff --git a/libs/utils/benches/README.md b/libs/utils/benches/README.md new file mode 100644 index 0000000000..e23ec268c2 --- /dev/null +++ b/libs/utils/benches/README.md @@ -0,0 +1,26 @@ +## Utils Benchmarks + +To run benchmarks: + +```sh +# All benchmarks. +cargo bench --package utils + +# Specific file. +cargo bench --package utils --bench benchmarks + +# Specific benchmark. +cargo bench --package utils --bench benchmarks warn_slow/enabled=true + +# List available benchmarks. +cargo bench --package utils --benches -- --list + +# Generate flamegraph profiles using pprof-rs, profiling for 10 seconds. +# Output in target/criterion/*/profile/flamegraph.svg. +cargo bench --package utils --bench benchmarks warn_slow/enabled=true --profile-time 10 +``` + +Additional charts and statistics are available in `target/criterion/report/index.html`. + +Benchmarks are automatically compared against the previous run. 
To compare against other runs, see +`--baseline` and `--save-baseline`. \ No newline at end of file diff --git a/libs/utils/benches/benchmarks.rs b/libs/utils/benches/benchmarks.rs index 44eb36387c..cff3792f3a 100644 --- a/libs/utils/benches/benchmarks.rs +++ b/libs/utils/benches/benchmarks.rs @@ -1,5 +1,18 @@ -use criterion::{criterion_group, criterion_main, Criterion}; +use std::time::Duration; + +use criterion::{criterion_group, criterion_main, Bencher, Criterion}; +use pprof::criterion::{Output, PProfProfiler}; use utils::id; +use utils::logging::warn_slow; + +// Register benchmarks with Criterion. +criterion_group!( + name = benches; + config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = bench_id_stringify, + bench_warn_slow, +); +criterion_main!(benches); pub fn bench_id_stringify(c: &mut Criterion) { // Can only use public methods. @@ -16,5 +29,31 @@ pub fn bench_id_stringify(c: &mut Criterion) { }); } -criterion_group!(benches, bench_id_stringify); -criterion_main!(benches); +pub fn bench_warn_slow(c: &mut Criterion) { + for enabled in [false, true] { + c.bench_function(&format!("warn_slow/enabled={enabled}"), |b| { + run_bench(b, enabled).unwrap() + }); + } + + // The actual benchmark. + fn run_bench(b: &mut Bencher, enabled: bool) -> anyhow::Result<()> { + const THRESHOLD: Duration = Duration::from_secs(1); + + // Use a multi-threaded runtime to avoid thread parking overhead when yielding. + let runtime = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build()?; + + // Test both with and without warn_slow, since we're essentially measuring Tokio scheduling + // performance too. Use a simple noop future that yields once, to avoid any scheduler fast + // paths for a ready future. + if enabled { + b.iter(|| runtime.block_on(warn_slow("ready", THRESHOLD, tokio::task::yield_now()))); + } else { + b.iter(|| runtime.block_on(tokio::task::yield_now())); + } + + Ok(()) + } +} diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 4a6069294d..95c69ac8ba 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -1,9 +1,13 @@ +use std::future::Future; use std::str::FromStr; +use std::time::Duration; use anyhow::Context; use metrics::{IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use strum_macros::{EnumString, VariantNames}; +use tokio::time::Instant; +use tracing::warn; /// Logs a critical error, similarly to `tracing::error!`. This will: /// @@ -318,6 +322,41 @@ impl std::fmt::Debug for SecretString { } } +/// Logs a periodic warning if a future is slow to complete. +/// +/// This is performance-sensitive as it's used on the GetPage read path. +#[inline] +pub async fn warn_slow(name: &str, threshold: Duration, f: impl Future) -> O { + // TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and + // won't fit on the stack. + let mut f = Box::pin(f); + + let started = Instant::now(); + let mut attempt = 1; + + loop { + // NB: use timeout_at() instead of timeout() to avoid an extra clock reading in the common + // case where the timeout doesn't fire. + let deadline = started + attempt * threshold; + if let Ok(output) = tokio::time::timeout_at(deadline, &mut f).await { + // NB: we check if we exceeded the threshold even if the timeout never fired, because + // scheduling or execution delays may cause the future to succeed even if it exceeds the + // timeout. 
This costs an extra unconditional clock reading, but seems worth it to avoid + // false negatives. + let elapsed = started.elapsed(); + if elapsed >= threshold { + warn!("slow {name} completed after {:.3}s", elapsed.as_secs_f64()); + } + return output; + } + + let elapsed = started.elapsed().as_secs_f64(); + warn!("slow {name} still running after {elapsed:.3}s",); + + attempt += 1; + } +} + #[cfg(test)] mod tests { use metrics::{core::Opts, IntCounterVec}; diff --git a/libs/vm_monitor/Cargo.toml b/libs/vm_monitor/Cargo.toml index ba73902d38..a70465921c 100644 --- a/libs/vm_monitor/Cargo.toml +++ b/libs/vm_monitor/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "vm_monitor" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [[bin]] diff --git a/libs/vm_monitor/src/cgroup.rs b/libs/vm_monitor/src/cgroup.rs index 1d70cedcf9..dda9b23818 100644 --- a/libs/vm_monitor/src/cgroup.rs +++ b/libs/vm_monitor/src/cgroup.rs @@ -1,12 +1,10 @@ use std::fmt::{self, Debug, Formatter}; use std::time::{Duration, Instant}; -use anyhow::{anyhow, Context}; -use cgroups_rs::{ - hierarchies::{self, is_cgroup2_unified_mode}, - memory::MemController, - Subsystem, -}; +use anyhow::{Context, anyhow}; +use cgroups_rs::Subsystem; +use cgroups_rs::hierarchies::{self, is_cgroup2_unified_mode}; +use cgroups_rs::memory::MemController; use tokio::sync::watch; use tracing::{info, warn}; diff --git a/libs/vm_monitor/src/dispatcher.rs b/libs/vm_monitor/src/dispatcher.rs index c81848cb70..7b7201ab77 100644 --- a/libs/vm_monitor/src/dispatcher.rs +++ b/libs/vm_monitor/src/dispatcher.rs @@ -6,17 +6,15 @@ //! the cgroup (requesting upscale), and the signals that go to the cgroup //! (notifying it of upscale). -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use axum::extract::ws::{Message, Utf8Bytes, WebSocket}; -use futures::{ - stream::{SplitSink, SplitStream}, - SinkExt, StreamExt, -}; +use futures::stream::{SplitSink, SplitStream}; +use futures::{SinkExt, StreamExt}; use tracing::{debug, info}; use crate::protocol::{ - OutboundMsg, OutboundMsgKind, ProtocolRange, ProtocolResponse, ProtocolVersion, - PROTOCOL_MAX_VERSION, PROTOCOL_MIN_VERSION, + OutboundMsg, OutboundMsgKind, PROTOCOL_MAX_VERSION, PROTOCOL_MIN_VERSION, ProtocolRange, + ProtocolResponse, ProtocolVersion, }; /// The central handler for all communications in the monitor. diff --git a/libs/vm_monitor/src/filecache.rs b/libs/vm_monitor/src/filecache.rs index 4f5bf1c1e3..bc42347e5a 100644 --- a/libs/vm_monitor/src/filecache.rs +++ b/libs/vm_monitor/src/filecache.rs @@ -2,12 +2,14 @@ use std::num::NonZeroU64; -use crate::MiB; -use anyhow::{anyhow, Context}; -use tokio_postgres::{types::ToSql, Client, NoTls, Row}; +use anyhow::{Context, anyhow}; +use tokio_postgres::types::ToSql; +use tokio_postgres::{Client, NoTls, Row}; use tokio_util::sync::CancellationToken; use tracing::{error, info}; +use crate::MiB; + /// Manages Postgres' file cache by keeping a connection open. 
#[derive(Debug)] pub struct FileCacheState { diff --git a/libs/vm_monitor/src/lib.rs b/libs/vm_monitor/src/lib.rs index 0cd97d4ca1..7c77aca35d 100644 --- a/libs/vm_monitor/src/lib.rs +++ b/libs/vm_monitor/src/lib.rs @@ -2,24 +2,26 @@ #![deny(clippy::undocumented_unsafe_blocks)] #![cfg(target_os = "linux")] +use std::fmt::Debug; +use std::net::SocketAddr; +use std::time::Duration; + use anyhow::Context; -use axum::{ - extract::{ws::WebSocket, State, WebSocketUpgrade}, - response::Response, -}; -use axum::{routing::get, Router}; +use axum::Router; +use axum::extract::ws::WebSocket; +use axum::extract::{State, WebSocketUpgrade}; +use axum::response::Response; +use axum::routing::get; use clap::Parser; use futures::Future; -use std::net::SocketAddr; -use std::{fmt::Debug, time::Duration}; +use runner::Runner; use sysinfo::{RefreshKind, System, SystemExt}; use tokio::net::TcpListener; -use tokio::{sync::broadcast, task::JoinHandle}; +use tokio::sync::broadcast; +use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::{error, info}; -use runner::Runner; - // Code that interfaces with agent pub mod dispatcher; pub mod protocol; diff --git a/libs/vm_monitor/src/protocol.rs b/libs/vm_monitor/src/protocol.rs index 5f07435503..4fce3cdefc 100644 --- a/libs/vm_monitor/src/protocol.rs +++ b/libs/vm_monitor/src/protocol.rs @@ -35,7 +35,8 @@ use core::fmt; use std::cmp; -use serde::{de::Error, Deserialize, Serialize}; +use serde::de::Error; +use serde::{Deserialize, Serialize}; /// A Message we send to the agent. #[derive(Serialize, Deserialize, Debug, Clone)] diff --git a/libs/vm_monitor/src/runner.rs b/libs/vm_monitor/src/runner.rs index 8839f5803f..6f75ff0abd 100644 --- a/libs/vm_monitor/src/runner.rs +++ b/libs/vm_monitor/src/runner.rs @@ -7,7 +7,7 @@ use std::fmt::Debug; use std::time::{Duration, Instant}; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use axum::extract::ws::{Message, WebSocket}; use futures::StreamExt; use tokio::sync::{broadcast, watch}; @@ -18,7 +18,7 @@ use crate::cgroup::{self, CgroupWatcher}; use crate::dispatcher::Dispatcher; use crate::filecache::{FileCacheConfig, FileCacheState}; use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources}; -use crate::{bytes_to_mebibytes, get_total_system_memory, spawn_with_cancel, Args, MiB}; +use crate::{Args, MiB, bytes_to_mebibytes, get_total_system_memory, spawn_with_cancel}; /// Central struct that interacts with agent, dispatcher, and cgroup to handle /// signals from the agent. @@ -233,7 +233,9 @@ impl Runner { // // TODO: make the duration here configurable. 
if last_time.elapsed() > Duration::from_secs(5) { - bail!("haven't gotten cgroup memory stats recently enough to determine downscaling information"); + bail!( + "haven't gotten cgroup memory stats recently enough to determine downscaling information" + ); } else if last_history.samples_count <= 1 { let status = "haven't received enough cgroup memory stats yet"; info!(status, "discontinuing downscale"); diff --git a/libs/wal_decoder/proto/interpreted_wal.proto b/libs/wal_decoder/proto/interpreted_wal.proto index d68484d30f..7b40201a75 100644 --- a/libs/wal_decoder/proto/interpreted_wal.proto +++ b/libs/wal_decoder/proto/interpreted_wal.proto @@ -5,6 +5,7 @@ package interpreted_wal; message InterpretedWalRecords { repeated InterpretedWalRecord records = 1; optional uint64 next_record_lsn = 2; + optional uint64 raw_wal_start_lsn = 3; } message InterpretedWalRecord { diff --git a/libs/wal_decoder/src/models.rs b/libs/wal_decoder/src/models.rs index 51bf7e44ab..7e1934c6c3 100644 --- a/libs/wal_decoder/src/models.rs +++ b/libs/wal_decoder/src/models.rs @@ -60,7 +60,11 @@ pub struct InterpretedWalRecords { pub records: Vec, // Start LSN of the next record after the batch. // Note that said record may not belong to the current shard. - pub next_record_lsn: Option, + pub next_record_lsn: Lsn, + // Inclusive start LSN of the PG WAL from which the interpreted + // WAL records were extracted. Note that this is not necessarily the + // start LSN of the first interpreted record in the batch. + pub raw_wal_start_lsn: Option, } /// An interpreted Postgres WAL record, ready to be handled by the pageserver diff --git a/libs/wal_decoder/src/wire_format.rs b/libs/wal_decoder/src/wire_format.rs index 944ee5c919..52ed5c70b5 100644 --- a/libs/wal_decoder/src/wire_format.rs +++ b/libs/wal_decoder/src/wire_format.rs @@ -167,7 +167,8 @@ impl TryFrom for proto::InterpretedWalRecords { .collect::, _>>()?; Ok(proto::InterpretedWalRecords { records, - next_record_lsn: value.next_record_lsn.map(|l| l.0), + next_record_lsn: Some(value.next_record_lsn.0), + raw_wal_start_lsn: value.raw_wal_start_lsn.map(|l| l.0), }) } } @@ -254,7 +255,11 @@ impl TryFrom for InterpretedWalRecords { Ok(InterpretedWalRecords { records, - next_record_lsn: value.next_record_lsn.map(Lsn::from), + next_record_lsn: value + .next_record_lsn + .map(Lsn::from) + .expect("Always provided"), + raw_wal_start_lsn: value.raw_wal_start_lsn.map(Lsn::from), }) } } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 41ac3b69b8..9d4463d595 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -40,7 +40,6 @@ num_cpus.workspace = true num-traits.workspace = true once_cell.workspace = true pin-project-lite.workspace = true -postgres.workspace = true postgres_backend.workspace = true postgres-protocol.workspace = true postgres-types.workspace = true diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml index db77a395e0..970a437a42 100644 --- a/pageserver/client/Cargo.toml +++ b/pageserver/client/Cargo.toml @@ -21,5 +21,4 @@ tokio.workspace = true futures.workspace = true tokio-util.workspace = true anyhow.workspace = true -postgres.workspace = true bytes.workspace = true diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs index 27280912b4..47da83b0eb 100644 --- a/pageserver/client/src/page_service.rs +++ b/pageserver/client/src/page_service.rs @@ -34,7 +34,8 @@ pub struct BasebackupRequest { impl Client { pub async fn new(connstring: String) -> anyhow::Result { - let (client, 
connection) = tokio_postgres::connect(&connstring, postgres::NoTls).await?; + let (client, connection) = + tokio_postgres::connect(&connstring, tokio_postgres::NoTls).await?; let conn_task_cancel = CancellationToken::new(); let conn_task = tokio::spawn({ diff --git a/pageserver/ctl/src/key.rs b/pageserver/ctl/src/key.rs index af4b5a21ab..c7f0719c41 100644 --- a/pageserver/ctl/src/key.rs +++ b/pageserver/ctl/src/key.rs @@ -345,6 +345,7 @@ impl AuxFileV2 { AuxFileV2::Recognized("pg_logical/replorigin_checkpoint", hash) } (2, 1) => AuxFileV2::Recognized("pg_replslot/", hash), + (3, 1) => AuxFileV2::Recognized("pg_stat/pgstat.stat", hash), (1, 0xff) => AuxFileV2::OtherWithPrefix("pg_logical/", hash), (0xff, 0xff) => AuxFileV2::Other(hash), _ => return None, diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs index 5e527b7d61..5cc20a70b2 100644 --- a/pageserver/src/aux_file.rs +++ b/pageserver/src/aux_file.rs @@ -39,6 +39,7 @@ fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key const AUX_DIR_PG_LOGICAL: u8 = 0x01; const AUX_DIR_PG_REPLSLOT: u8 = 0x02; +const AUX_DIR_PG_STAT: u8 = 0x03; const AUX_DIR_PG_UNKNOWN: u8 = 0xFF; /// Encode the aux file into a fixed-size key. @@ -53,6 +54,7 @@ const AUX_DIR_PG_UNKNOWN: u8 = 0xFF; /// * pg_logical/replorigin_checkpoint -> 0x0103 /// * pg_logical/others -> 0x01FF /// * pg_replslot/ -> 0x0201 +/// * pg_stat/pgstat.stat -> 0x0301 /// * others -> 0xFFFF /// /// If you add new AUX files to this function, please also add a test case to `test_encoding_portable`. @@ -75,6 +77,8 @@ pub fn encode_aux_file_key(path: &str) -> Key { aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes()) } else if let Some(fname) = path.strip_prefix("pg_replslot/") { aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes()) + } else if let Some(fname) = path.strip_prefix("pg_stat/") { + aux_hash_to_metadata_key(AUX_DIR_PG_STAT, 0x01, fname.as_bytes()) } else { if cfg!(debug_assertions) { warn!( diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index e03b1bbe96..99b0775316 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -264,6 +264,31 @@ where async fn send_tarball(mut self) -> Result<(), BasebackupError> { // TODO include checksum + // Construct the pg_control file from the persisted checkpoint and pg_control + // information. But we only add this to the tarball at the end, so that if the + // writing is interrupted half-way through, the resulting incomplete tarball will + // be missing the pg_control file, which prevents PostgreSQL from starting up on + // it. With proper error handling, you should never try to start up from an + // incomplete basebackup in the first place, of course, but this is a nice little + // extra safety measure. 
+ let checkpoint_bytes = self + .timeline + .get_checkpoint(self.lsn, self.ctx) + .await + .context("failed to get checkpoint bytes")?; + let pg_control_bytes = self + .timeline + .get_control_file(self.lsn, self.ctx) + .await + .context("failed to get control bytes")?; + let (pg_control_bytes, system_identifier, was_shutdown) = + postgres_ffi::generate_pg_control( + &pg_control_bytes, + &checkpoint_bytes, + self.lsn, + self.timeline.pg_version, + )?; + let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup; let pgversion = self.timeline.pg_version; @@ -401,6 +426,10 @@ where // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all, // but now we should handle (skip) it for backward compatibility. continue; + } else if path == "pg_stat/pgstat.stat" && !was_shutdown { + // Drop statistic in case of abnormal termination, i.e. if we're not starting from the exact LSN + // of a shutdown checkpoint. + continue; } let header = new_tar_header(&path, content.len() as u64)?; self.ar @@ -462,8 +491,9 @@ where ))) }); - // Generate pg_control and bootstrap WAL segment. - self.add_pgcontrol_file().await?; + // Last, add the pg_control file and bootstrap WAL segment. + self.add_pgcontrol_file(pg_control_bytes, system_identifier) + .await?; self.ar .finish() .await @@ -671,7 +701,11 @@ where // Add generated pg_control file and bootstrap WAL segment. // Also send zenith.signal file with extra bootstrap data. // - async fn add_pgcontrol_file(&mut self) -> Result<(), BasebackupError> { + async fn add_pgcontrol_file( + &mut self, + pg_control_bytes: Bytes, + system_identifier: u64, + ) -> Result<(), BasebackupError> { // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { @@ -694,24 +728,6 @@ where .await .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,zenith.signal"))?; - let checkpoint_bytes = self - .timeline - .get_checkpoint(self.lsn, self.ctx) - .await - .context("failed to get checkpoint bytes")?; - let pg_control_bytes = self - .timeline - .get_control_file(self.lsn, self.ctx) - .await - .context("failed get control bytes")?; - - let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control( - &pg_control_bytes, - &checkpoint_bytes, - self.lsn, - self.timeline.pg_version, - )?; - //send pg_control let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?; self.ar diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index fa098e9364..e2b9a7f073 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -134,6 +134,7 @@ fn main() -> anyhow::Result<()> { info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode"); info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol"); + info!(?conf.validate_wal_contiguity, "starting with WAL contiguity validation"); info!(?conf.page_service_pipelining, "starting with page service pipelining config"); info!(?conf.get_vectored_concurrent_io, "starting with get_vectored IO concurrency config"); diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index c5368f6806..09d9444dd5 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -197,6 +197,10 @@ pub struct PageServerConf { /// Enable read path debugging. If enabled, read key errors will print a backtrace of the layer /// files read. 
pub enable_read_path_debugging: bool, + + /// Interpreted protocol feature: if enabled, validate that the logical WAL received from + /// safekeepers does not have gaps. + pub validate_wal_contiguity: bool, } /// Token for authentication to safekeepers @@ -360,6 +364,7 @@ impl PageServerConf { page_service_pipelining, get_vectored_concurrent_io, enable_read_path_debugging, + validate_wal_contiguity, } = config_toml; let mut conf = PageServerConf { @@ -446,6 +451,7 @@ impl PageServerConf { virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()), no_sync: no_sync.unwrap_or(false), enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false), + validate_wal_contiguity: validate_wal_contiguity.unwrap_or(false), }; // ------------------------------------------------------------ diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index 8f2177fe5b..da9c095a15 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -98,6 +98,7 @@ pub struct RequestContext { download_behavior: DownloadBehavior, access_stats_behavior: AccessStatsBehavior, page_content_kind: PageContentKind, + read_path_debug: bool, } /// The kind of access to the page cache. @@ -155,6 +156,7 @@ impl RequestContextBuilder { download_behavior: DownloadBehavior::Download, access_stats_behavior: AccessStatsBehavior::Update, page_content_kind: PageContentKind::Unknown, + read_path_debug: false, }, } } @@ -168,6 +170,7 @@ impl RequestContextBuilder { download_behavior: original.download_behavior, access_stats_behavior: original.access_stats_behavior, page_content_kind: original.page_content_kind, + read_path_debug: original.read_path_debug, }, } } @@ -191,6 +194,11 @@ impl RequestContextBuilder { self } + pub(crate) fn read_path_debug(mut self, b: bool) -> Self { + self.inner.read_path_debug = b; + self + } + pub fn build(self) -> RequestContext { self.inner } @@ -291,4 +299,8 @@ impl RequestContext { pub(crate) fn page_content_kind(&self) -> PageContentKind { self.page_content_kind } + + pub(crate) fn read_path_debug(&self) -> bool { + self.read_path_debug + } } diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index d41bfd9021..4990f17b40 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -173,6 +173,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { listen_pg_port: m.postgres_port, listen_http_addr: m.http_host, listen_http_port: m.http_port, + listen_https_port: None, // TODO: Support https. 
availability_zone_id: az_id.expect("Checked above"), }) } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 329bf82bde..56a84a98a8 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -68,6 +68,7 @@ use tokio_util::sync::CancellationToken; use tracing::*; use crate::config::PageServerConf; +use crate::context::RequestContextBuilder; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; use crate::pgdatadir_mapping::LsnForTimestamp; @@ -2394,6 +2395,7 @@ async fn timeline_checkpoint_handler( match e { CompactionError::ShuttingDown => ApiError::ShuttingDown, CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), + CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), CompactionError::Other(e) => ApiError::InternalServerError(e) } )?; @@ -2571,14 +2573,30 @@ async fn deletion_queue_flush( } } -/// Try if `GetPage@Lsn` is successful, useful for manual debugging. async fn getpage_at_lsn_handler( + request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + getpage_at_lsn_handler_inner(false, request, cancel).await +} + +async fn touchpage_at_lsn_handler( + request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + getpage_at_lsn_handler_inner(true, request, cancel).await +} + +/// Try if `GetPage@Lsn` is successful, useful for manual debugging. +async fn getpage_at_lsn_handler_inner( + touch: bool, request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; - check_permission(&request, Some(tenant_shard_id.tenant_id))?; + // Require pageserver admin permission for this API instead of only tenant-level token. + check_permission(&request, None)?; let state = get_state(&request); struct Key(pageserver_api::key::Key); @@ -2593,22 +2611,29 @@ async fn getpage_at_lsn_handler( let key: Key = parse_query_param(&request, "key")? .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'key' query parameter")))?; - let lsn: Lsn = parse_query_param(&request, "lsn")? 
- .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?; + let lsn: Option = parse_query_param(&request, "lsn")?; async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + // Enable read path debugging + let ctx = RequestContextBuilder::extend(&ctx).read_path_debug(true).build(); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; + // Use last_record_lsn if no lsn is provided + let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn()); let page = timeline.get(key.0, lsn, &ctx).await?; - Result::<_, ApiError>::Ok( - Response::builder() - .status(StatusCode::OK) - .header(header::CONTENT_TYPE, "application/octet-stream") - .body(hyper::Body::from(page)) - .unwrap(), - ) + if touch { + json_response(StatusCode::OK, ()) + } else { + Result::<_, ApiError>::Ok( + Response::builder() + .status(StatusCode::OK) + .header(header::CONTENT_TYPE, "application/octet-stream") + .body(hyper::Body::from(page)) + .unwrap(), + ) + } } .instrument(info_span!("timeline_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await @@ -3743,6 +3768,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/getpage", |r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler), ) + .get( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/touchpage", + |r| api_handler(r, touchpage_at_lsn_handler), + ) .get( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace", |r| api_handler(r, timeline_collect_keyspace), diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 0c8da6f2a8..7285697040 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -34,11 +34,13 @@ use std::str::FromStr; use std::sync::Arc; use std::time::SystemTime; use std::time::{Duration, Instant}; +use strum_macros::IntoStaticStr; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::io::{AsyncWriteExt, BufWriter}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::logging::warn_slow; use utils::sync::gate::{Gate, GateGuard}; use utils::sync::spsc_fold; use utils::{ @@ -81,6 +83,9 @@ use std::os::fd::AsRawFd; /// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`]. const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); +/// Threshold at which to log a warning about slow GetPage requests. +const WARN_SLOW_GETPAGE_THRESHOLD: Duration = Duration::from_secs(30); + /////////////////////////////////////////////////////////////////////////////// pub struct Listener { @@ -594,6 +599,7 @@ struct BatchedTestRequest { /// NB: we only hold [`timeline::handle::WeakHandle`] inside this enum, /// so that we don't keep the [`Timeline::gate`] open while the batch /// is being built up inside the [`spsc_fold`] (pagestream pipelining). +#[derive(IntoStaticStr)] enum BatchedFeMessage { Exists { span: Span, @@ -638,6 +644,10 @@ enum BatchedFeMessage { } impl BatchedFeMessage { + fn as_static_str(&self) -> &'static str { + self.into() + } + fn observe_execution_start(&mut self, at: Instant) { match self { BatchedFeMessage::Exists { timer, .. 
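Stepping back from the routes hunk above: the lsn query parameter is now optional and defaults to the timeline's last record LSN, and the new touchpage route performs the same read (with read-path debugging enabled) but discards the page body. A small sketch of those two aspects; Lsn and ProbeMode here are simplified stand-ins, not the patch's types:

    /// Stand-in for the pageserver LSN newtype; illustration only.
    #[derive(Clone, Copy, Debug, PartialEq, PartialOrd)]
    struct Lsn(u64);

    /// touchpage runs the same read as getpage but only reports success,
    /// which makes it handy for warming or validating a single key.
    enum ProbeMode {
        Get,   // respond with the page image (application/octet-stream)
        Touch, // respond with an empty JSON body on success
    }

    /// When the caller omits `lsn`, read at the latest ingested record.
    fn effective_read_lsn(requested: Option<Lsn>, last_record_lsn: Lsn) -> Lsn {
        requested.unwrap_or(last_record_lsn)
    }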
} @@ -1463,17 +1473,20 @@ impl PageServerHandler { } }; - let err = self - .pagesteam_handle_batched_message( + let result = warn_slow( + msg.as_static_str(), + WARN_SLOW_GETPAGE_THRESHOLD, + self.pagesteam_handle_batched_message( pgb_writer, msg, io_concurrency.clone(), &cancel, protocol_version, ctx, - ) - .await; - match err { + ), + ) + .await; + match result { Ok(()) => {} Err(e) => break e, } @@ -1636,13 +1649,17 @@ impl PageServerHandler { return Err(e); } }; - self.pagesteam_handle_batched_message( - pgb_writer, - batch, - io_concurrency.clone(), - &cancel, - protocol_version, - &ctx, + warn_slow( + batch.as_static_str(), + WARN_SLOW_GETPAGE_THRESHOLD, + self.pagesteam_handle_batched_message( + pgb_writer, + batch, + io_concurrency.clone(), + &cancel, + protocol_version, + &ctx, + ), ) .await?; } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index ae2762bd1e..d0e2dab042 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -45,7 +45,7 @@ use std::ops::ControlFlow; use std::ops::Range; use strum::IntoEnumIterator; use tokio_util::sync::CancellationToken; -use tracing::{debug, trace, warn}; +use tracing::{debug, info, trace, warn}; use utils::bin_ser::DeserializeError; use utils::pausable_failpoint; use utils::{bin_ser::BeSer, lsn::Lsn}; @@ -2264,6 +2264,13 @@ impl DatadirModification<'_> { self.tline.aux_file_size_estimator.on_add(content.len()); new_files.push((path, content)); } + // Compute may request delete of old version of pgstat AUX file if new one exceeds size limit. + // Compute doesn't know if previous version of this file exists or not, so + // attempt to delete non-existing file can cause this message. + // To avoid false alarms, log it as info rather than warning. + (None, true) if path.starts_with("pg_stat/") => { + info!("removing non-existing pg_stat file: {}", path) + } (None, true) => warn!("removing non-existing aux file: {}", path), } let new_val = aux_file::encode_file_value(&new_files)?; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 5d917da574..56718f5294 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1189,6 +1189,39 @@ impl Tenant { format!("Failed to load layermap for timeline {tenant_id}/{timeline_id}") })?; + // When unarchiving, we've mostly likely lost the heatmap generated prior + // to the archival operation. To allow warming this timeline up, generate + // a previous heatmap which contains all visible layers in the layer map. + // This previous heatmap will be used whenever a fresh heatmap is generated + // for the timeline. + if matches!(cause, LoadTimelineCause::Unoffload) { + let mut tline_ending_at = Some((&timeline, timeline.get_last_record_lsn())); + while let Some((tline, end_lsn)) = tline_ending_at { + let unarchival_heatmap = tline.generate_unarchival_heatmap(end_lsn).await; + if !tline.is_previous_heatmap_active() { + tline + .previous_heatmap + .store(Some(Arc::new(unarchival_heatmap))); + } else { + tracing::info!("Previous heatmap still active. Dropping unarchival heatmap.") + } + + match tline.ancestor_timeline() { + Some(ancestor) => { + if ancestor.update_layer_visibility().await.is_err() { + // Ancestor timeline is shutting down. 
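Returning to the page_service changes above: every batched pagestream message now gets a static name via IntoStaticStr and is wrapped in warn_slow with a 30 second threshold. The helper's exact signature lives in utils::logging and is not shown in this diff, so the following is only a sketch of the idea; it checks the elapsed time once the future finishes, whereas the real helper may warn while the request is still pending:

    use std::time::{Duration, Instant};
    use tracing::warn;

    /// Sketch of a "warn if slow" wrapper in the spirit of `warn_slow` above.
    async fn warn_if_slow<F, T>(name: &'static str, threshold: Duration, fut: F) -> T
    where
        F: std::future::Future<Output = T>,
    {
        let started = Instant::now();
        let output = fut.await;
        let elapsed = started.elapsed();
        if elapsed >= threshold {
            // `name` would be the message kind, e.g. the value of as_static_str().
            warn!("slow {name} request: completed in {:.3}s", elapsed.as_secs_f64());
        }
        output
    }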
+ break; + } + + tline_ending_at = Some((ancestor, tline.get_ancestor_lsn())); + } + None => { + tline_ending_at = None; + } + } + } + } + match import_pgdata { Some(import_pgdata) if !import_pgdata.is_done() => { match cause { @@ -3150,6 +3183,12 @@ impl Tenant { // Offload failures don't trip the circuit breaker, since they're cheap to retry and // shouldn't block compaction. CompactionError::Offload(_) => {} + CompactionError::CollectKeySpaceError(err) => { + self.compaction_circuit_breaker + .lock() + .unwrap() + .fail(&CIRCUIT_BREAKERS_BROKEN, err); + } CompactionError::Other(err) => { self.compaction_circuit_breaker .lock() diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index c6bcfdf2fb..ab4c4c935d 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -693,16 +693,15 @@ impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt { /// This is a conversion from our internal tenant config object to the one used /// in external APIs. impl From for models::TenantConfig { + // TODO(vlad): These are now the same, but they have different serialization logic. + // Can we merge them? fn from(value: TenantConfOpt) -> Self { - fn humantime(d: Duration) -> String { - format!("{}s", d.as_secs()) - } Self { checkpoint_distance: value.checkpoint_distance, - checkpoint_timeout: value.checkpoint_timeout.map(humantime), + checkpoint_timeout: value.checkpoint_timeout, compaction_algorithm: value.compaction_algorithm, compaction_target_size: value.compaction_target_size, - compaction_period: value.compaction_period.map(humantime), + compaction_period: value.compaction_period, compaction_threshold: value.compaction_threshold, compaction_upper_limit: value.compaction_upper_limit, compaction_l0_first: value.compaction_l0_first, @@ -711,24 +710,23 @@ impl From for models::TenantConfig { l0_flush_stall_threshold: value.l0_flush_stall_threshold, l0_flush_wait_upload: value.l0_flush_wait_upload, gc_horizon: value.gc_horizon, - gc_period: value.gc_period.map(humantime), + gc_period: value.gc_period, image_creation_threshold: value.image_creation_threshold, - pitr_interval: value.pitr_interval.map(humantime), - walreceiver_connect_timeout: value.walreceiver_connect_timeout.map(humantime), - lagging_wal_timeout: value.lagging_wal_timeout.map(humantime), + pitr_interval: value.pitr_interval, + walreceiver_connect_timeout: value.walreceiver_connect_timeout, + lagging_wal_timeout: value.lagging_wal_timeout, max_lsn_wal_lag: value.max_lsn_wal_lag, eviction_policy: value.eviction_policy, min_resident_size_override: value.min_resident_size_override, evictions_low_residence_duration_metric_threshold: value - .evictions_low_residence_duration_metric_threshold - .map(humantime), - heatmap_period: value.heatmap_period.map(humantime), + .evictions_low_residence_duration_metric_threshold, + heatmap_period: value.heatmap_period, lazy_slru_download: value.lazy_slru_download, timeline_get_throttle: value.timeline_get_throttle, image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, image_creation_preempt_threshold: value.image_creation_preempt_threshold, - lsn_lease_length: value.lsn_lease_length.map(humantime), - lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime), + lsn_lease_length: value.lsn_lease_length, + lsn_lease_length_for_ts: value.lsn_lease_length_for_ts, timeline_offloading: value.timeline_offloading, wal_receiver_protocol_override: value.wal_receiver_protocol_override, rel_size_v2_enabled: 
value.rel_size_v2_enabled, @@ -760,29 +758,10 @@ mod tests { assert_eq!(small_conf, serde_json::from_str(&json_form).unwrap()); } - #[test] - fn test_try_from_models_tenant_config_err() { - let tenant_config = models::TenantConfig { - lagging_wal_timeout: Some("5a".to_string()), - ..TenantConfig::default() - }; - - let tenant_conf_opt = TenantConfOpt::try_from(&tenant_config); - - assert!( - tenant_conf_opt.is_err(), - "Suceeded to convert TenantConfig to TenantConfOpt" - ); - - let expected_error_str = - "lagging_wal_timeout: invalid value: string \"5a\", expected a duration"; - assert_eq!(tenant_conf_opt.unwrap_err().to_string(), expected_error_str); - } - #[test] fn test_try_from_models_tenant_config_success() { let tenant_config = models::TenantConfig { - lagging_wal_timeout: Some("5s".to_string()), + lagging_wal_timeout: Some(Duration::from_secs(5)), ..TenantConfig::default() }; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 885c50425f..7ba0e3679f 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -51,8 +51,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; use itertools::Itertools; use pageserver_api::config::MaxVectoredReadBytes; -use pageserver_api::key::DBDIR_KEY; -use pageserver_api::key::{Key, KEY_SIZE}; +use pageserver_api::key::{Key, DBDIR_KEY, KEY_SIZE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::ImageCompressionAlgorithm; use pageserver_api::shard::TenantShardId; @@ -967,7 +966,10 @@ impl DeltaLayerInner { .as_slice() .iter() .filter_map(|(_, blob_meta)| { - if blob_meta.key.is_rel_dir_key() || blob_meta.key == DBDIR_KEY { + if blob_meta.key.is_rel_dir_key() + || blob_meta.key == DBDIR_KEY + || blob_meta.key.is_aux_file_key() + { // The size of values for these keys is unbounded and can // grow very large in pathological cases. None diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index c49281dc45..dc611bd6e1 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -48,8 +48,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use hex; use itertools::Itertools; use pageserver_api::config::MaxVectoredReadBytes; -use pageserver_api::key::DBDIR_KEY; -use pageserver_api::key::{Key, KEY_SIZE}; +use pageserver_api::key::{Key, DBDIR_KEY, KEY_SIZE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use pageserver_api::value::Value; @@ -603,7 +602,10 @@ impl ImageLayerInner { .as_slice() .iter() .filter_map(|(_, blob_meta)| { - if blob_meta.key.is_rel_dir_key() || blob_meta.key == DBDIR_KEY { + if blob_meta.key.is_rel_dir_key() + || blob_meta.key == DBDIR_KEY + || blob_meta.key.is_aux_file_key() + { // The size of values for these keys is unbounded and can // grow very large in pathological cases. 
None diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 029444e973..5e63f59fd8 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -287,6 +287,7 @@ fn log_compaction_error( sleep_duration: Duration, task_cancelled: bool, ) { + use crate::pgdatadir_mapping::CollectKeySpaceError; use crate::tenant::upload_queue::NotInitialized; use crate::tenant::PageReconstructError; use CompactionError::*; @@ -294,6 +295,8 @@ fn log_compaction_error( let level = match err { ShuttingDown => return, Offload(_) => Level::ERROR, + CollectKeySpaceError(CollectKeySpaceError::Cancelled) => Level::INFO, + CollectKeySpaceError(_) => Level::ERROR, _ if task_cancelled => Level::INFO, Other(err) => { let root_cause = err.root_cause(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 94b4abb7e9..319c5e3d87 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -22,6 +22,7 @@ use chrono::{DateTime, Utc}; use compaction::CompactionOutcome; use enumset::EnumSet; use fail::fail_point; +use futures::FutureExt; use futures::{stream::FuturesUnordered, StreamExt}; use handle::ShardTimelineId; use layer_manager::Shutdown; @@ -467,7 +468,7 @@ pub struct Timeline { /// If Some, collects GetPage metadata for an ongoing PageTrace. pub(crate) page_trace: ArcSwapOption>, - previous_heatmap: ArcSwapOption, + pub(super) previous_heatmap: ArcSwapOption, /// May host a background Tokio task which downloads all the layers from the current /// heatmap on demand. @@ -1298,7 +1299,7 @@ impl Timeline { reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result>, GetVectoredError> { - let read_path = if self.conf.enable_read_path_debugging { + let read_path = if self.conf.enable_read_path_debugging || ctx.read_path_debug() { Some(ReadPath::new(keyspace.clone(), lsn)) } else { None @@ -1881,7 +1882,7 @@ impl Timeline { // Signal compaction failure to avoid L0 flush stalls when it's broken. match result { Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed), - Err(CompactionError::Other(_)) => { + Err(CompactionError::Other(_)) | Err(CompactionError::CollectKeySpaceError(_)) => { self.compaction_failed.store(true, AtomicOrdering::Relaxed) } // Don't change the current value on offload failure or shutdown. We don't want to @@ -2873,6 +2874,7 @@ impl Timeline { auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(), availability_zone: self.conf.availability_zone.clone(), ingest_batch_size: self.conf.ingest_batch_size, + validate_wal_contiguity: self.conf.validate_wal_contiguity, }, broker_client, ctx, @@ -3522,6 +3524,14 @@ impl Timeline { Ok(layer) } + pub(super) fn is_previous_heatmap_active(&self) -> bool { + self.previous_heatmap + .load() + .as_ref() + .map(|prev| matches!(**prev, PreviousHeatmap::Active { .. })) + .unwrap_or(false) + } + /// The timeline heatmap is a hint to secondary locations from the primary location, /// indicating which layers are currently on-disk on the primary. 
/// @@ -3594,6 +3604,7 @@ impl Timeline { Some(non_resident) => { let mut non_resident = non_resident.peekable(); if non_resident.peek().is_none() { + tracing::info!(timeline_id=%self.timeline_id, "Previous heatmap now obsolete"); self.previous_heatmap .store(Some(PreviousHeatmap::Obsolete.into())); } @@ -3625,6 +3636,36 @@ impl Timeline { Some(HeatMapTimeline::new(self.timeline_id, layers)) } + pub(super) async fn generate_unarchival_heatmap(&self, end_lsn: Lsn) -> PreviousHeatmap { + let guard = self.layers.read().await; + + let now = SystemTime::now(); + let mut heatmap_layers = Vec::default(); + for vl in guard.visible_layers() { + if vl.layer_desc().get_lsn_range().start >= end_lsn { + continue; + } + + let hl = HeatMapLayer { + name: vl.layer_desc().layer_name(), + metadata: vl.metadata(), + access_time: now, + }; + heatmap_layers.push(hl); + } + + tracing::info!( + "Generating unarchival heatmap with {} layers", + heatmap_layers.len() + ); + + let heatmap = HeatMapTimeline::new(self.timeline_id, heatmap_layers); + PreviousHeatmap::Active { + heatmap, + read_at: Instant::now(), + } + } + /// Returns true if the given lsn is or was an ancestor branchpoint. pub(crate) fn is_ancestor_lsn(&self, lsn: Lsn) -> bool { // upon timeline detach, we set the ancestor_lsn to Lsn::INVALID and the store the original @@ -4604,7 +4645,10 @@ impl Timeline { )); } - let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?; + let (dense_ks, sparse_ks) = self + .collect_keyspace(lsn, ctx) + .await + .map_err(CompactionError::CollectKeySpaceError)?; let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size); let sparse_partitioning = SparseKeyPartitioning { parts: vec![sparse_ks], @@ -5125,20 +5169,26 @@ impl Timeline { // image layer generation taking too long time and blocking L0 compaction. So in this // mode, we also inspect the current number of L0 layers and skip image layer generation // if there are too many of them. - let num_of_l0_layers = { - let layers = self.layers.read().await; - layers.layer_map()?.level0_deltas().len() - }; let image_preempt_threshold = self.get_image_creation_preempt_threshold() * self.get_compaction_threshold(); - if image_preempt_threshold != 0 && num_of_l0_layers >= image_preempt_threshold { - tracing::info!( - "preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers {}", - partition.start().unwrap(), partition.end().unwrap(), num_of_l0_layers - ); - last_partition_processed = Some(partition.clone()); - all_generated = false; - break; + // TODO: currently we do not respect `get_image_creation_preempt_threshold` and always yield + // when there is a single timeline with more than L0 threshold L0 layers. As long as the + // `get_image_creation_preempt_threshold` is set to a value greater than 0, we will yield for L0 compaction. 
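A note on the preemption change this comment introduces (the hunk continues right after this sketch): instead of counting L0 layers on every partition, image layer creation now polls the timeline's l0_compaction_trigger, which behaves like a tokio Notify, without blocking. A self-contained sketch of that pattern; the field name is taken from the diff, everything else is illustrative:

    use std::sync::Arc;
    use futures::FutureExt; // for now_or_never()
    use tokio::sync::Notify;

    /// Polling a Notify with now_or_never() completes only if a notification was
    /// already stored, so the check never blocks image layer creation.
    fn should_yield_to_l0(trigger: &Arc<Notify>) -> bool {
        trigger.notified().now_or_never().is_some()
    }

    #[tokio::main]
    async fn main() {
        let trigger = Arc::new(Notify::new());
        assert!(!should_yield_to_l0(&trigger));
        // An L0 compaction pass that wants the loop to yield would notify the trigger.
        trigger.notify_one();
        assert!(should_yield_to_l0(&trigger));
    }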
+ if image_preempt_threshold != 0 { + let should_yield = self + .l0_compaction_trigger + .notified() + .now_or_never() + .is_some(); + if should_yield { + tracing::info!( + "preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers", + partition.start().unwrap(), partition.end().unwrap() + ); + last_partition_processed = Some(partition.clone()); + all_generated = false; + break; + } } } } @@ -5167,14 +5217,16 @@ impl Timeline { .map(|l| l.metadata().file_size) .sum::(); - info!( - "created {} image layers ({} bytes) in {}s, processed {} out of {} partitions", - image_layers.len(), - total_layer_size, - duration.as_secs_f64(), - partition_processed, - total_partitions - ); + if !image_layers.is_empty() { + info!( + "created {} image layers ({} bytes) in {}s, processed {} out of {} partitions", + image_layers.len(), + total_layer_size, + duration.as_secs_f64(), + partition_processed, + total_partitions + ); + } Ok(( image_layers, @@ -5317,6 +5369,8 @@ pub(crate) enum CompactionError { #[error("Failed to offload timeline: {0}")] Offload(OffloadError), /// Compaction cannot be done right now; page reconstruction and so on. + #[error("Failed to collect keyspace: {0}")] + CollectKeySpaceError(CollectKeySpaceError), #[error(transparent)] Other(anyhow::Error), } @@ -5330,12 +5384,6 @@ impl From for CompactionError { } } -impl CompactionError { - pub fn is_cancelled(&self) -> bool { - matches!(self, CompactionError::ShuttingDown) - } -} - impl From for CompactionError { fn from(err: CollectKeySpaceError) -> Self { match err { @@ -6600,7 +6648,7 @@ impl TimelineWriter<'_> { if let Some(wait_threshold) = wait_threshold { if l0_count >= wait_threshold { - info!("layer roll waiting for flush due to compaction backpressure at {l0_count} L0 layers"); + debug!("layer roll waiting for flush due to compaction backpressure at {l0_count} L0 layers"); self.tl.wait_flush_completion(flush_id).await?; } } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 9e082d74b5..d75591bd74 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -11,7 +11,8 @@ use std::sync::Arc; use super::layer_manager::LayerManager; use super::{ CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, GetVectoredError, - ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, Timeline, + ImageLayerCreationMode, LastImageLayerCreationStatus, PageReconstructError, RecordedDuration, + Timeline, }; use anyhow::{anyhow, bail, Context}; @@ -25,12 +26,13 @@ use pageserver_api::models::CompactInfoResponse; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; use serde::Serialize; use tokio_util::sync::CancellationToken; -use tracing::{debug, info, info_span, trace, warn, Instrument}; +use tracing::{debug, error, info, info_span, trace, warn, Instrument}; use utils::critical; use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; +use crate::pgdatadir_mapping::CollectKeySpaceError; use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; use crate::tenant::gc_block::GcBlock; @@ -773,17 +775,25 @@ impl Timeline { return Ok(CompactionOutcome::YieldForL0); } } - Err(err) => { - // no partitioning? This is normal, if the timeline was just created - // as an empty timeline. 
Also in unit tests, when we use the timeline - // as a simple key-value store, ignoring the datadir layout. Log the - // error but continue. - // - // Suppress error when it's due to cancellation - if !self.cancel.is_cancelled() && !err.is_cancelled() { - tracing::error!("could not compact, repartitioning keyspace failed: {err:?}"); - } - } + + // Suppress errors when cancelled. + Err(_) if self.cancel.is_cancelled() => {} + Err(CompactionError::ShuttingDown) => {} + + // Alert on critical errors that indicate data corruption. + Err( + err @ CompactionError::CollectKeySpaceError( + CollectKeySpaceError::Decode(_) + | CollectKeySpaceError::PageRead( + PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_), + ), + ), + ) => critical!("could not compact, repartitioning keyspace failed: {err:?}"), + + // Log other errors. No partitioning? This is normal, if the timeline was just created + // as an empty timeline. Also in unit tests, when we use the timeline as a simple + // key-value store, ignoring the datadir layout. Log the error but continue. + Err(err) => error!("could not compact, repartitioning keyspace failed: {err:?}"), }; let partition_count = self.partitioning.read().0 .0.parts.len(); @@ -1010,7 +1020,7 @@ impl Timeline { /// /// The result may be used as an input to eviction and secondary downloads to de-prioritize layers /// that we know won't be needed for reads. - pub(super) async fn update_layer_visibility( + pub(crate) async fn update_layer_visibility( &self, ) -> Result<(), super::layer_manager::Shutdown> { let head_lsn = self.get_last_record_lsn(); @@ -2202,7 +2212,7 @@ impl Timeline { let sub_compaction_max_job_size_mb = sub_compaction_max_job_size_mb.unwrap_or(GC_COMPACT_MAX_SIZE_MB); - let mut compact_jobs = Vec::new(); + let mut compact_jobs = Vec::::new(); // For now, we simply use the key partitioning information; we should do a more fine-grained partitioning // by estimating the amount of files read for a compaction job. We should also partition on LSN. 
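Looking ahead to the job-splitting change a little further down in this file: a candidate key range whose estimated size is zero no longer becomes its own gc-compaction job; it simply extends the previous job's key range. A reduced sketch with plain integer keys, so the dry_run flag and the LSN range carried by the real GcCompactJob are omitted:

    use std::ops::Range;

    /// Zero-sized tail ranges are merged into the previous job instead of being
    /// emitted as empty compaction jobs.
    fn push_or_extend(jobs: &mut Vec<Range<u64>>, candidate: Range<u64>, estimated_size: u64) {
        if estimated_size == 0 && !jobs.is_empty() {
            jobs.last_mut().unwrap().end = candidate.end;
        } else {
            jobs.push(candidate);
        }
    }

    fn main() {
        let mut jobs = Vec::new();
        push_or_extend(&mut jobs, 0..100, 4096);
        push_or_extend(&mut jobs, 100..200, 0); // merged into the previous job
        assert_eq!(jobs, vec![0..200]);
    }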
let ((dense_ks, sparse_ks), _) = self.partitioning.read().as_ref().clone(); @@ -2289,16 +2299,25 @@ impl Timeline { } else { end }; - info!( - "splitting compaction job: {}..{}, estimated_size={}", - start, end, total_size - ); - compact_jobs.push(GcCompactJob { - dry_run: job.dry_run, - compact_key_range: start..end, - compact_lsn_range: job.compact_lsn_range.start..compact_below_lsn, - }); - current_start = Some(end); + if total_size == 0 && !compact_jobs.is_empty() { + info!( + "splitting compaction job: {}..{}, estimated_size={}, extending the previous job", + start, end, total_size + ); + compact_jobs.last_mut().unwrap().compact_key_range.end = end; + current_start = Some(end); + } else { + info!( + "splitting compaction job: {}..{}, estimated_size={}", + start, end, total_size + ); + compact_jobs.push(GcCompactJob { + dry_run: job.dry_run, + compact_key_range: start..end, + compact_lsn_range: job.compact_lsn_range.start..compact_below_lsn, + }); + current_start = Some(end); + } } } Ok(compact_jobs) diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index cb7783d779..60e36a5d4d 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -15,8 +15,8 @@ use crate::{ tenant::{ layer_map::{BatchedUpdates, LayerMap}, storage_layer::{ - AsLayerDesc, InMemoryLayer, Layer, PersistentLayerDesc, PersistentLayerKey, - ResidentLayer, + AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc, + PersistentLayerKey, ResidentLayer, }, }, }; @@ -118,6 +118,12 @@ impl LayerManager { self.layers().values().filter(|l| l.is_likely_resident()) } + pub(crate) fn visible_layers(&self) -> impl Iterator + '_ { + self.layers() + .values() + .filter(|l| l.visibility() == LayerVisibilityHint::Visible) + } + pub(crate) fn contains(&self, layer: &Layer) -> bool { self.contains_key(&layer.layer_desc().key()) } diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index f831f5e48a..67429bff98 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -56,6 +56,7 @@ pub struct WalReceiverConf { pub auth_token: Option>, pub availability_zone: Option, pub ingest_batch_size: u64, + pub validate_wal_contiguity: bool, } pub struct WalReceiver { diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 65f9d39078..1955345315 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -537,6 +537,7 @@ impl ConnectionManagerState { let connect_timeout = self.conf.wal_connect_timeout; let ingest_batch_size = self.conf.ingest_batch_size; let protocol = self.conf.protocol; + let validate_wal_contiguity = self.conf.validate_wal_contiguity; let timeline = Arc::clone(&self.timeline); let ctx = ctx.detached_child( TaskKind::WalReceiverConnectionHandler, @@ -558,6 +559,7 @@ impl ConnectionManagerState { ctx, node_id, ingest_batch_size, + validate_wal_contiguity, ) .await; @@ -1563,6 +1565,7 @@ mod tests { auth_token: None, availability_zone: None, ingest_batch_size: 1, + validate_wal_contiguity: false, }, wal_connection: None, wal_stream_candidates: HashMap::new(), diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs 
b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 23db4f88d2..bb34a181da 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -13,12 +13,12 @@ use bytes::BytesMut; use chrono::{NaiveDateTime, Utc}; use fail::fail_point; use futures::StreamExt; -use postgres::{error::SqlState, SimpleQueryMessage, SimpleQueryRow}; use postgres_ffi::WAL_SEGMENT_SIZE; use postgres_ffi::{v14::xlog_utils::normalize_lsn, waldecoder::WalDecodeError}; use postgres_protocol::message::backend::ReplicationMessage; use postgres_types::PgLsn; use tokio::{select, sync::watch, time}; +use tokio_postgres::{error::SqlState, SimpleQueryMessage, SimpleQueryRow}; use tokio_postgres::{replication::ReplicationStream, Client}; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, trace, warn, Instrument}; @@ -64,7 +64,7 @@ pub(super) struct WalConnectionStatus { pub(super) enum WalReceiverError { /// An error of a type that does not indicate an issue, e.g. a connection closing - ExpectedSafekeeperError(postgres::Error), + ExpectedSafekeeperError(tokio_postgres::Error), /// An "error" message that carries a SUCCESSFUL_COMPLETION status code. Carries /// the message part of the original postgres error SuccessfulCompletion(String), @@ -120,6 +120,7 @@ pub(super) async fn handle_walreceiver_connection( ctx: RequestContext, safekeeper_node: NodeId, ingest_batch_size: u64, + validate_wal_contiguity: bool, ) -> Result<(), WalReceiverError> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -142,7 +143,7 @@ pub(super) async fn handle_walreceiver_connection( let mut config = wal_source_connconf.to_tokio_postgres_config(); config.application_name(format!("pageserver-{}", timeline.conf.id.0).as_str()); config.replication_mode(tokio_postgres::config::ReplicationMode::Physical); - match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await { + match time::timeout(connect_timeout, config.connect(tokio_postgres::NoTls)).await { Ok(client_and_conn) => client_and_conn?, Err(_elapsed) => { // Timing out to connect to a safekeeper node could happen long time, due to @@ -274,6 +275,7 @@ pub(super) async fn handle_walreceiver_connection( } => Some((format, compression)), }; + let mut expected_wal_start = startpoint; while let Some(replication_message) = { select! { _ = cancellation.cancelled() => { @@ -340,13 +342,49 @@ pub(super) async fn handle_walreceiver_connection( ) })?; + // Guard against WAL gaps. If the start LSN of the PG WAL section + // from which the interpreted records were extracted, doesn't match + // the end of the previous batch (or the starting point for the first batch), + // then kill this WAL receiver connection and start a new one. + if validate_wal_contiguity { + if let Some(raw_wal_start_lsn) = batch.raw_wal_start_lsn { + match raw_wal_start_lsn.cmp(&expected_wal_start) { + std::cmp::Ordering::Greater => { + let msg = format!( + "Gap in streamed WAL: [{}, {})", + expected_wal_start, raw_wal_start_lsn + ); + critical!("{msg}"); + return Err(WalReceiverError::Other(anyhow!(msg))); + } + std::cmp::Ordering::Less => { + // Other shards are reading WAL behind us. + // This is valid, but check that we received records + // that we haven't seen before. 
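Recapping the contiguity check added in this hunk (which continues below): when validate_wal_contiguity is on, the start LSN of each interpreted batch is compared against the end of the previous one. A batch starting past the expected position means WAL was lost; a batch starting earlier is tolerated, since other shards may read behind us, as long as it does not re-deliver records that were already ingested. A sketch with LSNs reduced to u64; the real code uses Lsn, raises critical! and tears down the connection:

    use std::cmp::Ordering;

    #[derive(Debug, PartialEq)]
    enum WalBatchCheck {
        Ok,
        Gap { expected: u64, got: u64 },
        AlreadySeenRecords,
    }

    fn check_wal_batch(
        expected_start: u64,
        raw_start: u64,
        first_record_next_lsn: Option<u64>,
        last_ingested_lsn: u64,
    ) -> WalBatchCheck {
        match raw_start.cmp(&expected_start) {
            // The stream skipped ahead: some WAL never reached us.
            Ordering::Greater => WalBatchCheck::Gap { expected: expected_start, got: raw_start },
            // Reading behind is fine, unless the batch replays records we already applied.
            Ordering::Less => match first_record_next_lsn {
                Some(next) if next < last_ingested_lsn => WalBatchCheck::AlreadySeenRecords,
                _ => WalBatchCheck::Ok,
            },
            Ordering::Equal => WalBatchCheck::Ok,
        }
    }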
+ if let Some(first_rec) = batch.records.first() { + if first_rec.next_record_lsn < last_rec_lsn { + let msg = format!( + "Received record with next_record_lsn multiple times ({} < {})", + first_rec.next_record_lsn, expected_wal_start + ); + critical!("{msg}"); + return Err(WalReceiverError::Other(anyhow!(msg))); + } + } + } + std::cmp::Ordering::Equal => {} + } + } + } + let InterpretedWalRecords { records, next_record_lsn, + raw_wal_start_lsn: _, } = batch; tracing::debug!( - "Received WAL up to {} with next_record_lsn={:?}", + "Received WAL up to {} with next_record_lsn={}", streaming_lsn, next_record_lsn ); @@ -423,12 +461,11 @@ pub(super) async fn handle_walreceiver_connection( // need to advance last record LSN on all shards. If we've not ingested the latest // record, then set the LSN of the modification past it. This way all shards // advance their last record LSN at the same time. - let needs_last_record_lsn_advance = match next_record_lsn { - Some(lsn) if lsn > modification.get_lsn() => { - modification.set_lsn(lsn).unwrap(); - true - } - _ => false, + let needs_last_record_lsn_advance = if next_record_lsn > modification.get_lsn() { + modification.set_lsn(next_record_lsn).unwrap(); + true + } else { + false }; if uncommitted_records > 0 || needs_last_record_lsn_advance { @@ -446,9 +483,8 @@ pub(super) async fn handle_walreceiver_connection( timeline.get_last_record_lsn() ); - if let Some(lsn) = next_record_lsn { - last_rec_lsn = lsn; - } + last_rec_lsn = next_record_lsn; + expected_wal_start = streaming_lsn; Some(streaming_lsn) } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 04edb3e3f4..45c87353a7 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1180,6 +1180,50 @@ impl WalIngest { } else { cp.oldestActiveXid = xlog_checkpoint.oldestActiveXid; } + // NB: We abuse the Checkpoint.redo field: + // + // - In PostgreSQL, the Checkpoint struct doesn't store the information + // of whether this is an online checkpoint or a shutdown checkpoint. It's + // stored in the XLOG info field of the WAL record, shutdown checkpoints + // use record type XLOG_CHECKPOINT_SHUTDOWN and online checkpoints use + // XLOG_CHECKPOINT_ONLINE. We don't store the original WAL record headers + // in the pageserver, however. + // + // - In PostgreSQL, the Checkpoint.redo field stores the *start* of the + // checkpoint record, if it's a shutdown checkpoint. But when we are + // starting from a shutdown checkpoint, the basebackup LSN is the *end* + // of the shutdown checkpoint WAL record. That makes it difficult to + // correctly detect whether we're starting from a shutdown record or + // not. + // + // To address both of those issues, we store 0 in the redo field if it's + // an online checkpoint record, and the record's *end* LSN if it's a + // shutdown checkpoint. We don't need the original redo pointer in neon, + // because we don't perform WAL replay at startup anyway, so we can get + // away with abusing the redo field like this. + // + // XXX: Ideally, we would persist the extra information in a more + // explicit format, rather than repurpose the fields of the Postgres + // struct like this. However, we already have persisted data like this, + // so we need to maintain backwards compatibility. + // + // NB: We didn't originally have this convention, so there are still old + // persisted records that didn't do this. Before, we didn't update the + // persisted redo field at all. 
That means that old records have a bogus + // redo pointer that points to some old value, from the checkpoint record + // that was originally imported from the data directory. If it was a + // project created in Neon, that means it points to the first checkpoint + // after initdb. That's OK for our purposes: all such old checkpoints are + // treated as old online checkpoints when the basebackup is created. + cp.redo = if info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN { + // Store the *end* LSN of the checkpoint record. Or to be precise, + // the start LSN of the *next* record, i.e. if the record ends + // exactly at page boundary, the redo LSN points to just after the + // page header on the next page. + lsn.into() + } else { + Lsn::INVALID.into() + }; // Write a new checkpoint key-value pair on every checkpoint record, even // if nothing really changed. Not strictly required, but it seems nice to diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index a61dc9f4c6..f6a577abfc 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -22,6 +22,7 @@ #include "neon_pgversioncompat.h" #include "access/parallel.h" +#include "access/xlog.h" #include "funcapi.h" #include "miscadmin.h" #include "pagestore_client.h" @@ -40,12 +41,16 @@ #include "utils/dynahash.h" #include "utils/guc.h" +#if PG_VERSION_NUM >= 150000 +#include "access/xlogrecovery.h" +#endif + #include "hll.h" #include "bitmap.h" #include "neon.h" #include "neon_perf_counters.h" -#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0) +#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "LFC: assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0) /* * Local file cache is used to temporary store relations pages in local file system. @@ -93,7 +98,23 @@ #define MB ((uint64)1024*1024) #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK)) -#define CHUNK_BITMAP_SIZE ((BLOCKS_PER_CHUNK + 31) / 32) + +/* + * Blocks are read or written to LFC file outside LFC critical section. + * To synchronize access to such block, writer set state of such block to PENDING. + * If some other backend (read or writer) see PENDING status, it change it to REQUESTED and start + * waiting until status is changed on conditional variable. + * When writer completes is operation, it checks if status is REQUESTED and if so, broadcast conditional variable, + * waking up all backend waiting for access to this block. 
+ */ +typedef enum FileCacheBlockState +{ + UNAVAILABLE, /* block is not present in cache */ + AVAILABLE, /* block can be used */ + PENDING, /* block is loaded */ + REQUESTED /* some other backend is waiting for block to be loaded */ +} FileCacheBlockState; + typedef struct FileCacheEntry { @@ -101,10 +122,16 @@ typedef struct FileCacheEntry uint32 hash; uint32 offset; uint32 access_count; - uint32 bitmap[CHUNK_BITMAP_SIZE]; + uint32 state[(BLOCKS_PER_CHUNK + 31) / 32 * 2]; /* two bits per block */ dlist_node list_node; /* LRU/holes list node */ } FileCacheEntry; +#define GET_STATE(entry, i) (((entry)->state[(i) / 16] >> ((i) % 16 * 2)) & 3) +#define SET_STATE(entry, i, new_state) (entry)->state[(i) / 16] = ((entry)->state[(i) / 16] & ~(3 << ((i) % 16 * 2))) | ((new_state) << ((i) % 16 * 2)) + +#define N_COND_VARS 64 +#define CV_WAIT_TIMEOUT 10 + typedef struct FileCacheControl { uint64 generation; /* generation is needed to handle correct hash @@ -118,18 +145,24 @@ typedef struct FileCacheControl uint64 writes; /* number of writes issued */ uint64 time_read; /* time spent reading (us) */ uint64 time_write; /* time spent writing (us) */ + uint64 resizes; /* number of LFC resizes */ + uint64 evicted_pages; /* number of evicted pages */ dlist_head lru; /* double linked list for LRU replacement * algorithm */ dlist_head holes; /* double linked list of punched holes */ HyperLogLogState wss_estimation; /* estimation of working set size */ + ConditionVariable cv[N_COND_VARS]; /* turnstile of condition variables */ } FileCacheControl; +bool lfc_store_prefetch_result; + static HTAB *lfc_hash; -static int lfc_desc = 0; +static int lfc_desc = -1; static LWLockId lfc_lock; static int lfc_max_size; static int lfc_size_limit; static char *lfc_path; +static uint64 lfc_generation; static FileCacheControl *lfc_ctl; static shmem_startup_hook_type prev_shmem_startup_hook; #if PG_VERSION_NUM>=150000 @@ -138,6 +171,20 @@ static shmem_request_hook_type prev_shmem_request_hook; #define LFC_ENABLED() (lfc_ctl->limit != 0) +/* + * Close LFC file if opened. + * All backends should close their LFC files once LFC is disabled. + */ +static void +lfc_close_file(void) +{ + if (lfc_desc >= 0) + { + close(lfc_desc); + lfc_desc = -1; + } +} + /* * Local file cache is optional and Neon can work without it. * In case of any any errors with this cache, we should disable it but to not throw error. 
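To make the new per-block bookkeeping above easier to follow, here is a small Rust model of the two-bits-per-block state array behind the GET_STATE and SET_STATE macros: each 32-bit word tracks 16 blocks, and the four states correspond to FileCacheBlockState. The constants and names below are illustrative, not the patch's code:

    /// The four per-block states, mirroring FileCacheBlockState.
    #[derive(Clone, Copy)]
    enum BlockState {
        Unavailable = 0, // block is not present in the cache chunk
        Available = 1,   // block can be used
        Pending = 2,     // a writer is currently loading the block
        Requested = 3,   // another backend is waiting for the pending load
    }

    const BLOCKS_PER_CHUNK: usize = 128; // assumed value for the illustration

    /// Two bits per block, 16 blocks per u32 word (same layout as the C macros).
    struct ChunkStates {
        words: [u32; (BLOCKS_PER_CHUNK + 15) / 16],
    }

    impl ChunkStates {
        fn new() -> Self {
            Self { words: [0; (BLOCKS_PER_CHUNK + 15) / 16] }
        }
        fn get(&self, i: usize) -> u32 {
            (self.words[i / 16] >> (i % 16 * 2)) & 3
        }
        fn set(&mut self, i: usize, state: BlockState) {
            let shift = i % 16 * 2;
            self.words[i / 16] = (self.words[i / 16] & !(3 << shift)) | ((state as u32) << shift);
        }
    }

    fn main() {
        let mut states = ChunkStates::new();
        states.set(42, BlockState::Pending);
        assert_eq!(states.get(42), BlockState::Pending as u32);
        assert_eq!(states.get(41), BlockState::Unavailable as u32);
    }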
@@ -145,20 +192,16 @@ static shmem_request_hook_type prev_shmem_request_hook; * All cache content should be invalidated to avoid reading of stale or corrupted data */ static void -lfc_disable(char const *op) +lfc_switch_off(void) { int fd; - elog(WARNING, "Failed to %s local file cache at %s: %m, disabling local file cache", op, lfc_path); - - /* Invalidate hash */ - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - if (LFC_ENABLED()) { HASH_SEQ_STATUS status; FileCacheEntry *entry; + /* Invalidate hash */ hash_seq_init(&status, lfc_hash); while ((entry = hash_seq_search(&status)) != NULL) { @@ -171,41 +214,33 @@ lfc_disable(char const *op) dlist_init(&lfc_ctl->lru); dlist_init(&lfc_ctl->holes); - if (lfc_desc > 0) - { - int rc; + /* + * We need to use unlink to to avoid races in LFC write, because it is not + * protected by lock + */ + unlink(lfc_path); - /* - * If the reason of error is ENOSPC, then truncation of file may - * help to reclaim some space - */ - pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_TRUNCATE); - rc = ftruncate(lfc_desc, 0); - pgstat_report_wait_end(); + fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); + if (fd < 0) + elog(WARNING, "LFC: failed to recreate local file cache %s: %m", lfc_path); + else + close(fd); - if (rc < 0) - elog(WARNING, "Failed to truncate local file cache %s: %m", lfc_path); - } + /* Wakeup waiting backends */ + for (int i = 0; i < N_COND_VARS; i++) + ConditionVariableBroadcast(&lfc_ctl->cv[i]); } + lfc_close_file(); +} - /* - * We need to use unlink to to avoid races in LFC write, because it is not - * protectedby - */ - unlink(lfc_path); - - fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); - if (fd < 0) - elog(WARNING, "Failed to recreate local file cache %s: %m", lfc_path); - else - close(fd); +static void +lfc_disable(char const *op) +{ + elog(WARNING, "LFC: failed to %s local file cache at %s: %m, disabling local file cache", op, lfc_path); + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + lfc_switch_off(); LWLockRelease(lfc_lock); - - if (lfc_desc > 0) - close(lfc_desc); - - lfc_desc = -1; } /* @@ -217,13 +252,20 @@ lfc_maybe_disabled(void) return !lfc_ctl || !LFC_ENABLED(); } +/* + * Open LFC file if not opened yet or generation is changed. + * Should be called under LFC lock. 
+ */ static bool lfc_ensure_opened(void) { - bool enabled = !lfc_maybe_disabled(); - + if (lfc_generation != lfc_ctl->generation) + { + lfc_close_file(); + lfc_generation = lfc_ctl->generation; + } /* Open cache file if not done yet */ - if (lfc_desc <= 0 && enabled) + if (lfc_desc < 0) { lfc_desc = BasicOpenFile(lfc_path, O_RDWR); @@ -233,7 +275,7 @@ lfc_ensure_opened(void) return false; } } - return enabled; + return true; } static void @@ -267,14 +309,7 @@ lfc_shmem_startup(void) n_chunks + 1, n_chunks + 1, &info, HASH_ELEM | HASH_BLOBS); - lfc_ctl->generation = 0; - lfc_ctl->size = 0; - lfc_ctl->used = 0; - lfc_ctl->hits = 0; - lfc_ctl->misses = 0; - lfc_ctl->writes = 0; - lfc_ctl->time_read = 0; - lfc_ctl->time_write = 0; + memset(lfc_ctl, 0, sizeof(FileCacheControl)); dlist_init(&lfc_ctl->lru); dlist_init(&lfc_ctl->holes); @@ -285,7 +320,7 @@ lfc_shmem_startup(void) fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); if (fd < 0) { - elog(WARNING, "Failed to create local file cache %s: %m", lfc_path); + elog(WARNING, "LFC: failed to create local file cache %s: %m", lfc_path); lfc_ctl->limit = 0; } else @@ -293,6 +328,11 @@ lfc_shmem_startup(void) close(fd); lfc_ctl->limit = SIZE_MB_TO_CHUNKS(lfc_size_limit); } + + /* Initialize turnstile of condition variables */ + for (int i = 0; i < N_COND_VARS; i++) + ConditionVariableInit(&lfc_ctl->cv[i]); + } LWLockRelease(AddinShmemInitLock); } @@ -327,7 +367,7 @@ lfc_check_limit_hook(int *newval, void **extra, GucSource source) { if (*newval > lfc_max_size) { - elog(ERROR, "neon.file_cache_size_limit can not be larger than neon.max_file_cache_size"); + elog(ERROR, "LFC: neon.file_cache_size_limit can not be larger than neon.max_file_cache_size"); return false; } return true; @@ -338,14 +378,31 @@ lfc_change_limit_hook(int newval, void *extra) { uint32 new_size = SIZE_MB_TO_CHUNKS(newval); - if (!is_normal_backend()) - return; - - if (!lfc_ensure_opened()) + if (!lfc_ctl || !is_normal_backend()) return; LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + /* Open LFC file only if LFC was enabled or we are going to reenable it */ + if (newval == 0 && !LFC_ENABLED()) + { + LWLockRelease(lfc_lock); + /* File should be reopened if LFC is reenabled */ + lfc_close_file(); + return; + } + + if (!lfc_ensure_opened()) + { + LWLockRelease(lfc_lock); + return; + } + + if (lfc_ctl->limit != new_size) + { + lfc_ctl->resizes += 1; + } + while (new_size < lfc_ctl->used && !dlist_is_empty(&lfc_ctl->lru)) { /* @@ -367,7 +424,9 @@ lfc_change_limit_hook(int newval, void *extra) /* We remove the old entry, and re-enter a hole to the hash table */ for (int i = 0; i < BLOCKS_PER_CHUNK; i++) { - lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1; + bool is_page_cached = GET_STATE(victim, i) == AVAILABLE; + lfc_ctl->used_pages -= is_page_cached; + lfc_ctl->evicted_pages += is_page_cached; } hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); @@ -383,10 +442,11 @@ lfc_change_limit_hook(int newval, void *extra) lfc_ctl->used -= 1; } - lfc_ctl->limit = new_size; - if (new_size == 0) { - lfc_ctl->generation += 1; - } + if (new_size == 0) + lfc_switch_off(); + else + lfc_ctl->limit = new_size; + neon_log(DEBUG1, "set local file cache limit to %d", new_size); LWLockRelease(lfc_lock); @@ -403,6 +463,17 @@ lfc_init(void) neon_log(ERROR, "Neon module should be loaded via shared_preload_libraries"); + DefineCustomBoolVariable("neon.store_prefetch_result_in_lfc", + "Immediately store received prefetch result in LFC", + NULL, + 
&lfc_store_prefetch_result, + false, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + DefineCustomIntVariable("neon.max_file_cache_size", "Maximal size of Neon local file cache", NULL, @@ -480,7 +551,7 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) if (LFC_ENABLED()) { entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); - found = entry != NULL && (entry->bitmap[chunk_offs >> 5] & ((uint32)1 << (chunk_offs & 31))) != 0; + found = entry != NULL && GET_STATE(entry, chunk_offs) != UNAVAILABLE; } LWLockRelease(lfc_lock); return found; @@ -529,8 +600,7 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, { for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++) { - if ((entry->bitmap[chunk_offs >> 5] & - ((uint32)1 << (chunk_offs & 31))) != 0) + if (GET_STATE(entry, chunk_offs) != UNAVAILABLE) { BITMAP_SET(bitmap, i); found++; @@ -541,7 +611,6 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, { i += this_chunk; } - /* * Break out of the iteration before doing expensive stuff for * a next iteration @@ -577,87 +646,6 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, return found; } -/* - * Evict a page (if present) from the local file cache - */ -void -lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) -{ - BufferTag tag; - FileCacheEntry *entry; - bool found; - int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); - uint32 hash; - - if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ - return; - - CopyNRelFileInfoToBufTag(tag, rinfo); - tag.forkNum = forkNum; - tag.blockNum = (blkno & ~(BLOCKS_PER_CHUNK - 1)); - - CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - hash = get_hash_value(lfc_hash, &tag); - - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - - if (!LFC_ENABLED()) - { - LWLockRelease(lfc_lock); - return; - } - - entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, &found); - - if (!found) - { - /* nothing to do */ - LWLockRelease(lfc_lock); - return; - } - - /* remove the page from the cache */ - entry->bitmap[chunk_offs >> 5] &= ~((uint32)1 << (chunk_offs & (32 - 1))); - - if (entry->access_count == 0) - { - /* - * If the chunk has no live entries, we can position the chunk to be - * recycled first. - */ - if (entry->bitmap[chunk_offs >> 5] == 0) - { - bool has_remaining_pages = false; - - for (int i = 0; i < CHUNK_BITMAP_SIZE; i++) - { - if (entry->bitmap[i] != 0) - { - has_remaining_pages = true; - break; - } - } - - /* - * Put the entry at the position that is first to be reclaimed when we - * have no cached pages remaining in the chunk - */ - if (!has_remaining_pages) - { - dlist_delete(&entry->list_node); - dlist_push_head(&lfc_ctl->lru, &entry->list_node); - } - } - } - - /* - * Done: apart from empty chunks, we don't move chunks in the LRU when - * they're empty because eviction isn't usage. - */ - - LWLockRelease(lfc_lock); -} - /* * Try to read pages from local cache. 
* Returns the number of pages read from the local cache, and sets bits in @@ -685,17 +673,14 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, int buf_offset = 0; if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ - return 0; - - if (!lfc_ensure_opened()) - return 0; + return -1; CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - /* + /* * For every chunk that has blocks we're interested in, we * 1. get the chunk header * 2. Check if the chunk actually has the blocks we're interested in @@ -712,22 +697,35 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, int iteration_hits = 0; int iteration_misses = 0; uint64 io_time_us = 0; + int n_blocks_to_read = 0; + ConditionVariable* cv; + Assert(blocks_in_chunk > 0); for (int i = 0; i < blocks_in_chunk; i++) { + n_blocks_to_read += (BITMAP_ISSET(mask, buf_offset + i) != 0); iov[i].iov_base = buffers[buf_offset + i]; iov[i].iov_len = BLCKSZ; + BITMAP_CLR(mask, buf_offset + i); + } + if (n_blocks_to_read == 0) + { + buf_offset += blocks_in_chunk; + nblocks -= blocks_in_chunk; + blkno += blocks_in_chunk; + continue; } tag.blockNum = blkno - chunk_offs; hash = get_hash_value(lfc_hash, &tag); + cv = &lfc_ctl->cv[hash % N_COND_VARS]; LWLockAcquire(lfc_lock, LW_EXCLUSIVE); /* We can return the blocks we've read before LFC got disabled; * assuming we read any. */ - if (!LFC_ENABLED()) + if (!LFC_ENABLED() || !lfc_ensure_opened()) { LWLockRelease(lfc_lock); return blocks_read; @@ -763,15 +761,32 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, generation = lfc_ctl->generation; entry_offset = entry->offset; - LWLockRelease(lfc_lock); - for (int i = 0; i < blocks_in_chunk; i++) { - /* - * If the page is valid, we consider it "read". - * All other pages will be fetched separately by the next cache - */ - if (entry->bitmap[(chunk_offs + i) / 32] & ((uint32)1 << ((chunk_offs + i) % 32))) + FileCacheBlockState state = UNAVAILABLE; + bool sleeping = false; + while (lfc_ctl->generation == generation) + { + state = GET_STATE(entry, chunk_offs + i); + if (state == PENDING) { + SET_STATE(entry, chunk_offs + i, REQUESTED); + } else if (state != REQUESTED) { + break; + } + if (!sleeping) + { + ConditionVariablePrepareToSleep(cv); + sleeping = true; + } + LWLockRelease(lfc_lock); + ConditionVariableTimedSleep(cv, CV_WAIT_TIMEOUT, WAIT_EVENT_NEON_LFC_CV_WAIT); + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + } + if (sleeping) + { + ConditionVariableCancelSleep(); + } + if (state == AVAILABLE) { BITMAP_SET(mask, buf_offset + i); iteration_hits++; @@ -779,6 +794,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, else iteration_misses++; } + LWLockRelease(lfc_lock); Assert(iteration_hits + iteration_misses > 0); @@ -820,6 +836,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, else { /* generation mismatch, assume error condition */ + lfc_close_file(); LWLockRelease(lfc_lock); return -1; } @@ -835,6 +852,249 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, return blocks_read; } +/* + * Initialize new LFC hash entry, perform eviction if needed. + * Returns false if there are no unpinned entries and chunk can not be added. 
+ */ +static bool +lfc_init_new_entry(FileCacheEntry* entry, uint32 hash) +{ + /*----------- + * If the chunk wasn't already in the LFC then we have these + * options, in order of preference: + * + * Unless there is no space available, we can: + * 1. Use an entry from the `holes` list, and + * 2. Create a new entry. + * We can always, regardless of space in the LFC: + * 3. evict an entry from LRU, and + * 4. ignore the write operation (the least favorite option) + */ + if (lfc_ctl->used < lfc_ctl->limit) + { + if (!dlist_is_empty(&lfc_ctl->holes)) + { + /* We can reuse a hole that was left behind when the LFC was shrunk previously */ + FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, + dlist_pop_head_node(&lfc_ctl->holes)); + uint32 offset = hole->offset; + bool hole_found; + + hash_search_with_hash_value(lfc_hash, &hole->key, + hole->hash, HASH_REMOVE, &hole_found); + CriticalAssert(hole_found); + + lfc_ctl->used += 1; + entry->offset = offset; /* reuse the hole */ + } + else + { + lfc_ctl->used += 1; + entry->offset = lfc_ctl->size++;/* allocate new chunk at end + * of file */ + } + } + /* + * We've already used up all allocated LFC entries. + * + * If we can clear an entry from the LRU, do that. + * If we can't (e.g. because all other slots are being accessed) + * then we will remove this entry from the hash and continue + * on to the next chunk, as we may not exceed the limit. + */ + else if (!dlist_is_empty(&lfc_ctl->lru)) + { + /* Cache overflow: evict least recently used chunk */ + FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, + dlist_pop_head_node(&lfc_ctl->lru)); + + for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + { + bool is_page_cached = GET_STATE(victim, i) == AVAILABLE; + lfc_ctl->used_pages -= is_page_cached; + lfc_ctl->evicted_pages += is_page_cached; + } + + CriticalAssert(victim->access_count == 0); + entry->offset = victim->offset; /* grab victim's chunk */ + hash_search_with_hash_value(lfc_hash, &victim->key, + victim->hash, HASH_REMOVE, NULL); + neon_log(DEBUG2, "Swap file cache page"); + } + else + { + /* Can't add this chunk - we don't have the space for it */ + hash_search_with_hash_value(lfc_hash, &entry->key, hash, + HASH_REMOVE, NULL); + + return false; + } + + entry->access_count = 1; + entry->hash = hash; + + for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + SET_STATE(entry, i, UNAVAILABLE); + + return true; +} + +/* + * Store received prefetch result in LFC cache. + * Unlike lfc_read/lfc_write this call is is not protected by shared buffer lock. + * So we should be ready that other backends will try to concurrently read or write this block. + * We do not store prefetched block if it already exists in LFC or it's not_modified_since LSN is smaller + * than current last written LSN (LwLSN). + * + * We can enforce correctness of storing page in LFC by the following steps: + * 1. Check under LFC lock that page in not present in LFC. + * 2. Check under LFC lock that LwLSN is not changed since prefetch request time (not_modified_since). + * 3. Change page state to "Pending" under LFC lock to prevent all other backends to read or write this + * pages until this write is completed. + * 4. Assume that some other backend creates new image of the page without reading it + * (because reads will be blocked because of 2). This version of the page is stored in shared buffer. + * Any attempt to throw away this page from shared buffer will be blocked, because Postgres first + * needs to save dirty page and write will be blocked because of 2. 
+ * So any backend trying to access this page will take it from the shared buffer without accessing + * SMGR and LFC. + * 5. After write completion we once again obtain the LFC lock and wake up all waiting backends. + * If some backend is waiting to write a new image of the page (4), it will now be able to + * do it, overwriting the old (prefetched) page image. Since this write completes before the + * shared buffer can be reassigned, no other backend can see the old page image. +*/ +bool +lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, + const void* buffer, XLogRecPtr lsn) +{ + BufferTag tag; + FileCacheEntry *entry; + ssize_t rc; + bool found; + uint32 hash; + uint64 generation; + uint32 entry_offset; + instr_time io_start, io_end; + ConditionVariable* cv; + FileCacheBlockState state; + XLogRecPtr lwlsn; + + int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); + + if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ + return false; + + CopyNRelFileInfoToBufTag(tag, rinfo); + tag.forkNum = forknum; + + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); + + tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); + hash = get_hash_value(lfc_hash, &tag); + cv = &lfc_ctl->cv[hash % N_COND_VARS]; + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + if (!LFC_ENABLED() || !lfc_ensure_opened()) + { + LWLockRelease(lfc_lock); + return false; + } + lwlsn = GetLastWrittenLSN(rinfo, forknum, blkno); + if (lwlsn > lsn) + { + elog(DEBUG1, "Skip LFC write for %d because LwLSN=%X/%X is greater than not_modified_since LSN %X/%X", + blkno, LSN_FORMAT_ARGS(lwlsn), LSN_FORMAT_ARGS(lsn)); + LWLockRelease(lfc_lock); + return false; + } + + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); + + if (found) + { + state = GET_STATE(entry, chunk_offs); + if (state != UNAVAILABLE) { + /* Do not overwrite an existing LFC entry */ + LWLockRelease(lfc_lock); + return false; + } + /* + * Unlink entry from LRU list to pin it for the duration of IO + * operation + */ + if (entry->access_count++ == 0) + dlist_delete(&entry->list_node); + } + else + { + if (!lfc_init_new_entry(entry, hash)) + { + /* + * We can't process this chunk due to lack of space in LFC, + * so skip to the next one + */ + LWLockRelease(lfc_lock); + return false; + } + } + + generation = lfc_ctl->generation; + entry_offset = entry->offset; + + SET_STATE(entry, chunk_offs, PENDING); + + LWLockRelease(lfc_lock); + + pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE); + INSTR_TIME_SET_CURRENT(io_start); + rc = pwrite(lfc_desc, buffer, BLCKSZ, + ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + INSTR_TIME_SET_CURRENT(io_end); + pgstat_report_wait_end(); + + if (rc != BLCKSZ) + { + lfc_disable("write"); + } + else + { + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + if (lfc_ctl->generation == generation) + { + uint64 time_spent_us; + CriticalAssert(LFC_ENABLED()); + /* Place entry to the head of LRU list */ + CriticalAssert(entry->access_count > 0); + + lfc_ctl->writes += 1; + INSTR_TIME_SUBTRACT(io_start, io_end); + time_spent_us = INSTR_TIME_GET_MICROSEC(io_start); + lfc_ctl->time_write += time_spent_us; + inc_page_cache_write_wait(time_spent_us); + + if (--entry->access_count == 0) + dlist_push_tail(&lfc_ctl->lru, &entry->list_node); + + state = GET_STATE(entry, chunk_offs); + if (state == REQUESTED) { + ConditionVariableBroadcast(cv); + } + if (state != AVAILABLE) + { + lfc_ctl->used_pages += 1; + SET_STATE(entry, chunk_offs, AVAILABLE); + } + } + else + { + lfc_close_file(); + } +
LWLockRelease(lfc_lock); + } + return true; +} + /* * Put page in local file cache. * If cache is full then evict some other page. @@ -855,15 +1115,21 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ return; - if (!lfc_ensure_opened()) - return; - CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - /* + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + if (!LFC_ENABLED() || !lfc_ensure_opened()) + { + LWLockRelease(lfc_lock); + return; + } + generation = lfc_ctl->generation; + + /* * For every chunk that has blocks we're interested in, we * 1. get the chunk header * 2. Check if the chunk actually has the blocks we're interested in @@ -878,6 +1144,8 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); instr_time io_start, io_end; + ConditionVariable* cv; + Assert(blocks_in_chunk > 0); for (int i = 0; i < blocks_in_chunk; i++) @@ -888,14 +1156,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); hash = get_hash_value(lfc_hash, &tag); - - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - - if (!LFC_ENABLED()) - { - LWLockRelease(lfc_lock); - return; - } + cv = &lfc_ctl->cv[hash % N_COND_VARS]; entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); @@ -908,92 +1169,50 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (entry->access_count++ == 0) dlist_delete(&entry->list_node); } - /*----------- - * If the chunk wasn't already in the LFC then we have these - * options, in order of preference: - * - * Unless there is no space available, we can: - * 1. Use an entry from the `holes` list, and - * 2. Create a new entry. - * We can always, regardless of space in the LFC: - * 3. evict an entry from LRU, and - * 4. ignore the write operation (the least favorite option) - */ - else if (lfc_ctl->used < lfc_ctl->limit) - { - if (!dlist_is_empty(&lfc_ctl->holes)) - { - /* We can reuse a hole that was left behind when the LFC was shrunk previously */ - FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, - dlist_pop_head_node(&lfc_ctl->holes)); - uint32 offset = hole->offset; - bool hole_found; - - hash_search_with_hash_value(lfc_hash, &hole->key, - hole->hash, HASH_REMOVE, &hole_found); - CriticalAssert(hole_found); - - lfc_ctl->used += 1; - entry->offset = offset; /* reuse the hole */ - } - else - { - lfc_ctl->used += 1; - entry->offset = lfc_ctl->size++;/* allocate new chunk at end - * of file */ - } - } - /* - * We've already used up all allocated LFC entries. - * - * If we can clear an entry from the LRU, do that. - * If we can't (e.g. because all other slots are being accessed) - * then we will remove this entry from the hash and continue - * on to the next chunk, as we may not exceed the limit. 
- */ - else if (!dlist_is_empty(&lfc_ctl->lru)) - { - /* Cache overflow: evict least recently used chunk */ - FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, - dlist_pop_head_node(&lfc_ctl->lru)); - - for (int i = 0; i < BLOCKS_PER_CHUNK; i++) - { - lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1; - } - - CriticalAssert(victim->access_count == 0); - entry->offset = victim->offset; /* grab victim's chunk */ - hash_search_with_hash_value(lfc_hash, &victim->key, - victim->hash, HASH_REMOVE, NULL); - neon_log(DEBUG2, "Swap file cache page"); - } else { - /* Can't add this chunk - we don't have the space for it */ - hash_search_with_hash_value(lfc_hash, &entry->key, hash, - HASH_REMOVE, NULL); - - /* - * We can't process this chunk due to lack of space in LFC, - * so skip to the next one - */ - LWLockRelease(lfc_lock); - blkno += blocks_in_chunk; - buf_offset += blocks_in_chunk; - nblocks -= blocks_in_chunk; - continue; + if (!lfc_init_new_entry(entry, hash)) + { + /* + * We can't process this chunk due to lack of space in LFC, + * so skip to the next one + */ + blkno += blocks_in_chunk; + buf_offset += blocks_in_chunk; + nblocks -= blocks_in_chunk; + continue; + } } - if (!found) - { - entry->access_count = 1; - entry->hash = hash; - memset(entry->bitmap, 0, sizeof entry->bitmap); - } - - generation = lfc_ctl->generation; entry_offset = entry->offset; + + for (int i = 0; i < blocks_in_chunk; i++) + { + FileCacheBlockState state = UNAVAILABLE; + bool sleeping = false; + while (lfc_ctl->generation == generation) + { + state = GET_STATE(entry, chunk_offs + i); + if (state == PENDING) { + SET_STATE(entry, chunk_offs + i, REQUESTED); + } else if (state != REQUESTED) { + SET_STATE(entry, chunk_offs + i, PENDING); + break; + } + if (!sleeping) + { + ConditionVariablePrepareToSleep(cv); + sleeping = true; + } + LWLockRelease(lfc_lock); + ConditionVariableTimedSleep(cv, CV_WAIT_TIMEOUT, WAIT_EVENT_NEON_LFC_CV_WAIT); + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + } + if (sleeping) + { + ConditionVariableCancelSleep(); + } + } LWLockRelease(lfc_lock); pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE); @@ -1006,6 +1225,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (rc != BLCKSZ * blocks_in_chunk) { lfc_disable("write"); + return; } else { @@ -1029,18 +1249,30 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, for (int i = 0; i < blocks_in_chunk; i++) { - lfc_ctl->used_pages += 1 - ((entry->bitmap[(chunk_offs + i) >> 5] >> ((chunk_offs + i) & 31)) & 1); - entry->bitmap[(chunk_offs + i) >> 5] |= - ((uint32)1 << ((chunk_offs + i) & 31)); + FileCacheBlockState state = GET_STATE(entry, chunk_offs + i); + if (state == REQUESTED) + { + ConditionVariableBroadcast(cv); + } + if (state != AVAILABLE) + { + lfc_ctl->used_pages += 1; + SET_STATE(entry, chunk_offs + i, AVAILABLE); + } } } - - LWLockRelease(lfc_lock); + else + { + /* stop iteration if LFC was disabled */ + lfc_close_file(); + break; + } } blkno += blocks_in_chunk; buf_offset += blocks_in_chunk; nblocks -= blocks_in_chunk; } + LWLockRelease(lfc_lock); } typedef struct @@ -1127,6 +1359,16 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS) if (lfc_ctl) value = lfc_ctl->used_pages; break; + case 6: + key = "file_cache_evicted_pages"; + if (lfc_ctl) + value = lfc_ctl->evicted_pages; + break; + case 7: + key = "file_cache_limit"; + if (lfc_ctl) + value = lfc_ctl->limit; + break; default: SRF_RETURN_DONE(funcctx); } @@ -1250,8 +1492,8 @@ local_cache_pages(PG_FUNCTION_ARGS) 
hash_seq_init(&status, lfc_hash); while ((entry = hash_seq_search(&status)) != NULL) { - for (int i = 0; i < CHUNK_BITMAP_SIZE; i++) - n_pages += pg_popcount32(entry->bitmap[i]); + for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + n_pages += GET_STATE(entry, i) == AVAILABLE; } } } @@ -1279,7 +1521,7 @@ local_cache_pages(PG_FUNCTION_ARGS) { for (int i = 0; i < BLOCKS_PER_CHUNK; i++) { - if (entry->bitmap[i >> 5] & ((uint32)1 << (i & 31))) + if (GET_STATE(entry, i) == AVAILABLE) { fctx->record[n].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i; fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)); diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index ce2938cfd5..700a942284 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -56,6 +56,7 @@ uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE; uint32 WAIT_EVENT_NEON_LFC_READ; uint32 WAIT_EVENT_NEON_LFC_TRUNCATE; uint32 WAIT_EVENT_NEON_LFC_WRITE; +uint32 WAIT_EVENT_NEON_LFC_CV_WAIT; uint32 WAIT_EVENT_NEON_PS_STARTING; uint32 WAIT_EVENT_NEON_PS_CONFIGURING; uint32 WAIT_EVENT_NEON_PS_SEND; @@ -538,6 +539,7 @@ neon_shmem_startup_hook(void) WAIT_EVENT_NEON_LFC_READ = WaitEventExtensionNew("Neon/FileCache_Read"); WAIT_EVENT_NEON_LFC_TRUNCATE = WaitEventExtensionNew("Neon/FileCache_Truncate"); WAIT_EVENT_NEON_LFC_WRITE = WaitEventExtensionNew("Neon/FileCache_Write"); + WAIT_EVENT_NEON_LFC_CV_WAIT = WaitEventExtensionNew("Neon/FileCache_CvWait"); WAIT_EVENT_NEON_PS_STARTING = WaitEventExtensionNew("Neon/PS_Starting"); WAIT_EVENT_NEON_PS_CONFIGURING = WaitEventExtensionNew("Neon/PS_Configuring"); WAIT_EVENT_NEON_PS_SEND = WaitEventExtensionNew("Neon/PS_SendIO"); diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index 79aa88b8d3..912e09c3d3 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -28,6 +28,7 @@ extern uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE; extern uint32 WAIT_EVENT_NEON_LFC_READ; extern uint32 WAIT_EVENT_NEON_LFC_TRUNCATE; extern uint32 WAIT_EVENT_NEON_LFC_WRITE; +extern uint32 WAIT_EVENT_NEON_LFC_CV_WAIT; extern uint32 WAIT_EVENT_NEON_PS_STARTING; extern uint32 WAIT_EVENT_NEON_PS_CONFIGURING; extern uint32 WAIT_EVENT_NEON_PS_SEND; @@ -38,6 +39,7 @@ extern uint32 WAIT_EVENT_NEON_WAL_DL; #define WAIT_EVENT_NEON_LFC_READ WAIT_EVENT_BUFFILE_READ #define WAIT_EVENT_NEON_LFC_TRUNCATE WAIT_EVENT_BUFFILE_TRUNCATE #define WAIT_EVENT_NEON_LFC_WRITE WAIT_EVENT_BUFFILE_WRITE +#define WAIT_EVENT_NEON_LFC_CV_WAIT WAIT_EVENT_BUFFILE_READ #define WAIT_EVENT_NEON_PS_STARTING PG_WAIT_EXTENSION #define WAIT_EVENT_NEON_PS_CONFIGURING PG_WAIT_EXTENSION #define WAIT_EVENT_NEON_PS_SEND PG_WAIT_EXTENSION diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 7b748d7252..9faab1e4f0 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -233,6 +233,7 @@ extern char *neon_timeline; extern char *neon_tenant; extern int32 max_cluster_size; extern int neon_protocol_version; +extern bool lfc_store_prefetch_result; extern shardno_t get_shard_number(BufferTag* tag); @@ -301,14 +302,16 @@ extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno); extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, int nblocks, bits8 *bitmap); -extern void lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno); extern void lfc_init(void); +extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, + const void* buffer, XLogRecPtr lsn); + static inline bool lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber 
blkno, void *buffer) { - bits8 rv = 0; + bits8 rv = 1; return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1; } diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index f1087a8ccb..4a79acd777 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -162,7 +162,7 @@ static uint32 local_request_counter; * UNUSED ------> REQUESTED --> RECEIVED * ^ : | | * | : v | - * | : TAG_UNUSED | + * | : TAG_REMAINS | * | : | | * +----------------+------------+ * : @@ -181,7 +181,7 @@ typedef enum PrefetchStatus /* must fit in uint8; bits 0x1 are used */ typedef enum { PRFSF_NONE = 0x0, - PRFSF_SEQ = 0x1, + PRFSF_LFC = 0x1 /* received prefetch result is stored in LFC */ } PrefetchRequestFlags; typedef struct PrefetchRequest @@ -305,7 +305,7 @@ GetLastWrittenLSNv(NRelFileInfo relfilenode, ForkNumber forknum, static void neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, neon_request_lsns *output, - BlockNumber nblocks, const bits8 *mask); + BlockNumber nblocks); static bool neon_prefetch_response_usable(neon_request_lsns *request_lsns, PrefetchRequest *slot); @@ -363,6 +363,7 @@ compact_prefetch_buffers(void) target_slot->buftag = source_slot->buftag; target_slot->shard_no = source_slot->shard_no; target_slot->status = source_slot->status; + target_slot->flags = source_slot->flags; target_slot->response = source_slot->response; target_slot->reqid = source_slot->reqid; target_slot->request_lsns = source_slot->request_lsns; @@ -452,6 +453,18 @@ prefetch_pump_state(void) /* update slot state */ slot->status = PRFS_RECEIVED; slot->response = response; + + if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && lfc_store_prefetch_result) + { + /* + * Store the prefetched result in the LFC (see the comments above lfc_prefetch + * explaining why this can be done without holding the shared buffer lock) + */ + if (lfc_prefetch(BufTagGetNRelFileInfo(slot->buftag), slot->buftag.forkNum, slot->buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since)) + { + slot->flags |= PRFSF_LFC; + } + } } } @@ -474,8 +487,7 @@ readahead_buffer_resize(int newsize, void *extra) */ if (MyPState->n_requests_inflight > newsize) { - Assert(MyPState->ring_unused >= MyPState->n_requests_inflight - newsize); - prefetch_wait_for(MyPState->ring_unused - (MyPState->n_requests_inflight - newsize)); + prefetch_wait_for(MyPState->ring_unused - newsize - 1); Assert(MyPState->n_requests_inflight <= newsize); } @@ -714,6 +726,18 @@ prefetch_read(PrefetchRequest *slot) /* update slot state */ slot->status = PRFS_RECEIVED; slot->response = response; + + if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && lfc_store_prefetch_result) + { + /* + * Store the prefetched result in the LFC (see the comments above lfc_prefetch + * explaining why this can be done without holding the shared buffer lock) + */ + if (lfc_prefetch(BufTagGetNRelFileInfo(buftag), buftag.forkNum, buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since)) + { + slot->flags |= PRFSF_LFC; + } + } return true; } else @@ -865,7 +889,7 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns else neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag), slot->buftag.forkNum, slot->buftag.blockNum, - &slot->request_lsns, 1, NULL); + &slot->request_lsns, 1); request.hdr.lsn = slot->request_lsns.request_lsn; request.hdr.not_modified_since = slot->request_lsns.not_modified_since; @@ -891,6
+915,73 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns Assert(!found); } +/* + * Lookup of already received prefetch requests. Only already received responses matching required LSNs are accepted. + * Present pages are marked in "mask" bitmap and total number of such pages is returned. + */ +static int +prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, neon_request_lsns *lsns, + BlockNumber nblocks, void **buffers, bits8 *mask) +{ + int hits = 0; + PrefetchRequest hashkey; + + /* + * Use an intermediate PrefetchRequest struct as the hash key to ensure + * correct alignment and that the padding bytes are cleared. + */ + memset(&hashkey.buftag, 0, sizeof(BufferTag)); + CopyNRelFileInfoToBufTag(hashkey.buftag, rinfo); + hashkey.buftag.forkNum = forknum; + + for (int i = 0; i < nblocks; i++) + { + PrfHashEntry *entry; + + hashkey.buftag.blockNum = blocknum + i; + entry = prfh_lookup(MyPState->prf_hash, &hashkey); + + if (entry != NULL) + { + PrefetchRequest *slot = entry->slot; + uint64 ring_index = slot->my_ring_index; + Assert(slot == GetPrfSlot(ring_index)); + + Assert(slot->status != PRFS_UNUSED); + Assert(MyPState->ring_last <= ring_index && + ring_index < MyPState->ring_unused); + Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag)); + + if (slot->status != PRFS_RECEIVED) + continue; + + /* + * If the caller specified a request LSN to use, only accept + * prefetch responses that satisfy that request. + */ + if (!neon_prefetch_response_usable(&lsns[i], slot)) + continue; + + memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ); + prefetch_set_unused(ring_index); + BITMAP_SET(mask, i); + + hits += 1; + } + } + pgBufferUsage.prefetch.hits += hits; + return hits; +} + +#if PG_MAJORVERSION_NUM < 17 +static bool +prefetch_lookup(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkn, neon_request_lsns *lsns, void *buffer) +{ + bits8 present = 0; + return prefetch_lookupv(rinfo, forkNum, blkn, lsns, 1, &buffer, &present) != 0; +} +#endif + /* * prefetch_register_bufferv() - register and prefetch buffers * @@ -1014,8 +1105,6 @@ Retry: /* The buffered request is good enough, return that index */ if (is_prefetch) pgBufferUsage.prefetch.duplicates++; - else - pgBufferUsage.prefetch.hits++; continue; } } @@ -1117,6 +1206,7 @@ Retry: slot->buftag = hashkey.buftag; slot->shard_no = get_shard_number(&tag); slot->my_ring_index = ring_index; + slot->flags = 0; min_ring_index = Min(min_ring_index, ring_index); @@ -2057,8 +2147,7 @@ GetLastWrittenLSNv(NRelFileInfo relfilenode, ForkNumber forknum, */ static void neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, - neon_request_lsns *output, BlockNumber nblocks, - const bits8 *mask) + neon_request_lsns *output, BlockNumber nblocks) { XLogRecPtr last_written_lsns[PG_IOV_MAX]; @@ -2146,9 +2235,6 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, neon_request_lsns *result = &output[i]; XLogRecPtr last_written_lsn = last_written_lsns[i]; - if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) - continue; - if (last_written_lsn > replay_lsn) { /* GetCurrentReplayRecPtr was introduced in v15 */ @@ -2191,8 +2277,6 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, neon_request_lsns *result = &output[i]; XLogRecPtr last_written_lsn = last_written_lsns[i]; - if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) - continue; /* * Use the latest LSN that was evicted from the buffer cache as 
the * 'not_modified_since' hint. Any pages modified by later WAL records @@ -2414,7 +2498,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) } neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, - REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); { NeonExistsRequest request = { .hdr.tag = T_NeonExistsRequest, @@ -2833,8 +2917,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, while (nblocks > 0) { int iterblocks = Min(nblocks, PG_IOV_MAX); - bits8 lfc_present[PG_IOV_MAX / 8]; - memset(lfc_present, 0, sizeof(lfc_present)); + bits8 lfc_present[PG_IOV_MAX / 8] = {0}; if (lfc_cache_containsv(InfoFromSMgrRel(reln), forknum, blocknum, iterblocks, lfc_present) == iterblocks) @@ -2845,12 +2928,13 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, } tag.blockNum = blocknum; - + for (int i = 0; i < PG_IOV_MAX / 8; i++) lfc_present[i] = ~(lfc_present[i]); ring_index = prefetch_register_bufferv(tag, NULL, iterblocks, lfc_present, true); + nblocks -= iterblocks; blocknum += iterblocks; @@ -3106,7 +3190,8 @@ Retry: } } memcpy(buffer, getpage_resp->page, BLCKSZ); - lfc_write(rinfo, forkNum, blockno, buffer); + if (!lfc_store_prefetch_result) + lfc_write(rinfo, forkNum, blockno, buffer); break; } case T_NeonErrorResponse: @@ -3191,6 +3276,17 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } + /* Try to read PS results if they are available */ + prefetch_pump_state(); + + neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1); + + if (prefetch_lookup(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, buffer)) + { + /* Prefetch hit */ + return; + } + /* Try to read from local file cache */ if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer)) { @@ -3198,9 +3294,11 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer return; } - neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, NULL); neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); + /* + * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. 
+ */ prefetch_pump_state(); #ifdef DEBUG_COMPARE_LOCAL @@ -3281,11 +3379,14 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer #if PG_MAJORVERSION_NUM >= 17 static void neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - void **buffers, BlockNumber nblocks) + void **buffers, BlockNumber nblocks) { + bits8 prefetch_hits[PG_IOV_MAX / 8] = {0}; + bits8 lfc_hits[PG_IOV_MAX / 8]; bits8 read[PG_IOV_MAX / 8]; neon_request_lsns request_lsns[PG_IOV_MAX]; int lfc_result; + int prefetch_result; switch (reln->smgr_relpersistence) { @@ -3308,38 +3409,52 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, neon_log(ERROR, "Read request too large: %d is larger than max %d", nblocks, PG_IOV_MAX); - memset(read, 0, sizeof(read)); + /* Try to read PS results if they are available */ + prefetch_pump_state(); + + neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum, + request_lsns, nblocks); + + + prefetch_result = prefetch_lookupv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, nblocks, buffers, prefetch_hits); + + if (prefetch_result == nblocks) + return; + + /* invert the result: exclude prefetched blocks */ + for (int i = 0; i < PG_IOV_MAX / 8; i++) + lfc_hits[i] = ~prefetch_hits[i]; /* Try to read from local file cache */ lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers, - nblocks, read); + nblocks, lfc_hits); if (lfc_result > 0) MyNeonCounters->file_cache_hits_total += lfc_result; /* Read all blocks from LFC, so we're done */ - if (lfc_result == nblocks) + if (prefetch_result + lfc_result == nblocks) return; - if (lfc_result == -1) + if (lfc_result <= 0) { /* can't use the LFC result, so read all blocks from PS */ for (int i = 0; i < PG_IOV_MAX / 8; i++) - read[i] = 0xFF; + read[i] = ~prefetch_hits[i]; } else { /* invert the result: exclude blocks read from lfc */ for (int i = 0; i < PG_IOV_MAX / 8; i++) - read[i] = ~(read[i]); + read[i] = ~(prefetch_hits[i] | lfc_hits[i]); } - neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum, - request_lsns, nblocks, read); - neon_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, buffers, nblocks, read); + /* + * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. + */ prefetch_pump_state(); #ifdef DEBUG_COMPARE_LOCAL @@ -3611,7 +3726,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) } neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, - REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); { NeonNblocksRequest request = { @@ -3696,7 +3811,7 @@ neon_dbsize(Oid dbNode) NRelFileInfo dummy_node = {0}; neon_get_request_lsns(dummy_node, MAIN_FORKNUM, - REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); { NeonDbSizeRequest request = { @@ -4431,7 +4546,12 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) if (no_redo_needed) { SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno); - lfc_evict(rinfo, forknum, blkno); + /* + * Redo changes if page exists in LFC. + * We should perform this check after assigning LwLSN to prevent + * prefetching of some older version of the page by some other backend. 
+ */ + no_redo_needed = !lfc_cache_contains(rinfo, forknum, blkno); } LWLockRelease(partitionLock); diff --git a/poetry.lock b/poetry.lock index d66c3aae7a..ba3b0535e4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -122,7 +122,7 @@ multidict = ">=4.5,<7.0" yarl = ">=1.12.0,<2.0" [package.extras] -speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"] +speedups = ["Brotli ; platform_python_implementation == \"CPython\"", "aiodns (>=3.2.0) ; sys_platform == \"linux\" or sys_platform == \"darwin\"", "brotlicffi ; platform_python_implementation != \"CPython\""] [[package]] name = "aiopg" @@ -160,30 +160,30 @@ frozenlist = ">=1.1.0" [[package]] name = "allure-pytest" -version = "2.13.2" +version = "2.13.5" description = "Allure pytest integration" optional = false python-versions = "*" groups = ["main"] files = [ - {file = "allure-pytest-2.13.2.tar.gz", hash = "sha256:22243159e8ec81ce2b5254b4013802198821b1b42f118f69d4a289396607c7b3"}, - {file = "allure_pytest-2.13.2-py3-none-any.whl", hash = "sha256:17de9dbee7f61c8e66a5b5e818b00e419dbcea44cb55c24319401ba813220690"}, + {file = "allure-pytest-2.13.5.tar.gz", hash = "sha256:0ef8e1790c44a988db6b83c4d4f5e91451e2c4c8ea10601dfa88528d23afcf6e"}, + {file = "allure_pytest-2.13.5-py3-none-any.whl", hash = "sha256:94130bac32964b78058e62cf4b815ad97a5ac82a065e6dd2d43abac2be7640fc"}, ] [package.dependencies] -allure-python-commons = "2.13.2" +allure-python-commons = "2.13.5" pytest = ">=4.5.0" [[package]] name = "allure-python-commons" -version = "2.13.2" -description = "Common module for integrate allure with python-based frameworks" +version = "2.13.5" +description = "('Contains the API for end users as well as helper functions and classes to build Allure adapters for Python test frameworks',)" optional = false python-versions = ">=3.6" groups = ["main"] files = [ - {file = "allure-python-commons-2.13.2.tar.gz", hash = "sha256:8a03681330231b1deadd86b97ff68841c6591320114ae638570f1ed60d7a2033"}, - {file = "allure_python_commons-2.13.2-py3-none-any.whl", hash = "sha256:2bb3646ec3fbf5b36d178a5e735002bc130ae9f9ba80f080af97d368ba375051"}, + {file = "allure-python-commons-2.13.5.tar.gz", hash = "sha256:a232e7955811f988e49a4c1dd6c16cce7e9b81d0ea0422b1e5654d3254e2caf3"}, + {file = "allure_python_commons-2.13.5-py3-none-any.whl", hash = "sha256:8b0e837b6e32d810adec563f49e1d04127a5b6770e0232065b7cb09b9953980d"}, ] [package.dependencies] @@ -232,7 +232,7 @@ sniffio = ">=1.1" [package.extras] doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] -test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"] +test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17) ; platform_python_implementation == \"CPython\" and platform_system != \"Windows\""] trio = ["trio (>=0.23)"] [[package]] @@ -308,8 +308,8 @@ files = [ [package.extras] docs = ["Sphinx (>=8.1.3,<8.2.0)", "sphinx-rtd-theme (>=1.2.2)"] -gssauth = ["gssapi", "sspilib"] -test = ["distro (>=1.9.0,<1.10.0)", "flake8 (>=6.1,<7.0)", "flake8-pyi (>=24.1.0,<24.2.0)", "gssapi", "k5test", "mypy 
(>=1.8.0,<1.9.0)", "sspilib", "uvloop (>=0.15.3)"] +gssauth = ["gssapi ; platform_system != \"Windows\"", "sspilib ; platform_system == \"Windows\""] +test = ["distro (>=1.9.0,<1.10.0)", "flake8 (>=6.1,<7.0)", "flake8-pyi (>=24.1.0,<24.2.0)", "gssapi ; platform_system == \"Linux\"", "k5test ; platform_system == \"Linux\"", "mypy (>=1.8.0,<1.9.0)", "sspilib ; platform_system == \"Windows\"", "uvloop (>=0.15.3) ; platform_system != \"Windows\" and python_version < \"3.14.0\""] [[package]] name = "attrs" @@ -324,10 +324,10 @@ files = [ ] [package.extras] -dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"] +dev = ["cloudpickle ; platform_python_implementation == \"CPython\"", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"] docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"] -tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"] -tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] +tests = ["cloudpickle ; platform_python_implementation == \"CPython\"", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"] +tests-no-zope = ["cloudpickle ; platform_python_implementation == \"CPython\"", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] [[package]] name = "aws-sam-translator" @@ -1074,10 +1074,10 @@ files = [ cffi = {version = ">=1.12", markers = "platform_python_implementation != \"PyPy\""} [package.extras] -docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=3.0.0)"] +docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=3.0.0) ; python_version >= \"3.8\""] docstest = ["pyenchant (>=3)", "readme-renderer (>=30.0)", "sphinxcontrib-spelling (>=7.3.1)"] -nox = ["nox (>=2024.4.15)", "nox[uv] (>=2024.3.2)"] -pep8test = ["check-sdist", "click (>=8.0.1)", "mypy (>=1.4)", "ruff (>=0.3.6)"] +nox = ["nox (>=2024.4.15)", "nox[uv] (>=2024.3.2) ; python_version >= \"3.8\""] +pep8test = ["check-sdist ; python_version >= \"3.8\"", "click (>=8.0.1)", "mypy (>=1.4)", "ruff (>=0.3.6)"] sdist = ["build (>=1.0.0)"] ssh = ["bcrypt (>=3.1.5)"] test = ["certifi (>=2024)", "cryptography-vectors (==44.0.1)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"] @@ -1359,7 +1359,7 @@ idna = "*" sniffio = "*" [package.extras] -brotli = ["brotli", "brotlicffi"] +brotli = ["brotli ; platform_python_implementation == \"CPython\"", "brotlicffi ; platform_python_implementation != \"CPython\""] cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] http2 = ["h2 (>=3,<5)"] socks = ["socksio (==1.*)"] @@ -1545,8 +1545,8 @@ files = [ [package.extras] docs = ["jaraco.packaging (>=3.2)", "rst.linker (>=1.9)", "sphinx"] -testing = ["ecdsa", "enum34", "feedparser", "jsonlib", "numpy", "pandas", "pymongo", "pytest (>=3.5,!=3.7.3)", "pytest-black-multipy", "pytest-checkdocs (>=1.2.3)", "pytest-cov", "pytest-flake8 (<1.1.0)", "pytest-flake8 (>=1.1.1)", "scikit-learn", "sqlalchemy"] -testing-libs = ["simplejson", "ujson", "yajl"] +testing = ["ecdsa", 
"enum34 ; python_version == \"2.7\"", "feedparser", "jsonlib ; python_version == \"2.7\"", "numpy", "pandas", "pymongo", "pytest (>=3.5,!=3.7.3)", "pytest-black-multipy", "pytest-checkdocs (>=1.2.3)", "pytest-cov", "pytest-flake8 (<1.1.0) ; python_version <= \"3.6\"", "pytest-flake8 (>=1.1.1) ; python_version >= \"3.7\"", "scikit-learn", "sqlalchemy"] +testing-libs = ["simplejson", "ujson", "yajl ; python_version == \"2.7\""] [[package]] name = "jsonpointer" @@ -1867,7 +1867,7 @@ files = [ [package.extras] develop = ["codecov", "pycodestyle", "pytest (>=4.6)", "pytest-cov", "wheel"] docs = ["sphinx"] -gmpy = ["gmpy2 (>=2.1.0a4)"] +gmpy = ["gmpy2 (>=2.1.0a4) ; platform_python_implementation != \"PyPy\""] tests = ["pytest (>=4.6)"] [[package]] @@ -2330,7 +2330,7 @@ files = [ ] [package.extras] -test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] +test = ["enum34 ; python_version <= \"3.4\"", "ipaddress ; python_version < \"3.0\"", "mock ; python_version < \"3.0\"", "pywin32 ; sys_platform == \"win32\"", "wmi ; sys_platform == \"win32\""] [[package]] name = "psycopg2-binary" @@ -2456,7 +2456,7 @@ typing-extensions = ">=4.12.2" [package.extras] email = ["email-validator (>=2.0.0)"] -timezone = ["tzdata"] +timezone = ["tzdata ; python_version >= \"3.9\" and platform_system == \"Windows\""] [[package]] name = "pydantic-core" @@ -3068,7 +3068,7 @@ requests = ">=2.30.0,<3.0" urllib3 = ">=1.25.10,<3.0" [package.extras] -tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "tomli", "tomli-w", "types-PyYAML", "types-requests"] +tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "tomli ; python_version < \"3.11\"", "tomli-w", "types-PyYAML", "types-requests"] [[package]] name = "rfc3339-validator" @@ -3161,7 +3161,7 @@ files = [ [package.extras] docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov ; platform_python_implementation != \"PyPy\"", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] [[package]] name = "six" @@ -3407,8 
+3407,8 @@ files = [ ] [package.extras] -brotli = ["brotli (==1.0.9)", "brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] -secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] +brotli = ["brotli (==1.0.9) ; os_name != \"nt\" and python_version < \"3\" and platform_python_implementation == \"CPython\"", "brotli (>=1.0.9) ; python_version >= \"3\" and platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; (os_name != \"nt\" or python_version >= \"3\") and platform_python_implementation != \"CPython\"", "brotlipy (>=0.6.0) ; os_name == \"nt\" and python_version < \"3\""] +secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress ; python_version == \"2.7\"", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [[package]] @@ -3820,4 +3820,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "00ddc42c32e235b6171845fc066dcab078282ed832cd464d5e8a0afa959dd04a" +content-hash = "9711c5479c867fa614ce3d352f1bbc63dba1cb2376d347f96fbeda6f512ee308" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 6a381bf094..5964b76ecf 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "proxy" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [features] diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 7503b4eac9..dd48384c03 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -8,16 +8,16 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, info_span}; use super::ComputeCredentialKeys; -use crate::auth::backend::ComputeUserInfo; use crate::auth::IpPattern; +use crate::auth::backend::ComputeUserInfo; use crate::cache::Cached; use crate::config::AuthenticationConfig; use crate::context::RequestContext; use crate::control_plane::client::cplane_proxy_v1; use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; use crate::error::{ReportableError, UserFacingError}; -use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; +use crate::proxy::connect_compute::ComputeConnectBackend; use crate::stream::PqStream; use crate::types::RoleName; use crate::{auth, compute, waiters}; diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 5d032c0deb..942f1e13d1 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -6,9 +6,9 @@ use std::time::{Duration, SystemTime}; use arc_swap::ArcSwapOption; use clashmap::ClashMap; use jose_jwk::crypto::KeyInfo; -use reqwest::{redirect, Client}; -use reqwest_retry::policies::ExponentialBackoff; +use reqwest::{Client, redirect}; use reqwest_retry::RetryTransientMiddleware; +use reqwest_retry::policies::ExponentialBackoff; use serde::de::Visitor; use serde::{Deserialize, Deserializer}; use serde_json::value::RawValue; @@ -498,8 +498,8 @@ fn verify_rsa_signature( alg: &jose_jwa::Algorithm, ) -> Result<(), JwtError> { use jose_jwa::{Algorithm, Signing}; - use rsa::pkcs1v15::{Signature, VerifyingKey}; use rsa::RsaPublicKey; + use rsa::pkcs1v15::{Signature, VerifyingKey}; let key = RsaPublicKey::try_from(key).map_err(JwtError::InvalidRsaKey)?; diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index d10f0e82b2..9c3a3772cd 100644 --- a/proxy/src/auth/backend/local.rs +++ 
b/proxy/src/auth/backend/local.rs @@ -8,8 +8,8 @@ use crate::auth::backend::jwt::FetchAuthRulesError; use crate::compute::ConnCfg; use crate::compute_ctl::ComputeCtlApi; use crate::context::RequestContext; -use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}; use crate::control_plane::NodeInfo; +use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}; use crate::http; use crate::intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag}; use crate::types::EndpointId; diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index dc595844c5..83feed5094 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -18,7 +18,7 @@ use tracing::{debug, info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; use crate::auth::{ - self, validate_password_and_exchange, AuthError, ComputeUserInfoMaybeEndpoint, IpPattern, + self, AuthError, ComputeUserInfoMaybeEndpoint, IpPattern, validate_password_and_exchange, }; use crate::cache::Cached; use crate::config::AuthenticationConfig; @@ -32,8 +32,8 @@ use crate::control_plane::{ use crate::intern::EndpointIdInt; use crate::metrics::Metrics; use crate::protocol2::ConnectionInfoExtra; -use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; +use crate::proxy::connect_compute::ComputeConnectBackend; use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter}; use crate::stream::Stream; use crate::types::{EndpointCacheKey, EndpointId, RoleName}; @@ -308,10 +308,7 @@ async fn auth_quirks( let incoming_vpc_endpoint_id = match ctx.extra() { None => return Err(AuthError::MissingEndpointName), - Some(ConnectionInfoExtra::Aws { vpce_id }) => { - // Convert the vcpe_id to a string - String::from_utf8(vpce_id.to_vec()).unwrap_or_default() - } + Some(ConnectionInfoExtra::Aws { vpce_id }) => vpce_id.to_string(), Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(), }; let allowed_vpc_endpoint_ids = api.get_allowed_vpc_endpoint_ids(ctx, &info).await?; @@ -451,7 +448,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { Ok((Backend::ControlPlane(api, credentials), ip_allowlist)) } Self::Local(_) => { - return Err(auth::AuthError::bad_auth_method("invalid for local proxy")) + return Err(auth::AuthError::bad_auth_method("invalid for local proxy")); } }; @@ -545,7 +542,7 @@ mod tests { use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; use super::jwt::JwkCache; - use super::{auth_quirks, AuthRateLimiter}; + use super::{AuthRateLimiter, auth_quirks}; use crate::auth::backend::MaskedIp; use crate::auth::{ComputeUserInfoMaybeEndpoint, IpPattern}; use crate::config::AuthenticationConfig; @@ -556,8 +553,8 @@ mod tests { }; use crate::proxy::NeonOptions; use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo}; - use crate::scram::threadpool::ThreadPool; use crate::scram::ServerSecret; + use crate::scram::threadpool::ThreadPool; use crate::stream::{PqStream, Stream}; struct Auth { diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index eff49a402a..c1b7718e4f 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -197,7 +197,10 @@ impl<'de> serde::de::Deserialize<'de> for IpPattern { type Value = IpPattern; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(formatter, "comma separated list with ip address, ip address range, or ip address subnet mask") + write!( + formatter, + "comma separated 
list with ip address, ip address range, or ip address subnet mask" + ) } fn visit_str(self, v: &str) -> Result @@ -252,8 +255,8 @@ fn project_name_valid(name: &str) -> bool { #[cfg(test)] #[expect(clippy::unwrap_used)] mod tests { - use serde_json::json; use ComputeUserInfoParseError::*; + use serde_json::json; use super::*; diff --git a/proxy/src/auth/mod.rs b/proxy/src/auth/mod.rs index 6082695a6b..5670f8e43d 100644 --- a/proxy/src/auth/mod.rs +++ b/proxy/src/auth/mod.rs @@ -5,13 +5,13 @@ pub use backend::Backend; mod credentials; pub(crate) use credentials::{ - check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint, - ComputeUserInfoParseError, IpPattern, + ComputeUserInfoMaybeEndpoint, ComputeUserInfoParseError, IpPattern, check_peer_addr_is_in_list, + endpoint_sni, }; mod password_hack; -pub(crate) use password_hack::parse_endpoint_param; use password_hack::PasswordHackPayload; +pub(crate) use password_hack::parse_endpoint_param; mod flow; use std::io; diff --git a/proxy/src/binary/local_proxy.rs b/proxy/src/binary/local_proxy.rs index 4ab11f828c..dedd225cba 100644 --- a/proxy/src/binary/local_proxy.rs +++ b/proxy/src/binary/local_proxy.rs @@ -4,7 +4,7 @@ use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use anyhow::{bail, ensure, Context}; +use anyhow::{Context, bail, ensure}; use camino::{Utf8Path, Utf8PathBuf}; use clap::Parser; use compute_api::spec::LocalProxySpec; @@ -19,7 +19,7 @@ use utils::sentry_init::init_sentry; use utils::{pid_file, project_build_tag, project_git_version}; use crate::auth::backend::jwt::JwkCache; -use crate::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP}; +use crate::auth::backend::local::{JWKS_ROLE_MAP, LocalBackend}; use crate::auth::{self}; use crate::cancellation::CancellationHandler; use crate::config::{ diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs index 94e771a61c..1aa290399c 100644 --- a/proxy/src/binary/pg_sni_router.rs +++ b/proxy/src/binary/pg_sni_router.rs @@ -5,24 +5,24 @@ /// the outside. Similar to an ingress controller for HTTPS. 
use std::{net::SocketAddr, sync::Arc}; -use anyhow::{anyhow, bail, ensure, Context}; +use anyhow::{Context, anyhow, bail, ensure}; use clap::Arg; -use futures::future::Either; use futures::TryFutureExt; +use futures::future::Either; use itertools::Itertools; use rustls::crypto::ring; use rustls::pki_types::PrivateKeyDer; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::TcpListener; use tokio_util::sync::CancellationToken; -use tracing::{error, info, Instrument}; +use tracing::{Instrument, error, info}; use utils::project_git_version; use utils::sentry_init::init_sentry; use crate::context::RequestContext; use crate::metrics::{Metrics, ThreadPoolMetrics}; use crate::protocol2::ConnectionInfo; -use crate::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; +use crate::proxy::{ErrorSource, copy_bidirectional_client_compute, run_until_cancelled}; use crate::stream::{PqStream, Stream}; use crate::tls::TlsServerEndPoint; diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index b72799df54..eec0bf8f99 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -9,16 +9,16 @@ use remote_storage::RemoteStorageConfig; use tokio::net::TcpListener; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; -use tracing::{info, warn, Instrument}; +use tracing::{Instrument, info, warn}; use utils::sentry_init::init_sentry; use utils::{project_build_tag, project_git_version}; use crate::auth::backend::jwt::JwkCache; use crate::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; -use crate::cancellation::{handle_cancel_messages, CancellationHandler}; +use crate::cancellation::{CancellationHandler, handle_cancel_messages}; use crate::config::{ - self, remote_storage_from_toml, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, - ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2, + self, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, ProjectInfoCacheOptions, + ProxyConfig, ProxyProtocolV2, remote_storage_from_toml, }; use crate::context::parquet::ParquetUploadArgs; use crate::http::health_server::AppMetrics; @@ -30,8 +30,8 @@ use crate::redis::connection_with_credentials_provider::ConnectionWithCredential use crate::redis::kv_ops::RedisKVClient; use crate::redis::{elasticache, notifications}; use crate::scram::threadpool::ThreadPool; -use crate::serverless::cancel_set::CancelSet; use crate::serverless::GlobalConnPoolOptions; +use crate::serverless::cancel_set::CancelSet; use crate::tls::client_config::compute_client_config_with_root_certs; use crate::{auth, control_plane, http, serverless, usage_metrics}; @@ -331,7 +331,9 @@ pub async fn run() -> anyhow::Result<()> { ), ), (None, None) => { - warn!("irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client"); + warn!( + "irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client" + ); None } _ => { diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index 7651eb71a2..e153e9f61f 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -1,12 +1,12 @@ use std::collections::HashSet; use std::convert::Infallible; -use std::sync::atomic::AtomicU64; use std::sync::Arc; +use std::sync::atomic::AtomicU64; use std::time::Duration; use async_trait::async_trait; use clashmap::ClashMap; -use rand::{thread_rng, Rng}; +use rand::{Rng, thread_rng}; use smol_str::SmolStr; use tokio::sync::Mutex; use tokio::time::Instant; diff 
--git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs index 06eaeb9a30..7cfe5100ea 100644 --- a/proxy/src/cache/timed_lru.rs +++ b/proxy/src/cache/timed_lru.rs @@ -11,11 +11,11 @@ use std::time::{Duration, Instant}; // This severely hinders its usage both in terms of creating wrappers and supported key types. // // On the other hand, `hashlink` has good download stats and appears to be maintained. -use hashlink::{linked_hash_map::RawEntryMut, LruCache}; +use hashlink::{LruCache, linked_hash_map::RawEntryMut}; use tracing::debug; use super::common::Cached; -use super::{timed_lru, Cache}; +use super::{Cache, timed_lru}; /// An implementation of timed LRU cache with fixed capacity. /// Key properties: diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 1f9c8a48b7..8263e5aa2a 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -3,8 +3,8 @@ use std::net::{IpAddr, SocketAddr}; use std::sync::Arc; use ipnet::{IpNet, Ipv4Net, Ipv6Net}; -use postgres_client::tls::MakeTlsConnect; use postgres_client::CancelToken; +use postgres_client::tls::MakeTlsConnect; use pq_proto::CancelKeyData; use serde::{Deserialize, Serialize}; use thiserror::Error; @@ -13,7 +13,7 @@ use tokio::sync::{mpsc, oneshot}; use tracing::{debug, info}; use crate::auth::backend::ComputeUserInfo; -use crate::auth::{check_peer_addr_is_in_list, AuthError}; +use crate::auth::{AuthError, check_peer_addr_is_in_list}; use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::control_plane::ControlPlaneApi; @@ -358,10 +358,7 @@ impl CancellationHandler { let incoming_vpc_endpoint_id = match ctx.extra() { None => return Err(CancelError::AuthError(AuthError::MissingVPCEndpointId)), - Some(ConnectionInfoExtra::Aws { vpce_id }) => { - // Convert the vcpe_id to a string - String::from_utf8(vpce_id.to_vec()).unwrap_or_default() - } + Some(ConnectionInfoExtra::Aws { vpce_id }) => vpce_id.to_string(), Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(), }; diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 460e0cff54..1bcd22e98f 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -2,18 +2,18 @@ use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use anyhow::{bail, ensure, Context, Ok}; +use anyhow::{Context, Ok, bail, ensure}; use clap::ValueEnum; use remote_storage::RemoteStorageConfig; -use crate::auth::backend::jwt::JwkCache; use crate::auth::backend::AuthRateLimiter; +use crate::auth::backend::jwt::JwkCache; use crate::control_plane::locks::ApiLocks; use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig}; use crate::scram::threadpool::ThreadPool; -use crate::serverless::cancel_set::CancelSet; use crate::serverless::GlobalConnPoolOptions; -pub use crate::tls::server_config::{configure_tls, TlsConfig}; +use crate::serverless::cancel_set::CancelSet; +pub use crate::tls::server_config::{TlsConfig, configure_tls}; use crate::types::Host; pub struct ProxyConfig { @@ -97,8 +97,7 @@ pub struct EndpointCacheConfig { impl EndpointCacheConfig { /// Default options for [`crate::control_plane::NodeInfoCache`]. /// Notice that by default the limiter is empty, which means that cache is disabled. 
- pub const CACHE_DEFAULT_OPTIONS: &'static str = - "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s"; + pub const CACHE_DEFAULT_OPTIONS: &'static str = "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s"; /// Parse cache options passed via cmdline. /// Example: [`Self::CACHE_DEFAULT_OPTIONS`]. diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 1044f5f8e2..4662860b3f 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use futures::{FutureExt, TryFutureExt}; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, Instrument}; +use tracing::{Instrument, debug, error, info}; use crate::auth::backend::ConsoleRedirectBackend; use crate::cancellation::CancellationHandler; @@ -11,12 +11,12 @@ use crate::config::{ProxyConfig, ProxyProtocolV2}; use crate::context::RequestContext; use crate::error::ReportableError; use crate::metrics::{Metrics, NumClientConnectionsGuard}; -use crate::protocol2::{read_proxy_protocol, ConnectHeader, ConnectionInfo}; -use crate::proxy::connect_compute::{connect_to_compute, TcpMechanism}; -use crate::proxy::handshake::{handshake, HandshakeData}; +use crate::protocol2::{ConnectHeader, ConnectionInfo, read_proxy_protocol}; +use crate::proxy::connect_compute::{TcpMechanism, connect_to_compute}; +use crate::proxy::handshake::{HandshakeData, handshake}; use crate::proxy::passthrough::ProxyPassthrough; use crate::proxy::{ - prepare_client_connection, run_until_cancelled, ClientRequestError, ErrorSource, + ClientRequestError, ErrorSource, prepare_client_connection, run_until_cancelled, }; pub async fn task_main( @@ -64,22 +64,34 @@ pub async fn task_main( debug!("healthcheck received"); return; } - Ok((_socket, ConnectHeader::Missing)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => { + Ok((_socket, ConnectHeader::Missing)) + if config.proxy_protocol_v2 == ProxyProtocolV2::Required => + { error!("missing required proxy protocol header"); return; } - Ok((_socket, ConnectHeader::Proxy(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => { + Ok((_socket, ConnectHeader::Proxy(_))) + if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => + { error!("proxy protocol header not supported"); return; } Ok((socket, ConnectHeader::Proxy(info))) => (socket, info), - Ok((socket, ConnectHeader::Missing)) => (socket, ConnectionInfo{ addr: peer_addr, extra: None }), + Ok((socket, ConnectHeader::Missing)) => ( + socket, + ConnectionInfo { + addr: peer_addr, + extra: None, + }, + ), }; match socket.inner.set_nodelay(true) { Ok(()) => {} Err(e) => { - error!("per-client task finished with an error: failed to set socket option: {e:#}"); + error!( + "per-client task finished with an error: failed to set socket option: {e:#}" + ); return; } } @@ -118,10 +130,16 @@ pub async fn task_main( match p.proxy_pass(&config.connect_to_compute).await { Ok(()) => {} Err(ErrorSource::Client(e)) => { - error!(?session_id, "per-client task finished with an IO error from the client: {e:#}"); + error!( + ?session_id, + "per-client task finished with an IO error from the client: {e:#}" + ); } Err(ErrorSource::Compute(e)) => { - error!(?session_id, "per-client task finished with an IO error from the compute: 
{e:#}"); + error!( + ?session_id, + "per-client task finished with an IO error from the compute: {e:#}" + ); } } } @@ -241,6 +259,7 @@ pub(crate) async fn handle_client( Ok(Some(ProxyPassthrough { client: stream, aux: node.aux.clone(), + private_link_id: None, compute: node, session_id: ctx.session_id(), cancel: session, diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 3236b2e1bf..74b48a1bea 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -8,7 +8,7 @@ use pq_proto::StartupMessageParams; use smol_str::SmolStr; use tokio::sync::mpsc; use tracing::field::display; -use tracing::{debug, error, info_span, Span}; +use tracing::{Span, debug, error, info_span}; use try_lock::TryLock; use uuid::Uuid; diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 0537ae6a62..f029327266 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -8,7 +8,7 @@ use chrono::{Datelike, Timelike}; use futures::{Stream, StreamExt}; use parquet::basic::Compression; use parquet::file::metadata::RowGroupMetaDataPtr; -use parquet::file::properties::{WriterProperties, WriterPropertiesPtr, DEFAULT_PAGE_SIZE}; +use parquet::file::properties::{DEFAULT_PAGE_SIZE, WriterProperties, WriterPropertiesPtr}; use parquet::file::writer::SerializedFileWriter; use parquet::record::RecordWriter; use pq_proto::StartupMessageParams; @@ -17,10 +17,10 @@ use serde::ser::SerializeMap; use tokio::sync::mpsc; use tokio::time; use tokio_util::sync::CancellationToken; -use tracing::{debug, info, Span}; +use tracing::{Span, debug, info}; use utils::backoff; -use super::{RequestContextInner, LOG_CHAN}; +use super::{LOG_CHAN, RequestContextInner}; use crate::config::remote_storage_from_toml; use crate::context::LOG_CHAN_DISCONNECT; use crate::ext::TaskExt; @@ -425,20 +425,20 @@ mod tests { use futures::{Stream, StreamExt}; use itertools::Itertools; use parquet::basic::{Compression, ZstdLevel}; - use parquet::file::properties::{WriterProperties, DEFAULT_PAGE_SIZE}; + use parquet::file::properties::{DEFAULT_PAGE_SIZE, WriterProperties}; use parquet::file::reader::FileReader; use parquet::file::serialized_reader::SerializedFileReader; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; use remote_storage::{ - GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, + GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config, }; use tokio::sync::mpsc; use tokio::time; use walkdir::WalkDir; - use super::{worker_inner, ParquetConfig, ParquetUploadArgs, RequestData}; + use super::{ParquetConfig, ParquetUploadArgs, RequestData, worker_inner}; #[derive(Parser)] struct ProxyCliArgs { @@ -514,26 +514,26 @@ mod tests { fn generate_request_data(rng: &mut impl Rng) -> RequestData { RequestData { - session_id: uuid::Builder::from_random_bytes(rng.gen()).into_uuid(), - peer_addr: Ipv4Addr::from(rng.gen::<[u8; 4]>()).to_string(), + session_id: uuid::Builder::from_random_bytes(rng.r#gen()).into_uuid(), + peer_addr: Ipv4Addr::from(rng.r#gen::<[u8; 4]>()).to_string(), timestamp: chrono::DateTime::from_timestamp_millis( rng.gen_range(1703862754..1803862754), ) .unwrap() .naive_utc(), application_name: Some("test".to_owned()), - username: Some(hex::encode(rng.gen::<[u8; 4]>())), - endpoint_id: Some(hex::encode(rng.gen::<[u8; 16]>())), - database: Some(hex::encode(rng.gen::<[u8; 16]>())), - project: Some(hex::encode(rng.gen::<[u8; 16]>())), - branch: 
Some(hex::encode(rng.gen::<[u8; 16]>())), + username: Some(hex::encode(rng.r#gen::<[u8; 4]>())), + endpoint_id: Some(hex::encode(rng.r#gen::<[u8; 16]>())), + database: Some(hex::encode(rng.r#gen::<[u8; 16]>())), + project: Some(hex::encode(rng.r#gen::<[u8; 16]>())), + branch: Some(hex::encode(rng.r#gen::<[u8; 16]>())), pg_options: None, auth_method: None, jwt_issuer: None, protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)], region: "us-east-1", error: None, - success: rng.gen(), + success: rng.r#gen(), cold_start_info: "no", duration_us: rng.gen_range(0..30_000_000), disconnect_timestamp: None, diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs index ef6621fc59..977fcf4727 100644 --- a/proxy/src/control_plane/client/cplane_proxy_v1.rs +++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs @@ -3,16 +3,16 @@ use std::sync::Arc; use std::time::Duration; -use ::http::header::AUTHORIZATION; use ::http::HeaderName; +use ::http::header::AUTHORIZATION; use futures::TryFutureExt; use postgres_client::config::SslMode; use tokio::time::Instant; -use tracing::{debug, info, info_span, warn, Instrument}; +use tracing::{Instrument, debug, info, info_span, warn}; use super::super::messages::{ControlPlaneErrorMessage, GetEndpointAccessControl, WakeCompute}; -use crate::auth::backend::jwt::AuthRule; use crate::auth::backend::ComputeUserInfo; +use crate::auth::backend::jwt::AuthRule; use crate::cache::Cached; use crate::context::RequestContext; use crate::control_plane::caches::ApiCaches; diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index 1e6cde8fb0..7da5464aa5 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -6,11 +6,11 @@ use std::sync::Arc; use futures::TryFutureExt; use thiserror::Error; use tokio_postgres::Client; -use tracing::{error, info, info_span, warn, Instrument}; +use tracing::{Instrument, error, info, info_span, warn}; -use crate::auth::backend::jwt::AuthRule; -use crate::auth::backend::ComputeUserInfo; use crate::auth::IpPattern; +use crate::auth::backend::ComputeUserInfo; +use crate::auth::backend::jwt::AuthRule; use crate::cache::Cached; use crate::context::RequestContext; use crate::control_plane::client::{ diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs index c28ff4789d..746595de38 100644 --- a/proxy/src/control_plane/client/mod.rs +++ b/proxy/src/control_plane/client/mod.rs @@ -10,15 +10,15 @@ use clashmap::ClashMap; use tokio::time::Instant; use tracing::{debug, info}; -use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}; use crate::auth::backend::ComputeUserInfo; +use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}; use crate::cache::endpoints::EndpointsCache; use crate::cache::project_info::ProjectInfoCacheImpl; use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}; use crate::context::RequestContext; use crate::control_plane::{ - errors, CachedAccessBlockerFlags, CachedAllowedIps, CachedAllowedVpcEndpointIds, - CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, NodeInfoCache, + CachedAccessBlockerFlags, CachedAllowedIps, CachedAllowedVpcEndpointIds, CachedNodeInfo, + CachedRoleSecret, ControlPlaneApi, NodeInfoCache, errors, }; use crate::error::ReportableError; use crate::metrics::ApiLockMetrics; diff --git a/proxy/src/control_plane/errors.rs b/proxy/src/control_plane/errors.rs index 
d6f565e34a..bc30cffd27 100644 --- a/proxy/src/control_plane/errors.rs +++ b/proxy/src/control_plane/errors.rs @@ -2,7 +2,7 @@ use thiserror::Error; use crate::control_plane::client::ApiLockError; use crate::control_plane::messages::{self, ControlPlaneErrorMessage, Reason}; -use crate::error::{io_error, ErrorKind, ReportableError, UserFacingError}; +use crate::error::{ErrorKind, ReportableError, UserFacingError, io_error}; use crate::proxy::retry::CouldRetry; /// A go-to error message which doesn't leak any detail. diff --git a/proxy/src/control_plane/mgmt.rs b/proxy/src/control_plane/mgmt.rs index 2f7359240d..df31abcc8c 100644 --- a/proxy/src/control_plane/mgmt.rs +++ b/proxy/src/control_plane/mgmt.rs @@ -6,7 +6,7 @@ use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; use tokio::net::{TcpListener, TcpStream}; use tokio_util::sync::CancellationToken; -use tracing::{error, info, info_span, Instrument}; +use tracing::{Instrument, error, info, info_span}; use crate::control_plane::messages::{DatabaseInfo, KickSession}; use crate::waiters::{self, Waiter, Waiters}; diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index 89ec4f9b33..d592223be1 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -11,9 +11,9 @@ pub(crate) mod errors; use std::sync::Arc; +use crate::auth::IpPattern; use crate::auth::backend::jwt::AuthRule; use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; -use crate::auth::IpPattern; use crate::cache::project_info::ProjectInfoCacheImpl; use crate::cache::{Cached, TimedLru}; use crate::config::ComputeConfig; diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index 141f319567..5278fe2a3e 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -9,8 +9,8 @@ use http_utils::json::json_response; use http_utils::{RouterBuilder, RouterService}; use hyper0::header::CONTENT_TYPE; use hyper0::{Body, Request, Response, StatusCode}; -use measured::text::BufferedTextEncoder; use measured::MetricGroup; +use measured::text::BufferedTextEncoder; use metrics::NeonMetrics; use tracing::{info, info_span}; diff --git a/proxy/src/http/mod.rs b/proxy/src/http/mod.rs index ed88c77256..96f600d836 100644 --- a/proxy/src/http/mod.rs +++ b/proxy/src/http/mod.rs @@ -13,8 +13,8 @@ use hyper::body::Body; pub(crate) use reqwest::{Request, Response}; use reqwest_middleware::RequestBuilder; pub(crate) use reqwest_middleware::{ClientWithMiddleware, Error}; -pub(crate) use reqwest_retry::policies::ExponentialBackoff; pub(crate) use reqwest_retry::RetryTransientMiddleware; +pub(crate) use reqwest_retry::policies::ExponentialBackoff; use thiserror::Error; use crate::metrics::{ConsoleRequest, Metrics}; diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index fbd4811b54..3c34918d84 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -8,7 +8,7 @@ use opentelemetry::trace::TraceContextExt; use scopeguard::defer; use serde::ser::{SerializeMap, Serializer}; use tracing::subscriber::Interest; -use tracing::{callsite, span, Event, Metadata, Span, Subscriber}; +use tracing::{Event, Metadata, Span, Subscriber, callsite, span}; use tracing_opentelemetry::OpenTelemetrySpanExt; use tracing_subscriber::filter::{EnvFilter, LevelFilter}; use tracing_subscriber::fmt::format::{Format, Full}; diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index f3447e063e..db1f096de1 100644 --- a/proxy/src/metrics.rs +++ 
b/proxy/src/metrics.rs @@ -543,11 +543,7 @@ impl Drop for LatencyTimer { impl From for Bool { fn from(value: bool) -> Self { - if value { - Bool::True - } else { - Bool::False - } + if value { Bool::True } else { Bool::False } } } diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 74a15d9bf4..41180fa6c1 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -9,6 +9,7 @@ use std::task::{Context, Poll}; use bytes::{Buf, Bytes, BytesMut}; use pin_project_lite::pin_project; +use smol_str::SmolStr; use strum_macros::FromRepr; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; use zerocopy::{FromBytes, FromZeroes}; @@ -99,7 +100,7 @@ impl fmt::Display for ConnectionInfo { #[derive(PartialEq, Eq, Clone, Debug)] pub enum ConnectionInfoExtra { - Aws { vpce_id: Bytes }, + Aws { vpce_id: SmolStr }, Azure { link_id: u32 }, } @@ -193,7 +194,7 @@ fn process_proxy_payload( return Err(io::Error::new( io::ErrorKind::Other, "invalid proxy protocol address family/transport protocol.", - )) + )); } }; @@ -207,9 +208,14 @@ fn process_proxy_payload( } let subtype = tlv.value.get_u8(); match Pp2AwsType::from_repr(subtype) { - Some(Pp2AwsType::VpceId) => { - extra = Some(ConnectionInfoExtra::Aws { vpce_id: tlv.value }); - } + Some(Pp2AwsType::VpceId) => match std::str::from_utf8(&tlv.value) { + Ok(s) => { + extra = Some(ConnectionInfoExtra::Aws { vpce_id: s.into() }); + } + Err(e) => { + tracing::warn!("invalid aws vpce id: {e}"); + } + }, None => { tracing::warn!("unknown aws tlv: subtype={subtype}"); } @@ -401,7 +407,7 @@ mod tests { use tokio::io::AsyncReadExt; use crate::protocol2::{ - read_proxy_protocol, ConnectHeader, LOCAL_V2, PROXY_V2, TCP_OVER_IPV4, UDP_OVER_IPV6, + ConnectHeader, LOCAL_V2, PROXY_V2, TCP_OVER_IPV4, UDP_OVER_IPV6, read_proxy_protocol, }; #[tokio::test] diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 26fb1754bf..b8b39fa121 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -5,7 +5,7 @@ use tracing::{debug, info, warn}; use super::retry::ShouldRetryWakeCompute; use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; -use crate::compute::{self, PostgresConnection, COULD_NOT_CONNECT}; +use crate::compute::{self, COULD_NOT_CONNECT, PostgresConnection}; use crate::config::{ComputeConfig, RetryConfig}; use crate::context::RequestContext; use crate::control_plane::errors::WakeComputeError; @@ -15,7 +15,7 @@ use crate::error::ReportableError; use crate::metrics::{ ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType, }; -use crate::proxy::retry::{retry_after, should_retry, CouldRetry}; +use crate::proxy::retry::{CouldRetry, retry_after, should_retry}; use crate::proxy::wake_compute::wake_compute; use crate::types::Host; diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 861f1766e8..6f8b972348 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -1,7 +1,7 @@ use std::future::poll_fn; use std::io; use std::pin::Pin; -use std::task::{ready, Context, Poll}; +use std::task::{Context, Poll, ready}; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; use tracing::info; diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 2a406fcb34..0c6d352600 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -9,28 +9,28 @@ pub(crate) mod retry; pub(crate) mod wake_compute; use std::sync::Arc; -pub use 
copy_bidirectional::{copy_bidirectional_client_compute, ErrorSource}; +pub use copy_bidirectional::{ErrorSource, copy_bidirectional_client_compute}; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use once_cell::sync::OnceCell; use pq_proto::{BeMessage as Be, CancelKeyData, StartupMessageParams}; use regex::Regex; use serde::{Deserialize, Serialize}; -use smol_str::{format_smolstr, SmolStr}; +use smol_str::{SmolStr, ToSmolStr, format_smolstr}; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, warn, Instrument}; +use tracing::{Instrument, debug, error, info, warn}; -use self::connect_compute::{connect_to_compute, TcpMechanism}; +use self::connect_compute::{TcpMechanism, connect_to_compute}; use self::passthrough::ProxyPassthrough; use crate::cancellation::{self, CancellationHandler}; use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig}; use crate::context::RequestContext; use crate::error::ReportableError; use crate::metrics::{Metrics, NumClientConnectionsGuard}; -use crate::protocol2::{read_proxy_protocol, ConnectHeader, ConnectionInfo}; -use crate::proxy::handshake::{handshake, HandshakeData}; +use crate::protocol2::{ConnectHeader, ConnectionInfo, ConnectionInfoExtra, read_proxy_protocol}; +use crate::proxy::handshake::{HandshakeData, handshake}; use crate::rate_limiter::EndpointRateLimiter; use crate::stream::{PqStream, Stream}; use crate::types::EndpointCacheKey; @@ -100,22 +100,34 @@ pub async fn task_main( debug!("healthcheck received"); return; } - Ok((_socket, ConnectHeader::Missing)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => { + Ok((_socket, ConnectHeader::Missing)) + if config.proxy_protocol_v2 == ProxyProtocolV2::Required => + { warn!("missing required proxy protocol header"); return; } - Ok((_socket, ConnectHeader::Proxy(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => { + Ok((_socket, ConnectHeader::Proxy(_))) + if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => + { warn!("proxy protocol header not supported"); return; } Ok((socket, ConnectHeader::Proxy(info))) => (socket, info), - Ok((socket, ConnectHeader::Missing)) => (socket, ConnectionInfo { addr: peer_addr, extra: None }), + Ok((socket, ConnectHeader::Missing)) => ( + socket, + ConnectionInfo { + addr: peer_addr, + extra: None, + }, + ), }; match socket.inner.set_nodelay(true) { Ok(()) => {} Err(e) => { - error!("per-client task finished with an error: failed to set socket option: {e:#}"); + error!( + "per-client task finished with an error: failed to set socket option: {e:#}" + ); return; } } @@ -156,10 +168,16 @@ pub async fn task_main( match p.proxy_pass(&config.connect_to_compute).await { Ok(()) => {} Err(ErrorSource::Client(e)) => { - warn!(?session_id, "per-client task finished with an IO error from the client: {e:#}"); + warn!( + ?session_id, + "per-client task finished with an IO error from the client: {e:#}" + ); } Err(ErrorSource::Compute(e)) => { - error!(?session_id, "per-client task finished with an IO error from the compute: {e:#}"); + error!( + ?session_id, + "per-client task finished with an IO error from the compute: {e:#}" + ); } } } @@ -374,9 +392,16 @@ pub(crate) async fn handle_client( let (stream, read_buf) = stream.into_inner(); node.stream.write_all(&read_buf).await?; + let private_link_id = match ctx.extra() { + Some(ConnectionInfoExtra::Aws { vpce_id }) => Some(vpce_id.clone()), + Some(ConnectionInfoExtra::Azure { link_id }) => 
Some(link_id.to_smolstr()), + None => None, + }; + Ok(Some(ProxyPassthrough { client: stream, aux: node.aux.clone(), + private_link_id, compute: node, session_id: ctx.session_id(), cancel: session, diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index 08871380d6..23b9897155 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -1,3 +1,4 @@ +use smol_str::SmolStr; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::debug; use utils::measured_stream::MeasuredStream; @@ -9,7 +10,7 @@ use crate::config::ComputeConfig; use crate::control_plane::messages::MetricsAuxInfo; use crate::metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}; use crate::stream::Stream; -use crate::usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}; +use crate::usage_metrics::{Ids, MetricCounterRecorder, TrafficDirection, USAGE_METRICS}; /// Forward bytes in both directions (client <-> compute). #[tracing::instrument(skip_all)] @@ -17,10 +18,14 @@ pub(crate) async fn proxy_pass( client: impl AsyncRead + AsyncWrite + Unpin, compute: impl AsyncRead + AsyncWrite + Unpin, aux: MetricsAuxInfo, + private_link_id: Option, ) -> Result<(), ErrorSource> { - let usage = USAGE_METRICS.register(Ids { + // we will report ingress at a later date + let usage_tx = USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, + direction: TrafficDirection::Egress, + private_link_id, }); let metrics = &Metrics::get().proxy.io_bytes; @@ -31,7 +36,7 @@ pub(crate) async fn proxy_pass( |cnt| { // Number of bytes we sent to the client (outbound). metrics.get_metric(m_sent).inc_by(cnt as u64); - usage.record_egress(cnt as u64); + usage_tx.record_egress(cnt as u64); }, ); @@ -61,6 +66,7 @@ pub(crate) struct ProxyPassthrough { pub(crate) compute: PostgresConnection, pub(crate) aux: MetricsAuxInfo, pub(crate) session_id: uuid::Uuid, + pub(crate) private_link_id: Option, pub(crate) cancel: cancellation::Session, pub(crate) _req: NumConnectionRequestsGuard<'static>, @@ -72,7 +78,13 @@ impl ProxyPassthrough { self, compute_config: &ComputeConfig, ) -> Result<(), ErrorSource> { - let res = proxy_pass(self.client, self.compute.stream, self.aux).await; + let res = proxy_pass( + self.client, + self.compute.stream, + self.aux, + self.private_link_id, + ) + .await; if let Err(err) = self .compute .cancel_closure diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index d8c00a9b41..171f539b1e 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -5,12 +5,12 @@ mod mitm; use std::time::Duration; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use async_trait::async_trait; use http::StatusCode; use postgres_client::config::SslMode; use postgres_client::tls::{MakeTlsConnect, NoTls}; -use retry::{retry_after, ShouldRetryWakeCompute}; +use retry::{ShouldRetryWakeCompute, retry_after}; use rstest::rstest; use rustls::crypto::ring; use rustls::pki_types; @@ -334,8 +334,8 @@ async fn scram_auth_mock() -> anyhow::Result<()> { generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), Scram::mock())); - use rand::distributions::Alphanumeric; use rand::Rng; + use rand::distributions::Alphanumeric; let password: String = rand::thread_rng() .sample_iter(&Alphanumeric) .take(rand::random::() as usize) diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index 4e9206feff..9d8915e24a 100644 --- 
a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -3,8 +3,8 @@ use tracing::{error, info}; use super::connect_compute::ComputeConnectBackend; use crate::config::RetryConfig; use crate::context::RequestContext; -use crate::control_plane::errors::{ControlPlaneError, WakeComputeError}; use crate::control_plane::CachedNodeInfo; +use crate::control_plane::errors::{ControlPlaneError, WakeComputeError}; use crate::error::ReportableError; use crate::metrics::{ ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType, diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs index 9645eaf725..b3853d48e4 100644 --- a/proxy/src/rate_limiter/leaky_bucket.rs +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -3,7 +3,7 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use ahash::RandomState; use clashmap::ClashMap; -use rand::{thread_rng, Rng}; +use rand::{Rng, thread_rng}; use tokio::time::Instant; use tracing::info; use utils::leaky_bucket::LeakyBucketState; diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs index b74a9ab17e..f8eeb89f05 100644 --- a/proxy/src/rate_limiter/limit_algorithm.rs +++ b/proxy/src/rate_limiter/limit_algorithm.rs @@ -5,8 +5,8 @@ use std::time::Duration; use parking_lot::Mutex; use tokio::sync::Notify; -use tokio::time::error::Elapsed; use tokio::time::Instant; +use tokio::time::error::Elapsed; use self::aimd::Aimd; diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index ef6c39f230..71e2a92da6 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -1,8 +1,8 @@ use std::borrow::Cow; use std::collections::hash_map::RandomState; use std::hash::{BuildHasher, Hash}; -use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Mutex; +use std::sync::atomic::{AtomicUsize, Ordering}; use anyhow::bail; use clashmap::ClashMap; diff --git a/proxy/src/redis/elasticache.rs b/proxy/src/redis/elasticache.rs index bf6dde9332..58e3c889a7 100644 --- a/proxy/src/redis/elasticache.rs +++ b/proxy/src/redis/elasticache.rs @@ -1,6 +1,7 @@ use std::sync::Arc; use std::time::{Duration, SystemTime}; +use aws_config::Region; use aws_config::environment::EnvironmentVariableCredentialsProvider; use aws_config::imds::credentials::ImdsCredentialsProvider; use aws_config::meta::credentials::CredentialsProviderChain; @@ -8,7 +9,6 @@ use aws_config::meta::region::RegionProviderChain; use aws_config::profile::ProfileFileCredentialsProvider; use aws_config::provider_config::ProviderConfig; use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; -use aws_config::Region; use aws_sdk_iam::config::ProvideCredentials; use aws_sigv4::http_request::{ self, SignableBody, SignableRequest, SignatureLocation, SigningSettings, diff --git a/proxy/src/redis/keys.rs b/proxy/src/redis/keys.rs index dcb9a59f87..7527bca6d0 100644 --- a/proxy/src/redis/keys.rs +++ b/proxy/src/redis/keys.rs @@ -1,7 +1,7 @@ use std::io::ErrorKind; use anyhow::Ok; -use pq_proto::{id_to_cancel_key, CancelKeyData}; +use pq_proto::{CancelKeyData, id_to_cancel_key}; use serde::{Deserialize, Serialize}; pub mod keyspace { diff --git a/proxy/src/sasl/stream.rs b/proxy/src/sasl/stream.rs index ac77556566..46e6a439e5 100644 --- a/proxy/src/sasl/stream.rs +++ b/proxy/src/sasl/stream.rs @@ -5,8 +5,8 @@ use std::io; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; -use super::messages::ServerMessage; use super::Mechanism; +use 
super::messages::ServerMessage; use crate::stream::PqStream; /// Abstracts away all peculiarities of the libpq's protocol. diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs index 87ab6e0d5f..9d56c465ec 100644 --- a/proxy/src/scram/countmin.rs +++ b/proxy/src/scram/countmin.rs @@ -90,7 +90,7 @@ mod tests { // number of insert operations let m = rng.gen_range(1..100); - let id = uuid::Builder::from_random_bytes(rng.gen()).into_uuid(); + let id = uuid::Builder::from_random_bytes(rng.r#gen()).into_uuid(); ids.push((id, n, m)); // N = sum(actual) diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 77853db3db..abd5aeae5b 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -5,6 +5,7 @@ use std::convert::Infallible; use hmac::{Hmac, Mac}; use sha2::Sha256; +use super::ScramKey; use super::messages::{ ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN, }; @@ -12,7 +13,6 @@ use super::pbkdf2::Pbkdf2; use super::secret::ServerSecret; use super::signature::SignatureBuilder; use super::threadpool::ThreadPool; -use super::ScramKey; use crate::intern::EndpointIdInt; use crate::sasl::{self, ChannelBinding, Error as SaslError}; @@ -208,8 +208,8 @@ impl sasl::Mechanism for Exchange<'_> { type Output = super::ScramKey; fn exchange(mut self, input: &str) -> sasl::Result> { - use sasl::Step; use ExchangeState; + use sasl::Step; match &self.state { ExchangeState::Initial(init) => { match init.transition(self.secret, &self.tls_server_end_point, input)? { diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index 0e54e7ded9..7b0b861ce9 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -4,7 +4,7 @@ use std::fmt; use std::ops::Range; use super::base64_decode_array; -use super::key::{ScramKey, SCRAM_KEY_LEN}; +use super::key::{SCRAM_KEY_LEN, ScramKey}; use super::signature::SignatureBuilder; use crate::sasl::ChannelBinding; diff --git a/proxy/src/scram/mod.rs b/proxy/src/scram/mod.rs index cfa571cbe1..24f991d4d9 100644 --- a/proxy/src/scram/mod.rs +++ b/proxy/src/scram/mod.rs @@ -15,7 +15,7 @@ mod secret; mod signature; pub mod threadpool; -pub(crate) use exchange::{exchange, Exchange}; +pub(crate) use exchange::{Exchange, exchange}; use hmac::{Hmac, Mac}; pub(crate) use key::ScramKey; pub(crate) use secret::ServerSecret; diff --git a/proxy/src/scram/signature.rs b/proxy/src/scram/signature.rs index d3255cf2ca..a5b1c3e9f4 100644 --- a/proxy/src/scram/signature.rs +++ b/proxy/src/scram/signature.rs @@ -1,6 +1,6 @@ //! Tools for client/server signature management. -use super::key::{ScramKey, SCRAM_KEY_LEN}; +use super::key::{SCRAM_KEY_LEN, ScramKey}; /// A collection of message parts needed to derive the client's signature. 
#[derive(Debug)] diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index f35c375ba2..72029102e0 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -7,27 +7,27 @@ use ed25519_dalek::SigningKey; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; use jose_jwk::jose_b64; use rand::rngs::OsRng; -use tokio::net::{lookup_host, TcpStream}; +use tokio::net::{TcpStream, lookup_host}; use tracing::field::display; use tracing::{debug, info}; use super::conn_pool::poll_client; use super::conn_pool_lib::{Client, ConnInfo, EndpointConnPool, GlobalConnPool}; -use super::http_conn_pool::{self, poll_http2_client, HttpConnPool, Send}; -use super::local_conn_pool::{self, LocalConnPool, EXT_NAME, EXT_SCHEMA, EXT_VERSION}; +use super::http_conn_pool::{self, HttpConnPool, Send, poll_http2_client}; +use super::local_conn_pool::{self, EXT_NAME, EXT_SCHEMA, EXT_VERSION, LocalConnPool}; use crate::auth::backend::local::StaticAuthRules; use crate::auth::backend::{ComputeCredentials, ComputeUserInfo}; -use crate::auth::{self, check_peer_addr_is_in_list, AuthError}; +use crate::auth::{self, AuthError, check_peer_addr_is_in_list}; use crate::compute; use crate::compute_ctl::{ ComputeCtlError, ExtensionInstallRequest, Privilege, SetRoleGrantsRequest, }; use crate::config::{ComputeConfig, ProxyConfig}; use crate::context::RequestContext; +use crate::control_plane::CachedNodeInfo; use crate::control_plane::client::ApiLockError; use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError}; use crate::control_plane::locks::ApiLocks; -use crate::control_plane::CachedNodeInfo; use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::intern::EndpointIdInt; use crate::protocol2::ConnectionInfoExtra; @@ -75,10 +75,7 @@ impl PoolingBackend { let extra = ctx.extra(); let incoming_endpoint_id = match extra { None => String::new(), - Some(ConnectionInfoExtra::Aws { vpce_id }) => { - // Convert the vcpe_id to a string - String::from_utf8(vpce_id.to_vec()).unwrap_or_default() - } + Some(ConnectionInfoExtra::Aws { vpce_id }) => vpce_id.to_string(), Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(), }; diff --git a/proxy/src/serverless/cancel_set.rs b/proxy/src/serverless/cancel_set.rs index 6db986f1f7..ba8945afc5 100644 --- a/proxy/src/serverless/cancel_set.rs +++ b/proxy/src/serverless/cancel_set.rs @@ -6,7 +6,7 @@ use std::time::Duration; use indexmap::IndexMap; use parking_lot::Mutex; -use rand::{thread_rng, Rng}; +use rand::{Rng, thread_rng}; use rustc_hash::FxHasher; use tokio::time::Instant; use tokio_util::sync::CancellationToken; @@ -40,7 +40,7 @@ impl CancelSet { pub(crate) fn take(&self) -> Option { for _ in 0..4 { - if let Some(token) = self.take_raw(thread_rng().gen()) { + if let Some(token) = self.take_raw(thread_rng().r#gen()) { return Some(token); } tracing::trace!("failed to get cancel token"); @@ -68,7 +68,7 @@ impl CancelShard { fn take(&mut self, rng: usize) -> Option { NonZeroUsize::new(self.tokens.len()).and_then(|len| { // 10 second grace period so we don't cancel new connections - if self.tokens.get_index(rng % len)?.1 .0.elapsed() < Duration::from_secs(10) { + if self.tokens.get_index(rng % len)?.1.0.elapsed() < Duration::from_secs(10) { return None; } diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 447103edce..6a9089fc2a 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,17 +1,17 @@ use std::fmt; use std::pin::pin; 
use std::sync::{Arc, Weak}; -use std::task::{ready, Poll}; +use std::task::{Poll, ready}; -use futures::future::poll_fn; use futures::Future; -use postgres_client::tls::NoTlsStream; +use futures::future::poll_fn; use postgres_client::AsyncMessage; +use postgres_client::tls::NoTlsStream; use smallvec::SmallVec; use tokio::net::TcpStream; use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{error, info, info_span, warn, Instrument}; +use tracing::{Instrument, error, info, info_span, warn}; #[cfg(test)] use { super::conn_pool_lib::GlobalConnPoolOptions, diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index a300198de4..933204994b 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -9,7 +9,8 @@ use clashmap::ClashMap; use parking_lot::RwLock; use postgres_client::ReadyForQueryStatus; use rand::Rng; -use tracing::{debug, info, Span}; +use smol_str::ToSmolStr; +use tracing::{Span, debug, info}; use super::backend::HttpConnError; use super::conn_pool::ClientDataRemote; @@ -19,8 +20,9 @@ use crate::auth::backend::ComputeUserInfo; use crate::context::RequestContext; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; +use crate::protocol2::ConnectionInfoExtra; use crate::types::{DbName, EndpointCacheKey, RoleName}; -use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; +use crate::usage_metrics::{Ids, MetricCounter, TrafficDirection, USAGE_METRICS}; #[derive(Debug, Clone)] pub(crate) struct ConnInfo { @@ -473,7 +475,9 @@ where .http_pool_opened_connections .get_metric() .dec_by(clients_removed as i64); - info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}"); + info!( + "pool: performed global pool gc. 
removed {clients_removed} clients, total number of clients in pool is {size}" + ); } let removed = current_len - new_len; @@ -635,15 +639,28 @@ impl Client { (&mut inner.inner, Discard { conn_info, pool }) } - pub(crate) fn metrics(&self) -> Arc { + pub(crate) fn metrics( + &self, + direction: TrafficDirection, + ctx: &RequestContext, + ) -> Arc { let aux = &self .inner .as_ref() .expect("client inner should not be removed") .aux; + + let private_link_id = match ctx.extra() { + None => None, + Some(ConnectionInfoExtra::Aws { vpce_id }) => Some(vpce_id.clone()), + Some(ConnectionInfoExtra::Azure { link_id }) => Some(link_id.to_smolstr()), + }; + USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, + direction, + private_link_id, }) } } @@ -700,7 +717,9 @@ impl Discard<'_, C> { pub(crate) fn discard(&mut self) { let conn_info = &self.conn_info; if std::mem::take(self.pool).strong_count() > 0 { - info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state"); + info!( + "pool: throwing away connection '{conn_info}' because connection is potentially in a broken state" + ); } } } diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index fde38d0de3..338a79b4b3 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -5,8 +5,9 @@ use std::sync::{Arc, Weak}; use hyper::client::conn::http2; use hyper_util::rt::{TokioExecutor, TokioIo}; use parking_lot::RwLock; +use smol_str::ToSmolStr; use tokio::net::TcpStream; -use tracing::{debug, error, info, info_span, Instrument}; +use tracing::{Instrument, debug, error, info, info_span}; use super::backend::HttpConnError; use super::conn_pool_lib::{ @@ -16,8 +17,9 @@ use super::conn_pool_lib::{ use crate::context::RequestContext; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; +use crate::protocol2::ConnectionInfoExtra; use crate::types::EndpointCacheKey; -use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; +use crate::usage_metrics::{Ids, MetricCounter, TrafficDirection, USAGE_METRICS}; pub(crate) type Send = http2::SendRequest; pub(crate) type Connect = @@ -264,11 +266,24 @@ impl Client { Self { inner } } - pub(crate) fn metrics(&self) -> Arc { + pub(crate) fn metrics( + &self, + direction: TrafficDirection, + ctx: &RequestContext, + ) -> Arc { let aux = &self.inner.aux; + + let private_link_id = match ctx.extra() { + None => None, + Some(ConnectionInfoExtra::Aws { vpce_id }) => Some(vpce_id.clone()), + Some(ConnectionInfoExtra::Azure { link_id }) => Some(link_id.to_smolstr()), + }; + USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, + direction, + private_link_id, }) } } diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index ab012bd020..fbd12ad9cb 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -1,5 +1,5 @@ -use postgres_client::types::{Kind, Type}; use postgres_client::Row; +use postgres_client::types::{Kind, Type}; use serde_json::{Map, Value}; // diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 137a2d6377..8426a0810e 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -11,24 +11,24 @@ use std::collections::HashMap; use std::pin::pin; -use std::sync::atomic::AtomicUsize; use std::sync::Arc; -use std::task::{ready, Poll}; 
+use std::sync::atomic::AtomicUsize; +use std::task::{Poll, ready}; use std::time::Duration; use ed25519_dalek::{Signature, Signer, SigningKey}; -use futures::future::poll_fn; use futures::Future; +use futures::future::poll_fn; use indexmap::IndexMap; use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; use parking_lot::RwLock; -use postgres_client::tls::NoTlsStream; use postgres_client::AsyncMessage; +use postgres_client::tls::NoTlsStream; use serde_json::value::RawValue; use tokio::net::TcpStream; use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, info_span, warn, Instrument}; +use tracing::{Instrument, debug, error, info, info_span, warn}; use super::backend::HttpConnError; use super::conn_pool_lib::{ @@ -389,6 +389,9 @@ mod tests { // }); // println!("{}", serde_json::to_string(&jwk).unwrap()); - assert_eq!(jwt, "eyJhbGciOiJFZERTQSJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.Cvyc2By33KI0f0obystwdy8PN111L3Sc9_Mr2CU3XshtSqSdxuRxNEZGbb_RvyJf2IzheC_s7aBZ-jLeQ9N0Bg"); + assert_eq!( + jwt, + "eyJhbGciOiJFZERTQSJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.Cvyc2By33KI0f0obystwdy8PN111L3Sc9_Mr2CU3XshtSqSdxuRxNEZGbb_RvyJf2IzheC_s7aBZ-jLeQ9N0Bg" + ); } } diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 8289500159..dd0fb9c5b4 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -15,7 +15,7 @@ mod sql_over_http; mod websocket; use std::net::{IpAddr, SocketAddr}; -use std::pin::{pin, Pin}; +use std::pin::{Pin, pin}; use std::sync::Arc; use anyhow::Context; @@ -23,8 +23,8 @@ use async_trait::async_trait; use atomic_take::AtomicTake; use bytes::Bytes; pub use conn_pool_lib::GlobalConnPoolOptions; -use futures::future::{select, Either}; use futures::TryFutureExt; +use futures::future::{Either, select}; use http::{Method, Response, StatusCode}; use http_body_util::combinators::BoxBody; use http_body_util::{BodyExt, Empty}; @@ -32,23 +32,23 @@ use http_utils::error::ApiError; use hyper::body::Incoming; use hyper_util::rt::TokioExecutor; use hyper_util::server::conn::auto::Builder; -use rand::rngs::StdRng; use rand::SeedableRng; -use sql_over_http::{uuid_to_header_value, NEON_REQUEST_ID}; +use rand::rngs::StdRng; +use sql_over_http::{NEON_REQUEST_ID, uuid_to_header_value}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::{TcpListener, TcpStream}; use tokio::time::timeout; use tokio_rustls::TlsAcceptor; use tokio_util::sync::CancellationToken; use tokio_util::task::TaskTracker; -use tracing::{info, warn, Instrument}; +use tracing::{Instrument, info, warn}; use crate::cancellation::CancellationHandler; use crate::config::{ProxyConfig, ProxyProtocolV2}; use crate::context::RequestContext; use crate::ext::TaskExt; use crate::metrics::Metrics; -use crate::protocol2::{read_proxy_protocol, ChainRW, ConnectHeader, ConnectionInfo}; +use crate::protocol2::{ChainRW, ConnectHeader, ConnectionInfo, read_proxy_protocol}; use crate::proxy::run_until_cancelled; use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 5982fe225d..8babfb5cd2 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -2,23 +2,23 @@ use std::pin::pin; use std::sync::Arc; use bytes::Bytes; -use futures::future::{select, try_join, Either}; +use futures::future::{Either, select, try_join}; 
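Note: most of the import reordering in the hunks above, and the `rng.gen()` → `rng.r#gen()` rewrites, appear to follow from the toolchain bump to 1.85.0 further down in this diff (1.85 stabilizes the 2024 edition): `gen` is a reserved keyword on the 2024 edition, and rustfmt's 2024 style edition sorts identifiers within a brace group ASCIIbetically, so uppercase names such as `Instrument` now come before lowercase ones. A minimal sketch of both effects (`rand` and `tracing` are used purely for illustration):

use rand::Rng;
use tracing::{Level, debug, span};

fn roll(rng: &mut impl Rng) -> u8 {
    // `rng.gen()` no longer parses on edition 2024; `r#gen` names the same method.
    rng.r#gen::<u8>() % 6 + 1
}

fn main() {
    // `Level` sorts before the lowercase imports under the 2024 style edition.
    let _guard = span!(Level::INFO, "demo").entered();
    debug!(value = roll(&mut rand::thread_rng()), "rolled");
}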
use futures::{StreamExt, TryFutureExt}; -use http::header::AUTHORIZATION; use http::Method; +use http::header::AUTHORIZATION; use http_body_util::combinators::BoxBody; use http_body_util::{BodyExt, Full}; use http_utils::error::ApiError; use hyper::body::Incoming; use hyper::http::{HeaderName, HeaderValue}; -use hyper::{header, HeaderMap, Request, Response, StatusCode}; +use hyper::{HeaderMap, Request, Response, StatusCode, header}; use indexmap::IndexMap; use postgres_client::error::{DbError, ErrorPosition, SqlState}; use postgres_client::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction}; use pq_proto::StartupMessageParamsBuilder; use serde::Serialize; -use serde_json::value::RawValue; use serde_json::Value; +use serde_json::value::RawValue; use tokio::time::{self, Instant}; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info}; @@ -31,18 +31,18 @@ use super::conn_pool::{AuthData, ConnInfoWithAuth}; use super::conn_pool_lib::{self, ConnInfo}; use super::error::HttpCodeError; use super::http_util::json_response; -use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError}; +use super::json::{JsonConversionError, json_to_pg_text, pg_text_row_to_json}; use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; -use crate::auth::{endpoint_sni, ComputeUserInfoParseError}; +use crate::auth::{ComputeUserInfoParseError, endpoint_sni}; use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig}; use crate::context::RequestContext; use crate::error::{ErrorKind, ReportableError, UserFacingError}; -use crate::http::{read_body_with_limit, ReadBodyError}; +use crate::http::{ReadBodyError, read_body_with_limit}; use crate::metrics::{HttpDirection, Metrics}; -use crate::proxy::{run_until_cancelled, NeonOptions}; +use crate::proxy::{NeonOptions, run_until_cancelled}; use crate::serverless::backend::HttpConnError; use crate::types::{DbName, RoleName}; -use crate::usage_metrics::{MetricCounter, MetricCounterRecorder}; +use crate::usage_metrics::{MetricCounter, MetricCounterRecorder, TrafficDirection}; #[derive(serde::Deserialize)] #[serde(rename_all = "camelCase")] @@ -209,7 +209,7 @@ fn get_conn_info( } } Some(url::Host::Ipv4(_) | url::Host::Ipv6(_)) | None => { - return Err(ConnInfoError::MissingHostname) + return Err(ConnInfoError::MissingHostname); } }; ctx.set_endpoint_id(endpoint.clone()); @@ -745,7 +745,7 @@ async fn handle_db_inner( } }; - let metrics = client.metrics(); + let metrics = client.metrics(TrafficDirection::Egress, ctx); let len = json_output.len(); let response = response @@ -818,7 +818,7 @@ async fn handle_auth_broker_inner( .expect("all headers and params received via hyper should be valid for request"); // todo: map body to count egress - let _metrics = client.metrics(); + let _metrics = client.metrics(TrafficDirection::Egress, ctx); Ok(client .inner @@ -1021,7 +1021,7 @@ async fn query_to_json( data: QueryData, current_size: &mut usize, parsed_headers: HttpHeaders, -) -> Result<(ReadyForQueryStatus, impl Serialize), SqlOverHttpError> { +) -> Result<(ReadyForQueryStatus, impl Serialize + use), SqlOverHttpError> { let query_start = Instant::now(); let query_params = data.params; @@ -1118,10 +1118,10 @@ enum Discard<'a> { } impl Client { - fn metrics(&self) -> Arc { + fn metrics(&self, direction: TrafficDirection, ctx: &RequestContext) -> Arc { match self { - Client::Remote(client) => client.metrics(), - Client::Local(local_client) => local_client.metrics(), + Client::Remote(client) => 
client.metrics(direction, ctx), + Client::Local(local_client) => local_client.metrics(direction, ctx), } } diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 585a7d63b2..c4baeeb5cc 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -1,6 +1,6 @@ use std::pin::Pin; use std::sync::Arc; -use std::task::{ready, Context, Poll}; +use std::task::{Context, Poll, ready}; use anyhow::Context as _; use bytes::{Buf, BufMut, Bytes, BytesMut}; @@ -15,9 +15,9 @@ use tracing::warn; use crate::cancellation::CancellationHandler; use crate::config::ProxyConfig; use crate::context::RequestContext; -use crate::error::{io_error, ReportableError}; +use crate::error::{ReportableError, io_error}; use crate::metrics::Metrics; -use crate::proxy::{handle_client, ClientMode, ErrorSource}; +use crate::proxy::{ClientMode, ErrorSource, handle_client}; use crate::rate_limiter::EndpointRateLimiter; pin_project! { @@ -184,11 +184,11 @@ mod tests { use framed_websockets::WebSocketServer; use futures::{SinkExt, StreamExt}; - use tokio::io::{duplex, AsyncReadExt, AsyncWriteExt}; + use tokio::io::{AsyncReadExt, AsyncWriteExt, duplex}; use tokio::task::JoinSet; - use tokio_tungstenite::tungstenite::protocol::Role; - use tokio_tungstenite::tungstenite::Message; use tokio_tungstenite::WebSocketStream; + use tokio_tungstenite::tungstenite::Message; + use tokio_tungstenite::tungstenite::protocol::Role; use super::WebSocketRw; diff --git a/proxy/src/signals.rs b/proxy/src/signals.rs index 0b675683c0..32b2344a1c 100644 --- a/proxy/src/signals.rs +++ b/proxy/src/signals.rs @@ -12,7 +12,7 @@ pub async fn handle( where F: FnMut(), { - use tokio::signal::unix::{signal, SignalKind}; + use tokio::signal::unix::{SignalKind, signal}; let mut hangup = signal(SignalKind::hangup())?; let mut interrupt = signal(SignalKind::interrupt())?; diff --git a/proxy/src/tls/postgres_rustls.rs b/proxy/src/tls/postgres_rustls.rs index 0ad279b635..f09e916a1d 100644 --- a/proxy/src/tls/postgres_rustls.rs +++ b/proxy/src/tls/postgres_rustls.rs @@ -2,8 +2,8 @@ use std::convert::TryFrom; use std::sync::Arc; use postgres_client::tls::MakeTlsConnect; -use rustls::pki_types::ServerName; use rustls::ClientConfig; +use rustls::pki_types::ServerName; use tokio::io::{AsyncRead, AsyncWrite}; mod private { @@ -15,8 +15,8 @@ mod private { use postgres_client::tls::{ChannelBinding, TlsConnect}; use rustls::pki_types::ServerName; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; - use tokio_rustls::client::TlsStream; use tokio_rustls::TlsConnector; + use tokio_rustls::client::TlsStream; use crate::tls::TlsServerEndPoint; diff --git a/proxy/src/tls/server_config.rs b/proxy/src/tls/server_config.rs index 2cc1657eea..903c0b712b 100644 --- a/proxy/src/tls/server_config.rs +++ b/proxy/src/tls/server_config.rs @@ -1,12 +1,12 @@ use std::collections::{HashMap, HashSet}; use std::sync::Arc; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use itertools::Itertools; use rustls::crypto::ring::{self, sign}; use rustls::pki_types::{CertificateDer, PrivateKeyDer}; -use super::{TlsServerEndPoint, PG_ALPN_PROTOCOL}; +use super::{PG_ALPN_PROTOCOL, TlsServerEndPoint}; pub struct TlsConfig { pub config: Arc, diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index d369e3742f..004d268fa1 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -2,20 +2,21 @@ //! and push them to a HTTP endpoint. 
use std::borrow::Cow; use std::convert::Infallible; -use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use std::sync::Arc; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use std::time::Duration; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use async_compression::tokio::write::GzipEncoder; use bytes::Bytes; use chrono::{DateTime, Datelike, Timelike, Utc}; -use clashmap::mapref::entry::Entry; use clashmap::ClashMap; -use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; +use clashmap::mapref::entry::Entry; +use consumption_metrics::{CHUNK_SIZE, Event, EventChunk, EventType, idempotency_key}; use once_cell::sync::Lazy; use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use serde::{Deserialize, Serialize}; +use smol_str::SmolStr; use tokio::io::AsyncWriteExt; use tokio_util::sync::CancellationToken; use tracing::{error, info, instrument, trace, warn}; @@ -43,6 +44,33 @@ const HTTP_REPORTING_RETRY_DURATION: Duration = Duration::from_secs(60); pub(crate) struct Ids { pub(crate) endpoint_id: EndpointIdInt, pub(crate) branch_id: BranchIdInt, + pub(crate) direction: TrafficDirection, + #[serde(with = "none_as_empty_string")] + pub(crate) private_link_id: Option, +} + +mod none_as_empty_string { + use serde::Deserialize; + use smol_str::SmolStr; + + #[allow(clippy::ref_option)] + pub fn serialize(t: &Option, s: S) -> Result { + s.serialize_str(t.as_deref().unwrap_or("")) + } + + pub fn deserialize<'de, D: serde::Deserializer<'de>>( + d: D, + ) -> Result, D::Error> { + let s = SmolStr::deserialize(d)?; + if s.is_empty() { Ok(None) } else { Ok(Some(s)) } + } +} + +#[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)] +#[serde(rename_all = "lowercase")] +pub(crate) enum TrafficDirection { + Ingress, + Egress, } pub(crate) trait MetricCounterRecorder { @@ -505,6 +533,8 @@ mod tests { let counter = metrics.register(Ids { endpoint_id: (&EndpointId::from("e1")).into(), branch_id: (&BranchId::from("b1")).into(), + direction: TrafficDirection::Egress, + private_link_id: None, }); // the counter should be observed despite 0 egress diff --git a/pyproject.toml b/pyproject.toml index 92a660c233..c6e5073bcd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ prometheus-client = "^0.14.1" pytest-timeout = "^2.3.1" Werkzeug = "^3.0.6" pytest-order = "^1.1.0" -allure-pytest = "^2.13.2" +allure-pytest = "^2.13.5" pytest-asyncio = "^0.21.0" toml = "^0.10.2" psutil = "^5.9.4" diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 38a7f202ba..591d60ea79 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.84.1" +channel = "1.85.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
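To make the new `Ids` wire format concrete, here is a small self-contained sketch; it deliberately swaps the interned endpoint/branch ids and `SmolStr` for plain `String`s, so only the serde behaviour shown in the hunk above (lowercase `direction`, a `None` private link id encoded as an empty string) is meant to carry over:

use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize, Debug)]
#[serde(rename_all = "lowercase")]
enum TrafficDirection {
    Ingress,
    Egress,
}

// Same idea as the `none_as_empty_string` module above, but over plain `String`.
mod none_as_empty_string {
    use serde::{Deserialize, Deserializer, Serializer};

    pub fn serialize<S: Serializer>(t: &Option<String>, s: S) -> Result<S::Ok, S::Error> {
        s.serialize_str(t.as_deref().unwrap_or(""))
    }

    pub fn deserialize<'de, D: Deserializer<'de>>(d: D) -> Result<Option<String>, D::Error> {
        let s = String::deserialize(d)?;
        Ok(if s.is_empty() { None } else { Some(s) })
    }
}

#[derive(Serialize, Deserialize, Debug)]
struct Ids {
    endpoint_id: String,
    branch_id: String,
    direction: TrafficDirection,
    #[serde(with = "none_as_empty_string")]
    private_link_id: Option<String>,
}

fn main() {
    let ids = Ids {
        endpoint_id: "e1".into(),
        branch_id: "b1".into(),
        direction: TrafficDirection::Egress,
        private_link_id: None,
    };
    // A missing private link id is reported as "", not omitted and not null.
    let json = serde_json::to_string(&ids).unwrap();
    assert_eq!(
        json,
        r#"{"endpoint_id":"e1","branch_id":"b1","direction":"egress","private_link_id":""}"#
    );
    // And an empty string deserializes back to `None`.
    let back: Ids = serde_json::from_str(&json).unwrap();
    assert!(back.private_link_id.is_none());
}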
# https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index d12ebc1030..c86ac576ad 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -31,7 +31,6 @@ futures.workspace = true once_cell.workspace = true parking_lot.workspace = true pageserver_api.workspace = true -postgres.workspace = true postgres-protocol.workspace = true pprof.workspace = true rand.workspace = true diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index 40e5afc4aa..5c305769dd 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -137,7 +137,7 @@ impl Client { } pub async fn utilization(&self) -> Result { - let uri = format!("{}/v1/utilization/", self.mgmt_api_endpoint); + let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint); let resp = self.get(&uri).await?; resp.json().await.map_err(Error::ReceiveBody) } diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index 61647c16b0..35394eb6ed 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -343,8 +343,11 @@ async fn recovery_stream( cfg.replication_mode(tokio_postgres::config::ReplicationMode::Physical); let connect_timeout = Duration::from_millis(10000); - let (client, connection) = match time::timeout(connect_timeout, cfg.connect(postgres::NoTls)) - .await + let (client, connection) = match time::timeout( + connect_timeout, + cfg.connect(tokio_postgres::NoTls), + ) + .await { Ok(client_and_conn) => client_and_conn?, Err(_elapsed) => { diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index 5916675c3f..fb06339604 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -295,6 +295,10 @@ impl InterpretedWalReader { let mut wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version); + // Tracks the start of the PG WAL LSN from which the current batch of + // interpreted records originated. + let mut current_batch_wal_start_lsn: Option = None; + loop { tokio::select! { // Main branch for reading WAL and forwarding it @@ -302,7 +306,7 @@ impl InterpretedWalReader { let wal = wal_or_reset.map(|wor| wor.get_wal().expect("reset handled in select branch below")); let WalBytes { wal, - wal_start_lsn: _, + wal_start_lsn, wal_end_lsn, available_wal_end_lsn, } = match wal { @@ -315,6 +319,12 @@ impl InterpretedWalReader { } }; + // We will already have a value if the previous chunks of WAL + // did not decode into anything useful. + if current_batch_wal_start_lsn.is_none() { + current_batch_wal_start_lsn = Some(wal_start_lsn); + } + wal_decoder.feed_bytes(&wal); // Deserialize and interpret WAL records from this batch of WAL. @@ -363,7 +373,9 @@ impl InterpretedWalReader { let max_next_record_lsn = match max_next_record_lsn { Some(lsn) => lsn, - None => { continue; } + None => { + continue; + } }; // Update the current position such that new receivers can decide @@ -377,21 +389,38 @@ impl InterpretedWalReader { } } + let batch_wal_start_lsn = current_batch_wal_start_lsn.take().unwrap(); + // Send interpreted records downstream. Anything that has already been seen // by a shard is filtered out. 
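The `current_batch_wal_start_lsn` bookkeeping above is a set-once-per-batch, take-on-flush pattern: the first chunk of WAL that feeds a batch fixes the batch's raw start LSN, even when that chunk decodes into no records on its own. A stripped-down sketch with plain `u64`s standing in for `Lsn` and made-up chunk data:

fn main() {
    // (wal_start_lsn, whether decoding this chunk produced any interpreted records)
    let chunks: &[(u64, bool)] = &[(100, false), (180, false), (260, true)];

    let mut current_batch_wal_start_lsn: Option<u64> = None;
    for &(wal_start_lsn, produced_records) in chunks {
        // Remember the start of the first chunk feeding the current batch,
        // even if that chunk decoded into nothing useful on its own.
        if current_batch_wal_start_lsn.is_none() {
            current_batch_wal_start_lsn = Some(wal_start_lsn);
        }
        if !produced_records {
            continue; // keep accumulating WAL into the same batch
        }
        // Flush: the batch covers raw WAL starting at the first chunk's LSN.
        let batch_wal_start_lsn = current_batch_wal_start_lsn.take().unwrap();
        assert_eq!(batch_wal_start_lsn, 100);
    }
}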
let mut shard_senders_to_remove = Vec::new(); for (shard, states) in &mut self.shard_senders { for state in states { - if max_next_record_lsn <= state.next_record_lsn { - continue; - } - let shard_sender_id = ShardSenderId::new(*shard, state.sender_id); - let records = records_by_sender.remove(&shard_sender_id).unwrap_or_default(); - let batch = InterpretedWalRecords { - records, - next_record_lsn: Some(max_next_record_lsn), + let batch = if max_next_record_lsn > state.next_record_lsn { + // This batch contains at least one record that this shard has not + // seen yet. + let records = records_by_sender.remove(&shard_sender_id).unwrap_or_default(); + + InterpretedWalRecords { + records, + next_record_lsn: max_next_record_lsn, + raw_wal_start_lsn: Some(batch_wal_start_lsn), + } + } else if wal_end_lsn > state.next_record_lsn { + // All the records in this batch were seen by the shard + // However, the batch maps to a chunk of WAL that the + // shard has not yet seen. Notify it of the start LSN + // of the PG WAL chunk such that it doesn't look like a gap. + InterpretedWalRecords { + records: Vec::default(), + next_record_lsn: state.next_record_lsn, + raw_wal_start_lsn: Some(batch_wal_start_lsn), + } + } else { + // The shard has seen this chunk of WAL before. Skip it. + continue; }; let res = state.tx.send(Batch { @@ -403,7 +432,7 @@ impl InterpretedWalReader { if res.is_err() { shard_senders_to_remove.push(shard_sender_id); } else { - state.next_record_lsn = max_next_record_lsn; + state.next_record_lsn = std::cmp::max(state.next_record_lsn, max_next_record_lsn); } } } diff --git a/scripts/generate_image_maps.py b/scripts/generate_image_maps.py index a2f553d290..915eb33673 100644 --- a/scripts/generate_image_maps.py +++ b/scripts/generate_image_maps.py @@ -6,6 +6,9 @@ build_tag = os.environ["BUILD_TAG"] branch = os.environ["BRANCH"] dev_acr = os.environ["DEV_ACR"] prod_acr = os.environ["PROD_ACR"] +dev_aws = os.environ["DEV_AWS"] +prod_aws = os.environ["PROD_AWS"] +aws_region = os.environ["AWS_REGION"] components = { "neon": ["neon"], @@ -24,11 +27,11 @@ components = { registries = { "dev": [ "docker.io/neondatabase", - "369495373322.dkr.ecr.eu-central-1.amazonaws.com", + f"{dev_aws}.dkr.ecr.{aws_region}.amazonaws.com", f"{dev_acr}.azurecr.io/neondatabase", ], "prod": [ - "093970136003.dkr.ecr.eu-central-1.amazonaws.com", + f"{prod_aws}.dkr.ecr.{aws_region}.amazonaws.com", f"{prod_acr}.azurecr.io/neondatabase", ], } diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index a93bbdeaaf..08c80bc141 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -24,6 +24,7 @@ hex.workspace = true hyper0.workspace = true humantime.workspace = true itertools.workspace = true +json-structural-diff.workspace = true lasso.workspace = true once_cell.workspace = true pageserver_api.workspace = true @@ -34,6 +35,7 @@ reqwest = { workspace = true, features = ["stream"] } routerify.workspace = true safekeeper_api.workspace = true safekeeper_client.workspace = true +tikv-jemallocator.workspace = true regex.workspace = true rustls-native-certs.workspace = true serde.workspace = true diff --git a/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/down.sql b/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/down.sql new file mode 100644 index 0000000000..0f051d3ac3 --- /dev/null +++ b/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/down.sql @@ -0,0 +1 @@ +ALTER TABLE nodes DROP listen_https_port; diff --git 
a/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/up.sql b/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/up.sql new file mode 100644 index 0000000000..172237d477 --- /dev/null +++ b/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/up.sql @@ -0,0 +1 @@ +ALTER TABLE nodes ADD listen_https_port INTEGER; diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index 6f110d3294..52b6110667 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -10,7 +10,10 @@ use std::{ }; use tokio_util::sync::CancellationToken; -use pageserver_api::{controller_api::NodeAvailability, models::PageserverUtilization}; +use pageserver_api::{ + controller_api::{NodeAvailability, SkSchedulingPolicy}, + models::PageserverUtilization, +}; use thiserror::Error; use utils::{id::NodeId, logging::SecretString}; @@ -137,8 +140,13 @@ where request = self.receiver.recv() => { match request { Some(req) => { + if req.reply.is_closed() { + // Prevent a possibly infinite buildup of the receiver channel, if requests arrive faster than we can handle them + continue; + } let res = self.heartbeat(req.servers).await; - req.reply.send(res).unwrap(); + // Ignore the return value in order to not panic if the heartbeat function's future was cancelled + _ = req.reply.send(res); }, None => { return; } } @@ -311,6 +319,9 @@ impl HeartBeat for HeartbeaterTask for HeartbeaterTask SafekeeperState::Offline, + Err(e) => { + tracing::info!( + "Marking safekeeper {} at as offline: {e}", + sk.base_url() + ); + SafekeeperState::Offline + } }; Some((*node_id, status)) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 8994721267..33b3d88c25 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -9,7 +9,10 @@ use crate::service::{LeadershipStatus, Service, RECONCILE_TIMEOUT, STARTUP_RECON use anyhow::Context; use futures::Future; use http_utils::{ - endpoint::{self, auth_middleware, check_permission_with, request_span}, + endpoint::{ + self, auth_middleware, check_permission_with, profile_cpu_handler, profile_heap_handler, + request_span, + }, error::ApiError, failpoints::failpoints_handler, json::{json_request, json_response}, @@ -54,7 +57,7 @@ pub struct HttpState { service: Arc, auth: Option>, neon_metrics: NeonMetrics, - allowlist_routes: Vec, + allowlist_routes: &'static [&'static str], } impl HttpState { @@ -63,15 +66,17 @@ impl HttpState { auth: Option>, build_info: BuildInfo, ) -> Self { - let allowlist_routes = ["/status", "/ready", "/metrics"] - .iter() - .map(|v| v.parse().unwrap()) - .collect::>(); Self { service, auth, neon_metrics: NeonMetrics::new(build_info), - allowlist_routes, + allowlist_routes: &[ + "/status", + "/ready", + "/metrics", + "/profile/cpu", + "/profile/heap", + ], } } } @@ -593,7 +598,10 @@ async fn handle_tenant_timeline_passthrough( let _timer = latency.start_timer(labels.clone()); - let client = mgmt_api::Client::new(node.base_url(), service.get_config().jwt_token.as_deref()); + let client = mgmt_api::Client::new( + node.base_url(), + service.get_config().pageserver_jwt_token.as_deref(), + ); let resp = client.get_raw(path).await.map_err(|e| // We return 503 here because if we can't successfully send a request to the pageserver, // either we aren't available or the pageserver is unavailable. 
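The heartbeater change above (skip a request whose reply channel is already closed, and ignore a failed send instead of unwrapping it) is the usual guard when replying over a tokio oneshot channel whose requester may have been cancelled. A minimal sketch, with a hypothetical `expensive_check` standing in for the heartbeat round:

use tokio::sync::{mpsc, oneshot};

async fn expensive_check() -> &'static str {
    "ok"
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::channel::<oneshot::Sender<&'static str>>(8);

    let worker = tokio::spawn(async move {
        while let Some(reply) = rx.recv().await {
            // If the requester already gave up (its receiver is gone), skip the work:
            // this keeps the queue from filling with requests nobody is waiting for.
            if reply.is_closed() {
                continue;
            }
            let res = expensive_check().await;
            // The requester may still disappear between the check and the send;
            // ignoring the error avoids panicking on `unwrap()`.
            _ = reply.send(res);
        }
    });

    let (reply_tx, reply_rx) = oneshot::channel();
    tx.send(reply_tx).await.unwrap();
    assert_eq!(reply_rx.await.unwrap(), "ok");

    drop(tx); // close the request channel so the worker exits
    worker.await.unwrap();
}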
@@ -1349,10 +1357,7 @@ async fn handle_safekeeper_scheduling_policy( .set_safekeeper_scheduling_policy(id, body.scheduling_policy) .await?; - Ok(Response::builder() - .status(StatusCode::NO_CONTENT) - .body(Body::empty()) - .unwrap()) + json_response(StatusCode::OK, ()) } /// Common wrapper for request handlers that call into Service and will operate on tenants: they must only @@ -1416,23 +1421,26 @@ pub fn prologue_leadership_status_check_middleware< let state = get_state(&req); let leadership_status = state.service.get_leadership_status(); - enum AllowedRoutes<'a> { + enum AllowedRoutes { All, - Some(Vec<&'a str>), + Some(&'static [&'static str]), } let allowed_routes = match leadership_status { LeadershipStatus::Leader => AllowedRoutes::All, LeadershipStatus::SteppedDown => AllowedRoutes::All, - LeadershipStatus::Candidate => { - AllowedRoutes::Some(["/ready", "/status", "/metrics"].to_vec()) - } + LeadershipStatus::Candidate => AllowedRoutes::Some(&[ + "/ready", + "/status", + "/metrics", + "/profile/cpu", + "/profile/heap", + ]), }; - let uri = req.uri().to_string(); match allowed_routes { AllowedRoutes::All => Ok(req), - AllowedRoutes::Some(allowed) if allowed.contains(&uri.as_str()) => Ok(req), + AllowedRoutes::Some(allowed) if allowed.contains(&req.uri().path()) => Ok(req), _ => { tracing::info!( "Request {} not allowed due to current leadership state", @@ -1541,7 +1549,8 @@ enum ForwardOutcome { /// Potentially forward the request to the current storage controler leader. /// More specifically we forward when: -/// 1. Request is not one of ["/control/v1/step_down", "/status", "/ready", "/metrics"] +/// 1. Request is not one of: +/// ["/control/v1/step_down", "/status", "/ready", "/metrics", "/profile/cpu", "/profile/heap"] /// 2. Current instance is in [`LeadershipStatus::SteppedDown`] state /// 3. There is a leader in the database to forward to /// 4. Leader from step (3) is not the current instance @@ -1562,10 +1571,17 @@ enum ForwardOutcome { /// Hence, if we are in the edge case scenario the leader persisted in the database is the /// stepped down instance that received the request. Condition (4) above covers this scenario. 
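Restated as code, the four conditions in the comment above look roughly like the following (an illustrative distillation only: the real `maybe_forward` below works on the hyper request and the persisted leader record, not these simplified stand-ins):

const NOT_FOR_FORWARD: &[&str] = &[
    "/control/v1/step_down",
    "/status",
    "/ready",
    "/metrics",
    "/profile/cpu",
    "/profile/heap",
];

#[derive(PartialEq)]
enum LeadershipStatus {
    Leader,
    SteppedDown,
}

/// Returns the leader address to forward to, only if all four conditions hold.
fn forward_target(
    path: &str,
    status: LeadershipStatus,
    leader_in_db: Option<&str>,
    own_address: &str,
) -> Option<String> {
    // (1) the route is forwardable
    if NOT_FOR_FORWARD.contains(&path) {
        return None;
    }
    // (2) this instance has stepped down
    if status != LeadershipStatus::SteppedDown {
        return None;
    }
    // (3) a leader is recorded in the database
    let leader = leader_in_db?;
    // (4) and that leader is not us
    if leader == own_address {
        return None;
    }
    Some(leader.to_string())
}

fn main() {
    assert_eq!(
        forward_target("/metrics", LeadershipStatus::SteppedDown, Some("http://b"), "http://a"),
        None
    );
    assert_eq!(
        forward_target("/v1/tenant", LeadershipStatus::SteppedDown, Some("http://b"), "http://a"),
        Some("http://b".to_string())
    );
    assert_eq!(
        forward_target("/v1/tenant", LeadershipStatus::Leader, Some("http://b"), "http://a"),
        None
    );
}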
async fn maybe_forward(req: Request) -> ForwardOutcome { - const NOT_FOR_FORWARD: [&str; 4] = ["/control/v1/step_down", "/status", "/ready", "/metrics"]; + const NOT_FOR_FORWARD: &[&str] = &[ + "/control/v1/step_down", + "/status", + "/ready", + "/metrics", + "/profile/cpu", + "/profile/heap", + ]; - let uri = req.uri().to_string(); - let uri_for_forward = !NOT_FOR_FORWARD.contains(&uri.as_str()); + let uri = req.uri(); + let uri_for_forward = !NOT_FOR_FORWARD.contains(&uri.path()); // Fast return before trying to take any Service locks, if we will never forward anyway if !uri_for_forward { @@ -1765,7 +1781,7 @@ pub fn make_router( if auth.is_some() { router = router.middleware(auth_middleware(|request| { let state = get_state(request); - if state.allowlist_routes.contains(request.uri()) { + if state.allowlist_routes.contains(&request.uri().path()) { None } else { state.auth.as_deref() @@ -1778,13 +1794,19 @@ pub fn make_router( .get("/metrics", |r| { named_request_span(r, measured_metrics_handler, RequestName("metrics")) }) - // Non-prefixed generic endpoints (status, metrics) + // Non-prefixed generic endpoints (status, metrics, profiling) .get("/status", |r| { named_request_span(r, handle_status, RequestName("status")) }) .get("/ready", |r| { named_request_span(r, handle_ready, RequestName("ready")) }) + .get("/profile/cpu", |r| { + named_request_span(r, profile_cpu_handler, RequestName("profile_cpu")) + }) + .get("/profile/heap", |r| { + named_request_span(r, profile_heap_handler, RequestName("profile_heap")) + }) // Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix .post("/upcall/v1/re-attach", |r| { named_request_span(r, handle_re_attach, RequestName("upcall_v1_reattach")) diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index ea6bc38e89..18922b9e05 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -27,6 +27,16 @@ use utils::{project_build_tag, project_git_version, tcp_listener}; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +/// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21). +/// This adds roughly 3% overhead for allocations on average, which is acceptable considering +/// performance-sensitive code will avoid allocations as far as possible anyway. +#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; + #[derive(Parser)] #[command(author, version, about, long_about = None)] #[command(arg_required_else_help(true))] @@ -43,6 +53,10 @@ struct Cli { #[arg(long)] jwt_token: Option, + /// Token for authenticating this service with the safekeepers it controls + #[arg(long)] + safekeeper_jwt_token: Option, + /// Token for authenticating this service with the control plane, when calling /// the compute notification endpoint #[arg(long)] @@ -116,6 +130,10 @@ struct Cli { #[arg(long)] long_reconcile_threshold: Option, + + // Flag to use https for requests to pageserver API. 
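The jemalloc setup above enables heap profiling with a 2 MiB sampling interval (`lg_prof_sample:21`, i.e. one sample per 2^21 bytes allocated) at roughly 3% allocation overhead. A standalone sketch of the same wiring, assuming the `tikv-jemallocator` crate built with its profiling feature enabled:

```rust
use tikv_jemallocator::Jemalloc;

#[global_allocator]
static GLOBAL: Jemalloc = Jemalloc;

// jemalloc parses this as a C string, hence the trailing NUL byte.
//   prof:true          enable the profiler (requires jemalloc built with profiling)
//   prof_active:true   start sampling immediately
//   lg_prof_sample:21  sample one allocation per 2^21 bytes = 2 MiB allocated
#[allow(non_upper_case_globals)]
#[export_name = "malloc_conf"]
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";

fn main() {
    // The comment in the diff refers to this arithmetic: 2^21 bytes is 2 MiB.
    assert_eq!(1usize << 21, 2 * 1024 * 1024);

    // Allocate 64 MiB so the profiler has something to sample.
    let v: Vec<u8> = vec![0; 64 << 20];
    println!("allocated {} bytes", v.len());
}
```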
+ #[arg(long, default_value = "false")] + use_https_pageserver_api: bool, } enum StrictMode { @@ -139,7 +157,8 @@ impl Default for StrictMode { struct Secrets { database_url: String, public_key: Option, - jwt_token: Option, + pageserver_jwt_token: Option, + safekeeper_jwt_token: Option, control_plane_jwt_token: Option, peer_jwt_token: Option, } @@ -147,6 +166,7 @@ struct Secrets { impl Secrets { const DATABASE_URL_ENV: &'static str = "DATABASE_URL"; const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN"; + const SAFEKEEPER_JWT_TOKEN_ENV: &'static str = "SAFEKEEPER_JWT_TOKEN"; const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN"; const PEER_JWT_TOKEN_ENV: &'static str = "PEER_JWT_TOKEN"; const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY"; @@ -170,7 +190,14 @@ impl Secrets { let this = Self { database_url, public_key, - jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV), + pageserver_jwt_token: Self::load_secret( + &args.jwt_token, + Self::PAGESERVER_JWT_TOKEN_ENV, + ), + safekeeper_jwt_token: Self::load_secret( + &args.safekeeper_jwt_token, + Self::SAFEKEEPER_JWT_TOKEN_ENV, + ), control_plane_jwt_token: Self::load_secret( &args.control_plane_jwt_token, Self::CONTROL_PLANE_JWT_TOKEN_ENV, @@ -250,11 +277,17 @@ async fn async_main() -> anyhow::Result<()> { let secrets = Secrets::load(&args).await?; + // TODO: once we've rolled out the safekeeper JWT token everywhere, put it into the validation code below + tracing::info!( + "safekeeper_jwt_token set: {:?}", + secrets.safekeeper_jwt_token.is_some() + ); + // Validate required secrets and arguments are provided in strict mode match strict_mode { StrictMode::Strict if (secrets.public_key.is_none() - || secrets.jwt_token.is_none() + || secrets.pageserver_jwt_token.is_none() || secrets.control_plane_jwt_token.is_none()) => { // Production systems should always have secrets configured: if public_key was not set @@ -279,7 +312,8 @@ async fn async_main() -> anyhow::Result<()> { } let config = Config { - jwt_token: secrets.jwt_token, + pageserver_jwt_token: secrets.pageserver_jwt_token, + safekeeper_jwt_token: secrets.safekeeper_jwt_token, control_plane_jwt_token: secrets.control_plane_jwt_token, peer_jwt_token: secrets.peer_jwt_token, compute_hook_url: args.compute_hook_url, @@ -311,6 +345,7 @@ async fn async_main() -> anyhow::Result<()> { address_for_peers: args.address_for_peers, start_as_candidate: args.start_as_candidate, http_service_port: args.listen.port() as i32, + use_https_pageserver_api: args.use_https_pageserver_api, }; // Validate that we can connect to the database diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index f5c2d329e0..3762d13c10 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -1,5 +1,6 @@ use std::{str::FromStr, time::Duration}; +use anyhow::anyhow; use pageserver_api::{ controller_api::{ AvailabilityZone, NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, @@ -32,12 +33,16 @@ pub(crate) struct Node { listen_http_addr: String, listen_http_port: u16, + listen_https_port: Option, listen_pg_addr: String, listen_pg_port: u16, availability_zone_id: AvailabilityZone, + // Flag from storcon's config to use https for pageserver admin API. + // Invariant: if |true|, listen_https_port should contain a value. + use_https: bool, // This cancellation token means "stop any RPCs in flight to this node, and don't start // any more". It is not related to process shutdown. 
#[serde(skip)] @@ -56,7 +61,16 @@ pub(crate) enum AvailabilityTransition { impl Node { pub(crate) fn base_url(&self) -> String { - format!("http://{}:{}", self.listen_http_addr, self.listen_http_port) + if self.use_https { + format!( + "https://{}:{}", + self.listen_http_addr, + self.listen_https_port + .expect("https port should be specified if use_https is on") + ) + } else { + format!("http://{}:{}", self.listen_http_addr, self.listen_http_port) + } } pub(crate) fn get_id(&self) -> NodeId { @@ -82,11 +96,20 @@ impl Node { self.id == register_req.node_id && self.listen_http_addr == register_req.listen_http_addr && self.listen_http_port == register_req.listen_http_port + // Note: listen_https_port may change. See [`Self::need_update`] for mode details. + // && self.listen_https_port == register_req.listen_https_port && self.listen_pg_addr == register_req.listen_pg_addr && self.listen_pg_port == register_req.listen_pg_port && self.availability_zone_id == register_req.availability_zone_id } + // Do we need to update an existing record in DB on this registration request? + pub(crate) fn need_update(&self, register_req: &NodeRegisterRequest) -> bool { + // listen_https_port is checked here because it may change during migration to https. + // After migration, this check may be moved to registration_match. + self.listen_https_port != register_req.listen_https_port + } + /// For a shard located on this node, populate a response object /// with this node's address information. pub(crate) fn shard_location(&self, shard_id: TenantShardId) -> TenantLocateResponseShard { @@ -95,6 +118,7 @@ impl Node { node_id: self.id, listen_http_addr: self.listen_http_addr.clone(), listen_http_port: self.listen_http_port, + listen_https_port: self.listen_https_port, listen_pg_addr: self.listen_pg_addr.clone(), listen_pg_port: self.listen_pg_port, } @@ -175,25 +199,34 @@ impl Node { } } + #[allow(clippy::too_many_arguments)] pub(crate) fn new( id: NodeId, listen_http_addr: String, listen_http_port: u16, + listen_https_port: Option, listen_pg_addr: String, listen_pg_port: u16, availability_zone_id: AvailabilityZone, - ) -> Self { - Self { + use_https: bool, + ) -> anyhow::Result { + if use_https && listen_https_port.is_none() { + return Err(anyhow!("https is enabled, but node has no https port")); + } + + Ok(Self { id, listen_http_addr, listen_http_port, + listen_https_port, listen_pg_addr, listen_pg_port, scheduling: NodeSchedulingPolicy::Active, availability: NodeAvailability::Offline, availability_zone_id, + use_https, cancel: CancellationToken::new(), - } + }) } pub(crate) fn to_persistent(&self) -> NodePersistence { @@ -202,14 +235,19 @@ impl Node { scheduling_policy: self.scheduling.into(), listen_http_addr: self.listen_http_addr.clone(), listen_http_port: self.listen_http_port as i32, + listen_https_port: self.listen_https_port.map(|x| x as i32), listen_pg_addr: self.listen_pg_addr.clone(), listen_pg_port: self.listen_pg_port as i32, availability_zone_id: self.availability_zone_id.0.clone(), } } - pub(crate) fn from_persistent(np: NodePersistence) -> Self { - Self { + pub(crate) fn from_persistent(np: NodePersistence, use_https: bool) -> anyhow::Result { + if use_https && np.listen_https_port.is_none() { + return Err(anyhow!("https is enabled, but node has no https port")); + } + + Ok(Self { id: NodeId(np.node_id as u64), // At startup we consider a node offline until proven otherwise. 
availability: NodeAvailability::Offline, @@ -217,11 +255,13 @@ impl Node { .expect("Bad scheduling policy in DB"), listen_http_addr: np.listen_http_addr, listen_http_port: np.listen_http_port as u16, + listen_https_port: np.listen_https_port.map(|x| x as u16), listen_pg_addr: np.listen_pg_addr, listen_pg_port: np.listen_pg_port as u16, availability_zone_id: AvailabilityZone(np.availability_zone_id), + use_https, cancel: CancellationToken::new(), - } + }) } /// Wrapper for issuing requests to pageserver management API: takes care of generic @@ -285,8 +325,9 @@ impl Node { warn_threshold, max_retries, &format!( - "Call to node {} ({}:{}) management API", - self.id, self.listen_http_addr, self.listen_http_port + "Call to node {} ({}) management API", + self.id, + self.base_url(), ), cancel, ) @@ -302,6 +343,7 @@ impl Node { availability_zone_id: self.availability_zone_id.0.clone(), listen_http_addr: self.listen_http_addr.clone(), listen_http_port: self.listen_http_port, + listen_https_port: self.listen_https_port, listen_pg_addr: self.listen_pg_addr.clone(), listen_pg_port: self.listen_pg_port, } diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 67b60eadf3..459c11add9 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -375,18 +375,23 @@ impl Persistence { Ok(nodes) } - pub(crate) async fn update_node( + pub(crate) async fn update_node( &self, input_node_id: NodeId, - input_scheduling: NodeSchedulingPolicy, - ) -> DatabaseResult<()> { + values: V, + ) -> DatabaseResult<()> + where + V: diesel::AsChangeset + Clone + Send + Sync, + V::Changeset: diesel::query_builder::QueryFragment + Send, // valid Postgres SQL + { use crate::schema::nodes::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::UpdateNode, move |conn| { + let values = values.clone(); Box::pin(async move { let updated = diesel::update(nodes) .filter(node_id.eq(input_node_id.0 as i64)) - .set((scheduling_policy.eq(String::from(input_scheduling)),)) + .set(values) .execute(conn) .await?; Ok(updated) @@ -403,6 +408,32 @@ impl Persistence { } } + pub(crate) async fn update_node_scheduling_policy( + &self, + input_node_id: NodeId, + input_scheduling: NodeSchedulingPolicy, + ) -> DatabaseResult<()> { + use crate::schema::nodes::dsl::*; + self.update_node( + input_node_id, + scheduling_policy.eq(String::from(input_scheduling)), + ) + .await + } + + pub(crate) async fn update_node_on_registration( + &self, + input_node_id: NodeId, + input_https_port: Option, + ) -> DatabaseResult<()> { + use crate::schema::nodes::dsl::*; + self.update_node( + input_node_id, + listen_https_port.eq(input_https_port.map(|x| x as i32)), + ) + .await + } + /// At startup, load the high level state for shards, such as their config + policy. This will /// be enriched at runtime with state discovered on pageservers. /// @@ -1452,6 +1483,7 @@ pub(crate) struct NodePersistence { pub(crate) listen_pg_addr: String, pub(crate) listen_pg_port: i32, pub(crate) availability_zone_id: String, + pub(crate) listen_https_port: Option, } /// Tenant metadata health status that are stored durably. 
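Making `update_node` generic over `diesel::AsChangeset` lets callers pass either a single `column.eq(value)` expression or a tuple of them, which is what the `update_node_scheduling_policy` and `update_node_on_registration` wrappers above rely on. A hedged sketch with a made-up schema and synchronous diesel (the controller itself uses the async flavour, but the changeset mechanics are the same):

```rust
use diesel::prelude::*;

diesel::table! {
    nodes (node_id) {
        node_id -> Int8,
        scheduling_policy -> Varchar,
        listen_https_port -> Nullable<Int4>,
    }
}

// A single `column.eq(value)` expression is an AsChangeset on its own.
fn update_scheduling(conn: &mut PgConnection, id: i64, policy: &str) -> QueryResult<usize> {
    use self::nodes::dsl::*;
    diesel::update(nodes.filter(node_id.eq(id)))
        .set(scheduling_policy.eq(policy))
        .execute(conn)
}

// Tuples of `column.eq(value)` implement AsChangeset too, so one call can update
// several columns; this is what the generic helper makes possible for callers.
fn update_on_registration(
    conn: &mut PgConnection,
    id: i64,
    https_port: Option<i32>,
) -> QueryResult<usize> {
    use self::nodes::dsl::*;
    diesel::update(nodes.filter(node_id.eq(id)))
        .set((listen_https_port.eq(https_port), scheduling_policy.eq("Active")))
        .execute(conn)
}
```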
diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 48f0804926..4f0f170284 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -1,6 +1,7 @@ use crate::pageserver_client::PageserverClient; use crate::persistence::Persistence; use crate::{compute_hook, service}; +use json_structural_diff::JsonDiff; use pageserver_api::controller_api::{AvailabilityZone, MigrationConfig, PlacementPolicy}; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, TenantWaitLsnRequest, @@ -24,7 +25,7 @@ use crate::compute_hook::{ComputeHook, NotifyError}; use crate::node::Node; use crate::tenant_shard::{IntentState, ObservedState, ObservedStateDelta, ObservedStateLocation}; -const DEFAULT_HEATMAP_PERIOD: &str = "60s"; +const DEFAULT_HEATMAP_PERIOD: Duration = Duration::from_secs(60); /// Object with the lifetime of the background reconcile task that is created /// for tenants which have a difference between their intent and observed states. @@ -296,7 +297,7 @@ impl Reconciler { .location_config(tenant_shard_id, config.clone(), flush_ms, lazy) .await }, - &self.service_config.jwt_token, + &self.service_config.pageserver_jwt_token, 1, 3, timeout, @@ -417,7 +418,7 @@ impl Reconciler { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.service_config.jwt_token.as_deref(), + self.service_config.pageserver_jwt_token.as_deref(), ); client @@ -440,7 +441,7 @@ impl Reconciler { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.service_config.jwt_token.as_deref(), + self.service_config.pageserver_jwt_token.as_deref(), ); let timelines = client.timeline_list(&tenant_shard_id).await?; @@ -478,7 +479,7 @@ impl Reconciler { ) .await }, - &self.service_config.jwt_token, + &self.service_config.pageserver_jwt_token, 1, 3, request_download_timeout * 2, @@ -771,7 +772,7 @@ impl Reconciler { let observed_conf = match attached_node .with_client_retries( |client| async move { client.get_location_config(tenant_shard_id).await }, - &self.service_config.jwt_token, + &self.service_config.pageserver_jwt_token, 1, 1, Duration::from_secs(5), @@ -880,7 +881,27 @@ impl Reconciler { self.generation = Some(generation); wanted_conf.generation = generation.into(); } - tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); + + let diff = match observed { + Some(ObservedStateLocation { + conf: Some(observed), + }) => { + let diff = JsonDiff::diff( + &serde_json::to_value(observed.clone()).unwrap(), + &serde_json::to_value(wanted_conf.clone()).unwrap(), + false, + ); + + if let Some(json_diff) = diff.diff { + serde_json::to_string(&json_diff).unwrap_or("diff err".to_string()) + } else { + "unknown".to_string() + } + } + _ => "full".to_string(), + }; + + tracing::info!(node_id=%node.get_id(), "Observed configuration requires update: {diff}"); // Because `node` comes from a ref to &self, clone it before calling into a &mut self // function: this could be avoided by refactoring the state mutated by location_config into @@ -1099,7 +1120,7 @@ impl Reconciler { match origin .with_client_retries( |client| async move { client.get_location_config(tenant_shard_id).await }, - &self.service_config.jwt_token, + &self.service_config.pageserver_jwt_token, 1, 3, Duration::from_secs(5), @@ -1180,7 +1201,7 @@ fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig let mut config = config.clone(); if has_secondaries { if 
config.heatmap_period.is_none() { - config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD.to_string()); + config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD); } } else { config.heatmap_period = None; diff --git a/storage_controller/src/safekeeper.rs b/storage_controller/src/safekeeper.rs index be073d0cb9..53cd8a908b 100644 --- a/storage_controller/src/safekeeper.rs +++ b/storage_controller/src/safekeeper.rs @@ -18,12 +18,14 @@ pub struct Safekeeper { cancel: CancellationToken, listen_http_addr: String, listen_http_port: u16, + scheduling_policy: SkSchedulingPolicy, id: NodeId, availability: SafekeeperState, } impl Safekeeper { pub(crate) fn from_persistence(skp: SafekeeperPersistence, cancel: CancellationToken) -> Self { + let scheduling_policy = SkSchedulingPolicy::from_str(&skp.scheduling_policy).unwrap(); Self { cancel, listen_http_addr: skp.host.clone(), @@ -31,6 +33,7 @@ impl Safekeeper { id: NodeId(skp.id as u64), skp, availability: SafekeeperState::Offline, + scheduling_policy, } } pub(crate) fn base_url(&self) -> String { @@ -46,6 +49,13 @@ impl Safekeeper { pub(crate) fn set_availability(&mut self, availability: SafekeeperState) { self.availability = availability; } + pub(crate) fn scheduling_policy(&self) -> SkSchedulingPolicy { + self.scheduling_policy + } + pub(crate) fn set_scheduling_policy(&mut self, scheduling_policy: SkSchedulingPolicy) { + self.scheduling_policy = scheduling_policy; + self.skp.scheduling_policy = String::from(scheduling_policy); + } /// Perform an operation (which is given a [`SafekeeperClient`]) with retries pub(crate) async fn with_client_retries( &self, @@ -102,7 +112,7 @@ impl Safekeeper { warn_threshold, max_retries, &format!( - "Call to node {} ({}:{}) management API", + "Call to safekeeper {} ({}:{}) management API", self.id, self.listen_http_addr, self.listen_http_port ), cancel, @@ -129,10 +139,8 @@ impl Safekeeper { self.id.0 ); } - self.skp = crate::persistence::SafekeeperPersistence::from_upsert( - record, - SkSchedulingPolicy::from_str(&self.skp.scheduling_policy).unwrap(), - ); + self.skp = + crate::persistence::SafekeeperPersistence::from_upsert(record, self.scheduling_policy); self.listen_http_port = http_port as u16; self.listen_http_addr = host; } diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 106a7b2699..44936d018a 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -930,13 +930,16 @@ pub(crate) mod test_utils { NodeId(i), format!("httphost-{i}"), 80 + i as u16, + None, format!("pghost-{i}"), 5432 + i as u16, az_iter .next() .cloned() .unwrap_or(AvailabilityZone("test-az".to_string())), - ); + false, + ) + .unwrap(); node.set_availability(NodeAvailability::Active(test_utilization::simple(0, 0))); assert!(node.is_available()); node diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index 14c30c296d..361253bd19 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -26,6 +26,7 @@ diesel::table! { listen_pg_addr -> Varchar, listen_pg_port -> Int4, availability_zone_id -> Varchar, + listen_https_port -> Nullable, } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 5aa744f076..b9c2711192 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -348,7 +348,12 @@ pub struct Config { // All pageservers managed by one instance of this service must have // the same public key. 
This JWT token will be used to authenticate // this service to the pageservers it manages. - pub jwt_token: Option, + pub pageserver_jwt_token: Option, + + // All safekeepers managed by one instance of this service must have + // the same public key. This JWT token will be used to authenticate + // this service to the safekeepers it manages. + pub safekeeper_jwt_token: Option, // This JWT token will be used to authenticate this service to the control plane. pub control_plane_jwt_token: Option, @@ -399,6 +404,8 @@ pub struct Config { pub http_service_port: i32, pub long_reconcile_threshold: Duration, + + pub use_https_pageserver_api: bool, } impl From for ApiError { @@ -815,11 +822,12 @@ impl Service { }; tracing::info!("Sending initial heartbeats..."); - let res_ps = self - .heartbeater_ps - .heartbeat(Arc::new(nodes_to_heartbeat)) - .await; - let res_sk = self.heartbeater_sk.heartbeat(all_sks).await; + // Put a small, but reasonable timeout to get the initial heartbeats of the safekeepers to avoid a storage controller downtime + const SK_TIMEOUT: Duration = Duration::from_secs(5); + let (res_ps, res_sk) = tokio::join!( + self.heartbeater_ps.heartbeat(Arc::new(nodes_to_heartbeat)), + tokio::time::timeout(SK_TIMEOUT, self.heartbeater_sk.heartbeat(all_sks)) + ); let mut online_nodes = HashMap::new(); if let Ok(deltas) = res_ps { @@ -837,7 +845,7 @@ impl Service { } let mut online_sks = HashMap::new(); - if let Ok(deltas) = res_sk { + if let Ok(Ok(deltas)) = res_sk { for (node_id, status) in deltas.0 { match status { SafekeeperState::Available { @@ -879,7 +887,7 @@ impl Service { let response = node .with_client_retries( |client| async move { client.list_location_config().await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 5, timeout, @@ -980,7 +988,7 @@ impl Service { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.config.jwt_token.as_deref(), + self.config.pageserver_jwt_token.as_deref(), ); match client .location_config( @@ -1031,12 +1039,11 @@ impl Service { let reconciles_spawned = self.reconcile_all(); if reconciles_spawned == 0 { // Run optimizer only when we didn't find any other work to do - let optimizations = self.optimize_all().await; - if optimizations == 0 { - // Run new splits only when no optimizations are pending - self.autosplit_tenants().await; - } + self.optimize_all().await; } + // Always attempt autosplits. Sharding is crucial for bulk ingest performance, so we + // must be responsive when new projects begin ingesting and reach the threshold. + self.autosplit_tenants().await; } _ = self.reconcilers_cancel.cancelled() => return } @@ -1063,8 +1070,12 @@ impl Service { locked.safekeepers.clone() }; - let res_ps = self.heartbeater_ps.heartbeat(nodes).await; - let res_sk = self.heartbeater_sk.heartbeat(safekeepers).await; + const SK_TIMEOUT: Duration = Duration::from_secs(3); + let (res_ps, res_sk) = tokio::join!( + self.heartbeater_ps.heartbeat(nodes), + tokio::time::timeout(SK_TIMEOUT, self.heartbeater_sk.heartbeat(safekeepers)) + ); + if let Ok(deltas) = res_ps { let mut to_handle = Vec::default(); @@ -1166,7 +1177,7 @@ impl Service { } } } - if let Ok(deltas) = res_sk { + if let Ok(Ok(deltas)) = res_sk { let mut locked = self.inner.write().unwrap(); let mut safekeepers = (*locked.safekeepers).clone(); for (id, state) in deltas.0 { @@ -1397,8 +1408,8 @@ impl Service { .list_nodes() .await? 
.into_iter() - .map(Node::from_persistent) - .collect::>(); + .map(|x| Node::from_persistent(x, config.use_https_pageserver_api)) + .collect::>>()?; let nodes: HashMap = nodes.into_iter().map(|n| (n.get_id(), n)).collect(); tracing::info!("Loaded {} nodes from database.", nodes.len()); metrics::METRICS_REGISTRY @@ -1497,10 +1508,13 @@ impl Service { NodeId(node_id as u64), "".to_string(), 123, + None, "".to_string(), 123, AvailabilityZone("test_az".to_string()), - ); + false, + ) + .unwrap(); scheduler.node_upsert(&node); } @@ -1544,14 +1558,14 @@ impl Service { let reconcilers_cancel = cancel.child_token(); let heartbeater_ps = Heartbeater::new( - config.jwt_token.clone(), + config.pageserver_jwt_token.clone(), config.max_offline_interval, config.max_warming_up_interval, cancel.clone(), ); let heartbeater_sk = Heartbeater::new( - config.jwt_token.clone(), + config.safekeeper_jwt_token.clone(), config.max_offline_interval, config.max_warming_up_interval, cancel.clone(), @@ -1898,7 +1912,7 @@ impl Service { let configs = match node .with_client_retries( |client| async move { client.list_location_config().await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 5, SHORT_RECONCILE_TIMEOUT, @@ -1956,7 +1970,7 @@ impl Service { .location_config(tenant_shard_id, config, None, false) .await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 5, SHORT_RECONCILE_TIMEOUT, @@ -2912,7 +2926,9 @@ impl Service { first }; - let updated_config = base.apply_patch(patch); + let updated_config = base + .apply_patch(patch) + .map_err(|err| ApiError::BadRequest(anyhow::anyhow!(err)))?; self.set_tenant_config_and_reconcile(tenant_id, updated_config) .await } @@ -3091,7 +3107,7 @@ impl Service { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.config.jwt_token.as_deref(), + self.config.pageserver_jwt_token.as_deref(), ); tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",); @@ -3152,7 +3168,7 @@ impl Service { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.config.jwt_token.as_deref(), + self.config.pageserver_jwt_token.as_deref(), ); futs.push(async move { let result = client @@ -3275,7 +3291,7 @@ impl Service { .tenant_delete(TenantShardId::unsharded(tenant_id)) .await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 3, RECONCILE_TIMEOUT, @@ -3494,7 +3510,7 @@ impl Service { let timeline_info = create_one( shard_zero_tid, shard_zero_locations, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), create_req.clone(), ) .await?; @@ -3510,7 +3526,7 @@ impl Service { // Create timeline on remaining shards with number >0 if !targets.0.is_empty() { // If we had multiple shards, issue requests for the remainder now. 
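The heartbeat changes a few hunks above run the pageserver and safekeeper heartbeats concurrently and wrap only the safekeeper side in a timeout, so an unreachable safekeeper cannot stall the controller; that is also why the safekeeper result is matched as `Ok(Ok(deltas))`. A minimal sketch of the pattern with stand-in heartbeat functions:

```rust
use std::time::Duration;
use tokio::time::timeout;

async fn heartbeat_pageservers() -> Result<&'static str, &'static str> {
    Ok("pageserver deltas")
}

async fn heartbeat_safekeepers() -> Result<&'static str, &'static str> {
    // Imagine this occasionally hangs on an unreachable safekeeper.
    Ok("safekeeper deltas")
}

#[tokio::main]
async fn main() {
    const SK_TIMEOUT: Duration = Duration::from_secs(5);

    let (res_ps, res_sk) = tokio::join!(
        heartbeat_pageservers(),
        timeout(SK_TIMEOUT, heartbeat_safekeepers()),
    );

    if let Ok(deltas) = res_ps {
        println!("pageservers: {deltas}");
    }
    // The timeout wraps the inner result, hence the nested match: the outer Ok means
    // "finished in time", the inner Ok means "the heartbeat itself succeeded".
    if let Ok(Ok(deltas)) = res_sk {
        println!("safekeepers: {deltas}");
    }
}
```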
- let jwt = &self.config.jwt_token; + let jwt = &self.config.pageserver_jwt_token; self.tenant_for_shards( targets .0 @@ -3593,7 +3609,7 @@ impl Service { tenant_shard_id, timeline_id, node, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), req.clone(), )) }) @@ -3674,7 +3690,7 @@ impl Service { tenant_shard_id, timeline_id, node, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), )) }) .await?; @@ -3748,7 +3764,7 @@ impl Service { tenant_shard_id, timeline_id, node, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), dir, )) }) @@ -3863,7 +3879,7 @@ impl Service { futs.push(async move { node.with_client_retries( |client| op(tenant_shard_id, client), - &self.config.jwt_token, + &self.config.pageserver_jwt_token, warn_threshold, max_retries, timeout, @@ -4112,7 +4128,7 @@ impl Service { tenant_shard_id, timeline_id, node, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), )) }) .await?; @@ -4134,7 +4150,7 @@ impl Service { shard_zero_tid, timeline_id, shard_zero_locations.latest.node, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), ) .await?; Ok(shard_zero_status) @@ -4533,7 +4549,7 @@ impl Service { client.location_config(child_id, config, None, false).await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 10, Duration::from_secs(5), @@ -5133,7 +5149,7 @@ impl Service { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.config.jwt_token.as_deref(), + self.config.pageserver_jwt_token.as_deref(), ); let response = client .tenant_shard_split( @@ -5459,7 +5475,7 @@ impl Service { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.config.jwt_token.as_deref(), + self.config.pageserver_jwt_token.as_deref(), ); let scan_result = client @@ -5903,8 +5919,10 @@ impl Service { ) .await; + #[derive(PartialEq)] enum RegistrationStatus { - Matched, + UpToDate, + NeedUpdate, Mismatched, New, } @@ -5913,7 +5931,11 @@ impl Service { let locked = self.inner.read().unwrap(); if let Some(node) = locked.nodes.get(®ister_req.node_id) { if node.registration_match(®ister_req) { - RegistrationStatus::Matched + if node.need_update(®ister_req) { + RegistrationStatus::NeedUpdate + } else { + RegistrationStatus::UpToDate + } } else { RegistrationStatus::Mismatched } @@ -5923,9 +5945,9 @@ impl Service { }; match registration_status { - RegistrationStatus::Matched => { + RegistrationStatus::UpToDate => { tracing::info!( - "Node {} re-registered with matching address", + "Node {} re-registered with matching address and is up to date", register_req.node_id ); @@ -5943,7 +5965,7 @@ impl Service { "Node is already registered with different address".to_string(), )); } - RegistrationStatus::New => { + RegistrationStatus::New | RegistrationStatus::NeedUpdate => { // fallthrough } } @@ -5972,6 +5994,16 @@ impl Service { )); } + if self.config.use_https_pageserver_api && register_req.listen_https_port.is_none() { + return Err(ApiError::PreconditionFailed( + format!( + "Node {} has no https port, but use_https is enabled", + register_req.node_id + ) + .into(), + )); + } + // Ordering: we must persist the new node _before_ adding it to in-memory state. // This ensures that before we use it for anything or expose it via any external // API, it is guaranteed to be available after a restart. 
@@ -5979,13 +6011,29 @@ impl Service { register_req.node_id, register_req.listen_http_addr, register_req.listen_http_port, + register_req.listen_https_port, register_req.listen_pg_addr, register_req.listen_pg_port, register_req.availability_zone_id.clone(), + self.config.use_https_pageserver_api, ); + let new_node = match new_node { + Ok(new_node) => new_node, + Err(error) => return Err(ApiError::InternalServerError(error)), + }; - // TODO: idempotency if the node already exists in the database - self.persistence.insert_node(&new_node).await?; + match registration_status { + RegistrationStatus::New => self.persistence.insert_node(&new_node).await?, + RegistrationStatus::NeedUpdate => { + self.persistence + .update_node_on_registration( + register_req.node_id, + register_req.listen_https_port, + ) + .await? + } + _ => unreachable!("Other statuses have been processed earlier"), + } let mut locked = self.inner.write().unwrap(); let mut new_nodes = (*locked.nodes).clone(); @@ -6000,12 +6048,24 @@ impl Service { .storage_controller_pageserver_nodes .set(locked.nodes.len() as i64); - tracing::info!( - "Registered pageserver {} ({}), now have {} pageservers", - register_req.node_id, - register_req.availability_zone_id, - locked.nodes.len() - ); + match registration_status { + RegistrationStatus::New => { + tracing::info!( + "Registered pageserver {} ({}), now have {} pageservers", + register_req.node_id, + register_req.availability_zone_id, + locked.nodes.len() + ); + } + RegistrationStatus::NeedUpdate => { + tracing::info!( + "Re-registered and updated node {} ({})", + register_req.node_id, + register_req.availability_zone_id, + ); + } + _ => unreachable!("Other statuses have been processed earlier"), + } Ok(()) } @@ -6023,7 +6083,9 @@ impl Service { if let Some(scheduling) = scheduling { // Scheduling is a persistent part of Node: we must write updates to the database before // applying them in memory - self.persistence.update_node(node_id, scheduling).await?; + self.persistence + .update_node_scheduling_policy(node_id, scheduling) + .await?; } // If we're activating a node, then before setting it active we must reconcile any shard locations @@ -6594,11 +6656,12 @@ impl Service { ) -> Option { let reconcile_needed = shard.get_reconcile_needed(nodes); - match reconcile_needed { + let reconcile_reason = match reconcile_needed { ReconcileNeeded::No => return None, ReconcileNeeded::WaitExisting(waiter) => return Some(waiter), - ReconcileNeeded::Yes => { + ReconcileNeeded::Yes(reason) => { // Fall through to try and acquire units for spawning reconciler + reason } }; @@ -6637,6 +6700,7 @@ impl Service { }; shard.spawn_reconciler( + reconcile_reason, &self.result_tx, nodes, &self.compute_hook, @@ -6761,7 +6825,7 @@ impl Service { // with the frequency of background calls, this acts as an implicit rate limit that runs a small // trickle of optimizations in the background, rather than executing a large number in parallel // when a change occurs. - const MAX_OPTIMIZATIONS_EXEC_PER_PASS: usize = 2; + const MAX_OPTIMIZATIONS_EXEC_PER_PASS: usize = 16; // Synchronous prepare: scan shards for possible scheduling optimizations let candidate_work = self.optimize_all_plan(); @@ -6812,7 +6876,7 @@ impl Service { // How many candidate optimizations we will generate, before evaluating them for readniess: setting // this higher than the execution limit gives us a chance to execute some work even if the first // few optimizations we find are not ready. 
- const MAX_OPTIMIZATIONS_PLAN_PER_PASS: usize = 8; + const MAX_OPTIMIZATIONS_PLAN_PER_PASS: usize = 64; let mut work = Vec::new(); let mut locked = self.inner.write().unwrap(); @@ -7039,7 +7103,7 @@ impl Service { match attached_node .with_client_retries( |client| async move { client.tenant_heatmap_upload(tenant_shard_id).await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 3, 10, SHORT_RECONCILE_TIMEOUT, @@ -7075,7 +7139,7 @@ impl Service { ) .await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 3, 10, SHORT_RECONCILE_TIMEOUT, @@ -7130,7 +7194,7 @@ impl Service { let request = request_ref.clone(); client.top_tenant_shards(request.clone()).await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 3, 3, Duration::from_secs(5), @@ -7303,7 +7367,7 @@ impl Service { match node .with_client_retries( |client| async move { client.tenant_secondary_status(tenant_shard_id).await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 3, Duration::from_millis(250), @@ -7961,7 +8025,7 @@ impl Service { let sk = safekeepers .get_mut(&node_id) .ok_or(DatabaseError::Logical("Not found".to_string()))?; - sk.skp.scheduling_policy = String::from(scheduling_policy); + sk.set_scheduling_policy(scheduling_policy); locked.safekeepers = Arc::new(safekeepers); } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 219c0dffe7..56a36dc2df 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -481,7 +481,14 @@ pub(crate) enum ReconcileNeeded { /// spawned: wait for the existing reconciler rather than spawning a new one. WaitExisting(ReconcilerWaiter), /// shard needs reconciliation: call into [`TenantShard::spawn_reconciler`] - Yes, + Yes(ReconcileReason), +} + +#[derive(Debug)] +pub(crate) enum ReconcileReason { + ActiveNodesDirty, + UnknownLocation, + PendingComputeNotification, } /// Pending modification to the observed state of a tenant shard. @@ -1341,12 +1348,18 @@ impl TenantShard { let active_nodes_dirty = self.dirty(pageservers); - // Even if there is no pageserver work to be done, if we have a pending notification to computes, - // wake up a reconciler to send it. 
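The optimization-pass constants raised in the preceding hunks follow the plan-more-than-you-execute idea from the surrounding comments: plan up to 64 candidates per pass but execute at most 16 that are ready, so each background pass makes some progress while the small execution cap, combined with the loop frequency, acts as an implicit rate limit. An illustrative sketch (the readiness check and shard numbers are made up):

```rust
const MAX_OPTIMIZATIONS_PLAN_PER_PASS: usize = 64;
const MAX_OPTIMIZATIONS_EXEC_PER_PASS: usize = 16;

struct Optimization {
    shard: u32,
    ready: bool,
}

// Stage 1: generate a bounded set of candidate optimizations.
fn plan_candidates() -> Vec<Optimization> {
    (0u32..200)
        .map(|shard| Optimization { shard, ready: shard % 3 == 0 })
        .take(MAX_OPTIMIZATIONS_PLAN_PER_PASS)
        .collect()
}

fn main() {
    // Stage 2: execute only the ready candidates, up to the smaller budget.
    // Planning more than we execute means a pass still makes progress even when the
    // first few candidates turn out not to be ready yet.
    let to_execute: Vec<Optimization> = plan_candidates()
        .into_iter()
        .filter(|o| o.ready)
        .take(MAX_OPTIMIZATIONS_EXEC_PER_PASS)
        .collect();

    println!("executing {} optimizations this pass", to_execute.len());
    for opt in &to_execute {
        println!("  optimizing shard {}", opt.shard);
    }
}
```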
- let do_reconcile = - active_nodes_dirty || dirty_observed || self.pending_compute_notification; + let reconcile_needed = match ( + active_nodes_dirty, + dirty_observed, + self.pending_compute_notification, + ) { + (true, _, _) => ReconcileNeeded::Yes(ReconcileReason::ActiveNodesDirty), + (_, true, _) => ReconcileNeeded::Yes(ReconcileReason::UnknownLocation), + (_, _, true) => ReconcileNeeded::Yes(ReconcileReason::PendingComputeNotification), + _ => ReconcileNeeded::No, + }; - if !do_reconcile { + if matches!(reconcile_needed, ReconcileNeeded::No) { tracing::debug!("Not dirty, no reconciliation needed."); return ReconcileNeeded::No; } @@ -1389,7 +1402,7 @@ impl TenantShard { } } - ReconcileNeeded::Yes + reconcile_needed } /// Ensure the sequence number is set to a value where waiting for this value will make us wait @@ -1479,6 +1492,7 @@ impl TenantShard { #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn spawn_reconciler( &mut self, + reason: ReconcileReason, result_tx: &tokio::sync::mpsc::UnboundedSender, pageservers: &Arc>, compute_hook: &Arc, @@ -1538,7 +1552,7 @@ impl TenantShard { let reconcile_seq = self.sequence; let long_reconcile_threshold = service_config.long_reconcile_threshold; - tracing::info!(seq=%reconcile_seq, "Spawning Reconciler for sequence {}", self.sequence); + tracing::info!(seq=%reconcile_seq, "Spawning Reconciler ({reason:?})"); let must_notify = self.pending_compute_notification; let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq, tenant_id=%reconciler.tenant_shard_id.tenant_id, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 12b096a2a0..1d282971b1 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -96,7 +96,7 @@ from fixtures.utils import ( ATTACHMENT_NAME_REGEX, COMPONENT_BINARIES, USE_LFC, - allure_add_grafana_links, + allure_add_grafana_link, assert_no_errors, get_dir_size, print_gc_result, @@ -1167,15 +1167,15 @@ class NeonEnv: "max_batch_size": 32, } - # Concurrent IO (https://github.com/neondatabase/neon/issues/9378): - # enable concurrent IO by default in tests and benchmarks. - # Compat tests are exempt because old versions fail to parse the new config. 
-        get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io
         if config.test_may_use_compatibility_snapshot_binaries:
             log.info(
-                "Forcing use of binary-built-in default to avoid forward-compatibility related test failures"
+                "Skipping WAL contiguity validation to avoid forward-compatibility related test failures"
             )
-            get_vectored_concurrent_io = None
+        else:
+            # Look for gaps in WAL received from safekeepers
+            ps_cfg["validate_wal_contiguity"] = True
+
+        get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io
         if get_vectored_concurrent_io is not None:
             ps_cfg["get_vectored_concurrent_io"] = {
                 "mode": self.pageserver_get_vectored_concurrent_io,
@@ -1630,6 +1630,7 @@ def neon_env_builder(
 class PageserverPort:
     pg: int
     http: int
+    https: int | None = None
 
 
 class LogUtils:
@@ -1886,6 +1887,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
             "node_id": int(node.id),
             "listen_http_addr": "localhost",
             "listen_http_port": node.service_port.http,
+            "listen_https_port": node.service_port.https,
             "listen_pg_addr": "localhost",
             "listen_pg_port": node.service_port.pg,
             "availability_zone_id": node.az_id,
@@ -3255,7 +3257,7 @@ def remote_pg(
         end_ms = int(datetime.utcnow().timestamp() * 1000)
         if is_neon:
             # Add 10s margin to the start and end times
-            allure_add_grafana_links(
+            allure_add_grafana_link(
                 host,
                 timeline_id,
                 start_ms - 10_000,
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index 2a59eab710..84d62fb877 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -312,62 +312,46 @@ def allure_attach_from_dir(dir: Path, preserve_database_files: bool = False):
 
 GRAFANA_URL = "https://neonprod.grafana.net"
-GRAFANA_EXPLORE_URL = f"{GRAFANA_URL}/explore"
-GRAFANA_TIMELINE_INSPECTOR_DASHBOARD_URL = f"{GRAFANA_URL}/d/8G011dlnk/timeline-inspector"
-LOGS_STAGING_DATASOURCE_ID = "xHHYY0dVz"
+GRAFANA_DASHBOARD_URL = f"{GRAFANA_URL}/d/cdya0okb81zwga/cross-service-endpoint-debugging"
 
 
-def allure_add_grafana_links(host: str, timeline_id: TimelineId, start_ms: int, end_ms: int):
-    """Add links to server logs in Grafana to Allure report"""
-    links: dict[str, str] = {}
-    # We expect host to be in format like ep-divine-night-159320.us-east-2.aws.neon.build
+def allure_add_grafana_link(host: str, timeline_id: TimelineId, start_ms: int, end_ms: int):
+    """
+    Add a link to the cross-service endpoint debugging dashboard in Grafana to Allure report.
+
+    Args:
+        host (str): The host string in the format 'ep-..'.
+        timeline_id (TimelineId): The timeline identifier for the Grafana dashboard.
+        (currently ignored but may be needed in future versions of the dashboard)
+        start_ms (int): The start time in milliseconds for the Grafana dashboard.
+        end_ms (int): The end time in milliseconds for the Grafana dashboard.
+ + Example: + Given + host = '' + timeline_id = '996926d1f5ddbe7381b8840083f8fc9a' + + The generated link would be something like: + https://neonprod.grafana.net/d/cdya0okb81zwga/cross-service-endpoint-debugging?orgId=1&from=2025-02-17T21:10:00.000Z&to=2025-02-17T21:20:00.000Z&timezone=utc&var-env=dev%7Cstaging&var-input_endpoint_id=ep-holy-mouse-w2u462gi + + """ + # We expect host to be in format like ep-holy-mouse-w2u462gi.us-east-2.aws.neon.build endpoint_id, region_id, _ = host.split(".", 2) - expressions = { - "compute logs": f'{{app="compute-node-{endpoint_id}", neon_region="{region_id}"}}', - "k8s events": f'{{job="integrations/kubernetes/eventhandler"}} |~ "name=compute-node-{endpoint_id}-"', - "console logs": f'{{neon_service="console", neon_region="{region_id}"}} | json | endpoint_id = "{endpoint_id}"', - "proxy logs": f'{{neon_service="proxy-scram", neon_region="{region_id}"}}', + params = { + "orgId": 1, + "from": start_ms, + "to": end_ms, + "timezone": "utc", + "var-env": "dev|staging", + "var-input_endpoint_id": endpoint_id, } - params: dict[str, Any] = { - "datasource": LOGS_STAGING_DATASOURCE_ID, - "queries": [ - { - "expr": "", - "refId": "A", - "datasource": {"type": "loki", "uid": LOGS_STAGING_DATASOURCE_ID}, - "editorMode": "code", - "queryType": "range", - } - ], - "range": { - "from": str(start_ms), - "to": str(end_ms), - }, - } - for name, expr in expressions.items(): - params["queries"][0]["expr"] = expr - query_string = urlencode({"orgId": 1, "left": json.dumps(params)}) - links[name] = f"{GRAFANA_EXPLORE_URL}?{query_string}" + query_string = urlencode(params) + link = f"{GRAFANA_DASHBOARD_URL}?{query_string}" - timeline_qs = urlencode( - { - "orgId": 1, - "var-environment": "victoria-metrics-aws-dev", - "var-timeline_id": timeline_id, - "var-endpoint_id": endpoint_id, - "var-log_datasource": "grafanacloud-neonstaging-logs", - "from": start_ms, - "to": end_ms, - } - ) - link = f"{GRAFANA_TIMELINE_INSPECTOR_DASHBOARD_URL}?{timeline_qs}" - links["Timeline Inspector"] = link - - for name, link in links.items(): - allure.dynamic.link(link, name=name) - log.info(f"{name}: {link}") + allure.dynamic.link(link, name="Cross-Service Endpoint Debugging") + log.info(f"Cross-Service Endpoint Debugging: {link}") def start_in_background( diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 124e62999a..d49686b57c 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -29,6 +29,8 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): ".*failed to load metadata.*", ".*load failed.*load local timeline.*", ".*: layer load failed, assuming permanent failure:.*", + ".*failed to get checkpoint bytes.*", + ".*failed to get control bytes.*", ] ) @@ -75,7 +77,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): # (We don't check layer file contents on startup, when loading the timeline) # # This will change when we implement checksums for layers - with pytest.raises(Exception, match="get_values_reconstruct_data for layer ") as err: + with pytest.raises(Exception, match="failed to get checkpoint bytes") as err: pg1.start() log.info( f"As expected, compute startup failed for timeline {tenant1}/{timeline1} with corrupt layers: {err}" diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py index 99d41e410a..b360162dc1 100644 --- a/test_runner/regress/test_compute_metrics.py +++ 
b/test_runner/regress/test_compute_metrics.py
@@ -501,19 +501,31 @@ def test_compute_installed_extensions_metric(neon_simple_env: NeonEnv):
     """
     Test that the compute_installed_extensions properly reports accurate results.
 
     Important to note that currently this metric is only gathered on
-    compute start.
+    compute start. We install the neon extension into a database other than
+    postgres because compute_ctl will run `ALTER EXTENSION neon UPDATE` during
+    Postgres startup in the postgres database, creating a race condition.
     """
+    DB_NAME = "test"
+
     env = neon_simple_env
     endpoint = env.endpoints.create_start("main")
 
+    endpoint.safe_psql(f"CREATE DATABASE {DB_NAME}")
+
+    # The metric is only gathered on compute start, so restart to check that
+    # plpgsql is now in 3 databases, instead of its regular 2, template1 and
+    # postgres.
+    endpoint.stop()
+    endpoint.start()
 
     client = endpoint.http_client()
 
     def __has_plpgsql(samples: list[Sample]) -> bool:
         """
-        Check that plpgsql is installed in the template1 and postgres databases
+        Check that plpgsql is installed in the template1, postgres, and test
+        databases
         """
-        return len(samples) == 1 and samples[0].value == 2
+        return len(samples) == 1 and samples[0].value == 3
 
     wait_until(
         collect_metric(
@@ -525,8 +537,8 @@ def test_compute_installed_extensions_metric(neon_simple_env: NeonEnv):
             name="compute_installed_extensions",
         )
 
-    # Install the neon extension, so we can check for it on the restart
-    endpoint.safe_psql("CREATE EXTENSION neon VERSION '1.0'")
+    # Install the neon extension, so we can check for it on the restart.
+    endpoint.safe_psql("CREATE EXTENSION neon VERSION '1.0'", dbname=DB_NAME)
 
     # The metric is only gathered on compute start, so restart to check if the
     # neon extension will now be there.
diff --git a/test_runner/regress/test_lfc_prefetch.py b/test_runner/regress/test_lfc_prefetch.py
new file mode 100644
index 0000000000..dd422d996e
--- /dev/null
+++ b/test_runner/regress/test_lfc_prefetch.py
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+import time
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnv
+from fixtures.utils import USE_LFC
+
+
+@pytest.mark.timeout(600)
+@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
+def test_lfc_prefetch(neon_simple_env: NeonEnv):
+    """
+    Test storing prefetch results in the Local File Cache
+    """
+    env = neon_simple_env
+    endpoint = env.endpoints.create_start(
+        "main",
+        config_lines=[
+            "neon.max_file_cache_size=1GB",
+            "neon.file_cache_size_limit=1GB",
+            "effective_io_concurrency=100",
+            "shared_buffers=1MB",
+            "enable_bitmapscan=off",
+            "enable_seqscan=off",
+            "autovacuum=off",
+        ],
+    )
+    conn = endpoint.connect()
+    cur = conn.cursor()
+    cur.execute("create extension neon")
+    cur.execute("create table t(pk integer, sk integer, filler text default repeat('x',200))")
+    cur.execute("set statement_timeout=0")
+    cur.execute("select setseed(0.5)")
+    cur.execute("insert into t values (generate_series(1,1000000),random()*1000000)")
+    cur.execute("create index on t(sk)")
+    cur.execute("vacuum t")
+
+    # reset LFC
+    cur.execute("alter system set neon.file_cache_size_limit=0")
+    cur.execute("select pg_reload_conf()")
+    time.sleep(1)
+    cur.execute("alter system set neon.file_cache_size_limit='1GB'")
+    cur.execute("select pg_reload_conf()")
+
+    cur.execute(
+        "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 100000 and 200000 limit 100) s1"
+    )
+    prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"]
+    log.info(f"Unused prefetches: {prefetch_expired}")
+
+    cur.execute(
+        "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 200000 and 300000 limit 100) s2"
+    )
+    prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"]
+    log.info(f"Unused prefetches: {prefetch_expired}")
+
+    cur.execute(
+        "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 300000 and 400000 limit 100) s3"
+    )
+    prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"]
+    log.info(f"Unused prefetches: {prefetch_expired}")
+
+    cur.execute(
+        "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 100000 and 200000 limit 100) s4"
+    )
+    prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"]
+    log.info(f"Unused prefetches: {prefetch_expired}")
+
+    # if prefetch requests are not stored in LFC, we continue to send unused prefetch requests to PS
+    assert prefetch_expired > 0
+
+    cur.execute("set neon.store_prefetch_result_in_lfc=on")
+
+    cur.execute(
+        "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 500000 and 600000 limit 100) s5"
+    )
+    prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"]
+    log.info(f"Unused prefetches: {prefetch_expired}")
+
+    cur.execute(
+        "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 600000 and 700000 limit 100) s6"
+    )
+    prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"]
+    log.info(f"Unused prefetches: {prefetch_expired}")
+
+    cur.execute(
+        "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 700000 and 800000 limit 100) s7"
+    )
+    prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"]
+    log.info(f"Unused prefetches: {prefetch_expired}")
+
+    cur.execute(
+        "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 500000 and 600000 limit 100) s8"
+    )
+    prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"]
+    log.info(f"Unused prefetches: {prefetch_expired}")
+
+    # No redundant prefetch requests if prefetch results are stored in LFC
+    assert prefetch_expired == 0
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index aa375604f4..a9b897b741 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -8,9 +8,10 @@ from pathlib import Path
 from typing import TYPE_CHECKING
 
 import pytest
-from fixtures.common_types import TenantId, TenantShardId, TimelineId
+from fixtures.common_types import TenantId, TenantShardId, TimelineArchivalState, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
+    DEFAULT_BRANCH_NAME,
     NeonEnvBuilder,
     NeonPageserver,
     StorageControllerMigrationConfig,
@@ -903,6 +904,9 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder):
         remote_storage_kind=RemoteStorageKind.MOCK_S3,
     )
 
+    tenant_conf = TENANT_CONF.copy()
+    tenant_conf["heatmap_period"] = "0s"
+
     env = neon_env_builder.init_configs()
     env.start()
 
@@ -910,7 +914,7 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder):
     tenant_id = TenantId.generate()
     timeline_id = TimelineId.generate()
 
-    env.create_tenant(tenant_id, timeline_id, conf=TENANT_CONF,
placement_policy='{"Attached":1}') + env.create_tenant(tenant_id, timeline_id, conf=tenant_conf, placement_policy='{"Attached":1}') env.storage_controller.reconcile_until_idle() @@ -924,8 +928,12 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): workload.write_rows(128, upload=True) workload.write_rows(128, upload=True) workload.write_rows(128, upload=True) + + child_timeline_id = env.create_branch( + "foo", tenant_id, ancestor_branch_name=DEFAULT_BRANCH_NAME + ) + workload.write_rows(128, upload=True) - workload.stop() # Expect lots of layers assert len(ps_attached.list_layers(tenant_id, timeline_id)) > 10 @@ -934,9 +942,19 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): for ps in env.pageservers: ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "return(1000)")]) + def timeline_heatmap(tlid): + assert env.pageserver_remote_storage is not None + + heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id) + for htl in heatmap["timelines"]: + if htl["timeline_id"] == str(tlid): + return htl + + raise RuntimeError(f"No heatmap for timeline: {tlid}") + # Upload a heatmap, so that secondaries have something to download ps_attached.http_client().tenant_heatmap_upload(tenant_id) - heatmap_before_migration = env.pageserver_remote_storage.heatmap_content(tenant_id) + heatmap_before_migration = timeline_heatmap(timeline_id) # This has no chance to succeed: we have lots of layers and each one takes at least 1000ms. # However, it pulls the heatmap, which will be important later. @@ -968,17 +986,12 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): assert env.storage_controller.locate(tenant_id)[0]["node_id"] == ps_secondary.id ps_secondary.http_client().tenant_heatmap_upload(tenant_id) - heatmap_after_migration = env.pageserver_remote_storage.heatmap_content(tenant_id) + heatmap_after_migration = timeline_heatmap(timeline_id) - assert len(heatmap_before_migration["timelines"][0]["layers"]) > 0 + assert len(heatmap_before_migration["layers"]) > 0 - # The new layer map should contain all the layers in the pre-migration one - # and a new in memory layer - after_migration_heatmap_layers_count = len(heatmap_after_migration["timelines"][0]["layers"]) - assert ( - len(heatmap_before_migration["timelines"][0]["layers"]) + 1 - == after_migration_heatmap_layers_count - ) + after_migration_heatmap_layers_count = len(heatmap_after_migration["layers"]) + assert len(heatmap_before_migration["layers"]) <= after_migration_heatmap_layers_count log.info(f"Heatmap size after cold migration is {after_migration_heatmap_layers_count}") @@ -986,10 +999,71 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): TenantShardId(tenant_id, shard_number=0, shard_count=0), timeline_id ) - def all_layers_downloaded(): + # Now simulate the case where a child timeline is archived, parent layers + # are evicted and the child is unarchived. When the child is unarchived, + # itself and the parent update their heatmaps to contain layers needed by the + # child. One can warm up the timeline hierarchy since the heatmaps are ready. 
+ + def all_layers_downloaded(expected_layer_count: int): local_layers_count = len(ps_secondary.list_layers(tenant_id, timeline_id)) log.info(f"{local_layers_count=} {after_migration_heatmap_layers_count=}") - assert local_layers_count == after_migration_heatmap_layers_count + assert local_layers_count >= expected_layer_count - wait_until(all_layers_downloaded) + wait_until(lambda: all_layers_downloaded(after_migration_heatmap_layers_count)) + ps_secondary.http_client().tenant_heatmap_upload(tenant_id) + + before = ( + ps_secondary.http_client() + .get_metrics() + .query_one("pageserver_remote_ondemand_downloaded_layers_total") + .value + ) + workload.validate() + after = ( + ps_secondary.http_client() + .get_metrics() + .query_one("pageserver_remote_ondemand_downloaded_layers_total") + .value + ) + + workload.stop() + assert before == after + + def check_archival_state(state: TimelineArchivalState, tline): + timelines = ( + timeline["timeline_id"] + for timeline in ps_secondary.http_client().timeline_list(tenant_id=tenant_id) + ) + + if state == TimelineArchivalState.ARCHIVED: + assert str(tline) not in timelines + elif state == TimelineArchivalState.UNARCHIVED: + assert str(tline) in timelines + + ps_secondary.http_client().timeline_archival_config( + tenant_id, child_timeline_id, TimelineArchivalState.ARCHIVED + ) + ps_secondary.http_client().timeline_offload(tenant_id, child_timeline_id) + wait_until(lambda: check_archival_state(TimelineArchivalState.ARCHIVED, child_timeline_id)) + + ps_secondary.http_client().evict_all_layers(tenant_id, timeline_id) + ps_secondary.http_client().tenant_heatmap_upload(tenant_id) + assert len(timeline_heatmap(timeline_id)["layers"]) == 0 + + ps_secondary.http_client().timeline_archival_config( + tenant_id, child_timeline_id, TimelineArchivalState.UNARCHIVED + ) + wait_until(lambda: check_archival_state(TimelineArchivalState.UNARCHIVED, child_timeline_id)) + + ps_secondary.http_client().tenant_heatmap_upload(tenant_id) + log.info(f"Parent timeline heatmap size: {len(timeline_heatmap(timeline_id)['layers'])}") + log.info(f"Child timeline heatmap size: {len(timeline_heatmap(child_timeline_id)['layers'])}") + + expected_locally = len(timeline_heatmap(timeline_id)["layers"]) + assert expected_locally > 0 + + env.storage_controller.download_heatmap_layers( + TenantShardId(tenant_id, shard_number=0, shard_count=0), timeline_id + ) + wait_until(lambda: all_layers_downloaded(expected_locally)) diff --git a/test_runner/regress/test_pgstat.py b/test_runner/regress/test_pgstat.py new file mode 100644 index 0000000000..c31e5ef7f8 --- /dev/null +++ b/test_runner/regress/test_pgstat.py @@ -0,0 +1,83 @@ +import pytest +from fixtures.neon_fixtures import NeonEnv +from fixtures.pg_version import PgVersion + + +# +# Test that pgstat statistic is preserved across sessions +# +def test_pgstat(neon_simple_env: NeonEnv): + env = neon_simple_env + if env.pg_version == PgVersion.V14: + pytest.skip("PG14 doesn't support pgstat statistic persistence") + + n = 10000 + endpoint = env.endpoints.create_start( + "main", config_lines=["neon_pgstat_file_size_limit=100kB", "autovacuum=off"] + ) + + con = endpoint.connect() + cur = con.cursor() + + cur.execute("create table t(x integer)") + cur.execute(f"insert into t values (generate_series(1,{n}))") + cur.execute("vacuum analyze t") + cur.execute("select sum(x) from t") + cur.execute("update t set x=x+1") + + cur.execute("select pg_stat_force_next_flush()") + + cur.execute( + "select 
diff --git a/test_runner/regress/test_pgstat.py b/test_runner/regress/test_pgstat.py
new file mode 100644
index 0000000000..c31e5ef7f8
--- /dev/null
+++ b/test_runner/regress/test_pgstat.py
@@ -0,0 +1,83 @@
+import pytest
+from fixtures.neon_fixtures import NeonEnv
+from fixtures.pg_version import PgVersion
+
+
+#
+# Test that pgstat statistics are preserved across sessions
+#
+def test_pgstat(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    if env.pg_version == PgVersion.V14:
+        pytest.skip("PG14 doesn't support pgstat statistics persistence")
+
+    n = 10000
+    endpoint = env.endpoints.create_start(
+        "main", config_lines=["neon_pgstat_file_size_limit=100kB", "autovacuum=off"]
+    )
+
+    con = endpoint.connect()
+    cur = con.cursor()
+
+    cur.execute("create table t(x integer)")
+    cur.execute(f"insert into t values (generate_series(1,{n}))")
+    cur.execute("vacuum analyze t")
+    cur.execute("select sum(x) from t")
+    cur.execute("update t set x=x+1")
+
+    cur.execute("select pg_stat_force_next_flush()")
+
+    cur.execute(
+        "select seq_scan,seq_tup_read,n_tup_ins,n_tup_upd,n_live_tup,n_dead_tup, vacuum_count,analyze_count from pg_stat_user_tables"
+    )
+    rec = cur.fetchall()[0]
+    assert rec == (2, n * 2, n, n, n * 2, n, 1, 1)
+
+    endpoint.stop()
+    endpoint.start()
+
+    con = endpoint.connect()
+    cur = con.cursor()
+
+    cur.execute(
+        "select seq_scan,seq_tup_read,n_tup_ins,n_tup_upd,n_live_tup,n_dead_tup, vacuum_count,analyze_count from pg_stat_user_tables"
+    )
+    rec = cur.fetchall()[0]
+    assert rec == (2, n * 2, n, n, n * 2, n, 1, 1)
+
+    cur.execute("update t set x=x+1")
+
+    # stop without checkpoint
+    endpoint.stop(mode="immediate")
+    endpoint.start()
+
+    con = endpoint.connect()
+    cur = con.cursor()
+
+    cur.execute(
+        "select seq_scan,seq_tup_read,n_tup_ins,n_tup_upd,n_live_tup,n_dead_tup, vacuum_count,analyze_count from pg_stat_user_tables"
+    )
+    rec = cur.fetchall()[0]
+    # pgstat information should be discarded in case of abnormal termination
+    assert rec == (0, 0, 0, 0, 0, 0, 0, 0)
+
+    cur.execute("select sum(x) from t")
+
+    # create more relations to increase the size of the statistics
+    for i in range(1, 1000):
+        cur.execute(f"create table t{i}(pk integer primary key)")
+
+    cur.execute("select pg_stat_force_next_flush()")
+
+    endpoint.stop()
+    endpoint.start()
+
+    con = endpoint.connect()
+    cur = con.cursor()
+
+    cur.execute(
+        "select seq_scan,seq_tup_read,n_tup_ins,n_tup_upd,n_live_tup,n_dead_tup, vacuum_count,analyze_count from pg_stat_user_tables"
+    )
+    rec = cur.fetchall()[0]
+    # pgstat information is not restored because its size exceeds the 100kB threshold
+    assert rec == (0, 0, 0, 0, 0, 0, 0, 0)
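The new test repeats one pattern: flush pgstat with pg_stat_force_next_flush(), restart the endpoint, and re-read pg_stat_user_tables. A condensed sketch of that pattern, assuming the endpoint fixture's cursor()/stop()/start() API used elsewhere in this patch; the helper name and the relname filter are illustrative:

STATS_QUERY = (
    "select seq_scan, seq_tup_read, n_tup_ins, n_tup_upd, n_live_tup, n_dead_tup,"
    " vacuum_count, analyze_count from pg_stat_user_tables where relname = 't'"
)


def stats_survive_restart(endpoint, immediate: bool = False) -> bool:
    # Flush pgstat to disk, restart, and report whether the counters for table 't'
    # come back unchanged. An immediate stop simulates a crash, after which the
    # statistics are expected to be discarded.
    with endpoint.cursor() as cur:
        cur.execute("select pg_stat_force_next_flush()")
        cur.execute(STATS_QUERY)
        before = cur.fetchone()
    if immediate:
        endpoint.stop(mode="immediate")
    else:
        endpoint.stop()
    endpoint.start()
    with endpoint.cursor() as cur:
        cur.execute(STATS_QUERY)
        return cur.fetchone() == before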
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 88d30308f7..d5acc257b2 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -182,6 +182,13 @@ def test_storage_controller_smoke(neon_env_builder: NeonEnvBuilder, combination)
         time.sleep(1)
     assert get_node_shard_counts(env, tenant_ids)[env.pageservers[0].id] == 0
 
+    # Exercise live migration of a tenant back to the original pageserver
+    migrate_tenant = env.pageservers[1].http_client().tenant_list_locations()["tenant_shards"][0][0]
+    env.storage_controller.tenant_shard_migrate(
+        TenantShardId.parse(migrate_tenant), env.pageservers[0].id
+    )
+    assert get_node_shard_counts(env, tenant_ids)[env.pageservers[0].id] == 1
+
     # Restarting a pageserver should not detach any tenants (i.e. /re-attach works)
     before_restart = env.pageservers[1].http_client().tenant_list_locations()
     env.pageservers[1].stop()
@@ -2139,8 +2146,9 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto
     workload.validate()
 
 
+@pytest.mark.parametrize(**fixtures.utils.allpairs_versions())
 @pytest.mark.parametrize("num_azs", [1, 2])
-def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder, num_azs: int):
+def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder, num_azs: int, combination):
     """
     Graceful reststart of storage controller clusters use the drain and fill
     hooks in order to migrate attachments away from pageservers before
@@ -3238,12 +3246,17 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
     newest_info = target.get_safekeeper(inserted["id"])
     assert newest_info
     assert newest_info["scheduling_policy"] == "Pause"
-    target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned")
+    target.safekeeper_scheduling_policy(inserted["id"], "Active")
     newest_info = target.get_safekeeper(inserted["id"])
     assert newest_info
-    assert newest_info["scheduling_policy"] == "Decomissioned"
+    assert newest_info["scheduling_policy"] == "Active"
     # Ensure idempotency
-    target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned")
+    target.safekeeper_scheduling_policy(inserted["id"], "Active")
+    newest_info = target.get_safekeeper(inserted["id"])
+    assert newest_info
+    assert newest_info["scheduling_policy"] == "Active"
+    # change back to paused again
+    target.safekeeper_scheduling_policy(inserted["id"], "Pause")
 
     def storcon_heartbeat():
         assert env.storage_controller.log_contains(
@@ -3252,6 +3265,9 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
 
     wait_until(storcon_heartbeat)
 
+    # Now decommission it
+    target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned")
+
 
 def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool:
     compared = [dict(a), dict(b)]
@@ -3756,3 +3772,96 @@ def test_storage_controller_node_flap_detach_race(
             assert len(locs) == 1, f"{shard} has {len(locs)} attached locations"
 
     wait_until(validate_locations, timeout=10)
+
+
+def test_update_node_on_registration(neon_env_builder: NeonEnvBuilder):
+    """
+    Check that the storage controller handles node_register requests with updated fields correctly.
+    1. Run the storage controller and register one pageserver without an https port.
+    2. Register the same pageserver with an https port. Check that the port has been updated.
+    3. Restart the storage controller. Check that the https port is persistent.
+    4. Register the same pageserver without an https port again (rollback). Check that the port has been removed.
+    """
+    neon_env_builder.num_pageservers = 1
+    env = neon_env_builder.init_configs()
+
+    env.storage_controller.start()
+    env.storage_controller.wait_until_ready()
+
+    pageserver = env.pageservers[0]
+
+    # Step 1. Register pageserver without https port.
+    env.storage_controller.node_register(pageserver)
+    env.storage_controller.consistency_check()
+
+    nodes = env.storage_controller.node_list()
+    assert len(nodes) == 1
+    assert nodes[0]["listen_https_port"] is None
+
+    # Step 2. Register pageserver with https port.
+    pageserver.service_port.https = 1234
+    env.storage_controller.node_register(pageserver)
+    env.storage_controller.consistency_check()
+
+    nodes = env.storage_controller.node_list()
+    assert len(nodes) == 1
+    assert nodes[0]["listen_https_port"] == 1234
+
+    # Step 3. Restart the storage controller.
+    env.storage_controller.stop()
+    env.storage_controller.start()
+    env.storage_controller.wait_until_ready()
+    env.storage_controller.consistency_check()
+
+    nodes = env.storage_controller.node_list()
+    assert len(nodes) == 1
+    assert nodes[0]["listen_https_port"] == 1234
+
+    # Step 4. Register pageserver with no https port again.
+    pageserver.service_port.https = None
+    env.storage_controller.node_register(pageserver)
+    env.storage_controller.consistency_check()
+
+    nodes = env.storage_controller.node_list()
+    assert len(nodes) == 1
+    assert nodes[0]["listen_https_port"] is None
+
+
+def test_storage_controller_location_conf_equivalence(neon_env_builder: NeonEnvBuilder):
+    """
+    Validate that a storage controller restart with no shards in a transient state
+    performs zero reconciliations at start-up. Implicitly, this means that the location
+    configs returned by the pageserver are identical to the persisted state in the
+    storage controller database.
+    """
+    neon_env_builder.num_pageservers = 1
+    neon_env_builder.storage_controller_config = {
+        "start_as_candidate": False,
+    }
+
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_id = TenantId.generate()
+    env.storage_controller.tenant_create(
+        tenant_id, shard_count=2, tenant_config={"pitr_interval": "1h2m3s"}
+    )
+
+    env.storage_controller.reconcile_until_idle()
+
+    reconciles_before_restart = env.storage_controller.get_metric_value(
+        "storage_controller_reconcile_complete_total", filter={"status": "ok"}
+    )
+
+    assert reconciles_before_restart != 0
+
+    env.storage_controller.stop()
+    env.storage_controller.start()
+
+    env.storage_controller.reconcile_until_idle()
+
+    reconciles_after_restart = env.storage_controller.get_metric_value(
+        "storage_controller_reconcile_complete_total", filter={"status": "ok"}
+    )
+
+    assert reconciles_after_restart == 0
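The final assertion above depends on the reconcile counter starting from zero in the restarted process: if every shard's persisted location config matches what the pageserver reports, reconcile_until_idle() performs no work. A small sketch built only on the get_metric_value() call used in the test; the helper name is illustrative:

def ok_reconciles(storage_controller) -> float:
    # Read the successful-reconcile counter, treating a missing metric as zero.
    value = storage_controller.get_metric_value(
        "storage_controller_reconcile_complete_total", filter={"status": "ok"}
    )
    return value or 0.0


# Mirrors the test: after a restart the counter starts fresh, so a fully persisted
# state means reconcile_until_idle() leaves it at zero.
#   env.storage_controller.stop()
#   env.storage_controller.start()
#   env.storage_controller.reconcile_until_idle()
#   assert ok_reconciles(env.storage_controller) == 0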
diff --git a/test_runner/regress/test_subscriber_branching.py b/test_runner/regress/test_subscriber_branching.py
index 849d4f024d..6175643389 100644
--- a/test_runner/regress/test_subscriber_branching.py
+++ b/test_runner/regress/test_subscriber_branching.py
@@ -1,9 +1,10 @@
 from __future__ import annotations
 
+import threading
 import time
 
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv
+from fixtures.neon_fixtures import NeonEnv, logical_replication_sync
 from fixtures.utils import query_scalar, wait_until
 
 
@@ -239,3 +240,173 @@ def test_subscriber_branching(neon_simple_env: NeonEnv):
             res = scur_postgres.fetchall()
             assert len(res) == 1
             assert str(sub_child_2_timeline_id) == res[0][0]
+
+
+def test_multiple_subscription_branching(neon_simple_env: NeonEnv):
+    """
+    Test that compute_ctl can handle concurrent deletion of subscriptions in multiple databases
+    """
+    env = neon_simple_env
+
+    NUMBER_OF_DBS = 5
+
+    # Create and start the endpoint so that neon_local puts all the generated
+    # stuff into the spec.json file.
+    endpoint = env.endpoints.create_start(
+        "main",
+        config_lines=[
+            "max_replication_slots = 10",
+            "max_logical_replication_workers=10",
+            "max_worker_processes=10",
+        ],
+    )
+
+    TEST_DB_NAMES = [
+        {
+            "name": "neondb",
+            "owner": "cloud_admin",
+        },
+        {
+            "name": "publisher_db",
+            "owner": "cloud_admin",
+        },
+    ]
+
+    for i in range(NUMBER_OF_DBS):
+        TEST_DB_NAMES.append(
+            {
+                "name": f"db{i}",
+                "owner": "cloud_admin",
+            }
+        )
+
+    # Update the spec.json file to create the databases
+    # and reconfigure the endpoint to apply the changes.
+    endpoint.respec_deep(
+        **{
+            "skip_pg_catalog_updates": False,
+            "cluster": {
+                "databases": TEST_DB_NAMES,
+            },
+        }
+    )
+    endpoint.reconfigure()
+
+    connstr = endpoint.connstr(dbname="publisher_db").replace("'", "''")
+
+    # create a table, publication, replication slot and subscription for each of the databases
+    with endpoint.cursor(dbname="publisher_db") as publisher_cursor:
+        for i in range(NUMBER_OF_DBS):
+            publisher_cursor.execute(f"CREATE TABLE t{i}(a int)")
+            publisher_cursor.execute(f"CREATE PUBLICATION mypub{i} FOR TABLE t{i}")
+            publisher_cursor.execute(
+                f"select pg_catalog.pg_create_logical_replication_slot('mysub{i}', 'pgoutput');"
+            )
+            publisher_cursor.execute(f"INSERT INTO t{i} VALUES ({i})")
+
+            with endpoint.cursor(dbname=f"db{i}") as cursor:
+                cursor.execute(f"CREATE TABLE t{i}(a int)")
+                cursor.execute(
+                    f"CREATE SUBSCRIPTION mysub{i} CONNECTION '{connstr}' PUBLICATION mypub{i} WITH (create_slot = false) "
+                )
+
+    # wait for the subscriptions to be active
+    for i in range(NUMBER_OF_DBS):
+        logical_replication_sync(
+            endpoint,
+            endpoint,
+            f"mysub{i}",
+            sub_dbname=f"db{i}",
+            pub_dbname="publisher_db",
+        )
+
+    # Check that replication is working
+    for i in range(NUMBER_OF_DBS):
+        with endpoint.cursor(dbname=f"db{i}") as cursor:
+            cursor.execute(f"SELECT * FROM t{i}")
+            rows = cursor.fetchall()
+            assert len(rows) == 1
+            assert rows[0][0] == i
+
+            last_insert_lsn = query_scalar(cursor, "select pg_current_wal_insert_lsn();")
+
+    def start_publisher_workload(table_num: int, duration: int):
+        start = time.time()
+        with endpoint.cursor(dbname="publisher_db") as cur:
+            while time.time() - start < duration:
+                cur.execute(f"INSERT INTO t{table_num} SELECT FROM generate_series(1,1000)")
+
+    LOAD_DURATION = 5
+    threads = [
+        threading.Thread(target=start_publisher_workload, args=(i, LOAD_DURATION))
+        for i in range(NUMBER_OF_DBS)
+    ]
+
+    for thread in threads:
+        thread.start()
+
+    sub_child_1_timeline_id = env.create_branch(
+        "subscriber_child_1",
+        ancestor_branch_name="main",
+        ancestor_start_lsn=last_insert_lsn,
+    )
+
+    sub_child_1 = env.endpoints.create("subscriber_child_1")
+
+    sub_child_1.respec(
+        skip_pg_catalog_updates=False,
+        reconfigure_concurrency=5,
+        drop_subscriptions_before_start=True,
+        cluster={
+            "databases": TEST_DB_NAMES,
+            "roles": [],
+        },
+    )
+
+    sub_child_1.start()
+
+    # ensure that subscription deletion happened on this timeline
+    with sub_child_1.cursor() as scur_postgres:
+        scur_postgres.execute("SELECT timeline_id from neon.drop_subscriptions_done")
+        res = scur_postgres.fetchall()
+        log.info(f"res = {res}")
+        assert len(res) == 1
+        assert str(sub_child_1_timeline_id) == res[0][0]
+
+    # ensure that there are no subscriptions in the databases
+    for i in range(NUMBER_OF_DBS):
+        with sub_child_1.cursor(dbname=f"db{i}") as cursor:
+            cursor.execute("SELECT * FROM pg_catalog.pg_subscription")
+            res = cursor.fetchall()
+            assert len(res) == 0
+
+            # ensure that there are no unexpected rows in the tables
+            cursor.execute(f"SELECT * FROM t{i}")
+            rows = cursor.fetchall()
+            assert len(rows) == 1
+            assert rows[0][0] == i
+
+    for thread in threads:
+        thread.join()
+
+    # ensure that logical replication is still working in the main endpoint;
+    # wait for it to catch up
+    for i in range(NUMBER_OF_DBS):
+        logical_replication_sync(
+            endpoint,
+            endpoint,
+            f"mysub{i}",
+            sub_dbname=f"db{i}",
+            pub_dbname="publisher_db",
+        )
+
+    # verify that the data is the same in the publisher and subscriber tables
+    with endpoint.cursor(dbname="publisher_db") as publisher_cursor:
+        for i in range(NUMBER_OF_DBS):
+            with endpoint.cursor(dbname=f"db{i}") as cursor:
+                publisher_cursor.execute(f"SELECT count(*) FROM t{i}")
+                cursor.execute(f"SELECT count(*) FROM t{i}")
+                pub_res = publisher_cursor.fetchone()
+                sub_res = cursor.fetchone()
+                log.info(f"for table t{i}: pub_res = {pub_res}, sub_res = {sub_res}")
+                assert pub_res == sub_res
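The branched endpoint's behaviour under drop_subscriptions_before_start reduces to two checks: neon.drop_subscriptions_done names the new timeline, and pg_subscription is empty in every database. A sketch of both checks as one helper, reusing the cursor API shown above; the helper name is illustrative:

def assert_subscriptions_dropped(sub_endpoint, timeline_id, dbnames) -> None:
    # The drop marker should name exactly the branched timeline ...
    with sub_endpoint.cursor() as cur:
        cur.execute("SELECT timeline_id FROM neon.drop_subscriptions_done")
        assert [row[0] for row in cur.fetchall()] == [str(timeline_id)]
    # ... and no subscriptions should remain in any of the databases.
    for dbname in dbnames:
        with sub_endpoint.cursor(dbname=dbname) as cur:
            cur.execute("SELECT count(*) FROM pg_catalog.pg_subscription")
            assert cur.fetchone()[0] == 0


# Usage with the names above:
#   assert_subscriptions_dropped(
#       sub_child_1, sub_child_1_timeline_id, [f"db{i}" for i in range(NUMBER_OF_DBS)]
#   )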
diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py
index 2706ddf2f0..c17840d31c 100644
--- a/test_runner/regress/test_timeline_archive.py
+++ b/test_runner/regress/test_timeline_archive.py
@@ -823,6 +823,8 @@ def test_timeline_retain_lsn(
             [
                 ".*initial size calculation failed: PageRead.MissingKey.could not find data for key.*",
                 ".*page_service_conn_main.*could not find data for key.*",
+                ".*failed to get checkpoint bytes.*",
+                ".*failed to get control bytes.*",
             ]
         )
     if offload_child is None or "no-restart" not in offload_child:
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 81e2eef061..6ff5044377 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 81e2eef0616c65c2233c75b06f25766ae4c080c4
+Subproject commit 6ff50443773b69749e16da6db9d4f4b19064b4b7
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 9422247c58..261ed10e9b 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 9422247c582e7c1a08a4855d04af0874f8df2f34
+Subproject commit 261ed10e9b8c8dda01ad7aefb18e944e30aa161d
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index a8fea8b4be..59b2fe851f 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit a8fea8b4be43039f0782347c88a9b9b25f50c9d8
+Subproject commit 59b2fe851f8e0595f6c830b90ee766f4f1c17a0f
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 72d97d7f6a..f85cec3a0b 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,15 +1,15 @@
 {
     "v17": [
         "17.4",
-        "a8fea8b4be43039f0782347c88a9b9b25f50c9d8"
+        "59b2fe851f8e0595f6c830b90ee766f4f1c17a0f"
     ],
     "v16": [
         "16.8",
-        "9422247c582e7c1a08a4855d04af0874f8df2f34"
+        "261ed10e9b8c8dda01ad7aefb18e944e30aa161d"
     ],
     "v15": [
         "15.12",
-        "81e2eef0616c65c2233c75b06f25766ae4c080c4"
+        "6ff50443773b69749e16da6db9d4f4b19064b4b7"
     ],
     "v14": [
         "14.17",