diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index 11adc8df86..2bdb727719 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -183,7 +183,7 @@ runs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - name: Store Allure test stat in the DB (new) if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }} diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 330e875d56..037b9aeb1e 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -88,7 +88,7 @@ runs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - name: Install Python deps shell: bash -euxo pipefail {0} diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 5fc6aa247a..3aa671fab1 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -124,28 +124,28 @@ jobs: uses: actions/cache@v4 with: path: pg_install/v14 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Cache postgres v15 build id: cache_pg_15 uses: actions/cache@v4 with: 
path: pg_install/v15 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Cache postgres v16 build id: cache_pg_16 uses: actions/cache@v4 with: path: pg_install/v16 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Cache postgres v17 build id: cache_pg_17 uses: actions/cache@v4 with: path: pg_install/v17 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index ca5ff573e1..0f05276579 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -19,9 +19,16 @@ defaults: run: shell: bash -euo pipefail {0} -concurrency: - group: build-build-tools-image-${{ inputs.image-tag }} - cancel-in-progress: false +# The initial idea was to prevent the waste of resources by not re-building the `build-tools` image +# for the same tag in parallel workflow runs, and queue them to be skipped once we have +# the first image pushed to Docker registry, but GitHub's 
concurrency mechanism is not working as expected. +# GitHub can't have more than 1 job in a queue and removes the previous one, it causes failures if the dependent jobs. +# +# Ref https://github.com/orgs/community/discussions/41518 +# +# concurrency: +# group: build-build-tools-image-${{ inputs.image-tag }} +# cancel-in-progress: false # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} @@ -36,6 +43,7 @@ jobs: strategy: matrix: + debian-version: [ bullseye, bookworm ] arch: [ x64, arm64 ] runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} @@ -74,22 +82,22 @@ jobs: - uses: docker/build-push-action@v6 with: + file: Dockerfile.build-tools context: . provenance: false push: true pull: true - file: Dockerfile.build-tools - cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0},mode=max', matrix.arch) || '' }} - tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }} + build-args: | + DEBIAN_VERSION=${{ matrix.debian-version }} + cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.debian-version }}-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0}-{1},mode=max', matrix.debian-version, matrix.arch) || '' }} + tags: | + neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.debian-version }}-${{ matrix.arch }} merge-images: needs: [ build-image ] runs-on: ubuntu-22.04 - env: - IMAGE_TAG: ${{ inputs.image-tag }} - steps: - uses: docker/login-action@v3 with: @@ -97,7 +105,17 @@ jobs: password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - name: Create multi-arch image + env: + DEFAULT_DEBIAN_VERSION: bullseye + IMAGE_TAG: ${{ inputs.image-tag }} run: | - docker buildx imagetools 
create -t neondatabase/build-tools:${IMAGE_TAG} \ - neondatabase/build-tools:${IMAGE_TAG}-x64 \ - neondatabase/build-tools:${IMAGE_TAG}-arm64 + for debian_version in bullseye bookworm; do + tags=("-t" "neondatabase/build-tools:${IMAGE_TAG}-${debian_version}") + if [ "${debian_version}" == "${DEFAULT_DEBIAN_VERSION}" ]; then + tags+=("-t" "neondatabase/build-tools:${IMAGE_TAG}") + fi + + docker buildx imagetools create "${tags[@]}" \ + neondatabase/build-tools:${IMAGE_TAG}-${debian_version}-x64 \ + neondatabase/build-tools:${IMAGE_TAG}-${debian_version}-arm64 + done diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e7193cfe19..b669eaeb11 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -92,7 +92,7 @@ jobs: needs: [ check-permissions, build-build-tools-image ] runs-on: [ self-hosted, small ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -106,7 +106,7 @@ jobs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - name: Install Python deps run: ./scripts/pysync @@ -120,6 +120,24 @@ jobs: - name: Run mypy to check types run: poetry run mypy . 
+ check-codestyle-jsonnet: + needs: [ check-permissions, build-build-tools-image ] + runs-on: [ self-hosted, small ] + container: + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Check Jsonnet code formatting + run: | + make -C compute jsonnetfmt-test + # Check that the vendor/postgres-* submodules point to the # corresponding REL_*_STABLE_neon branches. check-submodules: @@ -181,7 +199,7 @@ jobs: runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -261,7 +279,7 @@ jobs: uses: ./.github/workflows/_build-and-test-locally.yml with: arch: ${{ matrix.arch }} - build-tools-image: ${{ needs.build-build-tools-image.outputs.image }} + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm build-tag: ${{ needs.tag.outputs.build-tag }} build-type: ${{ matrix.build-type }} # Run tests on all Postgres versions in release builds and only on the latest version in debug builds @@ -276,7 +294,7 @@ jobs: needs: [ check-permissions, build-build-tools-image ] runs-on: [ self-hosted, small ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -289,7 +307,7 @@ jobs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v1-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os 
}}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - name: Install Python deps run: ./scripts/pysync @@ -309,7 +327,7 @@ jobs: needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ] runs-on: [ self-hosted, small ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -367,7 +385,7 @@ jobs: runs-on: [ self-hosted, small ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -415,7 +433,7 @@ jobs: needs: [ check-permissions, build-build-tools-image, build-and-test-locally ] runs-on: [ self-hosted, small ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -559,15 +577,16 @@ jobs: ADDITIONAL_RUSTFLAGS=${{ matrix.arch == 'arm64' && '-Ctarget-feature=+lse -Ctarget-cpu=neoverse-n1' || '' }} GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} - TAG=${{ needs.build-build-tools-image.outputs.image-tag }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-bookworm + DEBIAN_VERSION=bookworm provenance: false push: true pull: true file: Dockerfile - cache-from: type=registry,ref=cache.neon.build/neon:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon:cache-{0},mode=max', matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/neon:cache-bookworm-${{ matrix.arch 
}} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon:cache-{0}-{1},mode=max', 'bookworm', matrix.arch) || '' }} tags: | - neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-${{ matrix.arch }} neon-image: needs: [ neon-image-arch, tag ] @@ -582,8 +601,9 @@ jobs: - name: Create multi-arch image run: | docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \ - neondatabase/neon:${{ needs.tag.outputs.build-tag }}-x64 \ - neondatabase/neon:${{ needs.tag.outputs.build-tag }}-arm64 + -t neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm \ + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-x64 \ + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-arm64 - uses: docker/login-action@v3 with: @@ -604,17 +624,16 @@ jobs: version: # Much data was already generated on old PG versions with bullseye's # libraries, the locales of which can cause data incompatibilities. - # However, new PG versions should check if they can be built on newer - # images, as that reduces the support burden of old and ancient - # distros. + # However, new PG versions should be build on newer images, + # as that reduces the support burden of old and ancient distros. 
- pg: v14 - debian: bullseye-slim + debian: bullseye - pg: v15 - debian: bullseye-slim + debian: bullseye - pg: v16 - debian: bullseye-slim + debian: bullseye - pg: v17 - debian: bookworm-slim + debian: bookworm arch: [ x64, arm64 ] runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} @@ -659,16 +678,16 @@ jobs: GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} PG_VERSION=${{ matrix.version.pg }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} - TAG=${{ needs.build-build-tools-image.outputs.image-tag }} - DEBIAN_FLAVOR=${{ matrix.version.debian }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} + DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false push: true pull: true file: compute/Dockerfile.compute-node - cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1},mode=max', matrix.version.pg, matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | - neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} - name: Build neon extensions test image if: matrix.version.pg == 'v16' @@ -679,17 +698,17 @@ jobs: GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} PG_VERSION=${{ matrix.version.pg }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} - TAG=${{ 
needs.build-build-tools-image.outputs.image-tag }} - DEBIAN_FLAVOR=${{ matrix.version.debian }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} + DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false push: true pull: true file: compute/Dockerfile.compute-node target: neon-pg-ext-test - cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version.pg, matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | - neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }} + neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} - name: Build compute-tools image # compute-tools are Postgres independent, so build it only once @@ -704,14 +723,16 @@ jobs: build-args: | GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} - TAG=${{ needs.build-build-tools-image.outputs.image-tag }} - DEBIAN_FLAVOR=${{ matrix.version.debian }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} + DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false push: true pull: true file: compute/Dockerfile.compute-node + cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} + cache-to: ${{ github.ref_name 
== 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} compute-node-image: needs: [ compute-node-image-arch, tag ] @@ -719,7 +740,16 @@ jobs: strategy: matrix: - version: [ v14, v15, v16, v17 ] + version: + # see the comment for `compute-node-image-arch` job + - pg: v14 + debian: bullseye + - pg: v15 + debian: bullseye + - pg: v16 + debian: bullseye + - pg: v17 + debian: bookworm steps: - uses: docker/login-action@v3 @@ -729,23 +759,26 @@ jobs: - name: Create multi-arch compute-node image run: | - docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ - neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \ - neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64 + docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ + -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - name: Create multi-arch neon-test-extensions image - if: matrix.version == 'v16' + if: matrix.version.pg == 'v16' run: | - docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ - neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \ - 
neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64 + docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ + -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ + neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ + neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - name: Create multi-arch compute-tools image - if: matrix.version == 'v17' + if: matrix.version.pg == 'v16' run: | docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \ - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \ - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-arm64 + -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - uses: docker/login-action@v3 with: @@ -753,13 +786,13 @@ jobs: username: ${{ secrets.AWS_ACCESS_KEY_DEV }} password: ${{ secrets.AWS_SECRET_KEY_DEV }} - - name: Push multi-arch compute-node-${{ matrix.version }} image to ECR + - name: Push multi-arch compute-node-${{ matrix.version.pg }} image to ECR run: | - docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ - neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ + 
neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} - name: Push multi-arch compute-tools image to ECR - if: matrix.version == 'v17' + if: matrix.version.pg == 'v16' run: | docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \ neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} @@ -770,7 +803,16 @@ jobs: strategy: fail-fast: false matrix: - version: [ v14, v15, v16, v17 ] + version: + # see the comment for `compute-node-image-arch` job + - pg: v14 + debian: bullseye + - pg: v15 + debian: bullseye + - pg: v16 + debian: bullseye + - pg: v17 + debian: bookworm env: VM_BUILDER_VERSION: v0.35.0 @@ -792,18 +834,18 @@ jobs: # it won't have the proper authentication (written at v0.6.0) - name: Pulling compute-node image run: | - docker pull neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + docker pull neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} - name: Build vm image run: | ./vm-builder \ - -spec=compute/vm-image-spec.yaml \ - -src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ - -dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + -spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \ + -src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ + -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} - name: Pushing vm-compute-node image run: | - docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} test-images: needs: [ check-permissions, tag, neon-image, compute-node-image ] @@ -1058,7 +1100,6 @@ jobs: run: | if [[ "$GITHUB_REF_NAME" == "main" ]]; then gh workflow --repo 
neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false - gh workflow --repo neondatabase/azure run deploy.yml -f dockerTag=${{needs.tag.outputs.build-tag}} elif [[ "$GITHUB_REF_NAME" == "release" ]]; then gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \ -f deployPgSniRouter=false \ diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 140aac032a..287c9ea281 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -155,7 +155,7 @@ jobs: github.ref_name == 'main' runs-on: [ self-hosted, large ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml index 23a2e3876c..df40b5beda 100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -55,7 +55,7 @@ jobs: runs-on: ubuntu-22.04 container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -150,7 +150,7 @@ jobs: runs-on: ubuntu-22.04 container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index 2e79498fc4..c196d07d3e 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -71,7 +71,6 @@ jobs: 
steps: - uses: docker/login-action@v3 - with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -94,8 +93,22 @@ jobs: az acr login --name=neoneastus2 - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR + env: + DEFAULT_DEBIAN_VERSION: bullseye run: | - docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \ - -t neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG} \ - -t neondatabase/build-tools:${TO_TAG} \ - neondatabase/build-tools:${FROM_TAG} + for debian_version in bullseye bookworm; do + tags=() + + tags+=("-t" "neondatabase/build-tools:${TO_TAG}-${debian_version}") + tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}-${debian_version}") + tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}-${debian_version}") + + if [ "${debian_version}" == "${DEFAULT_DEBIAN_VERSION}" ]; then + tags+=("-t" "neondatabase/build-tools:${TO_TAG}") + tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}") + tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}") + fi + + docker buildx imagetools create "${tags[@]}" \ + neondatabase/build-tools:${FROM_TAG}-${debian_version} + done diff --git a/Cargo.lock b/Cargo.lock index 5edf5cf7b4..ad29fa4634 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -148,9 +148,9 @@ dependencies = [ [[package]] name = "asn1-rs" -version = "0.5.2" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f6fd5ddaf0351dff5b8da21b2fb4ff8e08ddd02857f0bf69c47639106c0fff0" +checksum = "5493c3bedbacf7fd7382c6346bbd66687d12bbaad3a89a2d2c303ee6cf20b048" dependencies = [ "asn1-rs-derive", "asn1-rs-impl", @@ -164,25 +164,25 @@ dependencies = [ [[package]] name = "asn1-rs-derive" -version = "0.4.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"726535892e8eae7e70657b4c8ea93d26b8553afb1ce617caee529ef96d7dee6c" +checksum = "965c2d33e53cb6b267e148a4cb0760bc01f4904c1cd4bb4002a085bb016d1490" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.52", "synstructure", ] [[package]] name = "asn1-rs-impl" -version = "0.1.0" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2777730b2039ac0f95f093556e61b6d26cebed5393ca6f152717777cec3a42ed" +checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.52", ] [[package]] @@ -310,6 +310,33 @@ dependencies = [ "zeroize", ] +[[package]] +name = "aws-lc-rs" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f95446d919226d587817a7d21379e6eb099b97b45110a7f272a444ca5c54070" +dependencies = [ + "aws-lc-sys", + "mirai-annotations", + "paste", + "zeroize", +] + +[[package]] +name = "aws-lc-sys" +version = "0.21.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3ddc4a5b231dd6958b140ff3151b6412b3f4321fab354f399eec8f14b06df62" +dependencies = [ + "bindgen 0.69.5", + "cc", + "cmake", + "dunce", + "fs_extra", + "libc", + "paste", +] + [[package]] name = "aws-runtime" version = "1.4.3" @@ -595,7 +622,7 @@ dependencies = [ "once_cell", "pin-project-lite", "pin-utils", - "rustls 0.21.11", + "rustls 0.21.12", "tokio", "tracing", ] @@ -915,6 +942,29 @@ dependencies = [ "serde", ] +[[package]] +name = "bindgen" +version = "0.69.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" +dependencies = [ + "bitflags 2.4.1", + "cexpr", + "clang-sys", + "itertools 0.10.5", + "lazy_static", + "lazycell", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn 2.0.52", + "which", +] + [[package]] name = "bindgen" version = "0.70.1" @@ 
-924,7 +974,7 @@ dependencies = [ "bitflags 2.4.1", "cexpr", "clang-sys", - "itertools 0.12.1", + "itertools 0.10.5", "log", "prettyplease", "proc-macro2", @@ -1038,12 +1088,13 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.0.83" +version = "1.1.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +checksum = "b16803a61b81d9eabb7eae2588776c4c1e584b738ede45fdbb4c972cec1e9945" dependencies = [ "jobserver", "libc", + "shlex", ] [[package]] @@ -1169,6 +1220,15 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" +[[package]] +name = "cmake" +version = "0.1.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a" +dependencies = [ + "cc", +] + [[package]] name = "colorchoice" version = "1.0.0" @@ -1624,9 +1684,9 @@ dependencies = [ [[package]] name = "der-parser" -version = "8.2.0" +version = "9.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbd676fbbab537128ef0278adb5576cf363cff6aa22a7b24effe97347cfab61e" +checksum = "5cd0a5c643689626bec213c4d8bd4d96acc8ffdb4ad4bb6bc16abf27d5f4b553" dependencies = [ "asn1-rs", "displaydoc", @@ -1755,6 +1815,12 @@ dependencies = [ "syn 2.0.52", ] +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + [[package]] name = "dyn-clone" version = "1.0.14" @@ -2059,6 +2125,12 @@ dependencies = [ "tokio-util", ] +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + 
[[package]] name = "fsevent-sys" version = "4.1.0" @@ -2412,6 +2484,15 @@ dependencies = [ "digest", ] +[[package]] +name = "home" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name = "hostname" version = "0.4.0" @@ -2581,7 +2662,7 @@ dependencies = [ "http 0.2.9", "hyper 0.14.30", "log", - "rustls 0.21.11", + "rustls 0.21.12", "rustls-native-certs 0.6.2", "tokio", "tokio-rustls 0.24.0", @@ -2695,6 +2776,7 @@ checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e" dependencies = [ "equivalent", "hashbrown 0.14.5", + "serde", ] [[package]] @@ -2794,15 +2876,15 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.6" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "jobserver" -version = "0.1.26" +version = "0.1.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" dependencies = [ "libc", ] @@ -2906,6 +2988,12 @@ dependencies = [ "spin", ] +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + [[package]] name = "libc" version = "0.2.150" @@ -3136,6 +3224,12 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "mirai-annotations" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9be0862c1b3f26a88803c4a49de6889c10e608b3ee9344e6ef5b45fb37ad3d1" + [[package]] name = "multimap" version = 
"0.8.3" @@ -3355,9 +3449,9 @@ dependencies = [ [[package]] name = "oid-registry" -version = "0.6.1" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9bedf36ffb6ba96c2eb7144ef6270557b52e54b20c0a8e1eb2ff99a6c6959bff" +checksum = "a8d8034d9489cdaf79228eb9f6a3b8d7bb32ba00d6645ebd48eef4077ceb5bd9" dependencies = [ "asn1-rs", ] @@ -4052,14 +4146,14 @@ dependencies = [ "bytes", "once_cell", "pq_proto", - "rustls 0.22.4", + "rustls 0.23.7", "rustls-pemfile 2.1.1", "serde", "thiserror", "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls 0.25.0", + "tokio-rustls 0.26.0", "tokio-util", "tracing", ] @@ -4081,7 +4175,7 @@ name = "postgres_ffi" version = "0.1.0" dependencies = [ "anyhow", - "bindgen", + "bindgen 0.70.1", "bytes", "crc32c", "env_logger", @@ -4218,7 +4312,7 @@ checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15" dependencies = [ "bytes", "heck 0.5.0", - "itertools 0.12.1", + "itertools 0.10.5", "log", "multimap", "once_cell", @@ -4238,7 +4332,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5" dependencies = [ "anyhow", - "itertools 0.12.1", + "itertools 0.10.5", "proc-macro2", "quote", "syn 2.0.52", @@ -4296,6 +4390,7 @@ dependencies = [ "indexmap 2.0.1", "ipnet", "itertools 0.10.5", + "itoa", "jose-jwa", "jose-jwk", "lasso", @@ -4325,8 +4420,8 @@ dependencies = [ "rsa", "rstest", "rustc-hash", - "rustls 0.22.4", - "rustls-native-certs 0.7.0", + "rustls 0.23.7", + "rustls-native-certs 0.8.0", "rustls-pemfile 2.1.1", "scopeguard", "serde", @@ -4343,7 +4438,7 @@ dependencies = [ "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls 0.25.0", + "tokio-rustls 0.26.0", "tokio-tungstenite", "tokio-util", "tracing", @@ -4507,12 +4602,13 @@ dependencies = [ [[package]] name = "rcgen" -version = "0.12.1" +version = "0.13.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "48406db8ac1f3cbc7dcdb56ec355343817958a356ff430259bb07baf7607e1e1" +checksum = "54077e1872c46788540de1ea3d7f4ccb1983d12f9aa909b234468676c1a36779" dependencies = [ "pem", "ring", + "rustls-pki-types", "time", "yasna", ] @@ -4646,9 +4742,10 @@ dependencies = [ "camino-tempfile", "futures", "futures-util", + "http-body-util", "http-types", "humantime-serde", - "hyper 0.14.30", + "hyper 1.4.1", "itertools 0.10.5", "metrics", "once_cell", @@ -4690,7 +4787,7 @@ dependencies = [ "once_cell", "percent-encoding", "pin-project-lite", - "rustls 0.21.11", + "rustls 0.21.12", "rustls-pemfile 1.0.2", "serde", "serde_json", @@ -4988,9 +5085,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.21.11" +version = "0.21.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4" +checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e" dependencies = [ "log", "ring", @@ -5018,6 +5115,7 @@ version = "0.23.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebbbdb961df0ad3f2652da8f3fdc4b36122f568f968f45ad3316f26c025c677b" dependencies = [ + "aws-lc-rs", "log", "once_cell", "ring", @@ -5086,9 +5184,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.3.1" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ede67b28608b4c60685c7d54122d4400d90f62b40caee7700e700380a390fa8" +checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b" [[package]] name = "rustls-webpki" @@ -5106,6 +5204,7 @@ version = "0.102.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610" dependencies = [ + "aws-lc-rs", "ring", "rustls-pki-types", "untrusted", @@ -5309,7 +5408,7 @@ checksum = 
"00421ed8fa0c995f07cde48ba6c89e80f2b312f74ff637326f392fbfd23abe02" dependencies = [ "httpdate", "reqwest 0.12.4", - "rustls 0.21.11", + "rustls 0.21.12", "sentry-backtrace", "sentry-contexts", "sentry-core", @@ -5804,8 +5903,8 @@ dependencies = [ "postgres_ffi", "remote_storage", "reqwest 0.12.4", - "rustls 0.22.4", - "rustls-native-certs 0.7.0", + "rustls 0.23.7", + "rustls-native-certs 0.8.0", "serde", "serde_json", "storage_controller_client", @@ -5927,14 +6026,13 @@ checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" [[package]] name = "synstructure" -version = "0.12.6" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f36bdaa60a83aca3921b5259d5400cbf5e90fc51931376a9bd4a0eb79aa7210f" +checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", - "unicode-xid", + "syn 2.0.52", ] [[package]] @@ -6233,16 +6331,15 @@ dependencies = [ [[package]] name = "tokio-postgres-rustls" -version = "0.11.1" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677" +checksum = "04fb792ccd6bbcd4bba408eb8a292f70fc4a3589e5d793626f45190e6454b6ab" dependencies = [ - "futures", "ring", - "rustls 0.22.4", + "rustls 0.23.7", "tokio", "tokio-postgres", - "tokio-rustls 0.25.0", + "tokio-rustls 0.26.0", "x509-certificate", ] @@ -6252,7 +6349,7 @@ version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5" dependencies = [ - "rustls 0.21.11", + "rustls 0.21.12", "tokio", ] @@ -6675,16 +6772,15 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "ureq" -version = "2.9.7" +version = "2.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d11a831e3c0b56e438a28308e7c810799e3c118417f342d30ecec080105395cd" +checksum = "b74fc6b57825be3373f7054754755f03ac3a8f5d70015ccad699ba2029956f4a" dependencies = [ "base64 0.22.1", "log", "once_cell", - "rustls 0.22.4", + "rustls 0.23.7", "rustls-pki-types", - "rustls-webpki 0.102.2", "url", "webpki-roots 0.26.1", ] @@ -6873,7 +6969,7 @@ name = "walproposer" version = "0.1.0" dependencies = [ "anyhow", - "bindgen", + "bindgen 0.70.1", "postgres_ffi", "utils", ] @@ -7048,6 +7144,18 @@ dependencies = [ "rustls-pki-types", ] +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix", +] + [[package]] name = "whoami" version = "1.5.1" @@ -7292,7 +7400,6 @@ dependencies = [ "digest", "either", "fail", - "futures", "futures-channel", "futures-executor", "futures-io", @@ -7307,7 +7414,8 @@ dependencies = [ "hyper 1.4.1", "hyper-util", "indexmap 1.9.3", - "itertools 0.12.1", + "indexmap 2.0.1", + "itertools 0.10.5", "lazy_static", "libc", "log", @@ -7328,6 +7436,8 @@ dependencies = [ "regex-automata 0.4.3", "regex-syntax 0.8.2", "reqwest 0.12.4", + "rustls 0.23.7", + "rustls-webpki 0.102.2", "scopeguard", "serde", "serde_json", @@ -7336,7 +7446,6 @@ dependencies = [ "smallvec", "spki 0.7.3", "subtle", - "syn 1.0.109", "syn 2.0.52", "sync_wrapper 0.1.2", "tikv-jemalloc-sys", @@ -7344,6 +7453,7 @@ dependencies = [ "time-macros", "tokio", "tokio-postgres", + "tokio-rustls 0.26.0", "tokio-stream", "tokio-util", "toml_edit", @@ -7379,9 +7489,9 @@ dependencies = [ [[package]] name = "x509-parser" -version = "0.15.0" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bab0c2f54ae1d92f4fcb99c0b7ccf0b1e3451cbd395e5f115ccbdbcb18d4f634" +checksum = "fcbc162f30700d6f3f82a24bf7cc62ffe7caea42c0b2cba8bf7f3ae50cf51f69" dependencies = [ "asn1-rs", 
"data-encoding", diff --git a/Cargo.toml b/Cargo.toml index dde80f5020..4c6a24ecde 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -107,6 +107,7 @@ indexmap = "2" indoc = "2" ipnet = "2.9.0" itertools = "0.10" +itoa = "1.0.11" jsonwebtoken = "9" lasso = "0.7" libc = "0.2" @@ -141,7 +142,7 @@ reqwest-retry = "0.5" routerify = "3" rpds = "0.13" rustc-hash = "1.1.0" -rustls = "0.22" +rustls = "0.23" rustls-pemfile = "2" scopeguard = "1.1" sysinfo = "0.29.2" @@ -171,8 +172,8 @@ tikv-jemalloc-ctl = "0.5" tokio = { version = "1.17", features = ["macros"] } tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } tokio-io-timeout = "1.2.0" -tokio-postgres-rustls = "0.11.0" -tokio-rustls = "0.25" +tokio-postgres-rustls = "0.12.0" +tokio-rustls = "0.26" tokio-stream = "0.1" tokio-tar = "0.3" tokio-util = { version = "0.7.10", features = ["io", "rt"] } @@ -191,8 +192,8 @@ url = "2.2" urlencoding = "2.1" uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] } walkdir = "2.3.2" -rustls-native-certs = "0.7" -x509-parser = "0.15" +rustls-native-certs = "0.8" +x509-parser = "0.16" whoami = "1.5.1" ## TODO replace this with tracing @@ -243,7 +244,7 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" } ## Build dependencies criterion = "0.5.1" -rcgen = "0.12" +rcgen = "0.13" rstest = "0.18" camino-tempfile = "1.0.2" tonic-build = "0.12" diff --git a/Dockerfile b/Dockerfile index bdb76a4f4f..785dd4598e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,6 +7,8 @@ ARG IMAGE=build-tools ARG TAG=pinned ARG DEFAULT_PG_VERSION=17 ARG STABLE_PG_VERSION=16 +ARG DEBIAN_VERSION=bullseye +ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim # Build Postgres FROM $REPOSITORY/$IMAGE:$TAG AS pg-build @@ -57,7 +59,7 @@ RUN set -e \ # Build final image # -FROM debian:bullseye-slim +FROM debian:${DEBIAN_FLAVOR} ARG DEFAULT_PG_VERSION WORKDIR /data diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index d8bcacf228..f05c60661c 100644 --- 
a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -1,12 +1,7 @@ -FROM debian:bullseye-slim +ARG DEBIAN_VERSION=bullseye -# Use ARG as a build-time environment variable here to allow. -# It's not supposed to be set outside. -# Alternatively it can be obtained using the following command -# ``` -# . /etc/os-release && echo "${VERSION_CODENAME}" -# ``` -ARG DEBIAN_VERSION_CODENAME=bullseye +FROM debian:${DEBIAN_VERSION}-slim +ARG DEBIAN_VERSION # Add nonroot user RUN useradd -ms /bin/bash nonroot -b /home @@ -32,6 +27,7 @@ RUN set -e \ gnupg \ gzip \ jq \ + jsonnet \ libcurl4-openssl-dev \ libbz2-dev \ libffi-dev \ @@ -42,14 +38,14 @@ RUN set -e \ libseccomp-dev \ libsqlite3-dev \ libssl-dev \ - libstdc++-10-dev \ + $([[ "${DEBIAN_VERSION}" = "bullseye" ]] && libstdc++-10-dev || libstdc++-11-dev) \ libtool \ libxml2-dev \ libxmlsec1-dev \ libxxhash-dev \ lsof \ make \ - netcat \ + netcat-openbsd \ net-tools \ openssh-client \ parallel \ @@ -76,9 +72,9 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/ && mv s5cmd /usr/local/bin/s5cmd # LLVM -ENV LLVM_VERSION=18 +ENV LLVM_VERSION=19 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ - && echo "deb http://apt.llvm.org/${DEBIAN_VERSION_CODENAME}/ llvm-toolchain-${DEBIAN_VERSION_CODENAME}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ + && echo "deb http://apt.llvm.org/${DEBIAN_VERSION}/ llvm-toolchain-${DEBIAN_VERSION}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ && apt update \ && apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \ && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \ @@ -86,7 +82,7 @@ RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ # Install docker RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \ - && 
echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION_CODENAME} stable" > /etc/apt/sources.list.d/docker.list \ + && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION} stable" > /etc/apt/sources.list.d/docker.list \ && apt update \ && apt install -y docker-ce docker-ce-cli \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* @@ -103,7 +99,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws && rm awscliv2.zip # Mold: A Modern Linker -ENV MOLD_VERSION=v2.33.0 +ENV MOLD_VERSION=v2.34.1 RUN set -e \ && git clone https://github.com/rui314/mold.git \ && mkdir mold/build \ @@ -196,7 +192,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.81.0 +ENV RUSTC_VERSION=1.82.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 diff --git a/Makefile b/Makefile index 5e227ed3f5..33cfda2661 100644 --- a/Makefile +++ b/Makefile @@ -291,6 +291,7 @@ postgres-check: \ # This doesn't remove the effects of 'configure'. 
.PHONY: clean clean: postgres-clean neon-pg-clean-ext + $(MAKE) -C compute clean $(CARGO_CMD_PREFIX) cargo clean # This removes everything diff --git a/README.md b/README.md index cfc63b4708..e68ef70bdf 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati ```bash apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \ libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \ -libcurl4-openssl-dev openssl python3-poetry lsof libicu-dev +libprotobuf-dev libcurl4-openssl-dev openssl python3-poetry lsof libicu-dev ``` * On Fedora, these packages are needed: ```bash diff --git a/compute/.gitignore b/compute/.gitignore new file mode 100644 index 0000000000..70980d335a --- /dev/null +++ b/compute/.gitignore @@ -0,0 +1,5 @@ +# sql_exporter config files generated from Jsonnet +etc/neon_collector.yml +etc/neon_collector_autoscaling.yml +etc/sql_exporter.yml +etc/sql_exporter_autoscaling.yml diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index 15afb9897f..74970696b5 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -3,7 +3,8 @@ ARG REPOSITORY=neondatabase ARG IMAGE=build-tools ARG TAG=pinned ARG BUILD_TAG -ARG DEBIAN_FLAVOR=bullseye-slim +ARG DEBIAN_VERSION=bullseye +ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim ######################################################################################### # @@ -11,19 +12,23 @@ ARG DEBIAN_FLAVOR=bullseye-slim # ######################################################################################### FROM debian:$DEBIAN_FLAVOR AS build-deps -ARG DEBIAN_FLAVOR +ARG DEBIAN_VERSION -RUN case $DEBIAN_FLAVOR in \ +RUN case $DEBIAN_VERSION in \ # Version-specific installs for Bullseye (PG14-PG16): # The h3_pg extension needs a cmake 3.20+, but Debian bullseye has 3.18. # Install newer version (3.25) from backports. 
- bullseye*) \ + # libstdc++-10-dev is required for plv8 + bullseye) \ echo "deb http://deb.debian.org/debian bullseye-backports main" > /etc/apt/sources.list.d/bullseye-backports.list; \ - VERSION_INSTALLS="cmake/bullseye-backports cmake-data/bullseye-backports"; \ + VERSION_INSTALLS="cmake/bullseye-backports cmake-data/bullseye-backports libstdc++-10-dev"; \ ;; \ # Version-specific installs for Bookworm (PG17): - bookworm*) \ - VERSION_INSTALLS="cmake"; \ + bookworm) \ + VERSION_INSTALLS="cmake libstdc++-12-dev"; \ + ;; \ + *) \ + echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \ ;; \ esac && \ apt update && \ @@ -223,18 +228,33 @@ FROM build-deps AS plv8-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - apt update && \ +RUN apt update && \ apt install --no-install-recommends -y ninja-build python3-dev libncurses5 binutils clang -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ +# plv8 3.2.3 supports v17 +# last release v3.2.3 - Sep 7, 2024 +# +# clone the repo instead of downloading the release tarball because plv8 has submodule dependencies +# and the release tarball doesn't include them +# +# Use new version only for v17 +# because since v3.2, plv8 doesn't include plcoffee and plls extensions +ENV PLV8_TAG=v3.2.3 + +RUN case "${PG_VERSION}" in \ + "v17") \ + export PLV8_TAG=v3.2.3 \ + ;; \ + "v14" | "v15" | "v16") \ + export PLV8_TAG=v3.1.10 \ + ;; \ + *) \ + echo "unexpected PostgreSQL version" && exit 1 \ + ;; \ esac && \ - wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \ - echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \ - mkdir plv8-src && cd plv8-src && tar xzf ../plv8.tar.gz --strip-components=1 -C . 
&& \ + git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \ + tar -czf plv8.tar.gz --exclude .git plv8-src && \ + cd plv8-src && \ # generate and copy upgrade scripts mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \ cp upgrade/* /usr/local/pgsql/share/extension/ && \ @@ -244,8 +264,17 @@ RUN case "${PG_VERSION}" in "v17") \ find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \ # don't break computes with installed old version of plv8 cd /usr/local/pgsql/lib/ && \ - ln -s plv8-3.1.10.so plv8-3.1.5.so && \ - ln -s plv8-3.1.10.so plv8-3.1.8.so && \ + case "${PG_VERSION}" in \ + "v17") \ + ln -s plv8-3.2.3.so plv8-3.1.8.so && \ + ln -s plv8-3.2.3.so plv8-3.1.5.so && \ + ln -s plv8-3.2.3.so plv8-3.1.10.so \ + ;; \ + "v14" | "v15" | "v16") \ + ln -s plv8-3.1.10.so plv8-3.1.5.so && \ + ln -s plv8-3.1.10.so plv8-3.1.8.so \ + ;; \ + esac && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control @@ -323,11 +352,11 @@ COPY compute/patches/pgvector.patch /pgvector.patch # By default, pgvector Makefile uses `-march=native`. We don't want that, # because we build the images on different machines than where we run them. # Pass OPTFLAGS="" to remove it. -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ - esac && \ - wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \ - echo "617fba855c9bcb41a2a9bc78a78567fd2e147c72afd5bf9d37b31b9591632b30 pgvector.tar.gz" | sha256sum --check && \ +# +# vector 0.7.4 supports v17 +# last release v0.7.4 - Aug 5, 2024 +RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.4.tar.gz -O pgvector.tar.gz && \ + echo "0341edf89b1924ae0d552f617e14fb7f8867c0194ed775bcc44fa40288642583 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \ patch -p1 < /pgvector.patch && \ make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -345,7 +374,7 @@ ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific -# doesn't use releases, last commit f3d82fd - Mar 2, 2023 +# doesn't use releases, last commit f3d82fd - Mar 2, 2023 RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71c20689ce3d.tar.gz -O pgjwt.tar.gz && \ echo "dae8ed99eebb7593b43013f6532d772b12dfecd55548d2673f2dfd0163f6d2b9 pgjwt.tar.gz" | sha256sum --check && \ mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \ @@ -362,11 +391,10 @@ FROM build-deps AS hypopg-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ - esac && \ - wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \ - echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \ +# HypoPG 1.4.1 supports v17 +# last release 1.4.1 - Apr 28, 2024 +RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypopg.tar.gz && \ + echo "9afe6357fd389d8d33fad81703038ce520b09275ec00153c6c89282bcdedd6bc hypopg.tar.gz" | sha256sum --check && \ mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -403,6 +431,9 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY compute/patches/rum.patch /rum.patch +# maybe version-specific +# support for v17 is unknown +# last release 1.3.13 - Sep 19, 2022 RUN case "${PG_VERSION}" in "v17") \ echo "v17 extensions are not supported yet. Quit" && exit 0;; \ esac && \ @@ -424,11 +455,10 @@ FROM build-deps AS pgtap-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \ - echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \ +# pgtap 1.3.3 supports v17 +# last release v1.3.3 - Apr 8, 2024 +RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgtap.tar.gz && \ + echo "325ea79d0d2515bce96bce43f6823dcd3effbd6c54cb2a4d6c2384fffa3a14c7 pgtap.tar.gz" | sha256sum --check && \ mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -501,11 +531,10 @@ FROM build-deps AS plpgsql-check-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \ - echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \ +# plpgsql_check v2.7.11 supports v17 +# last release v2.7.11 - Sep 16, 2024 +RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz -O plpgsql_check.tar.gz && \ + echo "208933f8dbe8e0d2628eb3851e9f52e6892b8e280c63700c0f1ce7883625d172 plpgsql_check.tar.gz" | sha256sum --check && \ mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ @@ -523,18 +552,19 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ - esac && \ - case "${PG_VERSION}" in \ +RUN case "${PG_VERSION}" in \ "v14" | "v15") \ export TIMESCALEDB_VERSION=2.10.1 \ export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \ ;; \ - *) \ + "v16") \ export TIMESCALEDB_VERSION=2.13.0 \ export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \ ;; \ + "v17") \ + export TIMESCALEDB_VERSION=2.17.0 \ + export TIMESCALEDB_CHECKSUM=155bf64391d3558c42f31ca0e523cfc6252921974f75298c9039ccad1c89811a \ + ;; \ esac && \ wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \ echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \ @@ -557,10 +587,8 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - case "${PG_VERSION}" in \ +# version-specific, has separate releases for each version +RUN case "${PG_VERSION}" in \ "v14") \ export PG_HINT_PLAN_VERSION=14_1_4_1 \ export PG_HINT_PLAN_CHECKSUM=c3501becf70ead27f70626bce80ea401ceac6a77e2083ee5f3ff1f1444ec1ad1 \ @@ -574,7 +602,8 @@ RUN case "${PG_VERSION}" in "v17") \ export PG_HINT_PLAN_CHECKSUM=fc85a9212e7d2819d4ae4ac75817481101833c3cfa9f0fe1f980984e12347d00 \ ;; \ "v17") \ - echo "TODO: PG17 pg_hint_plan support" && exit 0 \ + export PG_HINT_PLAN_VERSION=17_1_7_0 \ + export PG_HINT_PLAN_CHECKSUM=06dd306328c67a4248f48403c50444f30959fb61ebe963248dbc2afb396fe600 \ ;; \ *) \ echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \ @@ -598,6 +627,10 @@ FROM build-deps AS pg-cron-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# 1.6.4 available, supports v17 +# This is an experimental extension that we do not support on prod yet. +# !Do not remove! 
+# We set it in shared_preload_libraries and computes will fail to start if library is not found. ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in "v17") \ echo "v17 extensions are not supported yet. Quit" && exit 0;; \ @@ -619,23 +652,37 @@ FROM build-deps AS rdkit-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - apt-get update && \ +RUN apt-get update && \ apt-get install --no-install-recommends -y \ libboost-iostreams1.74-dev \ libboost-regex1.74-dev \ libboost-serialization1.74-dev \ libboost-system1.74-dev \ - libeigen3-dev + libeigen3-dev \ + libboost-all-dev +# rdkit Release_2024_09_1 supports v17 +# last release Release_2024_09_1 - Sep 27, 2024 +# +# Use new version only for v17 +# because Release_2024_09_1 has some backward incompatible changes +# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1 ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ +RUN case "${PG_VERSION}" in \ + "v17") \ + export RDKIT_VERSION=Release_2024_09_1 \ + export RDKIT_CHECKSUM=034c00d6e9de323506834da03400761ed8c3721095114369d06805409747a60f \ + ;; \ + "v14" | "v15" | "v16") \ + export RDKIT_VERSION=Release_2023_03_3 \ + export RDKIT_CHECKSUM=bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d \ + ;; \ + *) \ + echo "unexpected PostgreSQL version" && exit 1 \ + ;; \ esac && \ - wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ - echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \ + wget https://github.com/rdkit/rdkit/archive/refs/tags/${RDKIT_VERSION}.tar.gz -O rdkit.tar.gz && \ + echo "${RDKIT_CHECKSUM} rdkit.tar.gz" | sha256sum --check && \ mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \ cmake \ -D RDK_BUILD_CAIRO_SUPPORT=OFF \ @@ -674,12 +721,11 @@ FROM build-deps AS pg-uuidv7-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# not version-specific +# last release v1.6.0 - Oct 9, 2024 ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \ - echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz -O pg_uuidv7.tar.gz && \ + echo "0fa6c710929d003f6ce276a7de7a864e9d1667b2d78be3dc2c07f2409eb55867 pg_uuidv7.tar.gz" | sha256sum --check && \ mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -750,6 +796,8 @@ RUN case "${PG_VERSION}" in \ FROM build-deps AS pg-embedding-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# This is our extension, support stopped in favor of pgvector +# TODO: deprecate it ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in \ @@ -776,6 +824,8 @@ FROM build-deps AS pg-anon-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# This is an experimental extension, never got to real production. +# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found. ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in "v17") \ echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \ @@ -925,8 +975,8 @@ ARG PG_VERSION RUN case "${PG_VERSION}" in "v17") \ echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \ esac && \ - wget https://github.com/neondatabase/pg_session_jwt/archive/ff0a72440e8ff584dab24b3f9b7c00c56c660b8e.tar.gz -O pg_session_jwt.tar.gz && \ - echo "1fbb2b5a339263bcf6daa847fad8bccbc0b451cea6a62e6d3bf232b0087f05cb pg_session_jwt.tar.gz" | sha256sum --check && \ + wget https://github.com/neondatabase/pg_session_jwt/archive/e642528f429dd3f5403845a50191b78d434b84a6.tar.gz -O pg_session_jwt.tar.gz && \ + echo "1a69210703cc91224785e59a0a67562dd9eed9a0914ac84b11447582ca0d5b93 pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . 
&& \ sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release @@ -942,13 +992,12 @@ FROM build-deps AS wal2json-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# wal2json wal2json_2_6 supports v17 +# last release wal2json_2_6 - Apr 25, 2024 ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "We'll need to update wal2json to 2.6+ for pg17 support" && exit 0;; \ - esac && \ - wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ - echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \ - mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \ +RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.gz -O wal2json.tar.gz && \ + echo "18b4bdec28c74a8fc98a11c72de38378a760327ef8e5e42e975b0029eb96ba0d wal2json.tar.gz" | sha256sum --check && \ + mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install @@ -962,12 +1011,11 @@ FROM build-deps AS pg-ivm-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# pg_ivm v1.9 supports v17 +# last release v1.9 - Jul 31 ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "We'll need to update pg_ivm to 1.9+ for pg17 support" && exit 0;; \ - esac && \ - wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ - echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_ivm.tar.gz && \ + echo "59e15722939f274650abf637f315dd723c87073496ca77236b044cb205270d8b pg_ivm.tar.gz" | sha256sum --check && \ mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -983,12 +1031,11 @@ FROM build-deps AS pg-partman-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# should support v17 https://github.com/pgpartman/pg_partman/discussions/693 +# last release 5.1.0 Apr 2, 2024 ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "pg_partman doesn't support PG17 yet" && exit 0;; \ - esac && \ - wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ - echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz -O pg_partman.tar.gz && \ + echo "3e3a27d7ff827295d5c55ef72f07a49062d6204b3cb0b9a048645d6db9f3cb9f pg_partman.tar.gz" | sha256sum --check && \ mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -1091,7 +1138,6 @@ RUN cd compute_tools && mold -run cargo build --locked --profile release-line-de ######################################################################################### FROM debian:$DEBIAN_FLAVOR AS compute-tools-image -ARG DEBIAN_FLAVOR COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl @@ -1102,7 +1148,6 @@ COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compu ######################################################################################### FROM debian:$DEBIAN_FLAVOR AS pgbouncer -ARG DEBIAN_FLAVOR RUN set -e \ && apt-get update \ && apt-get install --no-install-recommends -y \ @@ -1167,6 +1212,19 @@ RUN rm -r /usr/local/pgsql/include # if they were to be used by other libraries. RUN rm /usr/local/pgsql/lib/lib*.a +######################################################################################### +# +# Preprocess the sql_exporter configuration files +# +######################################################################################### +FROM $REPOSITORY/$IMAGE:$TAG AS sql_exporter_preprocessor +ARG PG_VERSION + +USER nonroot + +COPY --chown=nonroot compute compute + +RUN make PG_VERSION="${PG_VERSION}" -C compute ######################################################################################### # @@ -1257,7 +1315,7 @@ ENV PGDATABASE=postgres # ######################################################################################### FROM debian:$DEBIAN_FLAVOR -ARG DEBIAN_FLAVOR +ARG DEBIAN_VERSION # Add user postgres RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ echo "postgres:test_console_pass" | chpasswd && \ @@ -1285,10 +1343,10 @@ RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter COPY --from=sql-exporter 
/bin/sql_exporter /bin/sql_exporter -COPY --chmod=0644 compute/etc/sql_exporter.yml /etc/sql_exporter.yml -COPY --chmod=0644 compute/etc/neon_collector.yml /etc/neon_collector.yml -COPY --chmod=0644 compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml -COPY --chmod=0644 compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml /etc/sql_exporter.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml /etc/neon_collector.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml # Create remote extension download directory RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions @@ -1305,19 +1363,22 @@ RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/loca RUN apt update && \ - case $DEBIAN_FLAVOR in \ + case $DEBIAN_VERSION in \ # Version-specific installs for Bullseye (PG14-PG16): # libicu67, locales for collations (including ICU and plpgsql_check) # libgdal28, libproj19 for PostGIS - bullseye*) \ + bullseye) \ VERSION_INSTALLS="libicu67 libgdal28 libproj19"; \ ;; \ # Version-specific installs for Bookworm (PG17): # libicu72, locales for collations (including ICU and plpgsql_check) # libgdal32, libproj25 for PostGIS - bookworm*) \ + bookworm) \ VERSION_INSTALLS="libicu72 libgdal32 libproj25"; \ ;; \ + *) \ + echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \ + ;; \ esac && \ apt install --no-install-recommends -y \ gdb \ diff --git a/compute/Makefile b/compute/Makefile new file mode 100644 index 0000000000..e2896fe390 --- /dev/null +++ b/compute/Makefile @@ -0,0 +1,49 @@ 
+jsonnet_files = $(wildcard \ + etc/*.jsonnet \ + etc/sql_exporter/*.libsonnet) + +.PHONY: all +all: etc/neon_collector.yml etc/neon_collector_autoscaling.yml etc/sql_exporter.yml etc/sql_exporter_autoscaling.yml + +etc/neon_collector.yml: $(jsonnet_files) + JSONNET_PATH=jsonnet:etc jsonnet \ + --output-file $@ \ + --ext-str pg_version=$(PG_VERSION) \ + etc/neon_collector.jsonnet + +etc/neon_collector_autoscaling.yml: $(jsonnet_files) + JSONNET_PATH=jsonnet:etc jsonnet \ + --output-file $@ \ + --ext-str pg_version=$(PG_VERSION) \ + etc/neon_collector_autoscaling.jsonnet + +etc/sql_exporter.yml: $(jsonnet_files) + JSONNET_PATH=etc jsonnet \ + --output-file $@ \ + --tla-str collector_name=neon_collector \ + --tla-str collector_file=neon_collector.yml \ + etc/sql_exporter.jsonnet + +etc/sql_exporter_autoscaling.yml: $(jsonnet_files) + JSONNET_PATH=etc jsonnet \ + --output-file $@ \ + --tla-str collector_name=neon_collector_autoscaling \ + --tla-str collector_file=neon_collector_autoscaling.yml \ + --tla-str application_name=sql_exporter_autoscaling \ + etc/sql_exporter.jsonnet + +.PHONY: clean +clean: + rm --force \ + etc/neon_collector.yml \ + etc/neon_collector_autoscaling.yml \ + etc/sql_exporter.yml \ + etc/sql_exporter_autoscaling.yml + +.PHONY: jsonnetfmt-test +jsonnetfmt-test: + jsonnetfmt --test $(jsonnet_files) + +.PHONY: jsonnetfmt-format +jsonnetfmt-format: + jsonnetfmt --in-place $(jsonnet_files) diff --git a/compute/etc/README.md b/compute/etc/README.md new file mode 100644 index 0000000000..70b108146c --- /dev/null +++ b/compute/etc/README.md @@ -0,0 +1,17 @@ +# Compute Configuration + +These files are the configuration files for various other pieces of software +that will be running in the compute alongside Postgres. + +## `sql_exporter` + +### Adding a `sql_exporter` Metric + +We use `sql_exporter` to export various metrics from Postgres. In order to add +a metric, you will need to create two files: a `libsonnet` and a `sql` file.
You +will then import the `libsonnet` file in one of the collector files, and the +`sql` file will be imported in the `libsonnet` file. + +In the event your statistic is an LSN, you may want to cast it to a `float8` +because Prometheus only supports floats. It's probably fine because `float8` can +store integers from `-2^53` to `+2^53` exactly. diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet new file mode 100644 index 0000000000..8b43ebe7a3 --- /dev/null +++ b/compute/etc/neon_collector.jsonnet @@ -0,0 +1,51 @@ +{ + collector_name: 'neon_collector', + metrics: [ + import 'sql_exporter/checkpoints_req.libsonnet', + import 'sql_exporter/checkpoints_timed.libsonnet', + import 'sql_exporter/compute_current_lsn.libsonnet', + import 'sql_exporter/compute_logical_snapshot_files.libsonnet', + import 'sql_exporter/compute_receive_lsn.libsonnet', + import 'sql_exporter/compute_subscriptions_count.libsonnet', + import 'sql_exporter/connection_counts.libsonnet', + import 'sql_exporter/db_total_size.libsonnet', + import 'sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet', + import 'sql_exporter/file_cache_read_wait_seconds_count.libsonnet', + import 'sql_exporter/file_cache_read_wait_seconds_sum.libsonnet', + import 'sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet', + import 'sql_exporter/file_cache_write_wait_seconds_count.libsonnet', + import 'sql_exporter/file_cache_write_wait_seconds_sum.libsonnet', + import 'sql_exporter/getpage_prefetch_discards_total.libsonnet', + import 'sql_exporter/getpage_prefetch_misses_total.libsonnet', + import 'sql_exporter/getpage_prefetch_requests_total.libsonnet', + import 'sql_exporter/getpage_prefetches_buffered.libsonnet', + import 'sql_exporter/getpage_sync_requests_total.libsonnet', + import 'sql_exporter/getpage_wait_seconds_bucket.libsonnet', + import 'sql_exporter/getpage_wait_seconds_count.libsonnet', + import 'sql_exporter/getpage_wait_seconds_sum.libsonnet', + import 
'sql_exporter/lfc_approximate_working_set_size.libsonnet', + import 'sql_exporter/lfc_approximate_working_set_size_windows.libsonnet', + import 'sql_exporter/lfc_cache_size_limit.libsonnet', + import 'sql_exporter/lfc_hits.libsonnet', + import 'sql_exporter/lfc_misses.libsonnet', + import 'sql_exporter/lfc_used.libsonnet', + import 'sql_exporter/lfc_writes.libsonnet', + import 'sql_exporter/logical_slot_restart_lsn.libsonnet', + import 'sql_exporter/max_cluster_size.libsonnet', + import 'sql_exporter/pageserver_disconnects_total.libsonnet', + import 'sql_exporter/pageserver_requests_sent_total.libsonnet', + import 'sql_exporter/pageserver_send_flushes_total.libsonnet', + import 'sql_exporter/pageserver_open_requests.libsonnet', + import 'sql_exporter/pg_stats_userdb.libsonnet', + import 'sql_exporter/replication_delay_bytes.libsonnet', + import 'sql_exporter/replication_delay_seconds.libsonnet', + import 'sql_exporter/retained_wal.libsonnet', + import 'sql_exporter/wal_is_lost.libsonnet', + ], + queries: [ + { + query_name: 'neon_perf_counters', + query: importstr 'sql_exporter/neon_perf_counters.sql', + }, + ], +} diff --git a/compute/etc/neon_collector.yml b/compute/etc/neon_collector.yml deleted file mode 100644 index 92da0cdbdd..0000000000 --- a/compute/etc/neon_collector.yml +++ /dev/null @@ -1,331 +0,0 @@ -collector_name: neon_collector -metrics: -- metric_name: lfc_misses - type: gauge - help: 'lfc_misses' - key_labels: - values: [lfc_misses] - query: | - select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses'; - -- metric_name: lfc_used - type: gauge - help: 'LFC chunks used (chunk = 1MB)' - key_labels: - values: [lfc_used] - query: | - select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used'; - -- metric_name: lfc_hits - type: gauge - help: 'lfc_hits' - key_labels: - values: [lfc_hits] - query: | - select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits'; - -- 
metric_name: lfc_writes - type: gauge - help: 'lfc_writes' - key_labels: - values: [lfc_writes] - query: | - select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes'; - -- metric_name: lfc_cache_size_limit - type: gauge - help: 'LFC cache size limit in bytes' - key_labels: - values: [lfc_cache_size_limit] - query: | - select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; - -- metric_name: connection_counts - type: gauge - help: 'Connection counts' - key_labels: - - datname - - state - values: [count] - query: | - select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state; - -- metric_name: pg_stats_userdb - type: gauge - help: 'Stats for several oldest non-system dbs' - key_labels: - - datname - value_label: kind - values: - - db_size - - deadlocks - # Rows - - inserted - - updated - - deleted - # We export stats for 10 non-system database. Without this limit - # it is too easy to abuse the system by creating lots of databases. 
- query: | - select pg_database_size(datname) as db_size, deadlocks, - tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted, - datname - from pg_stat_database - where datname IN ( - select datname - from pg_database - where datname <> 'postgres' and not datistemplate - order by oid - limit 10 - ); - -- metric_name: max_cluster_size - type: gauge - help: 'neon.max_cluster_size setting' - key_labels: - values: [max_cluster_size] - query: | - select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size'; - -- metric_name: db_total_size - type: gauge - help: 'Size of all databases' - key_labels: - values: [total] - query: | - select sum(pg_database_size(datname)) as total from pg_database; - -- metric_name: getpage_wait_seconds_count - type: counter - help: 'Number of getpage requests' - values: [getpage_wait_seconds_count] - query_ref: neon_perf_counters - -- metric_name: getpage_wait_seconds_sum - type: counter - help: 'Time spent in getpage requests' - values: [getpage_wait_seconds_sum] - query_ref: neon_perf_counters - -- metric_name: getpage_prefetch_requests_total - type: counter - help: 'Number of getpage issued for prefetching' - values: [getpage_prefetch_requests_total] - query_ref: neon_perf_counters - -- metric_name: getpage_sync_requests_total - type: counter - help: 'Number of synchronous getpage issued' - values: [getpage_sync_requests_total] - query_ref: neon_perf_counters - -- metric_name: getpage_prefetch_misses_total - type: counter - help: 'Total number of readahead misses; consisting of either prefetches that don''t satisfy the LSN bounds once the prefetch got read by the backend, or cases where somehow no readahead was issued for the read' - values: [getpage_prefetch_misses_total] - query_ref: neon_perf_counters - -- metric_name: getpage_prefetch_discards_total - type: counter - help: 'Number of prefetch responses issued but not used' - values: [getpage_prefetch_discards_total] - query_ref: 
neon_perf_counters - -- metric_name: pageserver_requests_sent_total - type: counter - help: 'Number of all requests sent to the pageserver (not just GetPage requests)' - values: [pageserver_requests_sent_total] - query_ref: neon_perf_counters - -- metric_name: pageserver_disconnects_total - type: counter - help: 'Number of times that the connection to the pageserver was lost' - values: [pageserver_disconnects_total] - query_ref: neon_perf_counters - -- metric_name: pageserver_send_flushes_total - type: counter - help: 'Number of flushes to the pageserver connection' - values: [pageserver_send_flushes_total] - query_ref: neon_perf_counters - -- metric_name: getpage_wait_seconds_bucket - type: counter - help: 'Histogram buckets of getpage request latency' - key_labels: - - bucket_le - values: [value] - query_ref: getpage_wait_seconds_buckets - -# DEPRECATED -- metric_name: lfc_approximate_working_set_size - type: gauge - help: 'Approximate working set size in pages of 8192 bytes' - key_labels: - values: [approximate_working_set_size] - query: | - select neon.approximate_working_set_size(false) as approximate_working_set_size; - -- metric_name: lfc_approximate_working_set_size_windows - type: gauge - help: 'Approximate working set size in pages of 8192 bytes' - key_labels: [duration] - values: [size] - # NOTE: This is the "public" / "human-readable" version. Here, we supply a small selection - # of durations in a pretty-printed form. 
- query: | - select - x as duration, - neon.approximate_working_set_size_seconds(extract('epoch' from x::interval)::int) as size - from - (values ('5m'),('15m'),('1h')) as t (x); - -- metric_name: compute_current_lsn - type: gauge - help: 'Current LSN of the database' - key_labels: - values: [lsn] - query: | - select - case - when pg_catalog.pg_is_in_recovery() - then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8 - else (pg_current_wal_lsn() - '0/0')::FLOAT8 - end as lsn; - -- metric_name: compute_receive_lsn - type: gauge - help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication' - key_labels: - values: [lsn] - query: | - SELECT - CASE - WHEN pg_catalog.pg_is_in_recovery() - THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8 - ELSE 0 - END AS lsn; - -- metric_name: replication_delay_bytes - type: gauge - help: 'Bytes between received and replayed LSN' - key_labels: - values: [replication_delay_bytes] - # We use a GREATEST call here because this calculation can be negative. - # The calculation is not atomic, meaning after we've gotten the receive - # LSN, the replay LSN may have advanced past the receive LSN we - # are using for the calculation. 
- query: | - SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes; - -- metric_name: replication_delay_seconds - type: gauge - help: 'Time since last LSN was replayed' - key_labels: - values: [replication_delay_seconds] - query: | - SELECT - CASE - WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 - ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) - END AS replication_delay_seconds; - -- metric_name: checkpoints_req - type: gauge - help: 'Number of requested checkpoints' - key_labels: - values: [checkpoints_req] - query: | - SELECT checkpoints_req FROM pg_stat_bgwriter; - -- metric_name: checkpoints_timed - type: gauge - help: 'Number of scheduled checkpoints' - key_labels: - values: [checkpoints_timed] - query: | - SELECT checkpoints_timed FROM pg_stat_bgwriter; - -- metric_name: compute_logical_snapshot_files - type: gauge - help: 'Number of snapshot files in pg_logical/snapshot' - key_labels: - - timeline_id - values: [num_logical_snapshot_files] - query: | - SELECT - (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, - -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These - -- temporary snapshot files are renamed to the actual snapshot files after they are - -- completely built. We only WAL-log the completely built snapshot files. - (SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files; - -# In all the below metrics, we cast LSNs to floats because Prometheus only supports floats. -# It's probably fine because float64 can store integers from -2^53 to +2^53 exactly. - -# Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad. 
-- metric_name: logical_slot_restart_lsn - type: gauge - help: 'restart_lsn of logical slots' - key_labels: - - slot_name - values: [restart_lsn] - query: | - select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn - from pg_replication_slots - where slot_type = 'logical'; - -- metric_name: compute_subscriptions_count - type: gauge - help: 'Number of logical replication subscriptions grouped by enabled/disabled' - key_labels: - - enabled - values: [subscriptions_count] - query: | - select subenabled::text as enabled, count(*) as subscriptions_count - from pg_subscription - group by subenabled; - -- metric_name: retained_wal - type: gauge - help: 'Retained WAL in inactive replication slots' - key_labels: - - slot_name - values: [retained_wal] - query: | - SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal - FROM pg_replication_slots - WHERE active = false; - -- metric_name: wal_is_lost - type: gauge - help: 'Whether or not the replication slot wal_status is lost' - key_labels: - - slot_name - values: [wal_is_lost] - query: | - SELECT slot_name, - CASE WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost - FROM pg_replication_slots; - -queries: - - query_name: neon_perf_counters - query: | - WITH c AS ( - SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters - ) - SELECT d.* - FROM pg_catalog.jsonb_to_record((select jb from c)) as d( - getpage_wait_seconds_count numeric, - getpage_wait_seconds_sum numeric, - getpage_prefetch_requests_total numeric, - getpage_sync_requests_total numeric, - getpage_prefetch_misses_total numeric, - getpage_prefetch_discards_total numeric, - pageserver_requests_sent_total numeric, - pageserver_disconnects_total numeric, - pageserver_send_flushes_total numeric - ); - - - query_name: getpage_wait_seconds_buckets - query: | - SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'getpage_wait_seconds_bucket'; diff --git 
a/compute/etc/neon_collector_autoscaling.jsonnet b/compute/etc/neon_collector_autoscaling.jsonnet new file mode 100644 index 0000000000..e248172a3d --- /dev/null +++ b/compute/etc/neon_collector_autoscaling.jsonnet @@ -0,0 +1,11 @@ +{ + collector_name: 'neon_collector_autoscaling', + metrics: [ + import 'sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet', + import 'sql_exporter/lfc_cache_size_limit.libsonnet', + import 'sql_exporter/lfc_hits.libsonnet', + import 'sql_exporter/lfc_misses.libsonnet', + import 'sql_exporter/lfc_used.libsonnet', + import 'sql_exporter/lfc_writes.libsonnet', + ], +} diff --git a/compute/etc/neon_collector_autoscaling.yml b/compute/etc/neon_collector_autoscaling.yml deleted file mode 100644 index 5616264eba..0000000000 --- a/compute/etc/neon_collector_autoscaling.yml +++ /dev/null @@ -1,55 +0,0 @@ -collector_name: neon_collector_autoscaling -metrics: -- metric_name: lfc_misses - type: gauge - help: 'lfc_misses' - key_labels: - values: [lfc_misses] - query: | - select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses'; - -- metric_name: lfc_used - type: gauge - help: 'LFC chunks used (chunk = 1MB)' - key_labels: - values: [lfc_used] - query: | - select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used'; - -- metric_name: lfc_hits - type: gauge - help: 'lfc_hits' - key_labels: - values: [lfc_hits] - query: | - select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits'; - -- metric_name: lfc_writes - type: gauge - help: 'lfc_writes' - key_labels: - values: [lfc_writes] - query: | - select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes'; - -- metric_name: lfc_cache_size_limit - type: gauge - help: 'LFC cache size limit in bytes' - key_labels: - values: [lfc_cache_size_limit] - query: | - select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; - -- metric_name: 
lfc_approximate_working_set_size_windows - type: gauge - help: 'Approximate working set size in pages of 8192 bytes' - key_labels: [duration_seconds] - values: [size] - # NOTE: This is the "internal" / "machine-readable" version. This outputs the working set - # size looking back 1..60 minutes, labeled with the number of minutes. - query: | - select - x::text as duration_seconds, - neon.approximate_working_set_size_seconds(x) as size - from - (select generate_series * 60 as x from generate_series(1, 60)) as t (x); diff --git a/compute/etc/sql_exporter.jsonnet b/compute/etc/sql_exporter.jsonnet new file mode 100644 index 0000000000..3c36fd4f68 --- /dev/null +++ b/compute/etc/sql_exporter.jsonnet @@ -0,0 +1,40 @@ +function(collector_name, collector_file, application_name='sql_exporter') { + // Configuration for sql_exporter for autoscaling-agent + // Global defaults. + global: { + // If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. + scrape_timeout: '10s', + // Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. + scrape_timeout_offset: '500ms', + // Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. + min_interval: '0s', + // Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections, + // as will concurrent scrapes. + max_connections: 1, + // Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should + // always be the same as max_connections. + max_idle_connections: 1, + // Maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse. + // If 0, connections are not closed due to a connection's age. + max_connection_lifetime: '5m', + }, + + // The target to monitor and the collectors to execute on it. 
+ target: { + // Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) + // the schema gets dropped or replaced to match the driver expected DSN format. + data_source_name: std.format('postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=%s', [application_name]), + + // Collectors (referenced by name) to execute on the target. + // Glob patterns are supported (see for syntax). + collectors: [ + collector_name, + ], + }, + + // Collector files specifies a list of globs. One collector definition is read from each matching file. + // Glob patterns are supported (see for syntax). + collector_files: [ + collector_file, + ], +} diff --git a/compute/etc/sql_exporter.yml b/compute/etc/sql_exporter.yml deleted file mode 100644 index 139d04468a..0000000000 --- a/compute/etc/sql_exporter.yml +++ /dev/null @@ -1,33 +0,0 @@ -# Configuration for sql_exporter -# Global defaults. -global: - # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. - scrape_timeout: 10s - # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. - scrape_timeout_offset: 500ms - # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. - min_interval: 0s - # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections, - # as will concurrent scrapes. - max_connections: 1 - # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should - # always be the same as max_connections. - max_idle_connections: 1 - # Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse. - # If 0, connections are not closed due to a connection's age. - max_connection_lifetime: 5m - -# The target to monitor and the collectors to execute on it. 
-target: - # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) - # the schema gets dropped or replaced to match the driver expected DSN format. - data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter' - - # Collectors (referenced by name) to execute on the target. - # Glob patterns are supported (see for syntax). - collectors: [neon_collector] - -# Collector files specifies a list of globs. One collector definition is read from each matching file. -# Glob patterns are supported (see for syntax). -collector_files: - - "neon_collector.yml" diff --git a/compute/etc/sql_exporter/checkpoints_req.17.sql b/compute/etc/sql_exporter/checkpoints_req.17.sql new file mode 100644 index 0000000000..a4b946e8e2 --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_req.17.sql @@ -0,0 +1 @@ +SELECT num_requested AS checkpoints_req FROM pg_stat_checkpointer; diff --git a/compute/etc/sql_exporter/checkpoints_req.libsonnet b/compute/etc/sql_exporter/checkpoints_req.libsonnet new file mode 100644 index 0000000000..e5d9753507 --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_req.libsonnet @@ -0,0 +1,15 @@ +local neon = import 'neon.libsonnet'; + +local pg_stat_bgwriter = importstr 'sql_exporter/checkpoints_req.sql'; +local pg_stat_checkpointer = importstr 'sql_exporter/checkpoints_req.17.sql'; + +{ + metric_name: 'checkpoints_req', + type: 'gauge', + help: 'Number of requested checkpoints', + key_labels: null, + values: [ + 'checkpoints_req', + ], + query: if neon.PG_MAJORVERSION_NUM < 17 then pg_stat_bgwriter else pg_stat_checkpointer, +} diff --git a/compute/etc/sql_exporter/checkpoints_req.sql b/compute/etc/sql_exporter/checkpoints_req.sql new file mode 100644 index 0000000000..eb8427c883 --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_req.sql @@ -0,0 +1 @@ +SELECT checkpoints_req FROM pg_stat_bgwriter; diff --git 
a/compute/etc/sql_exporter/checkpoints_timed.17.sql b/compute/etc/sql_exporter/checkpoints_timed.17.sql new file mode 100644 index 0000000000..0d86ddb3ea --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_timed.17.sql @@ -0,0 +1 @@ +SELECT num_timed AS checkpoints_timed FROM pg_stat_checkpointer; diff --git a/compute/etc/sql_exporter/checkpoints_timed.libsonnet b/compute/etc/sql_exporter/checkpoints_timed.libsonnet new file mode 100644 index 0000000000..0ba0080188 --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_timed.libsonnet @@ -0,0 +1,15 @@ +local neon = import 'neon.libsonnet'; + +local pg_stat_bgwriter = importstr 'sql_exporter/checkpoints_timed.sql'; +local pg_stat_checkpointer = importstr 'sql_exporter/checkpoints_timed.17.sql'; + +{ + metric_name: 'checkpoints_timed', + type: 'gauge', + help: 'Number of scheduled checkpoints', + key_labels: null, + values: [ + 'checkpoints_timed', + ], + query: if neon.PG_MAJORVERSION_NUM < 17 then pg_stat_bgwriter else pg_stat_checkpointer, +} diff --git a/compute/etc/sql_exporter/checkpoints_timed.sql b/compute/etc/sql_exporter/checkpoints_timed.sql new file mode 100644 index 0000000000..c50853134c --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_timed.sql @@ -0,0 +1 @@ +SELECT checkpoints_timed FROM pg_stat_bgwriter; diff --git a/compute/etc/sql_exporter/compute_current_lsn.libsonnet b/compute/etc/sql_exporter/compute_current_lsn.libsonnet new file mode 100644 index 0000000000..ccff161358 --- /dev/null +++ b/compute/etc/sql_exporter/compute_current_lsn.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'compute_current_lsn', + type: 'gauge', + help: 'Current LSN of the database', + key_labels: null, + values: [ + 'lsn', + ], + query: importstr 'sql_exporter/compute_current_lsn.sql', +} diff --git a/compute/etc/sql_exporter/compute_current_lsn.sql b/compute/etc/sql_exporter/compute_current_lsn.sql new file mode 100644 index 0000000000..be02b8a094 --- /dev/null +++ 
b/compute/etc/sql_exporter/compute_current_lsn.sql @@ -0,0 +1,4 @@ +SELECT CASE + WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_replay_lsn() - '0/0')::FLOAT8 + ELSE (pg_current_wal_lsn() - '0/0')::FLOAT8 +END AS lsn; diff --git a/compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet b/compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet new file mode 100644 index 0000000000..212f079ccf --- /dev/null +++ b/compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'compute_logical_snapshot_files', + type: 'gauge', + help: 'Number of snapshot files in pg_logical/snapshot', + key_labels: [ + 'timeline_id', + ], + values: [ + 'num_logical_snapshot_files', + ], + query: importstr 'sql_exporter/compute_logical_snapshot_files.sql', +} diff --git a/compute/etc/sql_exporter/compute_logical_snapshot_files.sql b/compute/etc/sql_exporter/compute_logical_snapshot_files.sql new file mode 100644 index 0000000000..f2454235b7 --- /dev/null +++ b/compute/etc/sql_exporter/compute_logical_snapshot_files.sql @@ -0,0 +1,7 @@ +SELECT + (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, + -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. + -- These temporary snapshot files are renamed to the actual snapshot files + -- after they are completely built. 
We only WAL-log the completely built + -- snapshot files + (SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files; diff --git a/compute/etc/sql_exporter/compute_receive_lsn.libsonnet b/compute/etc/sql_exporter/compute_receive_lsn.libsonnet new file mode 100644 index 0000000000..eb68a77ec2 --- /dev/null +++ b/compute/etc/sql_exporter/compute_receive_lsn.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'compute_receive_lsn', + type: 'gauge', + help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication', + key_labels: null, + values: [ + 'lsn', + ], + query: importstr 'sql_exporter/compute_receive_lsn.sql', +} diff --git a/compute/etc/sql_exporter/compute_receive_lsn.sql b/compute/etc/sql_exporter/compute_receive_lsn.sql new file mode 100644 index 0000000000..318b31ab41 --- /dev/null +++ b/compute/etc/sql_exporter/compute_receive_lsn.sql @@ -0,0 +1,4 @@ +SELECT CASE + WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8 + ELSE 0 +END AS lsn; diff --git a/compute/etc/sql_exporter/compute_subscriptions_count.libsonnet b/compute/etc/sql_exporter/compute_subscriptions_count.libsonnet new file mode 100644 index 0000000000..e1575da397 --- /dev/null +++ b/compute/etc/sql_exporter/compute_subscriptions_count.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'compute_subscriptions_count', + type: 'gauge', + help: 'Number of logical replication subscriptions grouped by enabled/disabled', + key_labels: [ + 'enabled', + ], + values: [ + 'subscriptions_count', + ], + query: importstr 'sql_exporter/compute_subscriptions_count.sql', +} diff --git a/compute/etc/sql_exporter/compute_subscriptions_count.sql b/compute/etc/sql_exporter/compute_subscriptions_count.sql new file mode 100644 index 0000000000..50740cb5df --- /dev/null +++ b/compute/etc/sql_exporter/compute_subscriptions_count.sql @@ -0,0 +1 @@ +SELECT subenabled::text AS enabled, 
count(*) AS subscriptions_count FROM pg_subscription GROUP BY subenabled; diff --git a/compute/etc/sql_exporter/connection_counts.libsonnet b/compute/etc/sql_exporter/connection_counts.libsonnet new file mode 100644 index 0000000000..9f94db67a9 --- /dev/null +++ b/compute/etc/sql_exporter/connection_counts.libsonnet @@ -0,0 +1,13 @@ +{ + metric_name: 'connection_counts', + type: 'gauge', + help: 'Connection counts', + key_labels: [ + 'datname', + 'state', + ], + values: [ + 'count', + ], + query: importstr 'sql_exporter/connection_counts.sql', +} diff --git a/compute/etc/sql_exporter/connection_counts.sql b/compute/etc/sql_exporter/connection_counts.sql new file mode 100644 index 0000000000..6824480fdb --- /dev/null +++ b/compute/etc/sql_exporter/connection_counts.sql @@ -0,0 +1 @@ +SELECT datname, state, count(*) AS count FROM pg_stat_activity WHERE state <> '' GROUP BY datname, state; diff --git a/compute/etc/sql_exporter/db_total_size.libsonnet b/compute/etc/sql_exporter/db_total_size.libsonnet new file mode 100644 index 0000000000..6e08d5fb87 --- /dev/null +++ b/compute/etc/sql_exporter/db_total_size.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'db_total_size', + type: 'gauge', + help: 'Size of all databases', + key_labels: null, + values: [ + 'total', + ], + query: importstr 'sql_exporter/db_total_size.sql', +} diff --git a/compute/etc/sql_exporter/db_total_size.sql b/compute/etc/sql_exporter/db_total_size.sql new file mode 100644 index 0000000000..9cbbdfd8a3 --- /dev/null +++ b/compute/etc/sql_exporter/db_total_size.sql @@ -0,0 +1 @@ +SELECT sum(pg_database_size(datname)) AS total FROM pg_database; diff --git a/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet b/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet new file mode 100644 index 0000000000..d13f657a7f --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 
'file_cache_read_wait_seconds_bucket', + type: 'counter', + help: 'Histogram buckets of LFC read operation latencies', + key_labels: [ + 'bucket_le', + ], + values: [ + 'value', + ], + query: importstr 'sql_exporter/file_cache_read_wait_seconds_bucket.sql', +} diff --git a/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.sql b/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.sql new file mode 100644 index 0000000000..09047bf0c4 --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.sql @@ -0,0 +1 @@ +SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'file_cache_read_wait_seconds_bucket'; diff --git a/compute/etc/sql_exporter/file_cache_read_wait_seconds_count.libsonnet b/compute/etc/sql_exporter/file_cache_read_wait_seconds_count.libsonnet new file mode 100644 index 0000000000..aa028b0f5e --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_read_wait_seconds_count.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'file_cache_read_wait_seconds_count', + type: 'counter', + help: 'Number of read operations in LFC', + values: [ + 'file_cache_read_wait_seconds_count', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/file_cache_read_wait_seconds_sum.libsonnet b/compute/etc/sql_exporter/file_cache_read_wait_seconds_sum.libsonnet new file mode 100644 index 0000000000..2547aabf3d --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_read_wait_seconds_sum.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'file_cache_read_wait_seconds_sum', + type: 'counter', + help: 'Time spent in LFC read operations', + values: [ + 'file_cache_read_wait_seconds_sum', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet b/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet new file mode 100644 index 0000000000..13dbc77f76 --- /dev/null +++ 
b/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'file_cache_write_wait_seconds_bucket', + type: 'counter', + help: 'Histogram buckets of LFC write operation latencies', + key_labels: [ + 'bucket_le', + ], + values: [ + 'value', + ], + query: importstr 'sql_exporter/file_cache_write_wait_seconds_bucket.sql', +} diff --git a/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.sql b/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.sql new file mode 100644 index 0000000000..d03613cf91 --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.sql @@ -0,0 +1 @@ +SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'file_cache_write_wait_seconds_bucket'; diff --git a/compute/etc/sql_exporter/file_cache_write_wait_seconds_count.libsonnet b/compute/etc/sql_exporter/file_cache_write_wait_seconds_count.libsonnet new file mode 100644 index 0000000000..6227d3193a --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_write_wait_seconds_count.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'file_cache_write_wait_seconds_count', + type: 'counter', + help: 'Number of write operations in LFC', + values: [ + 'file_cache_write_wait_seconds_count', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/file_cache_write_wait_seconds_sum.libsonnet b/compute/etc/sql_exporter/file_cache_write_wait_seconds_sum.libsonnet new file mode 100644 index 0000000000..2acfe7f608 --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_write_wait_seconds_sum.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'file_cache_write_wait_seconds_sum', + type: 'counter', + help: 'Time spent in LFC write operations', + values: [ + 'file_cache_write_wait_seconds_sum', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet b/compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet new file mode 
100644 index 0000000000..935e35d2e4 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_prefetch_discards_total', + type: 'counter', + help: 'Number of prefetch responses issued but not used', + values: [ + 'getpage_prefetch_discards_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet b/compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet new file mode 100644 index 0000000000..b9a9632105 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_prefetch_misses_total', + type: 'counter', + help: "Total number of readahead misses; consisting of either prefetches that don't satisfy the LSN bounds once the prefetch got read by the backend, or cases where somehow no readahead was issued for the read", + values: [ + 'getpage_prefetch_misses_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet b/compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet new file mode 100644 index 0000000000..75fdb6717b --- /dev/null +++ b/compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_prefetch_requests_total', + type: 'counter', + help: 'Number of getpage issued for prefetching', + values: [ + 'getpage_prefetch_requests_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_prefetches_buffered.libsonnet b/compute/etc/sql_exporter/getpage_prefetches_buffered.libsonnet new file mode 100644 index 0000000000..8926d867c9 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_prefetches_buffered.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_prefetches_buffered', + type: 'gauge', + help: 'Number of prefetched pages buffered in neon', + values: [ + 
'getpage_prefetches_buffered', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet b/compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet new file mode 100644 index 0000000000..f3a1e6b339 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_sync_requests_total', + type: 'counter', + help: 'Number of synchronous getpage issued', + values: [ + 'getpage_sync_requests_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet new file mode 100644 index 0000000000..2adda2ad03 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'getpage_wait_seconds_bucket', + type: 'counter', + help: 'Histogram buckets of getpage request latency', + key_labels: [ + 'bucket_le', + ], + values: [ + 'value', + ], + query: importstr 'sql_exporter/getpage_wait_seconds_bucket.sql', +} diff --git a/compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql new file mode 100644 index 0000000000..b4a6bc1560 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql @@ -0,0 +1 @@ +SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'getpage_wait_seconds_bucket'; diff --git a/compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet b/compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet new file mode 100644 index 0000000000..d2326974fc --- /dev/null +++ b/compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_wait_seconds_count', + type: 'counter', + help: 'Number of getpage requests', + values: [ + 'getpage_wait_seconds_count', + ], + query_ref: 'neon_perf_counters', +} diff --git 
a/compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet b/compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet new file mode 100644 index 0000000000..844c8419ff --- /dev/null +++ b/compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_wait_seconds_sum', + type: 'counter', + help: 'Time spent in getpage requests', + values: [ + 'getpage_wait_seconds_sum', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet b/compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet new file mode 100644 index 0000000000..78859ce60d --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet @@ -0,0 +1,12 @@ +// DEPRECATED + +{ + metric_name: 'lfc_approximate_working_set_size', + type: 'gauge', + help: 'Approximate working set size in pages of 8192 bytes', + key_labels: null, + values: [ + 'approximate_working_set_size', + ], + query: importstr 'sql_exporter/lfc_approximate_working_set_size.sql', +} diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size.sql new file mode 100644 index 0000000000..de509ebb47 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size.sql @@ -0,0 +1 @@ +SELECT neon.approximate_working_set_size(false) AS approximate_working_set_size; diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet new file mode 100644 index 0000000000..a54deca467 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'lfc_approximate_working_set_size_windows', + type: 'gauge', + help: 'Approximate working set size in pages of 8192 bytes', + key_labels: [ + 'duration_seconds', + ], + values: [ + 
'size', + ], + query: importstr 'sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql', +} diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql new file mode 100644 index 0000000000..35fa42c34c --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql @@ -0,0 +1,8 @@ +-- NOTE: This is the "internal" / "machine-readable" version. This outputs the +-- working set size looking back 1..60 minutes, labeled with the number of +-- minutes. + +SELECT + x::text as duration_seconds, + neon.approximate_working_set_size_seconds(x) AS size +FROM (SELECT generate_series * 60 AS x FROM generate_series(1, 60)) AS t (x); diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet new file mode 100644 index 0000000000..4970bd2c7f --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'lfc_approximate_working_set_size_windows', + type: 'gauge', + help: 'Approximate working set size in pages of 8192 bytes', + key_labels: [ + 'duration', + ], + values: [ + 'size', + ], + query: importstr 'sql_exporter/lfc_approximate_working_set_size_windows.sql', +} diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql new file mode 100644 index 0000000000..46c7d1610c --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql @@ -0,0 +1,8 @@ +-- NOTE: This is the "public" / "human-readable" version. Here, we supply a +-- small selection of durations in a pretty-printed form. 
+ +SELECT + x AS duration, + neon.approximate_working_set_size_seconds(extract('epoch' FROM x::interval)::int) AS size FROM ( + VALUES ('5m'), ('15m'), ('1h') + ) AS t (x); diff --git a/compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet b/compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet new file mode 100644 index 0000000000..4cbbd76621 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_cache_size_limit', + type: 'gauge', + help: 'LFC cache size limit in bytes', + key_labels: null, + values: [ + 'lfc_cache_size_limit', + ], + query: importstr 'sql_exporter/lfc_cache_size_limit.sql', +} diff --git a/compute/etc/sql_exporter/lfc_cache_size_limit.sql b/compute/etc/sql_exporter/lfc_cache_size_limit.sql new file mode 100644 index 0000000000..378904c1fe --- /dev/null +++ b/compute/etc/sql_exporter/lfc_cache_size_limit.sql @@ -0,0 +1 @@ +SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit; diff --git a/compute/etc/sql_exporter/lfc_hits.libsonnet b/compute/etc/sql_exporter/lfc_hits.libsonnet new file mode 100644 index 0000000000..4a0b7671bf --- /dev/null +++ b/compute/etc/sql_exporter/lfc_hits.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_hits', + type: 'gauge', + help: 'lfc_hits', + key_labels: null, + values: [ + 'lfc_hits', + ], + query: importstr 'sql_exporter/lfc_hits.sql', +} diff --git a/compute/etc/sql_exporter/lfc_hits.sql b/compute/etc/sql_exporter/lfc_hits.sql new file mode 100644 index 0000000000..2e14f5c73c --- /dev/null +++ b/compute/etc/sql_exporter/lfc_hits.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_hits FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_hits'; diff --git a/compute/etc/sql_exporter/lfc_misses.libsonnet b/compute/etc/sql_exporter/lfc_misses.libsonnet new file mode 100644 index 0000000000..302998d04f --- /dev/null +++ b/compute/etc/sql_exporter/lfc_misses.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_misses', + type: 
'gauge', + help: 'lfc_misses', + key_labels: null, + values: [ + 'lfc_misses', + ], + query: importstr 'sql_exporter/lfc_misses.sql', +} diff --git a/compute/etc/sql_exporter/lfc_misses.sql b/compute/etc/sql_exporter/lfc_misses.sql new file mode 100644 index 0000000000..27ed4ecf86 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_misses.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_misses FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_misses'; diff --git a/compute/etc/sql_exporter/lfc_used.libsonnet b/compute/etc/sql_exporter/lfc_used.libsonnet new file mode 100644 index 0000000000..23891dadaf --- /dev/null +++ b/compute/etc/sql_exporter/lfc_used.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_used', + type: 'gauge', + help: 'LFC chunks used (chunk = 1MB)', + key_labels: null, + values: [ + 'lfc_used', + ], + query: importstr 'sql_exporter/lfc_used.sql', +} diff --git a/compute/etc/sql_exporter/lfc_used.sql b/compute/etc/sql_exporter/lfc_used.sql new file mode 100644 index 0000000000..4f01545f30 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_used.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_used FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_used'; diff --git a/compute/etc/sql_exporter/lfc_writes.libsonnet b/compute/etc/sql_exporter/lfc_writes.libsonnet new file mode 100644 index 0000000000..6a22ee1dd9 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_writes.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_writes', + type: 'gauge', + help: 'lfc_writes', + key_labels: null, + values: [ + 'lfc_writes', + ], + query: importstr 'sql_exporter/lfc_writes.sql', +} diff --git a/compute/etc/sql_exporter/lfc_writes.sql b/compute/etc/sql_exporter/lfc_writes.sql new file mode 100644 index 0000000000..37c9abc9cf --- /dev/null +++ b/compute/etc/sql_exporter/lfc_writes.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_writes FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_writes'; diff --git a/compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet 
b/compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet new file mode 100644 index 0000000000..8ef31b5d8d --- /dev/null +++ b/compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet @@ -0,0 +1,15 @@ +// Number of slots is limited by max_replication_slots, so collecting position +// for all of them shouldn't be bad. + +{ + metric_name: 'logical_slot_restart_lsn', + type: 'gauge', + help: 'restart_lsn of logical slots', + key_labels: [ + 'slot_name', + ], + values: [ + 'restart_lsn', + ], + query: importstr 'sql_exporter/logical_slot_restart_lsn.sql', +} diff --git a/compute/etc/sql_exporter/logical_slot_restart_lsn.sql b/compute/etc/sql_exporter/logical_slot_restart_lsn.sql new file mode 100644 index 0000000000..1b1c038501 --- /dev/null +++ b/compute/etc/sql_exporter/logical_slot_restart_lsn.sql @@ -0,0 +1,3 @@ +SELECT slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn +FROM pg_replication_slots +WHERE slot_type = 'logical'; diff --git a/compute/etc/sql_exporter/max_cluster_size.libsonnet b/compute/etc/sql_exporter/max_cluster_size.libsonnet new file mode 100644 index 0000000000..1352fb77ee --- /dev/null +++ b/compute/etc/sql_exporter/max_cluster_size.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'max_cluster_size', + type: 'gauge', + help: 'neon.max_cluster_size setting', + key_labels: null, + values: [ + 'max_cluster_size', + ], + query: importstr 'sql_exporter/max_cluster_size.sql', +} diff --git a/compute/etc/sql_exporter/max_cluster_size.sql b/compute/etc/sql_exporter/max_cluster_size.sql new file mode 100644 index 0000000000..2d2355a9a7 --- /dev/null +++ b/compute/etc/sql_exporter/max_cluster_size.sql @@ -0,0 +1 @@ +SELECT setting::int AS max_cluster_size FROM pg_settings WHERE name = 'neon.max_cluster_size'; diff --git a/compute/etc/sql_exporter/neon_perf_counters.sql b/compute/etc/sql_exporter/neon_perf_counters.sql new file mode 100644 index 0000000000..4a36f3bf2f --- /dev/null +++ b/compute/etc/sql_exporter/neon_perf_counters.sql @@ -0,0 
+1,19 @@ +WITH c AS (SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters) + +SELECT d.* FROM pg_catalog.jsonb_to_record((SELECT jb FROM c)) AS d( + file_cache_read_wait_seconds_count numeric, + file_cache_read_wait_seconds_sum numeric, + file_cache_write_wait_seconds_count numeric, + file_cache_write_wait_seconds_sum numeric, + getpage_wait_seconds_count numeric, + getpage_wait_seconds_sum numeric, + getpage_prefetch_requests_total numeric, + getpage_sync_requests_total numeric, + getpage_prefetch_misses_total numeric, + getpage_prefetch_discards_total numeric, + getpage_prefetches_buffered numeric, + pageserver_requests_sent_total numeric, + pageserver_disconnects_total numeric, + pageserver_send_flushes_total numeric, + pageserver_open_requests numeric +); diff --git a/compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet b/compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet new file mode 100644 index 0000000000..5ad9ba078e --- /dev/null +++ b/compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'pageserver_disconnects_total', + type: 'counter', + help: 'Number of times that the connection to the pageserver was lost', + values: [ + 'pageserver_disconnects_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/pageserver_open_requests.libsonnet b/compute/etc/sql_exporter/pageserver_open_requests.libsonnet new file mode 100644 index 0000000000..dca89ea64a --- /dev/null +++ b/compute/etc/sql_exporter/pageserver_open_requests.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'pageserver_open_requests', + type: 'gauge', + help: 'Number of open requests to PageServer', + values: [ + 'pageserver_open_requests', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet b/compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet new file mode 100644 index 0000000000..c191e2467f 
--- /dev/null +++ b/compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'pageserver_requests_sent_total', + type: 'counter', + help: 'Number of all requests sent to the pageserver (not just GetPage requests)', + values: [ + 'pageserver_requests_sent_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet b/compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet new file mode 100644 index 0000000000..9fa5f77758 --- /dev/null +++ b/compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'pageserver_send_flushes_total', + type: 'counter', + help: 'Number of flushes to the pageserver connection', + values: [ + 'pageserver_send_flushes_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/pg_stats_userdb.libsonnet b/compute/etc/sql_exporter/pg_stats_userdb.libsonnet new file mode 100644 index 0000000000..46ea2f4192 --- /dev/null +++ b/compute/etc/sql_exporter/pg_stats_userdb.libsonnet @@ -0,0 +1,18 @@ +{ + metric_name: 'pg_stats_userdb', + type: 'gauge', + help: 'Stats for several oldest non-system dbs', + key_labels: [ + 'datname', + ], + value_label: 'kind', + values: [ + 'db_size', + 'deadlocks', + // Rows + 'inserted', + 'updated', + 'deleted', + ], + query: importstr 'sql_exporter/pg_stats_userdb.sql', +} diff --git a/compute/etc/sql_exporter/pg_stats_userdb.sql b/compute/etc/sql_exporter/pg_stats_userdb.sql new file mode 100644 index 0000000000..00ada87370 --- /dev/null +++ b/compute/etc/sql_exporter/pg_stats_userdb.sql @@ -0,0 +1,10 @@ +-- We export stats for 10 non-system databases. Without this limit it is too +-- easy to abuse the system by creating lots of databases. 
+ +SELECT pg_database_size(datname) AS db_size, deadlocks, tup_inserted AS inserted, + tup_updated AS updated, tup_deleted AS deleted, datname +FROM pg_stat_database +WHERE datname IN ( + SELECT datname FROM pg_database + WHERE datname <> 'postgres' AND NOT datistemplate ORDER BY oid LIMIT 10 +); diff --git a/compute/etc/sql_exporter/replication_delay_bytes.libsonnet b/compute/etc/sql_exporter/replication_delay_bytes.libsonnet new file mode 100644 index 0000000000..3e5bb6af1f --- /dev/null +++ b/compute/etc/sql_exporter/replication_delay_bytes.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'replication_delay_bytes', + type: 'gauge', + help: 'Bytes between received and replayed LSN', + key_labels: null, + values: [ + 'replication_delay_bytes', + ], + query: importstr 'sql_exporter/replication_delay_bytes.sql', +} diff --git a/compute/etc/sql_exporter/replication_delay_bytes.sql b/compute/etc/sql_exporter/replication_delay_bytes.sql new file mode 100644 index 0000000000..60a6981acd --- /dev/null +++ b/compute/etc/sql_exporter/replication_delay_bytes.sql @@ -0,0 +1,6 @@ +-- We use a GREATEST call here because this calculation can be negative. The +-- calculation is not atomic, meaning after we've gotten the receive LSN, the +-- replay LSN may have advanced past the receive LSN we are using for the +-- calculation. 
+ +SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes; diff --git a/compute/etc/sql_exporter/replication_delay_seconds.libsonnet b/compute/etc/sql_exporter/replication_delay_seconds.libsonnet new file mode 100644 index 0000000000..d3f2c21b54 --- /dev/null +++ b/compute/etc/sql_exporter/replication_delay_seconds.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'replication_delay_seconds', + type: 'gauge', + help: 'Time since last LSN was replayed', + key_labels: null, + values: [ + 'replication_delay_seconds', + ], + query: importstr 'sql_exporter/replication_delay_seconds.sql', +} diff --git a/compute/etc/sql_exporter/replication_delay_seconds.sql b/compute/etc/sql_exporter/replication_delay_seconds.sql new file mode 100644 index 0000000000..a76809ad74 --- /dev/null +++ b/compute/etc/sql_exporter/replication_delay_seconds.sql @@ -0,0 +1,5 @@ +SELECT + CASE + WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 + ELSE GREATEST(0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) + END AS replication_delay_seconds; diff --git a/compute/etc/sql_exporter/retained_wal.libsonnet b/compute/etc/sql_exporter/retained_wal.libsonnet new file mode 100644 index 0000000000..f9eff5faa5 --- /dev/null +++ b/compute/etc/sql_exporter/retained_wal.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'retained_wal', + type: 'gauge', + help: 'Retained WAL in inactive replication slots', + key_labels: [ + 'slot_name', + ], + values: [ + 'retained_wal', + ], + query: importstr 'sql_exporter/retained_wal.sql', +} diff --git a/compute/etc/sql_exporter/retained_wal.sql b/compute/etc/sql_exporter/retained_wal.sql new file mode 100644 index 0000000000..6c58359461 --- /dev/null +++ b/compute/etc/sql_exporter/retained_wal.sql @@ -0,0 +1,5 @@ +SELECT + slot_name, + pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal +FROM pg_replication_slots +WHERE active = false; diff --git 
a/compute/etc/sql_exporter/wal_is_lost.libsonnet b/compute/etc/sql_exporter/wal_is_lost.libsonnet new file mode 100644 index 0000000000..3cd25f4b39 --- /dev/null +++ b/compute/etc/sql_exporter/wal_is_lost.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'wal_is_lost', + type: 'gauge', + help: 'Whether or not the replication slot wal_status is lost', + key_labels: [ + 'slot_name', + ], + values: [ + 'wal_is_lost', + ], + query: importstr 'sql_exporter/wal_is_lost.sql', +} diff --git a/compute/etc/sql_exporter/wal_is_lost.sql b/compute/etc/sql_exporter/wal_is_lost.sql new file mode 100644 index 0000000000..5521270851 --- /dev/null +++ b/compute/etc/sql_exporter/wal_is_lost.sql @@ -0,0 +1,7 @@ +SELECT + slot_name, + CASE + WHEN wal_status = 'lost' THEN 1 + ELSE 0 + END AS wal_is_lost +FROM pg_replication_slots; diff --git a/compute/etc/sql_exporter_autoscaling.yml b/compute/etc/sql_exporter_autoscaling.yml deleted file mode 100644 index 044557233e..0000000000 --- a/compute/etc/sql_exporter_autoscaling.yml +++ /dev/null @@ -1,33 +0,0 @@ -# Configuration for sql_exporter for autoscaling-agent -# Global defaults. -global: - # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. - scrape_timeout: 10s - # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. - scrape_timeout_offset: 500ms - # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. - min_interval: 0s - # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections, - # as will concurrent scrapes. - max_connections: 1 - # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should - # always be the same as max_connections. - max_idle_connections: 1 - # Maximum number of maximum amount of time a connection may be reused. 
Expired connections may be closed lazily before reuse. - # If 0, connections are not closed due to a connection's age. - max_connection_lifetime: 5m - -# The target to monitor and the collectors to execute on it. -target: - # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) - # the schema gets dropped or replaced to match the driver expected DSN format. - data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling' - - # Collectors (referenced by name) to execute on the target. - # Glob patterns are supported (see for syntax). - collectors: [neon_collector_autoscaling] - -# Collector files specifies a list of globs. One collector definition is read from each matching file. -# Glob patterns are supported (see for syntax). -collector_files: - - "neon_collector_autoscaling.yml" diff --git a/compute/jsonnet/neon.libsonnet b/compute/jsonnet/neon.libsonnet new file mode 100644 index 0000000000..583b631c58 --- /dev/null +++ b/compute/jsonnet/neon.libsonnet @@ -0,0 +1,16 @@ +local MIN_SUPPORTED_VERSION = 14; +local MAX_SUPPORTED_VERSION = 17; +local SUPPORTED_VERSIONS = std.range(MIN_SUPPORTED_VERSION, MAX_SUPPORTED_VERSION); + +# If we receive the pg_version with a leading "v", ditch it. +local pg_version = std.strReplace(std.extVar('pg_version'), 'v', ''); +local pg_version_num = std.parseInt(pg_version); + +assert std.setMember(pg_version_num, SUPPORTED_VERSIONS) : + std.format('%s is an unsupported Postgres version: %s', + [pg_version, std.toString(SUPPORTED_VERSIONS)]); + +{ + PG_MAJORVERSION: pg_version, + PG_MAJORVERSION_NUM: pg_version_num, +} diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml new file mode 100644 index 0000000000..51a55b513f --- /dev/null +++ b/compute/vm-image-spec-bookworm.yaml @@ -0,0 +1,126 @@ +# Supplemental file for neondatabase/autoscaling's vm-builder, for producing the VM compute image. 
+--- +commands: + - name: cgconfigparser + user: root + sysvInitAction: sysinit + shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664' + # restrict permissions on /neonvm/bin/resize-swap, because we grant access to compute_ctl for + # running it as root. + - name: chmod-resize-swap + user: root + sysvInitAction: sysinit + shell: 'chmod 711 /neonvm/bin/resize-swap' + - name: chmod-set-disk-quota + user: root + sysvInitAction: sysinit + shell: 'chmod 711 /neonvm/bin/set-disk-quota' + - name: pgbouncer + user: postgres + sysvInitAction: respawn + shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini' + - name: local_proxy + user: postgres + sysvInitAction: respawn + shell: '/usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432' + - name: postgres-exporter + user: nobody + sysvInitAction: respawn + shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter' + - name: sql-exporter + user: nobody + sysvInitAction: respawn + shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml -web.listen-address=:9399' + - name: sql-exporter-autoscaling + user: nobody + sysvInitAction: respawn + shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499' +shutdownHook: | + su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' +files: + - filename: compute_ctl-sudoers + content: | + # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap + # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), + # regardless of hostname (ALL) + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota + - filename: cgconfig.conf + content: | + # Configuration for cgroups in VM compute nodes + group neon-postgres { + perm { + admin { + uid = postgres; + } + task { 
+ gid = users; + } + } + memory {} + } +build: | + # Build cgroup-tools + # + # At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically + # libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor + # requires cgroup v2, so we'll build cgroup-tools ourselves. + # + # At time of migration to bookworm (2024-10-09), debian has a version of libcgroup/cgroup-tools 2.0.2, + # and it _probably_ can be used as-is. However, we'll build it ourselves to minimise the changeset + # for debian version migration. + # + FROM debian:bookworm-slim as libcgroup-builder + ENV LIBCGROUP_VERSION=v2.0.3 + + RUN set -exu \ + && apt update \ + && apt install --no-install-recommends -y \ + git \ + ca-certificates \ + automake \ + cmake \ + make \ + gcc \ + byacc \ + flex \ + libtool \ + libpam0g-dev \ + && git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \ + && INSTALL_DIR="/libcgroup-install" \ + && mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \ + && cd libcgroup \ + # extracted from bootstrap.sh, with modified flags: + && (test -d m4 || mkdir m4) \ + && autoreconf -fi \ + && rm -rf autom4te.cache \ + && CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \ + # actually build the thing... + && make install +merge: | + # tweak nofile limits + RUN set -e \ + && echo 'fs.file-max = 1048576' >>/etc/sysctl.conf \ + && test ! -e /etc/security || ( \ + echo '* - nofile 1048576' >>/etc/security/limits.conf \ + && echo 'root - nofile 1048576' >>/etc/security/limits.conf \ + ) + + # Allow postgres user (compute_ctl) to run swap resizer. + # Need to install sudo in order to allow this. + # + # Also, remove the 'read' permission from group/other on /neonvm/bin/resize-swap, just to be safe. 
+ RUN set -e \ + && apt update \ + && apt install --no-install-recommends -y \ + sudo \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + COPY compute_ctl-sudoers /etc/sudoers.d/compute_ctl-sudoers + + COPY cgconfig.conf /etc/cgconfig.conf + + RUN set -e \ + && chmod 0644 /etc/cgconfig.conf + + COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ + COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/ + COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/ diff --git a/compute/vm-image-spec.yaml b/compute/vm-image-spec-bullseye.yaml similarity index 100% rename from compute/vm-image-spec.yaml rename to compute/vm-image-spec-bullseye.yaml diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 285be56264..c9dd4dcfc5 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -15,6 +15,7 @@ use std::time::Instant; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; +use compute_api::spec::PgIdent; use futures::future::join_all; use futures::stream::FuturesUnordered; use futures::StreamExt; @@ -25,8 +26,9 @@ use tracing::{debug, error, info, instrument, warn}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; +use compute_api::privilege::Privilege; use compute_api::responses::{ComputeMetrics, ComputeStatus}; -use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec}; +use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec, ExtVersion}; use utils::measured_stream::MeasuredReader; use nix::sys::signal::{kill, Signal}; @@ -34,6 +36,7 @@ use nix::sys::signal::{kill, Signal}; use remote_storage::{DownloadError, RemotePath}; use crate::checker::create_availability_check_data; +use crate::installed_extensions::get_installed_extensions_sync; use crate::local_proxy; use crate::logger::inlinify; use crate::pg_helpers::*; @@ -1121,6 +1124,11 @@ impl ComputeNode { self.pg_reload_conf()?; } self.post_apply_config()?; + + let connstr = self.connstr.clone(); + 
thread::spawn(move || { + get_installed_extensions_sync(connstr).context("get_installed_extensions") + }); } let startup_end_time = Utc::now(); @@ -1367,6 +1375,97 @@ LIMIT 100", download_size } + pub async fn set_role_grants( + &self, + db_name: &PgIdent, + schema_name: &PgIdent, + privileges: &[Privilege], + role_name: &PgIdent, + ) -> Result<()> { + use tokio_postgres::config::Config; + use tokio_postgres::NoTls; + + let mut conf = Config::from_str(self.connstr.as_str()).unwrap(); + conf.dbname(db_name); + + let (db_client, conn) = conf + .connect(NoTls) + .await + .context("Failed to connect to the database")?; + tokio::spawn(conn); + + // TODO: support other types of grants apart from schemas? + let query = format!( + "GRANT {} ON SCHEMA {} TO {}", + privileges + .iter() + // should not be quoted as it's part of the command. + // is already sanitized so it's ok + .map(|p| p.as_str()) + .collect::>() + .join(", "), + // quote the schema and role name as identifiers to sanitize them. + schema_name.pg_quote(), + role_name.pg_quote(), + ); + db_client + .simple_query(&query) + .await + .with_context(|| format!("Failed to execute query: {}", query))?; + + Ok(()) + } + + pub async fn install_extension( + &self, + ext_name: &PgIdent, + db_name: &PgIdent, + ext_version: ExtVersion, + ) -> Result { + use tokio_postgres::config::Config; + use tokio_postgres::NoTls; + + let mut conf = Config::from_str(self.connstr.as_str()).unwrap(); + conf.dbname(db_name); + + let (db_client, conn) = conf + .connect(NoTls) + .await + .context("Failed to connect to the database")?; + tokio::spawn(conn); + + let version_query = "SELECT extversion FROM pg_extension WHERE extname = $1"; + let version: Option = db_client + .query_opt(version_query, &[&ext_name]) + .await + .with_context(|| format!("Failed to execute query: {}", version_query))? + .map(|row| row.get(0)); + + // sanitize the inputs as postgres idents. 
+ let ext_name: String = ext_name.pg_quote(); + let quoted_version: String = ext_version.pg_quote(); + + if let Some(installed_version) = version { + if installed_version == ext_version { + return Ok(installed_version); + } + let query = format!("ALTER EXTENSION {ext_name} UPDATE TO {quoted_version}"); + db_client + .simple_query(&query) + .await + .with_context(|| format!("Failed to execute query: {}", query))?; + } else { + let query = + format!("CREATE EXTENSION IF NOT EXISTS {ext_name} WITH VERSION {quoted_version}"); + db_client + .simple_query(&query) + .await + .with_context(|| format!("Failed to execute query: {}", query))?; + } + + Ok(ext_version) + } + #[tokio::main] pub async fn prepare_preload_libraries( &self, @@ -1484,28 +1583,6 @@ LIMIT 100", info!("Pageserver config changed"); } } - - // Gather info about installed extensions - pub fn get_installed_extensions(&self) -> Result<()> { - let connstr = self.connstr.clone(); - - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .expect("failed to create runtime"); - let result = rt - .block_on(crate::installed_extensions::get_installed_extensions( - connstr, - )) - .expect("failed to get installed extensions"); - - info!( - "{}", - serde_json::to_string(&result).expect("failed to serialize extensions list") - ); - - Ok(()) - } } pub fn forward_termination_signal() { diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index 6ef7e0837f..da2d107b54 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -107,7 +107,7 @@ pub fn get_pg_version(pgbin: &str) -> String { // pg_config --version returns a (platform specific) human readable string // such as "PostgreSQL 15.4". We parse this to v14/v15/v16 etc. 
let human_version = get_pg_config("--version", pgbin); - return parse_pg_version(&human_version).to_string(); + parse_pg_version(&human_version).to_string() } fn parse_pg_version(human_version: &str) -> &str { diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 79e6158081..af35f71bf2 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -9,8 +9,11 @@ use crate::catalog::SchemaDumpError; use crate::catalog::{get_database_schema, get_dbs_and_roles}; use crate::compute::forward_termination_signal; use crate::compute::{ComputeNode, ComputeState, ParsedSpec}; -use compute_api::requests::ConfigurationRequest; -use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError}; +use compute_api::requests::{ConfigurationRequest, ExtensionInstallRequest, SetRoleGrantsRequest}; +use compute_api::responses::{ + ComputeStatus, ComputeStatusResponse, ExtensionInstallResult, GenericAPIError, + SetRoleGrantsResponse, +}; use anyhow::Result; use hyper::header::CONTENT_TYPE; @@ -98,6 +101,38 @@ async fn routes(req: Request, compute: &Arc) -> Response { + info!("serving /extensions POST request"); + let status = compute.get_status(); + if status != ComputeStatus::Running { + let msg = format!( + "invalid compute status for extensions request: {:?}", + status + ); + error!(msg); + return render_json_error(&msg, StatusCode::PRECONDITION_FAILED); + } + + let request = hyper::body::to_bytes(req.into_body()).await.unwrap(); + let request = serde_json::from_slice::(&request).unwrap(); + let res = compute + .install_extension(&request.extension, &request.database, request.version) + .await; + match res { + Ok(version) => render_json(Body::from( + serde_json::to_string(&ExtensionInstallResult { + extension: request.extension, + version, + }) + .unwrap(), + )), + Err(e) => { + error!("install_extension failed: {}", e); + render_json_error(&e.to_string(), StatusCode::INTERNAL_SERVER_ERROR) + } + } + } + (&Method::GET, 
"/info") => { let num_cpus = num_cpus::get_physical(); info!("serving /info GET request. num_cpus: {}", num_cpus); @@ -165,6 +200,48 @@ async fn routes(req: Request, compute: &Arc) -> Response { + info!("serving /grants POST request"); + let status = compute.get_status(); + if status != ComputeStatus::Running { + let msg = format!( + "invalid compute status for set_role_grants request: {:?}", + status + ); + error!(msg); + return render_json_error(&msg, StatusCode::PRECONDITION_FAILED); + } + + let request = hyper::body::to_bytes(req.into_body()).await.unwrap(); + let request = serde_json::from_slice::(&request).unwrap(); + + let res = compute + .set_role_grants( + &request.database, + &request.schema, + &request.privileges, + &request.role, + ) + .await; + match res { + Ok(()) => render_json(Body::from( + serde_json::to_string(&SetRoleGrantsResponse { + database: request.database, + schema: request.schema, + role: request.role, + privileges: request.privileges, + }) + .unwrap(), + )), + Err(e) => render_json_error( + &format!("could not grant role privileges to the schema: {e}"), + // TODO: can we filter on role/schema not found errors + // and return appropriate error code? + StatusCode::INTERNAL_SERVER_ERROR, + ), + } + } + // get the list of installed extensions // currently only used in python tests // TODO: call it from cplane diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index e9fa66b323..11eee6ccfd 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -127,6 +127,41 @@ paths: schema: $ref: "#/components/schemas/GenericError" + /grants: + post: + tags: + - Grants + summary: Apply grants to the database. + description: "" + operationId: setRoleGrants + requestBody: + description: Grants request. + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/SetRoleGrantsRequest" + responses: + 200: + description: Grants applied. 
+ content: + application/json: + schema: + $ref: "#/components/schemas/SetRoleGrantsResponse" + 412: + description: | + Compute is not in the right state for processing the request. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + 500: + description: Error occurred during grants application. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + /check_writability: post: tags: @@ -144,6 +179,41 @@ paths: description: Error text or 'true' if check passed. example: "true" + /extensions: + post: + tags: + - Extensions + summary: Install extension if possible. + description: "" + operationId: installExtension + requestBody: + description: Extension name and database to install it to. + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/ExtensionInstallRequest" + responses: + 200: + description: Result from extension installation + content: + application/json: + schema: + $ref: "#/components/schemas/ExtensionInstallResult" + 412: + description: | + Compute is in the wrong state for processing the request. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + 500: + description: Error during extension installation. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + /configure: post: tags: @@ -369,7 +439,7 @@ components: moment, when spec was received. example: "2022-10-12T07:20:50.52Z" status: - $ref: '#/components/schemas/ComputeStatus' + $ref: "#/components/schemas/ComputeStatus" last_active: type: string description: | @@ -409,6 +479,38 @@ components: - configuration example: running + ExtensionInstallRequest: + type: object + required: + - extension + - database + - version + properties: + extension: + type: string + description: Extension name. + example: "pg_session_jwt" + version: + type: string + description: Version of the extension. 
+ example: "1.0.0" + database: + type: string + description: Database name. + example: "neondb" + + ExtensionInstallResult: + type: object + properties: + extension: + description: Name of the extension. + type: string + example: "pg_session_jwt" + version: + description: Version of the extension. + type: string + example: "1.0.0" + InstalledExtensions: type: object properties: @@ -427,6 +529,60 @@ components: n_databases: type: integer + SetRoleGrantsRequest: + type: object + required: + - database + - schema + - privileges + - role + properties: + database: + type: string + description: Database name. + example: "neondb" + schema: + type: string + description: Schema name. + example: "public" + privileges: + type: array + items: + type: string + description: List of privileges to set. + example: ["SELECT", "INSERT"] + role: + type: string + description: Role name. + example: "neon" + + SetRoleGrantsResponse: + type: object + required: + - database + - schema + - privileges + - role + properties: + database: + type: string + description: Database name. + example: "neondb" + schema: + type: string + description: Schema name. + example: "public" + privileges: + type: array + items: + type: string + description: List of privileges set. + example: ["SELECT", "INSERT"] + role: + type: string + description: Role name. + example: "neon" + # # Errors # diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 3d8b22a8a3..877f99bff7 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -1,6 +1,7 @@ use compute_api::responses::{InstalledExtension, InstalledExtensions}; use std::collections::HashMap; use std::collections::HashSet; +use tracing::info; use url::Url; use anyhow::Result; @@ -33,6 +34,7 @@ fn list_dbs(client: &mut Client) -> Result> { } /// Connect to every database (see list_dbs above) and get the list of installed extensions. 
+/// /// Same extension can be installed in multiple databases with different versions, /// we only keep the highest and lowest version across all databases. pub async fn get_installed_extensions(connstr: Url) -> Result { @@ -78,3 +80,23 @@ pub async fn get_installed_extensions(connstr: Url) -> Result Result<()> { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("failed to create runtime"); + let result = rt + .block_on(crate::installed_extensions::get_installed_extensions( + connstr, + )) + .expect("failed to get installed extensions"); + + info!( + "[NEON_EXT_STAT] {}", + serde_json::to_string(&result).expect("failed to serialize extensions list") + ); + + Ok(()) +} diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 7cdf621737..71514daa7c 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -97,7 +97,21 @@ impl ComputeControlPlane { for endpoint_dir in std::fs::read_dir(env.endpoints_path()) .with_context(|| format!("failed to list {}", env.endpoints_path().display()))? { - let ep = Endpoint::from_dir_entry(endpoint_dir?, &env)?; + let ep_res = Endpoint::from_dir_entry(endpoint_dir?, &env); + let ep = match ep_res { + Ok(ep) => ep, + Err(e) => match e.downcast::() { + Ok(e) => { + // A parallel task could delete an endpoint while we have just scanned the directory + if e.kind() == std::io::ErrorKind::NotFound { + continue; + } else { + Err(e)? 
+ } + } + Err(e) => Err(e)?, + }, + }; endpoints.insert(ep.endpoint_id.clone(), Arc::new(ep)); } diff --git a/libs/compute_api/src/lib.rs b/libs/compute_api/src/lib.rs index 210a52d089..f4f3d92fc6 100644 --- a/libs/compute_api/src/lib.rs +++ b/libs/compute_api/src/lib.rs @@ -1,5 +1,6 @@ #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] +pub mod privilege; pub mod requests; pub mod responses; pub mod spec; diff --git a/libs/compute_api/src/privilege.rs b/libs/compute_api/src/privilege.rs new file mode 100644 index 0000000000..dc0d870946 --- /dev/null +++ b/libs/compute_api/src/privilege.rs @@ -0,0 +1,35 @@ +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +#[serde(rename_all = "UPPERCASE")] +pub enum Privilege { + Select, + Insert, + Update, + Delete, + Truncate, + References, + Trigger, + Usage, + Create, + Connect, + Temporary, + Execute, +} + +impl Privilege { + pub fn as_str(&self) -> &'static str { + match self { + Privilege::Select => "SELECT", + Privilege::Insert => "INSERT", + Privilege::Update => "UPDATE", + Privilege::Delete => "DELETE", + Privilege::Truncate => "TRUNCATE", + Privilege::References => "REFERENCES", + Privilege::Trigger => "TRIGGER", + Privilege::Usage => "USAGE", + Privilege::Create => "CREATE", + Privilege::Connect => "CONNECT", + Privilege::Temporary => "TEMPORARY", + Privilege::Execute => "EXECUTE", + } + } +} diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs index 5896c7dc65..fc3757d981 100644 --- a/libs/compute_api/src/requests.rs +++ b/libs/compute_api/src/requests.rs @@ -1,6 +1,8 @@ //! Structs representing the JSON formats used in the compute_ctl's HTTP API. 
- -use crate::spec::ComputeSpec; +use crate::{ + privilege::Privilege, + spec::{ComputeSpec, ExtVersion, PgIdent}, +}; use serde::Deserialize; /// Request of the /configure API @@ -12,3 +14,18 @@ use serde::Deserialize; pub struct ConfigurationRequest { pub spec: ComputeSpec, } + +#[derive(Deserialize, Debug)] +pub struct ExtensionInstallRequest { + pub extension: PgIdent, + pub database: PgIdent, + pub version: ExtVersion, +} + +#[derive(Deserialize, Debug)] +pub struct SetRoleGrantsRequest { + pub database: PgIdent, + pub schema: PgIdent, + pub privileges: Vec, + pub role: PgIdent, +} diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 5023fce003..79234be720 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -6,7 +6,10 @@ use std::fmt::Display; use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize, Serializer}; -use crate::spec::{ComputeSpec, Database, Role}; +use crate::{ + privilege::Privilege, + spec::{ComputeSpec, Database, ExtVersion, PgIdent, Role}, +}; #[derive(Serialize, Debug, Deserialize)] pub struct GenericAPIError { @@ -168,3 +171,16 @@ pub struct InstalledExtension { pub struct InstalledExtensions { pub extensions: Vec, } + +#[derive(Clone, Debug, Default, Serialize)] +pub struct ExtensionInstallResult { + pub extension: PgIdent, + pub version: ExtVersion, +} +#[derive(Clone, Debug, Default, Serialize)] +pub struct SetRoleGrantsResponse { + pub database: PgIdent, + pub schema: PgIdent, + pub privileges: Vec, + pub role: PgIdent, +} diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 5903db7055..8a447563dc 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -16,6 +16,9 @@ use remote_storage::RemotePath; /// intended to be used for DB / role names. 
pub type PgIdent = String; +/// String type alias representing Postgres extension version +pub type ExtVersion = String; + /// Cluster spec or configuration represented as an optional number of /// delta operations + final cluster state description. #[derive(Clone, Debug, Default, Deserialize, Serialize)] diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 24474d4840..896a5d8069 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -102,6 +102,7 @@ pub struct ConfigToml { pub ingest_batch_size: u64, pub max_vectored_read_bytes: MaxVectoredReadBytes, pub image_compression: ImageCompressionAlgorithm, + pub timeline_offloading: bool, pub ephemeral_bytes_per_memory_kb: usize, pub l0_flush: Option, pub virtual_file_io_mode: Option, @@ -385,6 +386,7 @@ impl Default for ConfigToml { NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), )), image_compression: (DEFAULT_IMAGE_COMPRESSION), + timeline_offloading: false, ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: None, virtual_file_io_mode: None, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 3ec9cac2c3..5b0b6bebe3 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -743,8 +743,6 @@ pub struct TimelineInfo { // Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does // not deny unknown fields by default so it's safe to set the field to some value, though it won't be // read. 
- /// The last aux file policy being used on this timeline - pub last_aux_file_policy: Option, pub is_archived: Option, } diff --git a/libs/pageserver_api/src/models/partitioning.rs b/libs/pageserver_api/src/models/partitioning.rs index f6644be635..69832b9a0d 100644 --- a/libs/pageserver_api/src/models/partitioning.rs +++ b/libs/pageserver_api/src/models/partitioning.rs @@ -16,7 +16,7 @@ impl serde::Serialize for Partitioning { { pub struct KeySpace<'a>(&'a crate::keyspace::KeySpace); - impl<'a> serde::Serialize for KeySpace<'a> { + impl serde::Serialize for KeySpace<'_> { fn serialize(&self, serializer: S) -> std::result::Result where S: serde::Serializer, @@ -44,7 +44,7 @@ impl serde::Serialize for Partitioning { pub struct WithDisplay<'a, T>(&'a T); -impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> { +impl serde::Serialize for WithDisplay<'_, T> { fn serialize(&self, serializer: S) -> std::result::Result where S: serde::Serializer, @@ -55,7 +55,7 @@ impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> { pub struct KeyRange<'a>(&'a std::ops::Range); -impl<'a> serde::Serialize for KeyRange<'a> { +impl serde::Serialize for KeyRange<'_> { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 085540e7b9..7419798a60 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -738,6 +738,20 @@ impl PostgresBackend { QueryError::SimulatedConnectionError => { return Err(QueryError::SimulatedConnectionError) } + err @ QueryError::Reconnect => { + // Instruct the client to reconnect, stop processing messages + // from this libpq connection and, finally, disconnect from the + // server side (returning an Err achieves the later). + // + // Note the flushing is done by the caller. 
+ let reconnect_error = short_error(&err); + self.write_message_noflush(&BeMessage::ErrorResponse( + &reconnect_error, + Some(err.pg_error_code()), + ))?; + + return Err(err); + } e => { log_query_error(query_string, &e); let short_error = short_error(&e); @@ -921,12 +935,11 @@ impl PostgresBackendReader { /// A futures::AsyncWrite implementation that wraps all data written to it in CopyData /// messages. /// - pub struct CopyDataWriter<'a, IO> { pgb: &'a mut PostgresBackend, } -impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, IO> { +impl AsyncWrite for CopyDataWriter<'_, IO> { fn poll_write( self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index 900083ea7f..9d3031d699 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -2,6 +2,7 @@ use once_cell::sync::Lazy; use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError}; use pq_proto::{BeMessage, RowDescriptor}; +use rustls::crypto::aws_lc_rs; use std::io::Cursor; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; @@ -92,10 +93,13 @@ static CERT: Lazy> = Lazy::new(|| { async fn simple_select_ssl() { let (client_sock, server_sock) = make_tcp_pair().await; - let server_cfg = rustls::ServerConfig::builder() - .with_no_client_auth() - .with_single_cert(vec![CERT.clone()], KEY.clone_key()) - .unwrap(); + let server_cfg = + rustls::ServerConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .expect("aws_lc_rs should support the default protocol versions") + .with_no_client_auth() + .with_single_cert(vec![CERT.clone()], KEY.clone_key()) + .unwrap(); let tls_config = Some(Arc::new(server_cfg)); let pgbackend = PostgresBackend::new(server_sock, AuthType::Trust, tls_config).expect("pgbackend creation"); @@ -105,13 +109,16 @@ async fn 
simple_select_ssl() { pgbackend.run(&mut handler, &CancellationToken::new()).await }); - let client_cfg = rustls::ClientConfig::builder() - .with_root_certificates({ - let mut store = rustls::RootCertStore::empty(); - store.add(CERT.clone()).unwrap(); - store - }) - .with_no_client_auth(); + let client_cfg = + rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .expect("aws_lc_rs should support the default protocol versions") + .with_root_certificates({ + let mut store = rustls::RootCertStore::empty(); + store.add(CERT.clone()).unwrap(); + store + }) + .with_no_client_auth(); let mut make_tls_connect = tokio_postgres_rustls::MakeRustlsConnect::new(client_cfg); let tls_connect = >::make_tls_connect( &mut make_tls_connect, diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index a01191bd5d..9ffaaba584 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -727,7 +727,7 @@ pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000"; pub const SQLSTATE_ADMIN_SHUTDOWN: &[u8; 5] = b"57P01"; pub const SQLSTATE_SUCCESSFUL_COMPLETION: &[u8; 5] = b"00000"; -impl<'a> BeMessage<'a> { +impl BeMessage<'_> { /// Serialize `message` to the given `buf`. 
/// Apart from smart memory managemet, BytesMut is good here as msg len /// precedes its body and it is handy to write it down first and then fill diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index be4d61f009..1816825bda 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -16,7 +16,7 @@ aws-sdk-s3.workspace = true bytes.workspace = true camino = { workspace = true, features = ["serde1"] } humantime-serde.workspace = true -hyper0 = { workspace = true, features = ["stream"] } +hyper = { workspace = true, features = ["client"] } futures.workspace = true serde.workspace = true serde_json.workspace = true @@ -36,6 +36,7 @@ azure_storage.workspace = true azure_storage_blobs.workspace = true futures-util.workspace = true http-types.workspace = true +http-body-util.workspace = true itertools.workspace = true sync_wrapper = { workspace = true, features = ["futures"] } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index c6466237bf..719608dd5f 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -19,7 +19,12 @@ mod simulate_failures; mod support; use std::{ - collections::HashMap, fmt::Debug, num::NonZeroU32, ops::Bound, pin::Pin, sync::Arc, + collections::HashMap, + fmt::Debug, + num::NonZeroU32, + ops::Bound, + pin::{pin, Pin}, + sync::Arc, time::SystemTime, }; @@ -28,6 +33,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use bytes::Bytes; use futures::{stream::Stream, StreamExt}; +use itertools::Itertools as _; use serde::{Deserialize, Serialize}; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; @@ -261,7 +267,7 @@ pub trait RemoteStorage: Send + Sync + 'static { max_keys: Option, cancel: &CancellationToken, ) -> Result { - let mut stream = std::pin::pin!(self.list_streaming(prefix, mode, max_keys, cancel)); + let mut stream = pin!(self.list_streaming(prefix, mode, max_keys, cancel)); let mut combined = stream.next().await.expect("At 
least one item required")?; while let Some(list) = stream.next().await { let list = list?; @@ -324,6 +330,35 @@ pub trait RemoteStorage: Send + Sync + 'static { cancel: &CancellationToken, ) -> anyhow::Result<()>; + /// Deletes all objects matching the given prefix. + /// + /// NB: this uses NoDelimiter and will match partial prefixes. For example, the prefix /a/b will + /// delete /a/b, /a/b/*, /a/bc, /a/bc/*, etc. + /// + /// If the operation fails because of timeout or cancellation, the root cause of the error will + /// be set to `TimeoutOrCancel`. In such situation it is unknown which deletions, if any, went + /// through. + async fn delete_prefix( + &self, + prefix: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + let mut stream = + pin!(self.list_streaming(Some(prefix), ListingMode::NoDelimiter, None, cancel)); + while let Some(result) = stream.next().await { + let keys = match result { + Ok(listing) if listing.keys.is_empty() => continue, + Ok(listing) => listing.keys.into_iter().map(|o| o.key).collect_vec(), + Err(DownloadError::Cancelled) => return Err(TimeoutOrCancel::Cancel.into()), + Err(DownloadError::Timeout) => return Err(TimeoutOrCancel::Timeout.into()), + Err(err) => return Err(err.into()), + }; + tracing::info!("Deleting {} keys from remote storage", keys.len()); + self.delete_objects(&keys, cancel).await?; + } + Ok(()) + } + /// Copy a remote object inside a bucket from one path to another. 
async fn copy( &self, @@ -488,6 +523,20 @@ impl GenericRemoteStorage> { } } + /// See [`RemoteStorage::delete_prefix`] + pub async fn delete_prefix( + &self, + prefix: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + match self { + Self::LocalFs(s) => s.delete_prefix(prefix, cancel).await, + Self::AwsS3(s) => s.delete_prefix(prefix, cancel).await, + Self::AzureBlob(s) => s.delete_prefix(prefix, cancel).await, + Self::Unreliable(s) => s.delete_prefix(prefix, cancel).await, + } + } + /// See [`RemoteStorage::copy`] pub async fn copy_object( &self, diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index f950f2886c..cde32df402 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -28,13 +28,15 @@ use aws_sdk_s3::{ Client, }; use aws_smithy_async::rt::sleep::TokioSleep; +use http_body_util::StreamBody; use http_types::StatusCode; use aws_smithy_types::{body::SdkBody, DateTime}; use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError}; use bytes::Bytes; use futures::stream::Stream; -use hyper0::Body; +use futures_util::StreamExt; +use hyper::body::Frame; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; use utils::backoff; @@ -710,8 +712,8 @@ impl RemoteStorage for S3Bucket { let started_at = start_measuring_requests(kind); - let body = Body::wrap_stream(from); - let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body)); + let body = StreamBody::new(from.map(|x| x.map(Frame::data))); + let bytes_stream = ByteStream::new(SdkBody::from_body_1_x(body)); let upload = self .client diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs index e6f33fc3f8..d5da1d48e9 100644 --- a/libs/remote_storage/tests/common/tests.rs +++ b/libs/remote_storage/tests/common/tests.rs @@ -199,6 +199,138 @@ async fn list_no_delimiter_works( Ok(()) } +/// Tests that giving a partial prefix returns all 
matches (e.g. "/foo" yields "/foobar/baz"), +/// but only with NoDelimiter. +#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)] +#[tokio::test] +async fn list_partial_prefix( + ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs, +) -> anyhow::Result<()> { + let ctx = match ctx { + MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx, + MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()), + MaybeEnabledStorageWithSimpleTestBlobs::UploadsFailed(e, _) => { + anyhow::bail!("S3 init failed: {e:?}") + } + }; + + let cancel = CancellationToken::new(); + let test_client = Arc::clone(&ctx.enabled.client); + + // Prefix "fold" should match all "folder{i}" directories with NoDelimiter. + let objects: HashSet<_> = test_client + .list( + Some(&RemotePath::from_string("fold")?), + ListingMode::NoDelimiter, + None, + &cancel, + ) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + assert_eq!(&objects, &ctx.remote_blobs); + + // Prefix "fold" matches nothing with WithDelimiter. + let objects: HashSet<_> = test_client + .list( + Some(&RemotePath::from_string("fold")?), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + assert!(objects.is_empty()); + + // Prefix "" matches everything. + let objects: HashSet<_> = test_client + .list( + Some(&RemotePath::from_string("")?), + ListingMode::NoDelimiter, + None, + &cancel, + ) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + assert_eq!(&objects, &ctx.remote_blobs); + + // Prefix "" matches nothing with WithDelimiter. + let objects: HashSet<_> = test_client + .list( + Some(&RemotePath::from_string("")?), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + assert!(objects.is_empty()); + + // Prefix "foo" matches nothing. 
+ let objects: HashSet<_> = test_client + .list( + Some(&RemotePath::from_string("foo")?), + ListingMode::NoDelimiter, + None, + &cancel, + ) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + assert!(objects.is_empty()); + + // Prefix "folder2/blob" matches. + let objects: HashSet<_> = test_client + .list( + Some(&RemotePath::from_string("folder2/blob")?), + ListingMode::NoDelimiter, + None, + &cancel, + ) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + let expect: HashSet<_> = ctx + .remote_blobs + .iter() + .filter(|o| o.get_path().starts_with("folder2")) + .cloned() + .collect(); + assert_eq!(&objects, &expect); + + // Prefix "folder2/foo" matches nothing. + let objects: HashSet<_> = test_client + .list( + Some(&RemotePath::from_string("folder2/foo")?), + ListingMode::NoDelimiter, + None, + &cancel, + ) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + assert!(objects.is_empty()); + + Ok(()) +} + #[test_context(MaybeEnabledStorage)] #[tokio::test] async fn delete_non_exising_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { @@ -265,6 +397,80 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<( Ok(()) } +/// Tests that delete_prefix() will delete all objects matching a prefix, including +/// partial prefixes (i.e. "/foo" matches "/foobar"). +#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)] +#[tokio::test] +async fn delete_prefix(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> { + let ctx = match ctx { + MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx, + MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()), + MaybeEnabledStorageWithSimpleTestBlobs::UploadsFailed(e, _) => { + anyhow::bail!("S3 init failed: {e:?}") + } + }; + + let cancel = CancellationToken::new(); + let test_client = Arc::clone(&ctx.enabled.client); + + /// Asserts that the S3 listing matches the given paths. + macro_rules! 
assert_list { + ($expect:expr) => {{ + let listing = test_client + .list(None, ListingMode::NoDelimiter, None, &cancel) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + assert_eq!($expect, listing); + }}; + } + + // We start with the full set of uploaded files. + let mut expect = ctx.remote_blobs.clone(); + + // Deleting a non-existing prefix should do nothing. + test_client + .delete_prefix(&RemotePath::from_string("xyz")?, &cancel) + .await?; + assert_list!(expect); + + // Prefixes are case-sensitive. + test_client + .delete_prefix(&RemotePath::from_string("Folder")?, &cancel) + .await?; + assert_list!(expect); + + // Deleting a path which overlaps with an existing object should do nothing. We pick the first + // path in the set as our common prefix. + let path = expect.iter().next().expect("empty set").clone().join("xyz"); + test_client.delete_prefix(&path, &cancel).await?; + assert_list!(expect); + + // Deleting an exact path should work. We pick the first path in the set. + let path = expect.iter().next().expect("empty set").clone(); + test_client.delete_prefix(&path, &cancel).await?; + expect.remove(&path); + assert_list!(expect); + + // Deleting a prefix should delete all matching objects. + test_client + .delete_prefix(&RemotePath::from_string("folder0/blob_")?, &cancel) + .await?; + expect.retain(|p| !p.get_path().as_str().starts_with("folder0/")); + assert_list!(expect); + + // Deleting a common prefix should delete all objects. 
+ test_client + .delete_prefix(&RemotePath::from_string("fold")?, &cancel) + .await?; + expect.clear(); + assert_list!(expect); + + Ok(()) +} + #[test_context(MaybeEnabledStorage)] #[tokio::test] async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { diff --git a/libs/tenant_size_model/src/svg.rs b/libs/tenant_size_model/src/svg.rs index 0de2890bb4..25ebb1c3d8 100644 --- a/libs/tenant_size_model/src/svg.rs +++ b/libs/tenant_size_model/src/svg.rs @@ -97,7 +97,7 @@ pub fn draw_svg( Ok(result) } -impl<'a> SvgDraw<'a> { +impl SvgDraw<'_> { fn calculate_svg_layout(&mut self) { // Find x scale let segments = &self.storage.segments; diff --git a/libs/tracing-utils/src/http.rs b/libs/tracing-utils/src/http.rs index e6fdf9be45..2168beee88 100644 --- a/libs/tracing-utils/src/http.rs +++ b/libs/tracing-utils/src/http.rs @@ -82,7 +82,7 @@ where fn extract_remote_context(headers: &HeaderMap) -> opentelemetry::Context { struct HeaderExtractor<'a>(&'a HeaderMap); - impl<'a> opentelemetry::propagation::Extractor for HeaderExtractor<'a> { + impl opentelemetry::propagation::Extractor for HeaderExtractor<'_> { fn get(&self, key: &str) -> Option<&str> { self.0.get(key).and_then(|value| value.to_str().ok()) } diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index 5e05e4e713..02fc9e3b99 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -28,6 +28,9 @@ pub enum ApiError { #[error("Resource temporarily unavailable: {0}")] ResourceUnavailable(Cow<'static, str>), + #[error("Too many requests: {0}")] + TooManyRequests(Cow<'static, str>), + #[error("Shutting down")] ShuttingDown, @@ -73,6 +76,10 @@ impl ApiError { err.to_string(), StatusCode::SERVICE_UNAVAILABLE, ), + ApiError::TooManyRequests(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::TOO_MANY_REQUESTS, + ), ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), 
StatusCode::REQUEST_TIMEOUT, diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index 06d5c27ebf..3ec2c130bd 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -37,7 +37,7 @@ impl<'de> Deserialize<'de> for Lsn { is_human_readable_deserializer: bool, } - impl<'de> Visitor<'de> for LsnVisitor { + impl Visitor<'_> for LsnVisitor { type Value = Lsn; fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { diff --git a/libs/utils/src/poison.rs b/libs/utils/src/poison.rs index c3e2fba20c..ab9ebb3c5a 100644 --- a/libs/utils/src/poison.rs +++ b/libs/utils/src/poison.rs @@ -73,7 +73,7 @@ impl Poison { /// and subsequent calls to [`Poison::check_and_arm`] will fail with an error. pub struct Guard<'a, T>(&'a mut Poison); -impl<'a, T> Guard<'a, T> { +impl Guard<'_, T> { pub fn data(&self) -> &T { &self.0.data } @@ -94,7 +94,7 @@ impl<'a, T> Guard<'a, T> { } } -impl<'a, T> Drop for Guard<'a, T> { +impl Drop for Guard<'_, T> { fn drop(&mut self) { match self.0.state { State::Clean => { diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs index d146010b41..782cddc599 100644 --- a/libs/utils/src/shard.rs +++ b/libs/utils/src/shard.rs @@ -164,7 +164,7 @@ impl TenantShardId { } } -impl<'a> std::fmt::Display for ShardSlug<'a> { +impl std::fmt::Display for ShardSlug<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, diff --git a/libs/utils/src/simple_rcu.rs b/libs/utils/src/simple_rcu.rs index 01750b2aef..6700f86e4a 100644 --- a/libs/utils/src/simple_rcu.rs +++ b/libs/utils/src/simple_rcu.rs @@ -152,7 +152,7 @@ pub struct RcuWriteGuard<'a, V> { inner: RwLockWriteGuard<'a, RcuInner>, } -impl<'a, V> Deref for RcuWriteGuard<'a, V> { +impl Deref for RcuWriteGuard<'_, V> { type Target = V; fn deref(&self) -> &V { @@ -160,7 +160,7 @@ impl<'a, V> Deref for RcuWriteGuard<'a, V> { } } -impl<'a, V> RcuWriteGuard<'a, V> { +impl RcuWriteGuard<'_, V> { /// /// Store a new value. 
The new value will be written to the Rcu immediately, /// and will be immediately seen by any `read` calls that start afterwards. diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index dc711fb028..66c2065554 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -219,7 +219,7 @@ impl<'a, T> CountWaitingInitializers<'a, T> { } } -impl<'a, T> Drop for CountWaitingInitializers<'a, T> { +impl Drop for CountWaitingInitializers<'_, T> { fn drop(&mut self) { self.0.initializers.fetch_sub(1, Ordering::Relaxed); } @@ -250,7 +250,7 @@ impl std::ops::DerefMut for Guard<'_, T> { } } -impl<'a, T> Guard<'a, T> { +impl Guard<'_, T> { /// Take the current value, and a new permit for it's deinitialization. /// /// The permit will be on a semaphore part of the new internal value, and any following diff --git a/libs/utils/src/tracing_span_assert.rs b/libs/utils/src/tracing_span_assert.rs index d24c81ad0b..add2fa7920 100644 --- a/libs/utils/src/tracing_span_assert.rs +++ b/libs/utils/src/tracing_span_assert.rs @@ -184,23 +184,23 @@ mod tests { struct MemoryIdentity<'a>(&'a dyn Extractor); - impl<'a> MemoryIdentity<'a> { + impl MemoryIdentity<'_> { fn as_ptr(&self) -> *const () { self.0 as *const _ as *const () } } - impl<'a> PartialEq for MemoryIdentity<'a> { + impl PartialEq for MemoryIdentity<'_> { fn eq(&self, other: &Self) -> bool { self.as_ptr() == other.as_ptr() } } - impl<'a> Eq for MemoryIdentity<'a> {} - impl<'a> Hash for MemoryIdentity<'a> { + impl Eq for MemoryIdentity<'_> {} + impl Hash for MemoryIdentity<'_> { fn hash(&self, state: &mut H) { self.as_ptr().hash(state); } } - impl<'a> fmt::Debug for MemoryIdentity<'a> { + impl fmt::Debug for MemoryIdentity<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{:p}: {}", self.as_ptr(), self.0.id()) } diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 
8ed1d16082..9dbb6ecedf 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -133,7 +133,7 @@ enum LazyLoadLayer<'a, E: CompactionJobExecutor> { Loaded(VecDeque<>::DeltaEntry<'a>>), Unloaded(&'a E::DeltaLayer), } -impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> { +impl LazyLoadLayer<'_, E> { fn min_key(&self) -> E::Key { match self { Self::Loaded(entries) => entries.front().unwrap().key(), @@ -147,23 +147,23 @@ impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> { } } } -impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> { +impl PartialOrd for LazyLoadLayer<'_, E> { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> { +impl Ord for LazyLoadLayer<'_, E> { fn cmp(&self, other: &Self) -> std::cmp::Ordering { // reverse order so that we get a min-heap (other.min_key(), other.min_lsn()).cmp(&(self.min_key(), self.min_lsn())) } } -impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> { +impl PartialEq for LazyLoadLayer<'_, E> { fn eq(&self, other: &Self) -> bool { self.cmp(other) == std::cmp::Ordering::Equal } } -impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {} +impl Eq for LazyLoadLayer<'_, E> {} type LoadFuture<'a, E> = BoxFuture<'a, anyhow::Result>>; diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index a32d09f3b3..975318419f 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -16,7 +16,7 @@ use fail::fail_point; use pageserver_api::key::Key; use postgres_ffi::pg_constants; use std::fmt::Write as FmtWrite; -use std::time::SystemTime; +use std::time::{Instant, SystemTime}; use tokio::io; use tokio::io::AsyncWrite; use tracing::*; @@ -352,12 +352,25 @@ where } } - for (path, content) in self + let start_time = Instant::now(); + let aux_files = self .timeline .list_aux_files(self.lsn, self.ctx) .await - .map_err(|e| 
BasebackupError::Server(e.into()))? - { + .map_err(|e| BasebackupError::Server(e.into()))?; + let aux_scan_time = start_time.elapsed(); + let aux_estimated_size = aux_files + .values() + .map(|content| content.len()) + .sum::(); + info!( + "Scanned {} aux files in {}ms, aux file content size = {}", + aux_files.len(), + aux_scan_time.as_millis(), + aux_estimated_size + ); + + for (path, content) in aux_files { if path.starts_with("pg_replslot") { let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN; let restart_lsn = Lsn(u64::from_le_bytes( diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 8db78285e4..06d4326459 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -164,6 +164,9 @@ pub struct PageServerConf { pub image_compression: ImageCompressionAlgorithm, + /// Whether to offload archived timelines automatically + pub timeline_offloading: bool, + /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this /// is exceeded, we start proactively closing ephemeral layers to limit the total amount /// of ephemeral data. 
@@ -321,6 +324,7 @@ impl PageServerConf { ingest_batch_size, max_vectored_read_bytes, image_compression, + timeline_offloading, ephemeral_bytes_per_memory_kb, l0_flush, virtual_file_io_mode, @@ -364,6 +368,7 @@ impl PageServerConf { ingest_batch_size, max_vectored_read_bytes, image_compression, + timeline_offloading, ephemeral_bytes_per_memory_kb, // ------------------------------------------------------------ diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 0325ee403a..1eb25d337b 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -198,7 +198,7 @@ fn serialize_in_chunks<'a>( } } - impl<'a> ExactSizeIterator for Iter<'a> {} + impl ExactSizeIterator for Iter<'_> {} let buffer = bytes::BytesMut::new(); let inner = input.chunks(chunk_size); diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index a58fa2c0b1..ca44fbe6ae 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -654,7 +654,7 @@ impl std::fmt::Debug for EvictionCandidate { let ts = chrono::DateTime::::from(self.last_activity_ts); let ts = ts.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true); struct DisplayIsDebug<'a, T>(&'a T); - impl<'a, T: std::fmt::Display> std::fmt::Debug for DisplayIsDebug<'a, T> { + impl std::fmt::Debug for DisplayIsDebug<'_, T> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.0) } @@ -1218,16 +1218,7 @@ mod filesystem_level_usage { let stat = Statvfs::get(tenants_dir, mock_config) .context("statvfs failed, presumably directory got unlinked")?; - // https://unix.stackexchange.com/a/703650 - let blocksize = if stat.fragment_size() > 0 { - stat.fragment_size() - } else { - stat.block_size() - }; - - // use blocks_available (b_avail) since, pageserver runs as unprivileged user - let avail_bytes = stat.blocks_available() * 
blocksize; - let total_bytes = stat.blocks() * blocksize; + let (avail_bytes, total_bytes) = stat.get_avail_total_bytes(); Ok(Usage { config, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2985ab1efb..8f928fd81b 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -18,7 +18,6 @@ use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; use pageserver_api::models::virtual_file::IoMode; -use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest; use pageserver_api::models::IngestAuxFilesRequest; use pageserver_api::models::ListAuxFilesRequest; @@ -77,6 +76,7 @@ use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::storage_layer::LayerName; +use crate::tenant::timeline::offload::offload_timeline; use crate::tenant::timeline::CompactFlags; use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::Timeline; @@ -325,6 +325,7 @@ impl From for ApiError { match value { NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()), Timeout => ApiError::Timeout("hit pageserver internal timeout".into()), + Cancelled => ApiError::ShuttingDown, e @ HasArchivedParent(_) => { ApiError::PreconditionFailed(e.to_string().into_boxed_str()) } @@ -472,8 +473,6 @@ async fn build_timeline_info_common( is_archived: Some(is_archived), walreceiver_status, - - last_aux_file_policy: timeline.last_aux_file_policy.load(), }; Ok(info) } @@ -715,8 +714,15 @@ async fn timeline_archival_config_handler( .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + tenant - .apply_timeline_archival_config(timeline_id, request_data.state, ctx) + .apply_timeline_archival_config( + timeline_id, + request_data.state, + 
state.broker_client.clone(), + ctx, + ) .await?; Ok::<_, ApiError>(()) } @@ -1783,6 +1789,49 @@ async fn timeline_compact_handler( .await } +// Run offload immediately on given timeline. +async fn timeline_offload_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + + async { + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + if tenant.get_offloaded_timeline(timeline_id).is_ok() { + return json_response(StatusCode::OK, ()); + } + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + if !tenant.timeline_has_no_attached_children(timeline_id) { + return Err(ApiError::PreconditionFailed( + "timeline has attached children".into(), + )); + } + if !timeline.can_offload() { + return Err(ApiError::PreconditionFailed( + "Timeline::can_offload() returned false".into(), + )); + } + offload_timeline(&tenant, &timeline) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) + } + .instrument(info_span!("manual_timeline_offload", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) + .await +} + // Run checkpoint immediately on given timeline. 
async fn timeline_checkpoint_handler( request: Request, @@ -2202,7 +2251,7 @@ async fn tenant_scan_remote_handler( %timeline_id)) .await { - Ok((index_part, index_generation)) => { + Ok((index_part, index_generation, _index_mtime)) => { tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)", index_part.layer_metadata.len(), index_part.metadata.disk_consistent_lsn()); generation = std::cmp::max(generation, index_generation); @@ -2347,31 +2396,6 @@ async fn post_tracing_event_handler( json_response(StatusCode::OK, ()) } -async fn force_aux_policy_switch_handler( - mut r: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - check_permission(&r, None)?; - let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?; - let timeline_id: TimelineId = parse_request_param(&r, "timeline_id")?; - let policy: AuxFilePolicy = json_request(&mut r).await?; - - let state = get_state(&r); - - let tenant = state - .tenant_manager - .get_attached_tenant_shard(tenant_shard_id)?; - tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; - let timeline = - active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) - .await?; - timeline - .do_switch_aux_policy(policy) - .map_err(ApiError::InternalServerError)?; - - json_response(StatusCode::OK, ()) -} - async fn put_io_engine_handler( mut r: Request, _cancel: CancellationToken, @@ -3006,6 +3030,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact", |r| api_handler(r, timeline_compact_handler), ) + .put( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/offload", + |r| testing_api_handler("attempt timeline offload", r, timeline_offload_handler), + ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint", |r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler), @@ -3080,10 +3108,6 @@ pub fn make_router( ) .put("/v1/io_engine", 
|r| api_handler(r, put_io_engine_handler)) .put("/v1/io_mode", |r| api_handler(r, put_io_mode_handler)) - .put( - "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch", - |r| api_handler(r, force_aux_policy_switch_handler), - ) .get("/v1/utilization", |r| api_handler(r, get_utilization)) .post( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files", diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index b76efa5b48..3e824b59fb 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1189,7 +1189,7 @@ struct GlobalAndPerTimelineHistogramTimer<'a, 'c> { op: SmgrQueryType, } -impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> { +impl Drop for GlobalAndPerTimelineHistogramTimer<'_, '_> { fn drop(&mut self) { let elapsed = self.start.elapsed(); let ex_throttled = self @@ -1560,7 +1560,7 @@ impl BasebackupQueryTime { } } -impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> { +impl BasebackupQueryTimeOngoingRecording<'_, '_> { pub(crate) fn observe(self, res: &Result) { let elapsed = self.start.elapsed(); let ex_throttled = self diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 8fa6b9a7f0..afb2f92ff8 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -26,8 +26,8 @@ use std::str::FromStr; use std::sync::Arc; use std::time::SystemTime; use std::time::{Duration, Instant}; -use tokio::io::AsyncWriteExt; use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::io::{AsyncWriteExt, BufWriter}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; @@ -1137,10 +1137,10 @@ impl PageServerHandler { .await .map_err(map_basebackup_error)?; } else { - let mut writer = pgb.copyout_writer(); + let mut writer = BufWriter::new(pgb.copyout_writer()); if gzip { let mut encoder = GzipEncoder::with_quality( - writer, + &mut writer, // NOTE using fast compression because it's on the critical path // for compute 
startup. For an empty database, we get // <100KB with this method. The Level::Best compression method @@ -1175,6 +1175,10 @@ impl PageServerHandler { .await .map_err(map_basebackup_error)?; } + writer + .flush() + .await + .map_err(|e| map_basebackup_error(BasebackupError::Client(e)))?; } pgb.write_message_noflush(&BeMessage::CopyDone) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 7aa313f031..f2a11e65c1 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -22,7 +22,6 @@ use pageserver_api::key::{ CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, }; use pageserver_api::keyspace::SparseKeySpace; -use pageserver_api::models::AuxFilePolicy; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; @@ -33,7 +32,7 @@ use std::ops::ControlFlow; use std::ops::Range; use strum::IntoEnumIterator; use tokio_util::sync::CancellationToken; -use tracing::{debug, info, trace, warn}; +use tracing::{debug, trace, warn}; use utils::bin_ser::DeserializeError; use utils::pausable_failpoint; use utils::{bin_ser::BeSer, lsn::Lsn}; @@ -677,21 +676,6 @@ impl Timeline { self.get(CHECKPOINT_KEY, lsn, ctx).await } - async fn list_aux_files_v1( - &self, - lsn: Lsn, - ctx: &RequestContext, - ) -> Result, PageReconstructError> { - match self.get(AUX_FILES_KEY, lsn, ctx).await { - Ok(buf) => Ok(AuxFilesDirectory::des(&buf)?.files), - Err(e) => { - // This is expected: historical databases do not have the key. 
- debug!("Failed to get info about AUX files: {}", e); - Ok(HashMap::new()) - } - } - } - async fn list_aux_files_v2( &self, lsn: Lsn, @@ -722,10 +706,7 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) -> Result<(), PageReconstructError> { - let current_policy = self.last_aux_file_policy.load(); - if let Some(AuxFilePolicy::V2) | Some(AuxFilePolicy::CrossValidation) = current_policy { - self.list_aux_files_v2(lsn, ctx).await?; - } + self.list_aux_files_v2(lsn, ctx).await?; Ok(()) } @@ -734,51 +715,7 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) -> Result, PageReconstructError> { - let current_policy = self.last_aux_file_policy.load(); - match current_policy { - Some(AuxFilePolicy::V1) => { - let res = self.list_aux_files_v1(lsn, ctx).await?; - let empty_str = if res.is_empty() { ", empty" } else { "" }; - warn!( - "this timeline is using deprecated aux file policy V1 (policy=v1{empty_str})" - ); - Ok(res) - } - None => { - let res = self.list_aux_files_v1(lsn, ctx).await?; - if !res.is_empty() { - warn!("this timeline is using deprecated aux file policy V1 (policy=None)"); - } - Ok(res) - } - Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await, - Some(AuxFilePolicy::CrossValidation) => { - let v1_result = self.list_aux_files_v1(lsn, ctx).await; - let v2_result = self.list_aux_files_v2(lsn, ctx).await; - match (v1_result, v2_result) { - (Ok(v1), Ok(v2)) => { - if v1 != v2 { - tracing::error!( - "unmatched aux file v1 v2 result:\nv1 {v1:?}\nv2 {v2:?}" - ); - return Err(PageReconstructError::Other(anyhow::anyhow!( - "unmatched aux file v1 v2 result" - ))); - } - Ok(v1) - } - (Ok(_), Err(v2)) => { - tracing::error!("aux file v1 returns Ok while aux file v2 returns an err"); - Err(v2) - } - (Err(v1), Ok(_)) => { - tracing::error!("aux file v2 returns Ok while aux file v1 returns an err"); - Err(v1) - } - (Err(_), Err(v2)) => Err(v2), - } - } - } + self.list_aux_files_v2(lsn, ctx).await } pub(crate) async fn get_replorigins( @@ -954,9 +891,6 @@ 
impl Timeline { result.add_key(CONTROLFILE_KEY); result.add_key(CHECKPOINT_KEY); - if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() { - result.add_key(AUX_FILES_KEY); - } // Add extra keyspaces in the test cases. Some test cases write keys into the storage without // creating directory keys. These test cases will add such keyspaces into `extra_test_dense_keyspace` @@ -1166,9 +1100,6 @@ impl<'a> DatadirModification<'a> { self.pending_directory_entries.push((DirectoryKind::Db, 0)); self.put(DBDIR_KEY, Value::Image(buf.into())); - // Create AuxFilesDirectory - self.init_aux_dir()?; - let buf = if self.tline.pg_version >= 17 { TwoPhaseDirectoryV17::ser(&TwoPhaseDirectoryV17 { xids: HashSet::new(), @@ -1347,9 +1278,6 @@ impl<'a> DatadirModification<'a> { // 'true', now write the updated 'dbdirs' map back. let buf = DbDirectory::ser(&dbdir)?; self.put(DBDIR_KEY, Value::Image(buf.into())); - - // Create AuxFilesDirectory as well - self.init_aux_dir()?; } if r.is_none() { // Create RelDirectory @@ -1545,9 +1473,6 @@ impl<'a> DatadirModification<'a> { // Update relation size cache self.tline.set_cached_rel_size(rel, self.lsn, nblocks); - // Update relation size cache - self.tline.set_cached_rel_size(rel, self.lsn, nblocks); - // Update logical database size. 
self.pending_nblocks -= old_size as i64 - nblocks as i64; } @@ -1729,200 +1654,60 @@ impl<'a> DatadirModification<'a> { Ok(()) } - pub fn init_aux_dir(&mut self) -> anyhow::Result<()> { - if let AuxFilePolicy::V2 = self.tline.get_switch_aux_file_policy() { - return Ok(()); - } - let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { - files: HashMap::new(), - })?; - self.pending_directory_entries - .push((DirectoryKind::AuxFiles, 0)); - self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf))); - Ok(()) - } - pub async fn put_file( &mut self, path: &str, content: &[u8], ctx: &RequestContext, ) -> anyhow::Result<()> { - let switch_policy = self.tline.get_switch_aux_file_policy(); - - let policy = { - let current_policy = self.tline.last_aux_file_policy.load(); - // Allowed switch path: - // * no aux files -> v1/v2/cross-validation - // * cross-validation->v2 - - let current_policy = if current_policy.is_none() { - // This path will only be hit once per tenant: we will decide the final policy in this code block. - // The next call to `put_file` will always have `last_aux_file_policy != None`. - let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); - let aux_files_key_v1 = self.tline.list_aux_files_v1(lsn, ctx).await?; - if aux_files_key_v1.is_empty() { - None - } else { - warn!("this timeline is using deprecated aux file policy V1 (detected existing v1 files)"); - self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?; - Some(AuxFilePolicy::V1) - } - } else { - current_policy - }; - - if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) { - self.tline.do_switch_aux_policy(switch_policy)?; - info!(current=?current_policy, next=?switch_policy, "switching aux file policy"); - switch_policy - } else { - // This branch handles non-valid migration path, and the case that switch_policy == current_policy. - // And actually, because the migration path always allow unspecified -> *, this unwrap_or will never be hit. 
- current_policy.unwrap_or(AuxFilePolicy::default_tenant_config()) - } + let key = aux_file::encode_aux_file_key(path); + // retrieve the key from the engine + let old_val = match self.get(key, ctx).await { + Ok(val) => Some(val), + Err(PageReconstructError::MissingKey(_)) => None, + Err(e) => return Err(e.into()), }; - - if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy { - let key = aux_file::encode_aux_file_key(path); - // retrieve the key from the engine - let old_val = match self.get(key, ctx).await { - Ok(val) => Some(val), - Err(PageReconstructError::MissingKey(_)) => None, - Err(e) => return Err(e.into()), - }; - let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val { - aux_file::decode_file_value(old_val)? + let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val { + aux_file::decode_file_value(old_val)? + } else { + Vec::new() + }; + let mut other_files = Vec::with_capacity(files.len()); + let mut modifying_file = None; + for file @ (p, content) in files { + if path == p { + assert!( + modifying_file.is_none(), + "duplicated entries found for {}", + path + ); + modifying_file = Some(content); } else { - Vec::new() - }; - let mut other_files = Vec::with_capacity(files.len()); - let mut modifying_file = None; - for file @ (p, content) in files { - if path == p { - assert!( - modifying_file.is_none(), - "duplicated entries found for {}", - path - ); - modifying_file = Some(content); - } else { - other_files.push(file); - } + other_files.push(file); } - let mut new_files = other_files; - match (modifying_file, content.is_empty()) { - (Some(old_content), false) => { - self.tline - .aux_file_size_estimator - .on_update(old_content.len(), content.len()); - new_files.push((path, content)); - } - (Some(old_content), true) => { - self.tline - .aux_file_size_estimator - .on_remove(old_content.len()); - // not adding the file key to the final `new_files` vec. 
- } - (None, false) => { - self.tline.aux_file_size_estimator.on_add(content.len()); - new_files.push((path, content)); - } - (None, true) => warn!("removing non-existing aux file: {}", path), - } - let new_val = aux_file::encode_file_value(&new_files)?; - self.put(key, Value::Image(new_val.into())); } - - if let AuxFilePolicy::V1 | AuxFilePolicy::CrossValidation = policy { - let file_path = path.to_string(); - let content = if content.is_empty() { - None - } else { - Some(Bytes::copy_from_slice(content)) - }; - - let n_files; - let mut aux_files = self.tline.aux_files.lock().await; - if let Some(mut dir) = aux_files.dir.take() { - // We already updated aux files in `self`: emit a delta and update our latest value. - dir.upsert(file_path.clone(), content.clone()); - n_files = dir.files.len(); - if aux_files.n_deltas == MAX_AUX_FILE_DELTAS { - self.put( - AUX_FILES_KEY, - Value::Image(Bytes::from( - AuxFilesDirectory::ser(&dir).context("serialize")?, - )), - ); - aux_files.n_deltas = 0; - } else { - self.put( - AUX_FILES_KEY, - Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }), - ); - aux_files.n_deltas += 1; - } - aux_files.dir = Some(dir); - } else { - // Check if the AUX_FILES_KEY is initialized - match self.get(AUX_FILES_KEY, ctx).await { - Ok(dir_bytes) => { - let mut dir = AuxFilesDirectory::des(&dir_bytes)?; - // Key is already set, we may append a delta - self.put( - AUX_FILES_KEY, - Value::WalRecord(NeonWalRecord::AuxFile { - file_path: file_path.clone(), - content: content.clone(), - }), - ); - dir.upsert(file_path, content); - n_files = dir.files.len(); - aux_files.dir = Some(dir); - } - Err( - e @ (PageReconstructError::Cancelled - | PageReconstructError::AncestorLsnTimeout(_)), - ) => { - // Important that we do not interpret a shutdown error as "not found" and thereby - // reset the map. 
- return Err(e.into()); - } - // Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but - // the original code assumes all other errors are missing keys. Therefore, we keep the code path - // the same for now, though in theory, we should only match the `MissingKey` variant. - Err( - e @ (PageReconstructError::Other(_) - | PageReconstructError::WalRedo(_) - | PageReconstructError::MissingKey(_)), - ) => { - // Key is missing, we must insert an image as the basis for subsequent deltas. - - if !matches!(e, PageReconstructError::MissingKey(_)) { - let e = utils::error::report_compact_sources(&e); - tracing::warn!("treating error as if it was a missing key: {}", e); - } - - let mut dir = AuxFilesDirectory { - files: HashMap::new(), - }; - dir.upsert(file_path, content); - self.put( - AUX_FILES_KEY, - Value::Image(Bytes::from( - AuxFilesDirectory::ser(&dir).context("serialize")?, - )), - ); - n_files = 1; - aux_files.dir = Some(dir); - } - } + let mut new_files = other_files; + match (modifying_file, content.is_empty()) { + (Some(old_content), false) => { + self.tline + .aux_file_size_estimator + .on_update(old_content.len(), content.len()); + new_files.push((path, content)); } - - self.pending_directory_entries - .push((DirectoryKind::AuxFiles, n_files)); + (Some(old_content), true) => { + self.tline + .aux_file_size_estimator + .on_remove(old_content.len()); + // not adding the file key to the final `new_files` vec. + } + (None, false) => { + self.tline.aux_file_size_estimator.on_add(content.len()); + new_files.push((path, content)); + } + (None, true) => warn!("removing non-existing aux file: {}", path), } + let new_val = aux_file::encode_file_value(&new_files)?; + self.put(key, Value::Image(new_val.into())); Ok(()) } @@ -2092,12 +1877,6 @@ impl<'a> DatadirModification<'a> { self.tline.get(key, lsn, ctx).await } - /// Only used during unit tests, force putting a key into the modification. 
- #[cfg(test)] - pub(crate) fn put_for_test(&mut self, key: Key, val: Value) { - self.put(key, val); - } - fn put(&mut self, key: Key, val: Value) { if Self::is_data_key(&key) { self.put_data(key.to_compact(), val) @@ -2215,21 +1994,6 @@ struct RelDirectory { rels: HashSet<(Oid, u8)>, } -#[derive(Debug, Serialize, Deserialize, Default, PartialEq)] -pub(crate) struct AuxFilesDirectory { - pub(crate) files: HashMap, -} - -impl AuxFilesDirectory { - pub(crate) fn upsert(&mut self, key: String, value: Option) { - if let Some(value) = value { - self.files.insert(key, value); - } else { - self.files.remove(&key); - } - } -} - #[derive(Debug, Serialize, Deserialize)] struct RelSizeEntry { nblocks: u32, diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs index 5a6f6e5176..4e8be58d58 100644 --- a/pageserver/src/statvfs.rs +++ b/pageserver/src/statvfs.rs @@ -53,6 +53,22 @@ impl Statvfs { Statvfs::Mock(stat) => stat.block_size, } } + + /// Get the available and total bytes on the filesystem. 
+ pub fn get_avail_total_bytes(&self) -> (u64, u64) { + // https://unix.stackexchange.com/a/703650 + let blocksize = if self.fragment_size() > 0 { + self.fragment_size() + } else { + self.block_size() + }; + + // use blocks_available (b_avail) since, pageserver runs as unprivileged user + let avail_bytes = self.blocks_available() * blocksize; + let total_bytes = self.blocks() * blocksize; + + (avail_bytes, total_bytes) + } } pub mod mock { @@ -74,7 +90,7 @@ pub mod mock { let used_bytes = walk_dir_disk_usage(tenants_dir, name_filter.as_deref()).unwrap(); // round it up to the nearest block multiple - let used_blocks = (used_bytes + (blocksize - 1)) / blocksize; + let used_blocks = used_bytes.div_ceil(*blocksize); if used_blocks > *total_blocks { panic!( diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d2818d04dc..1066d165cd 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -20,7 +20,6 @@ use enumset::EnumSet; use futures::stream::FuturesUnordered; use futures::StreamExt; use pageserver_api::models; -use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::LsnLease; use pageserver_api::models::TimelineArchivalState; use pageserver_api::models::TimelineState; @@ -67,7 +66,7 @@ use self::metadata::TimelineMetadata; use self::mgr::GetActiveTenantError; use self::mgr::GetTenantError; use self::remote_timeline_client::upload::upload_index_part; -use self::remote_timeline_client::RemoteTimelineClient; +use self::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError}; use self::timeline::uninit::TimelineCreateGuard; use self::timeline::uninit::TimelineExclusionError; use self::timeline::uninit::UninitializedTimeline; @@ -493,6 +492,8 @@ pub struct OffloadedTimeline { pub tenant_shard_id: TenantShardId, pub timeline_id: TimelineId, pub ancestor_timeline_id: Option, + /// Whether to retain the branch lsn at the ancestor or not + pub ancestor_retain_lsn: Option, // TODO: once we persist offloaded state, 
make this lazily constructed pub remote_client: Arc, @@ -504,10 +505,14 @@ pub struct OffloadedTimeline { impl OffloadedTimeline { fn from_timeline(timeline: &Timeline) -> Self { + let ancestor_retain_lsn = timeline + .get_ancestor_timeline_id() + .map(|_timeline_id| timeline.get_ancestor_lsn()); Self { tenant_shard_id: timeline.tenant_shard_id, timeline_id: timeline.timeline_id, ancestor_timeline_id: timeline.get_ancestor_timeline_id(), + ancestor_retain_lsn, remote_client: timeline.remote_client.clone(), delete_progress: timeline.delete_progress.clone(), @@ -515,6 +520,12 @@ impl OffloadedTimeline { } } +#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] +pub enum MaybeOffloaded { + Yes, + No, +} + #[derive(Clone)] pub enum TimelineOrOffloaded { Timeline(Arc), @@ -607,6 +618,9 @@ pub enum TimelineArchivalError { #[error("Timeout")] Timeout, + #[error("Cancelled")] + Cancelled, + #[error("ancestor is archived: {}", .0)] HasArchivedParent(TimelineId), @@ -617,7 +631,7 @@ pub enum TimelineArchivalError { AlreadyInProgress, #[error(transparent)] - Other(#[from] anyhow::Error), + Other(anyhow::Error), } impl Debug for TimelineArchivalError { @@ -625,6 +639,7 @@ impl Debug for TimelineArchivalError { match self { Self::NotFound => write!(f, "NotFound"), Self::Timeout => write!(f, "Timeout"), + Self::Cancelled => write!(f, "Cancelled"), Self::HasArchivedParent(p) => f.debug_tuple("HasArchivedParent").field(p).finish(), Self::HasUnarchivedChildren(c) => { f.debug_tuple("HasUnarchivedChildren").field(c).finish() @@ -784,7 +799,6 @@ impl Tenant { index_part: Option, metadata: TimelineMetadata, ancestor: Option>, - last_aux_file_policy: Option, _ctx: &RequestContext, ) -> anyhow::Result<()> { let tenant_id = self.tenant_shard_id; @@ -795,10 +809,6 @@ impl Tenant { ancestor.clone(), resources, CreateTimelineCause::Load, - // This could be derived from ancestor branch + index part. 
Though the only caller of `timeline_init_and_sync` is `load_remote_timeline`, - // there will potentially be other caller of this function in the future, and we don't know whether `index_part` or `ancestor` takes precedence. - // Therefore, we pass this field explicitly for now, and remove it once we fully migrate to aux file v2. - last_aux_file_policy, )?; let disk_consistent_lsn = timeline.get_disk_consistent_lsn(); anyhow::ensure!( @@ -813,10 +823,6 @@ impl Tenant { if let Some(index_part) = index_part.as_ref() { timeline.remote_client.init_upload_queue(index_part)?; - - timeline - .last_aux_file_policy - .store(index_part.last_aux_file_policy()); } else { // No data on the remote storage, but we have local metadata file. We can end up // here with timeline_create being interrupted before finishing index part upload. @@ -1387,15 +1393,12 @@ impl Tenant { None }; - let last_aux_file_policy = index_part.last_aux_file_policy(); - self.timeline_init_and_sync( timeline_id, resources, Some(index_part), remote_metadata, ancestor, - last_aux_file_policy, ctx, ) .await @@ -1538,8 +1541,10 @@ impl Tenant { async fn unoffload_timeline( self: &Arc, timeline_id: TimelineId, + broker_client: storage_broker::BrokerClientChannel, ctx: RequestContext, ) -> Result, TimelineArchivalError> { + info!("unoffloading timeline"); let cancel = self.cancel.clone(); let timeline_preload = self .load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel) @@ -1554,6 +1559,7 @@ impl Tenant { error!(%timeline_id, "index_part not found on remote"); return Err(TimelineArchivalError::NotFound); } + Err(DownloadError::Cancelled) => return Err(TimelineArchivalError::Cancelled), Err(e) => { // Some (possibly ephemeral) error happened during index_part download. warn!(%timeline_id, "Failed to load index_part from remote storage, failed creation? 
({e})"); @@ -1584,26 +1590,40 @@ impl Tenant { "failed to load remote timeline {} for tenant {}", timeline_id, self.tenant_shard_id ) - })?; + }) + .map_err(TimelineArchivalError::Other)?; let timelines = self.timelines.lock().unwrap(); - if let Some(timeline) = timelines.get(&timeline_id) { - let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap(); - if offloaded_timelines.remove(&timeline_id).is_none() { - warn!("timeline already removed from offloaded timelines"); - } - Ok(Arc::clone(timeline)) - } else { + let Some(timeline) = timelines.get(&timeline_id) else { warn!("timeline not available directly after attach"); - Err(TimelineArchivalError::Other(anyhow::anyhow!( + return Err(TimelineArchivalError::Other(anyhow::anyhow!( "timeline not available directly after attach" - ))) + ))); + }; + let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap(); + if offloaded_timelines.remove(&timeline_id).is_none() { + warn!("timeline already removed from offloaded timelines"); } + + // Activate the timeline (if it makes sense) + if !(timeline.is_broken() || timeline.is_stopping()) { + let background_jobs_can_start = None; + timeline.activate( + self.clone(), + broker_client.clone(), + background_jobs_can_start, + &ctx, + ); + } + + info!("timeline unoffloading complete"); + Ok(Arc::clone(timeline)) } pub(crate) async fn apply_timeline_archival_config( self: &Arc, timeline_id: TimelineId, new_state: TimelineArchivalState, + broker_client: storage_broker::BrokerClientChannel, ctx: RequestContext, ) -> Result<(), TimelineArchivalError> { info!("setting timeline archival config"); @@ -1644,18 +1664,29 @@ impl Tenant { Some(Arc::clone(timeline)) }; - // Second part: unarchive timeline (if needed) + // Second part: unoffload timeline (if needed) let timeline = if let Some(timeline) = timeline_or_unarchive_offloaded { timeline } else { // Turn offloaded timeline into a non-offloaded one - self.unoffload_timeline(timeline_id, ctx).await? 
+ self.unoffload_timeline(timeline_id, broker_client, ctx) + .await? }; // Third part: upload new timeline archival state and block until it is present in S3 - let upload_needed = timeline + let upload_needed = match timeline .remote_client - .schedule_index_upload_for_timeline_archival_state(new_state)?; + .schedule_index_upload_for_timeline_archival_state(new_state) + { + Ok(upload_needed) => upload_needed, + Err(e) => { + if timeline.cancel.is_cancelled() { + return Err(TimelineArchivalError::Cancelled); + } else { + return Err(TimelineArchivalError::Other(e)); + } + } + }; if upload_needed { info!("Uploading new state"); @@ -1666,11 +1697,33 @@ impl Tenant { tracing::warn!("reached timeout for waiting on upload queue"); return Err(TimelineArchivalError::Timeout); }; - v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?; + v.map_err(|e| match e { + WaitCompletionError::NotInitialized(e) => { + TimelineArchivalError::Other(anyhow::anyhow!(e)) + } + WaitCompletionError::UploadQueueShutDownOrStopped => { + TimelineArchivalError::Cancelled + } + })?; } Ok(()) } + pub fn get_offloaded_timeline( + &self, + timeline_id: TimelineId, + ) -> Result, GetTimelineError> { + self.timelines_offloaded + .lock() + .unwrap() + .get(&timeline_id) + .map(Arc::clone) + .ok_or(GetTimelineError::NotFound { + tenant_id: self.tenant_shard_id, + timeline_id, + }) + } + pub(crate) fn tenant_shard_id(&self) -> TenantShardId { self.tenant_shard_id } @@ -1758,7 +1811,6 @@ impl Tenant { create_guard, initdb_lsn, None, - None, ) .await } @@ -2121,7 +2173,8 @@ impl Tenant { .iter() .any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(*timeline_id)) }; - let can_offload = can_offload && has_no_unoffloaded_children; + let can_offload = + can_offload && has_no_unoffloaded_children && self.conf.timeline_offloading; if (is_active, can_offload) == (false, false) { None } else { @@ -2206,6 +2259,13 @@ impl Tenant { } } + pub fn timeline_has_no_attached_children(&self, timeline_id: 
TimelineId) -> bool { + let timelines = self.timelines.lock().unwrap(); + !timelines + .iter() + .any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(timeline_id)) + } + pub fn current_state(&self) -> TenantState { self.state.borrow().clone() } @@ -2253,12 +2313,13 @@ impl Tenant { if activating { let timelines_accessor = self.timelines.lock().unwrap(); + let timelines_offloaded_accessor = self.timelines_offloaded.lock().unwrap(); let timelines_to_activate = timelines_accessor .values() .filter(|timeline| !(timeline.is_broken() || timeline.is_stopping())); // Before activation, populate each Timeline's GcInfo with information about its children - self.initialize_gc_info(&timelines_accessor); + self.initialize_gc_info(&timelines_accessor, &timelines_offloaded_accessor); // Spawn gc and compaction loops. The loops will shut themselves // down when they notice that the tenant is inactive. @@ -2957,7 +3018,6 @@ impl Tenant { ancestor: Option>, resources: TimelineResources, cause: CreateTimelineCause, - last_aux_file_policy: Option, ) -> anyhow::Result> { let state = match cause { CreateTimelineCause::Load => { @@ -2986,7 +3046,6 @@ impl Tenant { resources, pg_version, state, - last_aux_file_policy, self.attach_wal_lag_cooldown.clone(), self.cancel.child_token(), ); @@ -3294,10 +3353,11 @@ impl Tenant { /// Populate all Timelines' `GcInfo` with information about their children. We do not set the /// PITR cutoffs here, because that requires I/O: this is done later, before GC, by [`Self::refresh_gc_info_internal`] /// - /// Subsequently, parent-child relationships are updated incrementally during timeline creation/deletion. + /// Subsequently, parent-child relationships are updated incrementally inside [`Timeline::new`] and [`Timeline::drop`]. 
fn initialize_gc_info( &self, timelines: &std::sync::MutexGuard>>, + timelines_offloaded: &std::sync::MutexGuard>>, ) { // This function must be called before activation: after activation timeline create/delete operations // might happen, and this function is not safe to run concurrently with those. @@ -3305,20 +3365,37 @@ impl Tenant { // Scan all timelines. For each timeline, remember the timeline ID and // the branch point where it was created. - let mut all_branchpoints: BTreeMap> = BTreeMap::new(); + let mut all_branchpoints: BTreeMap> = + BTreeMap::new(); timelines.iter().for_each(|(timeline_id, timeline_entry)| { if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() { let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default(); - ancestor_children.push((timeline_entry.get_ancestor_lsn(), *timeline_id)); + ancestor_children.push(( + timeline_entry.get_ancestor_lsn(), + *timeline_id, + MaybeOffloaded::No, + )); } }); + timelines_offloaded + .iter() + .for_each(|(timeline_id, timeline_entry)| { + let Some(ancestor_timeline_id) = &timeline_entry.ancestor_timeline_id else { + return; + }; + let Some(retain_lsn) = timeline_entry.ancestor_retain_lsn else { + return; + }; + let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default(); + ancestor_children.push((retain_lsn, *timeline_id, MaybeOffloaded::Yes)); + }); // The number of bytes we always keep, irrespective of PITR: this is a constant across timelines let horizon = self.get_gc_horizon(); // Populate each timeline's GcInfo with information about its child branches for timeline in timelines.values() { - let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints + let mut branchpoints: Vec<(Lsn, TimelineId, MaybeOffloaded)> = all_branchpoints .remove(&timeline.timeline_id) .unwrap_or_default(); @@ -3627,7 +3704,6 @@ impl Tenant { timeline_create_guard, start_lsn + 1, Some(Arc::clone(src_timeline)), - 
src_timeline.last_aux_file_policy.load(), ) .await?; @@ -3821,7 +3897,6 @@ impl Tenant { timeline_create_guard, pgdata_lsn, None, - None, ) .await?; @@ -3893,7 +3968,6 @@ impl Tenant { create_guard: TimelineCreateGuard<'a>, start_lsn: Lsn, ancestor: Option>, - last_aux_file_policy: Option, ) -> anyhow::Result> { let tenant_shard_id = self.tenant_shard_id; @@ -3909,7 +3983,6 @@ impl Tenant { ancestor, resources, CreateTimelineCause::Load, - last_aux_file_policy, ) .context("Failed to create timeline data structure")?; @@ -4507,7 +4580,6 @@ mod tests { use super::*; use crate::keyspace::KeySpaceAccum; - use crate::pgdatadir_mapping::AuxFilesDirectory; use crate::repository::{Key, Value}; use crate::tenant::harness::*; use crate::tenant::timeline::CompactFlags; @@ -4516,7 +4588,7 @@ mod tests { use bytes::{Bytes, BytesMut}; use hex_literal::hex; use itertools::Itertools; - use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE}; + use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; use rand::{thread_rng, Rng}; @@ -4525,7 +4597,6 @@ mod tests { use tests::timeline::{GetVectoredError, ShutdownMode}; use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; use timeline::{DeltaLayerTestDesc, GcInfo}; - use utils::bin_ser::BeSer; use utils::id::TenantId; static TEST_KEY: Lazy = @@ -4878,7 +4949,10 @@ mod tests { { let branchpoints = &tline.gc_info.read().unwrap().retain_lsns; assert_eq!(branchpoints.len(), 1); - assert_eq!(branchpoints[0], (Lsn(0x40), NEW_TIMELINE_ID)); + assert_eq!( + branchpoints[0], + (Lsn(0x40), NEW_TIMELINE_ID, MaybeOffloaded::No) + ); } // You can read the key from the child branch even though the parent is @@ -6326,16 +6400,9 @@ mod tests { } #[tokio::test] - async fn test_branch_copies_dirty_aux_file_flag() { - let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag") - 
.await - .unwrap(); + async fn test_aux_file_e2e() { + let harness = TenantHarness::create("test_aux_file_e2e").await.unwrap(); - // the default aux file policy to switch is v2 if not set by the admins - assert_eq!( - harness.tenant_conf.switch_aux_file_policy, - AuxFilePolicy::default_tenant_config() - ); let (tenant, ctx) = harness.load().await; let mut lsn = Lsn(0x08); @@ -6345,9 +6412,6 @@ mod tests { .await .unwrap(); - // no aux file is written at this point, so the persistent flag should be unset - assert_eq!(tline.last_aux_file_policy.load(), None); - { lsn += 8; let mut modification = tline.begin_modification(lsn); @@ -6358,30 +6422,6 @@ mod tests { modification.commit(&ctx).await.unwrap(); } - // there is no tenant manager to pass the configuration through, so lets mimic it - tenant.set_new_location_config( - AttachedTenantConf::try_from(LocationConf::attached_single( - TenantConfOpt { - switch_aux_file_policy: Some(AuxFilePolicy::V2), - ..Default::default() - }, - tenant.generation, - &pageserver_api::models::ShardParameters::default(), - )) - .unwrap(), - ); - - assert_eq!( - tline.get_switch_aux_file_policy(), - AuxFilePolicy::V2, - "wanted state has been updated" - ); - assert_eq!( - tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::V2), - "aux file is written with switch_aux_file_policy unset (which is v2), so we should use v2 there" - ); - // we can read everything from the storage let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); assert_eq!( @@ -6399,12 +6439,6 @@ mod tests { modification.commit(&ctx).await.unwrap(); } - assert_eq!( - tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::V2), - "keep v2 storage format when new files are written" - ); - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); assert_eq!( files.get("pg_logical/mappings/test2"), @@ -6416,321 +6450,9 @@ mod tests { .await .unwrap(); - // child copies the last flag even if that is not on remote storage yet - 
assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2); - assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V2)); - let files = child.list_aux_files(lsn, &ctx).await.unwrap(); assert_eq!(files.get("pg_logical/mappings/test1"), None); assert_eq!(files.get("pg_logical/mappings/test2"), None); - - // even if we crash here without flushing parent timeline with it's new - // last_aux_file_policy we are safe, because child was never meant to access ancestor's - // files. the ancestor can even switch back to V1 because of a migration safely. - } - - #[tokio::test] - async fn aux_file_policy_switch() { - let mut harness = TenantHarness::create("aux_file_policy_switch") - .await - .unwrap(); - harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::CrossValidation; // set to cross-validation mode - let (tenant, ctx) = harness.load().await; - - let mut lsn = Lsn(0x08); - - let tline: Arc = tenant - .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) - .await - .unwrap(); - - assert_eq!( - tline.last_aux_file_policy.load(), - None, - "no aux file is written so it should be unset" - ); - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - modification - .put_file("pg_logical/mappings/test1", b"first", &ctx) - .await - .unwrap(); - modification.commit(&ctx).await.unwrap(); - } - - // there is no tenant manager to pass the configuration through, so lets mimic it - tenant.set_new_location_config( - AttachedTenantConf::try_from(LocationConf::attached_single( - TenantConfOpt { - switch_aux_file_policy: Some(AuxFilePolicy::V2), - ..Default::default() - }, - tenant.generation, - &pageserver_api::models::ShardParameters::default(), - )) - .unwrap(), - ); - - assert_eq!( - tline.get_switch_aux_file_policy(), - AuxFilePolicy::V2, - "wanted state has been updated" - ); - assert_eq!( - tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::CrossValidation), - "dirty index_part.json reflected state is yet to be updated" - 
); - - // we can still read the auxfile v1 before we ingest anything new - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); - assert_eq!( - files.get("pg_logical/mappings/test1"), - Some(&bytes::Bytes::from_static(b"first")) - ); - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - modification - .put_file("pg_logical/mappings/test2", b"second", &ctx) - .await - .unwrap(); - modification.commit(&ctx).await.unwrap(); - } - - assert_eq!( - tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::V2), - "ingesting a file should apply the wanted switch state when applicable" - ); - - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); - assert_eq!( - files.get("pg_logical/mappings/test1"), - Some(&bytes::Bytes::from_static(b"first")), - "cross validation writes to both v1 and v2 so this should be available in v2" - ); - assert_eq!( - files.get("pg_logical/mappings/test2"), - Some(&bytes::Bytes::from_static(b"second")) - ); - - // mimic again by trying to flip it from V2 to V1 (not switched to while ingesting a file) - tenant.set_new_location_config( - AttachedTenantConf::try_from(LocationConf::attached_single( - TenantConfOpt { - switch_aux_file_policy: Some(AuxFilePolicy::V1), - ..Default::default() - }, - tenant.generation, - &pageserver_api::models::ShardParameters::default(), - )) - .unwrap(), - ); - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - modification - .put_file("pg_logical/mappings/test2", b"third", &ctx) - .await - .unwrap(); - modification.commit(&ctx).await.unwrap(); - } - - assert_eq!( - tline.get_switch_aux_file_policy(), - AuxFilePolicy::V1, - "wanted state has been updated again, even if invalid request" - ); - - assert_eq!( - tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::V2), - "ingesting a file should apply the wanted switch state when applicable" - ); - - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); - assert_eq!( - 
files.get("pg_logical/mappings/test1"), - Some(&bytes::Bytes::from_static(b"first")) - ); - assert_eq!( - files.get("pg_logical/mappings/test2"), - Some(&bytes::Bytes::from_static(b"third")) - ); - - // mimic again by trying to flip it from from V1 to V2 (not switched to while ingesting a file) - tenant.set_new_location_config( - AttachedTenantConf::try_from(LocationConf::attached_single( - TenantConfOpt { - switch_aux_file_policy: Some(AuxFilePolicy::V2), - ..Default::default() - }, - tenant.generation, - &pageserver_api::models::ShardParameters::default(), - )) - .unwrap(), - ); - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - modification - .put_file("pg_logical/mappings/test3", b"last", &ctx) - .await - .unwrap(); - modification.commit(&ctx).await.unwrap(); - } - - assert_eq!(tline.get_switch_aux_file_policy(), AuxFilePolicy::V2); - - assert_eq!(tline.last_aux_file_policy.load(), Some(AuxFilePolicy::V2)); - - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); - assert_eq!( - files.get("pg_logical/mappings/test1"), - Some(&bytes::Bytes::from_static(b"first")) - ); - assert_eq!( - files.get("pg_logical/mappings/test2"), - Some(&bytes::Bytes::from_static(b"third")) - ); - assert_eq!( - files.get("pg_logical/mappings/test3"), - Some(&bytes::Bytes::from_static(b"last")) - ); - } - - #[tokio::test] - async fn aux_file_policy_force_switch() { - let mut harness = TenantHarness::create("aux_file_policy_force_switch") - .await - .unwrap(); - harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V1; - let (tenant, ctx) = harness.load().await; - - let mut lsn = Lsn(0x08); - - let tline: Arc = tenant - .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) - .await - .unwrap(); - - assert_eq!( - tline.last_aux_file_policy.load(), - None, - "no aux file is written so it should be unset" - ); - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - modification - .put_file("pg_logical/mappings/test1", 
b"first", &ctx) - .await - .unwrap(); - modification.commit(&ctx).await.unwrap(); - } - - tline.do_switch_aux_policy(AuxFilePolicy::V2).unwrap(); - - assert_eq!( - tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::V2), - "dirty index_part.json reflected state is yet to be updated" - ); - - // lose all data from v1 - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); - assert_eq!(files.get("pg_logical/mappings/test1"), None); - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - modification - .put_file("pg_logical/mappings/test2", b"second", &ctx) - .await - .unwrap(); - modification.commit(&ctx).await.unwrap(); - } - - // read data ingested in v2 - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); - assert_eq!( - files.get("pg_logical/mappings/test2"), - Some(&bytes::Bytes::from_static(b"second")) - ); - // lose all data from v1 - assert_eq!(files.get("pg_logical/mappings/test1"), None); - } - - #[tokio::test] - async fn aux_file_policy_auto_detect() { - let mut harness = TenantHarness::create("aux_file_policy_auto_detect") - .await - .unwrap(); - harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V2; // set to cross-validation mode - let (tenant, ctx) = harness.load().await; - - let mut lsn = Lsn(0x08); - - let tline: Arc = tenant - .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) - .await - .unwrap(); - - assert_eq!( - tline.last_aux_file_policy.load(), - None, - "no aux file is written so it should be unset" - ); - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { - files: vec![( - "test_file".to_string(), - Bytes::copy_from_slice(b"test_file"), - )] - .into_iter() - .collect(), - }) - .unwrap(); - modification.put_for_test(AUX_FILES_KEY, Value::Image(Bytes::from(buf))); - modification.commit(&ctx).await.unwrap(); - } - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - modification - 
.put_file("pg_logical/mappings/test1", b"first", &ctx) - .await - .unwrap(); - modification.commit(&ctx).await.unwrap(); - } - - assert_eq!( - tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::V1), - "keep using v1 because there are aux files writting with v1" - ); - - // we can still read the auxfile v1 - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); - assert_eq!( - files.get("pg_logical/mappings/test1"), - Some(&bytes::Bytes::from_static(b"first")) - ); - assert_eq!( - files.get("test_file"), - Some(&bytes::Bytes::from_static(b"test_file")) - ); } #[tokio::test] @@ -8261,8 +7983,8 @@ mod tests { let mut guard = tline.gc_info.write().unwrap(); *guard = GcInfo { retain_lsns: vec![ - (Lsn(0x10), tline.timeline_id), - (Lsn(0x20), tline.timeline_id), + (Lsn(0x10), tline.timeline_id, MaybeOffloaded::No), + (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No), ], cutoffs: GcCutoffs { time: Lsn(0x30), @@ -8489,8 +8211,8 @@ mod tests { let mut guard = tline.gc_info.write().unwrap(); *guard = GcInfo { retain_lsns: vec![ - (Lsn(0x10), tline.timeline_id), - (Lsn(0x20), tline.timeline_id), + (Lsn(0x10), tline.timeline_id, MaybeOffloaded::No), + (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No), ], cutoffs: GcCutoffs { time: Lsn(0x30), @@ -8723,7 +8445,7 @@ mod tests { // Update GC info let mut guard = parent_tline.gc_info.write().unwrap(); *guard = GcInfo { - retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id)], + retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id, MaybeOffloaded::No)], cutoffs: GcCutoffs { time: Lsn(0x10), space: Lsn(0x10), @@ -8737,7 +8459,7 @@ mod tests { // Update GC info let mut guard = branch_tline.gc_info.write().unwrap(); *guard = GcInfo { - retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id)], + retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id, MaybeOffloaded::No)], cutoffs: GcCutoffs { time: Lsn(0x50), space: Lsn(0x50), diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 
68a27fbc20..2bd7f2d619 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -52,13 +52,13 @@ impl From> for BlockLease<'static> { } #[cfg(test)] -impl<'a> From> for BlockLease<'a> { +impl From> for BlockLease<'_> { fn from(value: std::sync::Arc<[u8; PAGE_SZ]>) -> Self { BlockLease::Arc(value) } } -impl<'a> Deref for BlockLease<'a> { +impl Deref for BlockLease<'_> { type Target = [u8; PAGE_SZ]; fn deref(&self) -> &Self::Target { diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index 0107b0ac7e..b302cbc975 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -131,7 +131,7 @@ struct OnDiskNode<'a, const L: usize> { values: &'a [u8], } -impl<'a, const L: usize> OnDiskNode<'a, L> { +impl OnDiskNode<'_, L> { /// /// Interpret a PAGE_SZ page as a node. /// diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 9d9852c525..0567f8f3a7 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -11,6 +11,7 @@ use pageserver_api::shard::{ }; use pageserver_api::upcall_api::ReAttachResponseTenant; use rand::{distributions::Alphanumeric, Rng}; +use remote_storage::TimeoutOrCancel; use std::borrow::Cow; use std::cmp::Ordering; use std::collections::{BTreeMap, HashMap, HashSet}; @@ -1350,47 +1351,17 @@ impl TenantManager { } } - async fn delete_tenant_remote( - &self, - tenant_shard_id: TenantShardId, - ) -> Result<(), DeleteTenantError> { - let remote_path = remote_tenant_path(&tenant_shard_id); - let mut keys_stream = self.resources.remote_storage.list_streaming( - Some(&remote_path), - remote_storage::ListingMode::NoDelimiter, - None, - &self.cancel, - ); - while let Some(chunk) = keys_stream.next().await { - let keys = match chunk { - Ok(listing) => listing.keys, - Err(remote_storage::DownloadError::Cancelled) => { - return Err(DeleteTenantError::Cancelled) - } - Err(remote_storage::DownloadError::NotFound) => return 
Ok(()), - Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))), - }; - - if keys.is_empty() { - tracing::info!("Remote storage already deleted"); - } else { - tracing::info!("Deleting {} keys from remote storage", keys.len()); - let keys = keys.into_iter().map(|o| o.key).collect::>(); - self.resources - .remote_storage - .delete_objects(&keys, &self.cancel) - .await?; - } - } - - Ok(()) - } - /// If a tenant is attached, detach it. Then remove its data from remote storage. /// /// A tenant is considered deleted once it is gone from remote storage. It is the caller's /// responsibility to avoid trying to attach the tenant again or use it any way once deletion /// has started: this operation is not atomic, and must be retried until it succeeds. + /// + /// As a special case, if an unsharded tenant ID is given for a sharded tenant, it will remove + /// all tenant shards in remote storage (removing all paths with the tenant prefix). The storage + /// controller uses this to purge all remote tenant data, including any stale parent shards that + /// may remain after splits. Ideally, this special case would be handled elsewhere. See: + /// . pub(crate) async fn delete_tenant( &self, tenant_shard_id: TenantShardId, @@ -1442,25 +1413,29 @@ impl TenantManager { // in 500 responses to delete requests. // - We keep the `SlotGuard` during this I/O, so that if a concurrent delete request comes in, it will // 503/retry, rather than kicking off a wasteful concurrent deletion. - match backoff::retry( - || async move { self.delete_tenant_remote(tenant_shard_id).await }, - |e| match e { - DeleteTenantError::Cancelled => true, - DeleteTenantError::SlotError(_) => { - unreachable!("Remote deletion doesn't touch slots") - } - _ => false, + // NB: this also deletes partial prefixes, i.e. a path will delete all + // _/* objects. See method comment for why. 
+ backoff::retry( + || async move { + self.resources + .remote_storage + .delete_prefix(&remote_tenant_path(&tenant_shard_id), &self.cancel) + .await }, + |_| false, // backoff::retry handles cancellation 1, 3, &format!("delete_tenant[tenant_shard_id={tenant_shard_id}]"), &self.cancel, ) .await - { - Some(r) => r, - None => Err(DeleteTenantError::Cancelled), - } + .unwrap_or(Err(TimeoutOrCancel::Cancel.into())) + .map_err(|err| { + if TimeoutOrCancel::caused_by_cancel(&err) { + return DeleteTenantError::Cancelled; + } + DeleteTenantError::Other(err) + }) } #[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))] diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 1f9ae40af5..14b894d17c 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -187,7 +187,7 @@ use camino::Utf8Path; use chrono::{NaiveDateTime, Utc}; pub(crate) use download::download_initdb_tar_zst; -use pageserver_api::models::{AuxFilePolicy, TimelineArchivalState}; +use pageserver_api::models::TimelineArchivalState; use pageserver_api::shard::{ShardIndex, TenantShardId}; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; @@ -505,7 +505,7 @@ impl RemoteTimelineClient { }, ); - let (index_part, _index_generation) = download::download_index_part( + let (index_part, index_generation, index_last_modified) = download::download_index_part( &self.storage_impl, &self.tenant_shard_id, &self.timeline_id, @@ -519,6 +519,49 @@ impl RemoteTimelineClient { ) .await?; + // Defense in depth: monotonicity of generation numbers is an important correctness guarantee, so when we see a very + // old index, we do extra checks in case this is the result of backward time-travel of the generation number (e.g. 
+ // in case of a bug in the service that issues generation numbers). Indices are allowed to be old, but we expect that + // when we load an old index we are loading the _latest_ index: if we are asked to load an old index and there is + // also a newer index available, that is surprising. + const INDEX_AGE_CHECKS_THRESHOLD: Duration = Duration::from_secs(14 * 24 * 3600); + let index_age = index_last_modified.elapsed().unwrap_or_else(|e| { + if e.duration() > Duration::from_secs(5) { + // We only warn if the S3 clock and our local clock are >5s out: because this is a low resolution + // timestamp, it is common to be out by at least 1 second. + tracing::warn!("Index has modification time in the future: {e}"); + } + Duration::ZERO + }); + if index_age > INDEX_AGE_CHECKS_THRESHOLD { + tracing::info!( + ?index_generation, + age = index_age.as_secs_f64(), + "Loaded an old index, checking for other indices..." + ); + + // Find the highest-generation index + let (_latest_index_part, latest_index_generation, latest_index_mtime) = + download::download_index_part( + &self.storage_impl, + &self.tenant_shard_id, + &self.timeline_id, + Generation::MAX, + cancel, + ) + .await?; + + if latest_index_generation > index_generation { + // Unexpected! Why are we loading such an old index if a more recent one exists? + tracing::warn!( + ?index_generation, + ?latest_index_generation, + ?latest_index_mtime, + "Found a newer index while loading an old one" + ); + } + } + if index_part.deleted_at.is_some() { Ok(MaybeDeletedIndexPart::Deleted(index_part)) } else { @@ -628,18 +671,6 @@ impl RemoteTimelineClient { Ok(()) } - /// Launch an index-file upload operation in the background, with only the `aux_file_policy` flag updated. 
- pub(crate) fn schedule_index_upload_for_aux_file_policy_update( - self: &Arc, - last_aux_file_policy: Option, - ) -> anyhow::Result<()> { - let mut guard = self.upload_queue.lock().unwrap(); - let upload_queue = guard.initialized_mut()?; - upload_queue.dirty.last_aux_file_policy = last_aux_file_policy; - self.schedule_index_upload(upload_queue)?; - Ok(()) - } - /// Launch an index-file upload operation in the background, with only the `archived_at` field updated. /// /// Returns whether it is required to wait for the queue to be empty to ensure that the change is uploaded, @@ -2151,7 +2182,7 @@ pub(crate) struct UploadQueueAccessor<'a> { inner: std::sync::MutexGuard<'a, UploadQueue>, } -impl<'a> UploadQueueAccessor<'a> { +impl UploadQueueAccessor<'_> { pub(crate) fn latest_uploaded_index_part(&self) -> &IndexPart { match &*self.inner { UploadQueue::Initialized(x) => &x.clean.0, diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 692e4d3096..b5d4b0f0bb 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -6,6 +6,7 @@ use std::collections::HashSet; use std::future::Future; use std::str::FromStr; +use std::time::SystemTime; use anyhow::{anyhow, Context}; use camino::{Utf8Path, Utf8PathBuf}; @@ -343,10 +344,10 @@ async fn do_download_index_part( timeline_id: &TimelineId, index_generation: Generation, cancel: &CancellationToken, -) -> Result<(IndexPart, Generation), DownloadError> { +) -> Result<(IndexPart, Generation, SystemTime), DownloadError> { let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation); - let index_part_bytes = download_retry_forever( + let (index_part_bytes, index_part_mtime) = download_retry_forever( || async { let download = storage .download(&remote_path, &DownloadOpts::default(), cancel) @@ -359,7 +360,7 @@ async fn do_download_index_part( 
tokio::io::copy_buf(&mut stream, &mut bytes).await?; - Ok(bytes) + Ok((bytes, download.last_modified)) }, &format!("download {remote_path:?}"), cancel, @@ -370,7 +371,7 @@ async fn do_download_index_part( .with_context(|| format!("deserialize index part file at {remote_path:?}")) .map_err(DownloadError::Other)?; - Ok((index_part, index_generation)) + Ok((index_part, index_generation, index_part_mtime)) } /// index_part.json objects are suffixed with a generation number, so we cannot @@ -385,7 +386,7 @@ pub(crate) async fn download_index_part( timeline_id: &TimelineId, my_generation: Generation, cancel: &CancellationToken, -) -> Result<(IndexPart, Generation), DownloadError> { +) -> Result<(IndexPart, Generation, SystemTime), DownloadError> { debug_assert_current_span_has_tenant_and_timeline_id(); if my_generation.is_none() { diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index c51ff54919..3a74a4ed11 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -133,10 +133,6 @@ impl IndexPart { pub(crate) fn example() -> Self { Self::empty(TimelineMetadata::example()) } - - pub(crate) fn last_aux_file_policy(&self) -> Option { - self.last_aux_file_policy - } } /// Metadata gathered for each of the layer files. diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index 0aad5bf392..e680fd705b 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -108,7 +108,6 @@ impl scheduler::Completion for WriteComplete { /// when we last did a write. We only populate this after doing at least one /// write for a tenant -- this avoids holding state for tenants that have /// uploads disabled. 
- struct UploaderTenantState { // This Weak only exists to enable culling idle instances of this type // when the Tenant has been deallocated. diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 41d558d3f6..4a4c698b56 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -12,7 +12,7 @@ use crate::context::RequestContext; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use super::{GcError, LogicalSizeCalculationCause, Tenant}; -use crate::tenant::Timeline; +use crate::tenant::{MaybeOffloaded, Timeline}; use utils::id::TimelineId; use utils::lsn::Lsn; @@ -264,10 +264,12 @@ pub(super) async fn gather_inputs( let mut lsns: Vec<(Lsn, LsnKind)> = gc_info .retain_lsns .iter() - .filter(|(lsn, _child_id)| lsn > &ancestor_lsn) + .filter(|(lsn, _child_id, is_offloaded)| { + lsn > &ancestor_lsn && *is_offloaded == MaybeOffloaded::No + }) .copied() // this assumes there are no other retain_lsns than the branchpoints - .map(|(lsn, _child_id)| (lsn, LsnKind::BranchPoint)) + .map(|(lsn, _child_id, _is_offloaded)| (lsn, LsnKind::BranchPoint)) .collect::>(); lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint))); diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 99bd0ece57..a229b59560 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -705,7 +705,7 @@ pub mod tests { /// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers. 
struct RangeDisplayDebug<'a, T: std::fmt::Display>(&'a Range); -impl<'a, T: std::fmt::Display> std::fmt::Debug for RangeDisplayDebug<'a, T> { +impl std::fmt::Debug for RangeDisplayDebug<'_, T> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}..{}", self.0.start, self.0.end) } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 42436699a7..4661369487 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -529,8 +529,7 @@ impl DeltaLayerWriterInner { key_end: Key, ctx: &RequestContext, ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { - let index_start_blk = - ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; + let index_start_blk = self.blob_writer.size().div_ceil(PAGE_SZ as u64) as u32; let mut file = self.blob_writer.into_inner(ctx).await?; diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 5773ae467f..06bcf0c7b4 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -828,8 +828,7 @@ impl ImageLayerWriterInner { ctx: &RequestContext, end_key: Option, ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { - let index_start_blk = - ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; + let index_start_blk = self.blob_writer.size().div_ceil(PAGE_SZ as u64) as u32; // Calculate compression ratio let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index bbb21b180e..38a7cd09af 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -341,6 +341,10 @@ impl Layer { Ok(()) } + pub(crate) async fn 
needs_download(&self) -> Result, std::io::Error> { + self.0.needs_download().await + } + /// Assuming the layer is already downloaded, returns a guard which will prohibit eviction /// while the guard exists. /// @@ -974,7 +978,7 @@ impl LayerInner { let timeline = self .timeline .upgrade() - .ok_or_else(|| DownloadError::TimelineShutdown)?; + .ok_or(DownloadError::TimelineShutdown)?; // count cancellations, which currently remain largely unexpected let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs index ffe7ca5f3e..8e750e1187 100644 --- a/pageserver/src/tenant/storage_layer/layer_name.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -339,7 +339,7 @@ impl<'de> serde::Deserialize<'de> for LayerName { struct LayerNameVisitor; -impl<'de> serde::de::Visitor<'de> for LayerNameVisitor { +impl serde::de::Visitor<'_> for LayerNameVisitor { type Value = LayerName; fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index 0831fd9530..f91e27241d 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -99,21 +99,21 @@ impl<'a> PeekableLayerIterRef<'a> { } } -impl<'a> std::cmp::PartialEq for IteratorWrapper<'a> { +impl std::cmp::PartialEq for IteratorWrapper<'_> { fn eq(&self, other: &Self) -> bool { self.cmp(other) == Ordering::Equal } } -impl<'a> std::cmp::Eq for IteratorWrapper<'a> {} +impl std::cmp::Eq for IteratorWrapper<'_> {} -impl<'a> std::cmp::PartialOrd for IteratorWrapper<'a> { +impl std::cmp::PartialOrd for IteratorWrapper<'_> { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl<'a> std::cmp::Ord for IteratorWrapper<'a> { +impl std::cmp::Ord for IteratorWrapper<'_> 
{ fn cmp(&self, other: &Self) -> std::cmp::Ordering { use std::cmp::Ordering; let a = self.peek_next_key_lsn_value(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2fd4e699cf..d67a139dfa 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -28,9 +28,9 @@ use pageserver_api::{ }, keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, models::{ - AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, CompactionAlgorithmSettings, - DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, - InMemoryLayerInfo, LayerMapInfo, LsnLease, TimelineState, + CompactionAlgorithm, CompactionAlgorithmSettings, DownloadRemoteLayersTaskInfo, + DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, + LsnLease, TimelineState, }, reltag::BlockNumber, shard::{ShardIdentity, ShardNumber, TenantShardId}, @@ -98,12 +98,12 @@ use crate::{ use crate::{ metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, }; -use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; -use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey}; use crate::{ - pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, + pgdatadir_mapping::DirectoryKind, virtual_file::{MaybeFatalIo, VirtualFile}, }; +use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; +use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey}; use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; use crate::config::PageServerConf; @@ -139,8 +139,10 @@ use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; use super::{ - config::TenantConf, storage_layer::inmemory_layer, storage_layer::LayerVisibilityHint, + config::TenantConf, + storage_layer::{inmemory_layer, 
LayerVisibilityHint}, upload_queue::NotInitialized, + MaybeOffloaded, }; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; @@ -204,11 +206,6 @@ pub struct TimelineResources { pub l0_flush_global_state: l0_flush::L0FlushGlobalState, } -pub(crate) struct AuxFilesState { - pub(crate) dir: Option, - pub(crate) n_deltas: usize, -} - /// The relation size cache caches relation sizes at the end of the timeline. It speeds up WAL /// ingestion considerably, because WAL ingestion needs to check on most records if the record /// implicitly extends the relation. At startup, `complete_as_of` is initialized to the current end @@ -411,15 +408,9 @@ pub struct Timeline { timeline_get_throttle: Arc>, - /// Keep aux directory cache to avoid it's reconstruction on each update - pub(crate) aux_files: tokio::sync::Mutex, - /// Size estimator for aux file v2 pub(crate) aux_file_size_estimator: AuxFileSizeEstimator, - /// Indicate whether aux file v2 storage is enabled. - pub(crate) last_aux_file_policy: AtomicAuxFilePolicy, - /// Some test cases directly place keys into the timeline without actually modifying the directory /// keys (i.e., DB_DIR). The test cases creating such keys will put the keyspaces here, so that /// these keys won't get garbage-collected during compaction/GC. This field only modifies the dense @@ -450,7 +441,7 @@ pub(crate) struct GcInfo { /// Currently, this includes all points where child branches have /// been forked off from. In the future, could also include /// explicit user-defined snapshot points. - pub(crate) retain_lsns: Vec<(Lsn, TimelineId)>, + pub(crate) retain_lsns: Vec<(Lsn, TimelineId, MaybeOffloaded)>, /// The cutoff coordinates, which are combined by selecting the minimum. 
pub(crate) cutoffs: GcCutoffs, @@ -467,8 +458,13 @@ impl GcInfo { self.cutoffs.select_min() } - pub(super) fn insert_child(&mut self, child_id: TimelineId, child_lsn: Lsn) { - self.retain_lsns.push((child_lsn, child_id)); + pub(super) fn insert_child( + &mut self, + child_id: TimelineId, + child_lsn: Lsn, + is_offloaded: MaybeOffloaded, + ) { + self.retain_lsns.push((child_lsn, child_id, is_offloaded)); self.retain_lsns.sort_by_key(|i| i.0); } @@ -1558,6 +1554,7 @@ impl Timeline { } /// Checks if the internal state of the timeline is consistent with it being able to be offloaded. + /// /// This is neccessary but not sufficient for offloading of the timeline as it might have /// child timelines that are not offloaded yet. pub(crate) fn can_offload(&self) -> bool { @@ -2004,14 +2001,6 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length_for_ts) } - pub(crate) fn get_switch_aux_file_policy(&self) -> AuxFilePolicy { - let tenant_conf = self.tenant_conf.load(); - tenant_conf - .tenant_conf - .switch_aux_file_policy - .unwrap_or(self.conf.default_tenant_conf.switch_aux_file_policy) - } - pub(crate) fn get_lazy_slru_download(&self) -> bool { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2144,7 +2133,6 @@ impl Timeline { resources: TimelineResources, pg_version: u32, state: TimelineState, - aux_file_policy: Option, attach_wal_lag_cooldown: Arc>, cancel: CancellationToken, ) -> Arc { @@ -2164,7 +2152,9 @@ impl Timeline { if let Some(ancestor) = &ancestor { let mut ancestor_gc_info = ancestor.gc_info.write().unwrap(); - ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn()); + // If we construct an explicit timeline object, it's obviously not offloaded + let is_offloaded = MaybeOffloaded::No; + ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn(), is_offloaded); } Arc::new_cyclic(|myself| { @@ -2272,15 +2262,8 @@ impl Timeline { timeline_get_throttle: resources.timeline_get_throttle, - aux_files: 
tokio::sync::Mutex::new(AuxFilesState { - dir: None, - n_deltas: 0, - }), - aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics), - last_aux_file_policy: AtomicAuxFilePolicy::new(aux_file_policy), - #[cfg(test)] extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())), @@ -2291,10 +2274,6 @@ impl Timeline { attach_wal_lag_cooldown, }; - if aux_file_policy == Some(AuxFilePolicy::V1) { - warn!("this timeline is using deprecated aux file policy V1 (when loading the timeline)"); - } - result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; @@ -3083,7 +3062,6 @@ impl Timeline { } impl Timeline { - #[allow(unknown_lints)] // doc_lazy_continuation is still a new lint #[allow(clippy::doc_lazy_continuation)] /// Get the data needed to reconstruct all keys in the provided keyspace /// @@ -4470,14 +4448,6 @@ impl Timeline { ) -> Result<(), detach_ancestor::Error> { detach_ancestor::complete(self, tenant, attempt, ctx).await } - - /// Switch aux file policy and schedule upload to the index part. - pub(crate) fn do_switch_aux_policy(&self, policy: AuxFilePolicy) -> anyhow::Result<()> { - self.last_aux_file_policy.store(Some(policy)); - self.remote_client - .schedule_index_upload_for_aux_file_policy_update(Some(policy))?; - Ok(()) - } } impl Drop for Timeline { @@ -4875,7 +4845,7 @@ impl Timeline { let retain_lsns = gc_info .retain_lsns .iter() - .map(|(lsn, _child_id)| *lsn) + .map(|(lsn, _child_id, _is_offloaded)| *lsn) .collect(); // Gets the maximum LSN that holds the valid lease. 
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 9f64471432..5588363330 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -29,6 +29,7 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; +use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::filter_iterator::FilterIterator; @@ -42,7 +43,7 @@ use crate::tenant::storage_layer::{ use crate::tenant::timeline::ImageLayerCreationOutcome; use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; use crate::tenant::timeline::{Layer, ResidentLayer}; -use crate::tenant::DeltaLayer; +use crate::tenant::{DeltaLayer, MaybeOffloaded}; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; use pageserver_api::config::tenant_conf_defaults::{ DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD, @@ -639,7 +640,10 @@ impl Timeline { let children = self.gc_info.read().unwrap().retain_lsns.clone(); let mut readable_points = Vec::with_capacity(children.len() + 1); - for (child_lsn, _child_timeline_id) in &children { + for (child_lsn, _child_timeline_id, is_offloaded) in &children { + if *is_offloaded == MaybeOffloaded::Yes { + continue; + } readable_points.push(*child_lsn); } readable_points.push(head_lsn); @@ -1688,6 +1692,45 @@ impl Timeline { unreachable!("key retention is empty") } + /// Check how much space is left on the disk + async fn check_available_space(self: &Arc) -> anyhow::Result { + let tenants_dir = self.conf.tenants_path(); + + let stat = Statvfs::get(&tenants_dir, None) + .context("statvfs failed, presumably directory got unlinked")?; + + let (avail_bytes, _) = stat.get_avail_total_bytes(); + + Ok(avail_bytes) + } + + /// Check if the compaction can proceed safely without 
running out of space. We assume the size + /// upper bound of the produced files of a compaction job is the same as all layers involved in + /// the compaction. Therefore, we need `2 * layers_to_be_compacted_size` at least to do a + /// compaction. + async fn check_compaction_space( + self: &Arc, + layer_selection: &[Layer], + ) -> anyhow::Result<()> { + let available_space = self.check_available_space().await?; + let mut remote_layer_size = 0; + let mut all_layer_size = 0; + for layer in layer_selection { + let needs_download = layer.needs_download().await?; + if needs_download.is_some() { + remote_layer_size += layer.layer_desc().file_size; + } + all_layer_size += layer.layer_desc().file_size; + } + let allocated_space = (available_space as f64 * 0.8) as u64; /* reserve 20% space for other tasks */ + if all_layer_size /* space needed for newly-generated file */ + remote_layer_size /* space for downloading layers */ > allocated_space + { + return Err(anyhow!("not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, required_space={}", + available_space, allocated_space, all_layer_size, remote_layer_size, all_layer_size + remote_layer_size)); + } + Ok(()) + } + /// An experimental compaction building block that combines compaction with garbage collection. /// /// The current implementation picks all delta + image layers that are below or intersecting with @@ -1741,7 +1784,7 @@ impl Timeline { let gc_info = self.gc_info.read().unwrap(); let mut retain_lsns_below_horizon = Vec::new(); let gc_cutoff = gc_info.cutoffs.select_min(); - for (lsn, _timeline_id) in &gc_info.retain_lsns { + for (lsn, _timeline_id, _is_offloaded) in &gc_info.retain_lsns { if lsn < &gc_cutoff { retain_lsns_below_horizon.push(*lsn); } @@ -1803,6 +1846,8 @@ impl Timeline { lowest_retain_lsn ); + self.check_compaction_space(&layer_selection).await?; + // Step 1: (In the future) construct a k-merge iterator over all layers. 
For now, simply collect all keys + LSNs. // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point. let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?) diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 305c5758cc..71b9e4e288 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -283,8 +283,6 @@ impl DeleteTimelineFlow { // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. CreateTimelineCause::Delete, - // Aux file policy is not needed for deletion, assuming deletion does not read aux keyspace - None, ) .context("create_timeline_struct")?; diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index fb906d906b..7e6084baaf 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -19,6 +19,9 @@ pub(crate) async fn offload_timeline( return Ok(()); }; + // Now that the Timeline is in Stopping state, request all the related tasks to shut down. 
+ timeline.shutdown(super::ShutdownMode::Hard).await; + // TODO extend guard mechanism above with method // to make deletions possible while offloading is in progress diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 6257035986..dfe2352310 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -74,7 +74,7 @@ impl<'a> BufView<'a> { } } -impl<'a> Deref for BufView<'a> { +impl Deref for BufView<'_> { type Target = [u8]; fn deref(&self) -> &Self::Target { @@ -85,7 +85,7 @@ impl<'a> Deref for BufView<'a> { } } -impl<'a> AsRef<[u8]> for BufView<'a> { +impl AsRef<[u8]> for BufView<'_> { fn as_ref(&self) -> &[u8] { match self { BufView::Slice(slice) => slice, @@ -197,11 +197,6 @@ pub(crate) struct ChunkedVectoredReadBuilder { max_read_size: Option, } -/// Computes x / d rounded up. -fn div_round_up(x: usize, d: usize) -> usize { - (x + (d - 1)) / d -} - impl ChunkedVectoredReadBuilder { const CHUNK_SIZE: usize = virtual_file::get_io_buffer_alignment(); /// Start building a new vectored read. 
@@ -221,7 +216,7 @@ impl ChunkedVectoredReadBuilder { .expect("First insertion always succeeds"); let start_blk_no = start_offset as usize / Self::CHUNK_SIZE; - let end_blk_no = div_round_up(end_offset as usize, Self::CHUNK_SIZE); + let end_blk_no = (end_offset as usize).div_ceil(Self::CHUNK_SIZE); Self { start_blk_no, end_blk_no, @@ -249,7 +244,7 @@ impl ChunkedVectoredReadBuilder { pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { tracing::trace!(start, end, "trying to extend"); let start_blk_no = start as usize / Self::CHUNK_SIZE; - let end_blk_no = div_round_up(end as usize, Self::CHUNK_SIZE); + let end_blk_no = (end as usize).div_ceil(Self::CHUNK_SIZE); let not_limited_by_max_read_size = { if let Some(max_read_size) = self.max_read_size { @@ -976,12 +971,4 @@ mod tests { round_trip_test_compressed(&blobs, true).await?; Ok(()) } - - #[test] - fn test_div_round_up() { - const CHUNK_SIZE: usize = 512; - assert_eq!(1, div_round_up(200, CHUNK_SIZE)); - assert_eq!(1, div_round_up(CHUNK_SIZE, CHUNK_SIZE)); - assert_eq!(2, div_round_up(CHUNK_SIZE + 1, CHUNK_SIZE)); - } } diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index be644fb4cd..daa8b99ab0 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -729,9 +729,9 @@ impl VirtualFileInner { *handle_guard = handle; - return Ok(FileGuard { + Ok(FileGuard { slot_guard: slot_guard.downgrade(), - }); + }) } pub fn remove(self) { diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index facf01004c..c067787f97 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -1,8 +1,7 @@ -use crate::pgdatadir_mapping::AuxFilesDirectory; use crate::walrecord::NeonWalRecord; use anyhow::Context; use byteorder::{ByteOrder, LittleEndian}; -use bytes::{BufMut, BytesMut}; +use bytes::BytesMut; use pageserver_api::key::Key; use pageserver_api::reltag::SlruKind; 
use postgres_ffi::pg_constants; @@ -13,7 +12,6 @@ use postgres_ffi::v14::nonrelfile_utils::{ }; use postgres_ffi::BLCKSZ; use tracing::*; -use utils::bin_ser::BeSer; use utils::lsn::Lsn; /// Can this request be served by neon redo functions @@ -236,13 +234,9 @@ pub(crate) fn apply_in_neon( LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid); } } - NeonWalRecord::AuxFile { file_path, content } => { - let mut dir = AuxFilesDirectory::des(page)?; - dir.upsert(file_path.clone(), content.clone()); - - page.clear(); - let mut writer = page.writer(); - dir.ser_into(&mut writer)?; + NeonWalRecord::AuxFile { .. } => { + // No-op: this record will never be created in aux v2. + warn!("AuxFile record should not be created in aux v2"); } #[cfg(test)] NeonWalRecord::Test { @@ -250,6 +244,7 @@ pub(crate) fn apply_in_neon( clear, will_init, } => { + use bytes::BufMut; if *will_init { assert!(*clear, "init record must be clear to ensure correctness"); } @@ -261,59 +256,3 @@ pub(crate) fn apply_in_neon( } Ok(()) } - -#[cfg(test)] -mod test { - use bytes::Bytes; - use pageserver_api::key::AUX_FILES_KEY; - - use super::*; - use std::collections::HashMap; - - /// Test [`apply_in_neon`]'s handling of NeonWalRecord::AuxFile - #[test] - fn apply_aux_file_deltas() -> anyhow::Result<()> { - let base_dir = AuxFilesDirectory { - files: HashMap::from([ - ("two".to_string(), Bytes::from_static(b"content0")), - ("three".to_string(), Bytes::from_static(b"contentX")), - ]), - }; - let base_image = AuxFilesDirectory::ser(&base_dir)?; - - let deltas = vec![ - // Insert - NeonWalRecord::AuxFile { - file_path: "one".to_string(), - content: Some(Bytes::from_static(b"content1")), - }, - // Update - NeonWalRecord::AuxFile { - file_path: "two".to_string(), - content: Some(Bytes::from_static(b"content99")), - }, - // Delete - NeonWalRecord::AuxFile { - file_path: "three".to_string(), - content: None, - }, - ]; - - let file_path = AUX_FILES_KEY; - let mut page = 
BytesMut::from_iter(base_image); - - for record in deltas { - apply_in_neon(&record, Lsn(8), file_path, &mut page)?; - } - - let reconstructed = AuxFilesDirectory::des(&page)?; - let expect = HashMap::from([ - ("one".to_string(), Bytes::from_static(b"content1")), - ("two".to_string(), Bytes::from_static(b"content99")), - ]); - - assert_eq!(reconstructed.files, expect); - - Ok(()) - } -} diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index d789526050..70b250d394 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -43,6 +43,7 @@ #include "hll.h" #include "bitmap.h" #include "neon.h" +#include "neon_perf_counters.h" #define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0) @@ -114,7 +115,9 @@ typedef struct FileCacheControl uint32 limit; /* shared copy of lfc_size_limit */ uint64 hits; uint64 misses; - uint64 writes; + uint64 writes; /* number of writes issued */ + uint64 time_read; /* time spent reading (us) */ + uint64 time_write; /* time spent writing (us) */ dlist_head lru; /* double linked list for LRU replacement * algorithm */ dlist_head holes; /* double linked list of punched holes */ @@ -270,6 +273,8 @@ lfc_shmem_startup(void) lfc_ctl->hits = 0; lfc_ctl->misses = 0; lfc_ctl->writes = 0; + lfc_ctl->time_read = 0; + lfc_ctl->time_write = 0; dlist_init(&lfc_ctl->lru); dlist_init(&lfc_ctl->holes); @@ -612,31 +617,34 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) /* remove the page from the cache */ entry->bitmap[chunk_offs >> 5] &= ~(1 << (chunk_offs & (32 - 1))); - /* - * If the chunk has no live entries, we can position the chunk to be - * recycled first. 
- */ - if (entry->bitmap[chunk_offs >> 5] == 0) + if (entry->access_count == 0) { - bool has_remaining_pages = false; - - for (int i = 0; i < CHUNK_BITMAP_SIZE; i++) - { - if (entry->bitmap[i] != 0) - { - has_remaining_pages = true; - break; - } - } - /* - * Put the entry at the position that is first to be reclaimed when we - * have no cached pages remaining in the chunk + * If the chunk has no live entries, we can position the chunk to be + * recycled first. */ - if (!has_remaining_pages) + if (entry->bitmap[chunk_offs >> 5] == 0) { - dlist_delete(&entry->list_node); - dlist_push_head(&lfc_ctl->lru, &entry->list_node); + bool has_remaining_pages = false; + + for (int i = 0; i < CHUNK_BITMAP_SIZE; i++) + { + if (entry->bitmap[i] != 0) + { + has_remaining_pages = true; + break; + } + } + + /* + * Put the entry at the position that is first to be reclaimed when we + * have no cached pages remaining in the chunk + */ + if (!has_remaining_pages) + { + dlist_delete(&entry->list_node); + dlist_push_head(&lfc_ctl->lru, &entry->list_node); + } } } @@ -701,6 +709,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); int iteration_hits = 0; int iteration_misses = 0; + uint64 io_time_us = 0; Assert(blocks_in_chunk > 0); for (int i = 0; i < blocks_in_chunk; i++) @@ -795,6 +804,13 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, lfc_ctl->misses += iteration_misses; pgBufferUsage.file_cache.hits += iteration_hits; pgBufferUsage.file_cache.misses += iteration_misses; + + if (iteration_hits) + { + lfc_ctl->time_read += io_time_us; + inc_page_cache_read_wait(io_time_us); + } + CriticalAssert(entry->access_count > 0); if (--entry->access_count == 0) dlist_push_tail(&lfc_ctl->lru, &entry->list_node); @@ -859,6 +875,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, struct iovec iov[PG_IOV_MAX]; int chunk_offs = blkno & 
(BLOCKS_PER_CHUNK - 1); int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); + instr_time io_start, io_end; Assert(blocks_in_chunk > 0); for (int i = 0; i < blocks_in_chunk; i++) @@ -947,12 +964,13 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, generation = lfc_ctl->generation; entry_offset = entry->offset; - lfc_ctl->writes += blocks_in_chunk; LWLockRelease(lfc_lock); pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE); + INSTR_TIME_SET_CURRENT(io_start); rc = pwritev(lfc_desc, iov, blocks_in_chunk, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + INSTR_TIME_SET_CURRENT(io_end); pgstat_report_wait_end(); if (rc != BLCKSZ * blocks_in_chunk) @@ -965,9 +983,17 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (lfc_ctl->generation == generation) { + uint64 time_spent_us; CriticalAssert(LFC_ENABLED()); /* Place entry to the head of LRU list */ CriticalAssert(entry->access_count > 0); + + lfc_ctl->writes += blocks_in_chunk; + INSTR_TIME_SUBTRACT(io_start, io_end); + time_spent_us = INSTR_TIME_GET_MICROSEC(io_start); + lfc_ctl->time_write += time_spent_us; + inc_page_cache_write_wait(time_spent_us); + if (--entry->access_count == 0) dlist_push_tail(&lfc_ctl->lru, &entry->list_node); diff --git a/pgxn/neon/neon_perf_counters.c b/pgxn/neon/neon_perf_counters.c index a497d387c8..05db187076 100644 --- a/pgxn/neon/neon_perf_counters.c +++ b/pgxn/neon/neon_perf_counters.c @@ -50,28 +50,52 @@ NeonPerfCountersShmemInit(void) } } -/* - * Count a GetPage wait operation. 
- */ -void -inc_getpage_wait(uint64 latency_us) +static inline void +inc_iohist(IOHistogram hist, uint64 latency_us) { int lo = 0; - int hi = NUM_GETPAGE_WAIT_BUCKETS - 1; + int hi = NUM_IO_WAIT_BUCKETS - 1; /* Find the right bucket with binary search */ while (lo < hi) { int mid = (lo + hi) / 2; - if (latency_us < getpage_wait_bucket_thresholds[mid]) + if (latency_us < io_wait_bucket_thresholds[mid]) hi = mid; else lo = mid + 1; } - MyNeonCounters->getpage_wait_us_bucket[lo]++; - MyNeonCounters->getpage_wait_us_sum += latency_us; - MyNeonCounters->getpage_wait_us_count++; + hist->wait_us_bucket[lo]++; + hist->wait_us_sum += latency_us; + hist->wait_us_count++; +} + +/* + * Count a GetPage wait operation. + */ +void +inc_getpage_wait(uint64 latency) +{ + inc_iohist(&MyNeonCounters->getpage_hist, latency); +} + +/* + * Count an LFC read wait operation. + */ +void +inc_page_cache_read_wait(uint64 latency) +{ + inc_iohist(&MyNeonCounters->file_cache_read_hist, latency); +} + +/* + * Count an LFC write wait operation. 
+ */ +void +inc_page_cache_write_wait(uint64 latency) +{ + inc_iohist(&MyNeonCounters->file_cache_write_hist, latency); } /* @@ -81,77 +105,91 @@ inc_getpage_wait(uint64 latency_us) typedef struct { - char *name; + const char *name; bool is_bucket; double bucket_le; double value; } metric_t; -static metric_t * -neon_perf_counters_to_metrics(neon_per_backend_counters *counters) +static int +histogram_to_metrics(IOHistogram histogram, + metric_t *metrics, + const char *count, + const char *sum, + const char *bucket) { -#define NUM_METRICS (2 + NUM_GETPAGE_WAIT_BUCKETS + 8) - metric_t *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t)); - uint64 bucket_accum; - int i = 0; + int i = 0; + uint64 bucket_accum = 0; - metrics[i].name = "getpage_wait_seconds_count"; + metrics[i].name = count; metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_wait_us_count; + metrics[i].value = (double) histogram->wait_us_count; i++; - metrics[i].name = "getpage_wait_seconds_sum"; + metrics[i].name = sum; metrics[i].is_bucket = false; - metrics[i].value = ((double) counters->getpage_wait_us_sum) / 1000000.0; + metrics[i].value = (double) histogram->wait_us_sum / 1000000.0; i++; - - bucket_accum = 0; - for (int bucketno = 0; bucketno < NUM_GETPAGE_WAIT_BUCKETS; bucketno++) + for (int bucketno = 0; bucketno < NUM_IO_WAIT_BUCKETS; bucketno++) { - uint64 threshold = getpage_wait_bucket_thresholds[bucketno]; + uint64 threshold = io_wait_bucket_thresholds[bucketno]; - bucket_accum += counters->getpage_wait_us_bucket[bucketno]; + bucket_accum += histogram->wait_us_bucket[bucketno]; - metrics[i].name = "getpage_wait_seconds_bucket"; + metrics[i].name = bucket; metrics[i].is_bucket = true; metrics[i].bucket_le = (threshold == UINT64_MAX) ? 
INFINITY : ((double) threshold) / 1000000.0; metrics[i].value = (double) bucket_accum; i++; } - metrics[i].name = "getpage_prefetch_requests_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_prefetch_requests_total; - i++; - metrics[i].name = "getpage_sync_requests_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_sync_requests_total; - i++; - metrics[i].name = "getpage_prefetch_misses_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_prefetch_misses_total; - i++; - metrics[i].name = "getpage_prefetch_discards_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_prefetch_discards_total; - i++; - metrics[i].name = "pageserver_requests_sent_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->pageserver_requests_sent_total; - i++; - metrics[i].name = "pageserver_disconnects_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->pageserver_disconnects_total; - i++; - metrics[i].name = "pageserver_send_flushes_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->pageserver_send_flushes_total; - i++; - metrics[i].name = "file_cache_hits_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->file_cache_hits_total; - i++; + + return i; +} + +static metric_t * +neon_perf_counters_to_metrics(neon_per_backend_counters *counters) +{ +#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 10) + metric_t *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t)); + int i = 0; + +#define APPEND_METRIC(_name) do { \ + metrics[i].name = #_name; \ + metrics[i].is_bucket = false; \ + metrics[i].value = (double) counters->_name; \ + i++; \ + } while (false) + + i += histogram_to_metrics(&counters->getpage_hist, &metrics[i], + "getpage_wait_seconds_count", + "getpage_wait_seconds_sum", + "getpage_wait_seconds_bucket"); + + 
APPEND_METRIC(getpage_prefetch_requests_total); + APPEND_METRIC(getpage_sync_requests_total); + APPEND_METRIC(getpage_prefetch_misses_total); + APPEND_METRIC(getpage_prefetch_discards_total); + APPEND_METRIC(pageserver_requests_sent_total); + APPEND_METRIC(pageserver_disconnects_total); + APPEND_METRIC(pageserver_send_flushes_total); + APPEND_METRIC(pageserver_open_requests); + APPEND_METRIC(getpage_prefetches_buffered); + + APPEND_METRIC(file_cache_hits_total); + + i += histogram_to_metrics(&counters->file_cache_read_hist, &metrics[i], + "file_cache_read_wait_seconds_count", + "file_cache_read_wait_seconds_sum", + "file_cache_read_wait_seconds_bucket"); + i += histogram_to_metrics(&counters->file_cache_write_hist, &metrics[i], + "file_cache_write_wait_seconds_count", + "file_cache_write_wait_seconds_sum", + "file_cache_write_wait_seconds_bucket"); Assert(i == NUM_METRICS); +#undef APPEND_METRIC +#undef NUM_METRICS + /* NULL entry marks end of array */ metrics[i].name = NULL; metrics[i].value = 0; @@ -216,6 +254,15 @@ neon_get_backend_perf_counters(PG_FUNCTION_ARGS) return (Datum) 0; } +static inline void +histogram_merge_into(IOHistogram into, IOHistogram from) +{ + into->wait_us_count += from->wait_us_count; + into->wait_us_sum += from->wait_us_sum; + for (int bucketno = 0; bucketno < NUM_IO_WAIT_BUCKETS; bucketno++) + into->wait_us_bucket[bucketno] += from->wait_us_bucket[bucketno]; +} + PG_FUNCTION_INFO_V1(neon_get_perf_counters); Datum neon_get_perf_counters(PG_FUNCTION_ARGS) @@ -234,10 +281,7 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) { neon_per_backend_counters *counters = &neon_per_backend_counters_shared[procno]; - totals.getpage_wait_us_count += counters->getpage_wait_us_count; - totals.getpage_wait_us_sum += counters->getpage_wait_us_sum; - for (int bucketno = 0; bucketno < NUM_GETPAGE_WAIT_BUCKETS; bucketno++) - totals.getpage_wait_us_bucket[bucketno] += counters->getpage_wait_us_bucket[bucketno]; + histogram_merge_into(&totals.getpage_hist, 
&counters->getpage_hist); totals.getpage_prefetch_requests_total += counters->getpage_prefetch_requests_total; totals.getpage_sync_requests_total += counters->getpage_sync_requests_total; totals.getpage_prefetch_misses_total += counters->getpage_prefetch_misses_total; @@ -245,7 +289,11 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) totals.pageserver_requests_sent_total += counters->pageserver_requests_sent_total; totals.pageserver_disconnects_total += counters->pageserver_disconnects_total; totals.pageserver_send_flushes_total += counters->pageserver_send_flushes_total; + totals.pageserver_open_requests += counters->pageserver_open_requests; + totals.getpage_prefetches_buffered += counters->getpage_prefetches_buffered; totals.file_cache_hits_total += counters->file_cache_hits_total; + histogram_merge_into(&totals.file_cache_read_hist, &counters->file_cache_read_hist); + histogram_merge_into(&totals.file_cache_write_hist, &counters->file_cache_write_hist); } metrics = neon_perf_counters_to_metrics(&totals); diff --git a/pgxn/neon/neon_perf_counters.h b/pgxn/neon/neon_perf_counters.h index 49d477c4f8..8edc658a30 100644 --- a/pgxn/neon/neon_perf_counters.h +++ b/pgxn/neon/neon_perf_counters.h @@ -15,17 +15,26 @@ #include "storage/proc.h" #endif -static const uint64 getpage_wait_bucket_thresholds[] = { - 20, 30, 60, 100, /* 0 - 100 us */ +static const uint64 io_wait_bucket_thresholds[] = { + 2, 3, 6, 10, /* 0 us - 10 us */ + 20, 30, 60, 100, /* 10 us - 100 us */ 200, 300, 600, 1000, /* 100 us - 1 ms */ 2000, 3000, 6000, 10000, /* 1 ms - 10 ms */ 20000, 30000, 60000, 100000, /* 10 ms - 100 ms */ 200000, 300000, 600000, 1000000, /* 100 ms - 1 s */ 2000000, 3000000, 6000000, 10000000, /* 1 s - 10 s */ - 20000000, 30000000, 60000000, 100000000, /* 10 s - 100 s */ UINT64_MAX, }; -#define NUM_GETPAGE_WAIT_BUCKETS (lengthof(getpage_wait_bucket_thresholds)) +#define NUM_IO_WAIT_BUCKETS (lengthof(io_wait_bucket_thresholds)) + +typedef struct IOHistogramData +{ + uint64 
wait_us_count; + uint64 wait_us_sum; + uint64 wait_us_bucket[NUM_IO_WAIT_BUCKETS]; +} IOHistogramData; + +typedef IOHistogramData *IOHistogram; typedef struct { @@ -39,9 +48,7 @@ typedef struct * the backend, but the 'neon_backend_perf_counters' view will convert * them to seconds, to make them more idiomatic as prometheus metrics. */ - uint64 getpage_wait_us_count; - uint64 getpage_wait_us_sum; - uint64 getpage_wait_us_bucket[NUM_GETPAGE_WAIT_BUCKETS]; + IOHistogramData getpage_hist; /* * Total number of speculative prefetch Getpage requests and synchronous @@ -50,7 +57,11 @@ typedef struct uint64 getpage_prefetch_requests_total; uint64 getpage_sync_requests_total; - /* XXX: It's not clear to me when these misses happen. */ + /* + * Total number of readahead misses; consisting of either prefetches that + * don't satisfy the LSN bounds, or cases where no readahead was issued + * for the read. + */ uint64 getpage_prefetch_misses_total; /* @@ -80,6 +91,16 @@ typedef struct * this can be smaller than pageserver_requests_sent_total. */ uint64 pageserver_send_flushes_total; + + /* + * Number of open requests to PageServer. + */ + uint64 pageserver_open_requests; + + /* + * Number of unused prefetches currently cached in this backend. + */ + uint64 getpage_prefetches_buffered; /* * Number of requests satisfied from the LFC. 
@@ -91,6 +112,9 @@ typedef struct */ uint64 file_cache_hits_total; + /* LFC I/O time buckets */ + IOHistogramData file_cache_read_hist; + IOHistogramData file_cache_write_hist; } neon_per_backend_counters; /* Pointer to the shared memory array of neon_per_backend_counters structs */ @@ -111,6 +135,8 @@ extern neon_per_backend_counters *neon_per_backend_counters_shared; #endif extern void inc_getpage_wait(uint64 latency); +extern void inc_page_cache_read_wait(uint64 latency); +extern void inc_page_cache_write_wait(uint64 latency); extern Size NeonPerfCountersShmemSize(void); extern void NeonPerfCountersShmemInit(void); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 3d9d9285df..cbb0e2ae6d 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -488,6 +488,11 @@ readahead_buffer_resize(int newsize, void *extra) newPState->n_unused -= 1; } + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + MyNeonCounters->pageserver_open_requests = + MyPState->n_requests_inflight; + for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1) { prefetch_set_unused(end); @@ -621,6 +626,8 @@ prefetch_read(PrefetchRequest *slot) MyPState->n_responses_buffered += 1; MyPState->n_requests_inflight -= 1; MyPState->ring_receive += 1; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; /* update slot state */ slot->status = PRFS_RECEIVED; @@ -674,6 +681,15 @@ prefetch_on_ps_disconnect(void) prefetch_set_unused(ring_index); } + + /* + * We can have gone into retry due to network error, so update stats with + * the latest available + */ + MyNeonCounters->pageserver_open_requests = + MyPState->n_requests_inflight; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; } /* @@ -706,6 +722,9 @@ prefetch_set_unused(uint64 ring_index) MyPState->n_responses_buffered -= 1; MyPState->n_unused += 1; + + MyNeonCounters->getpage_prefetches_buffered = + 
MyPState->n_responses_buffered; } else { @@ -820,6 +839,15 @@ prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, hashkey.buftag = tag; Retry: + /* + * We can have gone into retry due to network error, so update stats with + * the latest available + */ + MyNeonCounters->pageserver_open_requests = + MyPState->ring_unused - MyPState->ring_receive; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + min_ring_index = UINT64_MAX; for (int i = 0; i < nblocks; i++) { @@ -1001,6 +1029,9 @@ Retry: prefetch_do_request(slot, lsns); } + MyNeonCounters->pageserver_open_requests = + MyPState->ring_unused - MyPState->ring_receive; + Assert(any_hits); Assert(GetPrfSlot(min_ring_index)->status == PRFS_REQUESTED || @@ -1061,8 +1092,7 @@ page_server_request(void const *req) * Current sharding model assumes that all metadata is present only at shard 0. * We still need to call get_shard_no() to check if shard map is up-to-date. */ - if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || - ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM) + if (((NeonRequest *) req)->tag != T_NeonGetPageRequest) { shard_no = 0; } @@ -1076,8 +1106,10 @@ page_server_request(void const *req) { /* do nothing */ } + MyNeonCounters->pageserver_open_requests++; consume_prefetch_responses(); resp = page_server->receive(shard_no); + MyNeonCounters->pageserver_open_requests--; } PG_CATCH(); { @@ -1086,6 +1118,8 @@ page_server_request(void const *req) * point, but this currently seems fine for now. 
*/ page_server->disconnect(shard_no); + MyNeonCounters->pageserver_open_requests = 0; + PG_RE_THROW(); } PG_END_TRY(); diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index a3f33cb261..d2a6104c74 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -841,6 +841,23 @@ HandleElectedProposer(WalProposer *wp) wp_log(FATAL, "failed to download WAL for logical replicaiton"); } + /* + * Zero propEpochStartLsn means majority of safekeepers doesn't have any + * WAL, timeline was just created. Compute bumps it to basebackup LSN, + * otherwise we must be sync-safekeepers and we have nothing to do then. + * + * Proceeding is not only pointless but harmful, because we'd give + * safekeepers term history starting with 0/0. These hacks will go away once + * we disable implicit timeline creation on safekeepers and create it with + * non zero LSN from the start. + */ + if (wp->propEpochStartLsn == InvalidXLogRecPtr) + { + Assert(wp->config->syncSafekeepers); + wp_log(LOG, "elected with zero propEpochStartLsn in sync-safekeepers, exiting"); + wp->api.finish_sync_safekeepers(wp, wp->propEpochStartLsn); + } + if (wp->truncateLsn == wp->propEpochStartLsn && wp->config->syncSafekeepers) { /* Sync is not needed: just exit */ diff --git a/poetry.lock b/poetry.lock index 00fe2505c9..e307b873f3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. 
[[package]] name = "aiohappyeyeballs" @@ -1758,85 +1758,101 @@ tests = ["pytest (>=4.6)"] [[package]] name = "multidict" -version = "6.0.4" +version = "6.0.5" description = "multidict implementation" optional = false python-versions = ">=3.7" files = [ - {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b1a97283e0c85772d613878028fec909f003993e1007eafa715b24b377cb9b8"}, - {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:eeb6dcc05e911516ae3d1f207d4b0520d07f54484c49dfc294d6e7d63b734171"}, - {file = "multidict-6.0.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d6d635d5209b82a3492508cf5b365f3446afb65ae7ebd755e70e18f287b0adf7"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c048099e4c9e9d615545e2001d3d8a4380bd403e1a0578734e0d31703d1b0c0b"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ea20853c6dbbb53ed34cb4d080382169b6f4554d394015f1bef35e881bf83547"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:16d232d4e5396c2efbbf4f6d4df89bfa905eb0d4dc5b3549d872ab898451f569"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36c63aaa167f6c6b04ef2c85704e93af16c11d20de1d133e39de6a0e84582a93"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:64bdf1086b6043bf519869678f5f2757f473dee970d7abf6da91ec00acb9cb98"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:43644e38f42e3af682690876cff722d301ac585c5b9e1eacc013b7a3f7b696a0"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7582a1d1030e15422262de9f58711774e02fa80df0d1578995c76214f6954988"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = 
"sha256:ddff9c4e225a63a5afab9dd15590432c22e8057e1a9a13d28ed128ecf047bbdc"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:ee2a1ece51b9b9e7752e742cfb661d2a29e7bcdba2d27e66e28a99f1890e4fa0"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a2e4369eb3d47d2034032a26c7a80fcb21a2cb22e1173d761a162f11e562caa5"}, - {file = "multidict-6.0.4-cp310-cp310-win32.whl", hash = "sha256:574b7eae1ab267e5f8285f0fe881f17efe4b98c39a40858247720935b893bba8"}, - {file = "multidict-6.0.4-cp310-cp310-win_amd64.whl", hash = "sha256:4dcbb0906e38440fa3e325df2359ac6cb043df8e58c965bb45f4e406ecb162cc"}, - {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0dfad7a5a1e39c53ed00d2dd0c2e36aed4650936dc18fd9a1826a5ae1cad6f03"}, - {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:64da238a09d6039e3bd39bb3aee9c21a5e34f28bfa5aa22518581f910ff94af3"}, - {file = "multidict-6.0.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ff959bee35038c4624250473988b24f846cbeb2c6639de3602c073f10410ceba"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01a3a55bd90018c9c080fbb0b9f4891db37d148a0a18722b42f94694f8b6d4c9"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c5cb09abb18c1ea940fb99360ea0396f34d46566f157122c92dfa069d3e0e982"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:666daae833559deb2d609afa4490b85830ab0dfca811a98b70a205621a6109fe"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11bdf3f5e1518b24530b8241529d2050014c884cf18b6fc69c0c2b30ca248710"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d18748f2d30f94f498e852c67d61261c643b349b9d2a581131725595c45ec6c"}, - {file = 
"multidict-6.0.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:458f37be2d9e4c95e2d8866a851663cbc76e865b78395090786f6cd9b3bbf4f4"}, - {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b1a2eeedcead3a41694130495593a559a668f382eee0727352b9a41e1c45759a"}, - {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:7d6ae9d593ef8641544d6263c7fa6408cc90370c8cb2bbb65f8d43e5b0351d9c"}, - {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:5979b5632c3e3534e42ca6ff856bb24b2e3071b37861c2c727ce220d80eee9ed"}, - {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dcfe792765fab89c365123c81046ad4103fcabbc4f56d1c1997e6715e8015461"}, - {file = "multidict-6.0.4-cp311-cp311-win32.whl", hash = "sha256:3601a3cece3819534b11d4efc1eb76047488fddd0c85a3948099d5da4d504636"}, - {file = "multidict-6.0.4-cp311-cp311-win_amd64.whl", hash = "sha256:81a4f0b34bd92df3da93315c6a59034df95866014ac08535fc819f043bfd51f0"}, - {file = "multidict-6.0.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:67040058f37a2a51ed8ea8f6b0e6ee5bd78ca67f169ce6122f3e2ec80dfe9b78"}, - {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:853888594621e6604c978ce2a0444a1e6e70c8d253ab65ba11657659dcc9100f"}, - {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:39ff62e7d0f26c248b15e364517a72932a611a9b75f35b45be078d81bdb86603"}, - {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:af048912e045a2dc732847d33821a9d84ba553f5c5f028adbd364dd4765092ac"}, - {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1e8b901e607795ec06c9e42530788c45ac21ef3aaa11dbd0c69de543bfb79a9"}, - {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:62501642008a8b9871ddfccbf83e4222cf8ac0d5aeedf73da36153ef2ec222d2"}, - {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:99b76c052e9f1bc0721f7541e5e8c05db3941eb9ebe7b8553c625ef88d6eefde"}, - {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:509eac6cf09c794aa27bcacfd4d62c885cce62bef7b2c3e8b2e49d365b5003fe"}, - {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:21a12c4eb6ddc9952c415f24eef97e3e55ba3af61f67c7bc388dcdec1404a067"}, - {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:5cad9430ab3e2e4fa4a2ef4450f548768400a2ac635841bc2a56a2052cdbeb87"}, - {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ab55edc2e84460694295f401215f4a58597f8f7c9466faec545093045476327d"}, - {file = "multidict-6.0.4-cp37-cp37m-win32.whl", hash = "sha256:5a4dcf02b908c3b8b17a45fb0f15b695bf117a67b76b7ad18b73cf8e92608775"}, - {file = "multidict-6.0.4-cp37-cp37m-win_amd64.whl", hash = "sha256:6ed5f161328b7df384d71b07317f4d8656434e34591f20552c7bcef27b0ab88e"}, - {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5fc1b16f586f049820c5c5b17bb4ee7583092fa0d1c4e28b5239181ff9532e0c"}, - {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1502e24330eb681bdaa3eb70d6358e818e8e8f908a22a1851dfd4e15bc2f8161"}, - {file = "multidict-6.0.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b692f419760c0e65d060959df05f2a531945af31fda0c8a3b3195d4efd06de11"}, - {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45e1ecb0379bfaab5eef059f50115b54571acfbe422a14f668fc8c27ba410e7e"}, - {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ddd3915998d93fbcd2566ddf9cf62cdb35c9e093075f862935573d265cf8f65d"}, - {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:59d43b61c59d82f2effb39a93c48b845efe23a3852d201ed2d24ba830d0b4cf2"}, - {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc8e1d0c705233c5dd0c5e6460fbad7827d5d36f310a0fadfd45cc3029762258"}, - {file = "multidict-6.0.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6aa0418fcc838522256761b3415822626f866758ee0bc6632c9486b179d0b52"}, - {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6748717bb10339c4760c1e63da040f5f29f5ed6e59d76daee30305894069a660"}, - {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4d1a3d7ef5e96b1c9e92f973e43aa5e5b96c659c9bc3124acbbd81b0b9c8a951"}, - {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4372381634485bec7e46718edc71528024fcdc6f835baefe517b34a33c731d60"}, - {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:fc35cb4676846ef752816d5be2193a1e8367b4c1397b74a565a9d0389c433a1d"}, - {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:4b9d9e4e2b37daddb5c23ea33a3417901fa7c7b3dee2d855f63ee67a0b21e5b1"}, - {file = "multidict-6.0.4-cp38-cp38-win32.whl", hash = "sha256:e41b7e2b59679edfa309e8db64fdf22399eec4b0b24694e1b2104fb789207779"}, - {file = "multidict-6.0.4-cp38-cp38-win_amd64.whl", hash = "sha256:d6c254ba6e45d8e72739281ebc46ea5eb5f101234f3ce171f0e9f5cc86991480"}, - {file = "multidict-6.0.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:16ab77bbeb596e14212e7bab8429f24c1579234a3a462105cda4a66904998664"}, - {file = "multidict-6.0.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc779e9e6f7fda81b3f9aa58e3a6091d49ad528b11ed19f6621408806204ad35"}, - {file = "multidict-6.0.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4ceef517eca3e03c1cceb22030a3e39cb399ac86bff4e426d4fc6ae49052cc60"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:281af09f488903fde97923c7744bb001a9b23b039a909460d0f14edc7bf59706"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:52f2dffc8acaba9a2f27174c41c9e57f60b907bb9f096b36b1a1f3be71c6284d"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b41156839806aecb3641f3208c0dafd3ac7775b9c4c422d82ee2a45c34ba81ca"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5e3fc56f88cc98ef8139255cf8cd63eb2c586531e43310ff859d6bb3a6b51f1"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8316a77808c501004802f9beebde51c9f857054a0c871bd6da8280e718444449"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f70b98cd94886b49d91170ef23ec5c0e8ebb6f242d734ed7ed677b24d50c82cf"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bf6774e60d67a9efe02b3616fee22441d86fab4c6d335f9d2051d19d90a40063"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:e69924bfcdda39b722ef4d9aa762b2dd38e4632b3641b1d9a57ca9cd18f2f83a"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:6b181d8c23da913d4ff585afd1155a0e1194c0b50c54fcfe286f70cdaf2b7176"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:52509b5be062d9eafc8170e53026fbc54cf3b32759a23d07fd935fb04fc22d95"}, - {file = "multidict-6.0.4-cp39-cp39-win32.whl", hash = "sha256:27c523fbfbdfd19c6867af7346332b62b586eed663887392cff78d614f9ec313"}, - {file = "multidict-6.0.4-cp39-cp39-win_amd64.whl", hash = "sha256:33029f5734336aa0d4c0384525da0387ef89148dc7191aae00ca5fb23d7aafc2"}, - {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, + {file = "multidict-6.0.5-cp310-cp310-macosx_10_9_universal2.whl", hash = 
"sha256:228b644ae063c10e7f324ab1ab6b548bdf6f8b47f3ec234fef1093bc2735e5f9"}, + {file = "multidict-6.0.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:896ebdcf62683551312c30e20614305f53125750803b614e9e6ce74a96232604"}, + {file = "multidict-6.0.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:411bf8515f3be9813d06004cac41ccf7d1cd46dfe233705933dd163b60e37600"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d147090048129ce3c453f0292e7697d333db95e52616b3793922945804a433c"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:215ed703caf15f578dca76ee6f6b21b7603791ae090fbf1ef9d865571039ade5"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c6390cf87ff6234643428991b7359b5f59cc15155695deb4eda5c777d2b880f"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21fd81c4ebdb4f214161be351eb5bcf385426bf023041da2fd9e60681f3cebae"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3cc2ad10255f903656017363cd59436f2111443a76f996584d1077e43ee51182"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6939c95381e003f54cd4c5516740faba40cf5ad3eeff460c3ad1d3e0ea2549bf"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:220dd781e3f7af2c2c1053da9fa96d9cf3072ca58f057f4c5adaaa1cab8fc442"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:766c8f7511df26d9f11cd3a8be623e59cca73d44643abab3f8c8c07620524e4a"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:fe5d7785250541f7f5019ab9cba2c71169dc7d74d0f45253f8313f436458a4ef"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = 
"sha256:c1c1496e73051918fcd4f58ff2e0f2f3066d1c76a0c6aeffd9b45d53243702cc"}, + {file = "multidict-6.0.5-cp310-cp310-win32.whl", hash = "sha256:7afcdd1fc07befad18ec4523a782cde4e93e0a2bf71239894b8d61ee578c1319"}, + {file = "multidict-6.0.5-cp310-cp310-win_amd64.whl", hash = "sha256:99f60d34c048c5c2fabc766108c103612344c46e35d4ed9ae0673d33c8fb26e8"}, + {file = "multidict-6.0.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f285e862d2f153a70586579c15c44656f888806ed0e5b56b64489afe4a2dbfba"}, + {file = "multidict-6.0.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:53689bb4e102200a4fafa9de9c7c3c212ab40a7ab2c8e474491914d2305f187e"}, + {file = "multidict-6.0.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:612d1156111ae11d14afaf3a0669ebf6c170dbb735e510a7438ffe2369a847fd"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7be7047bd08accdb7487737631d25735c9a04327911de89ff1b26b81745bd4e3"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de170c7b4fe6859beb8926e84f7d7d6c693dfe8e27372ce3b76f01c46e489fcf"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:04bde7a7b3de05732a4eb39c94574db1ec99abb56162d6c520ad26f83267de29"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85f67aed7bb647f93e7520633d8f51d3cbc6ab96957c71272b286b2f30dc70ed"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:425bf820055005bfc8aa9a0b99ccb52cc2f4070153e34b701acc98d201693733"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d3eb1ceec286eba8220c26f3b0096cf189aea7057b6e7b7a2e60ed36b373b77f"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:7901c05ead4b3fb75113fb1dd33eb1253c6d3ee37ce93305acd9d38e0b5f21a4"}, + {file = 
"multidict-6.0.5-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e0e79d91e71b9867c73323a3444724d496c037e578a0e1755ae159ba14f4f3d1"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:29bfeb0dff5cb5fdab2023a7a9947b3b4af63e9c47cae2a10ad58394b517fddc"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e030047e85cbcedbfc073f71836d62dd5dadfbe7531cae27789ff66bc551bd5e"}, + {file = "multidict-6.0.5-cp311-cp311-win32.whl", hash = "sha256:2f4848aa3baa109e6ab81fe2006c77ed4d3cd1e0ac2c1fbddb7b1277c168788c"}, + {file = "multidict-6.0.5-cp311-cp311-win_amd64.whl", hash = "sha256:2faa5ae9376faba05f630d7e5e6be05be22913782b927b19d12b8145968a85ea"}, + {file = "multidict-6.0.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:51d035609b86722963404f711db441cf7134f1889107fb171a970c9701f92e1e"}, + {file = "multidict-6.0.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:cbebcd5bcaf1eaf302617c114aa67569dd3f090dd0ce8ba9e35e9985b41ac35b"}, + {file = "multidict-6.0.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2ffc42c922dbfddb4a4c3b438eb056828719f07608af27d163191cb3e3aa6cc5"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ceb3b7e6a0135e092de86110c5a74e46bda4bd4fbfeeb3a3bcec79c0f861e450"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:79660376075cfd4b2c80f295528aa6beb2058fd289f4c9252f986751a4cd0496"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e4428b29611e989719874670fd152b6625500ad6c686d464e99f5aaeeaca175a"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d84a5c3a5f7ce6db1f999fb9438f686bc2e09d38143f2d93d8406ed2dd6b9226"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:76c0de87358b192de7ea9649beb392f107dcad9ad27276324c24c91774ca5271"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:79a6d2ba910adb2cbafc95dad936f8b9386e77c84c35bc0add315b856d7c3abb"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:92d16a3e275e38293623ebf639c471d3e03bb20b8ebb845237e0d3664914caef"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:fb616be3538599e797a2017cccca78e354c767165e8858ab5116813146041a24"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:14c2976aa9038c2629efa2c148022ed5eb4cb939e15ec7aace7ca932f48f9ba6"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:435a0984199d81ca178b9ae2c26ec3d49692d20ee29bc4c11a2a8d4514c67eda"}, + {file = "multidict-6.0.5-cp312-cp312-win32.whl", hash = "sha256:9fe7b0653ba3d9d65cbe7698cca585bf0f8c83dbbcc710db9c90f478e175f2d5"}, + {file = "multidict-6.0.5-cp312-cp312-win_amd64.whl", hash = "sha256:01265f5e40f5a17f8241d52656ed27192be03bfa8764d88e8220141d1e4b3556"}, + {file = "multidict-6.0.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:19fe01cea168585ba0f678cad6f58133db2aa14eccaf22f88e4a6dccadfad8b3"}, + {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6bf7a982604375a8d49b6cc1b781c1747f243d91b81035a9b43a2126c04766f5"}, + {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:107c0cdefe028703fb5dafe640a409cb146d44a6ae201e55b35a4af8e95457dd"}, + {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:403c0911cd5d5791605808b942c88a8155c2592e05332d2bf78f18697a5fa15e"}, + {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aeaf541ddbad8311a87dd695ed9642401131ea39ad7bc8cf3ef3967fd093b626"}, + {file = 
"multidict-6.0.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e4972624066095e52b569e02b5ca97dbd7a7ddd4294bf4e7247d52635630dd83"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d946b0a9eb8aaa590df1fe082cee553ceab173e6cb5b03239716338629c50c7a"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:b55358304d7a73d7bdf5de62494aaf70bd33015831ffd98bc498b433dfe5b10c"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:a3145cb08d8625b2d3fee1b2d596a8766352979c9bffe5d7833e0503d0f0b5e5"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:d65f25da8e248202bd47445cec78e0025c0fe7582b23ec69c3b27a640dd7a8e3"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:c9bf56195c6bbd293340ea82eafd0071cb3d450c703d2c93afb89f93b8386ccc"}, + {file = "multidict-6.0.5-cp37-cp37m-win32.whl", hash = "sha256:69db76c09796b313331bb7048229e3bee7928eb62bab5e071e9f7fcc4879caee"}, + {file = "multidict-6.0.5-cp37-cp37m-win_amd64.whl", hash = "sha256:fce28b3c8a81b6b36dfac9feb1de115bab619b3c13905b419ec71d03a3fc1423"}, + {file = "multidict-6.0.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:76f067f5121dcecf0d63a67f29080b26c43c71a98b10c701b0677e4a065fbd54"}, + {file = "multidict-6.0.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b82cc8ace10ab5bd93235dfaab2021c70637005e1ac787031f4d1da63d493c1d"}, + {file = "multidict-6.0.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5cb241881eefd96b46f89b1a056187ea8e9ba14ab88ba632e68d7a2ecb7aadf7"}, + {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8e94e6912639a02ce173341ff62cc1201232ab86b8a8fcc05572741a5dc7d93"}, + {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:09a892e4a9fb47331da06948690ae38eaa2426de97b4ccbfafbdcbe5c8f37ff8"}, + {file = 
"multidict-6.0.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55205d03e8a598cfc688c71ca8ea5f66447164efff8869517f175ea632c7cb7b"}, + {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:37b15024f864916b4951adb95d3a80c9431299080341ab9544ed148091b53f50"}, + {file = "multidict-6.0.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f2a1dee728b52b33eebff5072817176c172050d44d67befd681609b4746e1c2e"}, + {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:edd08e6f2f1a390bf137080507e44ccc086353c8e98c657e666c017718561b89"}, + {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:60d698e8179a42ec85172d12f50b1668254628425a6bd611aba022257cac1386"}, + {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:3d25f19500588cbc47dc19081d78131c32637c25804df8414463ec908631e453"}, + {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:4cc0ef8b962ac7a5e62b9e826bd0cd5040e7d401bc45a6835910ed699037a461"}, + {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:eca2e9d0cc5a889850e9bbd68e98314ada174ff6ccd1129500103df7a94a7a44"}, + {file = "multidict-6.0.5-cp38-cp38-win32.whl", hash = "sha256:4a6a4f196f08c58c59e0b8ef8ec441d12aee4125a7d4f4fef000ccb22f8d7241"}, + {file = "multidict-6.0.5-cp38-cp38-win_amd64.whl", hash = "sha256:0275e35209c27a3f7951e1ce7aaf93ce0d163b28948444bec61dd7badc6d3f8c"}, + {file = "multidict-6.0.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e7be68734bd8c9a513f2b0cfd508802d6609da068f40dc57d4e3494cefc92929"}, + {file = "multidict-6.0.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1d9ea7a7e779d7a3561aade7d596649fbecfa5c08a7674b11b423783217933f9"}, + {file = "multidict-6.0.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ea1456df2a27c73ce51120fa2f519f1bea2f4a03a917f4a43c8707cf4cbbae1a"}, + {file = 
"multidict-6.0.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf590b134eb70629e350691ecca88eac3e3b8b3c86992042fb82e3cb1830d5e1"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5c0631926c4f58e9a5ccce555ad7747d9a9f8b10619621f22f9635f069f6233e"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dce1c6912ab9ff5f179eaf6efe7365c1f425ed690b03341911bf4939ef2f3046"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0868d64af83169e4d4152ec612637a543f7a336e4a307b119e98042e852ad9c"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:141b43360bfd3bdd75f15ed811850763555a251e38b2405967f8e25fb43f7d40"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7df704ca8cf4a073334e0427ae2345323613e4df18cc224f647f251e5e75a527"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:6214c5a5571802c33f80e6c84713b2c79e024995b9c5897f794b43e714daeec9"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:cd6c8fca38178e12c00418de737aef1261576bd1b6e8c6134d3e729a4e858b38"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:e02021f87a5b6932fa6ce916ca004c4d441509d33bbdbeca70d05dff5e9d2479"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ebd8d160f91a764652d3e51ce0d2956b38efe37c9231cd82cfc0bed2e40b581c"}, + {file = "multidict-6.0.5-cp39-cp39-win32.whl", hash = "sha256:04da1bb8c8dbadf2a18a452639771951c662c5ad03aefe4884775454be322c9b"}, + {file = "multidict-6.0.5-cp39-cp39-win_amd64.whl", hash = "sha256:d6f6d4f185481c9669b9447bf9d9cf3b95a0e9df9d169bbc17e363b7d5487755"}, + {file = "multidict-6.0.5-py3-none-any.whl", hash = "sha256:0d63c74e3d7ab26de115c49bffc92cc77ed23395303d496eae515d4204a625e7"}, 
+ {file = "multidict-6.0.5.tar.gz", hash = "sha256:f7e301075edaf50500f0b341543c41194d8df3ae5caf4702f2095f3ca73dd8da"}, ] [[package]] @@ -2766,28 +2782,29 @@ six = "*" [[package]] name = "ruff" -version = "0.2.2" +version = "0.7.0" description = "An extremely fast Python linter and code formatter, written in Rust." optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:0a9efb032855ffb3c21f6405751d5e147b0c6b631e3ca3f6b20f917572b97eb6"}, - {file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:d450b7fbff85913f866a5384d8912710936e2b96da74541c82c1b458472ddb39"}, - {file = "ruff-0.2.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecd46e3106850a5c26aee114e562c329f9a1fbe9e4821b008c4404f64ff9ce73"}, - {file = "ruff-0.2.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e22676a5b875bd72acd3d11d5fa9075d3a5f53b877fe7b4793e4673499318ba"}, - {file = "ruff-0.2.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1695700d1e25a99d28f7a1636d85bafcc5030bba9d0578c0781ba1790dbcf51c"}, - {file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:b0c232af3d0bd8f521806223723456ffebf8e323bd1e4e82b0befb20ba18388e"}, - {file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f63d96494eeec2fc70d909393bcd76c69f35334cdbd9e20d089fb3f0640216ca"}, - {file = "ruff-0.2.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6a61ea0ff048e06de273b2e45bd72629f470f5da8f71daf09fe481278b175001"}, - {file = "ruff-0.2.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e1439c8f407e4f356470e54cdecdca1bd5439a0673792dbe34a2b0a551a2fe3"}, - {file = "ruff-0.2.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:940de32dc8853eba0f67f7198b3e79bc6ba95c2edbfdfac2144c8235114d6726"}, - {file = 
"ruff-0.2.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0c126da55c38dd917621552ab430213bdb3273bb10ddb67bc4b761989210eb6e"}, - {file = "ruff-0.2.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:3b65494f7e4bed2e74110dac1f0d17dc8e1f42faaa784e7c58a98e335ec83d7e"}, - {file = "ruff-0.2.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:1ec49be4fe6ddac0503833f3ed8930528e26d1e60ad35c2446da372d16651ce9"}, - {file = "ruff-0.2.2-py3-none-win32.whl", hash = "sha256:d920499b576f6c68295bc04e7b17b6544d9d05f196bb3aac4358792ef6f34325"}, - {file = "ruff-0.2.2-py3-none-win_amd64.whl", hash = "sha256:cc9a91ae137d687f43a44c900e5d95e9617cb37d4c989e462980ba27039d239d"}, - {file = "ruff-0.2.2-py3-none-win_arm64.whl", hash = "sha256:c9d15fc41e6054bfc7200478720570078f0b41c9ae4f010bcc16bd6f4d1aacdd"}, - {file = "ruff-0.2.2.tar.gz", hash = "sha256:e62ed7f36b3068a30ba39193a14274cd706bc486fad521276458022f7bccb31d"}, + {file = "ruff-0.7.0-py3-none-linux_armv6l.whl", hash = "sha256:0cdf20c2b6ff98e37df47b2b0bd3a34aaa155f59a11182c1303cce79be715628"}, + {file = "ruff-0.7.0-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:496494d350c7fdeb36ca4ef1c9f21d80d182423718782222c29b3e72b3512737"}, + {file = "ruff-0.7.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:214b88498684e20b6b2b8852c01d50f0651f3cc6118dfa113b4def9f14faaf06"}, + {file = "ruff-0.7.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:630fce3fefe9844e91ea5bbf7ceadab4f9981f42b704fae011bb8efcaf5d84be"}, + {file = "ruff-0.7.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:211d877674e9373d4bb0f1c80f97a0201c61bcd1e9d045b6e9726adc42c156aa"}, + {file = "ruff-0.7.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:194d6c46c98c73949a106425ed40a576f52291c12bc21399eb8f13a0f7073495"}, + {file = "ruff-0.7.0-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:82c2579b82b9973a110fab281860403b397c08c403de92de19568f32f7178598"}, + {file = 
"ruff-0.7.0-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9af971fe85dcd5eaed8f585ddbc6bdbe8c217fb8fcf510ea6bca5bdfff56040e"}, + {file = "ruff-0.7.0-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b641c7f16939b7d24b7bfc0be4102c56562a18281f84f635604e8a6989948914"}, + {file = "ruff-0.7.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d71672336e46b34e0c90a790afeac8a31954fd42872c1f6adaea1dff76fd44f9"}, + {file = "ruff-0.7.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:ab7d98c7eed355166f367597e513a6c82408df4181a937628dbec79abb2a1fe4"}, + {file = "ruff-0.7.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:1eb54986f770f49edb14f71d33312d79e00e629a57387382200b1ef12d6a4ef9"}, + {file = "ruff-0.7.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:dc452ba6f2bb9cf8726a84aa877061a2462afe9ae0ea1d411c53d226661c601d"}, + {file = "ruff-0.7.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:4b406c2dce5be9bad59f2de26139a86017a517e6bcd2688da515481c05a2cb11"}, + {file = "ruff-0.7.0-py3-none-win32.whl", hash = "sha256:f6c968509f767776f524a8430426539587d5ec5c662f6addb6aa25bc2e8195ec"}, + {file = "ruff-0.7.0-py3-none-win_amd64.whl", hash = "sha256:ff4aabfbaaba880e85d394603b9e75d32b0693152e16fa659a3064a85df7fce2"}, + {file = "ruff-0.7.0-py3-none-win_arm64.whl", hash = "sha256:10842f69c245e78d6adec7e1db0a7d9ddc2fff0621d730e61657b64fa36f207e"}, + {file = "ruff-0.7.0.tar.gz", hash = "sha256:47a86360cf62d9cd53ebfb0b5eb0e882193fc191c6d717e8bef4462bc3b9ea2b"}, ] [[package]] @@ -3389,4 +3406,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "9055b73352f1534f664cd8af6ebf8d93cf3bf857f115756f312ff2e3ae1bbbc1" +content-hash = "f52632571e34b0e51b059c280c35d6ff6f69f6a8c9586caca78282baf635be91" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 963fb94a7d..e25d2fcbab 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -42,9 +42,10 @@ hyper0.workspace = true 
hyper = { workspace = true, features = ["server", "http1", "http2"] } hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } http-body-util = { version = "0.1" } -indexmap.workspace = true +indexmap = { workspace = true, features = ["serde"] } ipnet.workspace = true itertools.workspace = true +itoa.workspace = true lasso = { workspace = true, features = ["multi-threaded"] } measured = { workspace = true, features = ["lasso"] } metrics.workspace = true diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index 94b84b6f00..de32a06e9e 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -1,16 +1,15 @@ -use super::{ComputeCredentials, ComputeUserInfo}; -use crate::{ - auth::{self, backend::ComputeCredentialKeys, AuthFlow}, - compute, - config::AuthenticationConfig, - context::RequestMonitoring, - control_plane::AuthSecret, - sasl, - stream::{PqStream, Stream}, -}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; +use super::{ComputeCredentials, ComputeUserInfo}; +use crate::auth::backend::ComputeCredentialKeys; +use crate::auth::{self, AuthFlow}; +use crate::config::AuthenticationConfig; +use crate::context::RequestMonitoring; +use crate::control_plane::AuthSecret; +use crate::stream::{PqStream, Stream}; +use crate::{compute, sasl}; + pub(super) async fn authenticate( ctx: &RequestMonitoring, creds: ComputeUserInfo, diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 127be545e1..255e1fed54 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -1,18 +1,21 @@ -use crate::{ - auth, compute, - config::AuthenticationConfig, - context::RequestMonitoring, - control_plane::{self, provider::NodeInfo}, - error::{ReportableError, UserFacingError}, - stream::PqStream, - waiters, -}; +use async_trait::async_trait; use pq_proto::BeMessage as Be; use 
thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_postgres::config::SslMode; use tracing::{info, info_span}; +use super::ComputeCredentialKeys; +use crate::cache::Cached; +use crate::config::AuthenticationConfig; +use crate::context::RequestMonitoring; +use crate::control_plane::provider::NodeInfo; +use crate::control_plane::{self, CachedNodeInfo}; +use crate::error::{ReportableError, UserFacingError}; +use crate::proxy::connect_compute::ComputeConnectBackend; +use crate::stream::PqStream; +use crate::{auth, compute, waiters}; + #[derive(Debug, Error)] pub(crate) enum WebAuthError { #[error(transparent)] @@ -25,6 +28,7 @@ pub(crate) enum WebAuthError { Io(#[from] std::io::Error), } +#[derive(Debug)] pub struct ConsoleRedirectBackend { console_uri: reqwest::Url, } @@ -66,17 +70,31 @@ impl ConsoleRedirectBackend { Self { console_uri } } - pub(super) fn url(&self) -> &reqwest::Url { - &self.console_uri - } - pub(crate) async fn authenticate( &self, ctx: &RequestMonitoring, auth_config: &'static AuthenticationConfig, client: &mut PqStream, - ) -> auth::Result { - authenticate(ctx, auth_config, &self.console_uri, client).await + ) -> auth::Result { + authenticate(ctx, auth_config, &self.console_uri, client) + .await + .map(ConsoleRedirectNodeInfo) + } +} + +pub struct ConsoleRedirectNodeInfo(pub(super) NodeInfo); + +#[async_trait] +impl ComputeConnectBackend for ConsoleRedirectNodeInfo { + async fn wake_compute( + &self, + _ctx: &RequestMonitoring, + ) -> Result { + Ok(Cached::new_uncached(self.0.clone())) + } + + fn get_keys(&self) -> &ComputeCredentialKeys { + &ComputeCredentialKeys::None } } diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index 749218d260..8ab8d5d37f 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -1,16 +1,15 @@ -use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint}; -use crate::{ - auth::{self, AuthFlow}, - config::AuthenticationConfig, - 
context::RequestMonitoring, - control_plane::AuthSecret, - intern::EndpointIdInt, - sasl, - stream::{self, Stream}, -}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; +use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint}; +use crate::auth::{self, AuthFlow}; +use crate::config::AuthenticationConfig; +use crate::context::RequestMonitoring; +use crate::control_plane::AuthSecret; +use crate::intern::EndpointIdInt; +use crate::sasl; +use crate::stream::{self, Stream}; + /// Compared to [SCRAM](crate::scram), cleartext password auth saves /// one round trip and *expensive* computations (>= 4096 HMAC iterations). /// These properties are benefical for serverless JS workers, so we diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 17ab7eda22..3f53ee24c3 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -1,23 +1,22 @@ -use std::{ - future::Future, - sync::Arc, - time::{Duration, SystemTime}, -}; +use std::future::Future; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; -use anyhow::{bail, ensure, Context}; use arc_swap::ArcSwapOption; use dashmap::DashMap; use jose_jwk::crypto::KeyInfo; -use serde::{de::Visitor, Deserialize, Deserializer}; +use serde::de::Visitor; +use serde::{Deserialize, Deserializer}; use signature::Verifier; +use thiserror::Error; use tokio::time::Instant; -use crate::{ - context::RequestMonitoring, http::parse_json_body_with_limit, intern::RoleNameInt, EndpointId, - RoleName, -}; - -use super::ComputeCredentialKeys; +use crate::auth::backend::ComputeCredentialKeys; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::GetEndpointJwksError; +use crate::http::parse_json_body_with_limit; +use crate::intern::RoleNameInt; +use crate::{EndpointId, RoleName}; // TODO(conrad): make these configurable. 
const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30); @@ -32,7 +31,16 @@ pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> impl Future>> + Send; + ) -> impl Future, FetchAuthRulesError>> + Send; +} + +#[derive(Error, Debug)] +pub(crate) enum FetchAuthRulesError { + #[error(transparent)] + GetEndpointJwks(#[from] GetEndpointJwksError), + + #[error("JWKs settings for this role were not configured")] + RoleJwksNotConfigured, } pub(crate) struct AuthRule { @@ -122,7 +130,7 @@ impl JwkCacheEntryLock { client: &reqwest::Client, endpoint: EndpointId, auth_rules: &F, - ) -> anyhow::Result> { + ) -> Result, JwtError> { // double check that no one beat us to updating the cache. let now = Instant::now(); let guard = self.cached.load_full(); @@ -188,7 +196,7 @@ impl JwkCacheEntryLock { client: &reqwest::Client, endpoint: EndpointId, fetch: &F, - ) -> Result, anyhow::Error> { + ) -> Result, JwtError> { let now = Instant::now(); let guard = self.cached.load_full(); @@ -243,27 +251,24 @@ impl JwkCacheEntryLock { endpoint: EndpointId, role_name: &RoleName, fetch: &F, - ) -> Result { + ) -> Result { // JWT compact form is defined to be // || . || || . || // where Signature = alg( || . 
|| ); let (header_payload, signature) = jwt .rsplit_once('.') - .context("Provided authentication token is not a valid JWT encoding")?; + .ok_or(JwtEncodingError::InvalidCompactForm)?; let (header, payload) = header_payload .split_once('.') - .context("Provided authentication token is not a valid JWT encoding")?; + .ok_or(JwtEncodingError::InvalidCompactForm)?; - let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD) - .context("Provided authentication token is not a valid JWT encoding")?; - let header = serde_json::from_slice::>(&header) - .context("Provided authentication token is not a valid JWT encoding")?; + let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)?; + let header = serde_json::from_slice::>(&header)?; - let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD) - .context("Provided authentication token is not a valid JWT encoding")?; + let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)?; - let kid = header.key_id.context("missing key id")?; + let kid = header.key_id.ok_or(JwtError::MissingKeyId)?; let mut guard = self .get_or_update_jwk_cache(ctx, client, endpoint.clone(), fetch) @@ -281,16 +286,13 @@ impl JwkCacheEntryLock { .renew_jwks(permit, ctx, client, endpoint.clone(), fetch) .await?; } - _ => { - bail!("jwk not found"); - } + _ => return Err(JwtError::JwkNotFound), } }; - ensure!( - jwk.is_supported(&header.algorithm), - "signature algorithm not supported" - ); + if !jwk.is_supported(&header.algorithm) { + return Err(JwtError::SignatureAlgorithmNotSupported); + } match &jwk.key { jose_jwk::Key::Ec(key) => { @@ -299,34 +301,32 @@ impl JwkCacheEntryLock { jose_jwk::Key::Rsa(key) => { verify_rsa_signature(header_payload.as_bytes(), &sig, key, &header.algorithm)?; } - key => bail!("unsupported key type {key:?}"), + key => return Err(JwtError::UnsupportedKeyType(key.into())), }; - let payloadb = base64::decode_config(payload, base64::URL_SAFE_NO_PAD) - .context("Provided authentication token 
is not a valid JWT encoding")?; - let payload = serde_json::from_slice::>(&payloadb) - .context("Provided authentication token is not a valid JWT encoding")?; + let payloadb = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)?; + let payload = serde_json::from_slice::>(&payloadb)?; tracing::debug!(?payload, "JWT signature valid with claims"); if let Some(aud) = expected_audience { - ensure!( - payload.audience.0.iter().any(|s| s == aud), - "invalid JWT token audience" - ); + if payload.audience.0.iter().all(|s| s != aud) { + return Err(JwtError::InvalidJwtTokenAudience); + } } let now = SystemTime::now(); if let Some(exp) = payload.expiration { - ensure!(now < exp + CLOCK_SKEW_LEEWAY, "JWT token has expired"); + if now >= exp + CLOCK_SKEW_LEEWAY { + return Err(JwtError::JwtTokenHasExpired); + } } if let Some(nbf) = payload.not_before { - ensure!( - nbf < now + CLOCK_SKEW_LEEWAY, - "JWT token is not yet ready to use" - ); + if nbf >= now + CLOCK_SKEW_LEEWAY { + return Err(JwtError::JwtTokenNotYetReadyToUse); + } } Ok(ComputeCredentialKeys::JwtPayload(payloadb)) @@ -341,7 +341,7 @@ impl JwkCache { role_name: &RoleName, fetch: &F, jwt: &str, - ) -> Result { + ) -> Result { // try with just a read lock first let key = (endpoint.clone(), role_name.clone()); let entry = self.map.get(&key).as_deref().map(Arc::clone); @@ -357,19 +357,18 @@ impl JwkCache { } } -fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> anyhow::Result<()> { +fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> Result<(), JwtError> { use ecdsa::Signature; use signature::Verifier; match key.crv { jose_jwk::EcCurves::P256 => { - let pk = - p256::PublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid P256 key"))?; + let pk = p256::PublicKey::try_from(key).map_err(JwtError::InvalidP256Key)?; let key = p256::ecdsa::VerifyingKey::from(&pk); let sig = Signature::from_slice(sig)?; key.verify(data, &sig)?; } - key => bail!("unsupported ec key type {key:?}"), 
+ key => return Err(JwtError::UnsupportedEcKeyType(key)), } Ok(()) @@ -380,14 +379,12 @@ fn verify_rsa_signature( sig: &[u8], key: &jose_jwk::Rsa, alg: &jose_jwa::Algorithm, -) -> anyhow::Result<()> { +) -> Result<(), JwtError> { use jose_jwa::{Algorithm, Signing}; - use rsa::{ - pkcs1v15::{Signature, VerifyingKey}, - RsaPublicKey, - }; + use rsa::pkcs1v15::{Signature, VerifyingKey}; + use rsa::RsaPublicKey; - let key = RsaPublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid RSA key"))?; + let key = RsaPublicKey::try_from(key).map_err(JwtError::InvalidRsaKey)?; match alg { Algorithm::Signing(Signing::Rs256) => { @@ -395,7 +392,7 @@ fn verify_rsa_signature( let sig = Signature::try_from(sig)?; key.verify(data, &sig)?; } - _ => bail!("invalid RSA signing algorithm"), + _ => return Err(JwtError::InvalidRsaSigningAlgorithm), }; Ok(()) @@ -561,13 +558,104 @@ impl Drop for JwkRenewalPermit<'_> { } } +#[derive(Error, Debug)] +#[non_exhaustive] +pub(crate) enum JwtError { + #[error("jwk not found")] + JwkNotFound, + + #[error("missing key id")] + MissingKeyId, + + #[error("Provided authentication token is not a valid JWT encoding")] + JwtEncoding(#[from] JwtEncodingError), + + #[error("invalid JWT token audience")] + InvalidJwtTokenAudience, + + #[error("JWT token has expired")] + JwtTokenHasExpired, + + #[error("JWT token is not yet ready to use")] + JwtTokenNotYetReadyToUse, + + #[error("invalid P256 key")] + InvalidP256Key(jose_jwk::crypto::Error), + + #[error("invalid RSA key")] + InvalidRsaKey(jose_jwk::crypto::Error), + + #[error("invalid RSA signing algorithm")] + InvalidRsaSigningAlgorithm, + + #[error("unsupported EC key type {0:?}")] + UnsupportedEcKeyType(jose_jwk::EcCurves), + + #[error("unsupported key type {0:?}")] + UnsupportedKeyType(KeyType), + + #[error("signature algorithm not supported")] + SignatureAlgorithmNotSupported, + + #[error("signature error: {0}")] + Signature(#[from] signature::Error), + + #[error("failed to fetch auth rules: {0}")] 
+ FetchAuthRules(#[from] FetchAuthRulesError), +} + +impl From for JwtError { + fn from(err: base64::DecodeError) -> Self { + JwtEncodingError::Base64Decode(err).into() + } +} + +impl From for JwtError { + fn from(err: serde_json::Error) -> Self { + JwtEncodingError::SerdeJson(err).into() + } +} + +#[derive(Error, Debug)] +#[non_exhaustive] +pub enum JwtEncodingError { + #[error(transparent)] + Base64Decode(#[from] base64::DecodeError), + + #[error(transparent)] + SerdeJson(#[from] serde_json::Error), + + #[error("invalid compact form")] + InvalidCompactForm, +} + +#[allow(dead_code, reason = "Debug use only")] +#[derive(Debug)] +pub(crate) enum KeyType { + Ec(jose_jwk::EcCurves), + Rsa, + Oct, + Okp(jose_jwk::OkpCurves), + Unknown, +} + +impl From<&jose_jwk::Key> for KeyType { + fn from(key: &jose_jwk::Key) -> Self { + match key { + jose_jwk::Key::Ec(ec) => Self::Ec(ec.crv), + jose_jwk::Key::Rsa(_rsa) => Self::Rsa, + jose_jwk::Key::Oct(_oct) => Self::Oct, + jose_jwk::Key::Okp(okp) => Self::Okp(okp.crv), + _ => Self::Unknown, + } + } +} + #[cfg(test)] mod tests { - use crate::RoleName; - - use super::*; - - use std::{future::IntoFuture, net::SocketAddr, time::SystemTime}; + use std::future::IntoFuture; + use std::net::SocketAddr; + use std::time::SystemTime; use base64::URL_SAFE_NO_PAD; use bytes::Bytes; @@ -580,6 +668,9 @@ mod tests { use signature::Signer; use tokio::net::TcpListener; + use super::*; + use crate::RoleName; + fn new_ec_jwk(kid: String) -> (p256::SecretKey, jose_jwk::Jwk) { let sk = p256::SecretKey::random(&mut OsRng); let pk = sk.public_key().into(); @@ -758,7 +849,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL &self, _ctx: &RequestMonitoring, _endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, FetchAuthRulesError> { Ok(vec![ AuthRule { id: "foo".to_owned(), diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index 12451847b1..1e029ff609 100644 --- a/proxy/src/auth/backend/local.rs +++ 
b/proxy/src/auth/backend/local.rs @@ -1,28 +1,32 @@ use std::net::SocketAddr; -use anyhow::Context; use arc_swap::ArcSwapOption; - -use crate::{ - compute::ConnCfg, - context::RequestMonitoring, - control_plane::{ - messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}, - NodeInfo, - }, - intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag}, - EndpointId, -}; +use tokio::sync::Semaphore; use super::jwt::{AuthRule, FetchAuthRules}; +use crate::auth::backend::jwt::FetchAuthRulesError; +use crate::compute::ConnCfg; +use crate::compute_ctl::ComputeCtlApi; +use crate::context::RequestMonitoring; +use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}; +use crate::control_plane::NodeInfo; +use crate::intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag}; +use crate::url::ApiUrl; +use crate::{http, EndpointId}; pub struct LocalBackend { + pub(crate) initialize: Semaphore, + pub(crate) compute_ctl: ComputeCtlApi, pub(crate) node_info: NodeInfo, } impl LocalBackend { - pub fn new(postgres_addr: SocketAddr) -> Self { + pub fn new(postgres_addr: SocketAddr, compute_ctl: ApiUrl) -> Self { LocalBackend { + initialize: Semaphore::new(1), + compute_ctl: ComputeCtlApi { + api: http::Endpoint::new(compute_ctl, http::new_client()), + }, node_info: NodeInfo { config: { let mut cfg = ConnCfg::new(); @@ -53,11 +57,11 @@ impl FetchAuthRules for StaticAuthRules { &self, _ctx: &RequestMonitoring, _endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, FetchAuthRulesError> { let mappings = JWKS_ROLE_MAP.load(); let role_mappings = mappings .as_deref() - .context("JWKs settings for this role were not configured")?; + .ok_or(FetchAuthRulesError::RoleJwksNotConfigured)?; let mut rules = vec![]; for setting in &role_mappings.jwks { rules.push(AuthRule { diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 27c9f1876e..a4db130b61 100644 --- a/proxy/src/auth/backend/mod.rs +++ 
b/proxy/src/auth/backend/mod.rs @@ -17,29 +17,22 @@ use tokio_postgres::config::AuthKeys; use tracing::{info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; -use crate::auth::{validate_password_and_exchange, AuthError}; +use crate::auth::{self, validate_password_and_exchange, AuthError, ComputeUserInfoMaybeEndpoint}; use crate::cache::Cached; +use crate::config::AuthenticationConfig; use crate::context::RequestMonitoring; use crate::control_plane::errors::GetAuthInfoError; -use crate::control_plane::provider::{CachedRoleSecret, ControlPlaneBackend}; -use crate::control_plane::{AuthSecret, NodeInfo}; +use crate::control_plane::provider::{ + CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneBackend, +}; +use crate::control_plane::{self, Api, AuthSecret}; use crate::intern::EndpointIdInt; use crate::metrics::Metrics; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter, RateBucketInfo}; use crate::stream::Stream; -use crate::{ - auth::{self, ComputeUserInfoMaybeEndpoint}, - config::AuthenticationConfig, - control_plane::{ - self, - provider::{CachedAllowedIps, CachedNodeInfo}, - Api, - }, - stream, -}; -use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; +use crate::{scram, stream, EndpointCacheKey, EndpointId, RoleName}; /// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality pub enum MaybeOwned<'a, T> { @@ -66,11 +59,9 @@ impl std::ops::Deref for MaybeOwned<'_, T> { /// * However, when we substitute `T` with [`ComputeUserInfoMaybeEndpoint`], /// this helps us provide the credentials only to those auth /// backends which require them for the authentication process. -pub enum Backend<'a, T, D> { +pub enum Backend<'a, T> { /// Cloud API (V2). ControlPlane(MaybeOwned<'a, ControlPlaneBackend>, T), - /// Authentication via a web browser. 
- ConsoleRedirect(MaybeOwned<'a, ConsoleRedirectBackend>, D), /// Local proxy uses configured auth credentials and does not wake compute Local(MaybeOwned<'a, LocalBackend>), } @@ -91,7 +82,7 @@ impl Clone for Box { } } -impl std::fmt::Display for Backend<'_, (), ()> { +impl std::fmt::Display for Backend<'_, ()> { fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::ControlPlane(api, ()) => match &**api { @@ -107,46 +98,39 @@ impl std::fmt::Display for Backend<'_, (), ()> { #[cfg(test)] ControlPlaneBackend::Test(_) => fmt.debug_tuple("ControlPlane::Test").finish(), }, - Self::ConsoleRedirect(backend, ()) => fmt - .debug_tuple("ConsoleRedirect") - .field(&backend.url().as_str()) - .finish(), Self::Local(_) => fmt.debug_tuple("Local").finish(), } } } -impl Backend<'_, T, D> { +impl Backend<'_, T> { /// Very similar to [`std::option::Option::as_ref`]. /// This helps us pass structured config to async tasks. - pub(crate) fn as_ref(&self) -> Backend<'_, &T, &D> { + pub(crate) fn as_ref(&self) -> Backend<'_, &T> { match self { Self::ControlPlane(c, x) => Backend::ControlPlane(MaybeOwned::Borrowed(c), x), - Self::ConsoleRedirect(c, x) => Backend::ConsoleRedirect(MaybeOwned::Borrowed(c), x), Self::Local(l) => Backend::Local(MaybeOwned::Borrowed(l)), } } } -impl<'a, T, D> Backend<'a, T, D> { +impl<'a, T> Backend<'a, T> { /// Very similar to [`std::option::Option::map`]. /// Maps [`Backend`] to [`Backend`] by applying /// a function to a contained value. - pub(crate) fn map(self, f: impl FnOnce(T) -> R) -> Backend<'a, R, D> { + pub(crate) fn map(self, f: impl FnOnce(T) -> R) -> Backend<'a, R> { match self { Self::ControlPlane(c, x) => Backend::ControlPlane(c, f(x)), - Self::ConsoleRedirect(c, x) => Backend::ConsoleRedirect(c, x), Self::Local(l) => Backend::Local(l), } } } -impl<'a, T, D, E> Backend<'a, Result, D> { +impl<'a, T, E> Backend<'a, Result> { /// Very similar to [`std::option::Option::transpose`]. 
/// This is most useful for error handling. - pub(crate) fn transpose(self) -> Result, E> { + pub(crate) fn transpose(self) -> Result, E> { match self { Self::ControlPlane(c, x) => x.map(|x| Backend::ControlPlane(c, x)), - Self::ConsoleRedirect(c, x) => Ok(Backend::ConsoleRedirect(c, x)), Self::Local(l) => Ok(Backend::Local(l)), } } @@ -414,12 +398,11 @@ async fn authenticate_with_secret( classic::authenticate(ctx, info, client, config, secret).await } -impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> { +impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { /// Get username from the credentials. pub(crate) fn get_user(&self) -> &str { match self { Self::ControlPlane(_, user_info) => &user_info.user, - Self::ConsoleRedirect(_, ()) => "web", Self::Local(_) => "local", } } @@ -433,7 +416,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> { allow_cleartext: bool, config: &'static AuthenticationConfig, endpoint_rate_limiter: Arc, - ) -> auth::Result> { + ) -> auth::Result> { let res = match self { Self::ControlPlane(api, user_info) => { info!( @@ -454,14 +437,6 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> { .await?; Backend::ControlPlane(api, credentials) } - // NOTE: this auth backend doesn't use client credentials. 
- Self::ConsoleRedirect(backend, ()) => { - info!("performing web authentication"); - - let info = backend.authenticate(ctx, config, client).await?; - - Backend::ConsoleRedirect(backend, info) - } Self::Local(_) => { return Err(auth::AuthError::bad_auth_method("invalid for local proxy")) } @@ -472,14 +447,13 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> { } } -impl Backend<'_, ComputeUserInfo, &()> { +impl Backend<'_, ComputeUserInfo> { pub(crate) async fn get_role_secret( &self, ctx: &RequestMonitoring, ) -> Result { match self { Self::ControlPlane(api, user_info) => api.get_role_secret(ctx, user_info).await, - Self::ConsoleRedirect(_, ()) => Ok(Cached::new_uncached(None)), Self::Local(_) => Ok(Cached::new_uncached(None)), } } @@ -492,21 +466,19 @@ impl Backend<'_, ComputeUserInfo, &()> { Self::ControlPlane(api, user_info) => { api.get_allowed_ips_and_secret(ctx, user_info).await } - Self::ConsoleRedirect(_, ()) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), } } } #[async_trait::async_trait] -impl ComputeConnectBackend for Backend<'_, ComputeCredentials, NodeInfo> { +impl ComputeConnectBackend for Backend<'_, ComputeCredentials> { async fn wake_compute( &self, ctx: &RequestMonitoring, ) -> Result { match self { Self::ControlPlane(api, creds) => api.wake_compute(ctx, &creds.info).await, - Self::ConsoleRedirect(_, info) => Ok(Cached::new_uncached(info.clone())), Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())), } } @@ -514,31 +486,6 @@ impl ComputeConnectBackend for Backend<'_, ComputeCredentials, NodeInfo> { fn get_keys(&self) -> &ComputeCredentialKeys { match self { Self::ControlPlane(_, creds) => &creds.keys, - Self::ConsoleRedirect(_, _) => &ComputeCredentialKeys::None, - Self::Local(_) => &ComputeCredentialKeys::None, - } - } -} - -#[async_trait::async_trait] -impl ComputeConnectBackend for Backend<'_, ComputeCredentials, &()> { - async fn 
wake_compute( - &self, - ctx: &RequestMonitoring, - ) -> Result { - match self { - Self::ControlPlane(api, creds) => api.wake_compute(ctx, &creds.info).await, - Self::ConsoleRedirect(_, ()) => { - unreachable!("web auth flow doesn't support waking the compute") - } - Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())), - } - } - - fn get_keys(&self) -> &ComputeCredentialKeys { - match self { - Self::ControlPlane(_, creds) => &creds.keys, - Self::ConsoleRedirect(_, ()) => &ComputeCredentialKeys::None, Self::Local(_) => &ComputeCredentialKeys::None, } } @@ -546,34 +493,32 @@ impl ComputeConnectBackend for Backend<'_, ComputeCredentials, &()> { #[cfg(test)] mod tests { - use std::{net::IpAddr, sync::Arc, time::Duration}; + use std::net::IpAddr; + use std::sync::Arc; + use std::time::Duration; use bytes::BytesMut; use fallible_iterator::FallibleIterator; use once_cell::sync::Lazy; - use postgres_protocol::{ - authentication::sasl::{ChannelBinding, ScramSha256}, - message::{backend::Message as PgMessage, frontend}, - }; + use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256}; + use postgres_protocol::message::backend::Message as PgMessage; + use postgres_protocol::message::frontend; use provider::AuthSecret; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; - use crate::{ - auth::{backend::MaskedIp, ComputeUserInfoMaybeEndpoint, IpPattern}, - config::AuthenticationConfig, - context::RequestMonitoring, - control_plane::{ - self, - provider::{self, CachedAllowedIps, CachedRoleSecret}, - CachedNodeInfo, - }, - proxy::NeonOptions, - rate_limiter::{EndpointRateLimiter, RateBucketInfo}, - scram::{threadpool::ThreadPool, ServerSecret}, - stream::{PqStream, Stream}, - }; - - use super::{auth_quirks, jwt::JwkCache, AuthRateLimiter}; + use super::jwt::JwkCache; + use super::{auth_quirks, AuthRateLimiter}; + use crate::auth::backend::MaskedIp; + use crate::auth::{ComputeUserInfoMaybeEndpoint, IpPattern}; + use 
crate::config::AuthenticationConfig; + use crate::context::RequestMonitoring; + use crate::control_plane::provider::{self, CachedAllowedIps, CachedRoleSecret}; + use crate::control_plane::{self, CachedNodeInfo}; + use crate::proxy::NeonOptions; + use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo}; + use crate::scram::threadpool::ThreadPool; + use crate::scram::ServerSecret; + use crate::stream::{PqStream, Stream}; struct Auth { ips: Vec, @@ -607,7 +552,8 @@ mod tests { &self, _ctx: &RequestMonitoring, _endpoint: crate::EndpointId, - ) -> anyhow::Result> { + ) -> Result, control_plane::errors::GetEndpointJwksError> + { unimplemented!() } diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index cba8601d14..465e427f7c 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -1,20 +1,22 @@ //! User credentials used in authentication. -use crate::{ - auth::password_hack::parse_endpoint_param, - context::RequestMonitoring, - error::{ReportableError, UserFacingError}, - metrics::{Metrics, SniKind}, - proxy::NeonOptions, - serverless::SERVERLESS_DRIVER_SNI, - EndpointId, RoleName, -}; +use std::collections::HashSet; +use std::net::IpAddr; +use std::str::FromStr; + use itertools::Itertools; use pq_proto::StartupMessageParams; -use std::{collections::HashSet, net::IpAddr, str::FromStr}; use thiserror::Error; use tracing::{info, warn}; +use crate::auth::password_hack::parse_endpoint_param; +use crate::context::RequestMonitoring; +use crate::error::{ReportableError, UserFacingError}; +use crate::metrics::{Metrics, SniKind}; +use crate::proxy::NeonOptions; +use crate::serverless::SERVERLESS_DRIVER_SNI; +use crate::{EndpointId, RoleName}; + #[derive(Debug, Error, PartialEq, Eq, Clone)] pub(crate) enum ComputeUserInfoParseError { #[error("Parameter '{0}' is missing in startup packet.")] @@ -191,7 +193,7 @@ impl<'de> serde::de::Deserialize<'de> for IpPattern { D: serde::Deserializer<'de>, { struct StrVisitor; - impl<'de> 
serde::de::Visitor<'de> for StrVisitor { + impl serde::de::Visitor<'_> for StrVisitor { type Value = IpPattern; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { @@ -249,10 +251,11 @@ fn project_name_valid(name: &str) -> bool { #[cfg(test)] mod tests { - use super::*; use serde_json::json; use ComputeUserInfoParseError::*; + use super::*; + #[test] fn parse_bare_minimum() -> anyhow::Result<()> { // According to postgresql, only `user` should be required. diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 9a5139dfb8..6294549ff6 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -1,21 +1,24 @@ //! Main authentication flow. -use super::{backend::ComputeCredentialKeys, AuthErrorImpl, PasswordHackPayload}; -use crate::{ - config::TlsServerEndPoint, - context::RequestMonitoring, - control_plane::AuthSecret, - intern::EndpointIdInt, - sasl, - scram::{self, threadpool::ThreadPool}, - stream::{PqStream, Stream}, -}; +use std::io; +use std::sync::Arc; + use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS}; use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be}; -use std::{io, sync::Arc}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; +use super::backend::ComputeCredentialKeys; +use super::{AuthError, PasswordHackPayload}; +use crate::config::TlsServerEndPoint; +use crate::context::RequestMonitoring; +use crate::control_plane::AuthSecret; +use crate::intern::EndpointIdInt; +use crate::sasl; +use crate::scram::threadpool::ThreadPool; +use crate::scram::{self}; +use crate::stream::{PqStream, Stream}; + /// Every authentication selector is supposed to implement this trait. 
pub(crate) trait AuthMethod { /// Any authentication selector should provide initial backend message @@ -114,14 +117,14 @@ impl AuthFlow<'_, S, PasswordHack> { let msg = self.stream.read_password_message().await?; let password = msg .strip_suffix(&[0]) - .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; + .ok_or(AuthError::MalformedPassword("missing terminator"))?; let payload = PasswordHackPayload::parse(password) // If we ended up here and the payload is malformed, it means that // the user neither enabled SNI nor resorted to any other method // for passing the project name we rely on. We should show them // the most helpful error message and point to the documentation. - .ok_or(AuthErrorImpl::MissingEndpointName)?; + .ok_or(AuthError::MissingEndpointName)?; Ok(payload) } @@ -133,7 +136,7 @@ impl AuthFlow<'_, S, CleartextPassword> { let msg = self.stream.read_password_message().await?; let password = msg .strip_suffix(&[0]) - .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; + .ok_or(AuthError::MalformedPassword("missing terminator"))?; let outcome = validate_password_and_exchange( &self.state.pool, @@ -163,7 +166,7 @@ impl AuthFlow<'_, S, Scram<'_>> { // Initial client message contains the chosen auth method's name. let msg = self.stream.read_password_message().await?; let sasl = sasl::FirstMessage::parse(&msg) - .ok_or(AuthErrorImpl::MalformedPassword("bad sasl message"))?; + .ok_or(AuthError::MalformedPassword("bad sasl message"))?; // Currently, the only supported SASL method is SCRAM. 
if !scram::METHODS.contains(&sasl.method) { diff --git a/proxy/src/auth/mod.rs b/proxy/src/auth/mod.rs index 0c8686add2..7a373dd825 100644 --- a/proxy/src/auth/mod.rs +++ b/proxy/src/auth/mod.rs @@ -14,22 +14,22 @@ pub(crate) use password_hack::parse_endpoint_param; use password_hack::PasswordHackPayload; mod flow; +use std::io; +use std::net::IpAddr; + pub(crate) use flow::*; +use thiserror::Error; use tokio::time::error::Elapsed; -use crate::{ - control_plane, - error::{ReportableError, UserFacingError}, -}; -use std::{io, net::IpAddr}; -use thiserror::Error; +use crate::control_plane; +use crate::error::{ReportableError, UserFacingError}; /// Convenience wrapper for the authentication error. pub(crate) type Result = std::result::Result; /// Common authentication error. #[derive(Debug, Error)] -pub(crate) enum AuthErrorImpl { +pub(crate) enum AuthError { #[error(transparent)] Web(#[from] backend::WebAuthError), @@ -78,80 +78,70 @@ pub(crate) enum AuthErrorImpl { ConfirmationTimeout(humantime::Duration), } -#[derive(Debug, Error)] -#[error(transparent)] -pub(crate) struct AuthError(Box); - impl AuthError { pub(crate) fn bad_auth_method(name: impl Into>) -> Self { - AuthErrorImpl::BadAuthMethod(name.into()).into() + AuthError::BadAuthMethod(name.into()) } pub(crate) fn auth_failed(user: impl Into>) -> Self { - AuthErrorImpl::AuthFailed(user.into()).into() + AuthError::AuthFailed(user.into()) } pub(crate) fn ip_address_not_allowed(ip: IpAddr) -> Self { - AuthErrorImpl::IpAddressNotAllowed(ip).into() + AuthError::IpAddressNotAllowed(ip) } pub(crate) fn too_many_connections() -> Self { - AuthErrorImpl::TooManyConnections.into() + AuthError::TooManyConnections } pub(crate) fn is_auth_failed(&self) -> bool { - matches!(self.0.as_ref(), AuthErrorImpl::AuthFailed(_)) + matches!(self, AuthError::AuthFailed(_)) } pub(crate) fn user_timeout(elapsed: Elapsed) -> Self { - AuthErrorImpl::UserTimeout(elapsed).into() + AuthError::UserTimeout(elapsed) } pub(crate) fn 
confirmation_timeout(timeout: humantime::Duration) -> Self { - AuthErrorImpl::ConfirmationTimeout(timeout).into() - } -} - -impl> From for AuthError { - fn from(e: E) -> Self { - Self(Box::new(e.into())) + AuthError::ConfirmationTimeout(timeout) } } impl UserFacingError for AuthError { fn to_string_client(&self) -> String { - match self.0.as_ref() { - AuthErrorImpl::Web(e) => e.to_string_client(), - AuthErrorImpl::GetAuthInfo(e) => e.to_string_client(), - AuthErrorImpl::Sasl(e) => e.to_string_client(), - AuthErrorImpl::AuthFailed(_) => self.to_string(), - AuthErrorImpl::BadAuthMethod(_) => self.to_string(), - AuthErrorImpl::MalformedPassword(_) => self.to_string(), - AuthErrorImpl::MissingEndpointName => self.to_string(), - AuthErrorImpl::Io(_) => "Internal error".to_string(), - AuthErrorImpl::IpAddressNotAllowed(_) => self.to_string(), - AuthErrorImpl::TooManyConnections => self.to_string(), - AuthErrorImpl::UserTimeout(_) => self.to_string(), - AuthErrorImpl::ConfirmationTimeout(_) => self.to_string(), + match self { + Self::Web(e) => e.to_string_client(), + Self::GetAuthInfo(e) => e.to_string_client(), + Self::Sasl(e) => e.to_string_client(), + Self::AuthFailed(_) => self.to_string(), + Self::BadAuthMethod(_) => self.to_string(), + Self::MalformedPassword(_) => self.to_string(), + Self::MissingEndpointName => self.to_string(), + Self::Io(_) => "Internal error".to_string(), + Self::IpAddressNotAllowed(_) => self.to_string(), + Self::TooManyConnections => self.to_string(), + Self::UserTimeout(_) => self.to_string(), + Self::ConfirmationTimeout(_) => self.to_string(), } } } impl ReportableError for AuthError { fn get_error_kind(&self) -> crate::error::ErrorKind { - match self.0.as_ref() { - AuthErrorImpl::Web(e) => e.get_error_kind(), - AuthErrorImpl::GetAuthInfo(e) => e.get_error_kind(), - AuthErrorImpl::Sasl(e) => e.get_error_kind(), - AuthErrorImpl::AuthFailed(_) => crate::error::ErrorKind::User, - AuthErrorImpl::BadAuthMethod(_) => 
crate::error::ErrorKind::User, - AuthErrorImpl::MalformedPassword(_) => crate::error::ErrorKind::User, - AuthErrorImpl::MissingEndpointName => crate::error::ErrorKind::User, - AuthErrorImpl::Io(_) => crate::error::ErrorKind::ClientDisconnect, - AuthErrorImpl::IpAddressNotAllowed(_) => crate::error::ErrorKind::User, - AuthErrorImpl::TooManyConnections => crate::error::ErrorKind::RateLimit, - AuthErrorImpl::UserTimeout(_) => crate::error::ErrorKind::User, - AuthErrorImpl::ConfirmationTimeout(_) => crate::error::ErrorKind::User, + match self { + Self::Web(e) => e.get_error_kind(), + Self::GetAuthInfo(e) => e.get_error_kind(), + Self::Sasl(e) => e.get_error_kind(), + Self::AuthFailed(_) => crate::error::ErrorKind::User, + Self::BadAuthMethod(_) => crate::error::ErrorKind::User, + Self::MalformedPassword(_) => crate::error::ErrorKind::User, + Self::MissingEndpointName => crate::error::ErrorKind::User, + Self::Io(_) => crate::error::ErrorKind::ClientDisconnect, + Self::IpAddressNotAllowed(_) => crate::error::ErrorKind::User, + Self::TooManyConnections => crate::error::ErrorKind::RateLimit, + Self::UserTimeout(_) => crate::error::ErrorKind::User, + Self::ConfirmationTimeout(_) => crate::error::ErrorKind::User, } } } diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index c781af846a..a16c288e5d 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -1,41 +1,44 @@ -use std::{net::SocketAddr, pin::pin, str::FromStr, sync::Arc, time::Duration}; +use std::net::SocketAddr; +use std::pin::pin; +use std::str::FromStr; +use std::sync::Arc; +use std::time::Duration; use anyhow::{bail, ensure, Context}; use camino::{Utf8Path, Utf8PathBuf}; use compute_api::spec::LocalProxySpec; use dashmap::DashMap; use futures::future::Either; -use proxy::{ - auth::{ - self, - backend::{ - jwt::JwkCache, - local::{LocalBackend, JWKS_ROLE_MAP}, - }, - }, - cancellation::CancellationHandlerMain, - config::{self, AuthenticationConfig, HttpConfig, 
ProxyConfig, RetryConfig}, - control_plane::{ - locks::ApiLocks, - messages::{EndpointJwksResponse, JwksSettings}, - }, - http::health_server::AppMetrics, - intern::RoleNameInt, - metrics::{Metrics, ThreadPoolMetrics}, - rate_limiter::{BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo}, - scram::threadpool::ThreadPool, - serverless::{self, cancel_set::CancelSet, GlobalConnPoolOptions}, - RoleName, +use proxy::auth::backend::jwt::JwkCache; +use proxy::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP}; +use proxy::auth::{self}; +use proxy::cancellation::CancellationHandlerMain; +use proxy::config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig}; +use proxy::control_plane::locks::ApiLocks; +use proxy::control_plane::messages::{EndpointJwksResponse, JwksSettings}; +use proxy::http::health_server::AppMetrics; +use proxy::intern::RoleNameInt; +use proxy::metrics::{Metrics, ThreadPoolMetrics}; +use proxy::rate_limiter::{ + BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, }; +use proxy::scram::threadpool::ThreadPool; +use proxy::serverless::cancel_set::CancelSet; +use proxy::serverless::{self, GlobalConnPoolOptions}; +use proxy::url::ApiUrl; +use proxy::RoleName; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); use clap::Parser; -use tokio::{net::TcpListener, sync::Notify, task::JoinSet}; +use tokio::net::TcpListener; +use tokio::sync::Notify; +use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::{error, info, warn}; -use utils::{pid_file, project_build_tag, project_git_version, sentry_init::init_sentry}; +use utils::sentry_init::init_sentry; +use utils::{pid_file, project_build_tag, project_git_version}; #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; @@ -78,7 +81,10 @@ struct LocalProxyCliArgs { connect_to_compute_retry: String, /// Address of the postgres server #[clap(long, default_value = "127.0.0.1:5432")] - 
compute: SocketAddr, + postgres: SocketAddr, + /// Address of the compute-ctl api service + #[clap(long, default_value = "http://127.0.0.1:3080/")] + compute_ctl: ApiUrl, /// Path of the local proxy config file #[clap(long, default_value = "./local_proxy.json")] config_path: Utf8PathBuf, @@ -291,9 +297,9 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig /// auth::Backend is created at proxy startup, and lives forever. fn build_auth_backend( args: &LocalProxyCliArgs, -) -> anyhow::Result<&'static auth::Backend<'static, (), ()>> { +) -> anyhow::Result<&'static auth::Backend<'static, ()>> { let auth_backend = proxy::auth::Backend::Local(proxy::auth::backend::MaybeOwned::Owned( - LocalBackend::new(args.compute), + LocalBackend::new(args.postgres, args.compute_ctl.clone()), )); Ok(Box::leak(Box::new(auth_backend))) diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 53f1586abe..13b7fdd40a 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -5,25 +5,24 @@ /// the outside. Similar to an ingress controller for HTTPS. 
use std::{net::SocketAddr, sync::Arc}; +use anyhow::{anyhow, bail, ensure, Context}; +use clap::Arg; use futures::future::Either; +use futures::TryFutureExt; use itertools::Itertools; use proxy::config::TlsServerEndPoint; use proxy::context::RequestMonitoring; use proxy::metrics::{Metrics, ThreadPoolMetrics}; use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; -use rustls::pki_types::PrivateKeyDer; -use tokio::net::TcpListener; - -use anyhow::{anyhow, bail, ensure, Context}; -use clap::Arg; -use futures::TryFutureExt; use proxy::stream::{PqStream, Stream}; - +use rustls::crypto::aws_lc_rs; +use rustls::pki_types::PrivateKeyDer; use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::net::TcpListener; use tokio_util::sync::CancellationToken; -use utils::{project_git_version, sentry_init::init_sentry}; - use tracing::{error, info, Instrument}; +use utils::project_git_version; +use utils::sentry_init::init_sentry; project_git_version!(GIT_VERSION); @@ -106,10 +105,11 @@ async fn main() -> anyhow::Result<()> { let first_cert = cert_chain.first().context("missing certificate")?; let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - let tls_config = rustls::ServerConfig::builder_with_protocol_versions(&[ - &rustls::version::TLS13, - &rustls::version::TLS12, - ]) + let tls_config = rustls::ServerConfig::builder_with_provider(Arc::new( + aws_lc_rs::default_provider(), + )) + .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) + .context("aws_lc_rs should support TLS1.2 and TLS1.3")? .with_no_client_auth() .with_single_cert(cert_chain, key)? 
.into(); diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 3f4c2df809..96a71e69c6 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -1,3 +1,8 @@ +use std::net::SocketAddr; +use std::pin::pin; +use std::sync::Arc; + +use anyhow::bail; use aws_config::environment::EnvironmentVariableCredentialsProvider; use aws_config::imds::credentials::ImdsCredentialsProvider; use aws_config::meta::credentials::CredentialsProviderChain; @@ -7,52 +12,34 @@ use aws_config::provider_config::ProviderConfig; use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; use aws_config::Region; use futures::future::Either; -use proxy::auth; use proxy::auth::backend::jwt::JwkCache; -use proxy::auth::backend::AuthRateLimiter; -use proxy::auth::backend::ConsoleRedirectBackend; -use proxy::auth::backend::MaybeOwned; -use proxy::cancellation::CancelMap; -use proxy::cancellation::CancellationHandler; -use proxy::config::remote_storage_from_toml; -use proxy::config::AuthenticationConfig; -use proxy::config::CacheOptions; -use proxy::config::HttpConfig; -use proxy::config::ProjectInfoCacheOptions; -use proxy::config::ProxyProtocolV2; +use proxy::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; +use proxy::cancellation::{CancelMap, CancellationHandler}; +use proxy::config::{ + self, remote_storage_from_toml, AuthenticationConfig, CacheOptions, HttpConfig, + ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2, +}; use proxy::context::parquet::ParquetUploadArgs; -use proxy::control_plane; -use proxy::http; use proxy::http::health_server::AppMetrics; use proxy::metrics::Metrics; -use proxy::rate_limiter::EndpointRateLimiter; -use proxy::rate_limiter::LeakyBucketConfig; -use proxy::rate_limiter::RateBucketInfo; -use proxy::rate_limiter::WakeComputeRateLimiter; +use proxy::rate_limiter::{ + EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, WakeComputeRateLimiter, +}; use 
proxy::redis::cancellation_publisher::RedisPublisherClient; use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; -use proxy::redis::elasticache; -use proxy::redis::notifications; +use proxy::redis::{elasticache, notifications}; use proxy::scram::threadpool::ThreadPool; use proxy::serverless::cancel_set::CancelSet; use proxy::serverless::GlobalConnPoolOptions; -use proxy::usage_metrics; - -use anyhow::bail; -use proxy::config::{self, ProxyConfig}; -use proxy::serverless; +use proxy::{auth, control_plane, http, serverless, usage_metrics}; use remote_storage::RemoteStorageConfig; -use std::net::SocketAddr; -use std::pin::pin; -use std::sync::Arc; use tokio::net::TcpListener; use tokio::sync::Mutex; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; -use tracing::info; -use tracing::warn; -use tracing::Instrument; -use utils::{project_build_tag, project_git_version, sentry_init::init_sentry}; +use tracing::{info, warn, Instrument}; +use utils::sentry_init::init_sentry; +use utils::{project_build_tag, project_git_version}; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); @@ -314,7 +301,10 @@ async fn main() -> anyhow::Result<()> { let config = build_config(&args)?; let auth_backend = build_auth_backend(&args)?; - info!("Authentication backend: {}", auth_backend); + match auth_backend { + Either::Left(auth_backend) => info!("Authentication backend: {auth_backend}"), + Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"), + }; info!("Using region: {}", args.aws_region); let region_provider = @@ -461,26 +451,41 @@ async fn main() -> anyhow::Result<()> { // client facing tasks. 
these will exit on error or on cancellation // cancellation returns Ok(()) let mut client_tasks = JoinSet::new(); - if let Some(proxy_listener) = proxy_listener { - client_tasks.spawn(proxy::proxy::task_main( - config, - auth_backend, - proxy_listener, - cancellation_token.clone(), - cancellation_handler.clone(), - endpoint_rate_limiter.clone(), - )); - } + match auth_backend { + Either::Left(auth_backend) => { + if let Some(proxy_listener) = proxy_listener { + client_tasks.spawn(proxy::proxy::task_main( + config, + auth_backend, + proxy_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + endpoint_rate_limiter.clone(), + )); + } - if let Some(serverless_listener) = serverless_listener { - client_tasks.spawn(serverless::task_main( - config, - auth_backend, - serverless_listener, - cancellation_token.clone(), - cancellation_handler.clone(), - endpoint_rate_limiter.clone(), - )); + if let Some(serverless_listener) = serverless_listener { + client_tasks.spawn(serverless::task_main( + config, + auth_backend, + serverless_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + endpoint_rate_limiter.clone(), + )); + } + } + Either::Right(auth_backend) => { + if let Some(proxy_listener) = proxy_listener { + client_tasks.spawn(proxy::console_redirect_proxy::task_main( + config, + auth_backend, + proxy_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + )); + } + } } client_tasks.spawn(proxy::context::parquet::worker( @@ -510,7 +515,7 @@ async fn main() -> anyhow::Result<()> { )); } - if let auth::Backend::ControlPlane(api, _) = auth_backend { + if let Either::Left(auth::Backend::ControlPlane(api, _)) = &auth_backend { if let proxy::control_plane::provider::ControlPlaneBackend::Management(api) = &**api { match (redis_notifications_client, regional_redis_client.clone()) { (None, None) => {} @@ -663,7 +668,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { 
webauth_confirmation_timeout: args.webauth_confirmation_timeout, }; - let config = Box::leak(Box::new(ProxyConfig { + let config = ProxyConfig { tls_config, metric_collection, allow_self_signed_compute: args.allow_self_signed_compute, @@ -677,7 +682,9 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { connect_to_compute_retry_config: config::RetryConfig::parse( &args.connect_to_compute_retry, )?, - })); + }; + + let config = Box::leak(Box::new(config)); tokio::spawn(config.connect_compute_locks.garbage_collect_worker()); @@ -687,8 +694,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { /// auth::Backend is created at proxy startup, and lives forever. fn build_auth_backend( args: &ProxyCliArgs, -) -> anyhow::Result<&'static auth::Backend<'static, (), ()>> { - let auth_backend = match &args.auth_backend { +) -> anyhow::Result, &'static ConsoleRedirectBackend>> { + match &args.auth_backend { AuthBackendType::Console => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = @@ -738,12 +745,11 @@ fn build_auth_backend( wake_compute_endpoint_rate_limiter, ); let api = control_plane::provider::ControlPlaneBackend::Management(api); - auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()) - } + let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); - AuthBackendType::Web => { - let url = args.uri.parse()?; - auth::Backend::ConsoleRedirect(MaybeOwned::Owned(ConsoleRedirectBackend::new(url)), ()) + let config = Box::leak(Box::new(auth_backend)); + + Ok(Either::Left(config)) } #[cfg(feature = "testing")] @@ -751,11 +757,23 @@ fn build_auth_backend( let url = args.auth_endpoint.parse()?; let api = control_plane::provider::mock::Api::new(url, !args.is_private_access_proxy); let api = control_plane::provider::ControlPlaneBackend::PostgresMock(api); - auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()) - } - 
}; - Ok(Box::leak(Box::new(auth_backend))) + let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); + + let config = Box::leak(Box::new(auth_backend)); + + Ok(Either::Left(config)) + } + + AuthBackendType::Web => { + let url = args.uri.parse()?; + let backend = ConsoleRedirectBackend::new(url); + + let config = Box::leak(Box::new(backend)); + + Ok(Either::Right(config)) + } + } } #[cfg(test)] diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 27121ce89e..82f3247fa7 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -1,31 +1,23 @@ -use std::{ - convert::Infallible, - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, - }, - time::Duration, -}; +use std::convert::Infallible; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; +use std::time::Duration; use dashmap::DashSet; -use redis::{ - streams::{StreamReadOptions, StreamReadReply}, - AsyncCommands, FromRedisValue, Value, -}; +use redis::streams::{StreamReadOptions, StreamReadReply}; +use redis::{AsyncCommands, FromRedisValue, Value}; use serde::Deserialize; use tokio::sync::Mutex; use tokio_util::sync::CancellationToken; use tracing::info; -use crate::{ - config::EndpointCacheConfig, - context::RequestMonitoring, - intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}, - metrics::{Metrics, RedisErrors, RedisEventsCount}, - rate_limiter::GlobalRateLimiter, - redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider, - EndpointId, -}; +use crate::config::EndpointCacheConfig; +use crate::context::RequestMonitoring; +use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; +use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; +use crate::rate_limiter::GlobalRateLimiter; +use crate::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; +use crate::EndpointId; #[derive(Deserialize, Debug, Clone)] pub(crate) struct ControlPlaneEventKey { diff --git 
a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index b92cedb043..31d1dc96e7 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -1,9 +1,8 @@ -use std::{ - collections::HashSet, - convert::Infallible, - sync::{atomic::AtomicU64, Arc}, - time::Duration, -}; +use std::collections::HashSet; +use std::convert::Infallible; +use std::sync::atomic::AtomicU64; +use std::sync::Arc; +use std::time::Duration; use async_trait::async_trait; use dashmap::DashMap; @@ -13,15 +12,12 @@ use tokio::sync::Mutex; use tokio::time::Instant; use tracing::{debug, info}; -use crate::{ - auth::IpPattern, - config::ProjectInfoCacheOptions, - control_plane::AuthSecret, - intern::{EndpointIdInt, ProjectIdInt, RoleNameInt}, - EndpointId, RoleName, -}; - use super::{Cache, Cached}; +use crate::auth::IpPattern; +use crate::config::ProjectInfoCacheOptions; +use crate::control_plane::AuthSecret; +use crate::intern::{EndpointIdInt, ProjectIdInt, RoleNameInt}; +use crate::{EndpointId, RoleName}; #[async_trait] pub(crate) trait ProjectInfoCache { @@ -371,7 +367,8 @@ impl Cache for ProjectInfoCacheImpl { #[cfg(test)] mod tests { use super::*; - use crate::{scram::ServerSecret, ProjectId}; + use crate::scram::ServerSecret; + use crate::ProjectId; #[tokio::test] async fn test_project_info_cache_settings() { diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs index 5b08d74696..06eaeb9a30 100644 --- a/proxy/src/cache/timed_lru.rs +++ b/proxy/src/cache/timed_lru.rs @@ -1,9 +1,6 @@ -use std::{ - borrow::Borrow, - hash::Hash, - time::{Duration, Instant}, -}; -use tracing::debug; +use std::borrow::Borrow; +use std::hash::Hash; +use std::time::{Duration, Instant}; // This seems to make more sense than `lru` or `cached`: // @@ -15,8 +12,10 @@ use tracing::debug; // // On the other hand, `hashlink` has good download stats and appears to be maintained. 
use hashlink::{linked_hash_map::RawEntryMut, LruCache}; +use tracing::debug; -use super::{common::Cached, timed_lru, Cache}; +use super::common::Cached; +use super::{timed_lru, Cache}; /// An implementation of timed LRU cache with fixed capacity. /// Key properties: diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 71a2a16af8..db0970adcb 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,6 +1,8 @@ +use std::net::SocketAddr; +use std::sync::Arc; + use dashmap::DashMap; use pq_proto::CancelKeyData; -use std::{net::SocketAddr, sync::Arc}; use thiserror::Error; use tokio::net::TcpStream; use tokio::sync::Mutex; @@ -8,12 +10,10 @@ use tokio_postgres::{CancelToken, NoTls}; use tracing::info; use uuid::Uuid; -use crate::{ - error::ReportableError, - metrics::{CancellationRequest, CancellationSource, Metrics}, - redis::cancellation_publisher::{ - CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, - }, +use crate::error::ReportableError; +use crate::metrics::{CancellationRequest, CancellationSource, Metrics}; +use crate::redis::cancellation_publisher::{ + CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, }; pub type CancelMap = Arc>>; diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 006804fcd4..a7c2cab4a1 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,25 +1,32 @@ -use crate::{ - auth::parse_endpoint_param, - cancellation::CancelClosure, - context::RequestMonitoring, - control_plane::{errors::WakeComputeError, messages::MetricsAuxInfo, provider::ApiLockError}, - error::{ReportableError, UserFacingError}, - metrics::{Metrics, NumDbConnectionsGuard}, - proxy::neon_option, - Host, -}; +use std::io; +use std::net::SocketAddr; +use std::sync::Arc; +use std::time::Duration; + use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use once_cell::sync::OnceCell; use pq_proto::StartupMessageParams; -use rustls::{client::danger::ServerCertVerifier, 
pki_types::InvalidDnsNameError}; -use std::{io, net::SocketAddr, sync::Arc, time::Duration}; +use rustls::client::danger::ServerCertVerifier; +use rustls::crypto::aws_lc_rs; +use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; use tokio::net::TcpStream; use tokio_postgres::tls::MakeTlsConnect; use tokio_postgres_rustls::MakeRustlsConnect; use tracing::{error, info, warn}; +use crate::auth::parse_endpoint_param; +use crate::cancellation::CancelClosure; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::WakeComputeError; +use crate::control_plane::messages::MetricsAuxInfo; +use crate::control_plane::provider::ApiLockError; +use crate::error::{ReportableError, UserFacingError}; +use crate::metrics::{Metrics, NumDbConnectionsGuard}; +use crate::proxy::neon_option; +use crate::Host; + pub const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; #[derive(Debug, Error)] @@ -32,6 +39,9 @@ pub(crate) enum ConnectionError { #[error("{COULD_NOT_CONNECT}: {0}")] CouldNotConnect(#[from] io::Error), + #[error("Couldn't load native TLS certificates: {0:?}")] + TlsCertificateError(Vec), + #[error("{COULD_NOT_CONNECT}: {0}")] TlsError(#[from] InvalidDnsNameError), @@ -78,6 +88,7 @@ impl ReportableError for ConnectionError { } ConnectionError::Postgres(_) => crate::error::ErrorKind::Compute, ConnectionError::CouldNotConnect(_) => crate::error::ErrorKind::Compute, + ConnectionError::TlsCertificateError(_) => crate::error::ErrorKind::Service, ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute, ConnectionError::WakeComputeError(e) => e.get_error_kind(), ConnectionError::TooManyConnectionAttempts(e) => e.get_error_kind(), @@ -287,12 +298,20 @@ impl ConnCfg { let client_config = if allow_self_signed_compute { // Allow all certificates for creating the connection let verifier = Arc::new(AcceptEverythingVerifier); - rustls::ClientConfig::builder() + 
rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .expect("aws_lc_rs should support the default protocol versions") .dangerous() .with_custom_certificate_verifier(verifier) } else { - let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone(); - rustls::ClientConfig::builder().with_root_certificates(root_store) + let root_store = TLS_ROOTS + .get_or_try_init(load_certs) + .map_err(ConnectionError::TlsCertificateError)? + .clone(); + rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .expect("aws_lc_rs should support the default protocol versions") + .with_root_certificates(root_store) }; let client_config = client_config.with_no_client_auth(); @@ -353,10 +372,15 @@ fn filtered_options(params: &StartupMessageParams) -> Option { Some(options) } -fn load_certs() -> Result, io::Error> { - let der_certs = rustls_native_certs::load_native_certs()?; +fn load_certs() -> Result, Vec> { + let der_certs = rustls_native_certs::load_native_certs(); + + if !der_certs.errors.is_empty() { + return Err(der_certs.errors); + } + let mut store = rustls::RootCertStore::empty(); - store.add_parsable_certificates(der_certs); + store.add_parsable_certificates(der_certs.certs); Ok(Arc::new(store)) } static TLS_ROOTS: OnceCell> = OnceCell::new(); diff --git a/proxy/src/compute_ctl/mod.rs b/proxy/src/compute_ctl/mod.rs new file mode 100644 index 0000000000..2b57897223 --- /dev/null +++ b/proxy/src/compute_ctl/mod.rs @@ -0,0 +1,101 @@ +use compute_api::responses::GenericAPIError; +use hyper::{Method, StatusCode}; +use serde::de::DeserializeOwned; +use serde::{Deserialize, Serialize}; +use thiserror::Error; + +use crate::url::ApiUrl; +use crate::{http, DbName, RoleName}; + +pub struct ComputeCtlApi { + pub(crate) api: http::Endpoint, +} + +#[derive(Serialize, Debug)] +pub struct ExtensionInstallRequest { + pub extension: &'static str, + 
pub database: DbName, + pub version: &'static str, +} + +#[derive(Serialize, Debug)] +pub struct SetRoleGrantsRequest { + pub database: DbName, + pub schema: &'static str, + pub privileges: Vec, + pub role: RoleName, +} + +#[derive(Clone, Debug, Deserialize)] +pub struct ExtensionInstallResponse {} + +#[derive(Clone, Debug, Deserialize)] +pub struct SetRoleGrantsResponse {} + +#[derive(Debug, Serialize, Deserialize, Clone, Copy)] +#[serde(rename_all = "UPPERCASE")] +pub enum Privilege { + Usage, +} + +#[derive(Error, Debug)] +pub enum ComputeCtlError { + #[error("connection error: {0}")] + ConnectionError(#[source] reqwest_middleware::Error), + #[error("request error [{status}]: {body:?}")] + RequestError { + status: StatusCode, + body: Option, + }, + #[error("response parsing error: {0}")] + ResponseError(#[source] reqwest::Error), +} + +impl ComputeCtlApi { + pub async fn install_extension( + &self, + req: &ExtensionInstallRequest, + ) -> Result { + self.generic_request(req, Method::POST, |url| { + url.path_segments_mut().push("extensions"); + }) + .await + } + + pub async fn grant_role( + &self, + req: &SetRoleGrantsRequest, + ) -> Result { + self.generic_request(req, Method::POST, |url| { + url.path_segments_mut().push("grants"); + }) + .await + } + + async fn generic_request( + &self, + req: &Req, + method: Method, + url: impl for<'a> FnOnce(&'a mut ApiUrl), + ) -> Result + where + Req: Serialize, + Resp: DeserializeOwned, + { + let resp = self + .api + .request_with_url(method, url) + .json(req) + .send() + .await + .map_err(ComputeCtlError::ConnectionError)?; + + let status = resp.status(); + if status.is_client_error() || status.is_server_error() { + let body = resp.json().await.ok(); + return Err(ComputeCtlError::RequestError { status, body }); + } + + resp.json().await.map_err(ComputeCtlError::ResponseError) + } +} diff --git a/proxy/src/config.rs b/proxy/src/config.rs index c068fc50fb..3baa7ec751 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs 
@@ -1,29 +1,27 @@ -use crate::{ - auth::backend::{jwt::JwkCache, AuthRateLimiter}, - control_plane::locks::ApiLocks, - rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig}, - scram::threadpool::ThreadPool, - serverless::{cancel_set::CancelSet, GlobalConnPoolOptions}, - Host, -}; +use std::collections::{HashMap, HashSet}; +use std::str::FromStr; +use std::sync::Arc; +use std::time::Duration; + use anyhow::{bail, ensure, Context, Ok}; use clap::ValueEnum; use itertools::Itertools; use remote_storage::RemoteStorageConfig; -use rustls::{ - crypto::ring::sign, - pki_types::{CertificateDer, PrivateKeyDer}, -}; +use rustls::crypto::aws_lc_rs::{self, sign}; +use rustls::pki_types::{CertificateDer, PrivateKeyDer}; use sha2::{Digest, Sha256}; -use std::{ - collections::{HashMap, HashSet}, - str::FromStr, - sync::Arc, - time::Duration, -}; use tracing::{error, info}; use x509_parser::oid_registry; +use crate::auth::backend::jwt::JwkCache; +use crate::auth::backend::AuthRateLimiter; +use crate::control_plane::locks::ApiLocks; +use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig}; +use crate::scram::threadpool::ThreadPool; +use crate::serverless::cancel_set::CancelSet; +use crate::serverless::GlobalConnPoolOptions; +use crate::Host; + pub struct ProxyConfig { pub tls_config: Option, pub metric_collection: Option, @@ -128,12 +126,12 @@ pub fn configure_tls( let cert_resolver = Arc::new(cert_resolver); // allow TLS 1.2 to be compatible with older client libraries - let mut config = rustls::ServerConfig::builder_with_protocol_versions(&[ - &rustls::version::TLS13, - &rustls::version::TLS12, - ]) - .with_no_client_auth() - .with_cert_resolver(cert_resolver.clone()); + let mut config = + rustls::ServerConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) + .context("aws_lc_rs should support TLS1.2 and TLS1.3")? 
+ .with_no_client_auth() + .with_cert_resolver(cert_resolver.clone()); config.alpn_protocols = vec![PG_ALPN_PROTOCOL.to_vec()]; @@ -560,7 +558,7 @@ pub struct RetryConfig { } impl RetryConfig { - /// Default options for RetryConfig. + // Default options for RetryConfig. /// Total delay for 5 retries with 200ms base delay and 2 backoff factor is about 6s. pub const CONNECT_TO_COMPUTE_DEFAULT_VALUES: &'static str = @@ -692,9 +690,8 @@ impl FromStr for ConcurrencyLockOptions { #[cfg(test)] mod tests { - use crate::rate_limiter::Aimd; - use super::*; + use crate::rate_limiter::Aimd; #[test] fn test_parse_cache_options() -> anyhow::Result<()> { diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs new file mode 100644 index 0000000000..81d1d70958 --- /dev/null +++ b/proxy/src/console_redirect_proxy.rs @@ -0,0 +1,214 @@ +use std::sync::Arc; + +use futures::TryFutureExt; +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; +use tokio_util::sync::CancellationToken; +use tracing::{error, info, Instrument}; + +use crate::auth::backend::ConsoleRedirectBackend; +use crate::cancellation::{CancellationHandlerMain, CancellationHandlerMainInternal}; +use crate::config::{ProxyConfig, ProxyProtocolV2}; +use crate::context::RequestMonitoring; +use crate::error::ReportableError; +use crate::metrics::{Metrics, NumClientConnectionsGuard}; +use crate::protocol2::read_proxy_protocol; +use crate::proxy::connect_compute::{connect_to_compute, TcpMechanism}; +use crate::proxy::handshake::{handshake, HandshakeData}; +use crate::proxy::passthrough::ProxyPassthrough; +use crate::proxy::{ + prepare_client_connection, run_until_cancelled, ClientRequestError, ErrorSource, +}; + +pub async fn task_main( + config: &'static ProxyConfig, + backend: &'static ConsoleRedirectBackend, + listener: tokio::net::TcpListener, + cancellation_token: CancellationToken, + cancellation_handler: Arc, +) -> anyhow::Result<()> { + scopeguard::defer! 
{ + info!("proxy has shut down"); + } + + // When set for the server socket, the keepalive setting + // will be inherited by all accepted client sockets. + socket2::SockRef::from(&listener).set_keepalive(true)?; + + let connections = tokio_util::task::task_tracker::TaskTracker::new(); + + while let Some(accept_result) = + run_until_cancelled(listener.accept(), &cancellation_token).await + { + let (socket, peer_addr) = accept_result?; + + let conn_gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Tcp); + + let session_id = uuid::Uuid::new_v4(); + let cancellation_handler = Arc::clone(&cancellation_handler); + + tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection"); + + connections.spawn(async move { + let (socket, peer_addr) = match read_proxy_protocol(socket).await { + Err(e) => { + error!("per-client task finished with an error: {e:#}"); + return; + } + Ok((_socket, None)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => { + error!("missing required proxy protocol header"); + return; + } + Ok((_socket, Some(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => { + error!("proxy protocol header not supported"); + return; + } + Ok((socket, Some(addr))) => (socket, addr.ip()), + Ok((socket, None)) => (socket, peer_addr.ip()), + }; + + match socket.inner.set_nodelay(true) { + Ok(()) => {} + Err(e) => { + error!("per-client task finished with an error: failed to set socket option: {e:#}"); + return; + } + }; + + let ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Tcp, + &config.region, + ); + let span = ctx.span(); + + let startup = Box::pin( + handle_client( + config, + backend, + &ctx, + cancellation_handler, + socket, + conn_gauge, + ) + .instrument(span.clone()), + ); + let res = startup.await; + + match res { + Err(e) => { + // todo: log and push to ctx the error kind + ctx.set_error_kind(e.get_error_kind()); + error!(parent: &span, "per-client 
task finished with an error: {e:#}"); + } + Ok(None) => { + ctx.set_success(); + } + Ok(Some(p)) => { + ctx.set_success(); + ctx.log_connect(); + match p.proxy_pass().instrument(span.clone()).await { + Ok(()) => {} + Err(ErrorSource::Client(e)) => { + error!(parent: &span, "per-client task finished with an IO error from the client: {e:#}"); + } + Err(ErrorSource::Compute(e)) => { + error!(parent: &span, "per-client task finished with an IO error from the compute: {e:#}"); + } + } + } + } + }); + } + + connections.close(); + drop(listener); + + // Drain connections + connections.wait().await; + + Ok(()) +} + +pub(crate) async fn handle_client( + config: &'static ProxyConfig, + backend: &'static ConsoleRedirectBackend, + ctx: &RequestMonitoring, + cancellation_handler: Arc, + stream: S, + conn_gauge: NumClientConnectionsGuard<'static>, +) -> Result>, ClientRequestError> { + info!( + protocol = %ctx.protocol(), + "handling interactive connection from client" + ); + + let metrics = &Metrics::get().proxy; + let proto = ctx.protocol(); + let request_gauge = metrics.connection_requests.guard(proto); + + let tls = config.tls_config.as_ref(); + + let record_handshake_error = !ctx.has_private_peer_addr(); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); + let do_handshake = handshake(ctx, stream, tls, record_handshake_error); + let (mut stream, params) = + match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { + HandshakeData::Startup(stream, params) => (stream, params), + HandshakeData::Cancel(cancel_key_data) => { + return Ok(cancellation_handler + .cancel_session(cancel_key_data, ctx.session_id()) + .await + .map(|()| None)?) 
+ } + }; + drop(pause); + + ctx.set_db_options(params.clone()); + + let user_info = match backend + .authenticate(ctx, &config.authentication_config, &mut stream) + .await + { + Ok(auth_result) => auth_result, + Err(e) => { + return stream.throw_error(e).await?; + } + }; + + let mut node = connect_to_compute( + ctx, + &TcpMechanism { + params: ¶ms, + locks: &config.connect_compute_locks, + }, + &user_info, + config.allow_self_signed_compute, + config.wake_compute_retry_config, + config.connect_to_compute_retry_config, + ) + .or_else(|e| stream.throw_error(e)) + .await?; + + let session = cancellation_handler.get_session(); + prepare_client_connection(&node, &session, &mut stream).await?; + + // Before proxy passing, forward to compute whatever data is left in the + // PqStream input buffer. Normally there is none, but our serverless npm + // driver in pipeline mode sends startup, password and first query + // immediately after opening the connection. + let (stream, read_buf) = stream.into_inner(); + node.stream.write_all(&read_buf).await?; + + Ok(Some(ProxyPassthrough { + client: stream, + aux: node.aux.clone(), + compute: node, + _req: request_gauge, + _conn: conn_gauge, + _cancel: session, + })) +} diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 7fb4e7c698..e2d2c1b766 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -1,24 +1,25 @@ //! 
Connection request monitoring contexts +use std::net::IpAddr; + use chrono::Utc; use once_cell::sync::OnceCell; use pq_proto::StartupMessageParams; use smol_str::SmolStr; -use std::net::IpAddr; use tokio::sync::mpsc; -use tracing::{debug, field::display, info, info_span, Span}; +use tracing::field::display; +use tracing::{debug, info, info_span, Span}; use try_lock::TryLock; use uuid::Uuid; -use crate::{ - control_plane::messages::{ColdStartInfo, MetricsAuxInfo}, - error::ErrorKind, - intern::{BranchIdInt, ProjectIdInt}, - metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting}, - DbName, EndpointId, RoleName, -}; - use self::parquet::RequestData; +use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; +use crate::error::ErrorKind; +use crate::intern::{BranchIdInt, ProjectIdInt}; +use crate::metrics::{ + ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting, +}; +use crate::{DbName, EndpointId, RoleName}; pub mod parquet; diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 9f6f83022e..3432ac5ff6 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -1,29 +1,28 @@ -use std::{sync::Arc, time::SystemTime}; +use std::sync::Arc; +use std::time::SystemTime; use anyhow::Context; -use bytes::{buf::Writer, BufMut, BytesMut}; +use bytes::buf::Writer; +use bytes::{BufMut, BytesMut}; use chrono::{Datelike, Timelike}; use futures::{Stream, StreamExt}; -use parquet::{ - basic::Compression, - file::{ - metadata::RowGroupMetaDataPtr, - properties::{WriterProperties, WriterPropertiesPtr, DEFAULT_PAGE_SIZE}, - writer::SerializedFileWriter, - }, - record::RecordWriter, -}; +use parquet::basic::Compression; +use parquet::file::metadata::RowGroupMetaDataPtr; +use parquet::file::properties::{WriterProperties, WriterPropertiesPtr, DEFAULT_PAGE_SIZE}; +use parquet::file::writer::SerializedFileWriter; +use parquet::record::RecordWriter; use 
pq_proto::StartupMessageParams; use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel}; use serde::ser::SerializeMap; -use tokio::{sync::mpsc, time}; +use tokio::sync::mpsc; +use tokio::time; use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; use utils::backoff; -use crate::{config::remote_storage_from_toml, context::LOG_CHAN_DISCONNECT}; - use super::{RequestMonitoringInner, LOG_CHAN}; +use crate::config::remote_storage_from_toml; +use crate::context::LOG_CHAN_DISCONNECT; #[derive(clap::Args, Clone, Debug)] pub struct ParquetUploadArgs { @@ -105,7 +104,7 @@ struct Options<'a> { options: &'a StartupMessageParams, } -impl<'a> serde::Serialize for Options<'a> { +impl serde::Serialize for Options<'_> { fn serialize(&self, s: S) -> Result where S: serde::Serializer, @@ -407,26 +406,26 @@ async fn upload_parquet( #[cfg(test)] mod tests { - use std::{net::Ipv4Addr, num::NonZeroUsize, sync::Arc}; + use std::net::Ipv4Addr; + use std::num::NonZeroUsize; + use std::sync::Arc; use camino::Utf8Path; use clap::Parser; use futures::{Stream, StreamExt}; use itertools::Itertools; - use parquet::{ - basic::{Compression, ZstdLevel}, - file::{ - properties::{WriterProperties, DEFAULT_PAGE_SIZE}, - reader::FileReader, - serialized_reader::SerializedFileReader, - }, - }; - use rand::{rngs::StdRng, Rng, SeedableRng}; + use parquet::basic::{Compression, ZstdLevel}; + use parquet::file::properties::{WriterProperties, DEFAULT_PAGE_SIZE}; + use parquet::file::reader::FileReader; + use parquet::file::serialized_reader::SerializedFileReader; + use rand::rngs::StdRng; + use rand::{Rng, SeedableRng}; use remote_storage::{ GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, }; - use tokio::{sync::mpsc, time}; + use tokio::sync::mpsc; + use tokio::time; use walkdir::WalkDir; use super::{worker_inner, ParquetConfig, ParquetUploadArgs, 
RequestData}; diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index 960bb5bc21..dae23f7c53 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -1,9 +1,9 @@ -use measured::FixedCardinalityLabel; -use serde::{Deserialize, Serialize}; use std::fmt::{self, Display}; -use crate::auth::IpPattern; +use measured::FixedCardinalityLabel; +use serde::{Deserialize, Serialize}; +use crate::auth::IpPattern; use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; use crate::proxy::retry::CouldRetry; @@ -362,9 +362,10 @@ pub struct JwksSettings { #[cfg(test)] mod tests { - use super::*; use serde_json::json; + use super::*; + fn dummy_aux() -> serde_json::Value { json!({ "endpoint_id": "endpoint", diff --git a/proxy/src/control_plane/mgmt.rs b/proxy/src/control_plane/mgmt.rs index 2c4b5a9b94..5ac3acd28a 100644 --- a/proxy/src/control_plane/mgmt.rs +++ b/proxy/src/control_plane/mgmt.rs @@ -1,16 +1,16 @@ -use crate::{ - control_plane::messages::{DatabaseInfo, KickSession}, - waiters::{self, Waiter, Waiters}, -}; +use std::convert::Infallible; + use anyhow::Context; use once_cell::sync::Lazy; use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError}; use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; -use std::convert::Infallible; use tokio::net::{TcpListener, TcpStream}; use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, Instrument}; +use crate::control_plane::messages::{DatabaseInfo, KickSession}; +use crate::waiters::{self, Waiter, Waiters}; + static CPLANE_WAITERS: Lazy> = Lazy::new(Default::default); /// Give caller an opportunity to wait for the cloud's reply. diff --git a/proxy/src/control_plane/provider/mock.rs b/proxy/src/control_plane/provider/mock.rs index ea2eb79e2a..fb061376e7 100644 --- a/proxy/src/control_plane/provider/mock.rs +++ b/proxy/src/control_plane/provider/mock.rs @@ -1,27 +1,29 @@ //! 
Mock console backend which relies on a user-provided postgres instance. -use super::{ - errors::{ApiError, GetAuthInfoError, WakeComputeError}, - AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo, -}; -use crate::{ - auth::backend::jwt::AuthRule, context::RequestMonitoring, intern::RoleNameInt, RoleName, -}; -use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl}; -use crate::{auth::IpPattern, cache::Cached}; -use crate::{ - control_plane::{ - messages::MetricsAuxInfo, - provider::{CachedAllowedIps, CachedRoleSecret}, - }, - BranchId, EndpointId, ProjectId, -}; +use std::str::FromStr; +use std::sync::Arc; + use futures::TryFutureExt; -use std::{str::FromStr, sync::Arc}; use thiserror::Error; -use tokio_postgres::{config::SslMode, Client}; +use tokio_postgres::config::SslMode; +use tokio_postgres::Client; use tracing::{error, info, info_span, warn, Instrument}; +use super::errors::{ApiError, GetAuthInfoError, WakeComputeError}; +use super::{AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo}; +use crate::auth::backend::jwt::AuthRule; +use crate::auth::backend::ComputeUserInfo; +use crate::auth::IpPattern; +use crate::cache::Cached; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::GetEndpointJwksError; +use crate::control_plane::messages::MetricsAuxInfo; +use crate::control_plane::provider::{CachedAllowedIps, CachedRoleSecret}; +use crate::error::io_error; +use crate::intern::RoleNameInt; +use crate::url::ApiUrl; +use crate::{compute, scram, BranchId, EndpointId, ProjectId, RoleName}; + #[derive(Debug, Error)] enum MockApiError { #[error("Failed to read password: {0}")] @@ -120,7 +122,10 @@ impl Api { }) } - async fn do_get_endpoint_jwks(&self, endpoint: EndpointId) -> anyhow::Result> { + async fn do_get_endpoint_jwks( + &self, + endpoint: EndpointId, + ) -> Result, GetEndpointJwksError> { let (client, connection) = tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?; @@ -224,7 +229,7 @@ 
impl super::Api for Api { &self, _ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, GetEndpointJwksError> { self.do_get_endpoint_jwks(endpoint).await } diff --git a/proxy/src/control_plane/provider/mod.rs b/proxy/src/control_plane/provider/mod.rs index 6cc525a324..a4a330cd5f 100644 --- a/proxy/src/control_plane/provider/mod.rs +++ b/proxy/src/control_plane/provider/mod.rs @@ -2,39 +2,36 @@ pub mod mock; pub mod neon; -use super::messages::{ControlPlaneError, MetricsAuxInfo}; -use crate::{ - auth::{ - backend::{ - jwt::{AuthRule, FetchAuthRules}, - ComputeCredentialKeys, ComputeUserInfo, - }, - IpPattern, - }, - cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru}, - compute, - config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}, - context::RequestMonitoring, - error::ReportableError, - intern::ProjectIdInt, - metrics::ApiLockMetrics, - rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token}, - scram, EndpointCacheKey, EndpointId, -}; +use std::hash::Hash; +use std::sync::Arc; +use std::time::Duration; + use dashmap::DashMap; -use std::{hash::Hash, sync::Arc, time::Duration}; use tokio::time::Instant; use tracing::info; +use super::messages::{ControlPlaneError, MetricsAuxInfo}; +use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}; +use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; +use crate::auth::IpPattern; +use crate::cache::endpoints::EndpointsCache; +use crate::cache::project_info::ProjectInfoCacheImpl; +use crate::cache::{Cached, TimedLru}; +use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}; +use crate::context::RequestMonitoring; +use crate::error::ReportableError; +use crate::intern::ProjectIdInt; +use crate::metrics::ApiLockMetrics; +use crate::rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token}; +use crate::{compute, scram, EndpointCacheKey, EndpointId}; + pub(crate) mod 
errors { - use crate::{ - control_plane::messages::{self, ControlPlaneError, Reason}, - error::{io_error, ErrorKind, ReportableError, UserFacingError}, - proxy::retry::CouldRetry, - }; use thiserror::Error; use super::ApiLockError; + use crate::control_plane::messages::{self, ControlPlaneError, Reason}; + use crate::error::{io_error, ErrorKind, ReportableError, UserFacingError}; + use crate::proxy::retry::CouldRetry; /// A go-to error message which doesn't leak any detail. pub(crate) const REQUEST_FAILED: &str = "Console request failed"; @@ -44,7 +41,7 @@ pub(crate) mod errors { pub(crate) enum ApiError { /// Error returned by the console itself. #[error("{REQUEST_FAILED} with {0}")] - ControlPlane(ControlPlaneError), + ControlPlane(Box), /// Various IO errors like broken pipe or malformed payload. #[error("{REQUEST_FAILED}: {0}")] @@ -90,7 +87,7 @@ pub(crate) mod errors { Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane, Reason::LockAlreadyTaken => ErrorKind::ControlPlane, Reason::RunningOperations => ErrorKind::ControlPlane, - Reason::Unknown => match &e { + Reason::Unknown => match &**e { ControlPlaneError { http_status_code: http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE, @@ -246,6 +243,33 @@ pub(crate) mod errors { } } } + + #[derive(Debug, Error)] + pub enum GetEndpointJwksError { + #[error("endpoint not found")] + EndpointNotFound, + + #[error("failed to build control plane request: {0}")] + RequestBuild(#[source] reqwest::Error), + + #[error("failed to send control plane request: {0}")] + RequestExecute(#[source] reqwest_middleware::Error), + + #[error(transparent)] + ControlPlane(#[from] ApiError), + + #[cfg(any(test, feature = "testing"))] + #[error(transparent)] + TokioPostgres(#[from] tokio_postgres::Error), + + #[cfg(any(test, feature = "testing"))] + #[error(transparent)] + ParseUrl(#[from] url::ParseError), + + #[cfg(any(test, feature = "testing"))] + #[error(transparent)] + TaskJoin(#[from] tokio::task::JoinError), + } } 
/// Auth secret which is managed by the cloud. @@ -342,7 +366,7 @@ pub(crate) trait Api { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result>; + ) -> Result, errors::GetEndpointJwksError>; /// Wake up the compute node and return the corresponding connection info. async fn wake_compute( @@ -401,7 +425,7 @@ impl Api for ControlPlaneBackend { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, errors::GetEndpointJwksError> { match self { Self::Management(api) => api.get_endpoint_jwks(ctx, endpoint).await, #[cfg(any(test, feature = "testing"))] @@ -583,7 +607,9 @@ impl FetchAuthRules for ControlPlaneBackend { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result> { - self.get_endpoint_jwks(ctx, endpoint).await + ) -> Result, FetchAuthRulesError> { + self.get_endpoint_jwks(ctx, endpoint) + .await + .map_err(FetchAuthRulesError::GetEndpointJwks) } } diff --git a/proxy/src/control_plane/provider/neon.rs b/proxy/src/control_plane/provider/neon.rs index d01878741c..5d0692c7ca 100644 --- a/proxy/src/control_plane/provider/neon.rs +++ b/proxy/src/control_plane/provider/neon.rs @@ -1,29 +1,31 @@ //! Production console backend. 
-use super::{ - super::messages::{ControlPlaneError, GetRoleSecret, WakeCompute}, - errors::{ApiError, GetAuthInfoError, WakeComputeError}, - ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, - NodeInfo, -}; -use crate::{ - auth::backend::{jwt::AuthRule, ComputeUserInfo}, - compute, - control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason}, - http, - metrics::{CacheOutcome, Metrics}, - rate_limiter::WakeComputeRateLimiter, - scram, EndpointCacheKey, EndpointId, -}; -use crate::{cache::Cached, context::RequestMonitoring}; -use ::http::{header::AUTHORIZATION, HeaderName}; -use anyhow::bail; +use std::sync::Arc; +use std::time::Duration; + +use ::http::header::AUTHORIZATION; +use ::http::HeaderName; use futures::TryFutureExt; -use std::{sync::Arc, time::Duration}; use tokio::time::Instant; use tokio_postgres::config::SslMode; use tracing::{debug, info, info_span, warn, Instrument}; +use super::super::messages::{ControlPlaneError, GetRoleSecret, WakeCompute}; +use super::errors::{ApiError, GetAuthInfoError, WakeComputeError}; +use super::{ + ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, + NodeInfo, +}; +use crate::auth::backend::jwt::AuthRule; +use crate::auth::backend::ComputeUserInfo; +use crate::cache::Cached; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::GetEndpointJwksError; +use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason}; +use crate::metrics::{CacheOutcome, Metrics}; +use crate::rate_limiter::WakeComputeRateLimiter; +use crate::{compute, http, scram, EndpointCacheKey, EndpointId}; + const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id"); #[derive(Clone)] @@ -137,14 +139,14 @@ impl Api { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, GetEndpointJwksError> { if !self .caches .endpoints_cache .is_valid(ctx, &endpoint.normalize()) 
.await { - bail!("endpoint not found"); + return Err(GetEndpointJwksError::EndpointNotFound); } let request_id = ctx.session_id().to_string(); async { @@ -159,12 +161,17 @@ impl Api { .header(X_REQUEST_ID, &request_id) .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) .query(&[("session_id", ctx.session_id())]) - .build()?; + .build() + .map_err(GetEndpointJwksError::RequestBuild)?; info!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); - let response = self.endpoint.execute(request).await?; + let response = self + .endpoint + .execute(request) + .await + .map_err(GetEndpointJwksError::RequestExecute)?; drop(pause); info!(duration = ?start.elapsed(), "received http response"); @@ -330,7 +337,7 @@ impl super::Api for Api { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, GetEndpointJwksError> { self.do_get_endpoint_jwks(ctx, endpoint).await } @@ -348,7 +355,7 @@ impl super::Api for Api { let (cached, info) = cached.take_value(); let info = info.map_err(|c| { info!(key = &*key, "found cached wake_compute error"); - WakeComputeError::ApiError(ApiError::ControlPlane(*c)) + WakeComputeError::ApiError(ApiError::ControlPlane(Box::new(*c))) })?; debug!(key = &*key, "found cached compute node info"); @@ -418,7 +425,7 @@ impl super::Api for Api { self.caches.node_info.insert_ttl( key, - Err(Box::new(err.clone())), + Err(err.clone()), Duration::from_secs(30), ); @@ -457,7 +464,7 @@ async fn parse_body serde::Deserialize<'a>>( body.http_status_code = status; warn!("console responded with an error ({status}): {body:?}"); - Err(ApiError::ControlPlane(body)) + Err(ApiError::ControlPlane(Box::new(body))) } fn parse_host_port(input: &str) -> Option<(&str, u16)> { diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 1cd4dc2c22..e71ed0c048 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -1,4 +1,5 @@ -use 
std::{error::Error as StdError, fmt, io}; +use std::error::Error as StdError; +use std::{fmt, io}; use measured::FixedCardinalityLabel; diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index d0352351d5..978ad9f761 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -1,19 +1,18 @@ +use std::convert::Infallible; +use std::net::TcpListener; +use std::sync::{Arc, Mutex}; + use anyhow::{anyhow, bail}; -use hyper0::{header::CONTENT_TYPE, Body, Request, Response, StatusCode}; -use measured::{text::BufferedTextEncoder, MetricGroup}; +use hyper0::header::CONTENT_TYPE; +use hyper0::{Body, Request, Response, StatusCode}; +use measured::text::BufferedTextEncoder; +use measured::MetricGroup; use metrics::NeonMetrics; -use std::{ - convert::Infallible, - net::TcpListener, - sync::{Arc, Mutex}, -}; use tracing::{info, info_span}; -use utils::http::{ - endpoint::{self, request_span}, - error::ApiError, - json::json_response, - RouterBuilder, RouterService, -}; +use utils::http::endpoint::{self, request_span}; +use utils::http::error::ApiError; +use utils::http::json::json_response; +use utils::http::{RouterBuilder, RouterService}; use crate::jemalloc; diff --git a/proxy/src/http/mod.rs b/proxy/src/http/mod.rs index d8676d5b50..f1b632e704 100644 --- a/proxy/src/http/mod.rs +++ b/proxy/src/http/mod.rs @@ -8,19 +8,18 @@ use std::time::Duration; use anyhow::bail; use bytes::Bytes; +use http::Method; use http_body_util::BodyExt; use hyper::body::Body; +pub(crate) use reqwest::{Request, Response}; +use reqwest_middleware::RequestBuilder; +pub(crate) use reqwest_middleware::{ClientWithMiddleware, Error}; +pub(crate) use reqwest_retry::policies::ExponentialBackoff; +pub(crate) use reqwest_retry::RetryTransientMiddleware; use serde::de::DeserializeOwned; -pub(crate) use reqwest::{Request, Response}; -pub(crate) use reqwest_middleware::{ClientWithMiddleware, Error}; -pub(crate) use reqwest_retry::{policies::ExponentialBackoff, 
RetryTransientMiddleware}; - -use crate::{ - metrics::{ConsoleRequest, Metrics}, - url::ApiUrl, -}; -use reqwest_middleware::RequestBuilder; +use crate::metrics::{ConsoleRequest, Metrics}; +use crate::url::ApiUrl; /// This is the preferred way to create new http clients, /// because it takes care of observability (OpenTelemetry). @@ -95,9 +94,19 @@ impl Endpoint { /// Return a [builder](RequestBuilder) for a `GET` request, /// accepting a closure to modify the url path segments for more complex paths queries. pub(crate) fn get_with_url(&self, f: impl for<'a> FnOnce(&'a mut ApiUrl)) -> RequestBuilder { + self.request_with_url(Method::GET, f) + } + + /// Return a [builder](RequestBuilder) for a request, + /// accepting a closure to modify the url path segments for more complex paths queries. + pub(crate) fn request_with_url( + &self, + method: Method, + f: impl for<'a> FnOnce(&'a mut ApiUrl), + ) -> RequestBuilder { let mut url = self.endpoint.clone(); f(&mut url); - self.client.get(url.into_inner()) + self.client.request(method, url.into_inner()) } /// Execute a [request](reqwest::Request). 
@@ -142,9 +151,10 @@ pub(crate) async fn parse_json_body_with_limit( #[cfg(test)] mod tests { - use super::*; use reqwest::Client; + use super::*; + #[test] fn optional_query_params() -> anyhow::Result<()> { let url = "http://example.com".parse()?; diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index 108420d7d7..49aab917e4 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -1,6 +1,8 @@ -use std::{ - hash::BuildHasherDefault, marker::PhantomData, num::NonZeroUsize, ops::Index, sync::OnceLock, -}; +use std::hash::BuildHasherDefault; +use std::marker::PhantomData; +use std::num::NonZeroUsize; +use std::ops::Index; +use std::sync::OnceLock; use lasso::{Capacity, MemoryLimits, Spur, ThreadedRodeo}; use rustc_hash::FxHasher; @@ -53,7 +55,7 @@ impl std::ops::Deref for InternedString { impl<'de, Id: InternId> serde::de::Deserialize<'de> for InternedString { fn deserialize>(d: D) -> Result { struct Visitor(PhantomData); - impl<'de, Id: InternId> serde::de::Visitor<'de> for Visitor { + impl serde::de::Visitor<'_> for Visitor { type Value = InternedString; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { @@ -208,9 +210,8 @@ impl From for ProjectIdInt { mod tests { use std::sync::OnceLock; - use crate::intern::StringInterner; - use super::InternId; + use crate::intern::StringInterner; struct MyId; impl InternId for MyId { @@ -222,7 +223,8 @@ mod tests { #[test] fn push_many_strings() { - use rand::{rngs::StdRng, Rng, SeedableRng}; + use rand::rngs::StdRng; + use rand::{Rng, SeedableRng}; use rand_distr::Zipf; let endpoint_dist = Zipf::new(500000, 0.8).unwrap(); diff --git a/proxy/src/jemalloc.rs b/proxy/src/jemalloc.rs index d307d80f4a..0fae78b60c 100644 --- a/proxy/src/jemalloc.rs +++ b/proxy/src/jemalloc.rs @@ -1,14 +1,12 @@ use std::marker::PhantomData; -use measured::{ - label::NoLabels, - metric::{ - gauge::GaugeState, group::Encoding, name::MetricNameEncoder, MetricEncoding, - MetricFamilyEncoding, MetricType, - }, - 
text::TextEncoder, - LabelGroup, MetricGroup, -}; +use measured::label::NoLabels; +use measured::metric::gauge::GaugeState; +use measured::metric::group::Encoding; +use measured::metric::name::MetricNameEncoder; +use measured::metric::{MetricEncoding, MetricFamilyEncoding, MetricType}; +use measured::text::TextEncoder; +use measured::{LabelGroup, MetricGroup}; use tikv_jemalloc_ctl::{config, epoch, epoch_mib, stats, version}; pub struct MetricRecorder { diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 8d274baa10..ea17a88067 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -76,11 +76,7 @@ ) )] // List of temporarily allowed lints to unblock beta/nightly. -#![allow( - unknown_lints, - // TODO: 1.82: Add `use` where necessary and remove from this list. - impl_trait_overcaptures, -)] +#![allow(unknown_lints)] use std::convert::Infallible; @@ -94,7 +90,9 @@ pub mod auth; pub mod cache; pub mod cancellation; pub mod compute; +pub mod compute_ctl; pub mod config; +pub mod console_redirect_proxy; pub mod context; pub mod control_plane; pub mod error; diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index a34eb820f8..11921867e4 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -1,14 +1,10 @@ use tracing::Subscriber; -use tracing_subscriber::{ - filter::{EnvFilter, LevelFilter}, - fmt::{ - format::{Format, Full}, - time::SystemTime, - FormatEvent, FormatFields, - }, - prelude::*, - registry::LookupSpan, -}; +use tracing_subscriber::filter::{EnvFilter, LevelFilter}; +use tracing_subscriber::fmt::format::{Format, Full}; +use tracing_subscriber::fmt::time::SystemTime; +use tracing_subscriber::fmt::{FormatEvent, FormatFields}; +use tracing_subscriber::prelude::*; +use tracing_subscriber::registry::LookupSpan; /// Initialize logging and OpenTelemetry tracing and exporter. 
/// diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 272723a1bc..542826e833 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -1,14 +1,16 @@ use std::sync::{Arc, OnceLock}; use lasso::ThreadedRodeo; +use measured::label::{ + FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet, +}; +use measured::metric::histogram::Thresholds; +use measured::metric::name::MetricName; use measured::{ - label::{FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet}, - metric::{histogram::Thresholds, name::MetricName}, Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup, MetricGroup, }; use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; - use tokio::time::{self, Instant}; use crate::control_plane::messages::ColdStartInfo; diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 17764f78d1..ef2391cdd8 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -1,11 +1,9 @@ //! 
Proxy Protocol V2 implementation -use std::{ - io, - net::SocketAddr, - pin::Pin, - task::{Context, Poll}, -}; +use std::io; +use std::net::SocketAddr; +use std::pin::Pin; +use std::task::{Context, Poll}; use bytes::BytesMut; use pin_project_lite::pin_project; diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index aac7720890..8e9663626a 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -1,24 +1,23 @@ -use crate::{ - auth::backend::ComputeCredentialKeys, - compute::COULD_NOT_CONNECT, - compute::{self, PostgresConnection}, - config::RetryConfig, - context::RequestMonitoring, - control_plane::{self, errors::WakeComputeError, locks::ApiLocks, CachedNodeInfo, NodeInfo}, - error::ReportableError, - metrics::{ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType}, - proxy::{ - retry::{retry_after, should_retry, CouldRetry}, - wake_compute::wake_compute, - }, - Host, -}; use async_trait::async_trait; use pq_proto::StartupMessageParams; use tokio::time; use tracing::{debug, info, warn}; use super::retry::ShouldRetryWakeCompute; +use crate::auth::backend::ComputeCredentialKeys; +use crate::compute::{self, PostgresConnection, COULD_NOT_CONNECT}; +use crate::config::RetryConfig; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::WakeComputeError; +use crate::control_plane::locks::ApiLocks; +use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; +use crate::error::ReportableError; +use crate::metrics::{ + ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType, +}; +use crate::proxy::retry::{retry_after, should_retry, CouldRetry}; +use crate::proxy::wake_compute::wake_compute; +use crate::Host; const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 4ebda013ac..91a3ceff75 100644 --- 
a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -1,11 +1,11 @@ -use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; -use tracing::info; - use std::future::poll_fn; use std::io; use std::pin::Pin; use std::task::{ready, Context, Poll}; +use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; +use tracing::info; + #[derive(Debug)] enum TransferState { Running(CopyBuffer), @@ -256,9 +256,10 @@ impl CopyBuffer { #[cfg(test)] mod tests { - use super::*; use tokio::io::AsyncWriteExt; + use super::*; + #[tokio::test] async fn test_client_to_compute() { let (mut client_client, mut client_proxy) = tokio::io::duplex(8); // Create a mock duplex stream diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index 5996b11c11..a67f1b8112 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -1,21 +1,19 @@ use bytes::Buf; +use pq_proto::framed::Framed; use pq_proto::{ - framed::Framed, BeMessage as Be, CancelKeyData, FeStartupPacket, ProtocolVersion, - StartupMessageParams, + BeMessage as Be, CancelKeyData, FeStartupPacket, ProtocolVersion, StartupMessageParams, }; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; -use crate::{ - auth::endpoint_sni, - config::{TlsConfig, PG_ALPN_PROTOCOL}, - context::RequestMonitoring, - error::ReportableError, - metrics::Metrics, - proxy::ERR_INSECURE_CONNECTION, - stream::{PqStream, Stream, StreamUpgradeError}, -}; +use crate::auth::endpoint_sni; +use crate::config::{TlsConfig, PG_ALPN_PROTOCOL}; +use crate::context::RequestMonitoring; +use crate::error::ReportableError; +use crate::metrics::Metrics; +use crate::proxy::ERR_INSECURE_CONNECTION; +use crate::stream::{PqStream, Stream, StreamUpgradeError}; #[derive(Error, Debug)] pub(crate) enum HandshakeError { diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 3a43ccb74a..f646862caa 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -7,40 +7,32 @@ 
pub(crate) mod handshake; pub(crate) mod passthrough; pub(crate) mod retry; pub(crate) mod wake_compute; -pub use copy_bidirectional::copy_bidirectional_client_compute; -pub use copy_bidirectional::ErrorSource; +use std::sync::Arc; -use crate::config::ProxyProtocolV2; -use crate::{ - auth, - cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal}, - compute, - config::{ProxyConfig, TlsConfig}, - context::RequestMonitoring, - error::ReportableError, - metrics::{Metrics, NumClientConnectionsGuard}, - protocol2::read_proxy_protocol, - proxy::handshake::{handshake, HandshakeData}, - rate_limiter::EndpointRateLimiter, - stream::{PqStream, Stream}, - EndpointCacheKey, -}; +pub use copy_bidirectional::{copy_bidirectional_client_compute, ErrorSource}; use futures::TryFutureExt; use itertools::Itertools; use once_cell::sync::OnceCell; use pq_proto::{BeMessage as Be, StartupMessageParams}; use regex::Regex; use smol_str::{format_smolstr, SmolStr}; -use std::sync::Arc; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; use tracing::{error, info, warn, Instrument}; -use self::{ - connect_compute::{connect_to_compute, TcpMechanism}, - passthrough::ProxyPassthrough, -}; +use self::connect_compute::{connect_to_compute, TcpMechanism}; +use self::passthrough::ProxyPassthrough; +use crate::cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal}; +use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig}; +use crate::context::RequestMonitoring; +use crate::error::ReportableError; +use crate::metrics::{Metrics, NumClientConnectionsGuard}; +use crate::protocol2::read_proxy_protocol; +use crate::proxy::handshake::{handshake, HandshakeData}; +use crate::rate_limiter::EndpointRateLimiter; +use crate::stream::{PqStream, Stream}; +use crate::{auth, compute, EndpointCacheKey}; const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; @@ -61,7 
+53,7 @@ pub async fn run_until_cancelled( pub async fn task_main( config: &'static ProxyConfig, - auth_backend: &'static auth::Backend<'static, (), ()>, + auth_backend: &'static auth::Backend<'static, ()>, listener: tokio::net::TcpListener, cancellation_token: CancellationToken, cancellation_handler: Arc, @@ -248,7 +240,7 @@ impl ReportableError for ClientRequestError { #[allow(clippy::too_many_arguments)] pub(crate) async fn handle_client( config: &'static ProxyConfig, - auth_backend: &'static auth::Backend<'static, (), ()>, + auth_backend: &'static auth::Backend<'static, ()>, ctx: &RequestMonitoring, cancellation_handler: Arc, stream: S, @@ -356,7 +348,7 @@ pub(crate) async fn handle_client( /// Finish client connection initialization: confirm auth success, send params, etc. #[tracing::instrument(skip_all)] -async fn prepare_client_connection

( +pub(crate) async fn prepare_client_connection

( node: &compute::PostgresConnection, session: &cancellation::Session

, stream: &mut PqStream, diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index 497cf4bfd5..e3b4730982 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -1,16 +1,14 @@ -use crate::{ - cancellation, - compute::PostgresConnection, - control_plane::messages::MetricsAuxInfo, - metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}, - stream::Stream, - usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}, -}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; use utils::measured_stream::MeasuredStream; use super::copy_bidirectional::ErrorSource; +use crate::cancellation; +use crate::compute::PostgresConnection; +use crate::control_plane::messages::MetricsAuxInfo; +use crate::metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}; +use crate::stream::Stream; +use crate::usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}; /// Forward bytes in both directions (client <-> compute). 
#[tracing::instrument(skip_all)] diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index 15895d37e6..d3f0c3e7d4 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -1,7 +1,11 @@ -use crate::{compute, config::RetryConfig}; -use std::{error::Error, io}; +use std::error::Error; +use std::io; + use tokio::time; +use crate::compute; +use crate::config::RetryConfig; + pub(crate) trait CouldRetry { /// Returns true if the error could be retried fn could_retry(&self) -> bool; diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index 33a2162bc7..df9f79a7e3 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -6,7 +6,6 @@ use std::fmt::Debug; -use super::*; use bytes::{Bytes, BytesMut}; use futures::{SinkExt, StreamExt}; use postgres_protocol::message::frontend; @@ -14,6 +13,8 @@ use tokio::io::{AsyncReadExt, DuplexStream}; use tokio_postgres::tls::TlsConnect; use tokio_util::codec::{Decoder, Encoder}; +use super::*; + enum Intercept { None, Methods, diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 3861ddc8ed..3f54b0661b 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -4,6 +4,17 @@ mod mitm; use std::time::Duration; +use anyhow::{bail, Context}; +use async_trait::async_trait; +use http::StatusCode; +use retry::{retry_after, ShouldRetryWakeCompute}; +use rstest::rstest; +use rustls::crypto::aws_lc_rs; +use rustls::pki_types; +use tokio_postgres::config::SslMode; +use tokio_postgres::tls::{MakeTlsConnect, NoTls}; +use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream}; + use super::connect_compute::ConnectMechanism; use super::retry::CouldRetry; use super::*; @@ -18,15 +29,6 @@ use crate::control_plane::provider::{ use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; use crate::error::ErrorKind; use crate::{sasl, scram, BranchId, EndpointId, ProjectId}; -use anyhow::{bail, Context}; -use async_trait::async_trait; -use 
http::StatusCode; -use retry::{retry_after, ShouldRetryWakeCompute}; -use rstest::rstest; -use rustls::pki_types; -use tokio_postgres::config::SslMode; -use tokio_postgres::tls::{MakeTlsConnect, NoTls}; -use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream}; /// Generate a set of TLS certificates: CA + server. fn generate_certs( @@ -37,25 +39,27 @@ fn generate_certs( pki_types::CertificateDer<'static>, pki_types::PrivateKeyDer<'static>, )> { - let ca = rcgen::Certificate::from_params({ + let ca_key = rcgen::KeyPair::generate()?; + let ca = { let mut params = rcgen::CertificateParams::default(); params.is_ca = rcgen::IsCa::Ca(rcgen::BasicConstraints::Unconstrained); - params - })?; + params.self_signed(&ca_key)? + }; - let cert = rcgen::Certificate::from_params({ - let mut params = rcgen::CertificateParams::new(vec![hostname.into()]); + let cert_key = rcgen::KeyPair::generate()?; + let cert = { + let mut params = rcgen::CertificateParams::new(vec![hostname.into()])?; params.distinguished_name = rcgen::DistinguishedName::new(); params .distinguished_name .push(rcgen::DnType::CommonName, common_name); - params - })?; + params.signed_by(&cert_key, &ca, &ca_key)? 
+ }; Ok(( - pki_types::CertificateDer::from(ca.serialize_der()?), - pki_types::CertificateDer::from(cert.serialize_der_with_signer(&ca)?), - pki_types::PrivateKeyDer::Pkcs8(cert.serialize_private_key_der().into()), + ca.der().clone(), + cert.der().clone(), + pki_types::PrivateKeyDer::Pkcs8(cert_key.serialize_der().into()), )) } @@ -69,11 +73,11 @@ impl ClientConfig<'_> { self, ) -> anyhow::Result< impl tokio_postgres::tls::TlsConnect< - S, - Error = impl std::fmt::Debug, - Future = impl Send, - Stream = RustlsStream, - >, + S, + Error = impl std::fmt::Debug + use, + Future = impl Send + use, + Stream = RustlsStream, + > + use, > { let mut mk = MakeRustlsConnect::new(self.config); let tls = MakeTlsConnect::::make_tls_connect(&mut mk, self.hostname)?; @@ -89,10 +93,13 @@ fn generate_tls_config<'a>( let (ca, cert, key) = generate_certs(hostname, common_name)?; let tls_config = { - let config = rustls::ServerConfig::builder() - .with_no_client_auth() - .with_single_cert(vec![cert.clone()], key.clone_key())? - .into(); + let config = + rustls::ServerConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .context("aws_lc_rs should support the default protocol versions")? + .with_no_client_auth() + .with_single_cert(vec![cert.clone()], key.clone_key())? + .into(); let mut cert_resolver = CertResolver::new(); cert_resolver.add_cert(key, vec![cert], true)?; @@ -107,13 +114,16 @@ fn generate_tls_config<'a>( }; let client_config = { - let config = rustls::ClientConfig::builder() - .with_root_certificates({ - let mut store = rustls::RootCertStore::empty(); - store.add(ca)?; - store - }) - .with_no_client_auth(); + let config = + rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .context("aws_lc_rs should support the default protocol versions")? 
+ .with_root_certificates({ + let mut store = rustls::RootCertStore::empty(); + store.add(ca)?; + store + }) + .with_no_client_auth(); ClientConfig { config, hostname } }; @@ -336,7 +346,8 @@ async fn scram_auth_mock() -> anyhow::Result<()> { generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), Scram::mock())); - use rand::{distributions::Alphanumeric, Rng}; + use rand::distributions::Alphanumeric; + use rand::Rng; let password: String = rand::thread_rng() .sample_iter(&Alphanumeric) .take(rand::random::() as usize) @@ -492,30 +503,32 @@ impl TestBackend for TestConnectMechanism { match action { ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)), ConnectAction::WakeFail => { - let err = control_plane::errors::ApiError::ControlPlane(ControlPlaneError { - http_status_code: StatusCode::BAD_REQUEST, - error: "TEST".into(), - status: None, - }); + let err = + control_plane::errors::ApiError::ControlPlane(Box::new(ControlPlaneError { + http_status_code: StatusCode::BAD_REQUEST, + error: "TEST".into(), + status: None, + })); assert!(!err.could_retry()); Err(control_plane::errors::WakeComputeError::ApiError(err)) } ConnectAction::WakeRetry => { - let err = control_plane::errors::ApiError::ControlPlane(ControlPlaneError { - http_status_code: StatusCode::BAD_REQUEST, - error: "TEST".into(), - status: Some(Status { - code: "error".into(), - message: "error".into(), - details: Details { - error_info: None, - retry_info: Some(control_plane::messages::RetryInfo { - retry_delay_ms: 1, - }), - user_facing_message: None, - }, - }), - }); + let err = + control_plane::errors::ApiError::ControlPlane(Box::new(ControlPlaneError { + http_status_code: StatusCode::BAD_REQUEST, + error: "TEST".into(), + status: Some(Status { + code: "error".into(), + message: "error".into(), + details: Details { + error_info: None, + retry_info: Some(control_plane::messages::RetryInfo { + retry_delay_ms: 1, 
+ }), + user_facing_message: None, + }, + }), + })); assert!(err.could_retry()); Err(control_plane::errors::WakeComputeError::ApiError(err)) } @@ -552,7 +565,7 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn fn helper_create_connect_info( mechanism: &TestConnectMechanism, -) -> auth::Backend<'static, ComputeCredentials, &()> { +) -> auth::Backend<'static, ComputeCredentials> { let user_info = auth::Backend::ControlPlane( MaybeOwned::Owned(ControlPlaneBackend::Test(Box::new(mechanism.clone()))), ComputeCredentials { diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index ba674f5d0d..9dfa485fa4 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,16 +1,17 @@ +use hyper::StatusCode; +use tracing::{error, info, warn}; + +use super::connect_compute::ComputeConnectBackend; use crate::config::RetryConfig; use crate::context::RequestMonitoring; +use crate::control_plane::errors::WakeComputeError; use crate::control_plane::messages::{ControlPlaneError, Reason}; -use crate::control_plane::{errors::WakeComputeError, provider::CachedNodeInfo}; +use crate::control_plane::provider::CachedNodeInfo; use crate::metrics::{ ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType, WakeupFailureKind, }; use crate::proxy::retry::{retry_after, should_retry}; -use hyper::StatusCode; -use tracing::{error, info, warn}; - -use super::connect_compute::ComputeConnectBackend; pub(crate) async fn wake_compute( num_retries: &mut u32, @@ -79,7 +80,7 @@ fn report_error(e: &WakeComputeError, retry: bool) { Reason::ConcurrencyLimitReached => WakeupFailureKind::ApiConsoleLocked, Reason::LockAlreadyTaken => WakeupFailureKind::ApiConsoleLocked, Reason::RunningOperations => WakeupFailureKind::ApiConsoleLocked, - Reason::Unknown => match e { + Reason::Unknown => match **e { ControlPlaneError { http_status_code: StatusCode::LOCKED, ref error, diff --git 
a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs index bf4d85f2e4..45f9630dde 100644 --- a/proxy/src/rate_limiter/leaky_bucket.rs +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -1,7 +1,5 @@ -use std::{ - hash::Hash, - sync::atomic::{AtomicUsize, Ordering}, -}; +use std::hash::Hash; +use std::sync::atomic::{AtomicUsize, Ordering}; use ahash::RandomState; use dashmap::DashMap; diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs index 25607b7e10..16c398f303 100644 --- a/proxy/src/rate_limiter/limit_algorithm.rs +++ b/proxy/src/rate_limiter/limit_algorithm.rs @@ -1,10 +1,12 @@ //! Algorithms for controlling concurrency limits. +use std::pin::pin; +use std::sync::Arc; +use std::time::Duration; + use parking_lot::Mutex; -use std::{pin::pin, sync::Arc, time::Duration}; -use tokio::{ - sync::Notify, - time::{error::Elapsed, Instant}, -}; +use tokio::sync::Notify; +use tokio::time::error::Elapsed; +use tokio::time::Instant; use self::aimd::Aimd; diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs index 86b56e38fb..5332a5184f 100644 --- a/proxy/src/rate_limiter/limit_algorithm/aimd.rs +++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs @@ -60,12 +60,11 @@ impl LimitAlgorithm for Aimd { mod tests { use std::time::Duration; + use super::*; use crate::rate_limiter::limit_algorithm::{ DynamicLimiter, RateLimitAlgorithm, RateLimiterConfig, }; - use super::*; - #[tokio::test(start_paused = true)] async fn increase_decrease() { let config = RateLimiterConfig { diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index be529f174d..5de64c2254 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -1,17 +1,14 @@ -use std::{ - borrow::Cow, - collections::hash_map::RandomState, - hash::{BuildHasher, Hash}, - sync::{ - atomic::{AtomicUsize, Ordering}, - Mutex, - }, -}; +use 
std::borrow::Cow; +use std::collections::hash_map::RandomState; +use std::hash::{BuildHasher, Hash}; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Mutex; use anyhow::bail; use dashmap::DashMap; use itertools::Itertools; -use rand::{rngs::StdRng, Rng, SeedableRng}; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; use tokio::time::{Duration, Instant}; use tracing::info; @@ -243,14 +240,17 @@ impl BucketRateLimiter { #[cfg(test)] mod tests { - use std::{hash::BuildHasherDefault, time::Duration}; + use std::hash::BuildHasherDefault; + use std::time::Duration; use rand::SeedableRng; use rustc_hash::FxHasher; use tokio::time; use super::{BucketRateLimiter, WakeComputeRateLimiter}; - use crate::{intern::EndpointIdInt, rate_limiter::RateBucketInfo, EndpointId}; + use crate::intern::EndpointIdInt; + use crate::rate_limiter::RateBucketInfo; + use crate::EndpointId; #[test] fn rate_bucket_rpi() { diff --git a/proxy/src/rate_limiter/mod.rs b/proxy/src/rate_limiter/mod.rs index 6e38f89458..3ae2ecaf8f 100644 --- a/proxy/src/rate_limiter/mod.rs +++ b/proxy/src/rate_limiter/mod.rs @@ -2,13 +2,11 @@ mod leaky_bucket; mod limit_algorithm; mod limiter; +pub use leaky_bucket::{EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter}; #[cfg(test)] pub(crate) use limit_algorithm::aimd::Aimd; - pub(crate) use limit_algorithm::{ DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token, }; pub(crate) use limiter::GlobalRateLimiter; - -pub use leaky_bucket::{EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter}; pub use limiter::{BucketRateLimiter, RateBucketInfo, WakeComputeRateLimiter}; diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 95bdfc0965..0000246971 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -5,13 +5,10 @@ use redis::AsyncCommands; use tokio::sync::Mutex; use uuid::Uuid; +use 
super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; +use super::notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME}; use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; -use super::{ - connection_with_credentials_provider::ConnectionWithCredentialsProvider, - notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME}, -}; - pub trait CancellationPublisherMut: Send + Sync + 'static { #[allow(async_fn_in_trait)] async fn try_publish( diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index ccd48f1481..82139ea1d5 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -1,10 +1,9 @@ -use std::{sync::Arc, time::Duration}; +use std::sync::Arc; +use std::time::Duration; use futures::FutureExt; -use redis::{ - aio::{ConnectionLike, MultiplexedConnection}, - ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult, -}; +use redis::aio::{ConnectionLike, MultiplexedConnection}; +use redis::{ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult}; use tokio::task::JoinHandle; use tracing::{debug, error, info, warn}; diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index c3af6740cb..e56c5a3414 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -1,4 +1,5 @@ -use std::{convert::Infallible, sync::Arc}; +use std::convert::Infallible; +use std::sync::Arc; use futures::StreamExt; use pq_proto::CancelKeyData; @@ -8,12 +9,10 @@ use tokio_util::sync::CancellationToken; use uuid::Uuid; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; -use crate::{ - cache::project_info::ProjectInfoCache, - cancellation::{CancelMap, CancellationHandler}, - intern::{ProjectIdInt, RoleNameInt}, - metrics::{Metrics, RedisErrors, RedisEventsCount}, -}; +use 
crate::cache::project_info::ProjectInfoCache; +use crate::cancellation::{CancelMap, CancellationHandler}; +use crate::intern::{ProjectIdInt, RoleNameInt}; +use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates"; @@ -269,10 +268,10 @@ where #[cfg(test)] mod tests { - use crate::{ProjectId, RoleName}; + use serde_json::json; use super::*; - use serde_json::json; + use crate::{ProjectId, RoleName}; #[test] fn parse_allowed_ips() -> anyhow::Result<()> { diff --git a/proxy/src/sasl/messages.rs b/proxy/src/sasl/messages.rs index 6c9a42b2db..1373dfba3d 100644 --- a/proxy/src/sasl/messages.rs +++ b/proxy/src/sasl/messages.rs @@ -1,8 +1,9 @@ //! Definitions for SASL messages. -use crate::parse::{split_at_const, split_cstr}; use pq_proto::{BeAuthenticationSaslMessage, BeMessage}; +use crate::parse::{split_at_const, split_cstr}; + /// SASL-specific payload of [`PasswordMessage`](pq_proto::FeMessage::PasswordMessage). #[derive(Debug)] pub(crate) struct FirstMessage<'a> { diff --git a/proxy/src/sasl/mod.rs b/proxy/src/sasl/mod.rs index 0a36694359..f0181b404f 100644 --- a/proxy/src/sasl/mod.rs +++ b/proxy/src/sasl/mod.rs @@ -10,13 +10,14 @@ mod channel_binding; mod messages; mod stream; -use crate::error::{ReportableError, UserFacingError}; use std::io; -use thiserror::Error; pub(crate) use channel_binding::ChannelBinding; pub(crate) use messages::FirstMessage; pub(crate) use stream::{Outcome, SaslStream}; +use thiserror::Error; + +use crate::error::{ReportableError, UserFacingError}; /// Fine-grained auth errors help in writing tests. #[derive(Error, Debug)] diff --git a/proxy/src/sasl/stream.rs b/proxy/src/sasl/stream.rs index b6becd28e1..f1c916daa2 100644 --- a/proxy/src/sasl/stream.rs +++ b/proxy/src/sasl/stream.rs @@ -1,11 +1,14 @@ //! Abstraction for the string-oriented SASL protocols. 
-use super::{messages::ServerMessage, Mechanism}; -use crate::stream::PqStream; use std::io; + use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; +use super::messages::ServerMessage; +use super::Mechanism; +use crate::stream::PqStream; + /// Abstracts away all peculiarities of the libpq's protocol. pub(crate) struct SaslStream<'a, S> { /// The underlying stream. diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs index 64ee0135e1..87ab6e0d5f 100644 --- a/proxy/src/scram/countmin.rs +++ b/proxy/src/scram/countmin.rs @@ -69,7 +69,9 @@ impl CountMinSketch { #[cfg(test)] mod tests { - use rand::{rngs::StdRng, seq::SliceRandom, Rng, SeedableRng}; + use rand::rngs::StdRng; + use rand::seq::SliceRandom; + use rand::{Rng, SeedableRng}; use super::CountMinSketch; diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index afb5604666..6a13f645a5 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -209,7 +209,8 @@ impl sasl::Mechanism for Exchange<'_> { type Output = super::ScramKey; fn exchange(mut self, input: &str) -> sasl::Result> { - use {sasl::Step, ExchangeState}; + use sasl::Step; + use ExchangeState; match &self.state { ExchangeState::Initial(init) => { match init.transition(self.secret, &self.tls_server_end_point, input)? { @@ -217,16 +218,12 @@ impl sasl::Mechanism for Exchange<'_> { self.state = ExchangeState::SaltSent(sent); Ok(Step::Continue(self, msg)) } - #[allow(unreachable_patterns)] // TODO: 1.82: simply drop this match - Step::Success(x, _) => match x {}, Step::Failure(msg) => Ok(Step::Failure(msg)), } } ExchangeState::SaltSent(sent) => { match sent.transition(self.secret, &self.tls_server_end_point, input)? 
{ Step::Success(keys, msg) => Ok(Step::Success(keys, msg)), - #[allow(unreachable_patterns)] // TODO: 1.82: simply drop this match - Step::Continue(x, _) => match x {}, Step::Failure(msg) => Ok(Step::Failure(msg)), } } diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index fd9e77764c..5ee3a51352 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -1,11 +1,12 @@ //! Definitions for SCRAM messages. +use std::fmt; +use std::ops::Range; + use super::base64_decode_array; use super::key::{ScramKey, SCRAM_KEY_LEN}; use super::signature::SignatureBuilder; use crate::sasl::ChannelBinding; -use std::fmt; -use std::ops::Range; /// Faithfully taken from PostgreSQL. pub(crate) const SCRAM_RAW_NONCE_LEN: usize = 18; diff --git a/proxy/src/scram/mod.rs b/proxy/src/scram/mod.rs index d058f1c3f8..97644b6282 100644 --- a/proxy/src/scram/mod.rs +++ b/proxy/src/scram/mod.rs @@ -16,10 +16,9 @@ mod signature; pub mod threadpool; pub(crate) use exchange::{exchange, Exchange}; +use hmac::{Hmac, Mac}; pub(crate) use key::ScramKey; pub(crate) use secret::ServerSecret; - -use hmac::{Hmac, Mac}; use sha2::{Digest, Sha256}; const SCRAM_SHA_256: &str = "SCRAM-SHA-256"; @@ -59,13 +58,11 @@ fn sha256<'a>(parts: impl IntoIterator) -> [u8; 32] { #[cfg(test)] mod tests { - use crate::{ - intern::EndpointIdInt, - sasl::{Mechanism, Step}, - EndpointId, - }; - - use super::{threadpool::ThreadPool, Exchange, ServerSecret}; + use super::threadpool::ThreadPool; + use super::{Exchange, ServerSecret}; + use crate::intern::EndpointIdInt; + use crate::sasl::{Mechanism, Step}; + use crate::EndpointId; #[test] fn snapshot() { diff --git a/proxy/src/scram/pbkdf2.rs b/proxy/src/scram/pbkdf2.rs index 4cf76c8452..9c559e9082 100644 --- a/proxy/src/scram/pbkdf2.rs +++ b/proxy/src/scram/pbkdf2.rs @@ -1,7 +1,6 @@ -use hmac::{ - digest::{consts::U32, generic_array::GenericArray}, - Hmac, Mac, -}; +use hmac::digest::consts::U32; +use 
hmac::digest::generic_array::GenericArray; +use hmac::{Hmac, Mac}; use sha2::Sha256; pub(crate) struct Pbkdf2 { @@ -66,10 +65,11 @@ impl Pbkdf2 { #[cfg(test)] mod tests { - use super::Pbkdf2; use pbkdf2::pbkdf2_hmac_array; use sha2::Sha256; + use super::Pbkdf2; + #[test] fn works() { let salt = b"sodium chloride"; diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs index c027a0cd20..cc1b69fcf9 100644 --- a/proxy/src/scram/threadpool.rs +++ b/proxy/src/scram/threadpool.rs @@ -4,28 +4,21 @@ //! 1. Fairness per endpoint. //! 2. Yield support for high iteration counts. -use std::{ - cell::RefCell, - future::Future, - pin::Pin, - sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, Weak, - }, - task::{Context, Poll}, -}; +use std::cell::RefCell; +use std::future::Future; +use std::pin::Pin; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::{Arc, Weak}; +use std::task::{Context, Poll}; use futures::FutureExt; -use rand::Rng; -use rand::{rngs::SmallRng, SeedableRng}; - -use crate::{ - intern::EndpointIdInt, - metrics::{ThreadPoolMetrics, ThreadPoolWorkerId}, - scram::countmin::CountMinSketch, -}; +use rand::rngs::SmallRng; +use rand::{Rng, SeedableRng}; use super::pbkdf2::Pbkdf2; +use crate::intern::EndpointIdInt; +use crate::metrics::{ThreadPoolMetrics, ThreadPoolWorkerId}; +use crate::scram::countmin::CountMinSketch; pub struct ThreadPool { runtime: Option, @@ -195,9 +188,8 @@ impl Drop for JobHandle { #[cfg(test)] mod tests { - use crate::EndpointId; - use super::*; + use crate::EndpointId; #[tokio::test] async fn hash_is_correct() { diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 9e49478cf3..5d59b4d252 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -1,48 +1,46 @@ -use std::{io, sync::Arc, time::Duration}; +use std::io; +use std::sync::Arc; +use std::time::Duration; use async_trait::async_trait; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; +use 
p256::ecdsa::SigningKey; +use p256::elliptic_curve::JwkEcKey; +use rand::rngs::OsRng; use tokio::net::{lookup_host, TcpStream}; -use tokio_postgres::types::ToSql; -use tracing::{debug, field::display, info}; +use tracing::field::display; +use tracing::{debug, info}; -use crate::{ - auth::{ - self, - backend::{local::StaticAuthRules, ComputeCredentials, ComputeUserInfo}, - check_peer_addr_is_in_list, AuthError, - }, - compute, - config::ProxyConfig, - context::RequestMonitoring, - control_plane::{ - errors::{GetAuthInfoError, WakeComputeError}, - locks::ApiLocks, - provider::ApiLockError, - CachedNodeInfo, - }, - error::{ErrorKind, ReportableError, UserFacingError}, - intern::EndpointIdInt, - proxy::{ - connect_compute::ConnectMechanism, - retry::{CouldRetry, ShouldRetryWakeCompute}, - }, - rate_limiter::EndpointRateLimiter, - EndpointId, Host, -}; - -use super::{ - conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}, - http_conn_pool::{self, poll_http2_client}, - local_conn_pool::{self, LocalClient, LocalConnPool}, +use super::conn_pool::poll_client; +use super::conn_pool_lib::{Client, ConnInfo, GlobalConnPool}; +use super::http_conn_pool::{self, poll_http2_client, Send}; +use super::local_conn_pool::{self, LocalClient, LocalConnPool, EXT_NAME, EXT_SCHEMA, EXT_VERSION}; +use crate::auth::backend::local::StaticAuthRules; +use crate::auth::backend::{ComputeCredentials, ComputeUserInfo}; +use crate::auth::{self, check_peer_addr_is_in_list, AuthError}; +use crate::compute_ctl::{ + ComputeCtlError, ExtensionInstallRequest, Privilege, SetRoleGrantsRequest, }; +use crate::config::ProxyConfig; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError}; +use crate::control_plane::locks::ApiLocks; +use crate::control_plane::provider::ApiLockError; +use crate::control_plane::CachedNodeInfo; +use crate::error::{ErrorKind, ReportableError, UserFacingError}; +use crate::intern::EndpointIdInt; +use 
crate::proxy::connect_compute::ConnectMechanism; +use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute}; +use crate::rate_limiter::EndpointRateLimiter; +use crate::{compute, EndpointId, Host}; pub(crate) struct PoolingBackend { - pub(crate) http_conn_pool: Arc, + pub(crate) http_conn_pool: Arc>, pub(crate) local_pool: Arc>, pub(crate) pool: Arc>, + pub(crate) config: &'static ProxyConfig, - pub(crate) auth_backend: &'static crate::auth::Backend<'static, (), ()>, + pub(crate) auth_backend: &'static crate::auth::Backend<'static, ()>, pub(crate) endpoint_rate_limiter: Arc, } @@ -135,9 +133,6 @@ impl PoolingBackend { keys: crate::auth::backend::ComputeCredentialKeys::None, }) } - crate::auth::Backend::ConsoleRedirect(_, ()) => Err(AuthError::auth_failed( - "JWT login over web auth proxy is not supported", - )), crate::auth::Backend::Local(_) => { let keys = self .config @@ -209,7 +204,7 @@ impl PoolingBackend { &self, ctx: &RequestMonitoring, conn_info: ConnInfo, - ) -> Result { + ) -> Result, HttpConnError> { info!("pool: looking for an existing connection"); if let Some(client) = self.http_conn_pool.get(ctx, &conn_info) { return Ok(client); @@ -259,61 +254,100 @@ impl PoolingBackend { return Ok(client); } + let local_backend = match &self.auth_backend { + auth::Backend::ControlPlane(_, ()) => { + unreachable!("only local_proxy can connect to local postgres") + } + auth::Backend::Local(local) => local, + }; + + if !self.local_pool.initialized(&conn_info) { + // only install and grant usage one at a time. 
+ let _permit = local_backend.initialize.acquire().await.unwrap(); + + // check again for race + if !self.local_pool.initialized(&conn_info) { + local_backend + .compute_ctl + .install_extension(&ExtensionInstallRequest { + extension: EXT_NAME, + database: conn_info.dbname.clone(), + version: EXT_VERSION, + }) + .await?; + + local_backend + .compute_ctl + .grant_role(&SetRoleGrantsRequest { + schema: EXT_SCHEMA, + privileges: vec![Privilege::Usage], + database: conn_info.dbname.clone(), + role: conn_info.user_info.user.clone(), + }) + .await?; + + self.local_pool.set_initialized(&conn_info); + } + } + let conn_id = uuid::Uuid::new_v4(); tracing::Span::current().record("conn_id", display(conn_id)); info!(%conn_id, "local_pool: opening a new connection '{conn_info}'"); - let mut node_info = match &self.auth_backend { - auth::Backend::ControlPlane(_, ()) | auth::Backend::ConsoleRedirect(_, ()) => { - unreachable!("only local_proxy can connect to local postgres") - } - auth::Backend::Local(local) => local.node_info.clone(), - }; + let mut node_info = local_backend.node_info.clone(); + + let (key, jwk) = create_random_jwk(); let config = node_info .config .user(&conn_info.user_info.user) - .dbname(&conn_info.dbname); + .dbname(&conn_info.dbname) + .options(&format!( + "-c pg_session_jwt.jwk={}", + serde_json::to_string(&jwk).expect("serializing jwk to json should not fail") + )); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (client, connection) = config.connect(tokio_postgres::NoTls).await?; drop(pause); - tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); + let pid = client.get_process_id(); + tracing::Span::current().record("pid", pid); - let handle = local_conn_pool::poll_client( + let mut handle = local_conn_pool::poll_client( self.local_pool.clone(), ctx, conn_info, client, connection, + key, conn_id, node_info.aux.clone(), ); - let kid = handle.get_client().get_process_id() as i64; - let jwk = 
p256::PublicKey::from(handle.key().verifying_key()).to_jwk(); + { + let (client, mut discard) = handle.inner(); + debug!("setting up backend session state"); - debug!(kid, ?jwk, "setting up backend session state"); + // initiates the auth session + if let Err(e) = client.query("select auth.init()", &[]).await { + discard.discard(); + return Err(e.into()); + } - // initiates the auth session - handle - .get_client() - .query( - "select auth.init($1, $2);", - &[ - &kid as &(dyn ToSql + Sync), - &tokio_postgres::types::Json(jwk), - ], - ) - .await?; - - info!(?kid, "backend session state init"); + info!("backend session state initialized"); + } Ok(handle) } } +fn create_random_jwk() -> (SigningKey, JwkEcKey) { + let key = SigningKey::random(&mut OsRng); + let jwk = p256::PublicKey::from(key.verifying_key()).to_jwk(); + (key, jwk) +} + #[derive(Debug, thiserror::Error)] pub(crate) enum HttpConnError { #[error("pooled connection closed at inconsistent state")] @@ -325,6 +359,8 @@ pub(crate) enum HttpConnError { #[error("could not parse JWT payload")] JwtPayloadError(serde_json::Error), + #[error("could not install extension: {0}")] + ComputeCtl(#[from] ComputeCtlError), #[error("could not get auth info")] GetAuthInfo(#[from] GetAuthInfoError), #[error("user not authenticated")] @@ -349,6 +385,7 @@ impl ReportableError for HttpConnError { HttpConnError::ConnectionClosedAbruptly(_) => ErrorKind::Compute, HttpConnError::PostgresConnectionError(p) => p.get_error_kind(), HttpConnError::LocalProxyConnectionError(_) => ErrorKind::Compute, + HttpConnError::ComputeCtl(_) => ErrorKind::Service, HttpConnError::JwtPayloadError(_) => ErrorKind::User, HttpConnError::GetAuthInfo(a) => a.get_error_kind(), HttpConnError::AuthError(a) => a.get_error_kind(), @@ -364,6 +401,7 @@ impl UserFacingError for HttpConnError { HttpConnError::ConnectionClosedAbruptly(_) => self.to_string(), HttpConnError::PostgresConnectionError(p) => p.to_string(), HttpConnError::LocalProxyConnectionError(p) => 
p.to_string(), + HttpConnError::ComputeCtl(_) => "could not set up the JWT authorization database extension".to_string(), HttpConnError::JwtPayloadError(p) => p.to_string(), HttpConnError::GetAuthInfo(c) => c.to_string_client(), HttpConnError::AuthError(c) => c.to_string_client(), @@ -380,6 +418,7 @@ impl CouldRetry for HttpConnError { match self { HttpConnError::PostgresConnectionError(e) => e.could_retry(), HttpConnError::LocalProxyConnectionError(e) => e.could_retry(), + HttpConnError::ComputeCtl(_) => false, HttpConnError::ConnectionClosedAbruptly(_) => false, HttpConnError::JwtPayloadError(_) => false, HttpConnError::GetAuthInfo(_) => false, @@ -483,7 +522,7 @@ impl ConnectMechanism for TokioMechanism { } struct HyperMechanism { - pool: Arc, + pool: Arc>, conn_info: ConnInfo, conn_id: uuid::Uuid, @@ -493,7 +532,7 @@ struct HyperMechanism { #[async_trait] impl ConnectMechanism for HyperMechanism { - type Connection = http_conn_pool::Client; + type Connection = http_conn_pool::Client; type ConnectError = HttpConnError; type Error = HttpConnError; diff --git a/proxy/src/serverless/cancel_set.rs b/proxy/src/serverless/cancel_set.rs index 7659745473..6db986f1f7 100644 --- a/proxy/src/serverless/cancel_set.rs +++ b/proxy/src/serverless/cancel_set.rs @@ -1,10 +1,8 @@ //! 
A set for cancelling random http connections -use std::{ - hash::{BuildHasher, BuildHasherDefault}, - num::NonZeroUsize, - time::Duration, -}; +use std::hash::{BuildHasher, BuildHasherDefault}; +use std::num::NonZeroUsize; +use std::time::Duration; use indexmap::IndexMap; use parking_lot::Mutex; diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 2e576e0ded..8401e3a1c9 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,33 +1,27 @@ -use dashmap::DashMap; -use futures::{future::poll_fn, Future}; -use parking_lot::RwLock; -use rand::Rng; +use std::fmt; +use std::pin::pin; +use std::sync::{Arc, Weak}; +use std::task::{ready, Poll}; + +use futures::future::poll_fn; +use futures::Future; use smallvec::SmallVec; -use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration}; -use std::{ - fmt, - task::{ready, Poll}, -}; -use std::{ - ops::Deref, - sync::atomic::{self, AtomicUsize}, -}; use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; -use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; +use tokio_postgres::{AsyncMessage, Socket}; use tokio_util::sync::CancellationToken; - -use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; -use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; -use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -use crate::{ - auth::backend::ComputeUserInfo, context::RequestMonitoring, DbName, EndpointCacheKey, RoleName, +use tracing::{error, info, info_span, warn, Instrument}; +#[cfg(test)] +use { + super::conn_pool_lib::GlobalConnPoolOptions, + crate::auth::backend::ComputeUserInfo, + std::{sync::atomic, time::Duration}, }; -use tracing::{debug, error, warn, Span}; -use tracing::{info, info_span, Instrument}; - -use super::backend::HttpConnError; +use super::conn_pool_lib::{Client, ClientInnerExt, ConnInfo, GlobalConnPool}; +use crate::context::RequestMonitoring; +use 
crate::control_plane::messages::MetricsAuxInfo; +use crate::metrics::Metrics; #[derive(Debug, Clone)] pub(crate) struct ConnInfoWithAuth { @@ -35,34 +29,12 @@ pub(crate) struct ConnInfoWithAuth { pub(crate) auth: AuthData, } -#[derive(Debug, Clone)] -pub(crate) struct ConnInfo { - pub(crate) user_info: ComputeUserInfo, - pub(crate) dbname: DbName, -} - #[derive(Debug, Clone)] pub(crate) enum AuthData { Password(SmallVec<[u8; 16]>), Jwt(String), } -impl ConnInfo { - // hm, change to hasher to avoid cloning? - pub(crate) fn db_and_user(&self) -> (DbName, RoleName) { - (self.dbname.clone(), self.user_info.user.clone()) - } - - pub(crate) fn endpoint_cache_key(&self) -> Option { - // We don't want to cache http connections for ephemeral endpoints. - if self.user_info.options.is_ephemeral() { - None - } else { - Some(self.user_info.endpoint_cache_key()) - } - } -} - impl fmt::Display for ConnInfo { // use custom display to avoid logging password fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -77,402 +49,6 @@ impl fmt::Display for ConnInfo { } } -struct ConnPoolEntry { - conn: ClientInner, - _last_access: std::time::Instant, -} - -// Per-endpoint connection pool, (dbname, username) -> DbUserConnPool -// Number of open connections is limited by the `max_conns_per_endpoint`. -pub(crate) struct EndpointConnPool { - pools: HashMap<(DbName, RoleName), DbUserConnPool>, - total_conns: usize, - max_conns: usize, - _guard: HttpEndpointPoolsGuard<'static>, - global_connections_count: Arc, - global_pool_size_max_conns: usize, -} - -impl EndpointConnPool { - fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option> { - let Self { - pools, - total_conns, - global_connections_count, - .. 
- } = self; - pools.get_mut(&db_user).and_then(|pool_entries| { - pool_entries.get_conn_entry(total_conns, global_connections_count.clone()) - }) - } - - fn remove_client(&mut self, db_user: (DbName, RoleName), conn_id: uuid::Uuid) -> bool { - let Self { - pools, - total_conns, - global_connections_count, - .. - } = self; - if let Some(pool) = pools.get_mut(&db_user) { - let old_len = pool.conns.len(); - pool.conns.retain(|conn| conn.conn.conn_id != conn_id); - let new_len = pool.conns.len(); - let removed = old_len - new_len; - if removed > 0 { - global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .dec_by(removed as i64); - } - *total_conns -= removed; - removed > 0 - } else { - false - } - } - - fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInner) { - let conn_id = client.conn_id; - - if client.is_closed() { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed"); - return; - } - let global_max_conn = pool.read().global_pool_size_max_conns; - if pool - .read() - .global_connections_count - .load(atomic::Ordering::Relaxed) - >= global_max_conn - { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full"); - return; - } - - // return connection to the pool - let mut returned = false; - let mut per_db_size = 0; - let total_conns = { - let mut pool = pool.write(); - - if pool.total_conns < pool.max_conns { - let pool_entries = pool.pools.entry(conn_info.db_and_user()).or_default(); - pool_entries.conns.push(ConnPoolEntry { - conn: client, - _last_access: std::time::Instant::now(), - }); - - returned = true; - per_db_size = pool_entries.conns.len(); - - pool.total_conns += 1; - pool.global_connections_count - .fetch_add(1, atomic::Ordering::Relaxed); - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .inc(); - } - - pool.total_conns - }; - - // do logging outside of the 
mutex - if returned { - info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}"); - } else { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}"); - } - } -} - -impl Drop for EndpointConnPool { - fn drop(&mut self) { - if self.total_conns > 0 { - self.global_connections_count - .fetch_sub(self.total_conns, atomic::Ordering::Relaxed); - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .dec_by(self.total_conns as i64); - } - } -} - -pub(crate) struct DbUserConnPool { - conns: Vec>, -} - -impl Default for DbUserConnPool { - fn default() -> Self { - Self { conns: Vec::new() } - } -} - -impl DbUserConnPool { - fn clear_closed_clients(&mut self, conns: &mut usize) -> usize { - let old_len = self.conns.len(); - - self.conns.retain(|conn| !conn.conn.is_closed()); - - let new_len = self.conns.len(); - let removed = old_len - new_len; - *conns -= removed; - removed - } - - fn get_conn_entry( - &mut self, - conns: &mut usize, - global_connections_count: Arc, - ) -> Option> { - let mut removed = self.clear_closed_clients(conns); - let conn = self.conns.pop(); - if conn.is_some() { - *conns -= 1; - removed += 1; - } - global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .dec_by(removed as i64); - conn - } -} - -pub(crate) struct GlobalConnPool { - // endpoint -> per-endpoint connection pool - // - // That should be a fairly conteded map, so return reference to the per-endpoint - // pool as early as possible and release the lock. - global_pool: DashMap>>>, - - /// Number of endpoint-connection pools - /// - /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each. - /// That seems like far too much effort, so we're using a relaxed increment counter instead. - /// It's only used for diagnostics. 
- global_pool_size: AtomicUsize, - - /// Total number of connections in the pool - global_connections_count: Arc, - - config: &'static crate::config::HttpConfig, -} - -#[derive(Debug, Clone, Copy)] -pub struct GlobalConnPoolOptions { - // Maximum number of connections per one endpoint. - // Can mix different (dbname, username) connections. - // When running out of free slots for a particular endpoint, - // falls back to opening a new connection for each request. - pub max_conns_per_endpoint: usize, - - pub gc_epoch: Duration, - - pub pool_shards: usize, - - pub idle_timeout: Duration, - - pub opt_in: bool, - - // Total number of connections in the pool. - pub max_total_conns: usize, -} - -impl GlobalConnPool { - pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { - let shards = config.pool_options.pool_shards; - Arc::new(Self { - global_pool: DashMap::with_shard_amount(shards), - global_pool_size: AtomicUsize::new(0), - config, - global_connections_count: Arc::new(AtomicUsize::new(0)), - }) - } - - #[cfg(test)] - pub(crate) fn get_global_connections_count(&self) -> usize { - self.global_connections_count - .load(atomic::Ordering::Relaxed) - } - - pub(crate) fn get_idle_timeout(&self) -> Duration { - self.config.pool_options.idle_timeout - } - - pub(crate) fn shutdown(&self) { - // drops all strong references to endpoint-pools - self.global_pool.clear(); - } - - pub(crate) async fn gc_worker(&self, mut rng: impl Rng) { - let epoch = self.config.pool_options.gc_epoch; - let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32); - loop { - interval.tick().await; - - let shard = rng.gen_range(0..self.global_pool.shards().len()); - self.gc(shard); - } - } - - fn gc(&self, shard: usize) { - debug!(shard, "pool: performing epoch reclamation"); - - // acquire a random shard lock - let mut shard = self.global_pool.shards()[shard].write(); - - let timer = Metrics::get() - .proxy - .http_pool_reclaimation_lag_seconds - 
.start_timer(); - let current_len = shard.len(); - let mut clients_removed = 0; - shard.retain(|endpoint, x| { - // if the current endpoint pool is unique (no other strong or weak references) - // then it is currently not in use by any connections. - if let Some(pool) = Arc::get_mut(x.get_mut()) { - let EndpointConnPool { - pools, total_conns, .. - } = pool.get_mut(); - - // ensure that closed clients are removed - for db_pool in pools.values_mut() { - clients_removed += db_pool.clear_closed_clients(total_conns); - } - - // we only remove this pool if it has no active connections - if *total_conns == 0 { - info!("pool: discarding pool for endpoint {endpoint}"); - return false; - } - } - - true - }); - - let new_len = shard.len(); - drop(shard); - timer.observe(); - - // Do logging outside of the lock. - if clients_removed > 0 { - let size = self - .global_connections_count - .fetch_sub(clients_removed, atomic::Ordering::Relaxed) - - clients_removed; - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .dec_by(clients_removed as i64); - info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}"); - } - let removed = current_len - new_len; - - if removed > 0 { - let global_pool_size = self - .global_pool_size - .fetch_sub(removed, atomic::Ordering::Relaxed) - - removed; - info!("pool: performed global pool gc. 
size now {global_pool_size}"); - } - } - - pub(crate) fn get( - self: &Arc, - ctx: &RequestMonitoring, - conn_info: &ConnInfo, - ) -> Result>, HttpConnError> { - let mut client: Option> = None; - let Some(endpoint) = conn_info.endpoint_cache_key() else { - return Ok(None); - }; - - let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); - if let Some(entry) = endpoint_pool - .write() - .get_conn_entry(conn_info.db_and_user()) - { - client = Some(entry.conn); - } - let endpoint_pool = Arc::downgrade(&endpoint_pool); - - // ok return cached connection if found and establish a new one otherwise - if let Some(client) = client { - if client.is_closed() { - info!("pool: cached connection '{conn_info}' is closed, opening a new one"); - return Ok(None); - } - tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id)); - tracing::Span::current().record( - "pid", - tracing::field::display(client.inner.get_process_id()), - ); - info!( - cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), - "pool: reusing connection '{conn_info}'" - ); - client.session.send(ctx.session_id())?; - ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); - ctx.success(); - return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); - } - Ok(None) - } - - fn get_or_create_endpoint_pool( - self: &Arc, - endpoint: &EndpointCacheKey, - ) -> Arc>> { - // fast path - if let Some(pool) = self.global_pool.get(endpoint) { - return pool.clone(); - } - - // slow path - let new_pool = Arc::new(RwLock::new(EndpointConnPool { - pools: HashMap::new(), - total_conns: 0, - max_conns: self.config.pool_options.max_conns_per_endpoint, - _guard: Metrics::get().proxy.http_endpoint_pools.guard(), - global_connections_count: self.global_connections_count.clone(), - global_pool_size_max_conns: self.config.pool_options.max_total_conns, - })); - - // find or create a pool for this endpoint - let mut created = false; - let pool = self - .global_pool - .entry(endpoint.clone()) - 
.or_insert_with(|| { - created = true; - new_pool - }) - .clone(); - - // log new global pool size - if created { - let global_pool_size = self - .global_pool_size - .fetch_add(1, atomic::Ordering::Relaxed) - + 1; - info!( - "pool: created new pool for '{endpoint}', global pool size now {global_pool_size}" - ); - } - - pool - } -} - pub(crate) fn poll_client( global_pool: Arc>, ctx: &RequestMonitoring, @@ -576,7 +152,7 @@ pub(crate) fn poll_client( } .instrument(span)); - let inner = ClientInner { + let inner = ClientInnerRemote { inner: client, session: tx, cancel, @@ -586,7 +162,7 @@ pub(crate) fn poll_client( Client::new(inner, conn_info, pool_clone) } -struct ClientInner { +pub(crate) struct ClientInnerRemote { inner: C, session: tokio::sync::watch::Sender, cancel: CancellationToken, @@ -594,143 +170,48 @@ struct ClientInner { conn_id: uuid::Uuid, } -impl Drop for ClientInner { +impl ClientInnerRemote { + pub(crate) fn inner_mut(&mut self) -> &mut C { + &mut self.inner + } + + pub(crate) fn inner(&self) -> &C { + &self.inner + } + + pub(crate) fn session(&mut self) -> &mut tokio::sync::watch::Sender { + &mut self.session + } + + pub(crate) fn aux(&self) -> &MetricsAuxInfo { + &self.aux + } + + pub(crate) fn get_conn_id(&self) -> uuid::Uuid { + self.conn_id + } + + pub(crate) fn is_closed(&self) -> bool { + self.inner.is_closed() + } +} + +impl Drop for ClientInnerRemote { fn drop(&mut self) { // on client drop, tell the conn to shut down self.cancel.cancel(); } } -pub(crate) trait ClientInnerExt: Sync + Send + 'static { - fn is_closed(&self) -> bool; - fn get_process_id(&self) -> i32; -} - -impl ClientInnerExt for tokio_postgres::Client { - fn is_closed(&self) -> bool { - self.is_closed() - } - fn get_process_id(&self) -> i32 { - self.get_process_id() - } -} - -impl ClientInner { - pub(crate) fn is_closed(&self) -> bool { - self.inner.is_closed() - } -} - -impl Client { - pub(crate) fn metrics(&self) -> Arc { - let aux = &self.inner.as_ref().unwrap().aux; - 
USAGE_METRICS.register(Ids { - endpoint_id: aux.endpoint_id, - branch_id: aux.branch_id, - }) - } -} - -pub(crate) struct Client { - span: Span, - inner: Option>, - conn_info: ConnInfo, - pool: Weak>>, -} - -pub(crate) struct Discard<'a, C: ClientInnerExt> { - conn_info: &'a ConnInfo, - pool: &'a mut Weak>>, -} - -impl Client { - pub(self) fn new( - inner: ClientInner, - conn_info: ConnInfo, - pool: Weak>>, - ) -> Self { - Self { - inner: Some(inner), - span: Span::current(), - conn_info, - pool, - } - } - pub(crate) fn inner(&mut self) -> (&mut C, Discard<'_, C>) { - let Self { - inner, - pool, - conn_info, - span: _, - } = self; - let inner = inner.as_mut().expect("client inner should not be removed"); - (&mut inner.inner, Discard { conn_info, pool }) - } -} - -impl Discard<'_, C> { - pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) { - let conn_info = &self.conn_info; - if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { - info!("pool: throwing away connection '{conn_info}' because connection is not idle"); - } - } - pub(crate) fn discard(&mut self) { - let conn_info = &self.conn_info; - if std::mem::take(self.pool).strong_count() > 0 { - info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state"); - } - } -} - -impl Deref for Client { - type Target = C; - - fn deref(&self) -> &Self::Target { - &self - .inner - .as_ref() - .expect("client inner should not be removed") - .inner - } -} - -impl Client { - fn do_drop(&mut self) -> Option { - let conn_info = self.conn_info.clone(); - let client = self - .inner - .take() - .expect("client inner should not be removed"); - if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() { - let current_span = self.span.clone(); - // return connection to the pool - return Some(move || { - let _span = current_span.enter(); - EndpointConnPool::put(&conn_pool, &conn_info, client); - }); - } - None - } -} - -impl Drop for 
Client { - fn drop(&mut self) { - if let Some(drop) = self.do_drop() { - tokio::task::spawn_blocking(drop); - } - } -} - #[cfg(test)] mod tests { - use std::{mem, sync::atomic::AtomicBool}; - - use crate::{ - proxy::NeonOptions, serverless::cancel_set::CancelSet, BranchId, EndpointId, ProjectId, - }; + use std::mem; + use std::sync::atomic::AtomicBool; use super::*; + use crate::proxy::NeonOptions; + use crate::serverless::cancel_set::CancelSet; + use crate::{BranchId, EndpointId, ProjectId}; struct MockClient(Arc); impl MockClient { @@ -747,12 +228,12 @@ mod tests { } } - fn create_inner() -> ClientInner { + fn create_inner() -> ClientInnerRemote { create_inner_with(MockClient::new(false)) } - fn create_inner_with(client: MockClient) -> ClientInner { - ClientInner { + fn create_inner_with(client: MockClient) -> ClientInnerRemote { + ClientInnerRemote { inner: client, session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()), cancel: CancellationToken::new(), @@ -799,7 +280,7 @@ mod tests { { let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); assert_eq!(0, pool.get_global_connections_count()); - client.inner().1.discard(); + client.inner_mut().1.discard(); // Discard should not add the connection from the pool. 
assert_eq!(0, pool.get_global_connections_count()); } diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs new file mode 100644 index 0000000000..844730194d --- /dev/null +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -0,0 +1,560 @@ +use std::collections::HashMap; +use std::ops::Deref; +use std::sync::atomic::{self, AtomicUsize}; +use std::sync::{Arc, Weak}; +use std::time::Duration; + +use dashmap::DashMap; +use parking_lot::RwLock; +use rand::Rng; +use tokio_postgres::ReadyForQueryStatus; +use tracing::{debug, info, Span}; + +use super::backend::HttpConnError; +use super::conn_pool::ClientInnerRemote; +use crate::auth::backend::ComputeUserInfo; +use crate::context::RequestMonitoring; +use crate::control_plane::messages::ColdStartInfo; +use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; +use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; +use crate::{DbName, EndpointCacheKey, RoleName}; + +#[derive(Debug, Clone)] +pub(crate) struct ConnInfo { + pub(crate) user_info: ComputeUserInfo, + pub(crate) dbname: DbName, +} + +impl ConnInfo { + // hm, change to hasher to avoid cloning? + pub(crate) fn db_and_user(&self) -> (DbName, RoleName) { + (self.dbname.clone(), self.user_info.user.clone()) + } + + pub(crate) fn endpoint_cache_key(&self) -> Option { + // We don't want to cache http connections for ephemeral endpoints. + if self.user_info.options.is_ephemeral() { + None + } else { + Some(self.user_info.endpoint_cache_key()) + } + } +} + +pub(crate) struct ConnPoolEntry { + pub(crate) conn: ClientInnerRemote, + pub(crate) _last_access: std::time::Instant, +} + +// Per-endpoint connection pool, (dbname, username) -> DbUserConnPool +// Number of open connections is limited by the `max_conns_per_endpoint`. 
+pub(crate) struct EndpointConnPool { + pools: HashMap<(DbName, RoleName), DbUserConnPool>, + total_conns: usize, + max_conns: usize, + _guard: HttpEndpointPoolsGuard<'static>, + global_connections_count: Arc, + global_pool_size_max_conns: usize, +} + +impl EndpointConnPool { + fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option> { + let Self { + pools, + total_conns, + global_connections_count, + .. + } = self; + pools.get_mut(&db_user).and_then(|pool_entries| { + let (entry, removed) = pool_entries.get_conn_entry(total_conns); + global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); + entry + }) + } + + pub(crate) fn remove_client( + &mut self, + db_user: (DbName, RoleName), + conn_id: uuid::Uuid, + ) -> bool { + let Self { + pools, + total_conns, + global_connections_count, + .. + } = self; + if let Some(pool) = pools.get_mut(&db_user) { + let old_len = pool.conns.len(); + pool.conns.retain(|conn| conn.conn.get_conn_id() != conn_id); + let new_len = pool.conns.len(); + let removed = old_len - new_len; + if removed > 0 { + global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); + } + *total_conns -= removed; + removed > 0 + } else { + false + } + } + + pub(crate) fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInnerRemote) { + let conn_id = client.get_conn_id(); + + if client.is_closed() { + info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed"); + return; + } + + let global_max_conn = pool.read().global_pool_size_max_conns; + if pool + .read() + .global_connections_count + .load(atomic::Ordering::Relaxed) + >= global_max_conn + { + info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full"); + return; + } + + // return connection to the pool + let mut returned = false; + let mut per_db_size = 0; + let total_conns = { + let mut pool = 
pool.write(); + + if pool.total_conns < pool.max_conns { + let pool_entries = pool.pools.entry(conn_info.db_and_user()).or_default(); + pool_entries.conns.push(ConnPoolEntry { + conn: client, + _last_access: std::time::Instant::now(), + }); + + returned = true; + per_db_size = pool_entries.conns.len(); + + pool.total_conns += 1; + pool.global_connections_count + .fetch_add(1, atomic::Ordering::Relaxed); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .inc(); + } + + pool.total_conns + }; + + // do logging outside of the mutex + if returned { + info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}"); + } else { + info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}"); + } + } +} + +impl Drop for EndpointConnPool { + fn drop(&mut self) { + if self.total_conns > 0 { + self.global_connections_count + .fetch_sub(self.total_conns, atomic::Ordering::Relaxed); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(self.total_conns as i64); + } + } +} + +pub(crate) struct DbUserConnPool { + pub(crate) conns: Vec>, +} + +impl Default for DbUserConnPool { + fn default() -> Self { + Self { conns: Vec::new() } + } +} + +impl DbUserConnPool { + fn clear_closed_clients(&mut self, conns: &mut usize) -> usize { + let old_len = self.conns.len(); + + self.conns.retain(|conn| !conn.conn.is_closed()); + + let new_len = self.conns.len(); + let removed = old_len - new_len; + *conns -= removed; + removed + } + + pub(crate) fn get_conn_entry( + &mut self, + conns: &mut usize, + ) -> (Option>, usize) { + let mut removed = self.clear_closed_clients(conns); + let conn = self.conns.pop(); + if conn.is_some() { + *conns -= 1; + removed += 1; + } + + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); + + (conn, removed) + } +} + +pub(crate) struct 
GlobalConnPool { + // endpoint -> per-endpoint connection pool + // + // That should be a fairly conteded map, so return reference to the per-endpoint + // pool as early as possible and release the lock. + global_pool: DashMap>>>, + + /// Number of endpoint-connection pools + /// + /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each. + /// That seems like far too much effort, so we're using a relaxed increment counter instead. + /// It's only used for diagnostics. + global_pool_size: AtomicUsize, + + /// Total number of connections in the pool + global_connections_count: Arc, + + config: &'static crate::config::HttpConfig, +} + +#[derive(Debug, Clone, Copy)] +pub struct GlobalConnPoolOptions { + // Maximum number of connections per one endpoint. + // Can mix different (dbname, username) connections. + // When running out of free slots for a particular endpoint, + // falls back to opening a new connection for each request. + pub max_conns_per_endpoint: usize, + + pub gc_epoch: Duration, + + pub pool_shards: usize, + + pub idle_timeout: Duration, + + pub opt_in: bool, + + // Total number of connections in the pool. 
+ pub max_total_conns: usize, +} + +impl GlobalConnPool { + pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { + let shards = config.pool_options.pool_shards; + Arc::new(Self { + global_pool: DashMap::with_shard_amount(shards), + global_pool_size: AtomicUsize::new(0), + config, + global_connections_count: Arc::new(AtomicUsize::new(0)), + }) + } + + #[cfg(test)] + pub(crate) fn get_global_connections_count(&self) -> usize { + self.global_connections_count + .load(atomic::Ordering::Relaxed) + } + + pub(crate) fn get_idle_timeout(&self) -> Duration { + self.config.pool_options.idle_timeout + } + + pub(crate) fn shutdown(&self) { + // drops all strong references to endpoint-pools + self.global_pool.clear(); + } + + pub(crate) async fn gc_worker(&self, mut rng: impl Rng) { + let epoch = self.config.pool_options.gc_epoch; + let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32); + loop { + interval.tick().await; + + let shard = rng.gen_range(0..self.global_pool.shards().len()); + self.gc(shard); + } + } + + pub(crate) fn gc(&self, shard: usize) { + debug!(shard, "pool: performing epoch reclamation"); + + // acquire a random shard lock + let mut shard = self.global_pool.shards()[shard].write(); + + let timer = Metrics::get() + .proxy + .http_pool_reclaimation_lag_seconds + .start_timer(); + let current_len = shard.len(); + let mut clients_removed = 0; + shard.retain(|endpoint, x| { + // if the current endpoint pool is unique (no other strong or weak references) + // then it is currently not in use by any connections. + if let Some(pool) = Arc::get_mut(x.get_mut()) { + let EndpointConnPool { + pools, total_conns, .. 
+ } = pool.get_mut(); + + // ensure that closed clients are removed + for db_pool in pools.values_mut() { + clients_removed += db_pool.clear_closed_clients(total_conns); + } + + // we only remove this pool if it has no active connections + if *total_conns == 0 { + info!("pool: discarding pool for endpoint {endpoint}"); + return false; + } + } + + true + }); + + let new_len = shard.len(); + drop(shard); + timer.observe(); + + // Do logging outside of the lock. + if clients_removed > 0 { + let size = self + .global_connections_count + .fetch_sub(clients_removed, atomic::Ordering::Relaxed) + - clients_removed; + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(clients_removed as i64); + info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}"); + } + let removed = current_len - new_len; + + if removed > 0 { + let global_pool_size = self + .global_pool_size + .fetch_sub(removed, atomic::Ordering::Relaxed) + - removed; + info!("pool: performed global pool gc. 
size now {global_pool_size}"); + } + } + + pub(crate) fn get_or_create_endpoint_pool( + self: &Arc, + endpoint: &EndpointCacheKey, + ) -> Arc>> { + // fast path + if let Some(pool) = self.global_pool.get(endpoint) { + return pool.clone(); + } + + // slow path + let new_pool = Arc::new(RwLock::new(EndpointConnPool { + pools: HashMap::new(), + total_conns: 0, + max_conns: self.config.pool_options.max_conns_per_endpoint, + _guard: Metrics::get().proxy.http_endpoint_pools.guard(), + global_connections_count: self.global_connections_count.clone(), + global_pool_size_max_conns: self.config.pool_options.max_total_conns, + })); + + // find or create a pool for this endpoint + let mut created = false; + let pool = self + .global_pool + .entry(endpoint.clone()) + .or_insert_with(|| { + created = true; + new_pool + }) + .clone(); + + // log new global pool size + if created { + let global_pool_size = self + .global_pool_size + .fetch_add(1, atomic::Ordering::Relaxed) + + 1; + info!( + "pool: created new pool for '{endpoint}', global pool size now {global_pool_size}" + ); + } + + pool + } + + pub(crate) fn get( + self: &Arc, + ctx: &RequestMonitoring, + conn_info: &ConnInfo, + ) -> Result>, HttpConnError> { + let mut client: Option> = None; + let Some(endpoint) = conn_info.endpoint_cache_key() else { + return Ok(None); + }; + + let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); + if let Some(entry) = endpoint_pool + .write() + .get_conn_entry(conn_info.db_and_user()) + { + client = Some(entry.conn); + } + let endpoint_pool = Arc::downgrade(&endpoint_pool); + + // ok return cached connection if found and establish a new one otherwise + if let Some(mut client) = client { + if client.is_closed() { + info!("pool: cached connection '{conn_info}' is closed, opening a new one"); + return Ok(None); + } + tracing::Span::current() + .record("conn_id", tracing::field::display(client.get_conn_id())); + tracing::Span::current().record( + "pid", + 
tracing::field::display(client.inner().get_process_id()), + ); + info!( + cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), + "pool: reusing connection '{conn_info}'" + ); + + client.session().send(ctx.session_id())?; + ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); + ctx.success(); + return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); + } + Ok(None) + } +} + +impl Client { + pub(crate) fn new( + inner: ClientInnerRemote, + conn_info: ConnInfo, + pool: Weak>>, + ) -> Self { + Self { + inner: Some(inner), + span: Span::current(), + conn_info, + pool, + } + } + + pub(crate) fn inner_mut(&mut self) -> (&mut C, Discard<'_, C>) { + let Self { + inner, + pool, + conn_info, + span: _, + } = self; + let inner = inner.as_mut().expect("client inner should not be removed"); + let inner_ref = inner.inner_mut(); + (inner_ref, Discard { conn_info, pool }) + } + + pub(crate) fn metrics(&self) -> Arc { + let aux = &self.inner.as_ref().unwrap().aux(); + USAGE_METRICS.register(Ids { + endpoint_id: aux.endpoint_id, + branch_id: aux.branch_id, + }) + } + + pub(crate) fn do_drop(&mut self) -> Option> { + let conn_info = self.conn_info.clone(); + let client = self + .inner + .take() + .expect("client inner should not be removed"); + if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() { + let current_span = self.span.clone(); + // return connection to the pool + return Some(move || { + let _span = current_span.enter(); + EndpointConnPool::put(&conn_pool, &conn_info, client); + }); + } + None + } +} + +pub(crate) struct Client { + span: Span, + inner: Option>, + conn_info: ConnInfo, + pool: Weak>>, +} + +impl Drop for Client { + fn drop(&mut self) { + if let Some(drop) = self.do_drop() { + tokio::task::spawn_blocking(drop); + } + } +} + +impl Deref for Client { + type Target = C; + + fn deref(&self) -> &Self::Target { + self.inner + .as_ref() + .expect("client inner should not be removed") + .inner() + } +} + +pub(crate) trait ClientInnerExt: 
Sync + Send + 'static { + fn is_closed(&self) -> bool; + fn get_process_id(&self) -> i32; +} + +impl ClientInnerExt for tokio_postgres::Client { + fn is_closed(&self) -> bool { + self.is_closed() + } + + fn get_process_id(&self) -> i32 { + self.get_process_id() + } +} + +pub(crate) struct Discard<'a, C: ClientInnerExt> { + conn_info: &'a ConnInfo, + pool: &'a mut Weak>>, +} + +impl Discard<'_, C> { + pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) { + let conn_info = &self.conn_info; + if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { + info!("pool: throwing away connection '{conn_info}' because connection is not idle"); + } + } + pub(crate) fn discard(&mut self) { + let conn_info = &self.conn_info; + if std::mem::take(self.pool).strong_count() > 0 { + info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state"); + } + } +} diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index 6d61536f1a..363e397976 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -1,37 +1,36 @@ +use std::collections::VecDeque; +use std::sync::atomic::{self, AtomicUsize}; +use std::sync::{Arc, Weak}; + use dashmap::DashMap; use hyper::client::conn::http2; use hyper_util::rt::{TokioExecutor, TokioIo}; use parking_lot::RwLock; use rand::Rng; -use std::collections::VecDeque; -use std::sync::atomic::{self, AtomicUsize}; -use std::{sync::Arc, sync::Weak}; use tokio::net::TcpStream; +use tracing::{debug, error, info, info_span, Instrument}; +use super::conn_pool_lib::{ClientInnerExt, ConnInfo}; +use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -use crate::{context::RequestMonitoring, EndpointCacheKey}; - -use tracing::{debug, error}; 
-use tracing::{info, info_span, Instrument}; - -use super::conn_pool::ConnInfo; +use crate::EndpointCacheKey; pub(crate) type Send = http2::SendRequest; pub(crate) type Connect = http2::Connection, hyper::body::Incoming, TokioExecutor>; #[derive(Clone)] -struct ConnPoolEntry { - conn: Send, +pub(crate) struct ConnPoolEntry { + conn: C, conn_id: uuid::Uuid, aux: MetricsAuxInfo, } // Per-endpoint connection pool // Number of open connections is limited by the `max_conns_per_endpoint`. -pub(crate) struct EndpointConnPool { +pub(crate) struct EndpointConnPool { // TODO(conrad): // either we should open more connections depending on stream count // (not exposed by hyper, need our own counter) @@ -41,13 +40,13 @@ pub(crate) struct EndpointConnPool { // seems somewhat redundant though. // // Probably we should run a semaphore and just the single conn. TBD. - conns: VecDeque, + conns: VecDeque>, _guard: HttpEndpointPoolsGuard<'static>, global_connections_count: Arc, } -impl EndpointConnPool { - fn get_conn_entry(&mut self) -> Option { +impl EndpointConnPool { + fn get_conn_entry(&mut self) -> Option> { let Self { conns, .. } = self; loop { @@ -82,7 +81,7 @@ impl EndpointConnPool { } } -impl Drop for EndpointConnPool { +impl Drop for EndpointConnPool { fn drop(&mut self) { if !self.conns.is_empty() { self.global_connections_count @@ -96,12 +95,12 @@ impl Drop for EndpointConnPool { } } -pub(crate) struct GlobalConnPool { +pub(crate) struct GlobalConnPool { // endpoint -> per-endpoint connection pool // // That should be a fairly conteded map, so return reference to the per-endpoint // pool as early as possible and release the lock. 
- global_pool: DashMap>>, + global_pool: DashMap>>>, /// Number of endpoint-connection pools /// @@ -116,7 +115,7 @@ pub(crate) struct GlobalConnPool { config: &'static crate::config::HttpConfig, } -impl GlobalConnPool { +impl GlobalConnPool { pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { let shards = config.pool_options.pool_shards; Arc::new(Self { @@ -211,7 +210,7 @@ impl GlobalConnPool { self: &Arc, ctx: &RequestMonitoring, conn_info: &ConnInfo, - ) -> Option { + ) -> Option> { let endpoint = conn_info.endpoint_cache_key()?; let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); let client = endpoint_pool.write().get_conn_entry()?; @@ -229,7 +228,7 @@ impl GlobalConnPool { fn get_or_create_endpoint_pool( self: &Arc, endpoint: &EndpointCacheKey, - ) -> Arc> { + ) -> Arc>> { // fast path if let Some(pool) = self.global_pool.get(endpoint) { return pool.clone(); @@ -269,14 +268,14 @@ impl GlobalConnPool { } pub(crate) fn poll_http2_client( - global_pool: Arc, + global_pool: Arc>, ctx: &RequestMonitoring, conn_info: &ConnInfo, client: Send, connection: Connect, conn_id: uuid::Uuid, aux: MetricsAuxInfo, -) -> Client { +) -> Client { let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol()); let session_id = ctx.session_id(); @@ -323,13 +322,13 @@ pub(crate) fn poll_http2_client( Client::new(client, aux) } -pub(crate) struct Client { - pub(crate) inner: Send, +pub(crate) struct Client { + pub(crate) inner: C, aux: MetricsAuxInfo, } -impl Client { - pub(self) fn new(inner: Send, aux: MetricsAuxInfo) -> Self { +impl Client { + pub(self) fn new(inner: C, aux: MetricsAuxInfo) -> Self { Self { inner, aux } } @@ -340,3 +339,14 @@ impl Client { }) } } + +impl ClientInnerExt for Send { + fn is_closed(&self) -> bool { + self.is_closed() + } + + fn get_process_id(&self) -> i32 { + // ideally throw something meaningful + -1 + } +} diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs index 
87a72ec5f0..c0208d4f68 100644 --- a/proxy/src/serverless/http_util.rs +++ b/proxy/src/serverless/http_util.rs @@ -1,12 +1,11 @@ //! Things stolen from `libs/utils/src/http` to add hyper 1.0 compatibility //! Will merge back in at some point in the future. -use bytes::Bytes; - use anyhow::Context; +use bytes::Bytes; use http::{Response, StatusCode}; -use http_body_util::{combinators::BoxBody, BodyExt, Full}; - +use http_body_util::combinators::BoxBody; +use http_body_util::{BodyExt, Full}; use serde::Serialize; use utils::http::error::ApiError; @@ -41,6 +40,10 @@ pub(crate) fn api_error_into_response(this: ApiError) -> Response HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::TOO_MANY_REQUESTS, + ), ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::REQUEST_TIMEOUT, diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index 9f328a0e1d..569e2da571 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -1,7 +1,5 @@ -use serde_json::Map; -use serde_json::Value; -use tokio_postgres::types::Kind; -use tokio_postgres::types::Type; +use serde_json::{Map, Value}; +use tokio_postgres::types::{Kind, Type}; use tokio_postgres::Row; // @@ -157,10 +155,10 @@ fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result Result { - _pg_array_parse(pg_array, elem_type, false).map(|(v, _)| v) + pg_array_parse_inner(pg_array, elem_type, false).map(|(v, _)| v) } -fn _pg_array_parse( +fn pg_array_parse_inner( pg_array: &str, elem_type: &Type, nested: bool, @@ -213,7 +211,7 @@ fn _pg_array_parse( '{' if !quote => { level += 1; if level > 1 { - let (res, off) = _pg_array_parse(&pg_array[i..], elem_type, true)?; + let (res, off) = pg_array_parse_inner(&pg_array[i..], elem_type, true)?; entries.push(res); for _ in 0..off - 1 { pg_array_chr.next(); @@ -256,9 +254,10 @@ fn _pg_array_parse( #[cfg(test)] mod tests { - use super::*; use serde_json::json; + use 
super::*; + #[test] fn test_atomic_types_to_pg_params() { let json = vec![Value::Bool(true), Value::Bool(false)]; diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 1dde5952e1..beb2ad4e8f 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -1,38 +1,52 @@ -use futures::{future::poll_fn, Future}; +//! Manages the pool of connections between local_proxy and postgres. +//! +//! The pool is keyed by database and role_name, and can contain multiple connections +//! shared between users. +//! +//! The pool manages the pg_session_jwt extension used for authorizing +//! requests in the db. +//! +//! The first time a db/role pair is seen, local_proxy attempts to install the extension +//! and grant usage to the role on the given schema. + +use std::collections::HashMap; +use std::pin::pin; +use std::sync::{Arc, Weak}; +use std::task::{ready, Poll}; +use std::time::Duration; + +use futures::future::poll_fn; +use futures::Future; +use indexmap::IndexMap; use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; use p256::ecdsa::{Signature, SigningKey}; use parking_lot::RwLock; -use rand::rngs::OsRng; -use serde_json::Value; +use serde_json::value::RawValue; use signature::Signer; -use std::task::{ready, Poll}; -use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration}; use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; use tokio_postgres::types::ToSql; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; use tokio_util::sync::CancellationToken; -use typed_json::json; +use tracing::{error, info, info_span, warn, Instrument, Span}; +use super::backend::HttpConnError; +use super::conn_pool_lib::{ClientInnerExt, ConnInfo}; +use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -use 
crate::{context::RequestMonitoring, DbName, RoleName}; +use crate::{DbName, RoleName}; -use tracing::{debug, error, warn, Span}; -use tracing::{info, info_span, Instrument}; - -use super::backend::HttpConnError; -use super::conn_pool::{ClientInnerExt, ConnInfo}; +pub(crate) const EXT_NAME: &str = "pg_session_jwt"; +pub(crate) const EXT_VERSION: &str = "0.1.1"; +pub(crate) const EXT_SCHEMA: &str = "auth"; struct ConnPoolEntry { conn: ClientInner, _last_access: std::time::Instant, } -// /// key id for the pg_session_jwt state -// static PG_SESSION_JWT_KID: AtomicU64 = AtomicU64::new(1); - // Per-endpoint connection pool, (dbname, username) -> DbUserConnPool // Number of open connections is limited by the `max_conns_per_endpoint`. pub(crate) struct EndpointConnPool { @@ -138,11 +152,18 @@ impl Drop for EndpointConnPool { pub(crate) struct DbUserConnPool { conns: Vec>, + + // true if we have definitely installed the extension and + // granted the role access to the auth schema. + initialized: bool, } impl Default for DbUserConnPool { fn default() -> Self { - Self { conns: Vec::new() } + Self { + conns: Vec::new(), + initialized: false, + } } } @@ -197,25 +218,16 @@ impl LocalConnPool { self.config.pool_options.idle_timeout } - // pub(crate) fn shutdown(&self) { - // let mut pool = self.global_pool.write(); - // pool.pools.clear(); - // pool.total_conns = 0; - // } - pub(crate) fn get( self: &Arc, ctx: &RequestMonitoring, conn_info: &ConnInfo, ) -> Result>, HttpConnError> { - let mut client: Option> = None; - if let Some(entry) = self + let client = self .global_pool .write() .get_conn_entry(conn_info.db_and_user()) - { - client = Some(entry.conn); - } + .map(|entry| entry.conn); // ok return cached connection if found and establish a new one otherwise if let Some(client) = client { @@ -243,14 +255,33 @@ impl LocalConnPool { } Ok(None) } + + pub(crate) fn initialized(self: &Arc, conn_info: &ConnInfo) -> bool { + self.global_pool + .read() + .pools + 
.get(&conn_info.db_and_user()) + .map_or(false, |pool| pool.initialized) + } + + pub(crate) fn set_initialized(self: &Arc, conn_info: &ConnInfo) { + self.global_pool + .write() + .pools + .entry(conn_info.db_and_user()) + .or_default() + .initialized = true; + } } +#[allow(clippy::too_many_arguments)] pub(crate) fn poll_client( global_pool: Arc>, ctx: &RequestMonitoring, conn_info: ConnInfo, client: tokio_postgres::Client, mut connection: tokio_postgres::Connection, + key: SigningKey, conn_id: uuid::Uuid, aux: MetricsAuxInfo, ) -> LocalClient { @@ -346,8 +377,6 @@ pub(crate) fn poll_client( } .instrument(span)); - let key = SigningKey::random(&mut OsRng); - let inner = ClientInner { inner: client, session: tx, @@ -360,7 +389,7 @@ pub(crate) fn poll_client( LocalClient::new(inner, conn_info, pool_clone) } -struct ClientInner { +pub(crate) struct ClientInner { inner: C, session: tokio::sync::watch::Sender, cancel: CancellationToken, @@ -385,13 +414,24 @@ impl ClientInner { } } -impl LocalClient { - pub(crate) fn metrics(&self) -> Arc { - let aux = &self.inner.as_ref().unwrap().aux; - USAGE_METRICS.register(Ids { - endpoint_id: aux.endpoint_id, - branch_id: aux.branch_id, - }) +impl ClientInner { + pub(crate) async fn set_jwt_session(&mut self, payload: &[u8]) -> Result<(), HttpConnError> { + self.jti += 1; + let token = resign_jwt(&self.key, payload, self.jti)?; + + // initiates the auth session + self.inner.simple_query("discard all").await?; + self.inner + .query( + "select auth.jwt_session_init($1)", + &[&token as &(dyn ToSql + Sync)], + ) + .await?; + + let pid = self.inner.get_process_id(); + info!(pid, jti = self.jti, "user session state init"); + + Ok(()) } } @@ -420,6 +460,18 @@ impl LocalClient { pool, } } + + pub(crate) fn client_inner(&mut self) -> (&mut ClientInner, Discard<'_, C>) { + let Self { + inner, + pool, + conn_info, + span: _, + } = self; + let inner_m = inner.as_mut().expect("client inner should not be removed"); + (inner_m, Discard { 
conn_info, pool }) + } + pub(crate) fn inner(&mut self) -> (&mut C, Discard<'_, C>) { let Self { inner, @@ -430,94 +482,81 @@ impl LocalClient { let inner = inner.as_mut().expect("client inner should not be removed"); (&mut inner.inner, Discard { conn_info, pool }) } - pub(crate) fn key(&self) -> &SigningKey { - let inner = &self - .inner - .as_ref() - .expect("client inner should not be removed"); - &inner.key - } } -impl LocalClient { - pub(crate) async fn set_jwt_session(&mut self, payload: &[u8]) -> Result<(), HttpConnError> { - let inner = self - .inner - .as_mut() - .expect("client inner should not be removed"); - inner.jti += 1; - - let kid = inner.inner.get_process_id(); - let header = json!({"kid":kid}).to_string(); - - let mut payload = serde_json::from_slice::>(payload) - .map_err(HttpConnError::JwtPayloadError)?; - payload.insert("jti".to_string(), Value::Number(inner.jti.into())); - let payload = Value::Object(payload).to_string(); - - debug!( - kid, - jti = inner.jti, - ?header, - ?payload, - "signing new ephemeral JWT" - ); - - let token = sign_jwt(&inner.key, header, payload); - - // initiates the auth session - inner.inner.simple_query("discard all").await?; - inner - .inner - .query( - "select auth.jwt_session_init($1)", - &[&token as &(dyn ToSql + Sync)], - ) - .await?; - - info!(kid, jti = inner.jti, "user session state init"); - - Ok(()) - } +/// implements relatively efficient in-place json object key upserting +/// +/// only supports top-level keys +fn upsert_json_object( + payload: &[u8], + key: &str, + value: &RawValue, +) -> Result { + let mut payload = serde_json::from_slice::>(payload)?; + payload.insert(key, value); + serde_json::to_string(&payload) } -fn sign_jwt(sk: &SigningKey, header: String, payload: String) -> String { - let header = Base64UrlUnpadded::encode_string(header.as_bytes()); - let payload = Base64UrlUnpadded::encode_string(payload.as_bytes()); +fn resign_jwt(sk: &SigningKey, payload: &[u8], jti: u64) -> Result { + let 
mut buffer = itoa::Buffer::new(); - let message = format!("{header}.{payload}"); - let sig: Signature = sk.sign(message.as_bytes()); - let base64_sig = Base64UrlUnpadded::encode_string(&sig.to_bytes()); - format!("{message}.{base64_sig}") + // encode the jti integer to a json rawvalue + let jti = serde_json::from_str::<&RawValue>(buffer.format(jti)).unwrap(); + + // update the jti in-place + let payload = + upsert_json_object(payload, "jti", jti).map_err(HttpConnError::JwtPayloadError)?; + + // sign the jwt + let token = sign_jwt(sk, payload.as_bytes()); + + Ok(token) } -impl Discard<'_, C> { - pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) { - let conn_info = &self.conn_info; - if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { - info!( - "local_pool: throwing away connection '{conn_info}' because connection is not idle" - ); - } - } - pub(crate) fn discard(&mut self) { - let conn_info = &self.conn_info; - if std::mem::take(self.pool).strong_count() > 0 { - info!("local_pool: throwing away connection '{conn_info}' because connection is potentially in a broken state"); - } - } +fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String { + let header_len = 20; + let payload_len = Base64UrlUnpadded::encoded_len(payload); + let signature_len = Base64UrlUnpadded::encoded_len(&[0; 64]); + let total_len = header_len + payload_len + signature_len + 2; + + let mut jwt = String::with_capacity(total_len); + let cap = jwt.capacity(); + + // we only need an empty header with the alg specified. 
+ // base64url(r#"{"alg":"ES256"}"#) == "eyJhbGciOiJFUzI1NiJ9" + jwt.push_str("eyJhbGciOiJFUzI1NiJ9."); + + // encode the jwt payload in-place + base64::encode_config_buf(payload, base64::URL_SAFE_NO_PAD, &mut jwt); + + // create the signature from the encoded header || payload + let sig: Signature = sk.sign(jwt.as_bytes()); + + jwt.push('.'); + + // encode the jwt signature in-place + base64::encode_config_buf(sig.to_bytes(), base64::URL_SAFE_NO_PAD, &mut jwt); + + debug_assert_eq!( + jwt.len(), + total_len, + "the jwt len should match our expected len" + ); + debug_assert_eq!(jwt.capacity(), cap, "the jwt capacity should not change"); + + jwt } impl LocalClient { - pub fn get_client(&self) -> &C { - &self - .inner - .as_ref() - .expect("client inner should not be removed") - .inner + pub(crate) fn metrics(&self) -> Arc { + let aux = &self.inner.as_ref().unwrap().aux; + USAGE_METRICS.register(Ids { + endpoint_id: aux.endpoint_id, + branch_id: aux.branch_id, + }) } - fn do_drop(&mut self) -> Option { + fn do_drop(&mut self) -> Option> { let conn_info = self.conn_info.clone(); let client = self .inner @@ -542,3 +581,47 @@ impl Drop for LocalClient { } } } + +impl Discard<'_, C> { + pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) { + let conn_info = &self.conn_info; + if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { + info!( + "local_pool: throwing away connection '{conn_info}' because connection is not idle" + ); + } + } + pub(crate) fn discard(&mut self) { + let conn_info = &self.conn_info; + if std::mem::take(self.pool).strong_count() > 0 { + info!("local_pool: throwing away connection '{conn_info}' because connection is potentially in a broken state"); + } + } +} + +#[cfg(test)] +mod tests { + use p256::ecdsa::SigningKey; + use typed_json::json; + + use super::resign_jwt; + + #[test] + fn jwt_token_snapshot() { + let key = SigningKey::from_bytes(&[1; 32].into()).unwrap(); + let data = + 
json!({"foo":"bar","jti":"foo\nbar","nested":{"jti":"tricky nesting"}}).to_string(); + + let jwt = resign_jwt(&key, data.as_bytes(), 2).unwrap(); + + // To validate the JWT, copy the JWT string and paste it into https://jwt.io/. + // In the public-key box, paste the following jwk public key + // `{"kty":"EC","crv":"P-256","x":"b_A7lJJBzh2t1DUZ5pYOCoW0GmmgXDKBA6orzhWUyhY","y":"PE91OlW_AdxT9sCwx-7ni0DG_30lqW4igrmJzvccFEo"}` + + // let pub_key = p256::ecdsa::VerifyingKey::from(&key); + // let pub_key = p256::PublicKey::from(pub_key); + // println!("{}", pub_key.to_jwk_string()); + + assert_eq!(jwt, "eyJhbGciOiJFUzI1NiJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.pYf0LxoJ8sDgpmsYOgrbNecOSipnPBEGwnZzB-JhW2cONrKlqRsgXwK8_cOsyolGy-hTTe8GXbWTl_UdpF5RyA"); + } +} diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 95f64e972c..29ff7b9d91 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -5,6 +5,7 @@ mod backend; pub mod cancel_set; mod conn_pool; +mod conn_pool_lib; mod http_conn_pool; mod http_util; mod json; @@ -12,12 +13,15 @@ mod local_conn_pool; mod sql_over_http; mod websocket; +use std::net::{IpAddr, SocketAddr}; +use std::pin::{pin, Pin}; +use std::sync::Arc; + +use anyhow::Context; use async_trait::async_trait; use atomic_take::AtomicTake; use bytes::Bytes; -pub use conn_pool::GlobalConnPoolOptions; - -use anyhow::Context; +pub use conn_pool_lib::GlobalConnPoolOptions; use futures::future::{select, Either}; use futures::TryFutureExt; use http::{Method, Response, StatusCode}; @@ -29,9 +33,13 @@ use hyper_util::server::conn::auto::Builder; use rand::rngs::StdRng; use rand::SeedableRng; use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::net::{TcpListener, TcpStream}; use tokio::time::timeout; use tokio_rustls::TlsAcceptor; +use tokio_util::sync::CancellationToken; use tokio_util::task::TaskTracker; +use tracing::{info, warn, Instrument}; +use utils::http::error::ApiError; use 
crate::cancellation::CancellationHandlerMain; use crate::config::ProxyConfig; @@ -43,19 +51,11 @@ use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; use crate::serverless::http_util::{api_error_into_response, json_response}; -use std::net::{IpAddr, SocketAddr}; -use std::pin::{pin, Pin}; -use std::sync::Arc; -use tokio::net::{TcpListener, TcpStream}; -use tokio_util::sync::CancellationToken; -use tracing::{info, warn, Instrument}; -use utils::http::error::ApiError; - pub(crate) const SERVERLESS_DRIVER_SNI: &str = "api"; pub async fn task_main( config: &'static ProxyConfig, - auth_backend: &'static crate::auth::Backend<'static, (), ()>, + auth_backend: &'static crate::auth::Backend<'static, ()>, ws_listener: TcpListener, cancellation_token: CancellationToken, cancellation_handler: Arc, @@ -66,7 +66,7 @@ pub async fn task_main( } let local_pool = local_conn_pool::LocalConnPool::new(&config.http_config); - let conn_pool = conn_pool::GlobalConnPool::new(&config.http_config); + let conn_pool = conn_pool_lib::GlobalConnPool::new(&config.http_config); { let conn_pool = Arc::clone(&conn_pool); tokio::spawn(async move { diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index cf3324926c..6fbb044669 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -2,77 +2,44 @@ use std::pin::pin; use std::sync::Arc; use bytes::Bytes; -use futures::future::select; -use futures::future::try_join; -use futures::future::Either; -use futures::StreamExt; -use futures::TryFutureExt; +use futures::future::{select, try_join, Either}; +use futures::{StreamExt, TryFutureExt}; use http::header::AUTHORIZATION; use http::Method; use http_body_util::combinators::BoxBody; -use http_body_util::BodyExt; -use http_body_util::Full; -use hyper::body::Body; -use hyper::body::Incoming; -use hyper::header; -use hyper::http::HeaderName; -use hyper::http::HeaderValue; -use 
hyper::Response; -use hyper::StatusCode; -use hyper::{HeaderMap, Request}; +use http_body_util::{BodyExt, Full}; +use hyper::body::{Body, Incoming}; +use hyper::http::{HeaderName, HeaderValue}; +use hyper::{header, HeaderMap, Request, Response, StatusCode}; use pq_proto::StartupMessageParamsBuilder; use serde::Serialize; use serde_json::Value; use tokio::time; -use tokio_postgres::error::DbError; -use tokio_postgres::error::ErrorPosition; -use tokio_postgres::error::SqlState; -use tokio_postgres::GenericClient; -use tokio_postgres::IsolationLevel; -use tokio_postgres::NoTls; -use tokio_postgres::ReadyForQueryStatus; -use tokio_postgres::Transaction; +use tokio_postgres::error::{DbError, ErrorPosition, SqlState}; +use tokio_postgres::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction}; use tokio_util::sync::CancellationToken; -use tracing::error; -use tracing::info; +use tracing::{error, info}; use typed_json::json; use url::Url; use urlencoding; use utils::http::error::ApiError; -use crate::auth::backend::ComputeCredentialKeys; -use crate::auth::backend::ComputeUserInfo; -use crate::auth::endpoint_sni; -use crate::auth::ComputeUserInfoParseError; -use crate::config::AuthenticationConfig; -use crate::config::HttpConfig; -use crate::config::ProxyConfig; -use crate::config::TlsConfig; -use crate::context::RequestMonitoring; -use crate::error::ErrorKind; -use crate::error::ReportableError; -use crate::error::UserFacingError; -use crate::metrics::HttpDirection; -use crate::metrics::Metrics; -use crate::proxy::run_until_cancelled; -use crate::proxy::NeonOptions; -use crate::serverless::backend::HttpConnError; -use crate::usage_metrics::MetricCounter; -use crate::usage_metrics::MetricCounterRecorder; -use crate::DbName; -use crate::RoleName; - -use super::backend::LocalProxyConnError; -use super::backend::PoolingBackend; -use super::conn_pool; -use super::conn_pool::AuthData; -use super::conn_pool::ConnInfo; -use super::conn_pool::ConnInfoWithAuth; 
+use super::backend::{LocalProxyConnError, PoolingBackend}; +use super::conn_pool::{AuthData, ConnInfoWithAuth}; +use super::conn_pool_lib::{self, ConnInfo}; use super::http_util::json_response; -use super::json::json_to_pg_text; -use super::json::pg_text_row_to_json; -use super::json::JsonConversionError; +use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError}; use super::local_conn_pool; +use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; +use crate::auth::{endpoint_sni, ComputeUserInfoParseError}; +use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig}; +use crate::context::RequestMonitoring; +use crate::error::{ErrorKind, ReportableError, UserFacingError}; +use crate::metrics::{HttpDirection, Metrics}; +use crate::proxy::{run_until_cancelled, NeonOptions}; +use crate::serverless::backend::HttpConnError; +use crate::usage_metrics::{MetricCounter, MetricCounterRecorder}; +use crate::{DbName, RoleName}; #[derive(serde::Deserialize)] #[serde(rename_all = "camelCase")] @@ -641,7 +608,8 @@ async fn handle_db_inner( let client = match keys.keys { ComputeCredentialKeys::JwtPayload(payload) if is_local_proxy => { let mut client = backend.connect_to_local_postgres(ctx, conn_info).await?; - client.set_jwt_session(&payload).await?; + let (cli_inner, _dsc) = client.client_inner(); + cli_inner.set_jwt_session(&payload).await?; Client::Local(client) } _ => { @@ -1055,12 +1023,12 @@ async fn query_to_json( } enum Client { - Remote(conn_pool::Client), + Remote(conn_pool_lib::Client), Local(local_conn_pool::LocalClient), } enum Discard<'a> { - Remote(conn_pool::Discard<'a, tokio_postgres::Client>), + Remote(conn_pool_lib::Discard<'a, tokio_postgres::Client>), Local(local_conn_pool::Discard<'a, tokio_postgres::Client>), } @@ -1075,7 +1043,7 @@ impl Client { fn inner(&mut self) -> (&mut tokio_postgres::Client, Discard<'_>) { match self { Client::Remote(client) => { - let (c, d) = client.inner(); + let (c, d) = 
client.inner_mut(); (c, Discard::Remote(d)) } Client::Local(local_client) => { diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index fd0f0cac7f..ba36116c2c 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -1,13 +1,7 @@ -use crate::proxy::ErrorSource; -use crate::{ - cancellation::CancellationHandlerMain, - config::ProxyConfig, - context::RequestMonitoring, - error::{io_error, ReportableError}, - metrics::Metrics, - proxy::{handle_client, ClientMode}, - rate_limiter::EndpointRateLimiter, -}; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{ready, Context, Poll}; + use anyhow::Context as _; use bytes::{Buf, BufMut, Bytes, BytesMut}; use framed_websockets::{Frame, OpCode, WebSocketServer}; @@ -15,15 +9,17 @@ use futures::{Sink, Stream}; use hyper::upgrade::OnUpgrade; use hyper_util::rt::TokioIo; use pin_project_lite::pin_project; - -use std::{ - pin::Pin, - sync::Arc, - task::{ready, Context, Poll}, -}; use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf}; use tracing::warn; +use crate::cancellation::CancellationHandlerMain; +use crate::config::ProxyConfig; +use crate::context::RequestMonitoring; +use crate::error::{io_error, ReportableError}; +use crate::metrics::Metrics; +use crate::proxy::{handle_client, ClientMode, ErrorSource}; +use crate::rate_limiter::EndpointRateLimiter; + pin_project! { /// This is a wrapper around a [`WebSocketStream`] that /// implements [`AsyncRead`] and [`AsyncWrite`]. 
@@ -129,7 +125,7 @@ impl AsyncBufRead for WebSocketRw { pub(crate) async fn serve_websocket( config: &'static ProxyConfig, - auth_backend: &'static crate::auth::Backend<'static, (), ()>, + auth_backend: &'static crate::auth::Backend<'static, ()>, ctx: RequestMonitoring, websocket: OnUpgrade, cancellation_handler: Arc, @@ -184,14 +180,11 @@ mod tests { use framed_websockets::WebSocketServer; use futures::{SinkExt, StreamExt}; - use tokio::{ - io::{duplex, AsyncReadExt, AsyncWriteExt}, - task::JoinSet, - }; - use tokio_tungstenite::{ - tungstenite::{protocol::Role, Message}, - WebSocketStream, - }; + use tokio::io::{duplex, AsyncReadExt, AsyncWriteExt}; + use tokio::task::JoinSet; + use tokio_tungstenite::tungstenite::protocol::Role; + use tokio_tungstenite::tungstenite::Message; + use tokio_tungstenite::WebSocketStream; use super::WebSocketRw; diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index e2fc73235e..89df48c5d3 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,19 +1,20 @@ -use crate::config::TlsServerEndPoint; -use crate::error::{ErrorKind, ReportableError, UserFacingError}; -use crate::metrics::Metrics; -use bytes::BytesMut; - -use pq_proto::framed::{ConnectionError, Framed}; -use pq_proto::{BeMessage, FeMessage, FeStartupPacket, ProtocolError}; -use rustls::ServerConfig; use std::pin::Pin; use std::sync::Arc; use std::{io, task}; + +use bytes::BytesMut; +use pq_proto::framed::{ConnectionError, Framed}; +use pq_proto::{BeMessage, FeMessage, FeStartupPacket, ProtocolError}; +use rustls::ServerConfig; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; use tokio_rustls::server::TlsStream; use tracing::debug; +use crate::config::TlsServerEndPoint; +use crate::error::{ErrorKind, ReportableError, UserFacingError}; +use crate::metrics::Metrics; + /// Stream wrapper which implements libpq's protocol. 
/// /// NOTE: This object deliberately doesn't implement [`AsyncRead`] diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index ee36ed462d..f944d5aec3 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -1,36 +1,33 @@ //! Periodically collect proxy consumption metrics //! and push them to a HTTP endpoint. -use crate::{ - config::{MetricBackupCollectionConfig, MetricCollectionConfig}, - context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}, - http, - intern::{BranchIdInt, EndpointIdInt}, -}; +use std::convert::Infallible; +use std::pin::pin; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; +use std::sync::Arc; +use std::time::Duration; + use anyhow::Context; use async_compression::tokio::write::GzipEncoder; use bytes::Bytes; use chrono::{DateTime, Datelike, Timelike, Utc}; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; -use dashmap::{mapref::entry::Entry, DashMap}; +use dashmap::mapref::entry::Entry; +use dashmap::DashMap; use futures::future::select; use once_cell::sync::Lazy; use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use serde::{Deserialize, Serialize}; -use std::{ - convert::Infallible, - pin::pin, - sync::{ - atomic::{AtomicU64, AtomicUsize, Ordering}, - Arc, - }, - time::Duration, -}; use tokio::io::AsyncWriteExt; use tokio_util::sync::CancellationToken; use tracing::{error, info, instrument, trace, warn}; use utils::backoff; use uuid::{NoContext, Timestamp}; +use crate::config::{MetricBackupCollectionConfig, MetricCollectionConfig}; +use crate::context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}; +use crate::http; +use crate::intern::{BranchIdInt, EndpointIdInt}; + const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; const HTTP_REPORTING_REQUEST_TIMEOUT: Duration = Duration::from_secs(10); @@ -378,7 +375,7 @@ pub async fn task_backup( let now = Utc::now(); 
collect_metrics_backup_iteration( &USAGE_METRICS.backup_endpoints, - &storage, + storage.as_ref(), &hostname, prev, now, @@ -398,7 +395,7 @@ pub async fn task_backup( #[instrument(skip_all)] async fn collect_metrics_backup_iteration( endpoints: &DashMap, FastHasher>, - storage: &Option, + storage: Option<&GenericRemoteStorage>, hostname: &str, prev: DateTime, now: DateTime, @@ -449,7 +446,7 @@ async fn collect_metrics_backup_iteration( } async fn upload_events_chunk( - storage: &Option, + storage: Option<&GenericRemoteStorage>, chunk: EventChunk<'_, Event>, remote_path: &RemotePath, cancel: &CancellationToken, @@ -485,19 +482,23 @@ async fn upload_events_chunk( #[cfg(test)] mod tests { - use super::*; + use std::sync::{Arc, Mutex}; - use crate::{http, BranchId, EndpointId}; use anyhow::Error; use chrono::Utc; use consumption_metrics::{Event, EventChunk}; use http_body_util::BodyExt; - use hyper::{body::Incoming, server::conn::http1, service::service_fn, Request, Response}; + use hyper::body::Incoming; + use hyper::server::conn::http1; + use hyper::service::service_fn; + use hyper::{Request, Response}; use hyper_util::rt::TokioIo; - use std::sync::{Arc, Mutex}; use tokio::net::TcpListener; use url::Url; + use super::*; + use crate::{http, BranchId, EndpointId}; + #[tokio::test] async fn metrics() { type Report = EventChunk<'static, Event>; @@ -576,10 +577,10 @@ mod tests { // counter is unregistered assert!(metrics.endpoints.is_empty()); - collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000) + collect_metrics_backup_iteration(&metrics.backup_endpoints, None, "foo", now, now, 1000) .await; assert!(!metrics.backup_endpoints.is_empty()); - collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000) + collect_metrics_backup_iteration(&metrics.backup_endpoints, None, "foo", now, now, 1000) .await; // backup counter is unregistered after the second iteration assert!(metrics.backup_endpoints.is_empty()); 
diff --git a/proxy/src/waiters.rs b/proxy/src/waiters.rs index 86d0f9e8b2..330e73f02f 100644 --- a/proxy/src/waiters.rs +++ b/proxy/src/waiters.rs @@ -1,8 +1,9 @@ +use std::pin::Pin; +use std::task; + use hashbrown::HashMap; use parking_lot::Mutex; use pin_project_lite::pin_project; -use std::pin::Pin; -use std::task; use thiserror::Error; use tokio::sync::oneshot; @@ -72,7 +73,7 @@ struct DropKey<'a, T> { registry: &'a Waiters, } -impl<'a, T> Drop for DropKey<'a, T> { +impl Drop for DropKey<'_, T> { fn drop(&mut self) { self.registry.0.lock().remove(&self.key); } @@ -99,9 +100,10 @@ impl std::future::Future for Waiter<'_, T> { #[cfg(test)] mod tests { - use super::*; use std::sync::Arc; + use super::*; + #[tokio::test] async fn test_waiter() -> anyhow::Result<()> { let waiters = Arc::new(Waiters::default()); diff --git a/pyproject.toml b/pyproject.toml index 9cd315bb96..862ed49638 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,7 @@ kafka-python = "^2.0.2" [tool.poetry.group.dev.dependencies] mypy = "==1.3.0" -ruff = "^0.2.2" +ruff = "^0.7.0" [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 3c5d0b12a6..92b7929c7f 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.81.0" +channel = "1.82.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index e35f806e90..2a9ca85299 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -498,21 +498,18 @@ impl WalAcceptor { // we will send keepalives by replying to these requests once per second. 
let mut next_keepalive = Instant::now(); - loop { - let opt_msg = self.msg_rx.recv().await; - if opt_msg.is_none() { - return Ok(()); // chan closed, streaming terminated - } - let mut next_msg = opt_msg.unwrap(); - + while let Some(mut next_msg) = self.msg_rx.recv().await { // Update walreceiver state in shmem for reporting. if let ProposerAcceptorMessage::Elected(_) = &next_msg { walreceiver_guard.get().status = WalReceiverStatus::Streaming; } let reply_msg = if matches!(next_msg, ProposerAcceptorMessage::AppendRequest(_)) { - // loop through AppendRequest's while it's readily available to - // write as many WAL as possible without fsyncing + // Loop through AppendRequests while available to write as many WAL records as + // possible without fsyncing. + // + // Make sure the WAL is flushed before returning, see: + // https://github.com/neondatabase/neon/issues/9259 // // Note: this will need to be rewritten if we want to read non-AppendRequest messages here. // Otherwise, we might end up in a situation where we read a message, but don't @@ -522,7 +519,7 @@ impl WalAcceptor { if let Some(reply) = self.tli.process_msg(&noflush_msg).await? 
{ if self.reply_tx.send(reply).await.is_err() { - return Ok(()); // chan closed, streaming terminated + break; // disconnected, flush WAL and return on next send/recv } } @@ -531,11 +528,13 @@ impl WalAcceptor { break; } + // continue pulling AppendRequests if available match self.msg_rx.try_recv() { Ok(msg) => next_msg = msg, Err(TryRecvError::Empty) => break, - Err(TryRecvError::Disconnected) => return Ok(()), // chan closed, streaming terminated - } + // on disconnect, flush WAL and return on next send/recv + Err(TryRecvError::Disconnected) => break, + }; } // flush all written WAL to the disk @@ -555,5 +554,6 @@ impl WalAcceptor { next_keepalive = Instant::now() + KEEPALIVE_INTERVAL; } } + Ok(()) } } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 3494b0b764..41b9490088 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -122,7 +122,7 @@ impl<'a> WriteGuardSharedState<'a> { } } -impl<'a> Deref for WriteGuardSharedState<'a> { +impl Deref for WriteGuardSharedState<'_> { type Target = SharedState; fn deref(&self) -> &Self::Target { @@ -130,13 +130,13 @@ impl<'a> Deref for WriteGuardSharedState<'a> { } } -impl<'a> DerefMut for WriteGuardSharedState<'a> { +impl DerefMut for WriteGuardSharedState<'_> { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.guard } } -impl<'a> Drop for WriteGuardSharedState<'a> { +impl Drop for WriteGuardSharedState<'_> { fn drop(&mut self) { let term_flush_lsn = TermLsn::from((self.guard.sk.last_log_term(), self.guard.sk.flush_lsn())); diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index bafae1f551..b63a322b87 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -28,7 +28,7 @@ struct UnshardedComputeHookTenant { node_id: NodeId, // Must hold this lock to send a notification. 
- send_lock: Arc>>, + send_lock: Arc>>, } struct ShardedComputeHookTenant { stripe_size: ShardStripeSize, @@ -38,7 +38,22 @@ struct ShardedComputeHookTenant { // Must hold this lock to send a notification. The contents represent // the last successfully sent notification, and are used to coalesce multiple // updates by only sending when there is a chance since our last successful send. - send_lock: Arc>>, + send_lock: Arc>>, +} + +/// Represents our knowledge of the compute's state: we can update this when we get a +/// response from a notify API call, which tells us what has been applied. +/// +/// Should be wrapped in an Option<>, as we cannot always know the remote state. +#[derive(PartialEq, Eq, Debug)] +struct ComputeRemoteState { + // The request body which was acked by the compute + request: ComputeHookNotifyRequest, + + // Whether the cplane indicated that the state was applied to running computes, or just + // persisted. In the Neon control plane, this is the difference between a 423 response (meaning + // persisted but not applied), and a 2xx response (both persisted and applied) + applied: bool, } enum ComputeHookTenant { @@ -64,7 +79,7 @@ impl ComputeHookTenant { } } - fn get_send_lock(&self) -> &Arc>> { + fn get_send_lock(&self) -> &Arc>> { match self { Self::Unsharded(unsharded_tenant) => &unsharded_tenant.send_lock, Self::Sharded(sharded_tenant) => &sharded_tenant.send_lock, @@ -188,11 +203,11 @@ enum MaybeSendResult { Transmit( ( ComputeHookNotifyRequest, - tokio::sync::OwnedMutexGuard>, + tokio::sync::OwnedMutexGuard>, ), ), // Something requires sending, but you must wait for a current sender then call again - AwaitLock(Arc>>), + AwaitLock(Arc>>), // Nothing requires sending Noop, } @@ -201,7 +216,7 @@ impl ComputeHookTenant { fn maybe_send( &self, tenant_id: TenantId, - lock: Option>>, + lock: Option>>, ) -> MaybeSendResult { let locked = match lock { Some(already_locked) => already_locked, @@ -257,11 +272,22 @@ impl ComputeHookTenant { 
tracing::info!("Tenant isn't yet ready to emit a notification"); MaybeSendResult::Noop } - Some(request) if Some(&request) == locked.as_ref() => { - // No change from the last value successfully sent + Some(request) + if Some(&request) == locked.as_ref().map(|s| &s.request) + && locked.as_ref().map(|s| s.applied).unwrap_or(false) => + { + tracing::info!( + "Skipping notification because remote state already matches ({:?})", + &request + ); + // No change from the last value successfully sent, and our state indicates that the last + // value sent was fully applied on the control plane side. MaybeSendResult::Noop } - Some(request) => MaybeSendResult::Transmit((request, locked)), + Some(request) => { + // Our request differs from the last one sent, or the last one sent was not fully applied on the compute side + MaybeSendResult::Transmit((request, locked)) + } } } } @@ -550,10 +576,28 @@ impl ComputeHook { }) }; - if result.is_ok() { - // Before dropping the send lock, stash the request we just sent so that - // subsequent callers can avoid redundantly re-sending the same thing. - *send_lock_guard = Some(request); + match result { + Ok(_) => { + // Before dropping the send lock, stash the request we just sent so that + // subsequent callers can avoid redundantly re-sending the same thing. + *send_lock_guard = Some(ComputeRemoteState { + request, + applied: true, + }); + } + Err(NotifyError::Busy) => { + // Busy result means that the server responded and has stored the new configuration, + // but was not able to fully apply it to the compute + *send_lock_guard = Some(ComputeRemoteState { + request, + applied: false, + }); + } + Err(_) => { + // General error case: we can no longer know the remote state, so clear it. This will result in + // the logic in maybe_send recognizing that we should call the hook again. 
+ *send_lock_guard = None; + } } result } @@ -707,7 +751,10 @@ pub(crate) mod tests { assert!(request.stripe_size.is_none()); // Simulate successful send - *guard = Some(request); + *guard = Some(ComputeRemoteState { + request, + applied: true, + }); drop(guard); // Try asking again: this should be a no-op @@ -750,7 +797,10 @@ pub(crate) mod tests { assert_eq!(request.stripe_size, Some(ShardStripeSize(32768))); // Simulate successful send - *guard = Some(request); + *guard = Some(ComputeRemoteState { + request, + applied: true, + }); drop(guard); Ok(()) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 46b6f4f2bf..afefe8598c 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -381,14 +381,16 @@ async fn handle_tenant_timeline_delete( R: std::future::Future> + Send + 'static, F: Fn(Arc) -> R + Send + Sync + 'static, { + // On subsequent retries, wait longer. + // Enable callers with a 25 second request timeout to reliably get a response + const MAX_WAIT: Duration = Duration::from_secs(25); + const MAX_RETRY_PERIOD: Duration = Duration::from_secs(5); + let started_at = Instant::now(); + // To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion // completed. let mut retry_period = Duration::from_secs(1); - // On subsequent retries, wait longer. 
- let max_retry_period = Duration::from_secs(5); - // Enable callers with a 30 second request timeout to reliably get a response - let max_wait = Duration::from_secs(25); loop { let status = f(service.clone()).await?; @@ -396,7 +398,11 @@ async fn handle_tenant_timeline_delete( StatusCode::ACCEPTED => { tracing::info!("Deletion accepted, waiting to try again..."); tokio::time::sleep(retry_period).await; - retry_period = max_retry_period; + retry_period = MAX_RETRY_PERIOD; + } + StatusCode::CONFLICT => { + tracing::info!("Deletion already in progress, waiting to try again..."); + tokio::time::sleep(retry_period).await; } StatusCode::NOT_FOUND => { tracing::info!("Deletion complete"); @@ -409,7 +415,7 @@ async fn handle_tenant_timeline_delete( } let now = Instant::now(); - if now + retry_period > started_at + max_wait { + if now + retry_period > started_at + MAX_WAIT { tracing::info!("Deletion timed out waiting for 404"); // REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of // the pageserver's swagger definition for this endpoint, and has the same desired diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index cc735dc27e..01aa8f1dab 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -246,6 +246,11 @@ fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError { // storage controller's auth configuration. ApiError::InternalServerError(anyhow::anyhow!("{node} {status}: {msg}")) } + mgmt_api::Error::ApiError(status @ StatusCode::TOO_MANY_REQUESTS, msg) => { + // Pass through 429 errors: if pageserver is asking us to wait + retry, we in + // turn ask our clients to wait + retry + ApiError::Conflict(format!("{node} {status}: {status} {msg}")) + } mgmt_api::Error::ApiError(status, msg) => { // Presume general case of pageserver API errors is that we tried to do something // that can't be done right now. 
@@ -1069,8 +1074,9 @@ impl Service { /// the observed state of the tenant such that subsequent calls to [`TenantShard::get_reconcile_needed`] /// will indicate that reconciliation is not needed. #[instrument(skip_all, fields( - tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(), - sequence=%result.sequence + seq=%result.sequence, + tenant_id=%result.tenant_shard_id.tenant_id, + shard_id=%result.tenant_shard_id.shard_slug(), ))] fn process_result(&self, result: ReconcileResult) { let mut locked = self.inner.write().unwrap(); @@ -2856,17 +2862,12 @@ impl Service { let _tenant_lock = trace_exclusive_lock(&self.tenant_op_locks, tenant_id, TenantOperations::Delete).await; - // Detach all shards - let (detach_waiters, shard_ids, node) = { - let mut shard_ids = Vec::new(); + // Detach all shards. This also deletes local pageserver shard data. + let (detach_waiters, node) = { let mut detach_waiters = Vec::new(); let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); - for (tenant_shard_id, shard) in - tenants.range_mut(TenantShardId::tenant_range(tenant_id)) - { - shard_ids.push(*tenant_shard_id); - + for (_, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { // Update the tenant's intent to remove all attachments shard.policy = PlacementPolicy::Detached; shard @@ -2886,7 +2887,7 @@ impl Service { let node = nodes .get(&node_id) .expect("Pageservers may not be deleted while lock is active"); - (detach_waiters, shard_ids, node.clone()) + (detach_waiters, node.clone()) }; // This reconcile wait can fail in a few ways: @@ -2901,38 +2902,34 @@ impl Service { self.await_waiters(detach_waiters, RECONCILE_TIMEOUT) .await?; - let locations = shard_ids - .into_iter() - .map(|s| (s, node.clone())) - .collect::>(); - let results = self.tenant_for_shards_api( - locations, - |tenant_shard_id, client| async move { client.tenant_delete(tenant_shard_id).await }, - 1, - 3, - 
RECONCILE_TIMEOUT, - &self.cancel, - ) - .await; - for result in results { - match result { - Ok(StatusCode::ACCEPTED) => { - // This should never happen: we waited for detaches to finish above - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "Unexpectedly still attached on {}", - node - ))); - } - Ok(_) => {} - Err(mgmt_api::Error::Cancelled) => { - return Err(ApiError::ShuttingDown); - } - Err(e) => { - // This is unexpected: remote deletion should be infallible, unless the object store - // at large is unavailable. - tracing::error!("Error deleting via node {}: {e}", node); - return Err(ApiError::InternalServerError(anyhow::anyhow!(e))); - } + // Delete the entire tenant (all shards) from remote storage via a random pageserver. + // Passing an unsharded tenant ID will cause the pageserver to remove all remote paths with + // the tenant ID prefix, including all shards (even possibly stale ones). + match node + .with_client_retries( + |client| async move { + client + .tenant_delete(TenantShardId::unsharded(tenant_id)) + .await + }, + &self.config.jwt_token, + 1, + 3, + RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + .unwrap_or(Err(mgmt_api::Error::Cancelled)) + { + Ok(_) => {} + Err(mgmt_api::Error::Cancelled) => { + return Err(ApiError::ShuttingDown); + } + Err(e) => { + // This is unexpected: remote deletion should be infallible, unless the object store + // at large is unavailable. 
+ tracing::error!("Error deleting via node {node}: {e}"); + return Err(ApiError::InternalServerError(anyhow::anyhow!(e))); } } @@ -3633,14 +3630,21 @@ impl Service { ); let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); - client + let res = client .timeline_delete(tenant_shard_id, timeline_id) - .await - .map_err(|e| { - ApiError::InternalServerError(anyhow::anyhow!( - "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}", - )) - }) + .await; + + match res { + Ok(ok) => Ok(ok), + Err(mgmt_api::Error::ApiError(StatusCode::CONFLICT, _)) => Ok(StatusCode::CONFLICT), + Err(e) => { + Err( + ApiError::InternalServerError(anyhow::anyhow!( + "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}", + )) + ) + } + } } let locations = targets.0.iter().map(|t| (*t.0, t.1.latest.node.clone())).collect(); @@ -3655,7 +3659,13 @@ impl Service { }) .await?; - // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero + // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero. + // We return 409 (Conflict) if deletion was already in progress on any of the shards + // and 202 (Accepted) if deletion was not already in progress on any of the shards. 
+ if statuses.iter().any(|s| s == &StatusCode::CONFLICT) { + return Ok(StatusCode::CONFLICT); + } + if statuses.iter().any(|s| s != &StatusCode::NOT_FOUND) { return Ok(StatusCode::ACCEPTED); } diff --git a/storage_scrubber/src/cloud_admin_api.rs b/storage_scrubber/src/cloud_admin_api.rs index 70b108cf23..7b82a0b116 100644 --- a/storage_scrubber/src/cloud_admin_api.rs +++ b/storage_scrubber/src/cloud_admin_api.rs @@ -138,7 +138,7 @@ pub struct ProjectData { pub name: String, pub region_id: String, pub platform_id: String, - pub user_id: String, + pub user_id: Option, pub pageserver_id: Option, #[serde(deserialize_with = "from_nullable_id")] pub tenant: TenantId, diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index d53611ed6e..a0040ada08 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -16,13 +16,13 @@ use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePat use serde::{Deserialize, Serialize}; use tokio_stream::StreamExt; use tokio_util::sync::CancellationToken; -use utils::id::TenantId; +use utils::{backoff, id::TenantId}; use crate::{ cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}, init_remote, list_objects_with_retries, metadata_stream::{stream_tenant_timelines, stream_tenants}, - BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, + BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, MAX_RETRIES, }; #[derive(Serialize, Deserialize, Debug)] @@ -250,13 +250,16 @@ async fn find_garbage_inner( &target.tenant_root(&tenant_shard_id), ) .await?; - let object = tenant_objects.keys.first().unwrap(); - if object.key.get_path().as_str().ends_with("heatmap-v1.json") { - tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)"); - garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); - continue; + if let Some(object) = 
tenant_objects.keys.first() { + if object.key.get_path().as_str().ends_with("heatmap-v1.json") { + tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)"); + garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); + continue; + } else { + tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key); + } } else { - tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key); + tracing::info!("Tenant {tenant_shard_id} is missing in console appears to have been deleted while we ran"); } } else { // A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial @@ -406,14 +409,17 @@ pub async fn get_tenant_objects( // TODO: apply extra validation based on object modification time. Don't purge // tenants where any timeline's index_part.json has been touched recently. - let list = s3_client - .list( - Some(&tenant_root), - ListingMode::NoDelimiter, - None, - &CancellationToken::new(), - ) - .await?; + let cancel = CancellationToken::new(); + let list = backoff::retry( + || s3_client.list(Some(&tenant_root), ListingMode::NoDelimiter, None, &cancel), + |_| false, + 3, + MAX_RETRIES as u32, + "get_tenant_objects", + &cancel, + ) + .await + .expect("dummy cancellation token")?; Ok(list.keys) } @@ -424,14 +430,25 @@ pub async fn get_timeline_objects( tracing::debug!("Listing objects in timeline {ttid}"); let timeline_root = super::remote_timeline_path_id(&ttid); - let list = s3_client - .list( - Some(&timeline_root), - ListingMode::NoDelimiter, - None, - &CancellationToken::new(), - ) - .await?; + let cancel = CancellationToken::new(); + let list = backoff::retry( + || { + s3_client.list( + Some(&timeline_root), + ListingMode::NoDelimiter, + None, + &cancel, + ) + }, + |_| false, + 3, + MAX_RETRIES as u32, + "get_timeline_objects", + &cancel, + ) + .await + .expect("dummy 
cancellation token")?; + Ok(list.keys) } diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs index 15f3665fac..6c312d0036 100644 --- a/storage_scrubber/src/scan_safekeeper_metadata.rs +++ b/storage_scrubber/src/scan_safekeeper_metadata.rs @@ -1,10 +1,12 @@ use std::{collections::HashSet, str::FromStr, sync::Arc}; +use anyhow::{bail, Context}; use futures::stream::{StreamExt, TryStreamExt}; use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; use postgres_ffi::{XLogFileName, PG_TLI}; use remote_storage::GenericRemoteStorage; +use rustls::crypto::aws_lc_rs; use serde::Serialize; use tokio_postgres::types::PgLsn; use tracing::{debug, error, info}; @@ -231,10 +233,15 @@ async fn check_timeline( }) } -fn load_certs() -> Result, std::io::Error> { - let der_certs = rustls_native_certs::load_native_certs()?; +fn load_certs() -> anyhow::Result> { + let der_certs = rustls_native_certs::load_native_certs(); + + if !der_certs.errors.is_empty() { + bail!("could not load native tls certs: {:?}", der_certs.errors); + } + let mut store = rustls::RootCertStore::empty(); - store.add_parsable_certificates(der_certs); + store.add_parsable_certificates(der_certs.certs); Ok(Arc::new(store)) } static TLS_ROOTS: OnceCell> = OnceCell::new(); @@ -248,9 +255,12 @@ async fn load_timelines_from_db( // Use rustls (Neon requires TLS) let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone(); - let client_config = rustls::ClientConfig::builder() - .with_root_certificates(root_store) - .with_no_client_auth(); + let client_config = + rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .context("aws_lc_rs should support the default protocol versions")? 
+ .with_root_certificates(root_store) + .with_no_client_auth(); let tls_connector = tokio_postgres_rustls::MakeRustlsConnect::new(client_config); let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?; // The connection object performs the actual communication with the database, diff --git a/test_runner/README.md b/test_runner/README.md index e087241c1f..55d8d2faa9 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -6,7 +6,7 @@ Prerequisites: - Correctly configured Python, see [`/docs/sourcetree.md`](/docs/sourcetree.md#using-python) - Neon and Postgres binaries - See the root [README.md](/README.md) for build directions - If you want to test tests with test-only APIs, you would need to add `--features testing` to Rust code build commands. + To run tests you need to add `--features testing` to Rust code build commands. For convenience, repository cargo config contains `build_testing` alias, that serves as a subcommand, adding the required feature flags. 
Usage example: `cargo build_testing --release` is equivalent to `cargo build --features testing --release` - Tests can be run from the git tree; or see the environment variables diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index 26895df8a6..ea8291c1e0 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -28,3 +28,21 @@ class EndpointHttpClient(requests.Session): res = self.get(f"http://localhost:{self.port}/installed_extensions") res.raise_for_status() return res.json() + + def extensions(self, extension: str, version: str, database: str): + body = { + "extension": extension, + "version": version, + "database": database, + } + res = self.post(f"http://localhost:{self.port}/extensions", json=body) + res.raise_for_status() + return res.json() + + def set_role_grants(self, database: str, role: str, schema: str, privileges: list[str]): + res = self.post( + f"http://localhost:{self.port}/grants", + json={"database": database, "schema": schema, "role": role, "privileges": privileges}, + ) + res.raise_for_status() + return res.json() diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 0d3dcd1671..1b2767e296 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -1,6 +1,5 @@ from __future__ import annotations -import abc import json import os import re @@ -30,7 +29,8 @@ if TYPE_CHECKING: T = TypeVar("T") -class AbstractNeonCli(abc.ABC): +# Used to be an ABC. abc.ABC removed due to linter without name change. +class AbstractNeonCli: """ A typed wrapper around an arbitrary Neon CLI tool. Supports a way to run arbitrary command directly via CLI. 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7789855fe4..3cd8019e32 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -386,9 +386,9 @@ class NeonEnvBuilder: self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine - self.pageserver_default_tenant_config_compaction_algorithm: Optional[ - dict[str, Any] - ] = pageserver_default_tenant_config_compaction_algorithm + self.pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]] = ( + pageserver_default_tenant_config_compaction_algorithm + ) if self.pageserver_default_tenant_config_compaction_algorithm is not None: log.debug( f"Overriding pageserver default compaction algorithm to {self.pageserver_default_tenant_config_compaction_algorithm}" @@ -1062,9 +1062,9 @@ class NeonEnv: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine if config.pageserver_default_tenant_config_compaction_algorithm is not None: tenant_config = ps_cfg.setdefault("tenant_config", {}) - tenant_config[ - "compaction_algorithm" - ] = config.pageserver_default_tenant_config_compaction_algorithm + tenant_config["compaction_algorithm"] = ( + config.pageserver_default_tenant_config_compaction_algorithm + ) if self.pageserver_remote_storage is not None: ps_cfg["remote_storage"] = remote_storage_to_toml_dict( @@ -1108,9 +1108,9 @@ class NeonEnv: if config.auth_enabled: sk_cfg["auth_enabled"] = True if self.safekeepers_remote_storage is not None: - sk_cfg[ - "remote_storage" - ] = self.safekeepers_remote_storage.to_toml_inline_table().strip() + sk_cfg["remote_storage"] = ( + self.safekeepers_remote_storage.to_toml_inline_table().strip() + ) self.safekeepers.append( Safekeeper(env=self, id=id, port=port, extra_opts=config.safekeeper_extra_opts) ) @@ -1986,11 +1986,11 @@ class NeonStorageController(MetricsGetter, LogUtils): log.info(f"reconcile_all waited for {n} shards") return n - 
def reconcile_until_idle(self, timeout_secs=30): + def reconcile_until_idle(self, timeout_secs=30, max_interval=5): start_at = time.time() n = 1 - delay_sec = 0.5 - delay_max = 5 + delay_sec = 0.1 + delay_max = max_interval while n > 0: n = self.reconcile_all() if n == 0: @@ -4280,6 +4280,7 @@ SKIP_FILES = frozenset( "postmaster.opts", "postmaster.pid", "pg_control", + "pg_dynshmem", ) ) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index aa4435af4e..18d65cb7de 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -583,6 +583,22 @@ class PageserverHttpClient(requests.Session, MetricsGetter): log.info(f"Got GC request response code: {res.status_code}") self.verbose_error(res) + def timeline_offload( + self, + tenant_id: Union[TenantId, TenantShardId], + timeline_id: TimelineId, + ): + self.is_testing_enabled_or_skip() + + log.info(f"Requesting offload: tenant {tenant_id}, timeline {timeline_id}") + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/offload", + ) + log.info(f"Got offload request response code: {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + def timeline_compact( self, tenant_id: Union[TenantId, TenantShardId], diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 377a95fbeb..4c4306be9e 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -303,9 +303,10 @@ def assert_prefix_empty( remote_storage: Optional[RemoteStorage], prefix: Optional[str] = None, allowed_postfix: Optional[str] = None, + delimiter: str = "/", ) -> None: assert remote_storage is not None - response = list_prefix(remote_storage, prefix) + response = list_prefix(remote_storage, prefix, delimiter) keys = response["KeyCount"] objects: list[ObjectTypeDef] = response.get("Contents", []) common_prefixes = 
response.get("CommonPrefixes", []) @@ -338,16 +339,18 @@ def assert_prefix_empty( if not (allowed_postfix.endswith(key)): filtered_count += 1 - assert ( - filtered_count == 0 - ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}" + assert filtered_count == 0, f"remote prefix {prefix} is not empty: {objects}" # remote_storage must not be None, but that's easier for callers to make mypy happy -def assert_prefix_not_empty(remote_storage: Optional[RemoteStorage], prefix: Optional[str] = None): +def assert_prefix_not_empty( + remote_storage: Optional[RemoteStorage], + prefix: Optional[str] = None, + delimiter: str = "/", +): assert remote_storage is not None response = list_prefix(remote_storage, prefix) - assert response["KeyCount"] != 0, f"remote dir with prefix {prefix} is empty: {response}" + assert response["KeyCount"] != 0, f"remote prefix {prefix} is empty: {response}" def list_prefix( diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 76575d330c..7ca6b3dd1c 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -417,7 +417,7 @@ def wait_until( time.sleep(interval) continue return res - raise Exception("timed out while waiting for %s" % func) from last_exception + raise Exception(f"timed out while waiting for {func}") from last_exception def assert_eq(a, b) -> None: diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index dbf94a2cf5..815d186ab9 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -144,9 +144,10 @@ def test_subscriber_lag( check_pgbench_still_running(pub_workload, "pub") check_pgbench_still_running(sub_workload, "sub") - with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( - sub_connstr - ) as sub_conn: + with ( + psycopg2.connect(pub_connstr) as pub_conn, + psycopg2.connect(sub_connstr) as sub_conn, + ): with 
pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: lag = measure_logical_replication_lag(sub_cur, pub_cur) @@ -242,9 +243,10 @@ def test_publisher_restart( ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env, ) - with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( - sub_connstr - ) as sub_conn: + with ( + psycopg2.connect(pub_connstr) as pub_conn, + psycopg2.connect(sub_connstr) as sub_conn, + ): with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: lag = measure_logical_replication_lag(sub_cur, pub_cur) diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py index 14b527acca..8b368977df 100644 --- a/test_runner/performance/test_physical_replication.py +++ b/test_runner/performance/test_physical_replication.py @@ -102,10 +102,14 @@ def test_ro_replica_lag( check_pgbench_still_running(master_workload) check_pgbench_still_running(replica_workload) time.sleep(sync_interval_min * 60) - with psycopg2.connect(master_connstr) as conn_master, psycopg2.connect( - replica_connstr - ) as conn_replica: - with conn_master.cursor() as cur_master, conn_replica.cursor() as cur_replica: + with ( + psycopg2.connect(master_connstr) as conn_master, + psycopg2.connect(replica_connstr) as conn_replica, + ): + with ( + conn_master.cursor() as cur_master, + conn_replica.cursor() as cur_replica, + ): lag = measure_replication_lag(cur_master, cur_replica) log.info(f"Replica lagged behind master by {lag} seconds") zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 452a856714..d2eba751f8 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -4,9 +4,10 @@ import concurrent.futures import random import time from collections import defaultdict +from 
enum import Enum import pytest -from fixtures.common_types import TenantId, TenantShardId, TimelineId +from fixtures.common_types import TenantId, TenantShardId, TimelineArchivalState, TimelineId from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( @@ -34,6 +35,7 @@ def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[ if tenant_placement[tid]["intent"]["attached"] == tenant_placement[tid]["observed"]["attached"] } + assert len(matching) == total_shards attached_per_node: defaultdict[str, int] = defaultdict(int) @@ -107,15 +109,48 @@ def test_storage_controller_many_tenants( ps.allowed_errors.append(".*request was dropped before completing.*") # Total tenants - tenant_count = 4000 + small_tenant_count = 7800 + large_tenant_count = 200 + tenant_count = small_tenant_count + large_tenant_count + large_tenant_shard_count = 8 + total_shards = small_tenant_count + large_tenant_count * large_tenant_shard_count - # Shards per tenant - shard_count = 2 - stripe_size = 1024 + # A small stripe size to encourage all shards to get some data + stripe_size = 1 - total_shards = tenant_count * shard_count + # We use a fixed seed to make the test somewhat reproducible: we want a randomly + # chosen order in the sense that it's arbitrary, but not in the sense that it should change every run. 
+ rng = random.Random(1234) - tenants = set(TenantId.generate() for _i in range(0, tenant_count)) + class Tenant: + def __init__(self): + # Tenants may optionally contain a timeline + self.timeline_id = None + + # Tenants may be marked as 'large' to get multiple shard during creation phase + self.large = False + + tenant_ids = list(TenantId.generate() for _i in range(0, tenant_count)) + tenants = dict((tid, Tenant()) for tid in tenant_ids) + + # We will create timelines in only a subset of tenants, because creating timelines + # does many megabytes of IO, and we want to densely simulate huge tenant counts on + # a single test node. + tenant_timelines_count = 100 + + # These lists are maintained for use with rng.choice + tenants_with_timelines = list(rng.sample(tenants.keys(), tenant_timelines_count)) + tenants_without_timelines = list( + tenant_id for tenant_id in tenants if tenant_id not in tenants_with_timelines + ) + + # For our sharded tenants, we will make half of them with timelines and half without + assert large_tenant_count >= tenant_timelines_count / 2 + for tenant_id in tenants_with_timelines[0 : large_tenant_count // 2]: + tenants[tenant_id].large = True + + for tenant_id in tenants_without_timelines[0 : large_tenant_count // 2]: + tenants[tenant_id].large = True virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) @@ -125,23 +160,39 @@ def test_storage_controller_many_tenants( rss = env.storage_controller.get_metric_value("process_resident_memory_bytes") assert rss is not None - log.info(f"Resident memory: {rss} ({ rss / (shard_count * tenant_count)} per shard)") - assert rss < expect_memory_per_shard * shard_count * tenant_count - - # We use a fixed seed to make the test somewhat reproducible: we want a randomly - # chosen order in the sense that it's arbitrary, but not in the sense that it should change every run. 
- rng = random.Random(1234) + log.info(f"Resident memory: {rss} ({ rss / total_shards} per shard)") + assert rss < expect_memory_per_shard * total_shards # Issue more concurrent operations than the storage controller's reconciler concurrency semaphore # permits, to ensure that we are exercising stressing that. api_concurrency = 135 - # We will create tenants directly via API, not via neon_local, to avoid any false - # serialization of operations in neon_local (it e.g. loads/saves a config file on each call) - with concurrent.futures.ThreadPoolExecutor(max_workers=api_concurrency) as executor: - futs = [] + # A different concurrency limit for bulk tenant+timeline creations: these do I/O and will + start timing out on test nodes if we aren't a bit careful. + create_concurrency = 16 + + class Operation(str, Enum): + TIMELINE_OPS = "timeline_ops" + SHARD_MIGRATE = "shard_migrate" + TENANT_PASSTHROUGH = "tenant_passthrough" + + run_ops = api_concurrency * 4 + assert run_ops < len(tenants) + + # Creation phase: make a lot of tenants, and create timelines in a subset of them + # This executor has concurrency set modestly, to avoid overloading pageservers with timeline creations. + with concurrent.futures.ThreadPoolExecutor(max_workers=create_concurrency) as executor: + tenant_create_futs = [] t1 = time.time() - for tenant_id in tenants: + + for tenant_id, tenant in tenants.items(): + if tenant.large: + shard_count = large_tenant_shard_count + else: + shard_count = 1 + + # We will create tenants directly via API, not via neon_local, to avoid any false + # serialization of operations in neon_local (it e.g. 
loads/saves a config file on each call) f = executor.submit( env.storage_controller.tenant_create, tenant_id, @@ -152,44 +203,106 @@ def test_storage_controller_many_tenants( tenant_config={"heatmap_period": "10s"}, placement_policy={"Attached": 1}, ) - futs.append(f) + tenant_create_futs.append(f) - # Wait for creations to finish - for f in futs: + # Wait for tenant creations to finish + for f in tenant_create_futs: f.result() log.info( f"Created {len(tenants)} tenants in {time.time() - t1}, {len(tenants) / (time.time() - t1)}/s" ) - run_ops = api_concurrency * 4 - assert run_ops < len(tenants) - op_tenants = list(tenants)[0:run_ops] + # Waiting for optimizer to stabilize, if it disagrees with scheduling (the correct behavior + # would be for original scheduling decisions to always match optimizer's preference) + # (workaround for https://github.com/neondatabase/neon/issues/8969) + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) + + # Create timelines in those tenants which are going to get one + t1 = time.time() + timeline_create_futs = [] + for tenant_id in tenants_with_timelines: + timeline_id = TimelineId.generate() + tenants[tenant_id].timeline_id = timeline_id + f = executor.submit( + env.storage_controller.pageserver_api().timeline_create, + PgVersion.NOT_SET, + tenant_id, + timeline_id, + ) + timeline_create_futs.append(f) + + for f in timeline_create_futs: + f.result() + log.info( + f"Created {len(tenants_with_timelines)} timelines in {time.time() - t1}, {len(tenants_with_timelines) / (time.time() - t1)}/s" + ) + + # Plan operations: ensure each tenant with a timeline gets at least + # one of each operation type. Then add other tenants to make up the + # numbers. 
+ ops_plan = [] + for tenant_id in tenants_with_timelines: + ops_plan.append((tenant_id, Operation.TIMELINE_OPS)) + ops_plan.append((tenant_id, Operation.SHARD_MIGRATE)) + ops_plan.append((tenant_id, Operation.TENANT_PASSTHROUGH)) + + # Fill up remaining run_ops with migrations of tenants without timelines + other_migrate_tenants = rng.sample(tenants_without_timelines, run_ops - len(ops_plan)) + + for tenant_id in other_migrate_tenants: + ops_plan.append( + ( + tenant_id, + rng.choice([Operation.SHARD_MIGRATE, Operation.TENANT_PASSTHROUGH]), + ) + ) + + # Exercise phase: pick pseudo-random operations to do on the tenants + timelines + # This executor has concurrency high enough to stress the storage controller API. + with concurrent.futures.ThreadPoolExecutor(max_workers=api_concurrency) as executor: + + def exercise_timeline_ops(tenant_id, timeline_id): + # A read operation: this requires looking up shard zero and routing there + detail = virtual_ps_http.timeline_detail(tenant_id, timeline_id) + assert detail["timeline_id"] == str(timeline_id) + + # A fan-out write operation to all shards in a tenant. + # - We use a metadata operation rather than something like a timeline create, because + # timeline creations are I/O intensive and this test isn't meant to be a stress test for + # doing lots of concurrent timeline creations. 
+ archival_state = rng.choice( + [TimelineArchivalState.ARCHIVED, TimelineArchivalState.UNARCHIVED] + ) + virtual_ps_http.timeline_archival_config(tenant_id, timeline_id, archival_state) # Generate a mixture of operations and dispatch them all concurrently futs = [] - for tenant_id in op_tenants: - op = rng.choice([0, 1, 2]) - if op == 0: - # A fan-out write operation to all shards in a tenant (timeline creation) + for tenant_id, op in ops_plan: + if op == Operation.TIMELINE_OPS: + op_timeline_id = tenants[tenant_id].timeline_id + assert op_timeline_id is not None + + # Exercise operations that modify tenant scheduling state but require traversing + # the fan-out-to-all-shards functionality. f = executor.submit( - virtual_ps_http.timeline_create, - PgVersion.NOT_SET, + exercise_timeline_ops, tenant_id, - TimelineId.generate(), + op_timeline_id, ) - elif op == 1: + elif op == Operation.SHARD_MIGRATE: # A reconciler operation: migrate a shard. - shard_number = rng.randint(0, shard_count - 1) - tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count) + desc = env.storage_controller.tenant_describe(tenant_id) + + shard_number = rng.randint(0, len(desc["shards"]) - 1) + tenant_shard_id = TenantShardId(tenant_id, shard_number, len(desc["shards"])) # Migrate it to its secondary location - desc = env.storage_controller.tenant_describe(tenant_id) dest_ps_id = desc["shards"][shard_number]["node_secondary"][0] f = executor.submit( env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id ) - elif op == 2: + elif op == Operation.TENANT_PASSTHROUGH: # A passthrough read to shard zero f = executor.submit(virtual_ps_http.tenant_status, tenant_id) @@ -199,10 +312,18 @@ def test_storage_controller_many_tenants( for f in futs: f.result() + log.info("Completed mixed operations phase") + # Some of the operations above (notably migrations) might leave the controller in a state where it has # some work to do, for example optimizing shard placement after we do 
a random migration. Wait for the system # to reach a quiescent state before doing following checks. - env.storage_controller.reconcile_until_idle() + # + # - Set max_interval low because we probably have a significant number of optimizations to complete and would like + # the test to run quickly. + # - Set timeout high because we might be waiting for optimizations that require a secondary + # to warm up, and if we just started a secondary in the previous step, it might wait some time + # before downloading its heatmap + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) env.storage_controller.consistency_check() check_memory() @@ -213,6 +334,7 @@ def test_storage_controller_many_tenants( # # We do not require that the system is quiescent already here, although at present in this point in the test # that may be the case. + log.info("Reconciling all & timing") while True: t1 = time.time() reconcilers = env.storage_controller.reconcile_all() @@ -225,6 +347,7 @@ def test_storage_controller_many_tenants( break # Restart the storage controller + log.info("Restarting controller") env.storage_controller.stop() env.storage_controller.start() @@ -246,7 +369,16 @@ def test_storage_controller_many_tenants( # Restart pageservers gracefully: this exercises the /re-attach pageserver API # and the storage controller drain and fill API + log.info("Restarting pageservers...") + + # Parameters for how long we expect it to take to migrate all of the tenants from/to + # a node during a drain/fill operation + DRAIN_FILL_TIMEOUT = 240 + DRAIN_FILL_BACKOFF = 5 + for ps in env.pageservers: + log.info(f"Draining pageserver {ps.id}") + t1 = time.time() env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 ) @@ -255,9 +387,10 @@ def test_storage_controller_many_tenants( ps.id, PageserverAvailability.ACTIVE, PageserverSchedulingPolicy.PAUSE_FOR_RESTART, - max_attempts=24, - 
backoff=5, + max_attempts=DRAIN_FILL_TIMEOUT // DRAIN_FILL_BACKOFF, + backoff=DRAIN_FILL_BACKOFF, ) + log.info(f"Drained pageserver {ps.id} in {time.time() - t1}s") shard_counts = get_consistent_node_shard_counts(env, total_shards) log.info(f"Shard counts after draining node {ps.id}: {shard_counts}") @@ -275,6 +408,7 @@ def test_storage_controller_many_tenants( backoff=1, ) + log.info(f"Filling pageserver {ps.id}") env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 ) @@ -282,16 +416,23 @@ def test_storage_controller_many_tenants( ps.id, PageserverAvailability.ACTIVE, PageserverSchedulingPolicy.ACTIVE, - max_attempts=24, - backoff=5, + max_attempts=DRAIN_FILL_TIMEOUT // DRAIN_FILL_BACKOFF, + backoff=DRAIN_FILL_BACKOFF, ) + log.info(f"Filled pageserver {ps.id} in {time.time() - t1}s") + + # Waiting for optimizer to stabilize, if it disagrees with scheduling (the correct behavior + # would be for original scheduling decisions to always match optimizer's preference) + # (workaround for https://github.com/neondatabase/neon/issues/8969) + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) + shard_counts = get_consistent_node_shard_counts(env, total_shards) log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") assert_consistent_balanced_attachments(env, total_shards) - env.storage_controller.reconcile_until_idle() + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) env.storage_controller.consistency_check() # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn, diff --git a/test_runner/regress/test_aux_files.py b/test_runner/regress/test_aux_files.py deleted file mode 100644 index 91d674d0db..0000000000 --- a/test_runner/regress/test_aux_files.py +++ /dev/null @@ -1,78 +0,0 @@ -from __future__ import annotations - -from fixtures.log_helper import log -from 
fixtures.neon_fixtures import ( - AuxFileStore, - NeonEnvBuilder, - logical_replication_sync, -) - - -def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg): - env = neon_env_builder.init_start() - endpoint = env.endpoints.create_start("main") - client = env.pageserver.http_client() - - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline - - tenant_config = client.tenant_config(tenant_id).effective_config - tenant_config["switch_aux_file_policy"] = AuxFileStore.V2 - client.set_tenant_config(tenant_id, tenant_config) - # aux file v2 is enabled on the write path, so for now, it should be unset (or null) - assert ( - client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)["last_aux_file_policy"] - is None - ) - - pg_conn = endpoint.connect() - cur = pg_conn.cursor() - - cur.execute("create table t(pk integer primary key, payload integer)") - cur.execute( - "CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120));" - ) - cur.execute("create publication pub1 for table t, replication_example") - - # now start subscriber, aux files will be created at this point. TODO: find better ways of testing aux files (i.e., neon_test_utils) - # instead of going through the full logical replication process. 
- vanilla_pg.start() - vanilla_pg.safe_psql("create table t(pk integer primary key, payload integer)") - vanilla_pg.safe_psql( - "CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120), testcolumn1 int, testcolumn2 int, testcolumn3 int);" - ) - connstr = endpoint.connstr().replace("'", "''") - log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}") - vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") - - # Wait logical replication channel to be established - logical_replication_sync(vanilla_pg, endpoint) - vanilla_pg.stop() - endpoint.stop() - - with env.pageserver.http_client() as client: - # aux file v2 flag should be enabled at this point - assert ( - client.timeline_detail(tenant_id, timeline_id)["last_aux_file_policy"] - == AuxFileStore.V2 - ) - with env.pageserver.http_client() as client: - tenant_config = client.tenant_config(tenant_id).effective_config - tenant_config["switch_aux_file_policy"] = "V1" - client.set_tenant_config(tenant_id, tenant_config) - # the flag should still be enabled - assert ( - client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[ - "last_aux_file_policy" - ] - == AuxFileStore.V2 - ) - env.pageserver.restart() - with env.pageserver.http_client() as client: - # aux file v2 flag should be persisted - assert ( - client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[ - "last_aux_file_policy" - ] - == AuxFileStore.V2 - ) diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index 04916a6b6f..0134f80769 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -74,7 +74,7 @@ def test_remote_extensions( mimetype="application/octet-stream", headers=[ ("Content-Length", str(file_size)), - ("Content-Disposition", 'attachment; filename="%s"' % file_name), + ("Content-Disposition", f'attachment; 
filename="{file_name}"'), ], direct_passthrough=True, ) diff --git a/test_runner/regress/test_extensions.py b/test_runner/regress/test_extensions.py new file mode 100644 index 0000000000..100fd4b048 --- /dev/null +++ b/test_runner/regress/test_extensions.py @@ -0,0 +1,50 @@ +from logging import info + +from fixtures.neon_fixtures import NeonEnv + + +def test_extensions(neon_simple_env: NeonEnv): + """basic test for the extensions endpoint testing installing extensions""" + + env = neon_simple_env + + env.create_branch("test_extensions") + + endpoint = env.endpoints.create_start("test_extensions") + extension = "neon_test_utils" + database = "test_extensions" + + endpoint.safe_psql("CREATE DATABASE test_extensions") + + with endpoint.connect(dbname=database) as pg_conn: + with pg_conn.cursor() as cur: + cur.execute( + "SELECT default_version FROM pg_available_extensions WHERE name = 'neon_test_utils'" + ) + res = cur.fetchone() + assert res is not None + version = res[0] + + with pg_conn.cursor() as cur: + cur.execute( + "SELECT extname, extversion FROM pg_extension WHERE extname = 'neon_test_utils'", + ) + res = cur.fetchone() + assert not res, "The 'neon_test_utils' extension is installed" + + client = endpoint.http_client() + install_res = client.extensions(extension, version, database) + + info("Extension install result: %s", res) + assert install_res["extension"] == extension and install_res["version"] == version + + with endpoint.connect(dbname=database) as pg_conn: + with pg_conn.cursor() as cur: + cur.execute( + "SELECT extname, extversion FROM pg_extension WHERE extname = 'neon_test_utils'", + ) + res = cur.fetchone() + assert res is not None + (db_extension_name, db_extension_version) = res + + assert db_extension_name == extension and db_extension_version == version diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 87991eadf1..c26bf058e2 100644 --- 
a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -558,10 +558,10 @@ select sent_lsn, flush_lsn, pg_current_wal_flush_lsn() from pg_stat_replication return publisher_flush_lsn -# Test that subscriber takes into account quorum committed flush_lsn in -# flush_lsn reporting to publisher. Without this, it may ack too far, losing -# data on restart because publisher advances START_REPLICATION position to the -# confirmed_flush_lsn of the slot. +# Test that neon subscriber takes into account quorum committed flush_lsn in +# flush_lsn reporting to publisher. Without this, subscriber may ack too far, +# losing data on restart because publisher implicitly advances position given +# in START_REPLICATION to the confirmed_flush_lsn of the slot. def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env # use vanilla as publisher to allow writes on it when safekeeper is down @@ -578,7 +578,10 @@ def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg): vanilla_pg.safe_psql("create extension neon;") env.create_branch("subscriber") - sub = env.endpoints.create("subscriber") + # We want all data to fit into shared_buffers because later we stop + # safekeeper and insert more; this shouldn't cause page requests as they + # will be stuck. + sub = env.endpoints.create("subscriber", config_lines=["shared_buffers=128MB"]) sub.start() with vanilla_pg.cursor() as pcur: diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py index 980f6b5694..db8da51125 100644 --- a/test_runner/regress/test_next_xid.py +++ b/test_runner/regress/test_next_xid.py @@ -254,13 +254,13 @@ def advance_multixid_to( # missing. That's OK for our purposes. Autovacuum will print some warnings about the # missing segments, but will clean it up by truncating the SLRUs up to the new value, # closing the gap. 
- segname = "%04X" % MultiXactIdToOffsetSegment(next_multi_xid) + segname = f"{MultiXactIdToOffsetSegment(next_multi_xid):04X}" log.info(f"Creating dummy segment pg_multixact/offsets/{segname}") with open(vanilla_pg.pgdatadir / "pg_multixact" / "offsets" / segname, "w") as of: of.write("\0" * SLRU_PAGES_PER_SEGMENT * BLCKSZ) of.flush() - segname = "%04X" % MXOffsetToMemberSegment(next_multi_offset) + segname = f"{MXOffsetToMemberSegment(next_multi_offset):04X}" log.info(f"Creating dummy segment pg_multixact/members/{segname}") with open(vanilla_pg.pgdatadir / "pg_multixact" / "members" / segname, "w") as of: of.write("\0" * SLRU_PAGES_PER_SEGMENT * BLCKSZ) diff --git a/test_runner/regress/test_role_grants.py b/test_runner/regress/test_role_grants.py new file mode 100644 index 0000000000..b2251875f0 --- /dev/null +++ b/test_runner/regress/test_role_grants.py @@ -0,0 +1,41 @@ +import psycopg2 +from fixtures.neon_fixtures import NeonEnv + + +def test_role_grants(neon_simple_env: NeonEnv): + """basic test for the endpoint that grants permissions for a role against a schema""" + + env = neon_simple_env + + env.create_branch("test_role_grants") + + endpoint = env.endpoints.create_start("test_role_grants") + + endpoint.safe_psql("CREATE DATABASE test_role_grants") + endpoint.safe_psql("CREATE SCHEMA IF NOT EXISTS test_schema", dbname="test_role_grants") + endpoint.safe_psql("CREATE ROLE test_role WITH LOGIN", dbname="test_role_grants") + + # confirm we do not yet have access + pg_conn = endpoint.connect(dbname="test_role_grants", user="test_role") + with pg_conn.cursor() as cur: + try: + cur.execute('CREATE TABLE "test_schema"."test_table" (id integer primary key)') + raise ValueError("create table should not succeed") + except psycopg2.errors.InsufficientPrivilege: + pass + except BaseException as e: + raise e + + client = endpoint.http_client() + res = client.set_role_grants( + "test_role_grants", "test_role", "test_schema", ["CREATE", "USAGE"] + ) + + # confirm we have 
access + with pg_conn.cursor() as cur: + cur.execute('CREATE TABLE "test_schema"."test_table" (id integer primary key)') + cur.execute('INSERT INTO "test_schema"."test_table" (id) VALUES (1)') + cur.execute('SELECT id from "test_schema"."test_table"') + res = cur.fetchall() + + assert res == [(1,)], "select should succeed" diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 1dcc37c407..a4e293da9e 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -576,6 +576,14 @@ def test_storage_controller_compute_hook( env.storage_controller.consistency_check() +NOTIFY_BLOCKED_LOG = ".*Live migration blocked.*" +NOTIFY_FAILURE_LOGS = [ + ".*Failed to notify compute.*", + ".*Reconcile error.*Cancelled", + ".*Reconcile error.*Control plane tenant busy", +] + + def test_storage_controller_stuck_compute_hook( httpserver: HTTPServer, neon_env_builder: NeonEnvBuilder, @@ -620,15 +628,8 @@ def test_storage_controller_stuck_compute_hook( dest_pageserver = env.get_pageserver(dest_ps_id) shard_0_id = TenantShardId(tenant_id, 0, 0) - NOTIFY_BLOCKED_LOG = ".*Live migration blocked.*" - env.storage_controller.allowed_errors.extend( - [ - NOTIFY_BLOCKED_LOG, - ".*Failed to notify compute.*", - ".*Reconcile error.*Cancelled", - ".*Reconcile error.*Control plane tenant busy", - ] - ) + env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG) + env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS) with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: # We expect the controller to hit the 423 (locked) and retry. 
Migration shouldn't complete until that @@ -719,6 +720,114 @@ def test_storage_controller_stuck_compute_hook( env.storage_controller.consistency_check() +@run_only_on_default_postgres("this test doesn't start an endpoint") +def test_storage_controller_compute_hook_revert( + httpserver: HTTPServer, + neon_env_builder: NeonEnvBuilder, + httpserver_listen_address, +): + """ + 'revert' in the sense of a migration which gets reversed shortly after, as may happen during + a rolling upgrade. + + This is a reproducer for https://github.com/neondatabase/neon/issues/9417 + + The buggy behavior was that when the compute hook gave us errors, we assumed our last successfully + sent state was still in effect, so when migrating back to the original pageserver we didn't bother + notifying of that. This is wrong because even a failed request might mutate the state on the server. + """ + + # We will run two pageserver to migrate and check that the storage controller sends notifications + # when migrating. 
+ neon_env_builder.num_pageservers = 2 + (host, port) = httpserver_listen_address + neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + + # Set up fake HTTP notify endpoint + notifications = [] + + handle_params = {"status": 200} + + def handler(request: Request): + status = handle_params["status"] + log.info(f"Notify request[{status}]: {request}") + notifications.append(request.json) + return Response(status=status) + + httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + + # Start running + env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) + tenant_id = env.initial_tenant + tenant_shard_id = TenantShardId(tenant_id, 0, 0) + + pageserver_a = env.get_tenant_pageserver(tenant_id) + pageserver_b = [p for p in env.pageservers if p.id != pageserver_a.id][0] + + def notified_ps(ps_id: int) -> None: + latest = notifications[-1] + log.info(f"Waiting for {ps_id}, have {latest}") + assert latest is not None + assert latest["shards"] is not None + assert latest["shards"][0]["node_id"] == ps_id + + wait_until(30, 1, lambda: notified_ps(pageserver_a.id)) + + env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG) + env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS) + + # Migrate A -> B, and make notifications fail while this is happening + handle_params["status"] = 423 + + with pytest.raises(StorageControllerApiException, match="Timeout waiting for shard"): + # We expect the controller to give us an error because its reconciliation timed out + # waiting for the compute hook. 
+ env.storage_controller.tenant_shard_migrate(tenant_shard_id, pageserver_b.id) + + # Although the migration API failed, the hook should still see pageserver B (it remembers what + # was posted even when returning an error code) + wait_until(30, 1, lambda: notified_ps(pageserver_b.id)) + + # Although the migration API failed, the tenant should still have moved to the right pageserver + assert len(pageserver_b.http_client().tenant_list()) == 1 + + # Before we clear the failure on the migration hook, we need the controller to give up + # trying to notify about B -- the bug case we're reproducing is when the controller + # _never_ successfully notified for B, then tries to notify for A. + # + # The controller will give up notifying if the origin of a migration becomes unavailable. + pageserver_a.stop() + + # Preempt heartbeats for a faster test + env.storage_controller.node_configure(pageserver_a.id, {"availability": "Offline"}) + + def logged_giving_up(): + env.storage_controller.assert_log_contains(".*Giving up on compute notification.*") + + wait_until(30, 1, logged_giving_up) + + pageserver_a.start() + + # Preempt heartbeats for determinism + env.storage_controller.node_configure(pageserver_a.id, {"availability": "Active"}) + # Starting node will prompt a reconcile to clean up old AttachedStale location, for a deterministic test + # we want that complete before we start our migration. Tolerate failure because our compute hook is + # still configured to fail + try: + env.storage_controller.reconcile_all() + except StorageControllerApiException as e: + # This exception _might_ be raised: it depends if our reconcile_all hit the on-node-activation + # Reconciler lifetime or ran after it already completed. + log.info(f"Expected error from reconcile_all: {e}") + + # Migrate B -> A, with a working compute hook: the controller should notify the hook because the + # last update it made that was acked (423) by the compute was for node B. 
+ handle_params["status"] = 200 + env.storage_controller.tenant_shard_migrate(tenant_shard_id, pageserver_a.id) + + wait_until(30, 1, lambda: notified_ps(pageserver_a.id)) + + def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): """ Verify that occasional-use debug APIs work as expected. This is a lightweight test diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 294c1248c5..f486327445 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -20,6 +20,7 @@ from fixtures.pageserver.utils import ( ) from fixtures.remote_storage import RemoteStorageKind, s3_storage from fixtures.utils import run_pg_bench_small, wait_until +from fixtures.workload import Workload from requests.exceptions import ReadTimeout from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -404,3 +405,57 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, make_httpserver, neon_env_builder cloud_admin_api_token=cloud_admin_token, ) assert healthy + + +def test_tenant_delete_stale_shards(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + """ + Deleting a tenant should also delete any stale (pre-split) shards from remote storage. + """ + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + env = neon_env_builder.init_start() + + # Create an unsharded tenant. + tenant_id, timeline_id = env.create_tenant() + + # Write some data. + workload = Workload(env, tenant_id, timeline_id, branch_name="main") + workload.init() + workload.write_rows(256) + workload.validate() + + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join(("tenants", str(tenant_id))), + ) + + # Upload a heatmap as well. + env.pageserver.http_client().tenant_heatmap_upload(tenant_id) + + # Split off a few shards, in two rounds. 
+ env.storage_controller.tenant_shard_split(tenant_id, shard_count=4) + env.storage_controller.tenant_shard_split(tenant_id, shard_count=16) + + # Delete the tenant. This should also delete data for the unsharded and count=4 parents. + env.storage_controller.pageserver_api().tenant_delete(tenant_id=tenant_id) + + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join(("tenants", str(tenant_id))), + delimiter="", # match partial prefixes, i.e. all shards + ) + + dirs = list(env.pageserver.tenant_dir(None).glob(f"{tenant_id}*")) + assert dirs == [], f"found tenant directories: {dirs}" + + # The initial tenant created by the test harness should still be there. + # Only the tenant we deleted should be removed. + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join(("tenants", str(env.initial_tenant))), + ) + dirs = list(env.pageserver.tenant_dir(None).glob(f"{env.initial_tenant}*")) + assert dirs != [], "missing initial tenant directory" + + env.stop() diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 9ea09d10d7..b41f1709bd 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -479,9 +479,9 @@ def assert_size_approx_equal(size_a, size_b): """ # Determined empirically from examples of equality failures: they differ - # by page multiples of 8272, and usually by 1-3 pages. Tolerate 4 to avoid + # by page multiples of 8272, and usually by 1-3 pages. Tolerate 6 to avoid # failing on outliers from that observed range. 
- threshold = 4 * 8272 + threshold = 6 * 8272 assert size_a == pytest.approx(size_b, abs=threshold) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 95dc0fec78..03cb79fc1d 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -2,6 +2,7 @@ from __future__ import annotations import concurrent.futures import os +import threading import time from contextlib import closing from datetime import datetime @@ -10,7 +11,7 @@ from pathlib import Path import pytest import requests -from fixtures.common_types import Lsn, TenantId +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.metrics import ( PAGESERVER_GLOBAL_METRICS, @@ -18,6 +19,7 @@ from fixtures.metrics import ( parse_metrics, ) from fixtures.neon_fixtures import ( + Endpoint, NeonEnv, NeonEnvBuilder, wait_for_last_flush_lsn, @@ -476,3 +478,38 @@ def test_pageserver_metrics_many_relations(neon_env_builder: NeonEnvBuilder): assert counts log.info(f"directory counts: {counts}") assert counts[2] > COUNT_AT_LEAST_EXPECTED + + +def test_timelines_parallel_endpoints(neon_simple_env: NeonEnv): + """ + (Relaxed) regression test for issue that led to https://github.com/neondatabase/neon/pull/9268 + Create many endpoints in parallel and then restart them + """ + env = neon_simple_env + + # This param needs to be 200+ to reproduce the limit issue + n_threads = 16 + barrier = threading.Barrier(n_threads) + + def test_timeline(branch_name: str, timeline_id: TimelineId, endpoint: Endpoint): + endpoint.start() + endpoint.stop() + # Use a barrier to make sure we restart endpoints at the same time + barrier.wait() + endpoint.start() + + workers = [] + + for i in range(0, n_threads): + branch_name = f"branch_{i}" + timeline_id = env.create_branch(branch_name) + endpoint = env.endpoints.create(branch_name) + w = threading.Thread(target=test_timeline, args=[branch_name, timeline_id, endpoint]) + 
workers.append(w) + + # Only start the restarts once we're done creating all timelines & endpoints + for w in workers: + w.start() + + for w in workers: + w.join() diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 841707d32e..85e1077fd5 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -6,6 +6,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, ) from fixtures.pageserver.http import PageserverApiException +from fixtures.utils import wait_until @pytest.mark.parametrize("shard_count", [0, 4]) @@ -114,3 +115,107 @@ def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int): leaf_timeline_id, state=TimelineArchivalState.UNARCHIVED, ) + + +@pytest.mark.parametrize("manual_offload", [False, True]) +def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: bool): + if not manual_offload: + # (automatic) timeline offloading defaults to false for now + neon_env_builder.pageserver_config_override = "timeline_offloading = true" + + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + + # Turn off gc and compaction loops: we want to issue them manually for better reliability + tenant_id, initial_timeline_id = env.create_tenant( + conf={ + "gc_period": "0s", + "compaction_period": "0s" if manual_offload else "1s", + } + ) + + # Create two branches and archive them + parent_timeline_id = env.create_branch("test_ancestor_branch_archive_parent", tenant_id) + leaf_timeline_id = env.create_branch( + "test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent" + ) + + with env.endpoints.create_start( + "test_ancestor_branch_archive_branch1", tenant_id=tenant_id + ) as endpoint: + endpoint.safe_psql_many( + [ + "CREATE TABLE foo(key serial primary key, t text default 'data_content')", + "INSERT INTO foo SELECT FROM generate_series(1,1000)", + ] + ) + sum = 
endpoint.safe_psql("SELECT sum(key) from foo where key > 50") + + ps_http.timeline_archival_config( + tenant_id, + leaf_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + leaf_detail = ps_http.timeline_detail( + tenant_id, + leaf_timeline_id, + ) + assert leaf_detail["is_archived"] is True + + ps_http.timeline_archival_config( + tenant_id, + parent_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + + def timeline_offloaded(timeline_id: TimelineId) -> bool: + return ( + env.pageserver.log_contains(f".*{timeline_id}.* offloading archived timeline.*") + is not None + ) + + if manual_offload: + with pytest.raises( + PageserverApiException, + match="timeline has attached children", + ): + # This only tests the (made for testing only) http handler, + # but still demonstrates the constraints we have. + ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=parent_timeline_id) + + def parent_offloaded(): + if manual_offload: + ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=parent_timeline_id) + assert timeline_offloaded(parent_timeline_id) + + def leaf_offloaded(): + if manual_offload: + ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=leaf_timeline_id) + assert timeline_offloaded(leaf_timeline_id) + + wait_until(30, 1, leaf_offloaded) + wait_until(30, 1, parent_offloaded) + + ps_http.timeline_archival_config( + tenant_id, + parent_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + ps_http.timeline_archival_config( + tenant_id, + leaf_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + leaf_detail = ps_http.timeline_detail( + tenant_id, + leaf_timeline_id, + ) + assert leaf_detail["is_archived"] is False + + with env.endpoints.create_start( + "test_ancestor_branch_archive_branch1", tenant_id=tenant_id + ) as endpoint: + sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key > 50") + assert sum == sum_again + + assert not timeline_offloaded(initial_timeline_id) diff --git 
a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 306f22acf9..155709e106 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -649,7 +649,7 @@ def test_timeline_delete_works_for_remote_smoke( env = neon_env_builder.init_start() ps_http = env.pageserver.http_client() - pg = env.endpoints.create_start("main") + env.endpoints.create_start("main") tenant_id = env.initial_tenant timeline_id = env.initial_timeline diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 0a90b6b6f7..28c51b8ac1 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -32,7 +32,6 @@ deranged = { version = "0.3", default-features = false, features = ["powerfmt", digest = { version = "0.10", features = ["mac", "oid", "std"] } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } -futures = { version = "0.3" } futures-channel = { version = "0.3", features = ["sink"] } futures-executor = { version = "0.3" } futures-io = { version = "0.3" } @@ -46,8 +45,9 @@ hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper-582f2526e08bb6a0 = { package = "hyper", version = "0.14", features = ["full"] } hyper-dff4ba8e3ae991db = { package = "hyper", version = "1", features = ["full"] } hyper-util = { version = "0.1", features = ["client-legacy", "server-auto", "service"] } -indexmap = { version = "1", default-features = false, features = ["std"] } -itertools = { version = "0.12" } +indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } +indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } +itertools = { version = "0.10" } lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", 
default-features = false, features = ["std"] } @@ -65,6 +65,8 @@ regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "stream"] } +rustls = { version = "0.23", features = ["ring"] } +rustls-webpki = { version = "0.102", default-features = false, features = ["aws_lc_rs", "ring", "std"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["alloc", "raw_value"] } @@ -78,6 +80,7 @@ tikv-jemalloc-sys = { version = "0.5" } time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2", features = ["with-serde_json-1"] } +tokio-rustls = { version = "0.26", features = ["ring"] } tokio-stream = { version = "0.1", features = ["net"] } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } toml_edit = { version = "0.22", features = ["serde"] } @@ -101,8 +104,9 @@ either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } half = { version = "2", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.14", features = ["raw"] } -indexmap = { version = "1", default-features = false, features = ["std"] } -itertools = { version = "0.12" } +indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } +indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } +itertools = { version = "0.10" } 
libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } @@ -120,8 +124,7 @@ regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } serde = { version = "1", features = ["alloc", "derive"] } -syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] } -syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } +syn = { version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] } toml_edit = { version = "0.22", features = ["serde"] } zstd = { version = "0.13" }