diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index 11adc8df86..2bdb727719 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -183,7 +183,7 @@ runs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - name: Store Allure test stat in the DB (new) if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }} diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 4008cd0d36..037b9aeb1e 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -88,7 +88,7 @@ runs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - name: Install Python deps shell: bash -euxo pipefail {0} @@ -218,6 +218,9 @@ runs: name: compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }} # Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test path: /tmp/test_output/compatibility_snapshot_pg${{ inputs.pg_version }}/ + # The lack of compatibility snapshot shouldn't fail the job + # (for example if we didn't run the test for non build-and-test workflow) + skip-if-does-not-exist: true - name: Upload test results if: ${{ !cancelled() }} diff --git a/.github/actions/upload/action.yml b/.github/actions/upload/action.yml index edcece7d2b..8a4cfe2eff 100644 --- a/.github/actions/upload/action.yml +++ b/.github/actions/upload/action.yml @@ -7,6 +7,10 @@ 
inputs: path: description: "A directory or file to upload" required: true + skip-if-does-not-exist: + description: "Allow to skip if path doesn't exist, fail otherwise" + default: false + required: false prefix: description: "S3 prefix. Default is '${GITHUB_SHA}/${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" required: false @@ -15,10 +19,12 @@ runs: using: "composite" steps: - name: Prepare artifact + id: prepare-artifact shell: bash -euxo pipefail {0} env: SOURCE: ${{ inputs.path }} ARCHIVE: /tmp/uploads/${{ inputs.name }}.tar.zst + SKIP_IF_DOES_NOT_EXIST: ${{ inputs.skip-if-does-not-exist }} run: | mkdir -p $(dirname $ARCHIVE) @@ -33,14 +39,22 @@ runs: elif [ -f ${SOURCE} ]; then time tar -cf ${ARCHIVE} --zstd ${SOURCE} elif ! ls ${SOURCE} > /dev/null 2>&1; then - echo >&2 "${SOURCE} does not exist" - exit 2 + if [ "${SKIP_IF_DOES_NOT_EXIST}" = "true" ]; then + echo 'SKIPPED=true' >> $GITHUB_OUTPUT + exit 0 + else + echo >&2 "${SOURCE} does not exist" + exit 2 + fi else echo >&2 "${SOURCE} is neither a directory nor a file, do not know how to handle it" exit 3 fi + echo 'SKIPPED=false' >> $GITHUB_OUTPUT + - name: Upload artifact + if: ${{ steps.prepare-artifact.outputs.SKIPPED == 'false' }} shell: bash -euxo pipefail {0} env: SOURCE: ${{ inputs.path }} diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 5fc6aa247a..3aa671fab1 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -124,28 +124,28 @@ jobs: uses: actions/cache@v4 with: path: pg_install/v14 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Cache postgres v15 build id: cache_pg_15 uses: 
actions/cache@v4 with: path: pg_install/v15 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Cache postgres v16 build id: cache_pg_16 uses: actions/cache@v4 with: path: pg_install/v16 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Cache postgres v17 build id: cache_pg_17 uses: actions/cache@v4 with: path: pg_install/v17 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index ca5ff573e1..0f05276579 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -19,9 +19,16 @@ defaults: run: shell: bash -euo pipefail {0} -concurrency: - group: build-build-tools-image-${{ inputs.image-tag }} - cancel-in-progress: false +# The initial idea was to prevent the waste of resources by not re-building the `build-tools` image +# for the same tag in parallel workflow runs, and queue them to be skipped once we have +# the first image pushed to Docker registry, 
but GitHub's concurrency mechanism is not working as expected. +# GitHub can't have more than 1 job in a queue and removes the previous one, which causes failures in the dependent jobs. +# +# Ref https://github.com/orgs/community/discussions/41518 +# +# concurrency: +# group: build-build-tools-image-${{ inputs.image-tag }} +# cancel-in-progress: false # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} @@ -36,6 +43,7 @@ jobs: strategy: matrix: + debian-version: [ bullseye, bookworm ] arch: [ x64, arm64 ] runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} @@ -74,22 +82,22 @@ jobs: - uses: docker/build-push-action@v6 with: + file: Dockerfile.build-tools context: . provenance: false push: true pull: true - file: Dockerfile.build-tools - cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0},mode=max', matrix.arch) || '' }} - tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }} + build-args: | + DEBIAN_VERSION=${{ matrix.debian-version }} + cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.debian-version }}-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0}-{1},mode=max', matrix.debian-version, matrix.arch) || '' }} + tags: | + neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.debian-version }}-${{ matrix.arch }} merge-images: needs: [ build-image ] runs-on: ubuntu-22.04 - env: - IMAGE_TAG: ${{ inputs.image-tag }} - steps: - uses: docker/login-action@v3 with: @@ -97,7 +105,17 @@ jobs: password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - name: Create multi-arch image + env: + DEFAULT_DEBIAN_VERSION: bullseye + IMAGE_TAG: ${{ inputs.image-tag }} run: | - docker
buildx imagetools create -t neondatabase/build-tools:${IMAGE_TAG} \ - neondatabase/build-tools:${IMAGE_TAG}-x64 \ - neondatabase/build-tools:${IMAGE_TAG}-arm64 + for debian_version in bullseye bookworm; do + tags=("-t" "neondatabase/build-tools:${IMAGE_TAG}-${debian_version}") + if [ "${debian_version}" == "${DEFAULT_DEBIAN_VERSION}" ]; then + tags+=("-t" "neondatabase/build-tools:${IMAGE_TAG}") + fi + + docker buildx imagetools create "${tags[@]}" \ + neondatabase/build-tools:${IMAGE_TAG}-${debian_version}-x64 \ + neondatabase/build-tools:${IMAGE_TAG}-${debian_version}-arm64 + done diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a759efb56c..b669eaeb11 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -92,7 +92,7 @@ jobs: needs: [ check-permissions, build-build-tools-image ] runs-on: [ self-hosted, small ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -106,7 +106,7 @@ jobs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - name: Install Python deps run: ./scripts/pysync @@ -120,6 +120,24 @@ jobs: - name: Run mypy to check types run: poetry run mypy . 
+ check-codestyle-jsonnet: + needs: [ check-permissions, build-build-tools-image ] + runs-on: [ self-hosted, small ] + container: + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Check Jsonnet code formatting + run: | + make -C compute jsonnetfmt-test + # Check that the vendor/postgres-* submodules point to the # corresponding REL_*_STABLE_neon branches. check-submodules: @@ -181,7 +199,7 @@ jobs: runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -193,16 +211,15 @@ jobs: with: submodules: true -# Disabled for now -# - name: Restore cargo deps cache -# id: cache_cargo -# uses: actions/cache@v4 -# with: -# path: | -# !~/.cargo/registry/src -# ~/.cargo/git/ -# target/ -# key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} + - name: Cache cargo deps + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + !~/.cargo/registry/src + ~/.cargo/git + target + key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust # Some of our rust modules use FFI and need those to be checked - name: Get postgres headers @@ -262,7 +279,7 @@ jobs: uses: ./.github/workflows/_build-and-test-locally.yml with: arch: ${{ matrix.arch }} - build-tools-image: ${{ needs.build-build-tools-image.outputs.image }} + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm build-tag: ${{ 
needs.tag.outputs.build-tag }} build-type: ${{ matrix.build-type }} # Run tests on all Postgres versions in release builds and only on the latest version in debug builds @@ -277,7 +294,7 @@ jobs: needs: [ check-permissions, build-build-tools-image ] runs-on: [ self-hosted, small ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -290,7 +307,7 @@ jobs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v1-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - name: Install Python deps run: ./scripts/pysync @@ -310,7 +327,7 @@ jobs: needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ] runs-on: [ self-hosted, small ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -368,7 +385,7 @@ jobs: runs-on: [ self-hosted, small ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -416,7 +433,7 @@ jobs: needs: [ check-permissions, build-build-tools-image, build-and-test-locally ] runs-on: [ self-hosted, small ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -560,15 +577,16 @@ jobs: 
ADDITIONAL_RUSTFLAGS=${{ matrix.arch == 'arm64' && '-Ctarget-feature=+lse -Ctarget-cpu=neoverse-n1' || '' }} GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} - TAG=${{ needs.build-build-tools-image.outputs.image-tag }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-bookworm + DEBIAN_VERSION=bookworm provenance: false push: true pull: true file: Dockerfile - cache-from: type=registry,ref=cache.neon.build/neon:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon:cache-{0},mode=max', matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/neon:cache-bookworm-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon:cache-{0}-{1},mode=max', 'bookworm', matrix.arch) || '' }} tags: | - neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-${{ matrix.arch }} neon-image: needs: [ neon-image-arch, tag ] @@ -583,8 +601,9 @@ jobs: - name: Create multi-arch image run: | docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \ - neondatabase/neon:${{ needs.tag.outputs.build-tag }}-x64 \ - neondatabase/neon:${{ needs.tag.outputs.build-tag }}-arm64 + -t neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm \ + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-x64 \ + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-arm64 - uses: docker/login-action@v3 with: @@ -605,17 +624,16 @@ jobs: version: # Much data was already generated on old PG versions with bullseye's # libraries, the locales of which can cause data incompatibilities. - # However, new PG versions should check if they can be built on newer - # images, as that reduces the support burden of old and ancient - # distros. 
+ # However, new PG versions should be built on newer images, + # as that reduces the support burden of old and ancient distros. - pg: v14 - debian: bullseye-slim + debian: bullseye - pg: v15 - debian: bullseye-slim + debian: bullseye - pg: v16 - debian: bullseye-slim + debian: bullseye - pg: v17 - debian: bookworm-slim + debian: bookworm arch: [ x64, arm64 ] runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} @@ -660,16 +678,16 @@ jobs: GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} PG_VERSION=${{ matrix.version.pg }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} - TAG=${{ needs.build-build-tools-image.outputs.image-tag }} - DEBIAN_FLAVOR=${{ matrix.version.debian }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} + DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false push: true pull: true file: compute/Dockerfile.compute-node - cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1},mode=max', matrix.version.pg, matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | - neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} - name: Build neon extensions test image if: matrix.version.pg == 'v16' @@ -680,17 +698,17 @@ jobs: GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha
}} PG_VERSION=${{ matrix.version.pg }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} - TAG=${{ needs.build-build-tools-image.outputs.image-tag }} - DEBIAN_FLAVOR=${{ matrix.version.debian }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} + DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false push: true pull: true file: compute/Dockerfile.compute-node target: neon-pg-ext-test - cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version.pg, matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | - neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }} + neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} - name: Build compute-tools image # compute-tools are Postgres independent, so build it only once @@ -705,14 +723,16 @@ jobs: build-args: | GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} - TAG=${{ needs.build-build-tools-image.outputs.image-tag }} - DEBIAN_FLAVOR=${{ matrix.version.debian }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} + DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false push: true pull: true file: compute/Dockerfile.compute-node + cache-from: type=registry,ref=cache.neon.build/compute-tools-${{
matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} compute-node-image: needs: [ compute-node-image-arch, tag ] @@ -720,7 +740,16 @@ jobs: strategy: matrix: - version: [ v14, v15, v16, v17 ] + version: + # see the comment for `compute-node-image-arch` job + - pg: v14 + debian: bullseye + - pg: v15 + debian: bullseye + - pg: v16 + debian: bullseye + - pg: v17 + debian: bookworm steps: - uses: docker/login-action@v3 @@ -730,23 +759,26 @@ jobs: - name: Create multi-arch compute-node image run: | - docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ - neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \ - neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64 + docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ + -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - name: Create multi-arch neon-test-extensions image - if: matrix.version == 'v16' + if: matrix.version.pg == 'v16' run: | - docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ - 
neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \ - neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64 + docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ + -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ + neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ + neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - name: Create multi-arch compute-tools image - if: matrix.version == 'v17' + if: matrix.version.pg == 'v16' run: | docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \ - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \ - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-arm64 + -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - uses: docker/login-action@v3 with: @@ -754,13 +786,13 @@ jobs: username: ${{ secrets.AWS_ACCESS_KEY_DEV }} password: ${{ secrets.AWS_SECRET_KEY_DEV }} - - name: Push multi-arch compute-node-${{ matrix.version }} image to ECR + - name: Push multi-arch compute-node-${{ matrix.version.pg }} image to ECR run: | - docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ - neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + docker buildx imagetools create -t 
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} - name: Push multi-arch compute-tools image to ECR - if: matrix.version == 'v17' + if: matrix.version.pg == 'v16' run: | docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \ neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} @@ -771,7 +803,16 @@ jobs: strategy: fail-fast: false matrix: - version: [ v14, v15, v16, v17 ] + version: + # see the comment for `compute-node-image-arch` job + - pg: v14 + debian: bullseye + - pg: v15 + debian: bullseye + - pg: v16 + debian: bullseye + - pg: v17 + debian: bookworm env: VM_BUILDER_VERSION: v0.35.0 @@ -793,18 +834,18 @@ jobs: # it won't have the proper authentication (written at v0.6.0) - name: Pulling compute-node image run: | - docker pull neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + docker pull neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} - name: Build vm image run: | ./vm-builder \ - -spec=compute/vm-image-spec.yaml \ - -src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ - -dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + -spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \ + -src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ + -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} - name: Pushing vm-compute-node image run: | - docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} test-images: needs: [ check-permissions, tag, 
neon-image, compute-node-image ] @@ -1059,7 +1100,6 @@ jobs: run: | if [[ "$GITHUB_REF_NAME" == "main" ]]; then gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false - gh workflow --repo neondatabase/azure run deploy.yml -f dockerTag=${{needs.tag.outputs.build-tag}} elif [[ "$GITHUB_REF_NAME" == "release" ]]; then gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \ -f deployPgSniRouter=false \ diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 140aac032a..287c9ea281 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -155,7 +155,7 @@ jobs: github.ref_name == 'main' runs-on: [ self-hosted, large ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml index 23a2e3876c..df40b5beda 100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -55,7 +55,7 @@ jobs: runs-on: ubuntu-22.04 container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -150,7 +150,7 @@ jobs: runs-on: ubuntu-22.04 container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index 2e79498fc4..c196d07d3e 
100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -71,7 +71,6 @@ jobs: steps: - uses: docker/login-action@v3 - with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -94,8 +93,22 @@ jobs: az acr login --name=neoneastus2 - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR + env: + DEFAULT_DEBIAN_VERSION: bullseye run: | - docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \ - -t neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG} \ - -t neondatabase/build-tools:${TO_TAG} \ - neondatabase/build-tools:${FROM_TAG} + for debian_version in bullseye bookworm; do + tags=() + + tags+=("-t" "neondatabase/build-tools:${TO_TAG}-${debian_version}") + tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}-${debian_version}") + tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}-${debian_version}") + + if [ "${debian_version}" == "${DEFAULT_DEBIAN_VERSION}" ]; then + tags+=("-t" "neondatabase/build-tools:${TO_TAG}") + tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}") + tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}") + fi + + docker buildx imagetools create "${tags[@]}" \ + neondatabase/build-tools:${FROM_TAG}-${debian_version} + done diff --git a/.github/workflows/report-workflow-stats.yml b/.github/workflows/report-workflow-stats.yml index 1afe896600..6abeff7695 100644 --- a/.github/workflows/report-workflow-stats.yml +++ b/.github/workflows/report-workflow-stats.yml @@ -33,7 +33,7 @@ jobs: actions: read steps: - name: Export GH Workflow Stats - uses: fedordikarev/gh-workflow-stats-action@v0.1.2 + uses: neondatabase/gh-workflow-stats-action@v0.1.4 with: DB_URI: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} DB_TABLE: "gh_workflow_stats_neon" diff --git a/CODEOWNERS b/CODEOWNERS 
index 606dbb4e22..f8ed4be816 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,5 +1,6 @@ /compute_tools/ @neondatabase/control-plane @neondatabase/compute /storage_controller @neondatabase/storage +/storage_scrubber @neondatabase/storage /libs/pageserver_api/ @neondatabase/storage /libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage /libs/remote_storage/ @neondatabase/storage diff --git a/Cargo.lock b/Cargo.lock index 5edf5cf7b4..6b212bac2e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2695,6 +2695,7 @@ checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e" dependencies = [ "equivalent", "hashbrown 0.14.5", + "serde", ] [[package]] @@ -2794,9 +2795,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.6" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "jobserver" @@ -4296,6 +4297,7 @@ dependencies = [ "indexmap 2.0.1", "ipnet", "itertools 0.10.5", + "itoa", "jose-jwa", "jose-jwk", "lasso", @@ -4646,9 +4648,10 @@ dependencies = [ "camino-tempfile", "futures", "futures-util", + "http-body-util", "http-types", "humantime-serde", - "hyper 0.14.30", + "hyper 1.4.1", "itertools 0.10.5", "metrics", "once_cell", @@ -7307,6 +7310,7 @@ dependencies = [ "hyper 1.4.1", "hyper-util", "indexmap 1.9.3", + "indexmap 2.0.1", "itertools 0.12.1", "lazy_static", "libc", diff --git a/Cargo.toml b/Cargo.toml index dde80f5020..a1a974b33b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -107,6 +107,7 @@ indexmap = "2" indoc = "2" ipnet = "2.9.0" itertools = "0.10" +itoa = "1.0.11" jsonwebtoken = "9" lasso = "0.7" libc = "0.2" diff --git a/Dockerfile b/Dockerfile index bdb76a4f4f..785dd4598e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,6 +7,8 @@ ARG IMAGE=build-tools ARG TAG=pinned ARG DEFAULT_PG_VERSION=17 ARG STABLE_PG_VERSION=16 +ARG 
DEBIAN_VERSION=bullseye +ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim # Build Postgres FROM $REPOSITORY/$IMAGE:$TAG AS pg-build @@ -57,7 +59,7 @@ RUN set -e \ # Build final image # -FROM debian:bullseye-slim +FROM debian:${DEBIAN_FLAVOR} ARG DEFAULT_PG_VERSION WORKDIR /data diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index d8bcacf228..7cba1c8635 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -1,12 +1,7 @@ -FROM debian:bullseye-slim +ARG DEBIAN_VERSION=bullseye -# Use ARG as a build-time environment variable here to allow. -# It's not supposed to be set outside. -# Alternatively it can be obtained using the following command -# ``` -# . /etc/os-release && echo "${VERSION_CODENAME}" -# ``` -ARG DEBIAN_VERSION_CODENAME=bullseye +FROM debian:${DEBIAN_VERSION}-slim +ARG DEBIAN_VERSION # Add nonroot user RUN useradd -ms /bin/bash nonroot -b /home @@ -32,6 +27,7 @@ RUN set -e \ gnupg \ gzip \ jq \ + jsonnet \ libcurl4-openssl-dev \ libbz2-dev \ libffi-dev \ @@ -42,14 +38,14 @@ RUN set -e \ libseccomp-dev \ libsqlite3-dev \ libssl-dev \ - libstdc++-10-dev \ + $([ "${DEBIAN_VERSION}" = "bullseye" ] && echo libstdc++-10-dev || echo libstdc++-11-dev) \ libtool \ libxml2-dev \ libxmlsec1-dev \ libxxhash-dev \ lsof \ make \ - netcat \ + netcat-openbsd \ net-tools \ openssh-client \ parallel \ @@ -78,7 +74,7 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/ # LLVM ENV LLVM_VERSION=18 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ - && echo "deb http://apt.llvm.org/${DEBIAN_VERSION_CODENAME}/ llvm-toolchain-${DEBIAN_VERSION_CODENAME}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ + && echo "deb http://apt.llvm.org/${DEBIAN_VERSION}/ llvm-toolchain-${DEBIAN_VERSION}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ && apt update \ && apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \ && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION}
/usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \ @@ -86,7 +82,7 @@ RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ # Install docker RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \ - && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION_CODENAME} stable" > /etc/apt/sources.list.d/docker.list \ + && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION} stable" > /etc/apt/sources.list.d/docker.list \ && apt update \ && apt install -y docker-ce docker-ce-cli \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* diff --git a/Makefile b/Makefile index 5e227ed3f5..33cfda2661 100644 --- a/Makefile +++ b/Makefile @@ -291,6 +291,7 @@ postgres-check: \ # This doesn't remove the effects of 'configure'. 
.PHONY: clean clean: postgres-clean neon-pg-clean-ext + $(MAKE) -C compute clean $(CARGO_CMD_PREFIX) cargo clean # This removes everything diff --git a/README.md b/README.md index cfc63b4708..e68ef70bdf 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati ```bash apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \ libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \ -libcurl4-openssl-dev openssl python3-poetry lsof libicu-dev +libprotobuf-dev libcurl4-openssl-dev openssl python3-poetry lsof libicu-dev ``` * On Fedora, these packages are needed: ```bash diff --git a/compute/.gitignore b/compute/.gitignore new file mode 100644 index 0000000000..70980d335a --- /dev/null +++ b/compute/.gitignore @@ -0,0 +1,5 @@ +# sql_exporter config files generated from Jsonnet +etc/neon_collector.yml +etc/neon_collector_autoscaling.yml +etc/sql_exporter.yml +etc/sql_exporter_autoscaling.yml diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index 15afb9897f..b0ce7c1718 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -3,7 +3,8 @@ ARG REPOSITORY=neondatabase ARG IMAGE=build-tools ARG TAG=pinned ARG BUILD_TAG -ARG DEBIAN_FLAVOR=bullseye-slim +ARG DEBIAN_VERSION=bullseye +ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim ######################################################################################### # @@ -11,19 +12,23 @@ ARG DEBIAN_FLAVOR=bullseye-slim # ######################################################################################### FROM debian:$DEBIAN_FLAVOR AS build-deps -ARG DEBIAN_FLAVOR +ARG DEBIAN_VERSION -RUN case $DEBIAN_FLAVOR in \ +RUN case $DEBIAN_VERSION in \ # Version-specific installs for Bullseye (PG14-PG16): # The h3_pg extension needs a cmake 3.20+, but Debian bullseye has 3.18. # Install newer version (3.25) from backports. 
- bullseye*) \ + # libstdc++-10-dev is required for plv8 + bullseye) \ echo "deb http://deb.debian.org/debian bullseye-backports main" > /etc/apt/sources.list.d/bullseye-backports.list; \ - VERSION_INSTALLS="cmake/bullseye-backports cmake-data/bullseye-backports"; \ + VERSION_INSTALLS="cmake/bullseye-backports cmake-data/bullseye-backports libstdc++-10-dev"; \ ;; \ # Version-specific installs for Bookworm (PG17): - bookworm*) \ - VERSION_INSTALLS="cmake"; \ + bookworm) \ + VERSION_INSTALLS="cmake libstdc++-12-dev"; \ + ;; \ + *) \ + echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \ ;; \ esac && \ apt update && \ @@ -223,18 +228,33 @@ FROM build-deps AS plv8-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - apt update && \ +RUN apt update && \ apt install --no-install-recommends -y ninja-build python3-dev libncurses5 binutils clang -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ +# plv8 3.2.3 supports v17 +# last release v3.2.3 - Sep 7, 2024 +# +# clone the repo instead of downloading the release tarball because plv8 has submodule dependencies +# and the release tarball doesn't include them +# +# Use new version only for v17 +# because since v3.2, plv8 doesn't include plcoffee and plls extensions +ENV PLV8_TAG=v3.2.3 + +RUN case "${PG_VERSION}" in \ + "v17") \ + export PLV8_TAG=v3.2.3 \ + ;; \ + "v14" | "v15" | "v16") \ + export PLV8_TAG=v3.1.10 \ + ;; \ + *) \ + echo "unexpected PostgreSQL version" && exit 1 \ + ;; \ esac && \ - wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \ - echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \ - mkdir plv8-src && cd plv8-src && tar xzf ../plv8.tar.gz --strip-components=1 -C . 
&& \ + git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \ + tar -czf plv8.tar.gz --exclude .git plv8-src && \ + cd plv8-src && \ # generate and copy upgrade scripts mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \ cp upgrade/* /usr/local/pgsql/share/extension/ && \ @@ -244,8 +264,17 @@ RUN case "${PG_VERSION}" in "v17") \ find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \ # don't break computes with installed old version of plv8 cd /usr/local/pgsql/lib/ && \ - ln -s plv8-3.1.10.so plv8-3.1.5.so && \ - ln -s plv8-3.1.10.so plv8-3.1.8.so && \ + case "${PG_VERSION}" in \ + "v17") \ + ln -s plv8-3.2.3.so plv8-3.1.8.so && \ + ln -s plv8-3.2.3.so plv8-3.1.5.so && \ + ln -s plv8-3.2.3.so plv8-3.1.10.so \ + ;; \ + "v14" | "v15" | "v16") \ + ln -s plv8-3.1.10.so plv8-3.1.5.so && \ + ln -s plv8-3.1.10.so plv8-3.1.8.so \ + ;; \ + esac && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control @@ -323,6 +352,9 @@ COPY compute/patches/pgvector.patch /pgvector.patch # By default, pgvector Makefile uses `-march=native`. We don't want that, # because we build the images on different machines than where we run them. # Pass OPTFLAGS="" to remove it. +# +# v17 is not supported yet because of upstream issue +# https://github.com/pgvector/pgvector/issues/669 RUN case "${PG_VERSION}" in "v17") \ echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ esac && \ @@ -345,7 +377,7 @@ ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific -# doesn't use releases, last commit f3d82fd - Mar 2, 2023 +# doesn't use releases, last commit f3d82fd - Mar 2, 2023 RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71c20689ce3d.tar.gz -O pgjwt.tar.gz && \ echo "dae8ed99eebb7593b43013f6532d772b12dfecd55548d2673f2dfd0163f6d2b9 pgjwt.tar.gz" | sha256sum --check && \ mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \ @@ -362,11 +394,10 @@ FROM build-deps AS hypopg-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \ - echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \ +# HypoPG 1.4.1 supports v17 +# last release 1.4.1 - Apr 28, 2024 +RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypopg.tar.gz && \ + echo "9afe6357fd389d8d33fad81703038ce520b09275ec00153c6c89282bcdedd6bc hypopg.tar.gz" | sha256sum --check && \ mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -403,6 +434,9 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY compute/patches/rum.patch /rum.patch +# maybe version-specific +# support for v17 is unknown +# last release 1.3.13 - Sep 19, 2022 RUN case "${PG_VERSION}" in "v17") \ echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ esac && \ @@ -424,11 +458,10 @@ FROM build-deps AS pgtap-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \ - echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \ +# pgtap 1.3.3 supports v17 +# last release v1.3.3 - Apr 8, 2024 +RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgtap.tar.gz && \ + echo "325ea79d0d2515bce96bce43f6823dcd3effbd6c54cb2a4d6c2384fffa3a14c7 pgtap.tar.gz" | sha256sum --check && \ mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -501,11 +534,10 @@ FROM build-deps AS plpgsql-check-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \ - echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \ +# plpgsql_check v2.7.11 supports v17 +# last release v2.7.11 - Sep 16, 2024 +RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz -O plpgsql_check.tar.gz && \ + echo "208933f8dbe8e0d2628eb3851e9f52e6892b8e280c63700c0f1ce7883625d172 plpgsql_check.tar.gz" | sha256sum --check && \ mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ @@ -523,18 +555,19 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - case "${PG_VERSION}" in \ +RUN case "${PG_VERSION}" in \ "v14" | "v15") \ export TIMESCALEDB_VERSION=2.10.1 \ export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \ ;; \ - *) \ + "v16") \ export TIMESCALEDB_VERSION=2.13.0 \ export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \ ;; \ + "v17") \ + export TIMESCALEDB_VERSION=2.17.0 \ + export TIMESCALEDB_CHECKSUM=155bf64391d3558c42f31ca0e523cfc6252921974f75298c9039ccad1c89811a \ + ;; \ esac && \ wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \ echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \ @@ -557,10 +590,8 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ - esac && \ - case "${PG_VERSION}" in \ +# version-specific, has separate releases for each version +RUN case "${PG_VERSION}" in \ "v14") \ export PG_HINT_PLAN_VERSION=14_1_4_1 \ export PG_HINT_PLAN_CHECKSUM=c3501becf70ead27f70626bce80ea401ceac6a77e2083ee5f3ff1f1444ec1ad1 \ @@ -574,7 +605,8 @@ RUN case "${PG_VERSION}" in "v17") \ export PG_HINT_PLAN_CHECKSUM=fc85a9212e7d2819d4ae4ac75817481101833c3cfa9f0fe1f980984e12347d00 \ ;; \ "v17") \ - echo "TODO: PG17 pg_hint_plan support" && exit 0 \ + export PG_HINT_PLAN_VERSION=17_1_7_0 \ + export PG_HINT_PLAN_CHECKSUM=06dd306328c67a4248f48403c50444f30959fb61ebe963248dbc2afb396fe600 \ ;; \ *) \ echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \ @@ -598,6 +630,10 @@ FROM build-deps AS pg-cron-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# 1.6.4 available, supports v17 +# This is an experimental extension that we do not support on prod yet. +# !Do not remove! +# We set it in shared_preload_libraries and computes will fail to start if library is not found. ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in "v17") \ echo "v17 extensions are not supported yet. Quit" && exit 0;; \ @@ -619,23 +655,37 @@ FROM build-deps AS rdkit-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ - esac && \ - apt-get update && \ +RUN apt-get update && \ apt-get install --no-install-recommends -y \ libboost-iostreams1.74-dev \ libboost-regex1.74-dev \ libboost-serialization1.74-dev \ libboost-system1.74-dev \ - libeigen3-dev + libeigen3-dev \ + libboost-all-dev +# rdkit Release_2024_09_1 supports v17 +# last release Release_2024_09_1 - Sep 27, 2024 +# +# Use new version only for v17 +# because Release_2024_09_1 has some backward incompatible changes +# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1 ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ +RUN case "${PG_VERSION}" in \ + "v17") \ + export RDKIT_VERSION=Release_2024_09_1 \ + export RDKIT_CHECKSUM=034c00d6e9de323506834da03400761ed8c3721095114369d06805409747a60f \ + ;; \ + "v14" | "v15" | "v16") \ + export RDKIT_VERSION=Release_2023_03_3 \ + export RDKIT_CHECKSUM=bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d \ + ;; \ + *) \ + echo "unexpected PostgreSQL version" && exit 1 \ + ;; \ esac && \ - wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ - echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \ + wget https://github.com/rdkit/rdkit/archive/refs/tags/${RDKIT_VERSION}.tar.gz -O rdkit.tar.gz && \ + echo "${RDKIT_CHECKSUM} rdkit.tar.gz" | sha256sum --check && \ mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \ cmake \ -D RDK_BUILD_CAIRO_SUPPORT=OFF \ @@ -674,12 +724,11 @@ FROM build-deps AS pg-uuidv7-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# not version-specific +# last release v1.6.0 - Oct 9, 2024 ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ - esac && \ - wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \ - echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz -O pg_uuidv7.tar.gz && \ + echo "0fa6c710929d003f6ce276a7de7a864e9d1667b2d78be3dc2c07f2409eb55867 pg_uuidv7.tar.gz" | sha256sum --check && \ mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -750,6 +799,8 @@ RUN case "${PG_VERSION}" in \ FROM build-deps AS pg-embedding-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# This is our extension, support stopped in favor of pgvector +# TODO: deprecate it ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in \ @@ -776,6 +827,8 @@ FROM build-deps AS pg-anon-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# This is an experimental extension, never got to real production. +# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found. 
ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in "v17") \ echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \ @@ -925,8 +978,8 @@ ARG PG_VERSION RUN case "${PG_VERSION}" in "v17") \ echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \ esac && \ - wget https://github.com/neondatabase/pg_session_jwt/archive/ff0a72440e8ff584dab24b3f9b7c00c56c660b8e.tar.gz -O pg_session_jwt.tar.gz && \ - echo "1fbb2b5a339263bcf6daa847fad8bccbc0b451cea6a62e6d3bf232b0087f05cb pg_session_jwt.tar.gz" | sha256sum --check && \ + wget https://github.com/neondatabase/pg_session_jwt/archive/5aee2625af38213650e1a07ae038fdc427250ee4.tar.gz -O pg_session_jwt.tar.gz && \ + echo "5d91b10bc1347d36cffc456cb87bec25047935d6503dc652ca046f04760828e7 pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release @@ -942,13 +995,12 @@ FROM build-deps AS wal2json-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# wal2json wal2json_2_6 supports v17 +# last release wal2json_2_6 - Apr 25, 2024 ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "We'll need to update wal2json to 2.6+ for pg17 support" && exit 0;; \ - esac && \ - wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ - echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \ - mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . 
+RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.gz -O wal2json.tar.gz && \ + echo "18b4bdec28c74a8fc98a11c72de38378a760327ef8e5e42e975b0029eb96ba0d wal2json.tar.gz" | sha256sum --check && \ + mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install @@ -962,12 +1014,11 @@ FROM build-deps AS pg-ivm-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# pg_ivm v1.9 supports v17 +# last release v1.9 - Jul 31, 2024 ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "We'll need to update pg_ivm to 1.9+ for pg17 support" && exit 0;; \ - esac && \ - wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ - echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_ivm.tar.gz && \ + echo "59e15722939f274650abf637f315dd723c87073496ca77236b044cb205270d8b pg_ivm.tar.gz" | sha256sum --check && \ mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -983,12 +1034,11 @@ FROM build-deps AS pg-partman-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# should support v17 https://github.com/pgpartman/pg_partman/discussions/693 +# last release 5.1.0 Apr 2, 2024 ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "pg_partman doesn't support PG17 yet" && exit 0;; \ - esac && \ - wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ - echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz -O pg_partman.tar.gz && \ + echo "3e3a27d7ff827295d5c55ef72f07a49062d6204b3cb0b9a048645d6db9f3cb9f pg_partman.tar.gz" | sha256sum --check && \ mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -1091,7 +1141,6 @@ RUN cd compute_tools && mold -run cargo build --locked --profile release-line-de ######################################################################################### FROM debian:$DEBIAN_FLAVOR AS compute-tools-image -ARG DEBIAN_FLAVOR COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl @@ -1102,7 +1151,6 @@ COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compu ######################################################################################### FROM debian:$DEBIAN_FLAVOR AS pgbouncer -ARG DEBIAN_FLAVOR RUN set -e \ && apt-get update \ && apt-get install --no-install-recommends -y \ @@ -1167,6 +1215,19 @@ RUN rm -r /usr/local/pgsql/include # if they were to be used by other libraries. 
RUN rm /usr/local/pgsql/lib/lib*.a +######################################################################################### +# +# Preprocess the sql_exporter configuration files +# +######################################################################################### +FROM $REPOSITORY/$IMAGE:$TAG AS sql_exporter_preprocessor +ARG PG_VERSION + +USER nonroot + +COPY --chown=nonroot compute compute + +RUN make PG_VERSION="${PG_VERSION}" -C compute ######################################################################################### # @@ -1257,7 +1318,7 @@ ENV PGDATABASE=postgres # ######################################################################################### FROM debian:$DEBIAN_FLAVOR -ARG DEBIAN_FLAVOR +ARG DEBIAN_VERSION # Add user postgres RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ echo "postgres:test_console_pass" | chpasswd && \ @@ -1285,10 +1346,10 @@ RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter -COPY --chmod=0644 compute/etc/sql_exporter.yml /etc/sql_exporter.yml -COPY --chmod=0644 compute/etc/neon_collector.yml /etc/neon_collector.yml -COPY --chmod=0644 compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml -COPY --chmod=0644 compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml /etc/sql_exporter.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml /etc/neon_collector.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml # Create remote 
extension download directory RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions @@ -1305,19 +1366,22 @@ RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/loca RUN apt update && \ - case $DEBIAN_FLAVOR in \ + case $DEBIAN_VERSION in \ # Version-specific installs for Bullseye (PG14-PG16): # libicu67, locales for collations (including ICU and plpgsql_check) # libgdal28, libproj19 for PostGIS - bullseye*) \ + bullseye) \ VERSION_INSTALLS="libicu67 libgdal28 libproj19"; \ ;; \ # Version-specific installs for Bookworm (PG17): # libicu72, locales for collations (including ICU and plpgsql_check) # libgdal32, libproj25 for PostGIS - bookworm*) \ + bookworm) \ VERSION_INSTALLS="libicu72 libgdal32 libproj25"; \ ;; \ + *) \ + echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \ + ;; \ esac && \ apt install --no-install-recommends -y \ gdb \ diff --git a/compute/Makefile b/compute/Makefile new file mode 100644 index 0000000000..e4f08a223c --- /dev/null +++ b/compute/Makefile @@ -0,0 +1,47 @@ +jsonnet_files = $(wildcard \ + etc/*.jsonnet \ + etc/sql_exporter/*.libsonnet) + +.PHONY: all +all: neon_collector.yml neon_collector_autoscaling.yml sql_exporter.yml sql_exporter_autoscaling.yml + +neon_collector.yml: $(jsonnet_files) + JSONNET_PATH=jsonnet:etc jsonnet \ + --output-file etc/$@ \ + --ext-str pg_version=$(PG_VERSION) \ + etc/neon_collector.jsonnet + +neon_collector_autoscaling.yml: $(jsonnet_files) + JSONNET_PATH=jsonnet:etc jsonnet \ + --output-file etc/$@ \ + --ext-str pg_version=$(PG_VERSION) \ + etc/neon_collector_autoscaling.jsonnet + +sql_exporter.yml: $(jsonnet_files) + JSONNET_PATH=etc jsonnet \ + --output-file etc/$@ \ + --tla-str collector_file=neon_collector.yml \ + etc/sql_exporter.jsonnet + +sql_exporter_autoscaling.yml: $(jsonnet_files) + JSONNET_PATH=etc jsonnet \ + --output-file etc/$@ \ + --tla-str collector_file=neon_collector_autoscaling.yml \ + --tla-str 
application_name=sql_exporter_autoscaling \ + etc/sql_exporter.jsonnet + +.PHONY: clean +clean: + rm --force \ + etc/neon_collector.yml \ + etc/neon_collector_autoscaling.yml \ + etc/sql_exporter.yml \ + etc/sql_exporter_autoscaling.yml + +.PHONY: jsonnetfmt-test +jsonnetfmt-test: + jsonnetfmt --test $(jsonnet_files) + +.PHONY: jsonnetfmt-format +jsonnetfmt-format: + jsonnetfmt --in-place $(jsonnet_files) diff --git a/compute/etc/README.md b/compute/etc/README.md new file mode 100644 index 0000000000..70b108146c --- /dev/null +++ b/compute/etc/README.md @@ -0,0 +1,17 @@ +# Compute Configuration + +These files are the configuration files for various other pieces of software +that will be running in the compute alongside Postgres. + +## `sql_exporter` + +### Adding a `sql_exporter` Metric + +We use `sql_exporter` to export various metrics from Postgres. In order to add +a metric, you will need to create two files: a `libsonnet` and a `sql` file. You +will then import the `libsonnet` file in one of the collector files, and the +`sql` file will be imported in the `libsonnet` file. + +In the event your statistic is an LSN, you may want to cast it to a `float8` +because Prometheus only supports floats. It's probably fine because `float8` can +store integers from `-2^53` to `+2^53` exactly. 
diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet new file mode 100644 index 0000000000..8b43ebe7a3 --- /dev/null +++ b/compute/etc/neon_collector.jsonnet @@ -0,0 +1,51 @@ +{ + collector_name: 'neon_collector', + metrics: [ + import 'sql_exporter/checkpoints_req.libsonnet', + import 'sql_exporter/checkpoints_timed.libsonnet', + import 'sql_exporter/compute_current_lsn.libsonnet', + import 'sql_exporter/compute_logical_snapshot_files.libsonnet', + import 'sql_exporter/compute_receive_lsn.libsonnet', + import 'sql_exporter/compute_subscriptions_count.libsonnet', + import 'sql_exporter/connection_counts.libsonnet', + import 'sql_exporter/db_total_size.libsonnet', + import 'sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet', + import 'sql_exporter/file_cache_read_wait_seconds_count.libsonnet', + import 'sql_exporter/file_cache_read_wait_seconds_sum.libsonnet', + import 'sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet', + import 'sql_exporter/file_cache_write_wait_seconds_count.libsonnet', + import 'sql_exporter/file_cache_write_wait_seconds_sum.libsonnet', + import 'sql_exporter/getpage_prefetch_discards_total.libsonnet', + import 'sql_exporter/getpage_prefetch_misses_total.libsonnet', + import 'sql_exporter/getpage_prefetch_requests_total.libsonnet', + import 'sql_exporter/getpage_prefetches_buffered.libsonnet', + import 'sql_exporter/getpage_sync_requests_total.libsonnet', + import 'sql_exporter/getpage_wait_seconds_bucket.libsonnet', + import 'sql_exporter/getpage_wait_seconds_count.libsonnet', + import 'sql_exporter/getpage_wait_seconds_sum.libsonnet', + import 'sql_exporter/lfc_approximate_working_set_size.libsonnet', + import 'sql_exporter/lfc_approximate_working_set_size_windows.libsonnet', + import 'sql_exporter/lfc_cache_size_limit.libsonnet', + import 'sql_exporter/lfc_hits.libsonnet', + import 'sql_exporter/lfc_misses.libsonnet', + import 'sql_exporter/lfc_used.libsonnet', + import 
'sql_exporter/lfc_writes.libsonnet', + import 'sql_exporter/logical_slot_restart_lsn.libsonnet', + import 'sql_exporter/max_cluster_size.libsonnet', + import 'sql_exporter/pageserver_disconnects_total.libsonnet', + import 'sql_exporter/pageserver_requests_sent_total.libsonnet', + import 'sql_exporter/pageserver_send_flushes_total.libsonnet', + import 'sql_exporter/pageserver_open_requests.libsonnet', + import 'sql_exporter/pg_stats_userdb.libsonnet', + import 'sql_exporter/replication_delay_bytes.libsonnet', + import 'sql_exporter/replication_delay_seconds.libsonnet', + import 'sql_exporter/retained_wal.libsonnet', + import 'sql_exporter/wal_is_lost.libsonnet', + ], + queries: [ + { + query_name: 'neon_perf_counters', + query: importstr 'sql_exporter/neon_perf_counters.sql', + }, + ], +} diff --git a/compute/etc/neon_collector.yml b/compute/etc/neon_collector.yml deleted file mode 100644 index 92da0cdbdd..0000000000 --- a/compute/etc/neon_collector.yml +++ /dev/null @@ -1,331 +0,0 @@ -collector_name: neon_collector -metrics: -- metric_name: lfc_misses - type: gauge - help: 'lfc_misses' - key_labels: - values: [lfc_misses] - query: | - select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses'; - -- metric_name: lfc_used - type: gauge - help: 'LFC chunks used (chunk = 1MB)' - key_labels: - values: [lfc_used] - query: | - select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used'; - -- metric_name: lfc_hits - type: gauge - help: 'lfc_hits' - key_labels: - values: [lfc_hits] - query: | - select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits'; - -- metric_name: lfc_writes - type: gauge - help: 'lfc_writes' - key_labels: - values: [lfc_writes] - query: | - select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes'; - -- metric_name: lfc_cache_size_limit - type: gauge - help: 'LFC cache size limit in bytes' - key_labels: - values: [lfc_cache_size_limit] 
- query: | - select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; - -- metric_name: connection_counts - type: gauge - help: 'Connection counts' - key_labels: - - datname - - state - values: [count] - query: | - select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state; - -- metric_name: pg_stats_userdb - type: gauge - help: 'Stats for several oldest non-system dbs' - key_labels: - - datname - value_label: kind - values: - - db_size - - deadlocks - # Rows - - inserted - - updated - - deleted - # We export stats for 10 non-system database. Without this limit - # it is too easy to abuse the system by creating lots of databases. - query: | - select pg_database_size(datname) as db_size, deadlocks, - tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted, - datname - from pg_stat_database - where datname IN ( - select datname - from pg_database - where datname <> 'postgres' and not datistemplate - order by oid - limit 10 - ); - -- metric_name: max_cluster_size - type: gauge - help: 'neon.max_cluster_size setting' - key_labels: - values: [max_cluster_size] - query: | - select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size'; - -- metric_name: db_total_size - type: gauge - help: 'Size of all databases' - key_labels: - values: [total] - query: | - select sum(pg_database_size(datname)) as total from pg_database; - -- metric_name: getpage_wait_seconds_count - type: counter - help: 'Number of getpage requests' - values: [getpage_wait_seconds_count] - query_ref: neon_perf_counters - -- metric_name: getpage_wait_seconds_sum - type: counter - help: 'Time spent in getpage requests' - values: [getpage_wait_seconds_sum] - query_ref: neon_perf_counters - -- metric_name: getpage_prefetch_requests_total - type: counter - help: 'Number of getpage issued for prefetching' - values: [getpage_prefetch_requests_total] - query_ref: neon_perf_counters - 
-- metric_name: getpage_sync_requests_total - type: counter - help: 'Number of synchronous getpage issued' - values: [getpage_sync_requests_total] - query_ref: neon_perf_counters - -- metric_name: getpage_prefetch_misses_total - type: counter - help: 'Total number of readahead misses; consisting of either prefetches that don''t satisfy the LSN bounds once the prefetch got read by the backend, or cases where somehow no readahead was issued for the read' - values: [getpage_prefetch_misses_total] - query_ref: neon_perf_counters - -- metric_name: getpage_prefetch_discards_total - type: counter - help: 'Number of prefetch responses issued but not used' - values: [getpage_prefetch_discards_total] - query_ref: neon_perf_counters - -- metric_name: pageserver_requests_sent_total - type: counter - help: 'Number of all requests sent to the pageserver (not just GetPage requests)' - values: [pageserver_requests_sent_total] - query_ref: neon_perf_counters - -- metric_name: pageserver_disconnects_total - type: counter - help: 'Number of times that the connection to the pageserver was lost' - values: [pageserver_disconnects_total] - query_ref: neon_perf_counters - -- metric_name: pageserver_send_flushes_total - type: counter - help: 'Number of flushes to the pageserver connection' - values: [pageserver_send_flushes_total] - query_ref: neon_perf_counters - -- metric_name: getpage_wait_seconds_bucket - type: counter - help: 'Histogram buckets of getpage request latency' - key_labels: - - bucket_le - values: [value] - query_ref: getpage_wait_seconds_buckets - -# DEPRECATED -- metric_name: lfc_approximate_working_set_size - type: gauge - help: 'Approximate working set size in pages of 8192 bytes' - key_labels: - values: [approximate_working_set_size] - query: | - select neon.approximate_working_set_size(false) as approximate_working_set_size; - -- metric_name: lfc_approximate_working_set_size_windows - type: gauge - help: 'Approximate working set size in pages of 8192 bytes' - 
key_labels: [duration] - values: [size] - # NOTE: This is the "public" / "human-readable" version. Here, we supply a small selection - # of durations in a pretty-printed form. - query: | - select - x as duration, - neon.approximate_working_set_size_seconds(extract('epoch' from x::interval)::int) as size - from - (values ('5m'),('15m'),('1h')) as t (x); - -- metric_name: compute_current_lsn - type: gauge - help: 'Current LSN of the database' - key_labels: - values: [lsn] - query: | - select - case - when pg_catalog.pg_is_in_recovery() - then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8 - else (pg_current_wal_lsn() - '0/0')::FLOAT8 - end as lsn; - -- metric_name: compute_receive_lsn - type: gauge - help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication' - key_labels: - values: [lsn] - query: | - SELECT - CASE - WHEN pg_catalog.pg_is_in_recovery() - THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8 - ELSE 0 - END AS lsn; - -- metric_name: replication_delay_bytes - type: gauge - help: 'Bytes between received and replayed LSN' - key_labels: - values: [replication_delay_bytes] - # We use a GREATEST call here because this calculation can be negative. - # The calculation is not atomic, meaning after we've gotten the receive - # LSN, the replay LSN may have advanced past the receive LSN we - # are using for the calculation. 
- query: | - SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes; - -- metric_name: replication_delay_seconds - type: gauge - help: 'Time since last LSN was replayed' - key_labels: - values: [replication_delay_seconds] - query: | - SELECT - CASE - WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 - ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) - END AS replication_delay_seconds; - -- metric_name: checkpoints_req - type: gauge - help: 'Number of requested checkpoints' - key_labels: - values: [checkpoints_req] - query: | - SELECT checkpoints_req FROM pg_stat_bgwriter; - -- metric_name: checkpoints_timed - type: gauge - help: 'Number of scheduled checkpoints' - key_labels: - values: [checkpoints_timed] - query: | - SELECT checkpoints_timed FROM pg_stat_bgwriter; - -- metric_name: compute_logical_snapshot_files - type: gauge - help: 'Number of snapshot files in pg_logical/snapshot' - key_labels: - - timeline_id - values: [num_logical_snapshot_files] - query: | - SELECT - (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, - -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These - -- temporary snapshot files are renamed to the actual snapshot files after they are - -- completely built. We only WAL-log the completely built snapshot files. - (SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files; - -# In all the below metrics, we cast LSNs to floats because Prometheus only supports floats. -# It's probably fine because float64 can store integers from -2^53 to +2^53 exactly. - -# Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad. 
-- metric_name: logical_slot_restart_lsn - type: gauge - help: 'restart_lsn of logical slots' - key_labels: - - slot_name - values: [restart_lsn] - query: | - select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn - from pg_replication_slots - where slot_type = 'logical'; - -- metric_name: compute_subscriptions_count - type: gauge - help: 'Number of logical replication subscriptions grouped by enabled/disabled' - key_labels: - - enabled - values: [subscriptions_count] - query: | - select subenabled::text as enabled, count(*) as subscriptions_count - from pg_subscription - group by subenabled; - -- metric_name: retained_wal - type: gauge - help: 'Retained WAL in inactive replication slots' - key_labels: - - slot_name - values: [retained_wal] - query: | - SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal - FROM pg_replication_slots - WHERE active = false; - -- metric_name: wal_is_lost - type: gauge - help: 'Whether or not the replication slot wal_status is lost' - key_labels: - - slot_name - values: [wal_is_lost] - query: | - SELECT slot_name, - CASE WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost - FROM pg_replication_slots; - -queries: - - query_name: neon_perf_counters - query: | - WITH c AS ( - SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters - ) - SELECT d.* - FROM pg_catalog.jsonb_to_record((select jb from c)) as d( - getpage_wait_seconds_count numeric, - getpage_wait_seconds_sum numeric, - getpage_prefetch_requests_total numeric, - getpage_sync_requests_total numeric, - getpage_prefetch_misses_total numeric, - getpage_prefetch_discards_total numeric, - pageserver_requests_sent_total numeric, - pageserver_disconnects_total numeric, - pageserver_send_flushes_total numeric - ); - - - query_name: getpage_wait_seconds_buckets - query: | - SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'getpage_wait_seconds_bucket'; diff --git 
a/compute/etc/neon_collector_autoscaling.jsonnet b/compute/etc/neon_collector_autoscaling.jsonnet new file mode 100644 index 0000000000..e248172a3d --- /dev/null +++ b/compute/etc/neon_collector_autoscaling.jsonnet @@ -0,0 +1,11 @@ +{ + collector_name: 'neon_collector_autoscaling', + metrics: [ + import 'sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet', + import 'sql_exporter/lfc_cache_size_limit.libsonnet', + import 'sql_exporter/lfc_hits.libsonnet', + import 'sql_exporter/lfc_misses.libsonnet', + import 'sql_exporter/lfc_used.libsonnet', + import 'sql_exporter/lfc_writes.libsonnet', + ], +} diff --git a/compute/etc/neon_collector_autoscaling.yml b/compute/etc/neon_collector_autoscaling.yml deleted file mode 100644 index 5616264eba..0000000000 --- a/compute/etc/neon_collector_autoscaling.yml +++ /dev/null @@ -1,55 +0,0 @@ -collector_name: neon_collector_autoscaling -metrics: -- metric_name: lfc_misses - type: gauge - help: 'lfc_misses' - key_labels: - values: [lfc_misses] - query: | - select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses'; - -- metric_name: lfc_used - type: gauge - help: 'LFC chunks used (chunk = 1MB)' - key_labels: - values: [lfc_used] - query: | - select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used'; - -- metric_name: lfc_hits - type: gauge - help: 'lfc_hits' - key_labels: - values: [lfc_hits] - query: | - select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits'; - -- metric_name: lfc_writes - type: gauge - help: 'lfc_writes' - key_labels: - values: [lfc_writes] - query: | - select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes'; - -- metric_name: lfc_cache_size_limit - type: gauge - help: 'LFC cache size limit in bytes' - key_labels: - values: [lfc_cache_size_limit] - query: | - select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; - -- metric_name: 
lfc_approximate_working_set_size_windows - type: gauge - help: 'Approximate working set size in pages of 8192 bytes' - key_labels: [duration_seconds] - values: [size] - # NOTE: This is the "internal" / "machine-readable" version. This outputs the working set - # size looking back 1..60 minutes, labeled with the number of minutes. - query: | - select - x::text as duration_seconds, - neon.approximate_working_set_size_seconds(x) as size - from - (select generate_series * 60 as x from generate_series(1, 60)) as t (x); diff --git a/compute/etc/sql_exporter.jsonnet b/compute/etc/sql_exporter.jsonnet new file mode 100644 index 0000000000..640e2ac38d --- /dev/null +++ b/compute/etc/sql_exporter.jsonnet @@ -0,0 +1,40 @@ +function(collector_file, application_name='sql_exporter') { + // Configuration for sql_exporter for autoscaling-agent + // Global defaults. + global: { + // If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. + scrape_timeout: '10s', + // Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. + scrape_timeout_offset: '500ms', + // Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. + min_interval: '0s', + // Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections, + // as will concurrent scrapes. + max_connections: 1, + // Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should + // always be the same as max_connections. + max_idle_connections: 1, + // Maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse. + // If 0, connections are not closed due to a connection's age. + max_connection_lifetime: '5m', + }, + + // The target to monitor and the collectors to execute on it.
+ target: { + // Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) + // the schema gets dropped or replaced to match the driver expected DSN format. + data_source_name: std.format('postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=%s', [application_name]), + + // Collectors (referenced by name) to execute on the target. + // Glob patterns are supported (see Go's path/filepath Match documentation for syntax). + collectors: [ + 'neon_collector', + ], + }, + + // Collector files specifies a list of globs. One collector definition is read from each matching file. + // Glob patterns are supported (see Go's path/filepath Match documentation for syntax). + collector_files: [ + collector_file, + ], +} diff --git a/compute/etc/sql_exporter.yml b/compute/etc/sql_exporter.yml deleted file mode 100644 index 139d04468a..0000000000 --- a/compute/etc/sql_exporter.yml +++ /dev/null @@ -1,33 +0,0 @@ -# Configuration for sql_exporter -# Global defaults. -global: - # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. - scrape_timeout: 10s - # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. - scrape_timeout_offset: 500ms - # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. - min_interval: 0s - # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections, - # as will concurrent scrapes. - max_connections: 1 - # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should - # always be the same as max_connections. - max_idle_connections: 1 - # Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse. - # If 0, connections are not closed due to a connection's age. - max_connection_lifetime: 5m - -# The target to monitor and the collectors to execute on it.
-target: - # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) - # the schema gets dropped or replaced to match the driver expected DSN format. - data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter' - - # Collectors (referenced by name) to execute on the target. - # Glob patterns are supported (see for syntax). - collectors: [neon_collector] - -# Collector files specifies a list of globs. One collector definition is read from each matching file. -# Glob patterns are supported (see for syntax). -collector_files: - - "neon_collector.yml" diff --git a/compute/etc/sql_exporter/checkpoints_req.17.sql b/compute/etc/sql_exporter/checkpoints_req.17.sql new file mode 100644 index 0000000000..a4b946e8e2 --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_req.17.sql @@ -0,0 +1 @@ +SELECT num_requested AS checkpoints_req FROM pg_stat_checkpointer; diff --git a/compute/etc/sql_exporter/checkpoints_req.libsonnet b/compute/etc/sql_exporter/checkpoints_req.libsonnet new file mode 100644 index 0000000000..e5d9753507 --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_req.libsonnet @@ -0,0 +1,15 @@ +local neon = import 'neon.libsonnet'; + +local pg_stat_bgwriter = importstr 'sql_exporter/checkpoints_req.sql'; +local pg_stat_checkpointer = importstr 'sql_exporter/checkpoints_req.17.sql'; + +{ + metric_name: 'checkpoints_req', + type: 'gauge', + help: 'Number of requested checkpoints', + key_labels: null, + values: [ + 'checkpoints_req', + ], + query: if neon.PG_MAJORVERSION_NUM < 17 then pg_stat_bgwriter else pg_stat_checkpointer, +} diff --git a/compute/etc/sql_exporter/checkpoints_req.sql b/compute/etc/sql_exporter/checkpoints_req.sql new file mode 100644 index 0000000000..eb8427c883 --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_req.sql @@ -0,0 +1 @@ +SELECT checkpoints_req FROM pg_stat_bgwriter; diff --git 
a/compute/etc/sql_exporter/checkpoints_timed.17.sql b/compute/etc/sql_exporter/checkpoints_timed.17.sql new file mode 100644 index 0000000000..0d86ddb3ea --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_timed.17.sql @@ -0,0 +1 @@ +SELECT num_timed AS checkpoints_timed FROM pg_stat_checkpointer; diff --git a/compute/etc/sql_exporter/checkpoints_timed.libsonnet b/compute/etc/sql_exporter/checkpoints_timed.libsonnet new file mode 100644 index 0000000000..0ba0080188 --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_timed.libsonnet @@ -0,0 +1,16 @@ +local neon = import 'neon.libsonnet'; + +// Major versions below 17 read pg_stat_bgwriter; 17 and newer read pg_stat_checkpointer. +local pg_stat_bgwriter = importstr 'sql_exporter/checkpoints_timed.sql'; +local pg_stat_checkpointer = importstr 'sql_exporter/checkpoints_timed.17.sql'; + +{ + metric_name: 'checkpoints_timed', + type: 'gauge', + help: 'Number of scheduled checkpoints', + key_labels: null, + values: [ + 'checkpoints_timed', + ], + query: if neon.PG_MAJORVERSION_NUM < 17 then pg_stat_bgwriter else pg_stat_checkpointer, +} diff --git a/compute/etc/sql_exporter/checkpoints_timed.sql b/compute/etc/sql_exporter/checkpoints_timed.sql new file mode 100644 index 0000000000..c50853134c --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_timed.sql @@ -0,0 +1 @@ +SELECT checkpoints_timed FROM pg_stat_bgwriter; diff --git a/compute/etc/sql_exporter/compute_current_lsn.libsonnet b/compute/etc/sql_exporter/compute_current_lsn.libsonnet new file mode 100644 index 0000000000..ccff161358 --- /dev/null +++ b/compute/etc/sql_exporter/compute_current_lsn.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'compute_current_lsn', + type: 'gauge', + help: 'Current LSN of the database', + key_labels: null, + values: [ + 'lsn', + ], + query: importstr 'sql_exporter/compute_current_lsn.sql', +} diff --git a/compute/etc/sql_exporter/compute_current_lsn.sql b/compute/etc/sql_exporter/compute_current_lsn.sql new file mode 100644 index 0000000000..be02b8a094 --- /dev/null +++
b/compute/etc/sql_exporter/compute_current_lsn.sql @@ -0,0 +1,4 @@ +SELECT CASE + WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_replay_lsn() - '0/0')::FLOAT8 + ELSE (pg_current_wal_lsn() - '0/0')::FLOAT8 +END AS lsn; diff --git a/compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet b/compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet new file mode 100644 index 0000000000..212f079ccf --- /dev/null +++ b/compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'compute_logical_snapshot_files', + type: 'gauge', + help: 'Number of snapshot files in pg_logical/snapshot', + key_labels: [ + 'timeline_id', + ], + values: [ + 'num_logical_snapshot_files', + ], + query: importstr 'sql_exporter/compute_logical_snapshot_files.sql', +} diff --git a/compute/etc/sql_exporter/compute_logical_snapshot_files.sql b/compute/etc/sql_exporter/compute_logical_snapshot_files.sql new file mode 100644 index 0000000000..f2454235b7 --- /dev/null +++ b/compute/etc/sql_exporter/compute_logical_snapshot_files.sql @@ -0,0 +1,7 @@ +SELECT + (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, + -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. + -- These temporary snapshot files are renamed to the actual snapshot files + -- after they are completely built. 
We only WAL-log the completely built + -- snapshot files + (SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files; diff --git a/compute/etc/sql_exporter/compute_receive_lsn.libsonnet b/compute/etc/sql_exporter/compute_receive_lsn.libsonnet new file mode 100644 index 0000000000..eb68a77ec2 --- /dev/null +++ b/compute/etc/sql_exporter/compute_receive_lsn.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'compute_receive_lsn', + type: 'gauge', + help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication', + key_labels: null, + values: [ + 'lsn', + ], + query: importstr 'sql_exporter/compute_receive_lsn.sql', +} diff --git a/compute/etc/sql_exporter/compute_receive_lsn.sql b/compute/etc/sql_exporter/compute_receive_lsn.sql new file mode 100644 index 0000000000..318b31ab41 --- /dev/null +++ b/compute/etc/sql_exporter/compute_receive_lsn.sql @@ -0,0 +1,4 @@ +SELECT CASE + WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8 + ELSE 0 +END AS lsn; diff --git a/compute/etc/sql_exporter/compute_subscriptions_count.libsonnet b/compute/etc/sql_exporter/compute_subscriptions_count.libsonnet new file mode 100644 index 0000000000..e1575da397 --- /dev/null +++ b/compute/etc/sql_exporter/compute_subscriptions_count.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'compute_subscriptions_count', + type: 'gauge', + help: 'Number of logical replication subscriptions grouped by enabled/disabled', + key_labels: [ + 'enabled', + ], + values: [ + 'subscriptions_count', + ], + query: importstr 'sql_exporter/compute_subscriptions_count.sql', +} diff --git a/compute/etc/sql_exporter/compute_subscriptions_count.sql b/compute/etc/sql_exporter/compute_subscriptions_count.sql new file mode 100644 index 0000000000..50740cb5df --- /dev/null +++ b/compute/etc/sql_exporter/compute_subscriptions_count.sql @@ -0,0 +1 @@ +SELECT subenabled::text AS enabled, 
count(*) AS subscriptions_count FROM pg_subscription GROUP BY subenabled; diff --git a/compute/etc/sql_exporter/connection_counts.libsonnet b/compute/etc/sql_exporter/connection_counts.libsonnet new file mode 100644 index 0000000000..9f94db67a9 --- /dev/null +++ b/compute/etc/sql_exporter/connection_counts.libsonnet @@ -0,0 +1,13 @@ +{ + metric_name: 'connection_counts', + type: 'gauge', + help: 'Connection counts', + key_labels: [ + 'datname', + 'state', + ], + values: [ + 'count', + ], + query: importstr 'sql_exporter/connection_counts.sql', +} diff --git a/compute/etc/sql_exporter/connection_counts.sql b/compute/etc/sql_exporter/connection_counts.sql new file mode 100644 index 0000000000..6824480fdb --- /dev/null +++ b/compute/etc/sql_exporter/connection_counts.sql @@ -0,0 +1 @@ +SELECT datname, state, count(*) AS count FROM pg_stat_activity WHERE state <> '' GROUP BY datname, state; diff --git a/compute/etc/sql_exporter/db_total_size.libsonnet b/compute/etc/sql_exporter/db_total_size.libsonnet new file mode 100644 index 0000000000..6e08d5fb87 --- /dev/null +++ b/compute/etc/sql_exporter/db_total_size.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'db_total_size', + type: 'gauge', + help: 'Size of all databases', + key_labels: null, + values: [ + 'total', + ], + query: importstr 'sql_exporter/db_total_size.sql', +} diff --git a/compute/etc/sql_exporter/db_total_size.sql b/compute/etc/sql_exporter/db_total_size.sql new file mode 100644 index 0000000000..9cbbdfd8a3 --- /dev/null +++ b/compute/etc/sql_exporter/db_total_size.sql @@ -0,0 +1 @@ +SELECT sum(pg_database_size(datname)) AS total FROM pg_database; diff --git a/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet b/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet new file mode 100644 index 0000000000..d13f657a7f --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 
'file_cache_read_wait_seconds_bucket', + type: 'counter', + help: 'Histogram buckets of LFC read operation latencies', + key_labels: [ + 'bucket_le', + ], + values: [ + 'value', + ], + query: importstr 'sql_exporter/file_cache_read_wait_seconds_bucket.sql', +} diff --git a/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.sql b/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.sql new file mode 100644 index 0000000000..09047bf0c4 --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.sql @@ -0,0 +1 @@ +SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'file_cache_read_wait_seconds_bucket'; diff --git a/compute/etc/sql_exporter/file_cache_read_wait_seconds_count.libsonnet b/compute/etc/sql_exporter/file_cache_read_wait_seconds_count.libsonnet new file mode 100644 index 0000000000..aa028b0f5e --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_read_wait_seconds_count.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'file_cache_read_wait_seconds_count', + type: 'counter', + help: 'Number of read operations in LFC', + values: [ + 'file_cache_read_wait_seconds_count', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/file_cache_read_wait_seconds_sum.libsonnet b/compute/etc/sql_exporter/file_cache_read_wait_seconds_sum.libsonnet new file mode 100644 index 0000000000..2547aabf3d --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_read_wait_seconds_sum.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'file_cache_read_wait_seconds_sum', + type: 'counter', + help: 'Time spent in LFC read operations', + values: [ + 'file_cache_read_wait_seconds_sum', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet b/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet new file mode 100644 index 0000000000..13dbc77f76 --- /dev/null +++ 
b/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'file_cache_write_wait_seconds_bucket', + type: 'counter', + help: 'Histogram buckets of LFC write operation latencies', + key_labels: [ + 'bucket_le', + ], + values: [ + 'value', + ], + query: importstr 'sql_exporter/file_cache_write_wait_seconds_bucket.sql', +} diff --git a/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.sql b/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.sql new file mode 100644 index 0000000000..d03613cf91 --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.sql @@ -0,0 +1 @@ +SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'file_cache_write_wait_seconds_bucket'; diff --git a/compute/etc/sql_exporter/file_cache_write_wait_seconds_count.libsonnet b/compute/etc/sql_exporter/file_cache_write_wait_seconds_count.libsonnet new file mode 100644 index 0000000000..6227d3193a --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_write_wait_seconds_count.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'file_cache_write_wait_seconds_count', + type: 'counter', + help: 'Number of write operations in LFC', + values: [ + 'file_cache_write_wait_seconds_count', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/file_cache_write_wait_seconds_sum.libsonnet b/compute/etc/sql_exporter/file_cache_write_wait_seconds_sum.libsonnet new file mode 100644 index 0000000000..2acfe7f608 --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_write_wait_seconds_sum.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'file_cache_write_wait_seconds_sum', + type: 'counter', + help: 'Time spent in LFC write operations', + values: [ + 'file_cache_write_wait_seconds_sum', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet b/compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet new file mode 
100644 index 0000000000..935e35d2e4 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_prefetch_discards_total', + type: 'counter', + help: 'Number of prefetch responses issued but not used', + values: [ + 'getpage_prefetch_discards_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet b/compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet new file mode 100644 index 0000000000..b9a9632105 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_prefetch_misses_total', + type: 'counter', + help: "Total number of readahead misses; consisting of either prefetches that don't satisfy the LSN bounds once the prefetch got read by the backend, or cases where somehow no readahead was issued for the read", + values: [ + 'getpage_prefetch_misses_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet b/compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet new file mode 100644 index 0000000000..75fdb6717b --- /dev/null +++ b/compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_prefetch_requests_total', + type: 'counter', + help: 'Number of getpage issued for prefetching', + values: [ + 'getpage_prefetch_requests_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_prefetches_buffered.libsonnet b/compute/etc/sql_exporter/getpage_prefetches_buffered.libsonnet new file mode 100644 index 0000000000..8926d867c9 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_prefetches_buffered.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_prefetches_buffered', + type: 'gauge', + help: 'Number of prefetched pages buffered in neon', + values: [ + 
'getpage_prefetches_buffered', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet b/compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet new file mode 100644 index 0000000000..f3a1e6b339 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_sync_requests_total', + type: 'counter', + help: 'Number of synchronous getpage issued', + values: [ + 'getpage_sync_requests_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet new file mode 100644 index 0000000000..2adda2ad03 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'getpage_wait_seconds_bucket', + type: 'counter', + help: 'Histogram buckets of getpage request latency', + key_labels: [ + 'bucket_le', + ], + values: [ + 'value', + ], + query: importstr 'sql_exporter/getpage_wait_seconds_bucket.sql', +} diff --git a/compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql new file mode 100644 index 0000000000..b4a6bc1560 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql @@ -0,0 +1 @@ +SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'getpage_wait_seconds_bucket'; diff --git a/compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet b/compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet new file mode 100644 index 0000000000..d2326974fc --- /dev/null +++ b/compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_wait_seconds_count', + type: 'counter', + help: 'Number of getpage requests', + values: [ + 'getpage_wait_seconds_count', + ], + query_ref: 'neon_perf_counters', +} diff --git 
a/compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet b/compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet new file mode 100644 index 0000000000..844c8419ff --- /dev/null +++ b/compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_wait_seconds_sum', + type: 'counter', + help: 'Time spent in getpage requests', + values: [ + 'getpage_wait_seconds_sum', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet b/compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet new file mode 100644 index 0000000000..78859ce60d --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet @@ -0,0 +1,12 @@ +// DEPRECATED + +{ + metric_name: 'lfc_approximate_working_set_size', + type: 'gauge', + help: 'Approximate working set size in pages of 8192 bytes', + key_labels: null, + values: [ + 'approximate_working_set_size', + ], + query: importstr 'sql_exporter/lfc_approximate_working_set_size.sql', +} diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size.sql new file mode 100644 index 0000000000..de509ebb47 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size.sql @@ -0,0 +1 @@ +SELECT neon.approximate_working_set_size(false) AS approximate_working_set_size; diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet new file mode 100644 index 0000000000..a54deca467 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'lfc_approximate_working_set_size_windows', + type: 'gauge', + help: 'Approximate working set size in pages of 8192 bytes', + key_labels: [ + 'duration_seconds', + ], + values: [ + 
'size', + ], + query: importstr 'sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql', +} diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql new file mode 100644 index 0000000000..35fa42c34c --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql @@ -0,0 +1,8 @@ +-- NOTE: This is the "internal" / "machine-readable" version. This outputs the +-- working set size looking back 1..60 minutes, labeled with the number of +-- minutes. + +SELECT + x::text as duration_seconds, + neon.approximate_working_set_size_seconds(x) AS size +FROM (SELECT generate_series * 60 AS x FROM generate_series(1, 60)) AS t (x); diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet new file mode 100644 index 0000000000..4970bd2c7f --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'lfc_approximate_working_set_size_windows', + type: 'gauge', + help: 'Approximate working set size in pages of 8192 bytes', + key_labels: [ + 'duration', + ], + values: [ + 'size', + ], + query: importstr 'sql_exporter/lfc_approximate_working_set_size_windows.sql', +} diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql new file mode 100644 index 0000000000..46c7d1610c --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql @@ -0,0 +1,8 @@ +-- NOTE: This is the "public" / "human-readable" version. Here, we supply a +-- small selection of durations in a pretty-printed form. 
+ +SELECT + x AS duration, + neon.approximate_working_set_size_seconds(extract('epoch' FROM x::interval)::int) AS size FROM ( + VALUES ('5m'), ('15m'), ('1h') + ) AS t (x); diff --git a/compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet b/compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet new file mode 100644 index 0000000000..4cbbd76621 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_cache_size_limit', + type: 'gauge', + help: 'LFC cache size limit in bytes', + key_labels: null, + values: [ + 'lfc_cache_size_limit', + ], + query: importstr 'sql_exporter/lfc_cache_size_limit.sql', +} diff --git a/compute/etc/sql_exporter/lfc_cache_size_limit.sql b/compute/etc/sql_exporter/lfc_cache_size_limit.sql new file mode 100644 index 0000000000..378904c1fe --- /dev/null +++ b/compute/etc/sql_exporter/lfc_cache_size_limit.sql @@ -0,0 +1 @@ +SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit; diff --git a/compute/etc/sql_exporter/lfc_hits.libsonnet b/compute/etc/sql_exporter/lfc_hits.libsonnet new file mode 100644 index 0000000000..4a0b7671bf --- /dev/null +++ b/compute/etc/sql_exporter/lfc_hits.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_hits', + type: 'gauge', + help: 'lfc_hits', + key_labels: null, + values: [ + 'lfc_hits', + ], + query: importstr 'sql_exporter/lfc_hits.sql', +} diff --git a/compute/etc/sql_exporter/lfc_hits.sql b/compute/etc/sql_exporter/lfc_hits.sql new file mode 100644 index 0000000000..2e14f5c73c --- /dev/null +++ b/compute/etc/sql_exporter/lfc_hits.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_hits FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_hits'; diff --git a/compute/etc/sql_exporter/lfc_misses.libsonnet b/compute/etc/sql_exporter/lfc_misses.libsonnet new file mode 100644 index 0000000000..302998d04f --- /dev/null +++ b/compute/etc/sql_exporter/lfc_misses.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_misses', + type: 
'gauge', + help: 'lfc_misses', + key_labels: null, + values: [ + 'lfc_misses', + ], + query: importstr 'sql_exporter/lfc_misses.sql', +} diff --git a/compute/etc/sql_exporter/lfc_misses.sql b/compute/etc/sql_exporter/lfc_misses.sql new file mode 100644 index 0000000000..27ed4ecf86 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_misses.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_misses FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_misses'; diff --git a/compute/etc/sql_exporter/lfc_used.libsonnet b/compute/etc/sql_exporter/lfc_used.libsonnet new file mode 100644 index 0000000000..23891dadaf --- /dev/null +++ b/compute/etc/sql_exporter/lfc_used.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_used', + type: 'gauge', + help: 'LFC chunks used (chunk = 1MB)', + key_labels: null, + values: [ + 'lfc_used', + ], + query: importstr 'sql_exporter/lfc_used.sql', +} diff --git a/compute/etc/sql_exporter/lfc_used.sql b/compute/etc/sql_exporter/lfc_used.sql new file mode 100644 index 0000000000..4f01545f30 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_used.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_used FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_used'; diff --git a/compute/etc/sql_exporter/lfc_writes.libsonnet b/compute/etc/sql_exporter/lfc_writes.libsonnet new file mode 100644 index 0000000000..6a22ee1dd9 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_writes.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_writes', + type: 'gauge', + help: 'lfc_writes', + key_labels: null, + values: [ + 'lfc_writes', + ], + query: importstr 'sql_exporter/lfc_writes.sql', +} diff --git a/compute/etc/sql_exporter/lfc_writes.sql b/compute/etc/sql_exporter/lfc_writes.sql new file mode 100644 index 0000000000..37c9abc9cf --- /dev/null +++ b/compute/etc/sql_exporter/lfc_writes.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_writes FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_writes'; diff --git a/compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet 
b/compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet new file mode 100644 index 0000000000..8ef31b5d8d --- /dev/null +++ b/compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet @@ -0,0 +1,15 @@ +// Number of slots is limited by max_replication_slots, so collecting position +// for all of them shouldn't be bad. + +{ + metric_name: 'logical_slot_restart_lsn', + type: 'gauge', + help: 'restart_lsn of logical slots', + key_labels: [ + 'slot_name', + ], + values: [ + 'restart_lsn', + ], + query: importstr 'sql_exporter/logical_slot_restart_lsn.sql', +} diff --git a/compute/etc/sql_exporter/logical_slot_restart_lsn.sql b/compute/etc/sql_exporter/logical_slot_restart_lsn.sql new file mode 100644 index 0000000000..1b1c038501 --- /dev/null +++ b/compute/etc/sql_exporter/logical_slot_restart_lsn.sql @@ -0,0 +1,3 @@ +SELECT slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn +FROM pg_replication_slots +WHERE slot_type = 'logical'; diff --git a/compute/etc/sql_exporter/max_cluster_size.libsonnet b/compute/etc/sql_exporter/max_cluster_size.libsonnet new file mode 100644 index 0000000000..1352fb77ee --- /dev/null +++ b/compute/etc/sql_exporter/max_cluster_size.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'max_cluster_size', + type: 'gauge', + help: 'neon.max_cluster_size setting', + key_labels: null, + values: [ + 'max_cluster_size', + ], + query: importstr 'sql_exporter/max_cluster_size.sql', +} diff --git a/compute/etc/sql_exporter/max_cluster_size.sql b/compute/etc/sql_exporter/max_cluster_size.sql new file mode 100644 index 0000000000..2d2355a9a7 --- /dev/null +++ b/compute/etc/sql_exporter/max_cluster_size.sql @@ -0,0 +1 @@ +SELECT setting::int AS max_cluster_size FROM pg_settings WHERE name = 'neon.max_cluster_size'; diff --git a/compute/etc/sql_exporter/neon_perf_counters.sql b/compute/etc/sql_exporter/neon_perf_counters.sql new file mode 100644 index 0000000000..4a36f3bf2f --- /dev/null +++ b/compute/etc/sql_exporter/neon_perf_counters.sql @@ -0,0 
+1,19 @@ +WITH c AS (SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters) + +SELECT d.* FROM pg_catalog.jsonb_to_record((SELECT jb FROM c)) AS d( + file_cache_read_wait_seconds_count numeric, + file_cache_read_wait_seconds_sum numeric, + file_cache_write_wait_seconds_count numeric, + file_cache_write_wait_seconds_sum numeric, + getpage_wait_seconds_count numeric, + getpage_wait_seconds_sum numeric, + getpage_prefetch_requests_total numeric, + getpage_sync_requests_total numeric, + getpage_prefetch_misses_total numeric, + getpage_prefetch_discards_total numeric, + getpage_prefetches_buffered numeric, + pageserver_requests_sent_total numeric, + pageserver_disconnects_total numeric, + pageserver_send_flushes_total numeric, + pageserver_open_requests numeric +); diff --git a/compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet b/compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet new file mode 100644 index 0000000000..5ad9ba078e --- /dev/null +++ b/compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'pageserver_disconnects_total', + type: 'counter', + help: 'Number of times that the connection to the pageserver was lost', + values: [ + 'pageserver_disconnects_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/pageserver_open_requests.libsonnet b/compute/etc/sql_exporter/pageserver_open_requests.libsonnet new file mode 100644 index 0000000000..dca89ea64a --- /dev/null +++ b/compute/etc/sql_exporter/pageserver_open_requests.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'pageserver_open_requests', + type: 'gauge', + help: 'Number of open requests to PageServer', + values: [ + 'pageserver_open_requests', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet b/compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet new file mode 100644 index 0000000000..c191e2467f 
--- /dev/null +++ b/compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'pageserver_requests_sent_total', + type: 'counter', + help: 'Number of all requests sent to the pageserver (not just GetPage requests)', + values: [ + 'pageserver_requests_sent_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet b/compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet new file mode 100644 index 0000000000..9fa5f77758 --- /dev/null +++ b/compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'pageserver_send_flushes_total', + type: 'counter', + help: 'Number of flushes to the pageserver connection', + values: [ + 'pageserver_send_flushes_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/pg_stats_userdb.libsonnet b/compute/etc/sql_exporter/pg_stats_userdb.libsonnet new file mode 100644 index 0000000000..46ea2f4192 --- /dev/null +++ b/compute/etc/sql_exporter/pg_stats_userdb.libsonnet @@ -0,0 +1,18 @@ +{ + metric_name: 'pg_stats_userdb', + type: 'gauge', + help: 'Stats for several oldest non-system dbs', + key_labels: [ + 'datname', + ], + value_label: 'kind', + values: [ + 'db_size', + 'deadlocks', + // Rows + 'inserted', + 'updated', + 'deleted', + ], + query: importstr 'sql_exporter/pg_stats_userdb.sql', +} diff --git a/compute/etc/sql_exporter/pg_stats_userdb.sql b/compute/etc/sql_exporter/pg_stats_userdb.sql new file mode 100644 index 0000000000..00ada87370 --- /dev/null +++ b/compute/etc/sql_exporter/pg_stats_userdb.sql @@ -0,0 +1,10 @@ +-- We export stats for 10 non-system databases. Without this limit it is too +-- easy to abuse the system by creating lots of databases. 
+ +SELECT pg_database_size(datname) AS db_size, deadlocks, tup_inserted AS inserted, + tup_updated AS updated, tup_deleted AS deleted, datname +FROM pg_stat_database +WHERE datname IN ( + SELECT datname FROM pg_database + WHERE datname <> 'postgres' AND NOT datistemplate ORDER BY oid LIMIT 10 +); diff --git a/compute/etc/sql_exporter/replication_delay_bytes.libsonnet b/compute/etc/sql_exporter/replication_delay_bytes.libsonnet new file mode 100644 index 0000000000..3e5bb6af1f --- /dev/null +++ b/compute/etc/sql_exporter/replication_delay_bytes.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'replication_delay_bytes', + type: 'gauge', + help: 'Bytes between received and replayed LSN', + key_labels: null, + values: [ + 'replication_delay_bytes', + ], + query: importstr 'sql_exporter/replication_delay_bytes.sql', +} diff --git a/compute/etc/sql_exporter/replication_delay_bytes.sql b/compute/etc/sql_exporter/replication_delay_bytes.sql new file mode 100644 index 0000000000..60a6981acd --- /dev/null +++ b/compute/etc/sql_exporter/replication_delay_bytes.sql @@ -0,0 +1,6 @@ +-- We use a GREATEST call here because this calculation can be negative. The +-- calculation is not atomic, meaning after we've gotten the receive LSN, the +-- replay LSN may have advanced past the receive LSN we are using for the +-- calculation. 
+ +SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes; diff --git a/compute/etc/sql_exporter/replication_delay_seconds.libsonnet b/compute/etc/sql_exporter/replication_delay_seconds.libsonnet new file mode 100644 index 0000000000..d3f2c21b54 --- /dev/null +++ b/compute/etc/sql_exporter/replication_delay_seconds.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'replication_delay_seconds', + type: 'gauge', + help: 'Time since last LSN was replayed', + key_labels: null, + values: [ + 'replication_delay_seconds', + ], + query: importstr 'sql_exporter/replication_delay_seconds.sql', +} diff --git a/compute/etc/sql_exporter/replication_delay_seconds.sql b/compute/etc/sql_exporter/replication_delay_seconds.sql new file mode 100644 index 0000000000..a76809ad74 --- /dev/null +++ b/compute/etc/sql_exporter/replication_delay_seconds.sql @@ -0,0 +1,5 @@ +SELECT + CASE + WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 + ELSE GREATEST(0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) + END AS replication_delay_seconds; diff --git a/compute/etc/sql_exporter/retained_wal.libsonnet b/compute/etc/sql_exporter/retained_wal.libsonnet new file mode 100644 index 0000000000..f9eff5faa5 --- /dev/null +++ b/compute/etc/sql_exporter/retained_wal.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'retained_wal', + type: 'gauge', + help: 'Retained WAL in inactive replication slots', + key_labels: [ + 'slot_name', + ], + values: [ + 'retained_wal', + ], + query: importstr 'sql_exporter/retained_wal.sql', +} diff --git a/compute/etc/sql_exporter/retained_wal.sql b/compute/etc/sql_exporter/retained_wal.sql new file mode 100644 index 0000000000..6c58359461 --- /dev/null +++ b/compute/etc/sql_exporter/retained_wal.sql @@ -0,0 +1,5 @@ +SELECT + slot_name, + pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal +FROM pg_replication_slots +WHERE active = false; diff --git 
a/compute/etc/sql_exporter/wal_is_lost.libsonnet b/compute/etc/sql_exporter/wal_is_lost.libsonnet new file mode 100644 index 0000000000..3cd25f4b39 --- /dev/null +++ b/compute/etc/sql_exporter/wal_is_lost.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'wal_is_lost', + type: 'gauge', + help: 'Whether or not the replication slot wal_status is lost', + key_labels: [ + 'slot_name', + ], + values: [ + 'wal_is_lost', + ], + query: importstr 'sql_exporter/wal_is_lost.sql', +} diff --git a/compute/etc/sql_exporter/wal_is_lost.sql b/compute/etc/sql_exporter/wal_is_lost.sql new file mode 100644 index 0000000000..5521270851 --- /dev/null +++ b/compute/etc/sql_exporter/wal_is_lost.sql @@ -0,0 +1,7 @@ +SELECT + slot_name, + CASE + WHEN wal_status = 'lost' THEN 1 + ELSE 0 + END AS wal_is_lost +FROM pg_replication_slots; diff --git a/compute/etc/sql_exporter_autoscaling.yml b/compute/etc/sql_exporter_autoscaling.yml deleted file mode 100644 index 044557233e..0000000000 --- a/compute/etc/sql_exporter_autoscaling.yml +++ /dev/null @@ -1,33 +0,0 @@ -# Configuration for sql_exporter for autoscaling-agent -# Global defaults. -global: - # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. - scrape_timeout: 10s - # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. - scrape_timeout_offset: 500ms - # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. - min_interval: 0s - # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections, - # as will concurrent scrapes. - max_connections: 1 - # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should - # always be the same as max_connections. - max_idle_connections: 1 - # Maximum number of maximum amount of time a connection may be reused. 
Expired connections may be closed lazily before reuse. - # If 0, connections are not closed due to a connection's age. - max_connection_lifetime: 5m - -# The target to monitor and the collectors to execute on it. -target: - # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) - # the schema gets dropped or replaced to match the driver expected DSN format. - data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling' - - # Collectors (referenced by name) to execute on the target. - # Glob patterns are supported (see for syntax). - collectors: [neon_collector_autoscaling] - -# Collector files specifies a list of globs. One collector definition is read from each matching file. -# Glob patterns are supported (see for syntax). -collector_files: - - "neon_collector_autoscaling.yml" diff --git a/compute/jsonnet/neon.libsonnet b/compute/jsonnet/neon.libsonnet new file mode 100644 index 0000000000..583b631c58 --- /dev/null +++ b/compute/jsonnet/neon.libsonnet @@ -0,0 +1,16 @@ +local MIN_SUPPORTED_VERSION = 14; +local MAX_SUPPORTED_VERSION = 17; +local SUPPORTED_VERSIONS = std.range(MIN_SUPPORTED_VERSION, MAX_SUPPORTED_VERSION); + +# If we receive the pg_version with a leading "v", ditch it. +local pg_version = std.strReplace(std.extVar('pg_version'), 'v', ''); +local pg_version_num = std.parseInt(pg_version); + +assert std.setMember(pg_version_num, SUPPORTED_VERSIONS) : + std.format('%s is an unsupported Postgres version: %s', + [pg_version, std.toString(SUPPORTED_VERSIONS)]); + +{ + PG_MAJORVERSION: pg_version, + PG_MAJORVERSION_NUM: pg_version_num, +} diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml new file mode 100644 index 0000000000..51a55b513f --- /dev/null +++ b/compute/vm-image-spec-bookworm.yaml @@ -0,0 +1,126 @@ +# Supplemental file for neondatabase/autoscaling's vm-builder, for producing the VM compute image. 
+--- +commands: + - name: cgconfigparser + user: root + sysvInitAction: sysinit + shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664' + # restrict permissions on /neonvm/bin/resize-swap, because we grant access to compute_ctl for + # running it as root. + - name: chmod-resize-swap + user: root + sysvInitAction: sysinit + shell: 'chmod 711 /neonvm/bin/resize-swap' + - name: chmod-set-disk-quota + user: root + sysvInitAction: sysinit + shell: 'chmod 711 /neonvm/bin/set-disk-quota' + - name: pgbouncer + user: postgres + sysvInitAction: respawn + shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini' + - name: local_proxy + user: postgres + sysvInitAction: respawn + shell: '/usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432' + - name: postgres-exporter + user: nobody + sysvInitAction: respawn + shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter' + - name: sql-exporter + user: nobody + sysvInitAction: respawn + shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml -web.listen-address=:9399' + - name: sql-exporter-autoscaling + user: nobody + sysvInitAction: respawn + shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499' +shutdownHook: | + su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' +files: + - filename: compute_ctl-sudoers + content: | + # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap + # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), + # regardless of hostname (ALL) + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota + - filename: cgconfig.conf + content: | + # Configuration for cgroups in VM compute nodes + group neon-postgres { + perm { + admin { + uid = postgres; + } + task { 
+ gid = users; + } + } + memory {} + } +build: | + # Build cgroup-tools + # + # At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically + # libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor + # requires cgroup v2, so we'll build cgroup-tools ourselves. + # + # At time of migration to bookworm (2024-10-09), debian has a version of libcgroup/cgroup-tools 2.0.2, + # and it _probably_ can be used as-is. However, we'll build it ourselves to minimise the changeset + # for debian version migration. + # + FROM debian:bookworm-slim as libcgroup-builder + ENV LIBCGROUP_VERSION=v2.0.3 + + RUN set -exu \ + && apt update \ + && apt install --no-install-recommends -y \ + git \ + ca-certificates \ + automake \ + cmake \ + make \ + gcc \ + byacc \ + flex \ + libtool \ + libpam0g-dev \ + && git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \ + && INSTALL_DIR="/libcgroup-install" \ + && mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \ + && cd libcgroup \ + # extracted from bootstrap.sh, with modified flags: + && (test -d m4 || mkdir m4) \ + && autoreconf -fi \ + && rm -rf autom4te.cache \ + && CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \ + # actually build the thing... + && make install +merge: | + # tweak nofile limits + RUN set -e \ + && echo 'fs.file-max = 1048576' >>/etc/sysctl.conf \ + && test ! -e /etc/security || ( \ + echo '* - nofile 1048576' >>/etc/security/limits.conf \ + && echo 'root - nofile 1048576' >>/etc/security/limits.conf \ + ) + + # Allow postgres user (compute_ctl) to run swap resizer. + # Need to install sudo in order to allow this. + # + # Also, remove the 'read' permission from group/other on /neonvm/bin/resize-swap, just to be safe. 
+ RUN set -e \ + && apt update \ + && apt install --no-install-recommends -y \ + sudo \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + COPY compute_ctl-sudoers /etc/sudoers.d/compute_ctl-sudoers + + COPY cgconfig.conf /etc/cgconfig.conf + + RUN set -e \ + && chmod 0644 /etc/cgconfig.conf + + COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ + COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/ + COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/ diff --git a/compute/vm-image-spec.yaml b/compute/vm-image-spec-bullseye.yaml similarity index 100% rename from compute/vm-image-spec.yaml rename to compute/vm-image-spec-bullseye.yaml diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 3d8b22a8a3..72578b1f34 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -33,6 +33,7 @@ fn list_dbs(client: &mut Client) -> Result> { } /// Connect to every database (see list_dbs above) and get the list of installed extensions. +/// /// Same extension can be installed in multiple databases with different versions, /// we only keep the highest and lowest version across all databases. pub async fn get_installed_extensions(connstr: Url) -> Result { diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 7cdf621737..71514daa7c 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -97,7 +97,21 @@ impl ComputeControlPlane { for endpoint_dir in std::fs::read_dir(env.endpoints_path()) .with_context(|| format!("failed to list {}", env.endpoints_path().display()))? 
{ - let ep = Endpoint::from_dir_entry(endpoint_dir?, &env)?; + let ep_res = Endpoint::from_dir_entry(endpoint_dir?, &env); + let ep = match ep_res { + Ok(ep) => ep, + Err(e) => match e.downcast::() { + Ok(e) => { + // A parallel task could delete an endpoint while we have just scanned the directory + if e.kind() == std::io::ErrorKind::NotFound { + continue; + } else { + Err(e)? + } + } + Err(e) => Err(e)?, + }, + }; endpoints.insert(ep.endpoint_id.clone(), Arc::new(ep)); } diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index be4d61f009..1816825bda 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -16,7 +16,7 @@ aws-sdk-s3.workspace = true bytes.workspace = true camino = { workspace = true, features = ["serde1"] } humantime-serde.workspace = true -hyper0 = { workspace = true, features = ["stream"] } +hyper = { workspace = true, features = ["client"] } futures.workspace = true serde.workspace = true serde_json.workspace = true @@ -36,6 +36,7 @@ azure_storage.workspace = true azure_storage_blobs.workspace = true futures-util.workspace = true http-types.workspace = true +http-body-util.workspace = true itertools.workspace = true sync_wrapper = { workspace = true, features = ["futures"] } diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index f950f2886c..cde32df402 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -28,13 +28,15 @@ use aws_sdk_s3::{ Client, }; use aws_smithy_async::rt::sleep::TokioSleep; +use http_body_util::StreamBody; use http_types::StatusCode; use aws_smithy_types::{body::SdkBody, DateTime}; use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError}; use bytes::Bytes; use futures::stream::Stream; -use hyper0::Body; +use futures_util::StreamExt; +use hyper::body::Frame; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; use utils::backoff; @@ -710,8 +712,8 @@ impl 
RemoteStorage for S3Bucket { let started_at = start_measuring_requests(kind); - let body = Body::wrap_stream(from); - let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body)); + let body = StreamBody::new(from.map(|x| x.map(Frame::data))); + let bytes_stream = ByteStream::new(SdkBody::from_body_1_x(body)); let upload = self .client diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 7b735875b7..5bd6f4bedc 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -31,9 +31,12 @@ pub enum Scope { /// The scope used by pageservers in upcalls to storage controller and cloud control plane #[serde(rename = "generations_api")] GenerationsApi, - /// Allows access to control plane managment API and some storage controller endpoints. + /// Allows access to control plane managment API and all storage controller endpoints. Admin, + /// Allows access to control plane & storage controller endpoints used in infrastructure automation (e.g. node registration) + Infra, + /// Allows access to storage controller APIs used by the scrubber, to interrogate the state /// of a tenant & post scrub results. 
Scrubber, diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index 5e05e4e713..02fc9e3b99 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -28,6 +28,9 @@ pub enum ApiError { #[error("Resource temporarily unavailable: {0}")] ResourceUnavailable(Cow<'static, str>), + #[error("Too many requests: {0}")] + TooManyRequests(Cow<'static, str>), + #[error("Shutting down")] ShuttingDown, @@ -73,6 +76,10 @@ impl ApiError { err.to_string(), StatusCode::SERVICE_UNAVAILABLE, ), + ApiError::TooManyRequests(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::TOO_MANY_REQUESTS, + ), ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::REQUEST_TIMEOUT, diff --git a/pageserver/src/auth.rs b/pageserver/src/auth.rs index 9e3dedb75a..5c931fcfdb 100644 --- a/pageserver/src/auth.rs +++ b/pageserver/src/auth.rs @@ -14,14 +14,19 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< } (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope - (Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi | Scope::Scrubber, _) => { - Err(AuthError( - format!( - "JWT scope '{:?}' is ineligible for Pageserver auth", - claims.scope - ) - .into(), - )) - } + ( + Scope::Admin + | Scope::SafekeeperData + | Scope::GenerationsApi + | Scope::Infra + | Scope::Scrubber, + _, + ) => Err(AuthError( + format!( + "JWT scope '{:?}' is ineligible for Pageserver auth", + claims.scope + ) + .into(), + )), } } diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index a32d09f3b3..975318419f 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -16,7 +16,7 @@ use fail::fail_point; use pageserver_api::key::Key; use postgres_ffi::pg_constants; use std::fmt::Write as FmtWrite; -use 
std::time::SystemTime; +use std::time::{Instant, SystemTime}; use tokio::io; use tokio::io::AsyncWrite; use tracing::*; @@ -352,12 +352,25 @@ where } } - for (path, content) in self + let start_time = Instant::now(); + let aux_files = self .timeline .list_aux_files(self.lsn, self.ctx) .await - .map_err(|e| BasebackupError::Server(e.into()))? - { + .map_err(|e| BasebackupError::Server(e.into()))?; + let aux_scan_time = start_time.elapsed(); + let aux_estimated_size = aux_files + .values() + .map(|content| content.len()) + .sum::(); + info!( + "Scanned {} aux files in {}ms, aux file content size = {}", + aux_files.len(), + aux_scan_time.as_millis(), + aux_estimated_size + ); + + for (path, content) in aux_files { if path.starts_with("pg_replslot") { let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN; let restart_lsn = Lsn(u64::from_le_bytes( diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2985ab1efb..36a6ed427b 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -77,6 +77,7 @@ use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::storage_layer::LayerName; +use crate::tenant::timeline::offload::offload_timeline; use crate::tenant::timeline::CompactFlags; use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::Timeline; @@ -325,6 +326,7 @@ impl From for ApiError { match value { NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()), Timeout => ApiError::Timeout("hit pageserver internal timeout".into()), + Cancelled => ApiError::ShuttingDown, e @ HasArchivedParent(_) => { ApiError::PreconditionFailed(e.to_string().into_boxed_str()) } @@ -715,8 +717,15 @@ async fn timeline_archival_config_handler( .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + tenant - 
.apply_timeline_archival_config(timeline_id, request_data.state, ctx) + .apply_timeline_archival_config( + timeline_id, + request_data.state, + state.broker_client.clone(), + ctx, + ) .await?; Ok::<_, ApiError>(()) } @@ -1783,6 +1792,49 @@ async fn timeline_compact_handler( .await } +// Run offload immediately on given timeline. +async fn timeline_offload_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + + async { + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + if tenant.get_offloaded_timeline(timeline_id).is_ok() { + return json_response(StatusCode::OK, ()); + } + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + if !tenant.timeline_has_no_attached_children(timeline_id) { + return Err(ApiError::PreconditionFailed( + "timeline has attached children".into(), + )); + } + if !timeline.can_offload() { + return Err(ApiError::PreconditionFailed( + "Timeline::can_offload() returned false".into(), + )); + } + offload_timeline(&tenant, &timeline) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) + } + .instrument(info_span!("manual_timeline_offload", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) + .await +} + // Run checkpoint immediately on given timeline. 
async fn timeline_checkpoint_handler( request: Request, @@ -3006,6 +3058,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact", |r| api_handler(r, timeline_compact_handler), ) + .put( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/offload", + |r| testing_api_handler("attempt timeline offload", r, timeline_offload_handler), + ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint", |r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler), diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 8fa6b9a7f0..afb2f92ff8 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -26,8 +26,8 @@ use std::str::FromStr; use std::sync::Arc; use std::time::SystemTime; use std::time::{Duration, Instant}; -use tokio::io::AsyncWriteExt; use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::io::{AsyncWriteExt, BufWriter}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; @@ -1137,10 +1137,10 @@ impl PageServerHandler { .await .map_err(map_basebackup_error)?; } else { - let mut writer = pgb.copyout_writer(); + let mut writer = BufWriter::new(pgb.copyout_writer()); if gzip { let mut encoder = GzipEncoder::with_quality( - writer, + &mut writer, // NOTE using fast compression because it's on the critical path // for compute startup. For an empty database, we get // <100KB with this method. 
The Level::Best compression method @@ -1175,6 +1175,10 @@ impl PageServerHandler { .await .map_err(map_basebackup_error)?; } + writer + .flush() + .await + .map_err(|e| map_basebackup_error(BasebackupError::Client(e)))?; } pgb.write_message_noflush(&BeMessage::CopyDone) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 7aa313f031..900da5beab 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1545,9 +1545,6 @@ impl<'a> DatadirModification<'a> { // Update relation size cache self.tline.set_cached_rel_size(rel, self.lsn, nblocks); - // Update relation size cache - self.tline.set_cached_rel_size(rel, self.lsn, nblocks); - // Update logical database size. self.pending_nblocks -= old_size as i64 - nblocks as i64; } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d2818d04dc..689982ddd4 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -67,7 +67,7 @@ use self::metadata::TimelineMetadata; use self::mgr::GetActiveTenantError; use self::mgr::GetTenantError; use self::remote_timeline_client::upload::upload_index_part; -use self::remote_timeline_client::RemoteTimelineClient; +use self::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError}; use self::timeline::uninit::TimelineCreateGuard; use self::timeline::uninit::TimelineExclusionError; use self::timeline::uninit::UninitializedTimeline; @@ -493,6 +493,8 @@ pub struct OffloadedTimeline { pub tenant_shard_id: TenantShardId, pub timeline_id: TimelineId, pub ancestor_timeline_id: Option, + /// Whether to retain the branch lsn at the ancestor or not + pub ancestor_retain_lsn: Option, // TODO: once we persist offloaded state, make this lazily constructed pub remote_client: Arc, @@ -504,10 +506,14 @@ pub struct OffloadedTimeline { impl OffloadedTimeline { fn from_timeline(timeline: &Timeline) -> Self { + let ancestor_retain_lsn = timeline + .get_ancestor_timeline_id() + .map(|_timeline_id| 
timeline.get_ancestor_lsn()); Self { tenant_shard_id: timeline.tenant_shard_id, timeline_id: timeline.timeline_id, ancestor_timeline_id: timeline.get_ancestor_timeline_id(), + ancestor_retain_lsn, remote_client: timeline.remote_client.clone(), delete_progress: timeline.delete_progress.clone(), @@ -515,6 +521,12 @@ impl OffloadedTimeline { } } +#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] +pub enum MaybeOffloaded { + Yes, + No, +} + #[derive(Clone)] pub enum TimelineOrOffloaded { Timeline(Arc), @@ -607,6 +619,9 @@ pub enum TimelineArchivalError { #[error("Timeout")] Timeout, + #[error("Cancelled")] + Cancelled, + #[error("ancestor is archived: {}", .0)] HasArchivedParent(TimelineId), @@ -617,7 +632,7 @@ pub enum TimelineArchivalError { AlreadyInProgress, #[error(transparent)] - Other(#[from] anyhow::Error), + Other(anyhow::Error), } impl Debug for TimelineArchivalError { @@ -625,6 +640,7 @@ impl Debug for TimelineArchivalError { match self { Self::NotFound => write!(f, "NotFound"), Self::Timeout => write!(f, "Timeout"), + Self::Cancelled => write!(f, "Cancelled"), Self::HasArchivedParent(p) => f.debug_tuple("HasArchivedParent").field(p).finish(), Self::HasUnarchivedChildren(c) => { f.debug_tuple("HasUnarchivedChildren").field(c).finish() @@ -1538,8 +1554,10 @@ impl Tenant { async fn unoffload_timeline( self: &Arc, timeline_id: TimelineId, + broker_client: storage_broker::BrokerClientChannel, ctx: RequestContext, ) -> Result, TimelineArchivalError> { + info!("unoffloading timeline"); let cancel = self.cancel.clone(); let timeline_preload = self .load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel) @@ -1554,6 +1572,7 @@ impl Tenant { error!(%timeline_id, "index_part not found on remote"); return Err(TimelineArchivalError::NotFound); } + Err(DownloadError::Cancelled) => return Err(TimelineArchivalError::Cancelled), Err(e) => { // Some (possibly ephemeral) error happened during index_part download. 
warn!(%timeline_id, "Failed to load index_part from remote storage, failed creation? ({e})"); @@ -1584,26 +1603,40 @@ impl Tenant { "failed to load remote timeline {} for tenant {}", timeline_id, self.tenant_shard_id ) - })?; + }) + .map_err(TimelineArchivalError::Other)?; let timelines = self.timelines.lock().unwrap(); - if let Some(timeline) = timelines.get(&timeline_id) { - let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap(); - if offloaded_timelines.remove(&timeline_id).is_none() { - warn!("timeline already removed from offloaded timelines"); - } - Ok(Arc::clone(timeline)) - } else { + let Some(timeline) = timelines.get(&timeline_id) else { warn!("timeline not available directly after attach"); - Err(TimelineArchivalError::Other(anyhow::anyhow!( + return Err(TimelineArchivalError::Other(anyhow::anyhow!( "timeline not available directly after attach" - ))) + ))); + }; + let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap(); + if offloaded_timelines.remove(&timeline_id).is_none() { + warn!("timeline already removed from offloaded timelines"); } + + // Activate the timeline (if it makes sense) + if !(timeline.is_broken() || timeline.is_stopping()) { + let background_jobs_can_start = None; + timeline.activate( + self.clone(), + broker_client.clone(), + background_jobs_can_start, + &ctx, + ); + } + + info!("timeline unoffloading complete"); + Ok(Arc::clone(timeline)) } pub(crate) async fn apply_timeline_archival_config( self: &Arc, timeline_id: TimelineId, new_state: TimelineArchivalState, + broker_client: storage_broker::BrokerClientChannel, ctx: RequestContext, ) -> Result<(), TimelineArchivalError> { info!("setting timeline archival config"); @@ -1644,18 +1677,29 @@ impl Tenant { Some(Arc::clone(timeline)) }; - // Second part: unarchive timeline (if needed) + // Second part: unoffload timeline (if needed) let timeline = if let Some(timeline) = timeline_or_unarchive_offloaded { timeline } else { // Turn offloaded timeline into a 
non-offloaded one - self.unoffload_timeline(timeline_id, ctx).await? + self.unoffload_timeline(timeline_id, broker_client, ctx) + .await? }; // Third part: upload new timeline archival state and block until it is present in S3 - let upload_needed = timeline + let upload_needed = match timeline .remote_client - .schedule_index_upload_for_timeline_archival_state(new_state)?; + .schedule_index_upload_for_timeline_archival_state(new_state) + { + Ok(upload_needed) => upload_needed, + Err(e) => { + if timeline.cancel.is_cancelled() { + return Err(TimelineArchivalError::Cancelled); + } else { + return Err(TimelineArchivalError::Other(e)); + } + } + }; if upload_needed { info!("Uploading new state"); @@ -1666,11 +1710,33 @@ impl Tenant { tracing::warn!("reached timeout for waiting on upload queue"); return Err(TimelineArchivalError::Timeout); }; - v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?; + v.map_err(|e| match e { + WaitCompletionError::NotInitialized(e) => { + TimelineArchivalError::Other(anyhow::anyhow!(e)) + } + WaitCompletionError::UploadQueueShutDownOrStopped => { + TimelineArchivalError::Cancelled + } + })?; } Ok(()) } + pub fn get_offloaded_timeline( + &self, + timeline_id: TimelineId, + ) -> Result, GetTimelineError> { + self.timelines_offloaded + .lock() + .unwrap() + .get(&timeline_id) + .map(Arc::clone) + .ok_or(GetTimelineError::NotFound { + tenant_id: self.tenant_shard_id, + timeline_id, + }) + } + pub(crate) fn tenant_shard_id(&self) -> TenantShardId { self.tenant_shard_id } @@ -2206,6 +2272,13 @@ impl Tenant { } } + pub fn timeline_has_no_attached_children(&self, timeline_id: TimelineId) -> bool { + let timelines = self.timelines.lock().unwrap(); + !timelines + .iter() + .any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(timeline_id)) + } + pub fn current_state(&self) -> TenantState { self.state.borrow().clone() } @@ -2253,12 +2326,13 @@ impl Tenant { if activating { let timelines_accessor = self.timelines.lock().unwrap(); + let 
timelines_offloaded_accessor = self.timelines_offloaded.lock().unwrap(); let timelines_to_activate = timelines_accessor .values() .filter(|timeline| !(timeline.is_broken() || timeline.is_stopping())); // Before activation, populate each Timeline's GcInfo with information about its children - self.initialize_gc_info(&timelines_accessor); + self.initialize_gc_info(&timelines_accessor, &timelines_offloaded_accessor); // Spawn gc and compaction loops. The loops will shut themselves // down when they notice that the tenant is inactive. @@ -3294,10 +3368,11 @@ impl Tenant { /// Populate all Timelines' `GcInfo` with information about their children. We do not set the /// PITR cutoffs here, because that requires I/O: this is done later, before GC, by [`Self::refresh_gc_info_internal`] /// - /// Subsequently, parent-child relationships are updated incrementally during timeline creation/deletion. + /// Subsequently, parent-child relationships are updated incrementally inside [`Timeline::new`] and [`Timeline::drop`]. fn initialize_gc_info( &self, timelines: &std::sync::MutexGuard>>, + timelines_offloaded: &std::sync::MutexGuard>>, ) { // This function must be called before activation: after activation timeline create/delete operations // might happen, and this function is not safe to run concurrently with those. @@ -3305,20 +3380,37 @@ impl Tenant { // Scan all timelines. For each timeline, remember the timeline ID and // the branch point where it was created. 
- let mut all_branchpoints: BTreeMap<TimelineId, Vec<(Lsn, TimelineId)>> = BTreeMap::new(); + let mut all_branchpoints: BTreeMap<TimelineId, Vec<(Lsn, TimelineId, MaybeOffloaded)>> = + BTreeMap::new(); timelines.iter().for_each(|(timeline_id, timeline_entry)| { if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() { let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default(); - ancestor_children.push((timeline_entry.get_ancestor_lsn(), *timeline_id)); + ancestor_children.push(( + timeline_entry.get_ancestor_lsn(), + *timeline_id, + MaybeOffloaded::No, + )); } }); + timelines_offloaded + .iter() + .for_each(|(timeline_id, timeline_entry)| { + let Some(ancestor_timeline_id) = &timeline_entry.ancestor_timeline_id else { + return; + }; + let Some(retain_lsn) = timeline_entry.ancestor_retain_lsn else { + return; + }; + let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default(); + ancestor_children.push((retain_lsn, *timeline_id, MaybeOffloaded::Yes)); + }); // The number of bytes we always keep, irrespective of PITR: this is a constant across timelines let horizon = self.get_gc_horizon(); // Populate each timeline's GcInfo with information about its child branches for timeline in timelines.values() { - let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints + let mut branchpoints: Vec<(Lsn, TimelineId, MaybeOffloaded)> = all_branchpoints .remove(&timeline.timeline_id) .unwrap_or_default(); @@ -4878,7 +4970,10 @@ mod tests { { let branchpoints = &tline.gc_info.read().unwrap().retain_lsns; assert_eq!(branchpoints.len(), 1); - assert_eq!(branchpoints[0], (Lsn(0x40), NEW_TIMELINE_ID)); + assert_eq!( + branchpoints[0], + (Lsn(0x40), NEW_TIMELINE_ID, MaybeOffloaded::No) + ); } // You can read the key from the child branch even though the parent is @@ -8261,8 +8356,8 @@ mod tests { let mut guard = tline.gc_info.write().unwrap(); *guard = GcInfo { retain_lsns: vec![ - (Lsn(0x10), tline.timeline_id), - (Lsn(0x20), tline.timeline_id), + (Lsn(0x10), tline.timeline_id, 
MaybeOffloaded::No), + (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No), ], cutoffs: GcCutoffs { time: Lsn(0x30), @@ -8489,8 +8584,8 @@ mod tests { let mut guard = tline.gc_info.write().unwrap(); *guard = GcInfo { retain_lsns: vec![ - (Lsn(0x10), tline.timeline_id), - (Lsn(0x20), tline.timeline_id), + (Lsn(0x10), tline.timeline_id, MaybeOffloaded::No), + (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No), ], cutoffs: GcCutoffs { time: Lsn(0x30), @@ -8723,7 +8818,7 @@ mod tests { // Update GC info let mut guard = parent_tline.gc_info.write().unwrap(); *guard = GcInfo { - retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id)], + retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id, MaybeOffloaded::No)], cutoffs: GcCutoffs { time: Lsn(0x10), space: Lsn(0x10), @@ -8737,7 +8832,7 @@ mod tests { // Update GC info let mut guard = branch_tline.gc_info.write().unwrap(); *guard = GcInfo { - retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id)], + retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id, MaybeOffloaded::No)], cutoffs: GcCutoffs { time: Lsn(0x50), space: Lsn(0x50), diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 41d558d3f6..4a4c698b56 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -12,7 +12,7 @@ use crate::context::RequestContext; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use super::{GcError, LogicalSizeCalculationCause, Tenant}; -use crate::tenant::Timeline; +use crate::tenant::{MaybeOffloaded, Timeline}; use utils::id::TimelineId; use utils::lsn::Lsn; @@ -264,10 +264,12 @@ pub(super) async fn gather_inputs( let mut lsns: Vec<(Lsn, LsnKind)> = gc_info .retain_lsns .iter() - .filter(|(lsn, _child_id)| lsn > &ancestor_lsn) + .filter(|(lsn, _child_id, is_offloaded)| { + lsn > &ancestor_lsn && *is_offloaded == MaybeOffloaded::No + }) .copied() // this assumes there are no other retain_lsns than the branchpoints - .map(|(lsn, _child_id)| (lsn, LsnKind::BranchPoint)) + 
.map(|(lsn, _child_id, _is_offloaded)| (lsn, LsnKind::BranchPoint)) .collect::<Vec<_>>(); lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint))); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2fd4e699cf..1992dee930 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -139,8 +139,10 @@ use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; use super::{ - config::TenantConf, storage_layer::inmemory_layer, storage_layer::LayerVisibilityHint, + config::TenantConf, + storage_layer::{inmemory_layer, LayerVisibilityHint}, upload_queue::NotInitialized, + MaybeOffloaded, }; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; @@ -450,7 +452,7 @@ pub(crate) struct GcInfo { /// Currently, this includes all points where child branches have /// been forked off from. In the future, could also include /// explicit user-defined snapshot points. - pub(crate) retain_lsns: Vec<(Lsn, TimelineId)>, + pub(crate) retain_lsns: Vec<(Lsn, TimelineId, MaybeOffloaded)>, /// The cutoff coordinates, which are combined by selecting the minimum. 
pub(crate) cutoffs: GcCutoffs, @@ -467,8 +469,13 @@ impl GcInfo { self.cutoffs.select_min() } - pub(super) fn insert_child(&mut self, child_id: TimelineId, child_lsn: Lsn) { - self.retain_lsns.push((child_lsn, child_id)); + pub(super) fn insert_child( + &mut self, + child_id: TimelineId, + child_lsn: Lsn, + is_offloaded: MaybeOffloaded, + ) { + self.retain_lsns.push((child_lsn, child_id, is_offloaded)); self.retain_lsns.sort_by_key(|i| i.0); } @@ -2164,7 +2171,9 @@ impl Timeline { if let Some(ancestor) = &ancestor { let mut ancestor_gc_info = ancestor.gc_info.write().unwrap(); - ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn()); + // If we construct an explicit timeline object, it's obviously not offloaded + let is_offloaded = MaybeOffloaded::No; + ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn(), is_offloaded); } Arc::new_cyclic(|myself| { @@ -3083,7 +3092,6 @@ impl Timeline { } impl Timeline { - #[allow(unknown_lints)] // doc_lazy_continuation is still a new lint #[allow(clippy::doc_lazy_continuation)] /// Get the data needed to reconstruct all keys in the provided keyspace /// @@ -4875,7 +4883,7 @@ impl Timeline { let retain_lsns = gc_info .retain_lsns .iter() - .map(|(lsn, _child_id)| *lsn) + .map(|(lsn, _child_id, _is_offloaded)| *lsn) .collect(); // Gets the maximum LSN that holds the valid lease. 
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 9f64471432..8b9ace1e5b 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -42,7 +42,7 @@ use crate::tenant::storage_layer::{ use crate::tenant::timeline::ImageLayerCreationOutcome; use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; use crate::tenant::timeline::{Layer, ResidentLayer}; -use crate::tenant::DeltaLayer; +use crate::tenant::{DeltaLayer, MaybeOffloaded}; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; use pageserver_api::config::tenant_conf_defaults::{ DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD, @@ -639,7 +639,10 @@ impl Timeline { let children = self.gc_info.read().unwrap().retain_lsns.clone(); let mut readable_points = Vec::with_capacity(children.len() + 1); - for (child_lsn, _child_timeline_id) in &children { + for (child_lsn, _child_timeline_id, is_offloaded) in &children { + if *is_offloaded == MaybeOffloaded::Yes { + continue; + } readable_points.push(*child_lsn); } readable_points.push(head_lsn); @@ -1741,7 +1744,7 @@ impl Timeline { let gc_info = self.gc_info.read().unwrap(); let mut retain_lsns_below_horizon = Vec::new(); let gc_cutoff = gc_info.cutoffs.select_min(); - for (lsn, _timeline_id) in &gc_info.retain_lsns { + for (lsn, _timeline_id, _is_offloaded) in &gc_info.retain_lsns { if lsn < &gc_cutoff { retain_lsns_below_horizon.push(*lsn); } diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index fb906d906b..7e6084baaf 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -19,6 +19,9 @@ pub(crate) async fn offload_timeline( return Ok(()); }; + // Now that the Timeline is in Stopping state, request all the related tasks to shut down. 
+ timeline.shutdown(super::ShutdownMode::Hard).await; + // TODO extend guard mechanism above with method // to make deletions possible while offloading is in progress diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index d789526050..70b250d394 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -43,6 +43,7 @@ #include "hll.h" #include "bitmap.h" #include "neon.h" +#include "neon_perf_counters.h" #define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0) @@ -114,7 +115,9 @@ typedef struct FileCacheControl uint32 limit; /* shared copy of lfc_size_limit */ uint64 hits; uint64 misses; - uint64 writes; + uint64 writes; /* number of writes issued */ + uint64 time_read; /* time spent reading (us) */ + uint64 time_write; /* time spent writing (us) */ dlist_head lru; /* double linked list for LRU replacement * algorithm */ dlist_head holes; /* double linked list of punched holes */ @@ -270,6 +273,8 @@ lfc_shmem_startup(void) lfc_ctl->hits = 0; lfc_ctl->misses = 0; lfc_ctl->writes = 0; + lfc_ctl->time_read = 0; + lfc_ctl->time_write = 0; dlist_init(&lfc_ctl->lru); dlist_init(&lfc_ctl->holes); @@ -612,31 +617,34 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) /* remove the page from the cache */ entry->bitmap[chunk_offs >> 5] &= ~(1 << (chunk_offs & (32 - 1))); - /* - * If the chunk has no live entries, we can position the chunk to be - * recycled first. - */ - if (entry->bitmap[chunk_offs >> 5] == 0) + if (entry->access_count == 0) { - bool has_remaining_pages = false; - - for (int i = 0; i < CHUNK_BITMAP_SIZE; i++) - { - if (entry->bitmap[i] != 0) - { - has_remaining_pages = true; - break; - } - } - /* - * Put the entry at the position that is first to be reclaimed when we - * have no cached pages remaining in the chunk + * If the chunk has no live entries, we can position the chunk to be + * recycled first. 
*/ - if (!has_remaining_pages) + if (entry->bitmap[chunk_offs >> 5] == 0) { - dlist_delete(&entry->list_node); - dlist_push_head(&lfc_ctl->lru, &entry->list_node); + bool has_remaining_pages = false; + + for (int i = 0; i < CHUNK_BITMAP_SIZE; i++) + { + if (entry->bitmap[i] != 0) + { + has_remaining_pages = true; + break; + } + } + + /* + * Put the entry at the position that is first to be reclaimed when we + * have no cached pages remaining in the chunk + */ + if (!has_remaining_pages) + { + dlist_delete(&entry->list_node); + dlist_push_head(&lfc_ctl->lru, &entry->list_node); + } } } @@ -701,6 +709,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); int iteration_hits = 0; int iteration_misses = 0; + uint64 io_time_us = 0; Assert(blocks_in_chunk > 0); for (int i = 0; i < blocks_in_chunk; i++) @@ -795,6 +804,13 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, lfc_ctl->misses += iteration_misses; pgBufferUsage.file_cache.hits += iteration_hits; pgBufferUsage.file_cache.misses += iteration_misses; + + if (iteration_hits) + { + lfc_ctl->time_read += io_time_us; + inc_page_cache_read_wait(io_time_us); + } + CriticalAssert(entry->access_count > 0); if (--entry->access_count == 0) dlist_push_tail(&lfc_ctl->lru, &entry->list_node); @@ -859,6 +875,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, struct iovec iov[PG_IOV_MAX]; int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); + instr_time io_start, io_end; Assert(blocks_in_chunk > 0); for (int i = 0; i < blocks_in_chunk; i++) @@ -947,12 +964,13 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, generation = lfc_ctl->generation; entry_offset = entry->offset; - lfc_ctl->writes += blocks_in_chunk; LWLockRelease(lfc_lock); 
pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE); + INSTR_TIME_SET_CURRENT(io_start); rc = pwritev(lfc_desc, iov, blocks_in_chunk, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + INSTR_TIME_SET_CURRENT(io_end); pgstat_report_wait_end(); if (rc != BLCKSZ * blocks_in_chunk) @@ -965,9 +983,17 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (lfc_ctl->generation == generation) { + uint64 time_spent_us; CriticalAssert(LFC_ENABLED()); /* Place entry to the head of LRU list */ CriticalAssert(entry->access_count > 0); + + lfc_ctl->writes += blocks_in_chunk; + INSTR_TIME_SUBTRACT(io_end, io_start); + time_spent_us = INSTR_TIME_GET_MICROSEC(io_end); + lfc_ctl->time_write += time_spent_us; + inc_page_cache_write_wait(time_spent_us); + if (--entry->access_count == 0) dlist_push_tail(&lfc_ctl->lru, &entry->list_node); diff --git a/pgxn/neon/neon_perf_counters.c b/pgxn/neon/neon_perf_counters.c index a497d387c8..05db187076 100644 --- a/pgxn/neon/neon_perf_counters.c +++ b/pgxn/neon/neon_perf_counters.c @@ -50,28 +50,52 @@ NeonPerfCountersShmemInit(void) } } -/* - * Count a GetPage wait operation. - */ -void -inc_getpage_wait(uint64 latency_us) +static inline void +inc_iohist(IOHistogram hist, uint64 latency_us) { int lo = 0; - int hi = NUM_GETPAGE_WAIT_BUCKETS - 1; + int hi = NUM_IO_WAIT_BUCKETS - 1; /* Find the right bucket with binary search */ while (lo < hi) { int mid = (lo + hi) / 2; - if (latency_us < getpage_wait_bucket_thresholds[mid]) + if (latency_us < io_wait_bucket_thresholds[mid]) hi = mid; else lo = mid + 1; } - MyNeonCounters->getpage_wait_us_bucket[lo]++; - MyNeonCounters->getpage_wait_us_sum += latency_us; - MyNeonCounters->getpage_wait_us_count++; + hist->wait_us_bucket[lo]++; + hist->wait_us_sum += latency_us; + hist->wait_us_count++; +} + +/* + * Count a GetPage wait operation. 
+ */ +void +inc_getpage_wait(uint64 latency) +{ + inc_iohist(&MyNeonCounters->getpage_hist, latency); +} + +/* + * Count an LFC read wait operation. + */ +void +inc_page_cache_read_wait(uint64 latency) +{ + inc_iohist(&MyNeonCounters->file_cache_read_hist, latency); +} + +/* + * Count an LFC write wait operation. + */ +void +inc_page_cache_write_wait(uint64 latency) +{ + inc_iohist(&MyNeonCounters->file_cache_write_hist, latency); } /* @@ -81,77 +105,91 @@ inc_getpage_wait(uint64 latency_us) typedef struct { - char *name; + const char *name; bool is_bucket; double bucket_le; double value; } metric_t; -static metric_t * -neon_perf_counters_to_metrics(neon_per_backend_counters *counters) +static int +histogram_to_metrics(IOHistogram histogram, + metric_t *metrics, + const char *count, + const char *sum, + const char *bucket) { -#define NUM_METRICS (2 + NUM_GETPAGE_WAIT_BUCKETS + 8) - metric_t *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t)); - uint64 bucket_accum; - int i = 0; + int i = 0; + uint64 bucket_accum = 0; - metrics[i].name = "getpage_wait_seconds_count"; + metrics[i].name = count; metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_wait_us_count; + metrics[i].value = (double) histogram->wait_us_count; i++; - metrics[i].name = "getpage_wait_seconds_sum"; + metrics[i].name = sum; metrics[i].is_bucket = false; - metrics[i].value = ((double) counters->getpage_wait_us_sum) / 1000000.0; + metrics[i].value = (double) histogram->wait_us_sum / 1000000.0; i++; - - bucket_accum = 0; - for (int bucketno = 0; bucketno < NUM_GETPAGE_WAIT_BUCKETS; bucketno++) + for (int bucketno = 0; bucketno < NUM_IO_WAIT_BUCKETS; bucketno++) { - uint64 threshold = getpage_wait_bucket_thresholds[bucketno]; + uint64 threshold = io_wait_bucket_thresholds[bucketno]; - bucket_accum += counters->getpage_wait_us_bucket[bucketno]; + bucket_accum += histogram->wait_us_bucket[bucketno]; - metrics[i].name = "getpage_wait_seconds_bucket"; + metrics[i].name = 
bucket; metrics[i].is_bucket = true; metrics[i].bucket_le = (threshold == UINT64_MAX) ? INFINITY : ((double) threshold) / 1000000.0; metrics[i].value = (double) bucket_accum; i++; } - metrics[i].name = "getpage_prefetch_requests_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_prefetch_requests_total; - i++; - metrics[i].name = "getpage_sync_requests_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_sync_requests_total; - i++; - metrics[i].name = "getpage_prefetch_misses_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_prefetch_misses_total; - i++; - metrics[i].name = "getpage_prefetch_discards_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_prefetch_discards_total; - i++; - metrics[i].name = "pageserver_requests_sent_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->pageserver_requests_sent_total; - i++; - metrics[i].name = "pageserver_disconnects_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->pageserver_disconnects_total; - i++; - metrics[i].name = "pageserver_send_flushes_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->pageserver_send_flushes_total; - i++; - metrics[i].name = "file_cache_hits_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->file_cache_hits_total; - i++; + + return i; +} + +static metric_t * +neon_perf_counters_to_metrics(neon_per_backend_counters *counters) +{ +#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 10) + metric_t *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t)); + int i = 0; + +#define APPEND_METRIC(_name) do { \ + metrics[i].name = #_name; \ + metrics[i].is_bucket = false; \ + metrics[i].value = (double) counters->_name; \ + i++; \ + } while (false) + + i += histogram_to_metrics(&counters->getpage_hist, &metrics[i], + 
"getpage_wait_seconds_count", + "getpage_wait_seconds_sum", + "getpage_wait_seconds_bucket"); + + APPEND_METRIC(getpage_prefetch_requests_total); + APPEND_METRIC(getpage_sync_requests_total); + APPEND_METRIC(getpage_prefetch_misses_total); + APPEND_METRIC(getpage_prefetch_discards_total); + APPEND_METRIC(pageserver_requests_sent_total); + APPEND_METRIC(pageserver_disconnects_total); + APPEND_METRIC(pageserver_send_flushes_total); + APPEND_METRIC(pageserver_open_requests); + APPEND_METRIC(getpage_prefetches_buffered); + + APPEND_METRIC(file_cache_hits_total); + + i += histogram_to_metrics(&counters->file_cache_read_hist, &metrics[i], + "file_cache_read_wait_seconds_count", + "file_cache_read_wait_seconds_sum", + "file_cache_read_wait_seconds_bucket"); + i += histogram_to_metrics(&counters->file_cache_write_hist, &metrics[i], + "file_cache_write_wait_seconds_count", + "file_cache_write_wait_seconds_sum", + "file_cache_write_wait_seconds_bucket"); Assert(i == NUM_METRICS); +#undef APPEND_METRIC +#undef NUM_METRICS + /* NULL entry marks end of array */ metrics[i].name = NULL; metrics[i].value = 0; @@ -216,6 +254,15 @@ neon_get_backend_perf_counters(PG_FUNCTION_ARGS) return (Datum) 0; } +static inline void +histogram_merge_into(IOHistogram into, IOHistogram from) +{ + into->wait_us_count += from->wait_us_count; + into->wait_us_sum += from->wait_us_sum; + for (int bucketno = 0; bucketno < NUM_IO_WAIT_BUCKETS; bucketno++) + into->wait_us_bucket[bucketno] += from->wait_us_bucket[bucketno]; +} + PG_FUNCTION_INFO_V1(neon_get_perf_counters); Datum neon_get_perf_counters(PG_FUNCTION_ARGS) @@ -234,10 +281,7 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) { neon_per_backend_counters *counters = &neon_per_backend_counters_shared[procno]; - totals.getpage_wait_us_count += counters->getpage_wait_us_count; - totals.getpage_wait_us_sum += counters->getpage_wait_us_sum; - for (int bucketno = 0; bucketno < NUM_GETPAGE_WAIT_BUCKETS; bucketno++) - totals.getpage_wait_us_bucket[bucketno] += 
counters->getpage_wait_us_bucket[bucketno]; + histogram_merge_into(&totals.getpage_hist, &counters->getpage_hist); totals.getpage_prefetch_requests_total += counters->getpage_prefetch_requests_total; totals.getpage_sync_requests_total += counters->getpage_sync_requests_total; totals.getpage_prefetch_misses_total += counters->getpage_prefetch_misses_total; @@ -245,7 +289,11 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) totals.pageserver_requests_sent_total += counters->pageserver_requests_sent_total; totals.pageserver_disconnects_total += counters->pageserver_disconnects_total; totals.pageserver_send_flushes_total += counters->pageserver_send_flushes_total; + totals.pageserver_open_requests += counters->pageserver_open_requests; + totals.getpage_prefetches_buffered += counters->getpage_prefetches_buffered; totals.file_cache_hits_total += counters->file_cache_hits_total; + histogram_merge_into(&totals.file_cache_read_hist, &counters->file_cache_read_hist); + histogram_merge_into(&totals.file_cache_write_hist, &counters->file_cache_write_hist); } metrics = neon_perf_counters_to_metrics(&totals); diff --git a/pgxn/neon/neon_perf_counters.h b/pgxn/neon/neon_perf_counters.h index 49d477c4f8..8edc658a30 100644 --- a/pgxn/neon/neon_perf_counters.h +++ b/pgxn/neon/neon_perf_counters.h @@ -15,17 +15,26 @@ #include "storage/proc.h" #endif -static const uint64 getpage_wait_bucket_thresholds[] = { - 20, 30, 60, 100, /* 0 - 100 us */ +static const uint64 io_wait_bucket_thresholds[] = { + 2, 3, 6, 10, /* 0 us - 10 us */ + 20, 30, 60, 100, /* 10 us - 100 us */ 200, 300, 600, 1000, /* 100 us - 1 ms */ 2000, 3000, 6000, 10000, /* 1 ms - 10 ms */ 20000, 30000, 60000, 100000, /* 10 ms - 100 ms */ 200000, 300000, 600000, 1000000, /* 100 ms - 1 s */ 2000000, 3000000, 6000000, 10000000, /* 1 s - 10 s */ - 20000000, 30000000, 60000000, 100000000, /* 10 s - 100 s */ UINT64_MAX, }; -#define NUM_GETPAGE_WAIT_BUCKETS (lengthof(getpage_wait_bucket_thresholds)) +#define NUM_IO_WAIT_BUCKETS 
(lengthof(io_wait_bucket_thresholds)) + +typedef struct IOHistogramData +{ + uint64 wait_us_count; + uint64 wait_us_sum; + uint64 wait_us_bucket[NUM_IO_WAIT_BUCKETS]; +} IOHistogramData; + +typedef IOHistogramData *IOHistogram; typedef struct { @@ -39,9 +48,7 @@ typedef struct * the backend, but the 'neon_backend_perf_counters' view will convert * them to seconds, to make them more idiomatic as prometheus metrics. */ - uint64 getpage_wait_us_count; - uint64 getpage_wait_us_sum; - uint64 getpage_wait_us_bucket[NUM_GETPAGE_WAIT_BUCKETS]; + IOHistogramData getpage_hist; /* * Total number of speculative prefetch Getpage requests and synchronous @@ -50,7 +57,11 @@ typedef struct uint64 getpage_prefetch_requests_total; uint64 getpage_sync_requests_total; - /* XXX: It's not clear to me when these misses happen. */ + /* + * Total number of readahead misses; consisting of either prefetches that + * don't satisfy the LSN bounds, or cases where no readahead was issued + * for the read. + */ uint64 getpage_prefetch_misses_total; /* @@ -80,6 +91,16 @@ typedef struct * this can be smaller than pageserver_requests_sent_total. */ uint64 pageserver_send_flushes_total; + + /* + * Number of open requests to PageServer. + */ + uint64 pageserver_open_requests; + + /* + * Number of unused prefetches currently cached in this backend. + */ + uint64 getpage_prefetches_buffered; /* * Number of requests satisfied from the LFC. 
@@ -91,6 +112,9 @@ typedef struct */ uint64 file_cache_hits_total; + /* LFC I/O time buckets */ + IOHistogramData file_cache_read_hist; + IOHistogramData file_cache_write_hist; } neon_per_backend_counters; /* Pointer to the shared memory array of neon_per_backend_counters structs */ @@ -111,6 +135,8 @@ extern neon_per_backend_counters *neon_per_backend_counters_shared; #endif extern void inc_getpage_wait(uint64 latency); +extern void inc_page_cache_read_wait(uint64 latency); +extern void inc_page_cache_write_wait(uint64 latency); extern Size NeonPerfCountersShmemSize(void); extern void NeonPerfCountersShmemInit(void); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 3d9d9285df..cbb0e2ae6d 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -488,6 +488,11 @@ readahead_buffer_resize(int newsize, void *extra) newPState->n_unused -= 1; } + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + MyNeonCounters->pageserver_open_requests = + MyPState->n_requests_inflight; + for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1) { prefetch_set_unused(end); @@ -621,6 +626,8 @@ prefetch_read(PrefetchRequest *slot) MyPState->n_responses_buffered += 1; MyPState->n_requests_inflight -= 1; MyPState->ring_receive += 1; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; /* update slot state */ slot->status = PRFS_RECEIVED; @@ -674,6 +681,15 @@ prefetch_on_ps_disconnect(void) prefetch_set_unused(ring_index); } + + /* + * We can have gone into retry due to network error, so update stats with + * the latest available + */ + MyNeonCounters->pageserver_open_requests = + MyPState->n_requests_inflight; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; } /* @@ -706,6 +722,9 @@ prefetch_set_unused(uint64 ring_index) MyPState->n_responses_buffered -= 1; MyPState->n_unused += 1; + + MyNeonCounters->getpage_prefetches_buffered = + 
MyPState->n_responses_buffered; } else { @@ -820,6 +839,15 @@ prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, hashkey.buftag = tag; Retry: + /* + * We can have gone into retry due to network error, so update stats with + * the latest available + */ + MyNeonCounters->pageserver_open_requests = + MyPState->ring_unused - MyPState->ring_receive; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + min_ring_index = UINT64_MAX; for (int i = 0; i < nblocks; i++) { @@ -1001,6 +1029,9 @@ Retry: prefetch_do_request(slot, lsns); } + MyNeonCounters->pageserver_open_requests = + MyPState->ring_unused - MyPState->ring_receive; + Assert(any_hits); Assert(GetPrfSlot(min_ring_index)->status == PRFS_REQUESTED || @@ -1061,8 +1092,7 @@ page_server_request(void const *req) * Current sharding model assumes that all metadata is present only at shard 0. * We still need to call get_shard_no() to check if shard map is up-to-date. */ - if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || - ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM) + if (((NeonRequest *) req)->tag != T_NeonGetPageRequest) { shard_no = 0; } @@ -1076,8 +1106,10 @@ page_server_request(void const *req) { /* do nothing */ } + MyNeonCounters->pageserver_open_requests++; consume_prefetch_responses(); resp = page_server->receive(shard_no); + MyNeonCounters->pageserver_open_requests--; } PG_CATCH(); { @@ -1086,6 +1118,8 @@ page_server_request(void const *req) * point, but this currently seems fine for now. 
*/ page_server->disconnect(shard_no); + MyNeonCounters->pageserver_open_requests = 0; + PG_RE_THROW(); } PG_END_TRY(); diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 963fb94a7d..e25d2fcbab 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -42,9 +42,10 @@ hyper0.workspace = true hyper = { workspace = true, features = ["server", "http1", "http2"] } hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } http-body-util = { version = "0.1" } -indexmap.workspace = true +indexmap = { workspace = true, features = ["serde"] } ipnet.workspace = true itertools.workspace = true +itoa.workspace = true lasso = { workspace = true, features = ["multi-threaded"] } measured = { workspace = true, features = ["lasso"] } metrics.workspace = true diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index 94b84b6f00..de32a06e9e 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -1,16 +1,15 @@ -use super::{ComputeCredentials, ComputeUserInfo}; -use crate::{ - auth::{self, backend::ComputeCredentialKeys, AuthFlow}, - compute, - config::AuthenticationConfig, - context::RequestMonitoring, - control_plane::AuthSecret, - sasl, - stream::{PqStream, Stream}, -}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; +use super::{ComputeCredentials, ComputeUserInfo}; +use crate::auth::backend::ComputeCredentialKeys; +use crate::auth::{self, AuthFlow}; +use crate::config::AuthenticationConfig; +use crate::context::RequestMonitoring; +use crate::control_plane::AuthSecret; +use crate::stream::{PqStream, Stream}; +use crate::{compute, sasl}; + pub(super) async fn authenticate( ctx: &RequestMonitoring, creds: ComputeUserInfo, diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index a7cc678187..255e1fed54 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -1,18 +1,21 @@ -use 
crate::{ - auth, compute, - config::AuthenticationConfig, - context::RequestMonitoring, - control_plane::{self, provider::NodeInfo}, - error::{ReportableError, UserFacingError}, - stream::PqStream, - waiters, -}; +use async_trait::async_trait; use pq_proto::BeMessage as Be; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_postgres::config::SslMode; use tracing::{info, info_span}; +use super::ComputeCredentialKeys; +use crate::cache::Cached; +use crate::config::AuthenticationConfig; +use crate::context::RequestMonitoring; +use crate::control_plane::provider::NodeInfo; +use crate::control_plane::{self, CachedNodeInfo}; +use crate::error::{ReportableError, UserFacingError}; +use crate::proxy::connect_compute::ComputeConnectBackend; +use crate::stream::PqStream; +use crate::{auth, compute, waiters}; + #[derive(Debug, Error)] pub(crate) enum WebAuthError { #[error(transparent)] @@ -25,6 +28,11 @@ pub(crate) enum WebAuthError { Io(#[from] std::io::Error), } +#[derive(Debug)] +pub struct ConsoleRedirectBackend { + console_uri: reqwest::Url, +} + impl UserFacingError for WebAuthError { fn to_string_client(&self) -> String { "Internal error".to_string() @@ -57,7 +65,40 @@ pub(crate) fn new_psql_session_id() -> String { hex::encode(rand::random::<[u8; 8]>()) } -pub(super) async fn authenticate( +impl ConsoleRedirectBackend { + pub fn new(console_uri: reqwest::Url) -> Self { + Self { console_uri } + } + + pub(crate) async fn authenticate( + &self, + ctx: &RequestMonitoring, + auth_config: &'static AuthenticationConfig, + client: &mut PqStream, + ) -> auth::Result { + authenticate(ctx, auth_config, &self.console_uri, client) + .await + .map(ConsoleRedirectNodeInfo) + } +} + +pub struct ConsoleRedirectNodeInfo(pub(super) NodeInfo); + +#[async_trait] +impl ComputeConnectBackend for ConsoleRedirectNodeInfo { + async fn wake_compute( + &self, + _ctx: &RequestMonitoring, + ) -> Result { + Ok(Cached::new_uncached(self.0.clone())) + } + + fn get_keys(&self) -> 
&ComputeCredentialKeys { + &ComputeCredentialKeys::None + } +} + +async fn authenticate( ctx: &RequestMonitoring, auth_config: &'static AuthenticationConfig, link_uri: &reqwest::Url, diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index 749218d260..8ab8d5d37f 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -1,16 +1,15 @@ -use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint}; -use crate::{ - auth::{self, AuthFlow}, - config::AuthenticationConfig, - context::RequestMonitoring, - control_plane::AuthSecret, - intern::EndpointIdInt, - sasl, - stream::{self, Stream}, -}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; +use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint}; +use crate::auth::{self, AuthFlow}; +use crate::config::AuthenticationConfig; +use crate::context::RequestMonitoring; +use crate::control_plane::AuthSecret; +use crate::intern::EndpointIdInt; +use crate::sasl; +use crate::stream::{self, Stream}; + /// Compared to [SCRAM](crate::scram), cleartext password auth saves /// one round trip and *expensive* computations (>= 4096 HMAC iterations). 
/// These properties are benefical for serverless JS workers, so we diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 17ab7eda22..3f53ee24c3 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -1,23 +1,22 @@ -use std::{ - future::Future, - sync::Arc, - time::{Duration, SystemTime}, -}; +use std::future::Future; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; -use anyhow::{bail, ensure, Context}; use arc_swap::ArcSwapOption; use dashmap::DashMap; use jose_jwk::crypto::KeyInfo; -use serde::{de::Visitor, Deserialize, Deserializer}; +use serde::de::Visitor; +use serde::{Deserialize, Deserializer}; use signature::Verifier; +use thiserror::Error; use tokio::time::Instant; -use crate::{ - context::RequestMonitoring, http::parse_json_body_with_limit, intern::RoleNameInt, EndpointId, - RoleName, -}; - -use super::ComputeCredentialKeys; +use crate::auth::backend::ComputeCredentialKeys; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::GetEndpointJwksError; +use crate::http::parse_json_body_with_limit; +use crate::intern::RoleNameInt; +use crate::{EndpointId, RoleName}; // TODO(conrad): make these configurable. const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30); @@ -32,7 +31,16 @@ pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> impl Future>> + Send; + ) -> impl Future, FetchAuthRulesError>> + Send; +} + +#[derive(Error, Debug)] +pub(crate) enum FetchAuthRulesError { + #[error(transparent)] + GetEndpointJwks(#[from] GetEndpointJwksError), + + #[error("JWKs settings for this role were not configured")] + RoleJwksNotConfigured, } pub(crate) struct AuthRule { @@ -122,7 +130,7 @@ impl JwkCacheEntryLock { client: &reqwest::Client, endpoint: EndpointId, auth_rules: &F, - ) -> anyhow::Result> { + ) -> Result, JwtError> { // double check that no one beat us to updating the cache. 
let now = Instant::now(); let guard = self.cached.load_full(); @@ -188,7 +196,7 @@ impl JwkCacheEntryLock { client: &reqwest::Client, endpoint: EndpointId, fetch: &F, - ) -> Result, anyhow::Error> { + ) -> Result, JwtError> { let now = Instant::now(); let guard = self.cached.load_full(); @@ -243,27 +251,24 @@ impl JwkCacheEntryLock { endpoint: EndpointId, role_name: &RoleName, fetch: &F, - ) -> Result { + ) -> Result { // JWT compact form is defined to be // || . || || . || // where Signature = alg( || . || ); let (header_payload, signature) = jwt .rsplit_once('.') - .context("Provided authentication token is not a valid JWT encoding")?; + .ok_or(JwtEncodingError::InvalidCompactForm)?; let (header, payload) = header_payload .split_once('.') - .context("Provided authentication token is not a valid JWT encoding")?; + .ok_or(JwtEncodingError::InvalidCompactForm)?; - let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD) - .context("Provided authentication token is not a valid JWT encoding")?; - let header = serde_json::from_slice::>(&header) - .context("Provided authentication token is not a valid JWT encoding")?; + let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)?; + let header = serde_json::from_slice::>(&header)?; - let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD) - .context("Provided authentication token is not a valid JWT encoding")?; + let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)?; - let kid = header.key_id.context("missing key id")?; + let kid = header.key_id.ok_or(JwtError::MissingKeyId)?; let mut guard = self .get_or_update_jwk_cache(ctx, client, endpoint.clone(), fetch) @@ -281,16 +286,13 @@ impl JwkCacheEntryLock { .renew_jwks(permit, ctx, client, endpoint.clone(), fetch) .await?; } - _ => { - bail!("jwk not found"); - } + _ => return Err(JwtError::JwkNotFound), } }; - ensure!( - jwk.is_supported(&header.algorithm), - "signature algorithm not supported" - ); + if 
!jwk.is_supported(&header.algorithm) { + return Err(JwtError::SignatureAlgorithmNotSupported); + } match &jwk.key { jose_jwk::Key::Ec(key) => { @@ -299,34 +301,32 @@ impl JwkCacheEntryLock { jose_jwk::Key::Rsa(key) => { verify_rsa_signature(header_payload.as_bytes(), &sig, key, &header.algorithm)?; } - key => bail!("unsupported key type {key:?}"), + key => return Err(JwtError::UnsupportedKeyType(key.into())), }; - let payloadb = base64::decode_config(payload, base64::URL_SAFE_NO_PAD) - .context("Provided authentication token is not a valid JWT encoding")?; - let payload = serde_json::from_slice::>(&payloadb) - .context("Provided authentication token is not a valid JWT encoding")?; + let payloadb = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)?; + let payload = serde_json::from_slice::>(&payloadb)?; tracing::debug!(?payload, "JWT signature valid with claims"); if let Some(aud) = expected_audience { - ensure!( - payload.audience.0.iter().any(|s| s == aud), - "invalid JWT token audience" - ); + if payload.audience.0.iter().all(|s| s != aud) { + return Err(JwtError::InvalidJwtTokenAudience); + } } let now = SystemTime::now(); if let Some(exp) = payload.expiration { - ensure!(now < exp + CLOCK_SKEW_LEEWAY, "JWT token has expired"); + if now >= exp + CLOCK_SKEW_LEEWAY { + return Err(JwtError::JwtTokenHasExpired); + } } if let Some(nbf) = payload.not_before { - ensure!( - nbf < now + CLOCK_SKEW_LEEWAY, - "JWT token is not yet ready to use" - ); + if nbf >= now + CLOCK_SKEW_LEEWAY { + return Err(JwtError::JwtTokenNotYetReadyToUse); + } } Ok(ComputeCredentialKeys::JwtPayload(payloadb)) @@ -341,7 +341,7 @@ impl JwkCache { role_name: &RoleName, fetch: &F, jwt: &str, - ) -> Result { + ) -> Result { // try with just a read lock first let key = (endpoint.clone(), role_name.clone()); let entry = self.map.get(&key).as_deref().map(Arc::clone); @@ -357,19 +357,18 @@ impl JwkCache { } } -fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> 
anyhow::Result<()> { +fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> Result<(), JwtError> { use ecdsa::Signature; use signature::Verifier; match key.crv { jose_jwk::EcCurves::P256 => { - let pk = - p256::PublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid P256 key"))?; + let pk = p256::PublicKey::try_from(key).map_err(JwtError::InvalidP256Key)?; let key = p256::ecdsa::VerifyingKey::from(&pk); let sig = Signature::from_slice(sig)?; key.verify(data, &sig)?; } - key => bail!("unsupported ec key type {key:?}"), + key => return Err(JwtError::UnsupportedEcKeyType(key)), } Ok(()) @@ -380,14 +379,12 @@ fn verify_rsa_signature( sig: &[u8], key: &jose_jwk::Rsa, alg: &jose_jwa::Algorithm, -) -> anyhow::Result<()> { +) -> Result<(), JwtError> { use jose_jwa::{Algorithm, Signing}; - use rsa::{ - pkcs1v15::{Signature, VerifyingKey}, - RsaPublicKey, - }; + use rsa::pkcs1v15::{Signature, VerifyingKey}; + use rsa::RsaPublicKey; - let key = RsaPublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid RSA key"))?; + let key = RsaPublicKey::try_from(key).map_err(JwtError::InvalidRsaKey)?; match alg { Algorithm::Signing(Signing::Rs256) => { @@ -395,7 +392,7 @@ fn verify_rsa_signature( let sig = Signature::try_from(sig)?; key.verify(data, &sig)?; } - _ => bail!("invalid RSA signing algorithm"), + _ => return Err(JwtError::InvalidRsaSigningAlgorithm), }; Ok(()) @@ -561,13 +558,104 @@ impl Drop for JwkRenewalPermit<'_> { } } +#[derive(Error, Debug)] +#[non_exhaustive] +pub(crate) enum JwtError { + #[error("jwk not found")] + JwkNotFound, + + #[error("missing key id")] + MissingKeyId, + + #[error("Provided authentication token is not a valid JWT encoding")] + JwtEncoding(#[from] JwtEncodingError), + + #[error("invalid JWT token audience")] + InvalidJwtTokenAudience, + + #[error("JWT token has expired")] + JwtTokenHasExpired, + + #[error("JWT token is not yet ready to use")] + JwtTokenNotYetReadyToUse, + + #[error("invalid P256 key")] + 
InvalidP256Key(jose_jwk::crypto::Error), + + #[error("invalid RSA key")] + InvalidRsaKey(jose_jwk::crypto::Error), + + #[error("invalid RSA signing algorithm")] + InvalidRsaSigningAlgorithm, + + #[error("unsupported EC key type {0:?}")] + UnsupportedEcKeyType(jose_jwk::EcCurves), + + #[error("unsupported key type {0:?}")] + UnsupportedKeyType(KeyType), + + #[error("signature algorithm not supported")] + SignatureAlgorithmNotSupported, + + #[error("signature error: {0}")] + Signature(#[from] signature::Error), + + #[error("failed to fetch auth rules: {0}")] + FetchAuthRules(#[from] FetchAuthRulesError), +} + +impl From for JwtError { + fn from(err: base64::DecodeError) -> Self { + JwtEncodingError::Base64Decode(err).into() + } +} + +impl From for JwtError { + fn from(err: serde_json::Error) -> Self { + JwtEncodingError::SerdeJson(err).into() + } +} + +#[derive(Error, Debug)] +#[non_exhaustive] +pub enum JwtEncodingError { + #[error(transparent)] + Base64Decode(#[from] base64::DecodeError), + + #[error(transparent)] + SerdeJson(#[from] serde_json::Error), + + #[error("invalid compact form")] + InvalidCompactForm, +} + +#[allow(dead_code, reason = "Debug use only")] +#[derive(Debug)] +pub(crate) enum KeyType { + Ec(jose_jwk::EcCurves), + Rsa, + Oct, + Okp(jose_jwk::OkpCurves), + Unknown, +} + +impl From<&jose_jwk::Key> for KeyType { + fn from(key: &jose_jwk::Key) -> Self { + match key { + jose_jwk::Key::Ec(ec) => Self::Ec(ec.crv), + jose_jwk::Key::Rsa(_rsa) => Self::Rsa, + jose_jwk::Key::Oct(_oct) => Self::Oct, + jose_jwk::Key::Okp(okp) => Self::Okp(okp.crv), + _ => Self::Unknown, + } + } +} + #[cfg(test)] mod tests { - use crate::RoleName; - - use super::*; - - use std::{future::IntoFuture, net::SocketAddr, time::SystemTime}; + use std::future::IntoFuture; + use std::net::SocketAddr; + use std::time::SystemTime; use base64::URL_SAFE_NO_PAD; use bytes::Bytes; @@ -580,6 +668,9 @@ mod tests { use signature::Signer; use tokio::net::TcpListener; + use super::*; + use 
crate::RoleName; + fn new_ec_jwk(kid: String) -> (p256::SecretKey, jose_jwk::Jwk) { let sk = p256::SecretKey::random(&mut OsRng); let pk = sk.public_key().into(); @@ -758,7 +849,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL &self, _ctx: &RequestMonitoring, _endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, FetchAuthRulesError> { Ok(vec![ AuthRule { id: "foo".to_owned(), diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index 12451847b1..e3995ac6c0 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -1,20 +1,15 @@ use std::net::SocketAddr; -use anyhow::Context; use arc_swap::ArcSwapOption; -use crate::{ - compute::ConnCfg, - context::RequestMonitoring, - control_plane::{ - messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}, - NodeInfo, - }, - intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag}, - EndpointId, -}; - use super::jwt::{AuthRule, FetchAuthRules}; +use crate::auth::backend::jwt::FetchAuthRulesError; +use crate::compute::ConnCfg; +use crate::context::RequestMonitoring; +use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}; +use crate::control_plane::NodeInfo; +use crate::intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag}; +use crate::EndpointId; pub struct LocalBackend { pub(crate) node_info: NodeInfo, @@ -53,11 +48,11 @@ impl FetchAuthRules for StaticAuthRules { &self, _ctx: &RequestMonitoring, _endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, FetchAuthRulesError> { let mappings = JWKS_ROLE_MAP.load(); let role_mappings = mappings .as_deref() - .context("JWKs settings for this role were not configured")?; + .ok_or(FetchAuthRulesError::RoleJwksNotConfigured)?; let mut rules = vec![]; for setting in &role_mappings.jwks { rules.push(AuthRule { diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index c9aa5b7e61..a4db130b61 100644 --- 
a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -8,6 +8,7 @@ use std::net::IpAddr; use std::sync::Arc; use std::time::Duration; +pub use console_redirect::ConsoleRedirectBackend; pub(crate) use console_redirect::WebAuthError; use ipnet::{Ipv4Net, Ipv6Net}; use local::LocalBackend; @@ -16,29 +17,22 @@ use tokio_postgres::config::AuthKeys; use tracing::{info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; -use crate::auth::{validate_password_and_exchange, AuthError}; +use crate::auth::{self, validate_password_and_exchange, AuthError, ComputeUserInfoMaybeEndpoint}; use crate::cache::Cached; +use crate::config::AuthenticationConfig; use crate::context::RequestMonitoring; use crate::control_plane::errors::GetAuthInfoError; -use crate::control_plane::provider::{CachedRoleSecret, ControlPlaneBackend}; -use crate::control_plane::{AuthSecret, NodeInfo}; +use crate::control_plane::provider::{ + CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneBackend, +}; +use crate::control_plane::{self, Api, AuthSecret}; use crate::intern::EndpointIdInt; use crate::metrics::Metrics; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter, RateBucketInfo}; use crate::stream::Stream; -use crate::{ - auth::{self, ComputeUserInfoMaybeEndpoint}, - config::AuthenticationConfig, - control_plane::{ - self, - provider::{CachedAllowedIps, CachedNodeInfo}, - Api, - }, - stream, url, -}; -use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; +use crate::{scram, stream, EndpointCacheKey, EndpointId, RoleName}; /// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality pub enum MaybeOwned<'a, T> { @@ -65,11 +59,9 @@ impl std::ops::Deref for MaybeOwned<'_, T> { /// * However, when we substitute `T` with [`ComputeUserInfoMaybeEndpoint`], /// this helps us provide the credentials only to those auth /// 
backends which require them for the authentication process. -pub enum Backend<'a, T, D> { +pub enum Backend<'a, T> { /// Cloud API (V2). ControlPlane(MaybeOwned<'a, ControlPlaneBackend>, T), - /// Authentication via a web browser. - ConsoleRedirect(MaybeOwned<'a, url::ApiUrl>, D), /// Local proxy uses configured auth credentials and does not wake compute Local(MaybeOwned<'a, LocalBackend>), } @@ -90,7 +82,7 @@ impl Clone for Box { } } -impl std::fmt::Display for Backend<'_, (), ()> { +impl std::fmt::Display for Backend<'_, ()> { fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::ControlPlane(api, ()) => match &**api { @@ -106,46 +98,39 @@ impl std::fmt::Display for Backend<'_, (), ()> { #[cfg(test)] ControlPlaneBackend::Test(_) => fmt.debug_tuple("ControlPlane::Test").finish(), }, - Self::ConsoleRedirect(url, ()) => fmt - .debug_tuple("ConsoleRedirect") - .field(&url.as_str()) - .finish(), Self::Local(_) => fmt.debug_tuple("Local").finish(), } } } -impl Backend<'_, T, D> { +impl Backend<'_, T> { /// Very similar to [`std::option::Option::as_ref`]. /// This helps us pass structured config to async tasks. - pub(crate) fn as_ref(&self) -> Backend<'_, &T, &D> { + pub(crate) fn as_ref(&self) -> Backend<'_, &T> { match self { Self::ControlPlane(c, x) => Backend::ControlPlane(MaybeOwned::Borrowed(c), x), - Self::ConsoleRedirect(c, x) => Backend::ConsoleRedirect(MaybeOwned::Borrowed(c), x), Self::Local(l) => Backend::Local(MaybeOwned::Borrowed(l)), } } } -impl<'a, T, D> Backend<'a, T, D> { +impl<'a, T> Backend<'a, T> { /// Very similar to [`std::option::Option::map`]. /// Maps [`Backend`] to [`Backend`] by applying /// a function to a contained value. 
- pub(crate) fn map(self, f: impl FnOnce(T) -> R) -> Backend<'a, R, D> { + pub(crate) fn map(self, f: impl FnOnce(T) -> R) -> Backend<'a, R> { match self { Self::ControlPlane(c, x) => Backend::ControlPlane(c, f(x)), - Self::ConsoleRedirect(c, x) => Backend::ConsoleRedirect(c, x), Self::Local(l) => Backend::Local(l), } } } -impl<'a, T, D, E> Backend<'a, Result, D> { +impl<'a, T, E> Backend<'a, Result> { /// Very similar to [`std::option::Option::transpose`]. /// This is most useful for error handling. - pub(crate) fn transpose(self) -> Result, E> { + pub(crate) fn transpose(self) -> Result, E> { match self { Self::ControlPlane(c, x) => x.map(|x| Backend::ControlPlane(c, x)), - Self::ConsoleRedirect(c, x) => Ok(Backend::ConsoleRedirect(c, x)), Self::Local(l) => Ok(Backend::Local(l)), } } @@ -241,7 +226,6 @@ impl AuthenticationConfig { pub(crate) fn check_rate_limit( &self, ctx: &RequestMonitoring, - config: &AuthenticationConfig, secret: AuthSecret, endpoint: &EndpointId, is_cleartext: bool, @@ -265,7 +249,7 @@ impl AuthenticationConfig { let limit_not_exceeded = self.rate_limiter.check( ( endpoint_int, - MaskedIp::new(ctx.peer_addr(), config.rate_limit_ip_subnet), + MaskedIp::new(ctx.peer_addr(), self.rate_limit_ip_subnet), ), password_weight, ); @@ -339,7 +323,6 @@ async fn auth_quirks( let secret = if let Some(secret) = secret { config.check_rate_limit( ctx, - config, secret, &info.endpoint, unauthenticated_password.is_some() || allow_cleartext, @@ -415,12 +398,11 @@ async fn authenticate_with_secret( classic::authenticate(ctx, info, client, config, secret).await } -impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> { +impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { /// Get username from the credentials. 
pub(crate) fn get_user(&self) -> &str { match self { Self::ControlPlane(_, user_info) => &user_info.user, - Self::ConsoleRedirect(_, ()) => "web", Self::Local(_) => "local", } } @@ -434,7 +416,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> { allow_cleartext: bool, config: &'static AuthenticationConfig, endpoint_rate_limiter: Arc, - ) -> auth::Result> { + ) -> auth::Result> { let res = match self { Self::ControlPlane(api, user_info) => { info!( @@ -455,14 +437,6 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> { .await?; Backend::ControlPlane(api, credentials) } - // NOTE: this auth backend doesn't use client credentials. - Self::ConsoleRedirect(url, ()) => { - info!("performing web authentication"); - - let info = console_redirect::authenticate(ctx, config, &url, client).await?; - - Backend::ConsoleRedirect(url, info) - } Self::Local(_) => { return Err(auth::AuthError::bad_auth_method("invalid for local proxy")) } @@ -473,14 +447,13 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> { } } -impl Backend<'_, ComputeUserInfo, &()> { +impl Backend<'_, ComputeUserInfo> { pub(crate) async fn get_role_secret( &self, ctx: &RequestMonitoring, ) -> Result { match self { Self::ControlPlane(api, user_info) => api.get_role_secret(ctx, user_info).await, - Self::ConsoleRedirect(_, ()) => Ok(Cached::new_uncached(None)), Self::Local(_) => Ok(Cached::new_uncached(None)), } } @@ -493,21 +466,19 @@ impl Backend<'_, ComputeUserInfo, &()> { Self::ControlPlane(api, user_info) => { api.get_allowed_ips_and_secret(ctx, user_info).await } - Self::ConsoleRedirect(_, ()) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), } } } #[async_trait::async_trait] -impl ComputeConnectBackend for Backend<'_, ComputeCredentials, NodeInfo> { +impl ComputeConnectBackend for Backend<'_, ComputeCredentials> { async fn wake_compute( &self, ctx: &RequestMonitoring, ) -> Result { match self { 
Self::ControlPlane(api, creds) => api.wake_compute(ctx, &creds.info).await, - Self::ConsoleRedirect(_, info) => Ok(Cached::new_uncached(info.clone())), Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())), } } @@ -515,31 +486,6 @@ impl ComputeConnectBackend for Backend<'_, ComputeCredentials, NodeInfo> { fn get_keys(&self) -> &ComputeCredentialKeys { match self { Self::ControlPlane(_, creds) => &creds.keys, - Self::ConsoleRedirect(_, _) => &ComputeCredentialKeys::None, - Self::Local(_) => &ComputeCredentialKeys::None, - } - } -} - -#[async_trait::async_trait] -impl ComputeConnectBackend for Backend<'_, ComputeCredentials, &()> { - async fn wake_compute( - &self, - ctx: &RequestMonitoring, - ) -> Result { - match self { - Self::ControlPlane(api, creds) => api.wake_compute(ctx, &creds.info).await, - Self::ConsoleRedirect(_, ()) => { - unreachable!("web auth flow doesn't support waking the compute") - } - Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())), - } - } - - fn get_keys(&self) -> &ComputeCredentialKeys { - match self { - Self::ControlPlane(_, creds) => &creds.keys, - Self::ConsoleRedirect(_, ()) => &ComputeCredentialKeys::None, Self::Local(_) => &ComputeCredentialKeys::None, } } @@ -547,34 +493,32 @@ impl ComputeConnectBackend for Backend<'_, ComputeCredentials, &()> { #[cfg(test)] mod tests { - use std::{net::IpAddr, sync::Arc, time::Duration}; + use std::net::IpAddr; + use std::sync::Arc; + use std::time::Duration; use bytes::BytesMut; use fallible_iterator::FallibleIterator; use once_cell::sync::Lazy; - use postgres_protocol::{ - authentication::sasl::{ChannelBinding, ScramSha256}, - message::{backend::Message as PgMessage, frontend}, - }; + use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256}; + use postgres_protocol::message::backend::Message as PgMessage; + use postgres_protocol::message::frontend; use provider::AuthSecret; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; - use 
crate::{ - auth::{backend::MaskedIp, ComputeUserInfoMaybeEndpoint, IpPattern}, - config::AuthenticationConfig, - context::RequestMonitoring, - control_plane::{ - self, - provider::{self, CachedAllowedIps, CachedRoleSecret}, - CachedNodeInfo, - }, - proxy::NeonOptions, - rate_limiter::{EndpointRateLimiter, RateBucketInfo}, - scram::{threadpool::ThreadPool, ServerSecret}, - stream::{PqStream, Stream}, - }; - - use super::{auth_quirks, jwt::JwkCache, AuthRateLimiter}; + use super::jwt::JwkCache; + use super::{auth_quirks, AuthRateLimiter}; + use crate::auth::backend::MaskedIp; + use crate::auth::{ComputeUserInfoMaybeEndpoint, IpPattern}; + use crate::config::AuthenticationConfig; + use crate::context::RequestMonitoring; + use crate::control_plane::provider::{self, CachedAllowedIps, CachedRoleSecret}; + use crate::control_plane::{self, CachedNodeInfo}; + use crate::proxy::NeonOptions; + use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo}; + use crate::scram::threadpool::ThreadPool; + use crate::scram::ServerSecret; + use crate::stream::{PqStream, Stream}; struct Auth { ips: Vec, @@ -608,7 +552,8 @@ mod tests { &self, _ctx: &RequestMonitoring, _endpoint: crate::EndpointId, - ) -> anyhow::Result> { + ) -> Result, control_plane::errors::GetEndpointJwksError> + { unimplemented!() } diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index cba8601d14..fa6bc4c6f5 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -1,20 +1,22 @@ //! User credentials used in authentication. 
-use crate::{ - auth::password_hack::parse_endpoint_param, - context::RequestMonitoring, - error::{ReportableError, UserFacingError}, - metrics::{Metrics, SniKind}, - proxy::NeonOptions, - serverless::SERVERLESS_DRIVER_SNI, - EndpointId, RoleName, -}; +use std::collections::HashSet; +use std::net::IpAddr; +use std::str::FromStr; + use itertools::Itertools; use pq_proto::StartupMessageParams; -use std::{collections::HashSet, net::IpAddr, str::FromStr}; use thiserror::Error; use tracing::{info, warn}; +use crate::auth::password_hack::parse_endpoint_param; +use crate::context::RequestMonitoring; +use crate::error::{ReportableError, UserFacingError}; +use crate::metrics::{Metrics, SniKind}; +use crate::proxy::NeonOptions; +use crate::serverless::SERVERLESS_DRIVER_SNI; +use crate::{EndpointId, RoleName}; + #[derive(Debug, Error, PartialEq, Eq, Clone)] pub(crate) enum ComputeUserInfoParseError { #[error("Parameter '{0}' is missing in startup packet.")] @@ -249,10 +251,11 @@ fn project_name_valid(name: &str) -> bool { #[cfg(test)] mod tests { - use super::*; use serde_json::json; use ComputeUserInfoParseError::*; + use super::*; + #[test] fn parse_bare_minimum() -> anyhow::Result<()> { // According to postgresql, only `user` should be required. diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 9a5139dfb8..6294549ff6 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -1,21 +1,24 @@ //! Main authentication flow. 
-use super::{backend::ComputeCredentialKeys, AuthErrorImpl, PasswordHackPayload}; -use crate::{ - config::TlsServerEndPoint, - context::RequestMonitoring, - control_plane::AuthSecret, - intern::EndpointIdInt, - sasl, - scram::{self, threadpool::ThreadPool}, - stream::{PqStream, Stream}, -}; +use std::io; +use std::sync::Arc; + use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS}; use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be}; -use std::{io, sync::Arc}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; +use super::backend::ComputeCredentialKeys; +use super::{AuthError, PasswordHackPayload}; +use crate::config::TlsServerEndPoint; +use crate::context::RequestMonitoring; +use crate::control_plane::AuthSecret; +use crate::intern::EndpointIdInt; +use crate::sasl; +use crate::scram::threadpool::ThreadPool; +use crate::scram::{self}; +use crate::stream::{PqStream, Stream}; + /// Every authentication selector is supposed to implement this trait. pub(crate) trait AuthMethod { /// Any authentication selector should provide initial backend message @@ -114,14 +117,14 @@ impl AuthFlow<'_, S, PasswordHack> { let msg = self.stream.read_password_message().await?; let password = msg .strip_suffix(&[0]) - .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; + .ok_or(AuthError::MalformedPassword("missing terminator"))?; let payload = PasswordHackPayload::parse(password) // If we ended up here and the payload is malformed, it means that // the user neither enabled SNI nor resorted to any other method // for passing the project name we rely on. We should show them // the most helpful error message and point to the documentation. 
- .ok_or(AuthErrorImpl::MissingEndpointName)?; + .ok_or(AuthError::MissingEndpointName)?; Ok(payload) } @@ -133,7 +136,7 @@ impl AuthFlow<'_, S, CleartextPassword> { let msg = self.stream.read_password_message().await?; let password = msg .strip_suffix(&[0]) - .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; + .ok_or(AuthError::MalformedPassword("missing terminator"))?; let outcome = validate_password_and_exchange( &self.state.pool, @@ -163,7 +166,7 @@ impl AuthFlow<'_, S, Scram<'_>> { // Initial client message contains the chosen auth method's name. let msg = self.stream.read_password_message().await?; let sasl = sasl::FirstMessage::parse(&msg) - .ok_or(AuthErrorImpl::MalformedPassword("bad sasl message"))?; + .ok_or(AuthError::MalformedPassword("bad sasl message"))?; // Currently, the only supported SASL method is SCRAM. if !scram::METHODS.contains(&sasl.method) { diff --git a/proxy/src/auth/mod.rs b/proxy/src/auth/mod.rs index 0c8686add2..7a373dd825 100644 --- a/proxy/src/auth/mod.rs +++ b/proxy/src/auth/mod.rs @@ -14,22 +14,22 @@ pub(crate) use password_hack::parse_endpoint_param; use password_hack::PasswordHackPayload; mod flow; +use std::io; +use std::net::IpAddr; + pub(crate) use flow::*; +use thiserror::Error; use tokio::time::error::Elapsed; -use crate::{ - control_plane, - error::{ReportableError, UserFacingError}, -}; -use std::{io, net::IpAddr}; -use thiserror::Error; +use crate::control_plane; +use crate::error::{ReportableError, UserFacingError}; /// Convenience wrapper for the authentication error. pub(crate) type Result = std::result::Result; /// Common authentication error. 
#[derive(Debug, Error)] -pub(crate) enum AuthErrorImpl { +pub(crate) enum AuthError { #[error(transparent)] Web(#[from] backend::WebAuthError), @@ -78,80 +78,70 @@ pub(crate) enum AuthErrorImpl { ConfirmationTimeout(humantime::Duration), } -#[derive(Debug, Error)] -#[error(transparent)] -pub(crate) struct AuthError(Box); - impl AuthError { pub(crate) fn bad_auth_method(name: impl Into>) -> Self { - AuthErrorImpl::BadAuthMethod(name.into()).into() + AuthError::BadAuthMethod(name.into()) } pub(crate) fn auth_failed(user: impl Into>) -> Self { - AuthErrorImpl::AuthFailed(user.into()).into() + AuthError::AuthFailed(user.into()) } pub(crate) fn ip_address_not_allowed(ip: IpAddr) -> Self { - AuthErrorImpl::IpAddressNotAllowed(ip).into() + AuthError::IpAddressNotAllowed(ip) } pub(crate) fn too_many_connections() -> Self { - AuthErrorImpl::TooManyConnections.into() + AuthError::TooManyConnections } pub(crate) fn is_auth_failed(&self) -> bool { - matches!(self.0.as_ref(), AuthErrorImpl::AuthFailed(_)) + matches!(self, AuthError::AuthFailed(_)) } pub(crate) fn user_timeout(elapsed: Elapsed) -> Self { - AuthErrorImpl::UserTimeout(elapsed).into() + AuthError::UserTimeout(elapsed) } pub(crate) fn confirmation_timeout(timeout: humantime::Duration) -> Self { - AuthErrorImpl::ConfirmationTimeout(timeout).into() - } -} - -impl> From for AuthError { - fn from(e: E) -> Self { - Self(Box::new(e.into())) + AuthError::ConfirmationTimeout(timeout) } } impl UserFacingError for AuthError { fn to_string_client(&self) -> String { - match self.0.as_ref() { - AuthErrorImpl::Web(e) => e.to_string_client(), - AuthErrorImpl::GetAuthInfo(e) => e.to_string_client(), - AuthErrorImpl::Sasl(e) => e.to_string_client(), - AuthErrorImpl::AuthFailed(_) => self.to_string(), - AuthErrorImpl::BadAuthMethod(_) => self.to_string(), - AuthErrorImpl::MalformedPassword(_) => self.to_string(), - AuthErrorImpl::MissingEndpointName => self.to_string(), - AuthErrorImpl::Io(_) => "Internal error".to_string(), - 
AuthErrorImpl::IpAddressNotAllowed(_) => self.to_string(), - AuthErrorImpl::TooManyConnections => self.to_string(), - AuthErrorImpl::UserTimeout(_) => self.to_string(), - AuthErrorImpl::ConfirmationTimeout(_) => self.to_string(), + match self { + Self::Web(e) => e.to_string_client(), + Self::GetAuthInfo(e) => e.to_string_client(), + Self::Sasl(e) => e.to_string_client(), + Self::AuthFailed(_) => self.to_string(), + Self::BadAuthMethod(_) => self.to_string(), + Self::MalformedPassword(_) => self.to_string(), + Self::MissingEndpointName => self.to_string(), + Self::Io(_) => "Internal error".to_string(), + Self::IpAddressNotAllowed(_) => self.to_string(), + Self::TooManyConnections => self.to_string(), + Self::UserTimeout(_) => self.to_string(), + Self::ConfirmationTimeout(_) => self.to_string(), } } } impl ReportableError for AuthError { fn get_error_kind(&self) -> crate::error::ErrorKind { - match self.0.as_ref() { - AuthErrorImpl::Web(e) => e.get_error_kind(), - AuthErrorImpl::GetAuthInfo(e) => e.get_error_kind(), - AuthErrorImpl::Sasl(e) => e.get_error_kind(), - AuthErrorImpl::AuthFailed(_) => crate::error::ErrorKind::User, - AuthErrorImpl::BadAuthMethod(_) => crate::error::ErrorKind::User, - AuthErrorImpl::MalformedPassword(_) => crate::error::ErrorKind::User, - AuthErrorImpl::MissingEndpointName => crate::error::ErrorKind::User, - AuthErrorImpl::Io(_) => crate::error::ErrorKind::ClientDisconnect, - AuthErrorImpl::IpAddressNotAllowed(_) => crate::error::ErrorKind::User, - AuthErrorImpl::TooManyConnections => crate::error::ErrorKind::RateLimit, - AuthErrorImpl::UserTimeout(_) => crate::error::ErrorKind::User, - AuthErrorImpl::ConfirmationTimeout(_) => crate::error::ErrorKind::User, + match self { + Self::Web(e) => e.get_error_kind(), + Self::GetAuthInfo(e) => e.get_error_kind(), + Self::Sasl(e) => e.get_error_kind(), + Self::AuthFailed(_) => crate::error::ErrorKind::User, + Self::BadAuthMethod(_) => crate::error::ErrorKind::User, + Self::MalformedPassword(_) => 
crate::error::ErrorKind::User, + Self::MissingEndpointName => crate::error::ErrorKind::User, + Self::Io(_) => crate::error::ErrorKind::ClientDisconnect, + Self::IpAddressNotAllowed(_) => crate::error::ErrorKind::User, + Self::TooManyConnections => crate::error::ErrorKind::RateLimit, + Self::UserTimeout(_) => crate::error::ErrorKind::User, + Self::ConfirmationTimeout(_) => crate::error::ErrorKind::User, } } } diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index ae8a7f0841..e6bc369d9a 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -1,38 +1,43 @@ -use std::{net::SocketAddr, pin::pin, str::FromStr, sync::Arc, time::Duration}; +use std::net::SocketAddr; +use std::pin::pin; +use std::str::FromStr; +use std::sync::Arc; +use std::time::Duration; use anyhow::{bail, ensure, Context}; use camino::{Utf8Path, Utf8PathBuf}; use compute_api::spec::LocalProxySpec; use dashmap::DashMap; use futures::future::Either; -use proxy::{ - auth::backend::{ - jwt::JwkCache, - local::{LocalBackend, JWKS_ROLE_MAP}, - }, - cancellation::CancellationHandlerMain, - config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig}, - control_plane::{ - locks::ApiLocks, - messages::{EndpointJwksResponse, JwksSettings}, - }, - http::health_server::AppMetrics, - intern::RoleNameInt, - metrics::{Metrics, ThreadPoolMetrics}, - rate_limiter::{BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo}, - scram::threadpool::ThreadPool, - serverless::{self, cancel_set::CancelSet, GlobalConnPoolOptions}, - RoleName, +use proxy::auth::backend::jwt::JwkCache; +use proxy::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP}; +use proxy::auth::{self}; +use proxy::cancellation::CancellationHandlerMain; +use proxy::config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig}; +use proxy::control_plane::locks::ApiLocks; +use proxy::control_plane::messages::{EndpointJwksResponse, JwksSettings}; +use 
proxy::http::health_server::AppMetrics; +use proxy::intern::RoleNameInt; +use proxy::metrics::{Metrics, ThreadPoolMetrics}; +use proxy::rate_limiter::{ + BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, }; +use proxy::scram::threadpool::ThreadPool; +use proxy::serverless::cancel_set::CancelSet; +use proxy::serverless::{self, GlobalConnPoolOptions}; +use proxy::RoleName; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); use clap::Parser; -use tokio::{net::TcpListener, sync::Notify, task::JoinSet}; +use tokio::net::TcpListener; +use tokio::sync::Notify; +use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::{error, info, warn}; -use utils::{pid_file, project_build_tag, project_git_version, sentry_init::init_sentry}; +use utils::sentry_init::init_sentry; +use utils::{pid_file, project_build_tag, project_git_version}; #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; @@ -132,6 +137,7 @@ async fn main() -> anyhow::Result<()> { let args = LocalProxyCliArgs::parse(); let config = build_config(&args)?; + let auth_backend = build_auth_backend(&args)?; // before we bind to any ports, write the process ID to a file // so that compute-ctl can find our process later @@ -193,6 +199,7 @@ async fn main() -> anyhow::Result<()> { let task = serverless::task_main( config, + auth_backend, http_listener, shutdown.clone(), Arc::new(CancellationHandlerMain::new( @@ -257,9 +264,6 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig Ok(Box::leak(Box::new(ProxyConfig { tls_config: None, - auth_backend: proxy::auth::Backend::Local(proxy::auth::backend::MaybeOwned::Owned( - LocalBackend::new(args.compute), - )), metric_collection: None, allow_self_signed_compute: false, http_config, @@ -286,6 +290,17 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig }))) } +/// auth::Backend is created at proxy startup, and lives 
forever. +fn build_auth_backend( + args: &LocalProxyCliArgs, +) -> anyhow::Result<&'static auth::Backend<'static, ()>> { + let auth_backend = proxy::auth::Backend::Local(proxy::auth::backend::MaybeOwned::Owned( + LocalBackend::new(args.compute), + )); + + Ok(Box::leak(Box::new(auth_backend))) +} + async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc) { loop { rx.notified().await; diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 53f1586abe..00eb830d98 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -5,25 +5,23 @@ /// the outside. Similar to an ingress controller for HTTPS. use std::{net::SocketAddr, sync::Arc}; +use anyhow::{anyhow, bail, ensure, Context}; +use clap::Arg; use futures::future::Either; +use futures::TryFutureExt; use itertools::Itertools; use proxy::config::TlsServerEndPoint; use proxy::context::RequestMonitoring; use proxy::metrics::{Metrics, ThreadPoolMetrics}; use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; -use rustls::pki_types::PrivateKeyDer; -use tokio::net::TcpListener; - -use anyhow::{anyhow, bail, ensure, Context}; -use clap::Arg; -use futures::TryFutureExt; use proxy::stream::{PqStream, Stream}; - +use rustls::pki_types::PrivateKeyDer; use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::net::TcpListener; use tokio_util::sync::CancellationToken; -use utils::{project_git_version, sentry_init::init_sentry}; - use tracing::{error, info, Instrument}; +use utils::project_git_version; +use utils::sentry_init::init_sentry; project_git_version!(GIT_VERSION); diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 7488cce3c4..96a71e69c6 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -1,3 +1,8 @@ +use std::net::SocketAddr; +use std::pin::pin; +use std::sync::Arc; + +use anyhow::bail; use aws_config::environment::EnvironmentVariableCredentialsProvider; use 
aws_config::imds::credentials::ImdsCredentialsProvider; use aws_config::meta::credentials::CredentialsProviderChain; @@ -7,51 +12,34 @@ use aws_config::provider_config::ProviderConfig; use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; use aws_config::Region; use futures::future::Either; -use proxy::auth; use proxy::auth::backend::jwt::JwkCache; -use proxy::auth::backend::AuthRateLimiter; -use proxy::auth::backend::MaybeOwned; -use proxy::cancellation::CancelMap; -use proxy::cancellation::CancellationHandler; -use proxy::config::remote_storage_from_toml; -use proxy::config::AuthenticationConfig; -use proxy::config::CacheOptions; -use proxy::config::HttpConfig; -use proxy::config::ProjectInfoCacheOptions; -use proxy::config::ProxyProtocolV2; +use proxy::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; +use proxy::cancellation::{CancelMap, CancellationHandler}; +use proxy::config::{ + self, remote_storage_from_toml, AuthenticationConfig, CacheOptions, HttpConfig, + ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2, +}; use proxy::context::parquet::ParquetUploadArgs; -use proxy::control_plane; -use proxy::http; use proxy::http::health_server::AppMetrics; use proxy::metrics::Metrics; -use proxy::rate_limiter::EndpointRateLimiter; -use proxy::rate_limiter::LeakyBucketConfig; -use proxy::rate_limiter::RateBucketInfo; -use proxy::rate_limiter::WakeComputeRateLimiter; +use proxy::rate_limiter::{ + EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, WakeComputeRateLimiter, +}; use proxy::redis::cancellation_publisher::RedisPublisherClient; use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; -use proxy::redis::elasticache; -use proxy::redis::notifications; +use proxy::redis::{elasticache, notifications}; use proxy::scram::threadpool::ThreadPool; use proxy::serverless::cancel_set::CancelSet; use proxy::serverless::GlobalConnPoolOptions; -use proxy::usage_metrics; - -use anyhow::bail; 
-use proxy::config::{self, ProxyConfig}; -use proxy::serverless; +use proxy::{auth, control_plane, http, serverless, usage_metrics}; use remote_storage::RemoteStorageConfig; -use std::net::SocketAddr; -use std::pin::pin; -use std::sync::Arc; use tokio::net::TcpListener; use tokio::sync::Mutex; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; -use tracing::info; -use tracing::warn; -use tracing::Instrument; -use utils::{project_build_tag, project_git_version, sentry_init::init_sentry}; +use tracing::{info, warn, Instrument}; +use utils::sentry_init::init_sentry; +use utils::{project_build_tag, project_git_version}; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); @@ -311,8 +299,12 @@ async fn main() -> anyhow::Result<()> { let args = ProxyCliArgs::parse(); let config = build_config(&args)?; + let auth_backend = build_auth_backend(&args)?; - info!("Authentication backend: {}", config.auth_backend); + match auth_backend { + Either::Left(auth_backend) => info!("Authentication backend: {auth_backend}"), + Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"), + }; info!("Using region: {}", args.aws_region); let region_provider = @@ -459,24 +451,41 @@ async fn main() -> anyhow::Result<()> { // client facing tasks. 
these will exit on error or on cancellation // cancellation returns Ok(()) let mut client_tasks = JoinSet::new(); - if let Some(proxy_listener) = proxy_listener { - client_tasks.spawn(proxy::proxy::task_main( - config, - proxy_listener, - cancellation_token.clone(), - cancellation_handler.clone(), - endpoint_rate_limiter.clone(), - )); - } + match auth_backend { + Either::Left(auth_backend) => { + if let Some(proxy_listener) = proxy_listener { + client_tasks.spawn(proxy::proxy::task_main( + config, + auth_backend, + proxy_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + endpoint_rate_limiter.clone(), + )); + } - if let Some(serverless_listener) = serverless_listener { - client_tasks.spawn(serverless::task_main( - config, - serverless_listener, - cancellation_token.clone(), - cancellation_handler.clone(), - endpoint_rate_limiter.clone(), - )); + if let Some(serverless_listener) = serverless_listener { + client_tasks.spawn(serverless::task_main( + config, + auth_backend, + serverless_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + endpoint_rate_limiter.clone(), + )); + } + } + Either::Right(auth_backend) => { + if let Some(proxy_listener) = proxy_listener { + client_tasks.spawn(proxy::console_redirect_proxy::task_main( + config, + auth_backend, + proxy_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + )); + } + } } client_tasks.spawn(proxy::context::parquet::worker( @@ -506,7 +515,7 @@ async fn main() -> anyhow::Result<()> { )); } - if let auth::Backend::ControlPlane(api, _) = &config.auth_backend { + if let Either::Left(auth::Backend::ControlPlane(api, _)) = &auth_backend { if let proxy::control_plane::provider::ControlPlaneBackend::Management(api) = &**api { match (redis_notifications_client, regional_redis_client.clone()) { (None, None) => {} @@ -610,73 +619,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { bail!("dynamic rate limiter should be disabled"); 
} - let auth_backend = match &args.auth_backend { - AuthBackendType::Console => { - let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; - let project_info_cache_config: ProjectInfoCacheOptions = - args.project_info_cache.parse()?; - let endpoint_cache_config: config::EndpointCacheConfig = - args.endpoint_cache_config.parse()?; - - info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); - info!( - "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" - ); - info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); - let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new( - wake_compute_cache_config, - project_info_cache_config, - endpoint_cache_config, - ))); - - let config::ConcurrencyLockOptions { - shards, - limiter, - epoch, - timeout, - } = args.wake_compute_lock.parse()?; - info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); - let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( - "wake_compute_lock", - limiter, - shards, - timeout, - epoch, - &Metrics::get().wake_compute_lock, - )?)); - tokio::spawn(locks.garbage_collect_worker()); - - let url = args.auth_endpoint.parse()?; - let endpoint = http::Endpoint::new(url, http::new_client()); - - let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); - RateBucketInfo::validate(&mut wake_compute_rps_limit)?; - let wake_compute_endpoint_rate_limiter = - Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); - let api = control_plane::provider::neon::Api::new( - endpoint, - caches, - locks, - wake_compute_endpoint_rate_limiter, - ); - let api = control_plane::provider::ControlPlaneBackend::Management(api); - auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()) - } - - AuthBackendType::Web => { - let url = args.uri.parse()?; - auth::Backend::ConsoleRedirect(MaybeOwned::Owned(url), ()) - } - - #[cfg(feature = "testing")] - AuthBackendType::Postgres => 
{ - let url = args.auth_endpoint.parse()?; - let api = control_plane::provider::mock::Api::new(url, !args.is_private_access_proxy); - let api = control_plane::provider::ControlPlaneBackend::PostgresMock(api); - auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()) - } - }; - let config::ConcurrencyLockOptions { shards, limiter, @@ -726,9 +668,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { webauth_confirmation_timeout: args.webauth_confirmation_timeout, }; - let config = Box::leak(Box::new(ProxyConfig { + let config = ProxyConfig { tls_config, - auth_backend, metric_collection, allow_self_signed_compute: args.allow_self_signed_compute, http_config, @@ -741,13 +682,100 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { connect_to_compute_retry_config: config::RetryConfig::parse( &args.connect_to_compute_retry, )?, - })); + }; + + let config = Box::leak(Box::new(config)); tokio::spawn(config.connect_compute_locks.garbage_collect_worker()); Ok(config) } +/// auth::Backend is created at proxy startup, and lives forever. 
+fn build_auth_backend( + args: &ProxyCliArgs, +) -> anyhow::Result, &'static ConsoleRedirectBackend>> { + match &args.auth_backend { + AuthBackendType::Console => { + let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; + let project_info_cache_config: ProjectInfoCacheOptions = + args.project_info_cache.parse()?; + let endpoint_cache_config: config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; + + info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); + info!( + "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" + ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); + let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new( + wake_compute_cache_config, + project_info_cache_config, + endpoint_cache_config, + ))); + + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = args.wake_compute_lock.parse()?; + info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); + let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( + "wake_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().wake_compute_lock, + )?)); + tokio::spawn(locks.garbage_collect_worker()); + + let url = args.auth_endpoint.parse()?; + let endpoint = http::Endpoint::new(url, http::new_client()); + + let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); + RateBucketInfo::validate(&mut wake_compute_rps_limit)?; + let wake_compute_endpoint_rate_limiter = + Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); + let api = control_plane::provider::neon::Api::new( + endpoint, + caches, + locks, + wake_compute_endpoint_rate_limiter, + ); + let api = control_plane::provider::ControlPlaneBackend::Management(api); + let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); + + let config = Box::leak(Box::new(auth_backend)); + + Ok(Either::Left(config)) + } + + 
#[cfg(feature = "testing")] + AuthBackendType::Postgres => { + let url = args.auth_endpoint.parse()?; + let api = control_plane::provider::mock::Api::new(url, !args.is_private_access_proxy); + let api = control_plane::provider::ControlPlaneBackend::PostgresMock(api); + + let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); + + let config = Box::leak(Box::new(auth_backend)); + + Ok(Either::Left(config)) + } + + AuthBackendType::Web => { + let url = args.uri.parse()?; + let backend = ConsoleRedirectBackend::new(url); + + let config = Box::leak(Box::new(backend)); + + Ok(Either::Right(config)) + } + } +} + #[cfg(test)] mod tests { use std::time::Duration; diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 27121ce89e..82f3247fa7 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -1,31 +1,23 @@ -use std::{ - convert::Infallible, - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, - }, - time::Duration, -}; +use std::convert::Infallible; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; +use std::time::Duration; use dashmap::DashSet; -use redis::{ - streams::{StreamReadOptions, StreamReadReply}, - AsyncCommands, FromRedisValue, Value, -}; +use redis::streams::{StreamReadOptions, StreamReadReply}; +use redis::{AsyncCommands, FromRedisValue, Value}; use serde::Deserialize; use tokio::sync::Mutex; use tokio_util::sync::CancellationToken; use tracing::info; -use crate::{ - config::EndpointCacheConfig, - context::RequestMonitoring, - intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}, - metrics::{Metrics, RedisErrors, RedisEventsCount}, - rate_limiter::GlobalRateLimiter, - redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider, - EndpointId, -}; +use crate::config::EndpointCacheConfig; +use crate::context::RequestMonitoring; +use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; +use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; +use 
crate::rate_limiter::GlobalRateLimiter; +use crate::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; +use crate::EndpointId; #[derive(Deserialize, Debug, Clone)] pub(crate) struct ControlPlaneEventKey { diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index b92cedb043..31d1dc96e7 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -1,9 +1,8 @@ -use std::{ - collections::HashSet, - convert::Infallible, - sync::{atomic::AtomicU64, Arc}, - time::Duration, -}; +use std::collections::HashSet; +use std::convert::Infallible; +use std::sync::atomic::AtomicU64; +use std::sync::Arc; +use std::time::Duration; use async_trait::async_trait; use dashmap::DashMap; @@ -13,15 +12,12 @@ use tokio::sync::Mutex; use tokio::time::Instant; use tracing::{debug, info}; -use crate::{ - auth::IpPattern, - config::ProjectInfoCacheOptions, - control_plane::AuthSecret, - intern::{EndpointIdInt, ProjectIdInt, RoleNameInt}, - EndpointId, RoleName, -}; - use super::{Cache, Cached}; +use crate::auth::IpPattern; +use crate::config::ProjectInfoCacheOptions; +use crate::control_plane::AuthSecret; +use crate::intern::{EndpointIdInt, ProjectIdInt, RoleNameInt}; +use crate::{EndpointId, RoleName}; #[async_trait] pub(crate) trait ProjectInfoCache { @@ -371,7 +367,8 @@ impl Cache for ProjectInfoCacheImpl { #[cfg(test)] mod tests { use super::*; - use crate::{scram::ServerSecret, ProjectId}; + use crate::scram::ServerSecret; + use crate::ProjectId; #[tokio::test] async fn test_project_info_cache_settings() { diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs index 5b08d74696..06eaeb9a30 100644 --- a/proxy/src/cache/timed_lru.rs +++ b/proxy/src/cache/timed_lru.rs @@ -1,9 +1,6 @@ -use std::{ - borrow::Borrow, - hash::Hash, - time::{Duration, Instant}, -}; -use tracing::debug; +use std::borrow::Borrow; +use std::hash::Hash; +use std::time::{Duration, Instant}; // This seems to make more sense 
than `lru` or `cached`: // @@ -15,8 +12,10 @@ use tracing::debug; // // On the other hand, `hashlink` has good download stats and appears to be maintained. use hashlink::{linked_hash_map::RawEntryMut, LruCache}; +use tracing::debug; -use super::{common::Cached, timed_lru, Cache}; +use super::common::Cached; +use super::{timed_lru, Cache}; /// An implementation of timed LRU cache with fixed capacity. /// Key properties: diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 71a2a16af8..db0970adcb 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,6 +1,8 @@ +use std::net::SocketAddr; +use std::sync::Arc; + use dashmap::DashMap; use pq_proto::CancelKeyData; -use std::{net::SocketAddr, sync::Arc}; use thiserror::Error; use tokio::net::TcpStream; use tokio::sync::Mutex; @@ -8,12 +10,10 @@ use tokio_postgres::{CancelToken, NoTls}; use tracing::info; use uuid::Uuid; -use crate::{ - error::ReportableError, - metrics::{CancellationRequest, CancellationSource, Metrics}, - redis::cancellation_publisher::{ - CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, - }, +use crate::error::ReportableError; +use crate::metrics::{CancellationRequest, CancellationSource, Metrics}; +use crate::redis::cancellation_publisher::{ + CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, }; pub type CancelMap = Arc>>; diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 006804fcd4..212e82497f 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,25 +1,31 @@ -use crate::{ - auth::parse_endpoint_param, - cancellation::CancelClosure, - context::RequestMonitoring, - control_plane::{errors::WakeComputeError, messages::MetricsAuxInfo, provider::ApiLockError}, - error::{ReportableError, UserFacingError}, - metrics::{Metrics, NumDbConnectionsGuard}, - proxy::neon_option, - Host, -}; +use std::io; +use std::net::SocketAddr; +use std::sync::Arc; +use std::time::Duration; + use futures::{FutureExt, 
TryFutureExt}; use itertools::Itertools; use once_cell::sync::OnceCell; use pq_proto::StartupMessageParams; -use rustls::{client::danger::ServerCertVerifier, pki_types::InvalidDnsNameError}; -use std::{io, net::SocketAddr, sync::Arc, time::Duration}; +use rustls::client::danger::ServerCertVerifier; +use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; use tokio::net::TcpStream; use tokio_postgres::tls::MakeTlsConnect; use tokio_postgres_rustls::MakeRustlsConnect; use tracing::{error, info, warn}; +use crate::auth::parse_endpoint_param; +use crate::cancellation::CancelClosure; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::WakeComputeError; +use crate::control_plane::messages::MetricsAuxInfo; +use crate::control_plane::provider::ApiLockError; +use crate::error::{ReportableError, UserFacingError}; +use crate::metrics::{Metrics, NumDbConnectionsGuard}; +use crate::proxy::neon_option; +use crate::Host; + pub const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; #[derive(Debug, Error)] diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 55d0b6374c..2ec8c7adda 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,35 +1,29 @@ -use crate::{ - auth::{ - self, - backend::{jwt::JwkCache, AuthRateLimiter}, - }, - control_plane::locks::ApiLocks, - rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig}, - scram::threadpool::ThreadPool, - serverless::{cancel_set::CancelSet, GlobalConnPoolOptions}, - Host, -}; +use std::collections::{HashMap, HashSet}; +use std::str::FromStr; +use std::sync::Arc; +use std::time::Duration; + use anyhow::{bail, ensure, Context, Ok}; use clap::ValueEnum; use itertools::Itertools; use remote_storage::RemoteStorageConfig; -use rustls::{ - crypto::ring::sign, - pki_types::{CertificateDer, PrivateKeyDer}, -}; +use rustls::crypto::ring::sign; +use rustls::pki_types::{CertificateDer, PrivateKeyDer}; use sha2::{Digest, Sha256}; -use std::{ - collections::{HashMap, 
HashSet}, - str::FromStr, - sync::Arc, - time::Duration, -}; use tracing::{error, info}; use x509_parser::oid_registry; +use crate::auth::backend::jwt::JwkCache; +use crate::auth::backend::AuthRateLimiter; +use crate::control_plane::locks::ApiLocks; +use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig}; +use crate::scram::threadpool::ThreadPool; +use crate::serverless::cancel_set::CancelSet; +use crate::serverless::GlobalConnPoolOptions; +use crate::Host; + pub struct ProxyConfig { pub tls_config: Option, - pub auth_backend: auth::Backend<'static, (), ()>, pub metric_collection: Option, pub allow_self_signed_compute: bool, pub http_config: HttpConfig, @@ -696,9 +690,8 @@ impl FromStr for ConcurrencyLockOptions { #[cfg(test)] mod tests { - use crate::rate_limiter::Aimd; - use super::*; + use crate::rate_limiter::Aimd; #[test] fn test_parse_cache_options() -> anyhow::Result<()> { diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs new file mode 100644 index 0000000000..81d1d70958 --- /dev/null +++ b/proxy/src/console_redirect_proxy.rs @@ -0,0 +1,214 @@ +use std::sync::Arc; + +use futures::TryFutureExt; +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; +use tokio_util::sync::CancellationToken; +use tracing::{error, info, Instrument}; + +use crate::auth::backend::ConsoleRedirectBackend; +use crate::cancellation::{CancellationHandlerMain, CancellationHandlerMainInternal}; +use crate::config::{ProxyConfig, ProxyProtocolV2}; +use crate::context::RequestMonitoring; +use crate::error::ReportableError; +use crate::metrics::{Metrics, NumClientConnectionsGuard}; +use crate::protocol2::read_proxy_protocol; +use crate::proxy::connect_compute::{connect_to_compute, TcpMechanism}; +use crate::proxy::handshake::{handshake, HandshakeData}; +use crate::proxy::passthrough::ProxyPassthrough; +use crate::proxy::{ + prepare_client_connection, run_until_cancelled, ClientRequestError, ErrorSource, +}; + +pub async fn 
task_main( + config: &'static ProxyConfig, + backend: &'static ConsoleRedirectBackend, + listener: tokio::net::TcpListener, + cancellation_token: CancellationToken, + cancellation_handler: Arc, +) -> anyhow::Result<()> { + scopeguard::defer! { + info!("proxy has shut down"); + } + + // When set for the server socket, the keepalive setting + // will be inherited by all accepted client sockets. + socket2::SockRef::from(&listener).set_keepalive(true)?; + + let connections = tokio_util::task::task_tracker::TaskTracker::new(); + + while let Some(accept_result) = + run_until_cancelled(listener.accept(), &cancellation_token).await + { + let (socket, peer_addr) = accept_result?; + + let conn_gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Tcp); + + let session_id = uuid::Uuid::new_v4(); + let cancellation_handler = Arc::clone(&cancellation_handler); + + tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection"); + + connections.spawn(async move { + let (socket, peer_addr) = match read_proxy_protocol(socket).await { + Err(e) => { + error!("per-client task finished with an error: {e:#}"); + return; + } + Ok((_socket, None)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => { + error!("missing required proxy protocol header"); + return; + } + Ok((_socket, Some(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => { + error!("proxy protocol header not supported"); + return; + } + Ok((socket, Some(addr))) => (socket, addr.ip()), + Ok((socket, None)) => (socket, peer_addr.ip()), + }; + + match socket.inner.set_nodelay(true) { + Ok(()) => {} + Err(e) => { + error!("per-client task finished with an error: failed to set socket option: {e:#}"); + return; + } + }; + + let ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Tcp, + &config.region, + ); + let span = ctx.span(); + + let startup = Box::pin( + handle_client( + config, + backend, + &ctx, + cancellation_handler, + 
socket, + conn_gauge, + ) + .instrument(span.clone()), + ); + let res = startup.await; + + match res { + Err(e) => { + // todo: log and push to ctx the error kind + ctx.set_error_kind(e.get_error_kind()); + error!(parent: &span, "per-client task finished with an error: {e:#}"); + } + Ok(None) => { + ctx.set_success(); + } + Ok(Some(p)) => { + ctx.set_success(); + ctx.log_connect(); + match p.proxy_pass().instrument(span.clone()).await { + Ok(()) => {} + Err(ErrorSource::Client(e)) => { + error!(parent: &span, "per-client task finished with an IO error from the client: {e:#}"); + } + Err(ErrorSource::Compute(e)) => { + error!(parent: &span, "per-client task finished with an IO error from the compute: {e:#}"); + } + } + } + } + }); + } + + connections.close(); + drop(listener); + + // Drain connections + connections.wait().await; + + Ok(()) +} + +pub(crate) async fn handle_client( + config: &'static ProxyConfig, + backend: &'static ConsoleRedirectBackend, + ctx: &RequestMonitoring, + cancellation_handler: Arc, + stream: S, + conn_gauge: NumClientConnectionsGuard<'static>, +) -> Result>, ClientRequestError> { + info!( + protocol = %ctx.protocol(), + "handling interactive connection from client" + ); + + let metrics = &Metrics::get().proxy; + let proto = ctx.protocol(); + let request_gauge = metrics.connection_requests.guard(proto); + + let tls = config.tls_config.as_ref(); + + let record_handshake_error = !ctx.has_private_peer_addr(); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); + let do_handshake = handshake(ctx, stream, tls, record_handshake_error); + let (mut stream, params) = + match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { + HandshakeData::Startup(stream, params) => (stream, params), + HandshakeData::Cancel(cancel_key_data) => { + return Ok(cancellation_handler + .cancel_session(cancel_key_data, ctx.session_id()) + .await + .map(|()| None)?) 
+ } + }; + drop(pause); + + ctx.set_db_options(params.clone()); + + let user_info = match backend + .authenticate(ctx, &config.authentication_config, &mut stream) + .await + { + Ok(auth_result) => auth_result, + Err(e) => { + return stream.throw_error(e).await?; + } + }; + + let mut node = connect_to_compute( + ctx, + &TcpMechanism { + params: ¶ms, + locks: &config.connect_compute_locks, + }, + &user_info, + config.allow_self_signed_compute, + config.wake_compute_retry_config, + config.connect_to_compute_retry_config, + ) + .or_else(|e| stream.throw_error(e)) + .await?; + + let session = cancellation_handler.get_session(); + prepare_client_connection(&node, &session, &mut stream).await?; + + // Before proxy passing, forward to compute whatever data is left in the + // PqStream input buffer. Normally there is none, but our serverless npm + // driver in pipeline mode sends startup, password and first query + // immediately after opening the connection. + let (stream, read_buf) = stream.into_inner(); + node.stream.write_all(&read_buf).await?; + + Ok(Some(ProxyPassthrough { + client: stream, + aux: node.aux.clone(), + compute: node, + _req: request_gauge, + _conn: conn_gauge, + _cancel: session, + })) +} diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 7fb4e7c698..e2d2c1b766 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -1,24 +1,25 @@ //! 
Connection request monitoring contexts +use std::net::IpAddr; + use chrono::Utc; use once_cell::sync::OnceCell; use pq_proto::StartupMessageParams; use smol_str::SmolStr; -use std::net::IpAddr; use tokio::sync::mpsc; -use tracing::{debug, field::display, info, info_span, Span}; +use tracing::field::display; +use tracing::{debug, info, info_span, Span}; use try_lock::TryLock; use uuid::Uuid; -use crate::{ - control_plane::messages::{ColdStartInfo, MetricsAuxInfo}, - error::ErrorKind, - intern::{BranchIdInt, ProjectIdInt}, - metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting}, - DbName, EndpointId, RoleName, -}; - use self::parquet::RequestData; +use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; +use crate::error::ErrorKind; +use crate::intern::{BranchIdInt, ProjectIdInt}; +use crate::metrics::{ + ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting, +}; +use crate::{DbName, EndpointId, RoleName}; pub mod parquet; diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 9f6f83022e..b0ad0e4566 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -1,29 +1,28 @@ -use std::{sync::Arc, time::SystemTime}; +use std::sync::Arc; +use std::time::SystemTime; use anyhow::Context; -use bytes::{buf::Writer, BufMut, BytesMut}; +use bytes::buf::Writer; +use bytes::{BufMut, BytesMut}; use chrono::{Datelike, Timelike}; use futures::{Stream, StreamExt}; -use parquet::{ - basic::Compression, - file::{ - metadata::RowGroupMetaDataPtr, - properties::{WriterProperties, WriterPropertiesPtr, DEFAULT_PAGE_SIZE}, - writer::SerializedFileWriter, - }, - record::RecordWriter, -}; +use parquet::basic::Compression; +use parquet::file::metadata::RowGroupMetaDataPtr; +use parquet::file::properties::{WriterProperties, WriterPropertiesPtr, DEFAULT_PAGE_SIZE}; +use parquet::file::writer::SerializedFileWriter; +use parquet::record::RecordWriter; use 
pq_proto::StartupMessageParams; use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel}; use serde::ser::SerializeMap; -use tokio::{sync::mpsc, time}; +use tokio::sync::mpsc; +use tokio::time; use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; use utils::backoff; -use crate::{config::remote_storage_from_toml, context::LOG_CHAN_DISCONNECT}; - use super::{RequestMonitoringInner, LOG_CHAN}; +use crate::config::remote_storage_from_toml; +use crate::context::LOG_CHAN_DISCONNECT; #[derive(clap::Args, Clone, Debug)] pub struct ParquetUploadArgs { @@ -407,26 +406,26 @@ async fn upload_parquet( #[cfg(test)] mod tests { - use std::{net::Ipv4Addr, num::NonZeroUsize, sync::Arc}; + use std::net::Ipv4Addr; + use std::num::NonZeroUsize; + use std::sync::Arc; use camino::Utf8Path; use clap::Parser; use futures::{Stream, StreamExt}; use itertools::Itertools; - use parquet::{ - basic::{Compression, ZstdLevel}, - file::{ - properties::{WriterProperties, DEFAULT_PAGE_SIZE}, - reader::FileReader, - serialized_reader::SerializedFileReader, - }, - }; - use rand::{rngs::StdRng, Rng, SeedableRng}; + use parquet::basic::{Compression, ZstdLevel}; + use parquet::file::properties::{WriterProperties, DEFAULT_PAGE_SIZE}; + use parquet::file::reader::FileReader; + use parquet::file::serialized_reader::SerializedFileReader; + use rand::rngs::StdRng; + use rand::{Rng, SeedableRng}; use remote_storage::{ GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, }; - use tokio::{sync::mpsc, time}; + use tokio::sync::mpsc; + use tokio::time; use walkdir::WalkDir; use super::{worker_inner, ParquetConfig, ParquetUploadArgs, RequestData}; diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index 960bb5bc21..dae23f7c53 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -1,9 
+1,9 @@ -use measured::FixedCardinalityLabel; -use serde::{Deserialize, Serialize}; use std::fmt::{self, Display}; -use crate::auth::IpPattern; +use measured::FixedCardinalityLabel; +use serde::{Deserialize, Serialize}; +use crate::auth::IpPattern; use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; use crate::proxy::retry::CouldRetry; @@ -362,9 +362,10 @@ pub struct JwksSettings { #[cfg(test)] mod tests { - use super::*; use serde_json::json; + use super::*; + fn dummy_aux() -> serde_json::Value { json!({ "endpoint_id": "endpoint", diff --git a/proxy/src/control_plane/mgmt.rs b/proxy/src/control_plane/mgmt.rs index 2c4b5a9b94..5ac3acd28a 100644 --- a/proxy/src/control_plane/mgmt.rs +++ b/proxy/src/control_plane/mgmt.rs @@ -1,16 +1,16 @@ -use crate::{ - control_plane::messages::{DatabaseInfo, KickSession}, - waiters::{self, Waiter, Waiters}, -}; +use std::convert::Infallible; + use anyhow::Context; use once_cell::sync::Lazy; use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError}; use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; -use std::convert::Infallible; use tokio::net::{TcpListener, TcpStream}; use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, Instrument}; +use crate::control_plane::messages::{DatabaseInfo, KickSession}; +use crate::waiters::{self, Waiter, Waiters}; + static CPLANE_WAITERS: Lazy> = Lazy::new(Default::default); /// Give caller an opportunity to wait for the cloud's reply. diff --git a/proxy/src/control_plane/provider/mock.rs b/proxy/src/control_plane/provider/mock.rs index ea2eb79e2a..fb061376e7 100644 --- a/proxy/src/control_plane/provider/mock.rs +++ b/proxy/src/control_plane/provider/mock.rs @@ -1,27 +1,29 @@ //! Mock console backend which relies on a user-provided postgres instance. 
-use super::{ - errors::{ApiError, GetAuthInfoError, WakeComputeError}, - AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo, -}; -use crate::{ - auth::backend::jwt::AuthRule, context::RequestMonitoring, intern::RoleNameInt, RoleName, -}; -use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl}; -use crate::{auth::IpPattern, cache::Cached}; -use crate::{ - control_plane::{ - messages::MetricsAuxInfo, - provider::{CachedAllowedIps, CachedRoleSecret}, - }, - BranchId, EndpointId, ProjectId, -}; +use std::str::FromStr; +use std::sync::Arc; + use futures::TryFutureExt; -use std::{str::FromStr, sync::Arc}; use thiserror::Error; -use tokio_postgres::{config::SslMode, Client}; +use tokio_postgres::config::SslMode; +use tokio_postgres::Client; use tracing::{error, info, info_span, warn, Instrument}; +use super::errors::{ApiError, GetAuthInfoError, WakeComputeError}; +use super::{AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo}; +use crate::auth::backend::jwt::AuthRule; +use crate::auth::backend::ComputeUserInfo; +use crate::auth::IpPattern; +use crate::cache::Cached; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::GetEndpointJwksError; +use crate::control_plane::messages::MetricsAuxInfo; +use crate::control_plane::provider::{CachedAllowedIps, CachedRoleSecret}; +use crate::error::io_error; +use crate::intern::RoleNameInt; +use crate::url::ApiUrl; +use crate::{compute, scram, BranchId, EndpointId, ProjectId, RoleName}; + #[derive(Debug, Error)] enum MockApiError { #[error("Failed to read password: {0}")] @@ -120,7 +122,10 @@ impl Api { }) } - async fn do_get_endpoint_jwks(&self, endpoint: EndpointId) -> anyhow::Result> { + async fn do_get_endpoint_jwks( + &self, + endpoint: EndpointId, + ) -> Result, GetEndpointJwksError> { let (client, connection) = tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?; @@ -224,7 +229,7 @@ impl super::Api for Api { &self, _ctx: &RequestMonitoring, endpoint: 
EndpointId, - ) -> anyhow::Result> { + ) -> Result, GetEndpointJwksError> { self.do_get_endpoint_jwks(endpoint).await } diff --git a/proxy/src/control_plane/provider/mod.rs b/proxy/src/control_plane/provider/mod.rs index 01d93dee43..a4a330cd5f 100644 --- a/proxy/src/control_plane/provider/mod.rs +++ b/proxy/src/control_plane/provider/mod.rs @@ -2,39 +2,36 @@ pub mod mock; pub mod neon; -use super::messages::{ControlPlaneError, MetricsAuxInfo}; -use crate::{ - auth::{ - backend::{ - jwt::{AuthRule, FetchAuthRules}, - ComputeCredentialKeys, ComputeUserInfo, - }, - IpPattern, - }, - cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru}, - compute, - config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}, - context::RequestMonitoring, - error::ReportableError, - intern::ProjectIdInt, - metrics::ApiLockMetrics, - rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token}, - scram, EndpointCacheKey, EndpointId, -}; +use std::hash::Hash; +use std::sync::Arc; +use std::time::Duration; + use dashmap::DashMap; -use std::{hash::Hash, sync::Arc, time::Duration}; use tokio::time::Instant; use tracing::info; +use super::messages::{ControlPlaneError, MetricsAuxInfo}; +use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}; +use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; +use crate::auth::IpPattern; +use crate::cache::endpoints::EndpointsCache; +use crate::cache::project_info::ProjectInfoCacheImpl; +use crate::cache::{Cached, TimedLru}; +use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}; +use crate::context::RequestMonitoring; +use crate::error::ReportableError; +use crate::intern::ProjectIdInt; +use crate::metrics::ApiLockMetrics; +use crate::rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token}; +use crate::{compute, scram, EndpointCacheKey, EndpointId}; + pub(crate) mod errors { - use crate::{ - control_plane::messages::{self, 
ControlPlaneError, Reason}, - error::{io_error, ErrorKind, ReportableError, UserFacingError}, - proxy::retry::CouldRetry, - }; use thiserror::Error; use super::ApiLockError; + use crate::control_plane::messages::{self, ControlPlaneError, Reason}; + use crate::error::{io_error, ErrorKind, ReportableError, UserFacingError}; + use crate::proxy::retry::CouldRetry; /// A go-to error message which doesn't leak any detail. pub(crate) const REQUEST_FAILED: &str = "Console request failed"; @@ -44,7 +41,7 @@ pub(crate) mod errors { pub(crate) enum ApiError { /// Error returned by the console itself. #[error("{REQUEST_FAILED} with {0}")] - ControlPlane(ControlPlaneError), + ControlPlane(Box), /// Various IO errors like broken pipe or malformed payload. #[error("{REQUEST_FAILED}: {0}")] @@ -81,16 +78,16 @@ pub(crate) mod errors { Reason::EndpointNotFound => ErrorKind::User, Reason::BranchNotFound => ErrorKind::User, Reason::RateLimitExceeded => ErrorKind::ServiceRateLimit, - Reason::NonDefaultBranchComputeTimeExceeded => ErrorKind::User, - Reason::ActiveTimeQuotaExceeded => ErrorKind::User, - Reason::ComputeTimeQuotaExceeded => ErrorKind::User, - Reason::WrittenDataQuotaExceeded => ErrorKind::User, - Reason::DataTransferQuotaExceeded => ErrorKind::User, - Reason::LogicalSizeQuotaExceeded => ErrorKind::User, + Reason::NonDefaultBranchComputeTimeExceeded => ErrorKind::Quota, + Reason::ActiveTimeQuotaExceeded => ErrorKind::Quota, + Reason::ComputeTimeQuotaExceeded => ErrorKind::Quota, + Reason::WrittenDataQuotaExceeded => ErrorKind::Quota, + Reason::DataTransferQuotaExceeded => ErrorKind::Quota, + Reason::LogicalSizeQuotaExceeded => ErrorKind::Quota, Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane, Reason::LockAlreadyTaken => ErrorKind::ControlPlane, Reason::RunningOperations => ErrorKind::ControlPlane, - Reason::Unknown => match &e { + Reason::Unknown => match &**e { ControlPlaneError { http_status_code: http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE, 
@@ -103,7 +100,7 @@ pub(crate) mod errors { } if error .contains("compute time quota of non-primary branches is exceeded") => { - crate::error::ErrorKind::User + crate::error::ErrorKind::Quota } ControlPlaneError { http_status_code: http::StatusCode::LOCKED, @@ -112,7 +109,7 @@ pub(crate) mod errors { } if error.contains("quota exceeded") || error.contains("the limit for current plan reached") => { - crate::error::ErrorKind::User + crate::error::ErrorKind::Quota } ControlPlaneError { http_status_code: http::StatusCode::TOO_MANY_REQUESTS, @@ -246,6 +243,33 @@ pub(crate) mod errors { } } } + + #[derive(Debug, Error)] + pub enum GetEndpointJwksError { + #[error("endpoint not found")] + EndpointNotFound, + + #[error("failed to build control plane request: {0}")] + RequestBuild(#[source] reqwest::Error), + + #[error("failed to send control plane request: {0}")] + RequestExecute(#[source] reqwest_middleware::Error), + + #[error(transparent)] + ControlPlane(#[from] ApiError), + + #[cfg(any(test, feature = "testing"))] + #[error(transparent)] + TokioPostgres(#[from] tokio_postgres::Error), + + #[cfg(any(test, feature = "testing"))] + #[error(transparent)] + ParseUrl(#[from] url::ParseError), + + #[cfg(any(test, feature = "testing"))] + #[error(transparent)] + TaskJoin(#[from] tokio::task::JoinError), + } } /// Auth secret which is managed by the cloud. @@ -342,7 +366,7 @@ pub(crate) trait Api { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result>; + ) -> Result, errors::GetEndpointJwksError>; /// Wake up the compute node and return the corresponding connection info. 
async fn wake_compute( @@ -401,7 +425,7 @@ impl Api for ControlPlaneBackend { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, errors::GetEndpointJwksError> { match self { Self::Management(api) => api.get_endpoint_jwks(ctx, endpoint).await, #[cfg(any(test, feature = "testing"))] @@ -583,7 +607,9 @@ impl FetchAuthRules for ControlPlaneBackend { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result> { - self.get_endpoint_jwks(ctx, endpoint).await + ) -> Result, FetchAuthRulesError> { + self.get_endpoint_jwks(ctx, endpoint) + .await + .map_err(FetchAuthRulesError::GetEndpointJwks) } } diff --git a/proxy/src/control_plane/provider/neon.rs b/proxy/src/control_plane/provider/neon.rs index e5f8b5c741..5d0692c7ca 100644 --- a/proxy/src/control_plane/provider/neon.rs +++ b/proxy/src/control_plane/provider/neon.rs @@ -1,28 +1,30 @@ //! Production console backend. +use std::sync::Arc; +use std::time::Duration; + +use ::http::header::AUTHORIZATION; +use ::http::HeaderName; +use futures::TryFutureExt; +use tokio::time::Instant; +use tokio_postgres::config::SslMode; +use tracing::{debug, info, info_span, warn, Instrument}; + +use super::super::messages::{ControlPlaneError, GetRoleSecret, WakeCompute}; +use super::errors::{ApiError, GetAuthInfoError, WakeComputeError}; use super::{ - super::messages::{ControlPlaneError, GetRoleSecret, WakeCompute}, - errors::{ApiError, GetAuthInfoError, WakeComputeError}, ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, }; -use crate::{ - auth::backend::{jwt::AuthRule, ComputeUserInfo}, - compute, - control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason}, - http, - metrics::{CacheOutcome, Metrics}, - rate_limiter::WakeComputeRateLimiter, - scram, EndpointCacheKey, EndpointId, -}; -use crate::{cache::Cached, context::RequestMonitoring}; -use ::http::{header::AUTHORIZATION, HeaderName}; -use anyhow::bail; -use 
futures::TryFutureExt; -use std::{sync::Arc, time::Duration}; -use tokio::time::Instant; -use tokio_postgres::config::SslMode; -use tracing::{debug, error, info, info_span, warn, Instrument}; +use crate::auth::backend::jwt::AuthRule; +use crate::auth::backend::ComputeUserInfo; +use crate::cache::Cached; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::GetEndpointJwksError; +use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason}; +use crate::metrics::{CacheOutcome, Metrics}; +use crate::rate_limiter::WakeComputeRateLimiter; +use crate::{compute, http, scram, EndpointCacheKey, EndpointId}; const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id"); @@ -137,14 +139,14 @@ impl Api { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, GetEndpointJwksError> { if !self .caches .endpoints_cache .is_valid(ctx, &endpoint.normalize()) .await { - bail!("endpoint not found"); + return Err(GetEndpointJwksError::EndpointNotFound); } let request_id = ctx.session_id().to_string(); async { @@ -159,12 +161,17 @@ impl Api { .header(X_REQUEST_ID, &request_id) .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) .query(&[("session_id", ctx.session_id())]) - .build()?; + .build() + .map_err(GetEndpointJwksError::RequestBuild)?; info!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); - let response = self.endpoint.execute(request).await?; + let response = self + .endpoint + .execute(request) + .await + .map_err(GetEndpointJwksError::RequestExecute)?; drop(pause); info!(duration = ?start.elapsed(), "received http response"); @@ -330,7 +337,7 @@ impl super::Api for Api { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, GetEndpointJwksError> { self.do_get_endpoint_jwks(ctx, endpoint).await } @@ -348,7 +355,7 @@ impl super::Api for Api { 
let (cached, info) = cached.take_value(); let info = info.map_err(|c| { info!(key = &*key, "found cached wake_compute error"); - WakeComputeError::ApiError(ApiError::ControlPlane(*c)) + WakeComputeError::ApiError(ApiError::ControlPlane(Box::new(*c))) })?; debug!(key = &*key, "found cached compute node info"); @@ -418,7 +425,7 @@ impl super::Api for Api { self.caches.node_info.insert_ttl( key, - Err(Box::new(err.clone())), + Err(err.clone()), Duration::from_secs(30), ); @@ -456,8 +463,8 @@ async fn parse_body serde::Deserialize<'a>>( }); body.http_status_code = status; - error!("console responded with an error ({status}): {body:?}"); - Err(ApiError::ControlPlane(body)) + warn!("console responded with an error ({status}): {body:?}"); + Err(ApiError::ControlPlane(Box::new(body))) } fn parse_host_port(input: &str) -> Option<(&str, u16)> { diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 53f9f75c5b..e71ed0c048 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -1,4 +1,5 @@ -use std::{error::Error as StdError, fmt, io}; +use std::error::Error as StdError; +use std::{fmt, io}; use measured::FixedCardinalityLabel; @@ -49,6 +50,10 @@ pub enum ErrorKind { #[label(rename = "serviceratelimit")] ServiceRateLimit, + /// Proxy quota limit violation + #[label(rename = "quota")] + Quota, + /// internal errors Service, @@ -70,6 +75,7 @@ impl ErrorKind { ErrorKind::ClientDisconnect => "clientdisconnect", ErrorKind::RateLimit => "ratelimit", ErrorKind::ServiceRateLimit => "serviceratelimit", + ErrorKind::Quota => "quota", ErrorKind::Service => "service", ErrorKind::ControlPlane => "controlplane", ErrorKind::Postgres => "postgres", diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index d0352351d5..978ad9f761 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -1,19 +1,18 @@ +use std::convert::Infallible; +use std::net::TcpListener; +use std::sync::{Arc, Mutex}; + use anyhow::{anyhow, bail}; -use 
hyper0::{header::CONTENT_TYPE, Body, Request, Response, StatusCode}; -use measured::{text::BufferedTextEncoder, MetricGroup}; +use hyper0::header::CONTENT_TYPE; +use hyper0::{Body, Request, Response, StatusCode}; +use measured::text::BufferedTextEncoder; +use measured::MetricGroup; use metrics::NeonMetrics; -use std::{ - convert::Infallible, - net::TcpListener, - sync::{Arc, Mutex}, -}; use tracing::{info, info_span}; -use utils::http::{ - endpoint::{self, request_span}, - error::ApiError, - json::json_response, - RouterBuilder, RouterService, -}; +use utils::http::endpoint::{self, request_span}; +use utils::http::error::ApiError; +use utils::http::json::json_response; +use utils::http::{RouterBuilder, RouterService}; use crate::jemalloc; diff --git a/proxy/src/http/mod.rs b/proxy/src/http/mod.rs index d8676d5b50..fd587e8f01 100644 --- a/proxy/src/http/mod.rs +++ b/proxy/src/http/mod.rs @@ -10,17 +10,15 @@ use anyhow::bail; use bytes::Bytes; use http_body_util::BodyExt; use hyper::body::Body; +pub(crate) use reqwest::{Request, Response}; +use reqwest_middleware::RequestBuilder; +pub(crate) use reqwest_middleware::{ClientWithMiddleware, Error}; +pub(crate) use reqwest_retry::policies::ExponentialBackoff; +pub(crate) use reqwest_retry::RetryTransientMiddleware; use serde::de::DeserializeOwned; -pub(crate) use reqwest::{Request, Response}; -pub(crate) use reqwest_middleware::{ClientWithMiddleware, Error}; -pub(crate) use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; - -use crate::{ - metrics::{ConsoleRequest, Metrics}, - url::ApiUrl, -}; -use reqwest_middleware::RequestBuilder; +use crate::metrics::{ConsoleRequest, Metrics}; +use crate::url::ApiUrl; /// This is the preferred way to create new http clients, /// because it takes care of observability (OpenTelemetry). 
@@ -142,9 +140,10 @@ pub(crate) async fn parse_json_body_with_limit( #[cfg(test)] mod tests { - use super::*; use reqwest::Client; + use super::*; + #[test] fn optional_query_params() -> anyhow::Result<()> { let url = "http://example.com".parse()?; diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index 108420d7d7..09fd9657d0 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -1,6 +1,8 @@ -use std::{ - hash::BuildHasherDefault, marker::PhantomData, num::NonZeroUsize, ops::Index, sync::OnceLock, -}; +use std::hash::BuildHasherDefault; +use std::marker::PhantomData; +use std::num::NonZeroUsize; +use std::ops::Index; +use std::sync::OnceLock; use lasso::{Capacity, MemoryLimits, Spur, ThreadedRodeo}; use rustc_hash::FxHasher; @@ -208,9 +210,8 @@ impl From for ProjectIdInt { mod tests { use std::sync::OnceLock; - use crate::intern::StringInterner; - use super::InternId; + use crate::intern::StringInterner; struct MyId; impl InternId for MyId { @@ -222,7 +223,8 @@ mod tests { #[test] fn push_many_strings() { - use rand::{rngs::StdRng, Rng, SeedableRng}; + use rand::rngs::StdRng; + use rand::{Rng, SeedableRng}; use rand_distr::Zipf; let endpoint_dist = Zipf::new(500000, 0.8).unwrap(); diff --git a/proxy/src/jemalloc.rs b/proxy/src/jemalloc.rs index d307d80f4a..0fae78b60c 100644 --- a/proxy/src/jemalloc.rs +++ b/proxy/src/jemalloc.rs @@ -1,14 +1,12 @@ use std::marker::PhantomData; -use measured::{ - label::NoLabels, - metric::{ - gauge::GaugeState, group::Encoding, name::MetricNameEncoder, MetricEncoding, - MetricFamilyEncoding, MetricType, - }, - text::TextEncoder, - LabelGroup, MetricGroup, -}; +use measured::label::NoLabels; +use measured::metric::gauge::GaugeState; +use measured::metric::group::Encoding; +use measured::metric::name::MetricNameEncoder; +use measured::metric::{MetricEncoding, MetricFamilyEncoding, MetricType}; +use measured::text::TextEncoder; +use measured::{LabelGroup, MetricGroup}; use tikv_jemalloc_ctl::{config, epoch, epoch_mib, 
stats, version}; pub struct MetricRecorder { diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 8d274baa10..74bc778a36 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -95,6 +95,7 @@ pub mod cache; pub mod cancellation; pub mod compute; pub mod config; +pub mod console_redirect_proxy; pub mod context; pub mod control_plane; pub mod error; diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index a34eb820f8..11921867e4 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -1,14 +1,10 @@ use tracing::Subscriber; -use tracing_subscriber::{ - filter::{EnvFilter, LevelFilter}, - fmt::{ - format::{Format, Full}, - time::SystemTime, - FormatEvent, FormatFields, - }, - prelude::*, - registry::LookupSpan, -}; +use tracing_subscriber::filter::{EnvFilter, LevelFilter}; +use tracing_subscriber::fmt::format::{Format, Full}; +use tracing_subscriber::fmt::time::SystemTime; +use tracing_subscriber::fmt::{FormatEvent, FormatFields}; +use tracing_subscriber::prelude::*; +use tracing_subscriber::registry::LookupSpan; /// Initialize logging and OpenTelemetry tracing and exporter. 
/// diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 272723a1bc..542826e833 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -1,14 +1,16 @@ use std::sync::{Arc, OnceLock}; use lasso::ThreadedRodeo; +use measured::label::{ + FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet, +}; +use measured::metric::histogram::Thresholds; +use measured::metric::name::MetricName; use measured::{ - label::{FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet}, - metric::{histogram::Thresholds, name::MetricName}, Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup, MetricGroup, }; use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; - use tokio::time::{self, Instant}; use crate::control_plane::messages::ColdStartInfo; diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 17764f78d1..ef2391cdd8 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -1,11 +1,9 @@ //! 
Proxy Protocol V2 implementation -use std::{ - io, - net::SocketAddr, - pin::Pin, - task::{Context, Poll}, -}; +use std::io; +use std::net::SocketAddr; +use std::pin::Pin; +use std::task::{Context, Poll}; use bytes::BytesMut; use pin_project_lite::pin_project; diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index aac7720890..8e9663626a 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -1,24 +1,23 @@ -use crate::{ - auth::backend::ComputeCredentialKeys, - compute::COULD_NOT_CONNECT, - compute::{self, PostgresConnection}, - config::RetryConfig, - context::RequestMonitoring, - control_plane::{self, errors::WakeComputeError, locks::ApiLocks, CachedNodeInfo, NodeInfo}, - error::ReportableError, - metrics::{ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType}, - proxy::{ - retry::{retry_after, should_retry, CouldRetry}, - wake_compute::wake_compute, - }, - Host, -}; use async_trait::async_trait; use pq_proto::StartupMessageParams; use tokio::time; use tracing::{debug, info, warn}; use super::retry::ShouldRetryWakeCompute; +use crate::auth::backend::ComputeCredentialKeys; +use crate::compute::{self, PostgresConnection, COULD_NOT_CONNECT}; +use crate::config::RetryConfig; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::WakeComputeError; +use crate::control_plane::locks::ApiLocks; +use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; +use crate::error::ReportableError; +use crate::metrics::{ + ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType, +}; +use crate::proxy::retry::{retry_after, should_retry, CouldRetry}; +use crate::proxy::wake_compute::wake_compute; +use crate::Host; const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 4ebda013ac..91a3ceff75 100644 --- 
a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -1,11 +1,11 @@ -use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; -use tracing::info; - use std::future::poll_fn; use std::io; use std::pin::Pin; use std::task::{ready, Context, Poll}; +use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; +use tracing::info; + #[derive(Debug)] enum TransferState { Running(CopyBuffer), @@ -256,9 +256,10 @@ impl CopyBuffer { #[cfg(test)] mod tests { - use super::*; use tokio::io::AsyncWriteExt; + use super::*; + #[tokio::test] async fn test_client_to_compute() { let (mut client_client, mut client_proxy) = tokio::io::duplex(8); // Create a mock duplex stream diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index 5996b11c11..a67f1b8112 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -1,21 +1,19 @@ use bytes::Buf; +use pq_proto::framed::Framed; use pq_proto::{ - framed::Framed, BeMessage as Be, CancelKeyData, FeStartupPacket, ProtocolVersion, - StartupMessageParams, + BeMessage as Be, CancelKeyData, FeStartupPacket, ProtocolVersion, StartupMessageParams, }; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; -use crate::{ - auth::endpoint_sni, - config::{TlsConfig, PG_ALPN_PROTOCOL}, - context::RequestMonitoring, - error::ReportableError, - metrics::Metrics, - proxy::ERR_INSECURE_CONNECTION, - stream::{PqStream, Stream, StreamUpgradeError}, -}; +use crate::auth::endpoint_sni; +use crate::config::{TlsConfig, PG_ALPN_PROTOCOL}; +use crate::context::RequestMonitoring; +use crate::error::ReportableError; +use crate::metrics::Metrics; +use crate::proxy::ERR_INSECURE_CONNECTION; +use crate::stream::{PqStream, Stream, StreamUpgradeError}; #[derive(Error, Debug)] pub(crate) enum HandshakeError { diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 7003af2aba..f646862caa 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -7,40 +7,32 @@ 
pub(crate) mod handshake; pub(crate) mod passthrough; pub(crate) mod retry; pub(crate) mod wake_compute; -pub use copy_bidirectional::copy_bidirectional_client_compute; -pub use copy_bidirectional::ErrorSource; +use std::sync::Arc; -use crate::config::ProxyProtocolV2; -use crate::{ - auth, - cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal}, - compute, - config::{ProxyConfig, TlsConfig}, - context::RequestMonitoring, - error::ReportableError, - metrics::{Metrics, NumClientConnectionsGuard}, - protocol2::read_proxy_protocol, - proxy::handshake::{handshake, HandshakeData}, - rate_limiter::EndpointRateLimiter, - stream::{PqStream, Stream}, - EndpointCacheKey, -}; +pub use copy_bidirectional::{copy_bidirectional_client_compute, ErrorSource}; use futures::TryFutureExt; use itertools::Itertools; use once_cell::sync::OnceCell; use pq_proto::{BeMessage as Be, StartupMessageParams}; use regex::Regex; use smol_str::{format_smolstr, SmolStr}; -use std::sync::Arc; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; -use tracing::{error, info, Instrument}; +use tracing::{error, info, warn, Instrument}; -use self::{ - connect_compute::{connect_to_compute, TcpMechanism}, - passthrough::ProxyPassthrough, -}; +use self::connect_compute::{connect_to_compute, TcpMechanism}; +use self::passthrough::ProxyPassthrough; +use crate::cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal}; +use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig}; +use crate::context::RequestMonitoring; +use crate::error::ReportableError; +use crate::metrics::{Metrics, NumClientConnectionsGuard}; +use crate::protocol2::read_proxy_protocol; +use crate::proxy::handshake::{handshake, HandshakeData}; +use crate::rate_limiter::EndpointRateLimiter; +use crate::stream::{PqStream, Stream}; +use crate::{auth, compute, EndpointCacheKey}; const ERR_INSECURE_CONNECTION: &str = "connection is 
insecure (try using `sslmode=require`)"; @@ -61,6 +53,7 @@ pub async fn run_until_cancelled( pub async fn task_main( config: &'static ProxyConfig, + auth_backend: &'static auth::Backend<'static, ()>, listener: tokio::net::TcpListener, cancellation_token: CancellationToken, cancellation_handler: Arc, @@ -95,15 +88,15 @@ pub async fn task_main( connections.spawn(async move { let (socket, peer_addr) = match read_proxy_protocol(socket).await { Err(e) => { - error!("per-client task finished with an error: {e:#}"); + warn!("per-client task finished with an error: {e:#}"); return; } Ok((_socket, None)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => { - error!("missing required proxy protocol header"); + warn!("missing required proxy protocol header"); return; } Ok((_socket, Some(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => { - error!("proxy protocol header not supported"); + warn!("proxy protocol header not supported"); return; } Ok((socket, Some(addr))) => (socket, addr.ip()), @@ -129,6 +122,7 @@ pub async fn task_main( let startup = Box::pin( handle_client( config, + auth_backend, &ctx, cancellation_handler, socket, @@ -144,7 +138,7 @@ pub async fn task_main( Err(e) => { // todo: log and push to ctx the error kind ctx.set_error_kind(e.get_error_kind()); - error!(parent: &span, "per-client task finished with an error: {e:#}"); + warn!(parent: &span, "per-client task finished with an error: {e:#}"); } Ok(None) => { ctx.set_success(); @@ -155,7 +149,7 @@ pub async fn task_main( match p.proxy_pass().instrument(span.clone()).await { Ok(()) => {} Err(ErrorSource::Client(e)) => { - error!(parent: &span, "per-client task finished with an IO error from the client: {e:#}"); + warn!(parent: &span, "per-client task finished with an IO error from the client: {e:#}"); } Err(ErrorSource::Compute(e)) => { error!(parent: &span, "per-client task finished with an IO error from the compute: {e:#}"); @@ -243,8 +237,10 @@ impl ReportableError for 
ClientRequestError { } } +#[allow(clippy::too_many_arguments)] pub(crate) async fn handle_client( config: &'static ProxyConfig, + auth_backend: &'static auth::Backend<'static, ()>, ctx: &RequestMonitoring, cancellation_handler: Arc, stream: S, @@ -285,8 +281,7 @@ pub(crate) async fn handle_client( let common_names = tls.map(|tls| &tls.common_names); // Extract credentials which we're going to use for auth. - let result = config - .auth_backend + let result = auth_backend .as_ref() .map(|()| auth::ComputeUserInfoMaybeEndpoint::parse(ctx, ¶ms, hostname, common_names)) .transpose(); @@ -353,7 +348,7 @@ pub(crate) async fn handle_client( /// Finish client connection initialization: confirm auth success, send params, etc. #[tracing::instrument(skip_all)] -async fn prepare_client_connection

( +pub(crate) async fn prepare_client_connection

( node: &compute::PostgresConnection, session: &cancellation::Session

, stream: &mut PqStream, diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index bbea47f8af..e3b4730982 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -1,16 +1,14 @@ -use crate::{ - cancellation, - compute::PostgresConnection, - control_plane::messages::MetricsAuxInfo, - metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}, - stream::Stream, - usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}, -}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; use utils::measured_stream::MeasuredStream; use super::copy_bidirectional::ErrorSource; +use crate::cancellation; +use crate::compute::PostgresConnection; +use crate::control_plane::messages::MetricsAuxInfo; +use crate::metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}; +use crate::stream::Stream; +use crate::usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}; /// Forward bytes in both directions (client <-> compute). 
#[tracing::instrument(skip_all)] @@ -71,7 +69,7 @@ impl ProxyPassthrough { pub(crate) async fn proxy_pass(self) -> Result<(), ErrorSource> { let res = proxy_pass(self.client, self.compute.stream, self.aux).await; if let Err(err) = self.compute.cancel_closure.try_cancel_query().await { - tracing::error!(?err, "could not cancel the query in the database"); + tracing::warn!(?err, "could not cancel the query in the database"); } res } diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index 15895d37e6..d3f0c3e7d4 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -1,7 +1,11 @@ -use crate::{compute, config::RetryConfig}; -use std::{error::Error, io}; +use std::error::Error; +use std::io; + use tokio::time; +use crate::compute; +use crate::config::RetryConfig; + pub(crate) trait CouldRetry { /// Returns true if the error could be retried fn could_retry(&self) -> bool; diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index 33a2162bc7..df9f79a7e3 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -6,7 +6,6 @@ use std::fmt::Debug; -use super::*; use bytes::{Bytes, BytesMut}; use futures::{SinkExt, StreamExt}; use postgres_protocol::message::frontend; @@ -14,6 +13,8 @@ use tokio::io::{AsyncReadExt, DuplexStream}; use tokio_postgres::tls::TlsConnect; use tokio_util::codec::{Decoder, Encoder}; +use super::*; + enum Intercept { None, Methods, diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 3861ddc8ed..e50ae4bc93 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -4,6 +4,16 @@ mod mitm; use std::time::Duration; +use anyhow::{bail, Context}; +use async_trait::async_trait; +use http::StatusCode; +use retry::{retry_after, ShouldRetryWakeCompute}; +use rstest::rstest; +use rustls::pki_types; +use tokio_postgres::config::SslMode; +use tokio_postgres::tls::{MakeTlsConnect, NoTls}; +use tokio_postgres_rustls::{MakeRustlsConnect, 
RustlsStream}; + use super::connect_compute::ConnectMechanism; use super::retry::CouldRetry; use super::*; @@ -18,15 +28,6 @@ use crate::control_plane::provider::{ use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; use crate::error::ErrorKind; use crate::{sasl, scram, BranchId, EndpointId, ProjectId}; -use anyhow::{bail, Context}; -use async_trait::async_trait; -use http::StatusCode; -use retry::{retry_after, ShouldRetryWakeCompute}; -use rstest::rstest; -use rustls::pki_types; -use tokio_postgres::config::SslMode; -use tokio_postgres::tls::{MakeTlsConnect, NoTls}; -use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream}; /// Generate a set of TLS certificates: CA + server. fn generate_certs( @@ -336,7 +337,8 @@ async fn scram_auth_mock() -> anyhow::Result<()> { generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), Scram::mock())); - use rand::{distributions::Alphanumeric, Rng}; + use rand::distributions::Alphanumeric; + use rand::Rng; let password: String = rand::thread_rng() .sample_iter(&Alphanumeric) .take(rand::random::() as usize) @@ -492,30 +494,32 @@ impl TestBackend for TestConnectMechanism { match action { ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)), ConnectAction::WakeFail => { - let err = control_plane::errors::ApiError::ControlPlane(ControlPlaneError { - http_status_code: StatusCode::BAD_REQUEST, - error: "TEST".into(), - status: None, - }); + let err = + control_plane::errors::ApiError::ControlPlane(Box::new(ControlPlaneError { + http_status_code: StatusCode::BAD_REQUEST, + error: "TEST".into(), + status: None, + })); assert!(!err.could_retry()); Err(control_plane::errors::WakeComputeError::ApiError(err)) } ConnectAction::WakeRetry => { - let err = control_plane::errors::ApiError::ControlPlane(ControlPlaneError { - http_status_code: StatusCode::BAD_REQUEST, - error: "TEST".into(), - status: Some(Status { - code: "error".into(), - 
message: "error".into(), - details: Details { - error_info: None, - retry_info: Some(control_plane::messages::RetryInfo { - retry_delay_ms: 1, - }), - user_facing_message: None, - }, - }), - }); + let err = + control_plane::errors::ApiError::ControlPlane(Box::new(ControlPlaneError { + http_status_code: StatusCode::BAD_REQUEST, + error: "TEST".into(), + status: Some(Status { + code: "error".into(), + message: "error".into(), + details: Details { + error_info: None, + retry_info: Some(control_plane::messages::RetryInfo { + retry_delay_ms: 1, + }), + user_facing_message: None, + }, + }), + })); assert!(err.could_retry()); Err(control_plane::errors::WakeComputeError::ApiError(err)) } @@ -552,7 +556,7 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn fn helper_create_connect_info( mechanism: &TestConnectMechanism, -) -> auth::Backend<'static, ComputeCredentials, &()> { +) -> auth::Backend<'static, ComputeCredentials> { let user_info = auth::Backend::ControlPlane( MaybeOwned::Owned(ControlPlaneBackend::Test(Box::new(mechanism.clone()))), ComputeCredentials { diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index ba674f5d0d..9dfa485fa4 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,16 +1,17 @@ +use hyper::StatusCode; +use tracing::{error, info, warn}; + +use super::connect_compute::ComputeConnectBackend; use crate::config::RetryConfig; use crate::context::RequestMonitoring; +use crate::control_plane::errors::WakeComputeError; use crate::control_plane::messages::{ControlPlaneError, Reason}; -use crate::control_plane::{errors::WakeComputeError, provider::CachedNodeInfo}; +use crate::control_plane::provider::CachedNodeInfo; use crate::metrics::{ ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType, WakeupFailureKind, }; use crate::proxy::retry::{retry_after, should_retry}; -use hyper::StatusCode; -use tracing::{error, info, warn}; - 
-use super::connect_compute::ComputeConnectBackend; pub(crate) async fn wake_compute( num_retries: &mut u32, @@ -79,7 +80,7 @@ fn report_error(e: &WakeComputeError, retry: bool) { Reason::ConcurrencyLimitReached => WakeupFailureKind::ApiConsoleLocked, Reason::LockAlreadyTaken => WakeupFailureKind::ApiConsoleLocked, Reason::RunningOperations => WakeupFailureKind::ApiConsoleLocked, - Reason::Unknown => match e { + Reason::Unknown => match **e { ControlPlaneError { http_status_code: StatusCode::LOCKED, ref error, diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs index bf4d85f2e4..45f9630dde 100644 --- a/proxy/src/rate_limiter/leaky_bucket.rs +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -1,7 +1,5 @@ -use std::{ - hash::Hash, - sync::atomic::{AtomicUsize, Ordering}, -}; +use std::hash::Hash; +use std::sync::atomic::{AtomicUsize, Ordering}; use ahash::RandomState; use dashmap::DashMap; diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs index 25607b7e10..16c398f303 100644 --- a/proxy/src/rate_limiter/limit_algorithm.rs +++ b/proxy/src/rate_limiter/limit_algorithm.rs @@ -1,10 +1,12 @@ //! Algorithms for controlling concurrency limits. 
+use std::pin::pin; +use std::sync::Arc; +use std::time::Duration; + use parking_lot::Mutex; -use std::{pin::pin, sync::Arc, time::Duration}; -use tokio::{ - sync::Notify, - time::{error::Elapsed, Instant}, -}; +use tokio::sync::Notify; +use tokio::time::error::Elapsed; +use tokio::time::Instant; use self::aimd::Aimd; diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs index 86b56e38fb..5332a5184f 100644 --- a/proxy/src/rate_limiter/limit_algorithm/aimd.rs +++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs @@ -60,12 +60,11 @@ impl LimitAlgorithm for Aimd { mod tests { use std::time::Duration; + use super::*; use crate::rate_limiter::limit_algorithm::{ DynamicLimiter, RateLimitAlgorithm, RateLimiterConfig, }; - use super::*; - #[tokio::test(start_paused = true)] async fn increase_decrease() { let config = RateLimiterConfig { diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index be529f174d..5de64c2254 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -1,17 +1,14 @@ -use std::{ - borrow::Cow, - collections::hash_map::RandomState, - hash::{BuildHasher, Hash}, - sync::{ - atomic::{AtomicUsize, Ordering}, - Mutex, - }, -}; +use std::borrow::Cow; +use std::collections::hash_map::RandomState; +use std::hash::{BuildHasher, Hash}; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Mutex; use anyhow::bail; use dashmap::DashMap; use itertools::Itertools; -use rand::{rngs::StdRng, Rng, SeedableRng}; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; use tokio::time::{Duration, Instant}; use tracing::info; @@ -243,14 +240,17 @@ impl BucketRateLimiter { #[cfg(test)] mod tests { - use std::{hash::BuildHasherDefault, time::Duration}; + use std::hash::BuildHasherDefault; + use std::time::Duration; use rand::SeedableRng; use rustc_hash::FxHasher; use tokio::time; use super::{BucketRateLimiter, WakeComputeRateLimiter}; - use 
crate::{intern::EndpointIdInt, rate_limiter::RateBucketInfo, EndpointId}; + use crate::intern::EndpointIdInt; + use crate::rate_limiter::RateBucketInfo; + use crate::EndpointId; #[test] fn rate_bucket_rpi() { diff --git a/proxy/src/rate_limiter/mod.rs b/proxy/src/rate_limiter/mod.rs index 6e38f89458..3ae2ecaf8f 100644 --- a/proxy/src/rate_limiter/mod.rs +++ b/proxy/src/rate_limiter/mod.rs @@ -2,13 +2,11 @@ mod leaky_bucket; mod limit_algorithm; mod limiter; +pub use leaky_bucket::{EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter}; #[cfg(test)] pub(crate) use limit_algorithm::aimd::Aimd; - pub(crate) use limit_algorithm::{ DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token, }; pub(crate) use limiter::GlobalRateLimiter; - -pub use leaky_bucket::{EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter}; pub use limiter::{BucketRateLimiter, RateBucketInfo, WakeComputeRateLimiter}; diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 95bdfc0965..0000246971 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -5,13 +5,10 @@ use redis::AsyncCommands; use tokio::sync::Mutex; use uuid::Uuid; +use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; +use super::notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME}; use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; -use super::{ - connection_with_credentials_provider::ConnectionWithCredentialsProvider, - notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME}, -}; - pub trait CancellationPublisherMut: Send + Sync + 'static { #[allow(async_fn_in_trait)] async fn try_publish( diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index 2de66b58b1..82139ea1d5 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ 
b/proxy/src/redis/connection_with_credentials_provider.rs @@ -1,12 +1,11 @@ -use std::{sync::Arc, time::Duration}; +use std::sync::Arc; +use std::time::Duration; use futures::FutureExt; -use redis::{ - aio::{ConnectionLike, MultiplexedConnection}, - ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult, -}; +use redis::aio::{ConnectionLike, MultiplexedConnection}; +use redis::{ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult}; use tokio::task::JoinHandle; -use tracing::{debug, error, info}; +use tracing::{debug, error, info, warn}; use super::elasticache::CredentialsProvider; @@ -89,7 +88,7 @@ impl ConnectionWithCredentialsProvider { return Ok(()); } Err(e) => { - error!("Error during PING: {e:?}"); + warn!("Error during PING: {e:?}"); } } } else { @@ -121,7 +120,7 @@ impl ConnectionWithCredentialsProvider { info!("Connection succesfully established"); } Err(e) => { - error!("Connection is broken. Error during PING: {e:?}"); + warn!("Connection is broken. 
Error during PING: {e:?}"); } } self.con = Some(con); diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 36a3443603..e56c5a3414 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -1,4 +1,5 @@ -use std::{convert::Infallible, sync::Arc}; +use std::convert::Infallible; +use std::sync::Arc; use futures::StreamExt; use pq_proto::CancelKeyData; @@ -8,12 +9,10 @@ use tokio_util::sync::CancellationToken; use uuid::Uuid; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; -use crate::{ - cache::project_info::ProjectInfoCache, - cancellation::{CancelMap, CancellationHandler}, - intern::{ProjectIdInt, RoleNameInt}, - metrics::{Metrics, RedisErrors, RedisEventsCount}, -}; +use crate::cache::project_info::ProjectInfoCache; +use crate::cancellation::{CancelMap, CancellationHandler}; +use crate::intern::{ProjectIdInt, RoleNameInt}; +use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates"; @@ -146,7 +145,7 @@ impl MessageHandler { { Ok(()) => {} Err(e) => { - tracing::error!("failed to cancel session: {e}"); + tracing::warn!("failed to cancel session: {e}"); } } } @@ -269,10 +268,10 @@ where #[cfg(test)] mod tests { - use crate::{ProjectId, RoleName}; + use serde_json::json; use super::*; - use serde_json::json; + use crate::{ProjectId, RoleName}; #[test] fn parse_allowed_ips() -> anyhow::Result<()> { diff --git a/proxy/src/sasl/messages.rs b/proxy/src/sasl/messages.rs index 6c9a42b2db..1373dfba3d 100644 --- a/proxy/src/sasl/messages.rs +++ b/proxy/src/sasl/messages.rs @@ -1,8 +1,9 @@ //! Definitions for SASL messages. 
-use crate::parse::{split_at_const, split_cstr}; use pq_proto::{BeAuthenticationSaslMessage, BeMessage}; +use crate::parse::{split_at_const, split_cstr}; + /// SASL-specific payload of [`PasswordMessage`](pq_proto::FeMessage::PasswordMessage). #[derive(Debug)] pub(crate) struct FirstMessage<'a> { diff --git a/proxy/src/sasl/mod.rs b/proxy/src/sasl/mod.rs index 0a36694359..f0181b404f 100644 --- a/proxy/src/sasl/mod.rs +++ b/proxy/src/sasl/mod.rs @@ -10,13 +10,14 @@ mod channel_binding; mod messages; mod stream; -use crate::error::{ReportableError, UserFacingError}; use std::io; -use thiserror::Error; pub(crate) use channel_binding::ChannelBinding; pub(crate) use messages::FirstMessage; pub(crate) use stream::{Outcome, SaslStream}; +use thiserror::Error; + +use crate::error::{ReportableError, UserFacingError}; /// Fine-grained auth errors help in writing tests. #[derive(Error, Debug)] diff --git a/proxy/src/sasl/stream.rs b/proxy/src/sasl/stream.rs index b6becd28e1..f1c916daa2 100644 --- a/proxy/src/sasl/stream.rs +++ b/proxy/src/sasl/stream.rs @@ -1,11 +1,14 @@ //! Abstraction for the string-oriented SASL protocols. -use super::{messages::ServerMessage, Mechanism}; -use crate::stream::PqStream; use std::io; + use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; +use super::messages::ServerMessage; +use super::Mechanism; +use crate::stream::PqStream; + /// Abstracts away all peculiarities of the libpq's protocol. pub(crate) struct SaslStream<'a, S> { /// The underlying stream. 
diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs index 64ee0135e1..87ab6e0d5f 100644 --- a/proxy/src/scram/countmin.rs +++ b/proxy/src/scram/countmin.rs @@ -69,7 +69,9 @@ impl CountMinSketch { #[cfg(test)] mod tests { - use rand::{rngs::StdRng, seq::SliceRandom, Rng, SeedableRng}; + use rand::rngs::StdRng; + use rand::seq::SliceRandom; + use rand::{Rng, SeedableRng}; use super::CountMinSketch; diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index afb5604666..493295c938 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -209,7 +209,8 @@ impl sasl::Mechanism for Exchange<'_> { type Output = super::ScramKey; fn exchange(mut self, input: &str) -> sasl::Result> { - use {sasl::Step, ExchangeState}; + use sasl::Step; + use ExchangeState; match &self.state { ExchangeState::Initial(init) => { match init.transition(self.secret, &self.tls_server_end_point, input)? { diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index fd9e77764c..5ee3a51352 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -1,11 +1,12 @@ //! Definitions for SCRAM messages. +use std::fmt; +use std::ops::Range; + use super::base64_decode_array; use super::key::{ScramKey, SCRAM_KEY_LEN}; use super::signature::SignatureBuilder; use crate::sasl::ChannelBinding; -use std::fmt; -use std::ops::Range; /// Faithfully taken from PostgreSQL. 
pub(crate) const SCRAM_RAW_NONCE_LEN: usize = 18; diff --git a/proxy/src/scram/mod.rs b/proxy/src/scram/mod.rs index d058f1c3f8..97644b6282 100644 --- a/proxy/src/scram/mod.rs +++ b/proxy/src/scram/mod.rs @@ -16,10 +16,9 @@ mod signature; pub mod threadpool; pub(crate) use exchange::{exchange, Exchange}; +use hmac::{Hmac, Mac}; pub(crate) use key::ScramKey; pub(crate) use secret::ServerSecret; - -use hmac::{Hmac, Mac}; use sha2::{Digest, Sha256}; const SCRAM_SHA_256: &str = "SCRAM-SHA-256"; @@ -59,13 +58,11 @@ fn sha256<'a>(parts: impl IntoIterator) -> [u8; 32] { #[cfg(test)] mod tests { - use crate::{ - intern::EndpointIdInt, - sasl::{Mechanism, Step}, - EndpointId, - }; - - use super::{threadpool::ThreadPool, Exchange, ServerSecret}; + use super::threadpool::ThreadPool; + use super::{Exchange, ServerSecret}; + use crate::intern::EndpointIdInt; + use crate::sasl::{Mechanism, Step}; + use crate::EndpointId; #[test] fn snapshot() { diff --git a/proxy/src/scram/pbkdf2.rs b/proxy/src/scram/pbkdf2.rs index 4cf76c8452..9c559e9082 100644 --- a/proxy/src/scram/pbkdf2.rs +++ b/proxy/src/scram/pbkdf2.rs @@ -1,7 +1,6 @@ -use hmac::{ - digest::{consts::U32, generic_array::GenericArray}, - Hmac, Mac, -}; +use hmac::digest::consts::U32; +use hmac::digest::generic_array::GenericArray; +use hmac::{Hmac, Mac}; use sha2::Sha256; pub(crate) struct Pbkdf2 { @@ -66,10 +65,11 @@ impl Pbkdf2 { #[cfg(test)] mod tests { - use super::Pbkdf2; use pbkdf2::pbkdf2_hmac_array; use sha2::Sha256; + use super::Pbkdf2; + #[test] fn works() { let salt = b"sodium chloride"; diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs index c027a0cd20..cc1b69fcf9 100644 --- a/proxy/src/scram/threadpool.rs +++ b/proxy/src/scram/threadpool.rs @@ -4,28 +4,21 @@ //! 1. Fairness per endpoint. //! 2. Yield support for high iteration counts. 
-use std::{ - cell::RefCell, - future::Future, - pin::Pin, - sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, Weak, - }, - task::{Context, Poll}, -}; +use std::cell::RefCell; +use std::future::Future; +use std::pin::Pin; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::{Arc, Weak}; +use std::task::{Context, Poll}; use futures::FutureExt; -use rand::Rng; -use rand::{rngs::SmallRng, SeedableRng}; - -use crate::{ - intern::EndpointIdInt, - metrics::{ThreadPoolMetrics, ThreadPoolWorkerId}, - scram::countmin::CountMinSketch, -}; +use rand::rngs::SmallRng; +use rand::{Rng, SeedableRng}; use super::pbkdf2::Pbkdf2; +use crate::intern::EndpointIdInt; +use crate::metrics::{ThreadPoolMetrics, ThreadPoolWorkerId}; +use crate::scram::countmin::CountMinSketch; pub struct ThreadPool { runtime: Option, @@ -195,9 +188,8 @@ impl Drop for JobHandle { #[cfg(test)] mod tests { - use crate::EndpointId; - use super::*; + use crate::EndpointId; #[tokio::test] async fn hash_is_correct() { diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index f54476b51d..a180c4c2ed 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -1,47 +1,41 @@ -use std::{io, sync::Arc, time::Duration}; +use std::io; +use std::sync::Arc; +use std::time::Duration; use async_trait::async_trait; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; +use p256::ecdsa::SigningKey; +use p256::elliptic_curve::JwkEcKey; +use rand::rngs::OsRng; use tokio::net::{lookup_host, TcpStream}; -use tokio_postgres::types::ToSql; -use tracing::{debug, field::display, info}; +use tracing::field::display; +use tracing::{debug, info}; -use crate::{ - auth::{ - self, - backend::{local::StaticAuthRules, ComputeCredentials, ComputeUserInfo}, - check_peer_addr_is_in_list, AuthError, - }, - compute, - config::{AuthenticationConfig, ProxyConfig}, - context::RequestMonitoring, - control_plane::{ - errors::{GetAuthInfoError, WakeComputeError}, - locks::ApiLocks, - 
provider::ApiLockError, - CachedNodeInfo, - }, - error::{ErrorKind, ReportableError, UserFacingError}, - intern::EndpointIdInt, - proxy::{ - connect_compute::ConnectMechanism, - retry::{CouldRetry, ShouldRetryWakeCompute}, - }, - rate_limiter::EndpointRateLimiter, - EndpointId, Host, -}; - -use super::{ - conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}, - http_conn_pool::{self, poll_http2_client}, - local_conn_pool::{self, LocalClient, LocalConnPool}, -}; +use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; +use super::http_conn_pool::{self, poll_http2_client}; +use super::local_conn_pool::{self, LocalClient, LocalConnPool}; +use crate::auth::backend::local::StaticAuthRules; +use crate::auth::backend::{ComputeCredentials, ComputeUserInfo}; +use crate::auth::{self, check_peer_addr_is_in_list, AuthError}; +use crate::config::ProxyConfig; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError}; +use crate::control_plane::locks::ApiLocks; +use crate::control_plane::provider::ApiLockError; +use crate::control_plane::CachedNodeInfo; +use crate::error::{ErrorKind, ReportableError, UserFacingError}; +use crate::intern::EndpointIdInt; +use crate::proxy::connect_compute::ConnectMechanism; +use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute}; +use crate::rate_limiter::EndpointRateLimiter; +use crate::{compute, EndpointId, Host}; pub(crate) struct PoolingBackend { pub(crate) http_conn_pool: Arc, pub(crate) local_pool: Arc>, pub(crate) pool: Arc>, pub(crate) config: &'static ProxyConfig, + pub(crate) auth_backend: &'static crate::auth::Backend<'static, ()>, pub(crate) endpoint_rate_limiter: Arc, } @@ -49,18 +43,13 @@ impl PoolingBackend { pub(crate) async fn authenticate_with_password( &self, ctx: &RequestMonitoring, - config: &AuthenticationConfig, user_info: &ComputeUserInfo, password: &[u8], ) -> Result { let user_info = user_info.clone(); - let backend = self - .config - 
.auth_backend - .as_ref() - .map(|()| user_info.clone()); + let backend = self.auth_backend.as_ref().map(|()| user_info.clone()); let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; - if config.ip_allowlist_check_enabled + if self.config.authentication_config.ip_allowlist_check_enabled && !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { return Err(AuthError::ip_address_not_allowed(ctx.peer_addr())); @@ -79,7 +68,6 @@ impl PoolingBackend { let secret = match cached_secret.value.clone() { Some(secret) => self.config.authentication_config.check_rate_limit( ctx, - config, secret, &user_info.endpoint, true, @@ -91,9 +79,13 @@ impl PoolingBackend { } }; let ep = EndpointIdInt::from(&user_info.endpoint); - let auth_outcome = - crate::auth::validate_password_and_exchange(&config.thread_pool, ep, password, secret) - .await?; + let auth_outcome = crate::auth::validate_password_and_exchange( + &self.config.authentication_config.thread_pool, + ep, + password, + secret, + ) + .await?; let res = match auth_outcome { crate::sasl::Outcome::Success(key) => { info!("user successfully authenticated"); @@ -113,13 +105,13 @@ impl PoolingBackend { pub(crate) async fn authenticate_with_jwt( &self, ctx: &RequestMonitoring, - config: &AuthenticationConfig, user_info: &ComputeUserInfo, jwt: String, ) -> Result { - match &self.config.auth_backend { + match &self.auth_backend { crate::auth::Backend::ControlPlane(console, ()) => { - config + self.config + .authentication_config .jwks_cache .check_jwt( ctx, @@ -136,11 +128,10 @@ impl PoolingBackend { keys: crate::auth::backend::ComputeCredentialKeys::None, }) } - crate::auth::Backend::ConsoleRedirect(_, ()) => Err(AuthError::auth_failed( - "JWT login over web auth proxy is not supported", - )), crate::auth::Backend::Local(_) => { - let keys = config + let keys = self + .config + .authentication_config .jwks_cache .check_jwt( ctx, @@ -185,7 +176,7 @@ impl PoolingBackend { let conn_id = 
uuid::Uuid::new_v4(); tracing::Span::current().record("conn_id", display(conn_id)); info!(%conn_id, "pool: opening a new connection '{conn_info}'"); - let backend = self.config.auth_backend.as_ref().map(|()| keys); + let backend = self.auth_backend.as_ref().map(|()| keys); crate::proxy::connect_compute::connect_to_compute( ctx, &TokioMechanism { @@ -217,21 +208,14 @@ impl PoolingBackend { let conn_id = uuid::Uuid::new_v4(); tracing::Span::current().record("conn_id", display(conn_id)); info!(%conn_id, "pool: opening a new connection '{conn_info}'"); - let backend = self - .config - .auth_backend - .as_ref() - .map(|()| ComputeCredentials { - info: ComputeUserInfo { - user: conn_info.user_info.user.clone(), - endpoint: EndpointId::from(format!( - "{}-local-proxy", - conn_info.user_info.endpoint - )), - options: conn_info.user_info.options.clone(), - }, - keys: crate::auth::backend::ComputeCredentialKeys::None, - }); + let backend = self.auth_backend.as_ref().map(|()| ComputeCredentials { + info: ComputeUserInfo { + user: conn_info.user_info.user.clone(), + endpoint: EndpointId::from(format!("{}-local-proxy", conn_info.user_info.endpoint)), + options: conn_info.user_info.options.clone(), + }, + keys: crate::auth::backend::ComputeCredentialKeys::None, + }); crate::proxy::connect_compute::connect_to_compute( ctx, &HyperMechanism { @@ -269,57 +253,65 @@ impl PoolingBackend { tracing::Span::current().record("conn_id", display(conn_id)); info!(%conn_id, "local_pool: opening a new connection '{conn_info}'"); - let mut node_info = match &self.config.auth_backend { - auth::Backend::ControlPlane(_, ()) | auth::Backend::ConsoleRedirect(_, ()) => { + let mut node_info = match &self.auth_backend { + auth::Backend::ControlPlane(_, ()) => { unreachable!("only local_proxy can connect to local postgres") } auth::Backend::Local(local) => local.node_info.clone(), }; + let (key, jwk) = create_random_jwk(); + let config = node_info .config .user(&conn_info.user_info.user) - 
.dbname(&conn_info.dbname); + .dbname(&conn_info.dbname) + .options(&format!( + "-c pg_session_jwt.jwk={}", + serde_json::to_string(&jwk).expect("serializing jwk to json should not fail") + )); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (client, connection) = config.connect(tokio_postgres::NoTls).await?; drop(pause); - tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); + let pid = client.get_process_id(); + tracing::Span::current().record("pid", pid); - let handle = local_conn_pool::poll_client( + let mut handle = local_conn_pool::poll_client( self.local_pool.clone(), ctx, conn_info, client, connection, + key, conn_id, node_info.aux.clone(), ); - let kid = handle.get_client().get_process_id() as i64; - let jwk = p256::PublicKey::from(handle.key().verifying_key()).to_jwk(); + { + let (client, mut discard) = handle.inner(); + debug!("setting up backend session state"); - debug!(kid, ?jwk, "setting up backend session state"); + // initiates the auth session + if let Err(e) = client.query("select auth.init()", &[]).await { + discard.discard(); + return Err(e.into()); + } - // initiates the auth session - handle - .get_client() - .query( - "select auth.init($1, $2);", - &[ - &kid as &(dyn ToSql + Sync), - &tokio_postgres::types::Json(jwk), - ], - ) - .await?; - - info!(?kid, "backend session state init"); + info!("backend session state initialized"); + } Ok(handle) } } +fn create_random_jwk() -> (SigningKey, JwkEcKey) { + let key = SigningKey::random(&mut OsRng); + let jwk = p256::PublicKey::from(key.verifying_key()).to_jwk(); + (key, jwk) +} + #[derive(Debug, thiserror::Error)] pub(crate) enum HttpConnError { #[error("pooled connection closed at inconsistent state")] diff --git a/proxy/src/serverless/cancel_set.rs b/proxy/src/serverless/cancel_set.rs index 7659745473..6db986f1f7 100644 --- a/proxy/src/serverless/cancel_set.rs +++ b/proxy/src/serverless/cancel_set.rs @@ -1,10 +1,8 @@ //! 
A set for cancelling random http connections -use std::{ - hash::{BuildHasher, BuildHasherDefault}, - num::NonZeroUsize, - time::Duration, -}; +use std::hash::{BuildHasher, BuildHasherDefault}; +use std::num::NonZeroUsize; +use std::time::Duration; use indexmap::IndexMap; use parking_lot::Mutex; diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 2e576e0ded..aa869ff1c0 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,33 +1,31 @@ +use std::collections::HashMap; +use std::fmt; +use std::ops::Deref; +use std::pin::pin; +use std::sync::atomic::{self, AtomicUsize}; +use std::sync::{Arc, Weak}; +use std::task::{ready, Poll}; +use std::time::Duration; + use dashmap::DashMap; -use futures::{future::poll_fn, Future}; +use futures::future::poll_fn; +use futures::Future; use parking_lot::RwLock; use rand::Rng; use smallvec::SmallVec; -use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration}; -use std::{ - fmt, - task::{ready, Poll}, -}; -use std::{ - ops::Deref, - sync::atomic::{self, AtomicUsize}, -}; use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; use tokio_util::sync::CancellationToken; +use tracing::{debug, error, info, info_span, warn, Instrument, Span}; +use super::backend::HttpConnError; +use crate::auth::backend::ComputeUserInfo; +use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -use crate::{ - auth::backend::ComputeUserInfo, context::RequestMonitoring, DbName, EndpointCacheKey, RoleName, -}; - -use tracing::{debug, error, warn, Span}; -use tracing::{info, info_span, Instrument}; - -use super::backend::HttpConnError; +use crate::{DbName, EndpointCacheKey, RoleName}; #[derive(Debug, Clone)] pub(crate) struct 
ConnInfoWithAuth { @@ -724,13 +722,13 @@ impl Drop for Client { #[cfg(test)] mod tests { - use std::{mem, sync::atomic::AtomicBool}; - - use crate::{ - proxy::NeonOptions, serverless::cancel_set::CancelSet, BranchId, EndpointId, ProjectId, - }; + use std::mem; + use std::sync::atomic::AtomicBool; use super::*; + use crate::proxy::NeonOptions; + use crate::serverless::cancel_set::CancelSet; + use crate::{BranchId, EndpointId, ProjectId}; struct MockClient(Arc); impl MockClient { diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index 6d61536f1a..9b6bc98557 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -1,22 +1,21 @@ +use std::collections::VecDeque; +use std::sync::atomic::{self, AtomicUsize}; +use std::sync::{Arc, Weak}; + use dashmap::DashMap; use hyper::client::conn::http2; use hyper_util::rt::{TokioExecutor, TokioIo}; use parking_lot::RwLock; use rand::Rng; -use std::collections::VecDeque; -use std::sync::atomic::{self, AtomicUsize}; -use std::{sync::Arc, sync::Weak}; use tokio::net::TcpStream; +use tracing::{debug, error, info, info_span, Instrument}; +use super::conn_pool::ConnInfo; +use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -use crate::{context::RequestMonitoring, EndpointCacheKey}; - -use tracing::{debug, error}; -use tracing::{info, info_span, Instrument}; - -use super::conn_pool::ConnInfo; +use crate::EndpointCacheKey; pub(crate) type Send = http2::SendRequest; pub(crate) type Connect = diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs index 87a72ec5f0..c0208d4f68 100644 --- a/proxy/src/serverless/http_util.rs +++ b/proxy/src/serverless/http_util.rs @@ -1,12 +1,11 @@ //! Things stolen from `libs/utils/src/http` to add hyper 1.0 compatibility //! 
Will merge back in at some point in the future. -use bytes::Bytes; - use anyhow::Context; +use bytes::Bytes; use http::{Response, StatusCode}; -use http_body_util::{combinators::BoxBody, BodyExt, Full}; - +use http_body_util::combinators::BoxBody; +use http_body_util::{BodyExt, Full}; use serde::Serialize; use utils::http::error::ApiError; @@ -41,6 +40,10 @@ pub(crate) fn api_error_into_response(this: ApiError) -> Response HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::TOO_MANY_REQUESTS, + ), ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::REQUEST_TIMEOUT, diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index 9f328a0e1d..8c56d317cc 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -1,7 +1,5 @@ -use serde_json::Map; -use serde_json::Value; -use tokio_postgres::types::Kind; -use tokio_postgres::types::Type; +use serde_json::{Map, Value}; +use tokio_postgres::types::{Kind, Type}; use tokio_postgres::Row; // @@ -256,9 +254,10 @@ fn _pg_array_parse( #[cfg(test)] mod tests { - use super::*; use serde_json::json; + use super::*; + #[test] fn test_atomic_types_to_pg_params() { let json = vec![Value::Bool(true), Value::Bool(false)]; diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 1dde5952e1..5df37a8762 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -1,29 +1,31 @@ -use futures::{future::poll_fn, Future}; +use std::collections::HashMap; +use std::pin::pin; +use std::sync::{Arc, Weak}; +use std::task::{ready, Poll}; +use std::time::Duration; + +use futures::future::poll_fn; +use futures::Future; +use indexmap::IndexMap; use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; use p256::ecdsa::{Signature, SigningKey}; use parking_lot::RwLock; -use rand::rngs::OsRng; -use serde_json::Value; +use serde_json::value::RawValue; use 
signature::Signer; -use std::task::{ready, Poll}; -use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration}; use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; use tokio_postgres::types::ToSql; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; use tokio_util::sync::CancellationToken; -use typed_json::json; - -use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; -use crate::metrics::Metrics; -use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -use crate::{context::RequestMonitoring, DbName, RoleName}; - -use tracing::{debug, error, warn, Span}; -use tracing::{info, info_span, Instrument}; +use tracing::{error, info, info_span, warn, Instrument, Span}; use super::backend::HttpConnError; use super::conn_pool::{ClientInnerExt, ConnInfo}; +use crate::context::RequestMonitoring; +use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; +use crate::metrics::Metrics; +use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; +use crate::{DbName, RoleName}; struct ConnPoolEntry { conn: ClientInner, @@ -245,12 +247,14 @@ impl LocalConnPool { } } +#[allow(clippy::too_many_arguments)] pub(crate) fn poll_client( global_pool: Arc>, ctx: &RequestMonitoring, conn_info: ConnInfo, client: tokio_postgres::Client, mut connection: tokio_postgres::Connection, + key: SigningKey, conn_id: uuid::Uuid, aux: MetricsAuxInfo, ) -> LocalClient { @@ -346,8 +350,6 @@ pub(crate) fn poll_client( } .instrument(span)); - let key = SigningKey::random(&mut OsRng); - let inner = ClientInner { inner: client, session: tx, @@ -430,13 +432,6 @@ impl LocalClient { let inner = inner.as_mut().expect("client inner should not be removed"); (&mut inner.inner, Discard { conn_info, pool }) } - pub(crate) fn key(&self) -> &SigningKey { - let inner = &self - .inner - .as_ref() - .expect("client inner should not be removed"); - &inner.key - } } impl LocalClient { @@ -445,25 +440,9 @@ impl LocalClient { .inner .as_mut() 
.expect("client inner should not be removed"); + inner.jti += 1; - - let kid = inner.inner.get_process_id(); - let header = json!({"kid":kid}).to_string(); - - let mut payload = serde_json::from_slice::>(payload) - .map_err(HttpConnError::JwtPayloadError)?; - payload.insert("jti".to_string(), Value::Number(inner.jti.into())); - let payload = Value::Object(payload).to_string(); - - debug!( - kid, - jti = inner.jti, - ?header, - ?payload, - "signing new ephemeral JWT" - ); - - let token = sign_jwt(&inner.key, header, payload); + let token = resign_jwt(&inner.key, payload, inner.jti)?; // initiates the auth session inner.inner.simple_query("discard all").await?; @@ -475,20 +454,74 @@ impl LocalClient { ) .await?; - info!(kid, jti = inner.jti, "user session state init"); + let pid = inner.inner.get_process_id(); + info!(pid, jti = inner.jti, "user session state init"); Ok(()) } } -fn sign_jwt(sk: &SigningKey, header: String, payload: String) -> String { - let header = Base64UrlUnpadded::encode_string(header.as_bytes()); - let payload = Base64UrlUnpadded::encode_string(payload.as_bytes()); +/// implements relatively efficient in-place json object key upserting +/// +/// only supports top-level keys +fn upsert_json_object( + payload: &[u8], + key: &str, + value: &RawValue, +) -> Result { + let mut payload = serde_json::from_slice::>(payload)?; + payload.insert(key, value); + serde_json::to_string(&payload) +} - let message = format!("{header}.{payload}"); - let sig: Signature = sk.sign(message.as_bytes()); - let base64_sig = Base64UrlUnpadded::encode_string(&sig.to_bytes()); - format!("{message}.{base64_sig}") +fn resign_jwt(sk: &SigningKey, payload: &[u8], jti: u64) -> Result { + let mut buffer = itoa::Buffer::new(); + + // encode the jti integer to a json rawvalue + let jti = serde_json::from_str::<&RawValue>(buffer.format(jti)).unwrap(); + + // update the jti in-place + let payload = + upsert_json_object(payload, "jti", jti).map_err(HttpConnError::JwtPayloadError)?; + 
+ // sign the jwt + let token = sign_jwt(sk, payload.as_bytes()); + + Ok(token) +} + +fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String { + let header_len = 20; + let payload_len = Base64UrlUnpadded::encoded_len(payload); + let signature_len = Base64UrlUnpadded::encoded_len(&[0; 64]); + let total_len = header_len + payload_len + signature_len + 2; + + let mut jwt = String::with_capacity(total_len); + let cap = jwt.capacity(); + + // we only need an empty header with the alg specified. + // base64url(r#"{"alg":"ES256"}"#) == "eyJhbGciOiJFUzI1NiJ9" + jwt.push_str("eyJhbGciOiJFUzI1NiJ9."); + + // encode the jwt payload in-place + base64::encode_config_buf(payload, base64::URL_SAFE_NO_PAD, &mut jwt); + + // create the signature from the encoded header || payload + let sig: Signature = sk.sign(jwt.as_bytes()); + + jwt.push('.'); + + // encode the jwt signature in-place + base64::encode_config_buf(sig.to_bytes(), base64::URL_SAFE_NO_PAD, &mut jwt); + + debug_assert_eq!( + jwt.len(), + total_len, + "the jwt len should match our expected len" + ); + debug_assert_eq!(jwt.capacity(), cap, "the jwt capacity should not change"); + + jwt } impl Discard<'_, C> { @@ -509,14 +542,6 @@ impl Discard<'_, C> { } impl LocalClient { - pub fn get_client(&self) -> &C { - &self - .inner - .as_ref() - .expect("client inner should not be removed") - .inner - } - fn do_drop(&mut self) -> Option { let conn_info = self.conn_info.clone(); let client = self @@ -542,3 +567,30 @@ impl Drop for LocalClient { } } } + +#[cfg(test)] +mod tests { + use p256::ecdsa::SigningKey; + use typed_json::json; + + use super::resign_jwt; + + #[test] + fn jwt_token_snapshot() { + let key = SigningKey::from_bytes(&[1; 32].into()).unwrap(); + let data = + json!({"foo":"bar","jti":"foo\nbar","nested":{"jti":"tricky nesting"}}).to_string(); + + let jwt = resign_jwt(&key, data.as_bytes(), 2).unwrap(); + + // To validate the JWT, copy the JWT string and paste it into https://jwt.io/. 
+ // In the public-key box, paste the following jwk public key + // `{"kty":"EC","crv":"P-256","x":"b_A7lJJBzh2t1DUZ5pYOCoW0GmmgXDKBA6orzhWUyhY","y":"PE91OlW_AdxT9sCwx-7ni0DG_30lqW4igrmJzvccFEo"}` + + // let pub_key = p256::ecdsa::VerifyingKey::from(&key); + // let pub_key = p256::PublicKey::from(pub_key); + // println!("{}", pub_key.to_jwk_string()); + + assert_eq!(jwt, "eyJhbGciOiJFUzI1NiJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.pYf0LxoJ8sDgpmsYOgrbNecOSipnPBEGwnZzB-JhW2cONrKlqRsgXwK8_cOsyolGy-hTTe8GXbWTl_UdpF5RyA"); + } +} diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 9be6b592bd..3ed3b6c845 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -12,12 +12,15 @@ mod local_conn_pool; mod sql_over_http; mod websocket; +use std::net::{IpAddr, SocketAddr}; +use std::pin::{pin, Pin}; +use std::sync::Arc; + +use anyhow::Context; use async_trait::async_trait; use atomic_take::AtomicTake; use bytes::Bytes; pub use conn_pool::GlobalConnPoolOptions; - -use anyhow::Context; use futures::future::{select, Either}; use futures::TryFutureExt; use http::{Method, Response, StatusCode}; @@ -29,9 +32,13 @@ use hyper_util::server::conn::auto::Builder; use rand::rngs::StdRng; use rand::SeedableRng; use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::net::{TcpListener, TcpStream}; use tokio::time::timeout; use tokio_rustls::TlsAcceptor; +use tokio_util::sync::CancellationToken; use tokio_util::task::TaskTracker; +use tracing::{info, warn, Instrument}; +use utils::http::error::ApiError; use crate::cancellation::CancellationHandlerMain; use crate::config::ProxyConfig; @@ -43,18 +50,11 @@ use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; use crate::serverless::http_util::{api_error_into_response, json_response}; -use std::net::{IpAddr, SocketAddr}; -use std::pin::{pin, Pin}; -use std::sync::Arc; -use tokio::net::{TcpListener, TcpStream}; -use 
tokio_util::sync::CancellationToken; -use tracing::{error, info, warn, Instrument}; -use utils::http::error::ApiError; - pub(crate) const SERVERLESS_DRIVER_SNI: &str = "api"; pub async fn task_main( config: &'static ProxyConfig, + auth_backend: &'static crate::auth::Backend<'static, ()>, ws_listener: TcpListener, cancellation_token: CancellationToken, cancellation_handler: Arc, @@ -110,6 +110,7 @@ pub async fn task_main( local_pool, pool: Arc::clone(&conn_pool), config, + auth_backend, endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter), }); let tls_acceptor: Arc = match config.tls_config.as_ref() { @@ -241,7 +242,7 @@ async fn connection_startup( let (conn, peer) = match read_proxy_protocol(conn).await { Ok(c) => c, Err(e) => { - tracing::error!(?session_id, %peer_addr, "failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}"); + tracing::warn!(?session_id, %peer_addr, "failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}"); return None; } }; @@ -397,6 +398,7 @@ async fn request_handler( async move { if let Err(e) = websocket::serve_websocket( config, + backend.auth_backend, ctx, websocket, cancellation_handler, @@ -405,7 +407,7 @@ async fn request_handler( ) .await { - error!("error in websocket connection: {e:#}"); + warn!("error in websocket connection: {e:#}"); } } .instrument(span), diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index f7c3b26917..3d8a2adef1 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -2,76 +2,43 @@ use std::pin::pin; use std::sync::Arc; use bytes::Bytes; -use futures::future::select; -use futures::future::try_join; -use futures::future::Either; -use futures::StreamExt; -use futures::TryFutureExt; +use futures::future::{select, try_join, Either}; +use futures::{StreamExt, TryFutureExt}; use http::header::AUTHORIZATION; use http::Method; use http_body_util::combinators::BoxBody; -use 
http_body_util::BodyExt; -use http_body_util::Full; -use hyper::body::Body; -use hyper::body::Incoming; -use hyper::header; -use hyper::http::HeaderName; -use hyper::http::HeaderValue; -use hyper::Response; -use hyper::StatusCode; -use hyper::{HeaderMap, Request}; +use http_body_util::{BodyExt, Full}; +use hyper::body::{Body, Incoming}; +use hyper::http::{HeaderName, HeaderValue}; +use hyper::{header, HeaderMap, Request, Response, StatusCode}; use pq_proto::StartupMessageParamsBuilder; use serde::Serialize; use serde_json::Value; use tokio::time; -use tokio_postgres::error::DbError; -use tokio_postgres::error::ErrorPosition; -use tokio_postgres::error::SqlState; -use tokio_postgres::GenericClient; -use tokio_postgres::IsolationLevel; -use tokio_postgres::NoTls; -use tokio_postgres::ReadyForQueryStatus; -use tokio_postgres::Transaction; +use tokio_postgres::error::{DbError, ErrorPosition, SqlState}; +use tokio_postgres::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction}; use tokio_util::sync::CancellationToken; -use tracing::error; -use tracing::info; +use tracing::{error, info}; use typed_json::json; use url::Url; use urlencoding; use utils::http::error::ApiError; -use crate::auth::backend::ComputeCredentialKeys; -use crate::auth::backend::ComputeUserInfo; -use crate::auth::endpoint_sni; -use crate::auth::ComputeUserInfoParseError; -use crate::config::AuthenticationConfig; -use crate::config::ProxyConfig; -use crate::config::TlsConfig; -use crate::context::RequestMonitoring; -use crate::error::ErrorKind; -use crate::error::ReportableError; -use crate::error::UserFacingError; -use crate::metrics::HttpDirection; -use crate::metrics::Metrics; -use crate::proxy::run_until_cancelled; -use crate::proxy::NeonOptions; -use crate::serverless::backend::HttpConnError; -use crate::usage_metrics::MetricCounter; -use crate::usage_metrics::MetricCounterRecorder; -use crate::DbName; -use crate::RoleName; - -use super::backend::LocalProxyConnError; -use 
super::backend::PoolingBackend; -use super::conn_pool; -use super::conn_pool::AuthData; -use super::conn_pool::ConnInfo; -use super::conn_pool::ConnInfoWithAuth; +use super::backend::{LocalProxyConnError, PoolingBackend}; +use super::conn_pool::{AuthData, ConnInfo, ConnInfoWithAuth}; use super::http_util::json_response; -use super::json::json_to_pg_text; -use super::json::pg_text_row_to_json; -use super::json::JsonConversionError; -use super::local_conn_pool; +use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError}; +use super::{conn_pool, local_conn_pool}; +use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; +use crate::auth::{endpoint_sni, ComputeUserInfoParseError}; +use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig}; +use crate::context::RequestMonitoring; +use crate::error::{ErrorKind, ReportableError, UserFacingError}; +use crate::metrics::{HttpDirection, Metrics}; +use crate::proxy::{run_until_cancelled, NeonOptions}; +use crate::serverless::backend::HttpConnError; +use crate::usage_metrics::{MetricCounter, MetricCounterRecorder}; +use crate::{DbName, RoleName}; #[derive(serde::Deserialize)] #[serde(rename_all = "camelCase")] @@ -554,7 +521,7 @@ async fn handle_inner( match conn_info.auth { AuthData::Jwt(jwt) if config.authentication_config.is_auth_broker => { - handle_auth_broker_inner(config, ctx, request, conn_info.conn_info, jwt, backend).await + handle_auth_broker_inner(ctx, request, conn_info.conn_info, jwt, backend).await } auth => { handle_db_inner( @@ -622,28 +589,17 @@ async fn handle_db_inner( let authenticate_and_connect = Box::pin( async { - let is_local_proxy = - matches!(backend.config.auth_backend, crate::auth::Backend::Local(_)); + let is_local_proxy = matches!(backend.auth_backend, crate::auth::Backend::Local(_)); let keys = match auth { AuthData::Password(pw) => { backend - .authenticate_with_password( - ctx, - &config.authentication_config, - &conn_info.user_info, - &pw, - ) 
+ .authenticate_with_password(ctx, &conn_info.user_info, &pw) .await? } AuthData::Jwt(jwt) => { backend - .authenticate_with_jwt( - ctx, - &config.authentication_config, - &conn_info.user_info, - jwt, - ) + .authenticate_with_jwt(ctx, &conn_info.user_info, jwt) .await? } }; @@ -691,7 +647,7 @@ async fn handle_db_inner( // Now execute the query and return the result. let json_output = match payload { Payload::Single(stmt) => { - stmt.process(config, cancel, &mut client, parsed_headers) + stmt.process(&config.http_config, cancel, &mut client, parsed_headers) .await? } Payload::Batch(statements) => { @@ -709,7 +665,7 @@ async fn handle_db_inner( } statements - .process(config, cancel, &mut client, parsed_headers) + .process(&config.http_config, cancel, &mut client, parsed_headers) .await? } }; @@ -749,7 +705,6 @@ static HEADERS_TO_FORWARD: &[&HeaderName] = &[ ]; async fn handle_auth_broker_inner( - config: &'static ProxyConfig, ctx: &RequestMonitoring, request: Request, conn_info: ConnInfo, @@ -757,12 +712,7 @@ async fn handle_auth_broker_inner( backend: Arc, ) -> Result>, SqlOverHttpError> { backend - .authenticate_with_jwt( - ctx, - &config.authentication_config, - &conn_info.user_info, - jwt, - ) + .authenticate_with_jwt(ctx, &conn_info.user_info, jwt) .await .map_err(HttpConnError::from)?; @@ -800,7 +750,7 @@ async fn handle_auth_broker_inner( impl QueryData { async fn process( self, - config: &'static ProxyConfig, + config: &'static HttpConfig, cancel: CancellationToken, client: &mut Client, parsed_headers: HttpHeaders, @@ -831,7 +781,7 @@ impl QueryData { Either::Right((_cancelled, query)) => { tracing::info!("cancelling query"); if let Err(err) = cancel_token.cancel_query(NoTls).await { - tracing::error!(?err, "could not cancel query"); + tracing::warn!(?err, "could not cancel query"); } // wait for the query cancellation match time::timeout(time::Duration::from_millis(100), query).await { @@ -874,7 +824,7 @@ impl QueryData { impl BatchQueryData { async fn 
process( self, - config: &'static ProxyConfig, + config: &'static HttpConfig, cancel: CancellationToken, client: &mut Client, parsed_headers: HttpHeaders, @@ -920,7 +870,7 @@ impl BatchQueryData { } Err(SqlOverHttpError::Cancelled(_)) => { if let Err(err) = cancel_token.cancel_query(NoTls).await { - tracing::error!(?err, "could not cancel query"); + tracing::warn!(?err, "could not cancel query"); } // TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe. discard.discard(); @@ -944,7 +894,7 @@ impl BatchQueryData { } async fn query_batch( - config: &'static ProxyConfig, + config: &'static HttpConfig, cancel: CancellationToken, transaction: &Transaction<'_>, queries: BatchQueryData, @@ -983,7 +933,7 @@ async fn query_batch( } async fn query_to_json( - config: &'static ProxyConfig, + config: &'static HttpConfig, client: &T, data: QueryData, current_size: &mut usize, @@ -1004,9 +954,9 @@ async fn query_to_json( rows.push(row); // we don't have a streaming response support yet so this is to prevent OOM // from a malicious query (eg a cross join) - if *current_size > config.http_config.max_response_size_bytes { + if *current_size > config.max_response_size_bytes { return Err(SqlOverHttpError::ResponseTooLarge( - config.http_config.max_response_size_bytes, + config.max_response_size_bytes, )); } } diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 08d5da9bef..ba36116c2c 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -1,13 +1,7 @@ -use crate::proxy::ErrorSource; -use crate::{ - cancellation::CancellationHandlerMain, - config::ProxyConfig, - context::RequestMonitoring, - error::{io_error, ReportableError}, - metrics::Metrics, - proxy::{handle_client, ClientMode}, - rate_limiter::EndpointRateLimiter, -}; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{ready, Context, Poll}; + use anyhow::Context as _; use bytes::{Buf, BufMut, Bytes, BytesMut}; 
use framed_websockets::{Frame, OpCode, WebSocketServer}; @@ -15,15 +9,17 @@ use futures::{Sink, Stream}; use hyper::upgrade::OnUpgrade; use hyper_util::rt::TokioIo; use pin_project_lite::pin_project; - -use std::{ - pin::Pin, - sync::Arc, - task::{ready, Context, Poll}, -}; use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf}; use tracing::warn; +use crate::cancellation::CancellationHandlerMain; +use crate::config::ProxyConfig; +use crate::context::RequestMonitoring; +use crate::error::{io_error, ReportableError}; +use crate::metrics::Metrics; +use crate::proxy::{handle_client, ClientMode, ErrorSource}; +use crate::rate_limiter::EndpointRateLimiter; + pin_project! { /// This is a wrapper around a [`WebSocketStream`] that /// implements [`AsyncRead`] and [`AsyncWrite`]. @@ -129,6 +125,7 @@ impl AsyncBufRead for WebSocketRw { pub(crate) async fn serve_websocket( config: &'static ProxyConfig, + auth_backend: &'static crate::auth::Backend<'static, ()>, ctx: RequestMonitoring, websocket: OnUpgrade, cancellation_handler: Arc, @@ -145,6 +142,7 @@ pub(crate) async fn serve_websocket( let res = Box::pin(handle_client( config, + auth_backend, &ctx, cancellation_handler, WebSocketRw::new(websocket), @@ -182,14 +180,11 @@ mod tests { use framed_websockets::WebSocketServer; use futures::{SinkExt, StreamExt}; - use tokio::{ - io::{duplex, AsyncReadExt, AsyncWriteExt}, - task::JoinSet, - }; - use tokio_tungstenite::{ - tungstenite::{protocol::Role, Message}, - WebSocketStream, - }; + use tokio::io::{duplex, AsyncReadExt, AsyncWriteExt}; + use tokio::task::JoinSet; + use tokio_tungstenite::tungstenite::protocol::Role; + use tokio_tungstenite::tungstenite::Message; + use tokio_tungstenite::WebSocketStream; use super::WebSocketRw; diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index e2fc73235e..89df48c5d3 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,19 +1,20 @@ -use crate::config::TlsServerEndPoint; -use crate::error::{ErrorKind, 
ReportableError, UserFacingError}; -use crate::metrics::Metrics; -use bytes::BytesMut; - -use pq_proto::framed::{ConnectionError, Framed}; -use pq_proto::{BeMessage, FeMessage, FeStartupPacket, ProtocolError}; -use rustls::ServerConfig; use std::pin::Pin; use std::sync::Arc; use std::{io, task}; + +use bytes::BytesMut; +use pq_proto::framed::{ConnectionError, Framed}; +use pq_proto::{BeMessage, FeMessage, FeStartupPacket, ProtocolError}; +use rustls::ServerConfig; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; use tokio_rustls::server::TlsStream; use tracing::debug; +use crate::config::TlsServerEndPoint; +use crate::error::{ErrorKind, ReportableError, UserFacingError}; +use crate::metrics::Metrics; + /// Stream wrapper which implements libpq's protocol. /// /// NOTE: This object deliberately doesn't implement [`AsyncRead`] diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index bd3e62bc12..c5384c0b0e 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -1,36 +1,33 @@ //! Periodically collect proxy consumption metrics //! and push them to a HTTP endpoint. 
-use crate::{ - config::{MetricBackupCollectionConfig, MetricCollectionConfig}, - context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}, - http, - intern::{BranchIdInt, EndpointIdInt}, -}; +use std::convert::Infallible; +use std::pin::pin; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; +use std::sync::Arc; +use std::time::Duration; + use anyhow::Context; use async_compression::tokio::write::GzipEncoder; use bytes::Bytes; use chrono::{DateTime, Datelike, Timelike, Utc}; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; -use dashmap::{mapref::entry::Entry, DashMap}; +use dashmap::mapref::entry::Entry; +use dashmap::DashMap; use futures::future::select; use once_cell::sync::Lazy; use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use serde::{Deserialize, Serialize}; -use std::{ - convert::Infallible, - pin::pin, - sync::{ - atomic::{AtomicU64, AtomicUsize, Ordering}, - Arc, - }, - time::Duration, -}; use tokio::io::AsyncWriteExt; use tokio_util::sync::CancellationToken; -use tracing::{error, info, instrument, trace}; +use tracing::{error, info, instrument, trace, warn}; use utils::backoff; use uuid::{NoContext, Timestamp}; +use crate::config::{MetricBackupCollectionConfig, MetricCollectionConfig}; +use crate::context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}; +use crate::http; +use crate::intern::{BranchIdInt, EndpointIdInt}; + const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; const HTTP_REPORTING_REQUEST_TIMEOUT: Duration = Duration::from_secs(10); @@ -346,7 +343,7 @@ async fn collect_metrics_iteration( error!("metrics endpoint refused the sent metrics: {:?}", res); for metric in chunk.events.iter().filter(|e| e.value > (1u64 << 40)) { // Report if the metric value is suspiciously large - error!("potentially abnormal metric value: {:?}", metric); + warn!("potentially abnormal metric value: {:?}", metric); } } } @@ -485,19 +482,23 
@@ async fn upload_events_chunk( #[cfg(test)] mod tests { - use super::*; + use std::sync::{Arc, Mutex}; - use crate::{http, BranchId, EndpointId}; use anyhow::Error; use chrono::Utc; use consumption_metrics::{Event, EventChunk}; use http_body_util::BodyExt; - use hyper::{body::Incoming, server::conn::http1, service::service_fn, Request, Response}; + use hyper::body::Incoming; + use hyper::server::conn::http1; + use hyper::service::service_fn; + use hyper::{Request, Response}; use hyper_util::rt::TokioIo; - use std::sync::{Arc, Mutex}; use tokio::net::TcpListener; use url::Url; + use super::*; + use crate::{http, BranchId, EndpointId}; + #[tokio::test] async fn metrics() { type Report = EventChunk<'static, Event>; diff --git a/proxy/src/waiters.rs b/proxy/src/waiters.rs index 86d0f9e8b2..7e07f6a2af 100644 --- a/proxy/src/waiters.rs +++ b/proxy/src/waiters.rs @@ -1,8 +1,9 @@ +use std::pin::Pin; +use std::task; + use hashbrown::HashMap; use parking_lot::Mutex; use pin_project_lite::pin_project; -use std::pin::Pin; -use std::task; use thiserror::Error; use tokio::sync::oneshot; @@ -99,9 +100,10 @@ impl std::future::Future for Waiter<'_, T> { #[cfg(test)] mod tests { - use super::*; use std::sync::Arc; + use super::*; + #[tokio::test] async fn test_waiter() -> anyhow::Result<()> { let waiters = Arc::new(Waiters::default()); diff --git a/safekeeper/src/auth.rs b/safekeeper/src/auth.rs index c5c9393c00..fdd0830b02 100644 --- a/safekeeper/src/auth.rs +++ b/safekeeper/src/auth.rs @@ -15,15 +15,20 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< } Ok(()) } - (Scope::Admin | Scope::PageServerApi | Scope::GenerationsApi | Scope::Scrubber, _) => { - Err(AuthError( - format!( - "JWT scope '{:?}' is ineligible for Safekeeper auth", - claims.scope - ) - .into(), - )) - } + ( + Scope::Admin + | Scope::PageServerApi + | Scope::GenerationsApi + | Scope::Infra + | Scope::Scrubber, + _, + ) => Err(AuthError( + format!( + "JWT scope '{:?}' is ineligible for 
Safekeeper auth", + claims.scope + ) + .into(), + )), (Scope::SafekeeperData, _) => Ok(()), } } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 4dd8badd03..46b6f4f2bf 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -636,7 +636,7 @@ async fn handle_tenant_list( } async fn handle_node_register(req: Request) -> Result, ApiError> { - check_permissions(&req, Scope::Admin)?; + check_permissions(&req, Scope::Infra)?; let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -1182,7 +1182,7 @@ async fn handle_get_safekeeper(req: Request) -> Result, Api /// Assumes information is only relayed to storage controller after first selecting an unique id on /// control plane database, which means we have an id field in the request and payload. async fn handle_upsert_safekeeper(mut req: Request) -> Result, ApiError> { - check_permissions(&req, Scope::Admin)?; + check_permissions(&req, Scope::Infra)?; let body = json_request::(&mut req).await?; let id = parse_request_param::(&req, "id")?; diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index cc735dc27e..25e1fb5e1f 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -246,6 +246,11 @@ fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError { // storage controller's auth configuration. ApiError::InternalServerError(anyhow::anyhow!("{node} {status}: {msg}")) } + mgmt_api::Error::ApiError(status @ StatusCode::TOO_MANY_REQUESTS, msg) => { + // Pass through 429 errors: if pageserver is asking us to wait + retry, we in + // turn ask our clients to wait + retry + ApiError::Conflict(format!("{node} {status}: {status} {msg}")) + } mgmt_api::Error::ApiError(status, msg) => { // Presume general case of pageserver API errors is that we tried to do something // that can't be done right now. 
@@ -1069,8 +1074,9 @@ impl Service { /// the observed state of the tenant such that subsequent calls to [`TenantShard::get_reconcile_needed`] /// will indicate that reconciliation is not needed. #[instrument(skip_all, fields( - tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(), - sequence=%result.sequence + seq=%result.sequence, + tenant_id=%result.tenant_shard_id.tenant_id, + shard_id=%result.tenant_shard_id.shard_slug(), ))] fn process_result(&self, result: ReconcileResult) { let mut locked = self.inner.write().unwrap(); diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index c1ea589f7f..cb3299d413 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -317,9 +317,8 @@ pub async fn scan_pageserver_metadata( tenant_timeline_results.push((ttid, data)); } - let tenant_id = tenant_id.expect("Must be set if results are present"); - if !tenant_timeline_results.is_empty() { + let tenant_id = tenant_id.expect("Must be set if results are present"); analyze_tenant( &remote_client, tenant_id, diff --git a/test_runner/README.md b/test_runner/README.md index d754e60d17..e087241c1f 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -64,10 +64,12 @@ By default performance tests are excluded. To run them explicitly pass performan Useful environment variables: `NEON_BIN`: The directory where neon binaries can be found. +`COMPATIBILITY_NEON_BIN`: The directory where the previous version of Neon binaries can be found `POSTGRES_DISTRIB_DIR`: The directory where postgres distribution can be found. Since pageserver supports several postgres versions, `POSTGRES_DISTRIB_DIR` must contain a subdirectory for each version with naming convention `v{PG_VERSION}/`. Inside that dir, a `bin/postgres` binary should be present. 
+`COMPATIBILITY_POSTGRES_DISTRIB_DIR`: The directory where the previous version of postgres distribution can be found. `DEFAULT_PG_VERSION`: The version of Postgres to use, This is used to construct full path to the postgres binaries. Format is 2-digit major version nubmer, i.e. `DEFAULT_PG_VERSION=16` @@ -294,6 +296,16 @@ def test_foobar2(neon_env_builder: NeonEnvBuilder): client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id) ``` +All the tests which rely on NeonEnvBuilder can check the various version combinations of the components. +To do this you may want to add the parametrize decorator with the function fixtures.utils.allpairs_versions() +E.g. + +```python +@pytest.mark.parametrize(**fixtures.utils.allpairs_versions()) +def test_something( +... +``` + For more information about pytest fixtures, see https://docs.pytest.org/en/stable/fixture.html At the end of a test, all the nodes in the environment are automatically stopped, so you diff --git a/test_runner/conftest.py b/test_runner/conftest.py index d6e7fcf7ca..4a3194c691 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -6,6 +6,7 @@ pytest_plugins = ( "fixtures.httpserver", "fixtures.compute_reconfigure", "fixtures.storage_controller_proxy", + "fixtures.paths", "fixtures.neon_fixtures", "fixtures.benchmark_fixture", "fixtures.pg_stats", diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py index 683ea3af44..5934baccff 100644 --- a/test_runner/fixtures/neon_api.py +++ b/test_runner/fixtures/neon_api.py @@ -185,8 +185,8 @@ class NeonAPI: def get_connection_uri( self, project_id: str, - branch_id: str | None = None, - endpoint_id: str | None = None, + branch_id: Optional[str] = None, + endpoint_id: Optional[str] = None, database_name: str = "neondb", role_name: str = "neondb_owner", pooled: bool = True, @@ -262,7 +262,7 @@ class NeonAPI: class NeonApiEndpoint: - def __init__(self, neon_api: NeonAPI, pg_version: PgVersion, project_id: str | None): +
def __init__(self, neon_api: NeonAPI, pg_version: PgVersion, project_id: Optional[str]): self.neon_api = neon_api if project_id is None: project = neon_api.create_project(pg_version) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index f81bc3f5a6..a313ac2ed3 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -18,7 +18,6 @@ from contextlib import closing, contextmanager from dataclasses import dataclass from datetime import datetime from enum import Enum -from fcntl import LOCK_EX, LOCK_UN, flock from functools import cached_property from pathlib import Path from types import TracebackType @@ -59,6 +58,7 @@ from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( wait_for_last_record_lsn, ) +from fixtures.paths import get_test_repo_dir, shared_snapshot_dir from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import ( @@ -75,8 +75,8 @@ from fixtures.safekeeper.http import SafekeeperHttpClient from fixtures.safekeeper.utils import wait_walreceivers_absent from fixtures.utils import ( ATTACHMENT_NAME_REGEX, + COMPONENT_BINARIES, allure_add_grafana_links, - allure_attach_from_dir, assert_no_errors, get_dir_size, print_gc_result, @@ -96,6 +96,8 @@ if TYPE_CHECKING: Union, ) + from fixtures.paths import SnapshotDirLocked + T = TypeVar("T") @@ -118,65 +120,11 @@ put directly-importable functions into utils.py or another separate file. 
Env = dict[str, str] -DEFAULT_OUTPUT_DIR: str = "test_output" DEFAULT_BRANCH_NAME: str = "main" BASE_PORT: int = 15000 -@pytest.fixture(scope="session") -def base_dir() -> Iterator[Path]: - # find the base directory (currently this is the git root) - base_dir = Path(__file__).parents[2] - log.info(f"base_dir is {base_dir}") - - yield base_dir - - -@pytest.fixture(scope="function") -def neon_binpath(base_dir: Path, build_type: str) -> Iterator[Path]: - if os.getenv("REMOTE_ENV"): - # we are in remote env and do not have neon binaries locally - # this is the case for benchmarks run on self-hosted runner - return - - # Find the neon binaries. - if env_neon_bin := os.environ.get("NEON_BIN"): - binpath = Path(env_neon_bin) - else: - binpath = base_dir / "target" / build_type - log.info(f"neon_binpath is {binpath}") - - if not (binpath / "pageserver").exists(): - raise Exception(f"neon binaries not found at '{binpath}'") - - yield binpath - - -@pytest.fixture(scope="session") -def pg_distrib_dir(base_dir: Path) -> Iterator[Path]: - if env_postgres_bin := os.environ.get("POSTGRES_DISTRIB_DIR"): - distrib_dir = Path(env_postgres_bin).resolve() - else: - distrib_dir = base_dir / "pg_install" - - log.info(f"pg_distrib_dir is {distrib_dir}") - yield distrib_dir - - -@pytest.fixture(scope="session") -def top_output_dir(base_dir: Path) -> Iterator[Path]: - # Compute the top-level directory for all tests. 
- if env_test_output := os.environ.get("TEST_OUTPUT"): - output_dir = Path(env_test_output).resolve() - else: - output_dir = base_dir / DEFAULT_OUTPUT_DIR - output_dir.mkdir(exist_ok=True) - - log.info(f"top_output_dir is {output_dir}") - yield output_dir - - @pytest.fixture(scope="session") def neon_api_key() -> str: api_key = os.getenv("NEON_API_KEY") @@ -369,11 +317,14 @@ class NeonEnvBuilder: run_id: uuid.UUID, mock_s3_server: MockS3Server, neon_binpath: Path, + compatibility_neon_binpath: Path, pg_distrib_dir: Path, + compatibility_pg_distrib_dir: Path, pg_version: PgVersion, test_name: str, top_output_dir: Path, test_output_dir: Path, + combination, test_overlay_dir: Optional[Path] = None, pageserver_remote_storage: Optional[RemoteStorage] = None, # toml that will be decomposed into `--config-override` flags during `pageserver --init` @@ -455,6 +406,19 @@ class NeonEnvBuilder: "test_" ), "Unexpectedly instantiated from outside a test function" self.test_name = test_name + self.compatibility_neon_binpath = compatibility_neon_binpath + self.compatibility_pg_distrib_dir = compatibility_pg_distrib_dir + self.version_combination = combination + self.mixdir = self.test_output_dir / "mixdir_neon" + if self.version_combination is not None: + assert ( + self.compatibility_neon_binpath is not None + ), "the environment variable COMPATIBILITY_NEON_BIN is required when using mixed versions" + assert ( + self.compatibility_pg_distrib_dir is not None + ), "the environment variable COMPATIBILITY_POSTGRES_DISTRIB_DIR is required when using mixed versions" + self.mixdir.mkdir(mode=0o755, exist_ok=True) + self._mix_versions() def init_configs(self, default_remote_storage_if_missing: bool = True) -> NeonEnv: # Cannot create more than one environment from one builder @@ -655,6 +619,21 @@ class NeonEnvBuilder: return self.env + def _mix_versions(self): + assert self.version_combination is not None, "version combination must be set" + for component, paths in 
COMPONENT_BINARIES.items(): + directory = ( + self.neon_binpath + if self.version_combination[component] == "new" + else self.compatibility_neon_binpath + ) + for filename in paths: + destination = self.mixdir / filename + destination.symlink_to(directory / filename) + if self.version_combination["compute"] == "old": + self.pg_distrib_dir = self.compatibility_pg_distrib_dir + self.neon_binpath = self.mixdir + def overlay_mount(self, ident: str, srcdir: Path, dstdir: Path): """ Mount `srcdir` as an overlayfs mount at `dstdir`. @@ -1403,7 +1382,9 @@ def neon_simple_env( top_output_dir: Path, test_output_dir: Path, neon_binpath: Path, + compatibility_neon_binpath: Path, pg_distrib_dir: Path, + compatibility_pg_distrib_dir: Path, pg_version: PgVersion, pageserver_virtual_file_io_engine: str, pageserver_aux_file_policy: Optional[AuxFileStore], @@ -1418,6 +1399,11 @@ def neon_simple_env( # Create the environment in the per-test output directory repo_dir = get_test_repo_dir(request, top_output_dir) + combination = ( + request._pyfuncitem.callspec.params["combination"] + if "combination" in request._pyfuncitem.callspec.params + else None + ) with NeonEnvBuilder( top_output_dir=top_output_dir, @@ -1425,7 +1411,9 @@ def neon_simple_env( port_distributor=port_distributor, mock_s3_server=mock_s3_server, neon_binpath=neon_binpath, + compatibility_neon_binpath=compatibility_neon_binpath, pg_distrib_dir=pg_distrib_dir, + compatibility_pg_distrib_dir=compatibility_pg_distrib_dir, pg_version=pg_version, run_id=run_id, preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")), @@ -1435,6 +1423,7 @@ def neon_simple_env( pageserver_aux_file_policy=pageserver_aux_file_policy, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode, + combination=combination, ) as builder: env = builder.init_start() @@ -1448,7 +1437,9 @@ def neon_env_builder( 
port_distributor: PortDistributor, mock_s3_server: MockS3Server, neon_binpath: Path, + compatibility_neon_binpath: Path, pg_distrib_dir: Path, + compatibility_pg_distrib_dir: Path, pg_version: PgVersion, run_id: uuid.UUID, request: FixtureRequest, @@ -1475,6 +1466,11 @@ def neon_env_builder( # Create the environment in the test-specific output dir repo_dir = os.path.join(test_output_dir, "repo") + combination = ( + request._pyfuncitem.callspec.params["combination"] + if "combination" in request._pyfuncitem.callspec.params + else None + ) # Return the builder to the caller with NeonEnvBuilder( @@ -1483,7 +1479,10 @@ def neon_env_builder( port_distributor=port_distributor, mock_s3_server=mock_s3_server, neon_binpath=neon_binpath, + compatibility_neon_binpath=compatibility_neon_binpath, pg_distrib_dir=pg_distrib_dir, + compatibility_pg_distrib_dir=compatibility_pg_distrib_dir, + combination=combination, pg_version=pg_version, run_id=run_id, preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")), @@ -1987,11 +1986,11 @@ class NeonStorageController(MetricsGetter, LogUtils): log.info(f"reconcile_all waited for {n} shards") return n - def reconcile_until_idle(self, timeout_secs=30): + def reconcile_until_idle(self, timeout_secs=30, max_interval=5): start_at = time.time() n = 1 - delay_sec = 0.5 - delay_max = 5 + delay_sec = 0.1 + delay_max = max_interval while n > 0: n = self.reconcile_all() if n == 0: @@ -4246,44 +4245,6 @@ class StorageScrubber: raise -def _get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str) -> Path: - """Compute the path to a working directory for an individual test.""" - test_name = request.node.name - test_dir = top_output_dir / f"{prefix}{test_name.replace('/', '-')}" - - # We rerun flaky tests multiple times, use a separate directory for each run. 
- if (suffix := getattr(request.node, "execution_count", None)) is not None: - test_dir = test_dir.parent / f"{test_dir.name}-{suffix}" - - log.info(f"get_test_output_dir is {test_dir}") - # make mypy happy - assert isinstance(test_dir, Path) - return test_dir - - -def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: - """ - The working directory for a test. - """ - return _get_test_dir(request, top_output_dir, "") - - -def get_test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Path: - """ - Directory that contains `upperdir` and `workdir` for overlayfs mounts - that a test creates. See `NeonEnvBuilder.overlay_mount`. - """ - return _get_test_dir(request, top_output_dir, "overlay-") - - -def get_shared_snapshot_dir_path(top_output_dir: Path, snapshot_name: str) -> Path: - return top_output_dir / "shared-snapshots" / snapshot_name - - -def get_test_repo_dir(request: FixtureRequest, top_output_dir: Path) -> Path: - return get_test_output_dir(request, top_output_dir) / "repo" - - def pytest_addoption(parser: Parser): parser.addoption( "--preserve-database-files", @@ -4298,149 +4259,6 @@ SMALL_DB_FILE_NAME_REGEX: re.Pattern[str] = re.compile( ) -# This is autouse, so the test output directory always gets created, even -# if a test doesn't put anything there. 
-# -# NB: we request the overlay dir fixture so the fixture does its cleanups -@pytest.fixture(scope="function", autouse=True) -def test_output_dir( - request: FixtureRequest, top_output_dir: Path, test_overlay_dir: Path -) -> Iterator[Path]: - """Create the working directory for an individual test.""" - - # one directory per test - test_dir = get_test_output_dir(request, top_output_dir) - log.info(f"test_output_dir is {test_dir}") - shutil.rmtree(test_dir, ignore_errors=True) - test_dir.mkdir() - - yield test_dir - - # Allure artifacts creation might involve the creation of `.tar.zst` archives, - # which aren't going to be used if Allure results collection is not enabled - # (i.e. --alluredir is not set). - # Skip `allure_attach_from_dir` in this case - if not request.config.getoption("--alluredir"): - return - - preserve_database_files = False - for k, v in request.node.user_properties: - # NB: the neon_env_builder fixture uses this fixture (test_output_dir). - # So, neon_env_builder's cleanup runs before here. - # The cleanup propagates NeonEnvBuilder.preserve_database_files into this user property. - if k == "preserve_database_files": - assert isinstance(v, bool) - preserve_database_files = v - - allure_attach_from_dir(test_dir, preserve_database_files) - - -class FileAndThreadLock: - def __init__(self, path: Path): - self.path = path - self.thread_lock = threading.Lock() - self.fd: Optional[int] = None - - def __enter__(self): - self.fd = os.open(self.path, os.O_CREAT | os.O_WRONLY) - # lock thread lock before file lock so that there's no race - # around flocking / funlocking the file lock - self.thread_lock.acquire() - flock(self.fd, LOCK_EX) - - def __exit__(self, exc_type, exc_value, exc_traceback): - assert self.fd is not None - assert self.thread_lock.locked() # ... 
by us - flock(self.fd, LOCK_UN) - self.thread_lock.release() - os.close(self.fd) - self.fd = None - - -class SnapshotDirLocked: - def __init__(self, parent: SnapshotDir): - self._parent = parent - - def is_initialized(self): - # TODO: in the future, take a `tag` as argument and store it in the marker in set_initialized. - # Then, in this function, compare marker file contents with the tag to invalidate the snapshot if the tag changed. - return self._parent._marker_file_path.exists() - - def set_initialized(self): - self._parent._marker_file_path.write_text("") - - @property - def path(self) -> Path: - return self._parent._path / "snapshot" - - -class SnapshotDir: - _path: Path - - def __init__(self, path: Path): - self._path = path - assert self._path.is_dir() - self._lock = FileAndThreadLock(self._lock_file_path) - - @property - def _lock_file_path(self) -> Path: - return self._path / "initializing.flock" - - @property - def _marker_file_path(self) -> Path: - return self._path / "initialized.marker" - - def __enter__(self) -> SnapshotDirLocked: - self._lock.__enter__() - return SnapshotDirLocked(self) - - def __exit__(self, exc_type, exc_value, exc_traceback): - self._lock.__exit__(exc_type, exc_value, exc_traceback) - - -def shared_snapshot_dir(top_output_dir, ident: str) -> SnapshotDir: - snapshot_dir_path = get_shared_snapshot_dir_path(top_output_dir, ident) - snapshot_dir_path.mkdir(exist_ok=True, parents=True) - return SnapshotDir(snapshot_dir_path) - - -@pytest.fixture(scope="function") -def test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Optional[Path]: - """ - Idempotently create a test's overlayfs mount state directory. - If the functionality isn't enabled via env var, returns None. - - The procedure cleans up after previous runs that were aborted (e.g. due to Ctrl-C, OOM kills, etc). 
- """ - - if os.getenv("NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS") is None: - return None - - overlay_dir = get_test_overlay_dir(request, top_output_dir) - log.info(f"test_overlay_dir is {overlay_dir}") - - overlay_dir.mkdir(exist_ok=True) - # unmount stale overlayfs mounts which subdirectories of `overlay_dir/*` as the overlayfs `upperdir` and `workdir` - for mountpoint in overlayfs.iter_mounts_beneath(get_test_output_dir(request, top_output_dir)): - cmd = ["sudo", "umount", str(mountpoint)] - log.info( - f"Unmounting stale overlayfs mount probably created during earlier test run: {cmd}" - ) - subprocess.run(cmd, capture_output=True, check=True) - # the overlayfs `workdir`` is owned by `root`, shutil.rmtree won't work. - cmd = ["sudo", "rm", "-rf", str(overlay_dir)] - subprocess.run(cmd, capture_output=True, check=True) - - overlay_dir.mkdir() - - return overlay_dir - - # no need to clean up anything: on clean shutdown, - # NeonEnvBuilder.overlay_cleanup_teardown takes care of cleanup - # and on unclean shutdown, this function will take care of it - # on the next test run - - SKIP_DIRS = frozenset( ( "pg_wal", @@ -4462,6 +4280,7 @@ SKIP_FILES = frozenset( "postmaster.opts", "postmaster.pid", "pg_control", + "pg_dynshmem", ) ) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 84a7e5f0a2..18d65cb7de 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -583,6 +583,22 @@ class PageserverHttpClient(requests.Session, MetricsGetter): log.info(f"Got GC request response code: {res.status_code}") self.verbose_error(res) + def timeline_offload( + self, + tenant_id: Union[TenantId, TenantShardId], + timeline_id: TimelineId, + ): + self.is_testing_enabled_or_skip() + + log.info(f"Requesting offload: tenant {tenant_id}, timeline {timeline_id}") + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/offload", + ) + log.info(f"Got 
offload request response code: {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + def timeline_compact( self, tenant_id: Union[TenantId, TenantShardId], @@ -886,7 +902,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, - batch_size: int | None = None, + batch_size: Optional[int] = None, **kwargs, ) -> set[TimelineId]: params = {} diff --git a/test_runner/fixtures/paths.py b/test_runner/fixtures/paths.py new file mode 100644 index 0000000000..65f8e432b0 --- /dev/null +++ b/test_runner/fixtures/paths.py @@ -0,0 +1,312 @@ +from __future__ import annotations + +import os +import shutil +import subprocess +import threading +from fcntl import LOCK_EX, LOCK_UN, flock +from pathlib import Path +from types import TracebackType +from typing import TYPE_CHECKING + +import pytest +from pytest import FixtureRequest + +from fixtures import overlayfs +from fixtures.log_helper import log +from fixtures.utils import allure_attach_from_dir + +if TYPE_CHECKING: + from collections.abc import Iterator + from typing import Optional + + +DEFAULT_OUTPUT_DIR: str = "test_output" + + +def get_test_dir( + request: FixtureRequest, top_output_dir: Path, prefix: Optional[str] = None +) -> Path: + """Compute the path to a working directory for an individual test.""" + test_name = request.node.name + test_dir = top_output_dir / f"{prefix or ''}{test_name.replace('/', '-')}" + + # We rerun flaky tests multiple times, use a separate directory for each run. + if (suffix := getattr(request.node, "execution_count", None)) is not None: + test_dir = test_dir.parent / f"{test_dir.name}-{suffix}" + + return test_dir + + +def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: + """ + The working directory for a test. 
+ """ + return get_test_dir(request, top_output_dir) + + +def get_test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Path: + """ + Directory that contains `upperdir` and `workdir` for overlayfs mounts + that a test creates. See `NeonEnvBuilder.overlay_mount`. + """ + return get_test_dir(request, top_output_dir, "overlay-") + + +def get_shared_snapshot_dir_path(top_output_dir: Path, snapshot_name: str) -> Path: + return top_output_dir / "shared-snapshots" / snapshot_name + + +def get_test_repo_dir(request: FixtureRequest, top_output_dir: Path) -> Path: + return get_test_output_dir(request, top_output_dir) / "repo" + + +@pytest.fixture(scope="session") +def base_dir() -> Iterator[Path]: + # find the base directory (currently this is the git root) + base_dir = Path(__file__).parents[2] + log.info(f"base_dir is {base_dir}") + + yield base_dir + + +@pytest.fixture(scope="session") +def compute_config_dir(base_dir: Path) -> Iterator[Path]: + """ + Retrieve the path to the compute configuration directory. + """ + yield base_dir / "compute" / "etc" + + +@pytest.fixture(scope="function") +def neon_binpath(base_dir: Path, build_type: str) -> Iterator[Path]: + if os.getenv("REMOTE_ENV"): + # we are in remote env and do not have neon binaries locally + # this is the case for benchmarks run on self-hosted runner + return + + # Find the neon binaries. + if env_neon_bin := os.environ.get("NEON_BIN"): + binpath = Path(env_neon_bin) + else: + binpath = base_dir / "target" / build_type + log.info(f"neon_binpath is {binpath}") + + if not (binpath / "pageserver").exists(): + raise Exception(f"neon binaries not found at '{binpath}'") + + yield binpath.absolute() + + +@pytest.fixture(scope="session") +def compatibility_snapshot_dir() -> Iterator[Path]: + if os.getenv("REMOTE_ENV"): + return + compatibility_snapshot_dir_env = os.environ.get("COMPATIBILITY_SNAPSHOT_DIR") + assert ( + compatibility_snapshot_dir_env is not None + ), "COMPATIBILITY_SNAPSHOT_DIR is not set. 
It should be set to `compatibility_snapshot_pg(PG_VERSION)` path generateted by test_create_snapshot (ideally generated by the previous version of Neon)" + compatibility_snapshot_dir = Path(compatibility_snapshot_dir_env).resolve() + yield compatibility_snapshot_dir + + +@pytest.fixture(scope="session") +def compatibility_neon_binpath() -> Optional[Iterator[Path]]: + if os.getenv("REMOTE_ENV"): + return + comp_binpath = None + if env_compatibility_neon_binpath := os.environ.get("COMPATIBILITY_NEON_BIN"): + comp_binpath = Path(env_compatibility_neon_binpath).resolve().absolute() + yield comp_binpath + + +@pytest.fixture(scope="session") +def pg_distrib_dir(base_dir: Path) -> Iterator[Path]: + if env_postgres_bin := os.environ.get("POSTGRES_DISTRIB_DIR"): + distrib_dir = Path(env_postgres_bin).resolve() + else: + distrib_dir = base_dir / "pg_install" + + log.info(f"pg_distrib_dir is {distrib_dir}") + yield distrib_dir + + +@pytest.fixture(scope="session") +def compatibility_pg_distrib_dir() -> Optional[Iterator[Path]]: + compat_distrib_dir = None + if env_compat_postgres_bin := os.environ.get("COMPATIBILITY_POSTGRES_DISTRIB_DIR"): + compat_distrib_dir = Path(env_compat_postgres_bin).resolve() + if not compat_distrib_dir.exists(): + raise Exception(f"compatibility postgres directory not found at {compat_distrib_dir}") + + if compat_distrib_dir: + log.info(f"compatibility_pg_distrib_dir is {compat_distrib_dir}") + yield compat_distrib_dir + + +@pytest.fixture(scope="session") +def top_output_dir(base_dir: Path) -> Iterator[Path]: + # Compute the top-level directory for all tests. + if env_test_output := os.environ.get("TEST_OUTPUT"): + output_dir = Path(env_test_output).resolve() + else: + output_dir = base_dir / DEFAULT_OUTPUT_DIR + output_dir.mkdir(exist_ok=True) + + log.info(f"top_output_dir is {output_dir}") + yield output_dir + + +# This is autouse, so the test output directory always gets created, even +# if a test doesn't put anything there. 
+# +# NB: we request the overlay dir fixture so the fixture does its cleanups +@pytest.fixture(scope="function", autouse=True) +def test_output_dir(request: pytest.FixtureRequest, top_output_dir: Path) -> Iterator[Path]: + """Create the working directory for an individual test.""" + + # one directory per test + test_dir = get_test_output_dir(request, top_output_dir) + log.info(f"test_output_dir is {test_dir}") + shutil.rmtree(test_dir, ignore_errors=True) + test_dir.mkdir() + + yield test_dir + + # Allure artifacts creation might involve the creation of `.tar.zst` archives, + # which aren't going to be used if Allure results collection is not enabled + # (i.e. --alluredir is not set). + # Skip `allure_attach_from_dir` in this case + if not request.config.getoption("--alluredir"): + return + + preserve_database_files = False + for k, v in request.node.user_properties: + # NB: the neon_env_builder fixture uses this fixture (test_output_dir). + # So, neon_env_builder's cleanup runs before here. + # The cleanup propagates NeonEnvBuilder.preserve_database_files into this user property. + if k == "preserve_database_files": + assert isinstance(v, bool) + preserve_database_files = v + + allure_attach_from_dir(test_dir, preserve_database_files) + + +class FileAndThreadLock: + def __init__(self, path: Path): + self.path = path + self.thread_lock = threading.Lock() + self.fd: Optional[int] = None + + def __enter__(self): + self.fd = os.open(self.path, os.O_CREAT | os.O_WRONLY) + # lock thread lock before file lock so that there's no race + # around flocking / funlocking the file lock + self.thread_lock.acquire() + flock(self.fd, LOCK_EX) + + def __exit__( + self, + exc_type: Optional[type[BaseException]], + exc_value: Optional[BaseException], + exc_traceback: Optional[TracebackType], + ): + assert self.fd is not None + assert self.thread_lock.locked() # ... 
by us + flock(self.fd, LOCK_UN) + self.thread_lock.release() + os.close(self.fd) + self.fd = None + + +class SnapshotDirLocked: + def __init__(self, parent: SnapshotDir): + self._parent = parent + + def is_initialized(self): + # TODO: in the future, take a `tag` as argument and store it in the marker in set_initialized. + # Then, in this function, compare marker file contents with the tag to invalidate the snapshot if the tag changed. + return self._parent.marker_file_path.exists() + + def set_initialized(self): + self._parent.marker_file_path.write_text("") + + @property + def path(self) -> Path: + return self._parent.path / "snapshot" + + +class SnapshotDir: + _path: Path + + def __init__(self, path: Path): + self._path = path + assert self._path.is_dir() + self._lock = FileAndThreadLock(self.lock_file_path) + + @property + def path(self) -> Path: + return self._path + + @property + def lock_file_path(self) -> Path: + return self._path / "initializing.flock" + + @property + def marker_file_path(self) -> Path: + return self._path / "initialized.marker" + + def __enter__(self) -> SnapshotDirLocked: + self._lock.__enter__() + return SnapshotDirLocked(self) + + def __exit__( + self, + exc_type: Optional[type[BaseException]], + exc_value: Optional[BaseException], + exc_traceback: Optional[TracebackType], + ): + self._lock.__exit__(exc_type, exc_value, exc_traceback) + + +def shared_snapshot_dir(top_output_dir: Path, ident: str) -> SnapshotDir: + snapshot_dir_path = get_shared_snapshot_dir_path(top_output_dir, ident) + snapshot_dir_path.mkdir(exist_ok=True, parents=True) + return SnapshotDir(snapshot_dir_path) + + +@pytest.fixture(scope="function") +def test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Optional[Path]: + """ + Idempotently create a test's overlayfs mount state directory. + If the functionality isn't enabled via env var, returns None. + + The procedure cleans up after previous runs that were aborted (e.g. due to Ctrl-C, OOM kills, etc). 
+ """ + + if os.getenv("NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS") is None: + return None + + overlay_dir = get_test_overlay_dir(request, top_output_dir) + log.info(f"test_overlay_dir is {overlay_dir}") + + overlay_dir.mkdir(exist_ok=True) + # unmount stale overlayfs mounts which use subdirectories of `overlay_dir/*` as the overlayfs `upperdir` and `workdir` + for mountpoint in overlayfs.iter_mounts_beneath(get_test_output_dir(request, top_output_dir)): + cmd = ["sudo", "umount", str(mountpoint)] + log.info( + f"Unmounting stale overlayfs mount probably created during earlier test run: {cmd}" + ) + subprocess.run(cmd, capture_output=True, check=True) + # the overlayfs `workdir` is owned by `root`, shutil.rmtree won't work. + cmd = ["sudo", "rm", "-rf", str(overlay_dir)] + subprocess.run(cmd, capture_output=True, check=True) + + overlay_dir.mkdir() + + return overlay_dir + + # no need to clean up anything: on clean shutdown, + # NeonEnvBuilder.overlay_cleanup_teardown takes care of cleanup + # and on unclean shutdown, this function will take care of it + # on the next test run diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index ca1be35880..76575d330c 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -37,6 +37,23 @@ if TYPE_CHECKING: Fn = TypeVar("Fn", bound=Callable[..., Any]) +COMPONENT_BINARIES = { + "storage_controller": ("storage_controller",), + "storage_broker": ("storage_broker",), + "compute": ("compute_ctl",), + "safekeeper": ("safekeeper",), + "pageserver": ("pageserver", "pagectl"), +} +# Disable auto-formatting for better readability +# fmt: off +VERSIONS_COMBINATIONS = ( + {"storage_controller": "new", "storage_broker": "new", "compute": "new", "safekeeper": "new", "pageserver": "new"}, + {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "old", "pageserver": "old"}, + {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper":
"old", "pageserver": "new"}, + {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "new", "pageserver": "new"}, + {"storage_controller": "old", "storage_broker": "old", "compute": "new", "safekeeper": "new", "pageserver": "new"}, +) +# fmt: on def subprocess_capture( @@ -607,3 +624,19 @@ def human_bytes(amt: float) -> str: amt = amt / 1024 raise RuntimeError("unreachable") + + +def allpairs_versions(): + """ + Returns a dictionary with arguments for pytest parametrize + to test the compatibility with the previous version of Neon components + combinations were pre-computed to test all the pairs of the components with + the different versions. + """ + ids = [] + for pair in VERSIONS_COMBINATIONS: + cur_id = [] + for component in sorted(pair.keys()): + cur_id.append(pair[component][0]) + ids.append(f"combination_{''.join(cur_id)}") + return {"argnames": "combination", "argvalues": VERSIONS_COMBINATIONS, "ids": ids} diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 452a856714..d2eba751f8 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -4,9 +4,10 @@ import concurrent.futures import random import time from collections import defaultdict +from enum import Enum import pytest -from fixtures.common_types import TenantId, TenantShardId, TimelineId +from fixtures.common_types import TenantId, TenantShardId, TimelineArchivalState, TimelineId from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( @@ -34,6 +35,7 @@ def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[ if tenant_placement[tid]["intent"]["attached"] == tenant_placement[tid]["observed"]["attached"] } + assert len(matching) == total_shards attached_per_node: defaultdict[str, int] = defaultdict(int) @@ -107,15 
+109,48 @@ def test_storage_controller_many_tenants( ps.allowed_errors.append(".*request was dropped before completing.*") # Total tenants - tenant_count = 4000 + small_tenant_count = 7800 + large_tenant_count = 200 + tenant_count = small_tenant_count + large_tenant_count + large_tenant_shard_count = 8 + total_shards = small_tenant_count + large_tenant_count * large_tenant_shard_count - # Shards per tenant - shard_count = 2 - stripe_size = 1024 + # A small stripe size to encourage all shards to get some data + stripe_size = 1 - total_shards = tenant_count * shard_count + # We use a fixed seed to make the test somewhat reproducible: we want a randomly + # chosen order in the sense that it's arbitrary, but not in the sense that it should change every run. + rng = random.Random(1234) - tenants = set(TenantId.generate() for _i in range(0, tenant_count)) + class Tenant: + def __init__(self): + # Tenants may optionally contain a timeline + self.timeline_id = None + + # Tenants may be marked as 'large' to get multiple shards during creation phase + self.large = False + + tenant_ids = list(TenantId.generate() for _i in range(0, tenant_count)) + tenants = dict((tid, Tenant()) for tid in tenant_ids) + + # We will create timelines in only a subset of tenants, because creating timelines + # does many megabytes of IO, and we want to densely simulate huge tenant counts on + # a single test node.
+ tenant_timelines_count = 100 + + # These lists are maintained for use with rng.choice + tenants_with_timelines = list(rng.sample(tenants.keys(), tenant_timelines_count)) + tenants_without_timelines = list( + tenant_id for tenant_id in tenants if tenant_id not in tenants_with_timelines + ) + + # For our sharded tenants, we will make half of them with timelines and half without + assert large_tenant_count >= tenant_timelines_count / 2 + for tenant_id in tenants_with_timelines[0 : large_tenant_count // 2]: + tenants[tenant_id].large = True + + for tenant_id in tenants_without_timelines[0 : large_tenant_count // 2]: + tenants[tenant_id].large = True virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) @@ -125,23 +160,39 @@ def test_storage_controller_many_tenants( rss = env.storage_controller.get_metric_value("process_resident_memory_bytes") assert rss is not None - log.info(f"Resident memory: {rss} ({ rss / (shard_count * tenant_count)} per shard)") - assert rss < expect_memory_per_shard * shard_count * tenant_count - - # We use a fixed seed to make the test somewhat reproducible: we want a randomly - # chosen order in the sense that it's arbitrary, but not in the sense that it should change every run. - rng = random.Random(1234) + log.info(f"Resident memory: {rss} ({ rss / total_shards} per shard)") + assert rss < expect_memory_per_shard * total_shards # Issue more concurrent operations than the storage controller's reconciler concurrency semaphore # permits, to ensure that we are exercising stressing that. api_concurrency = 135 - # We will create tenants directly via API, not via neon_local, to avoid any false - # serialization of operations in neon_local (it e.g. 
loads/saves a config file on each call) - with concurrent.futures.ThreadPoolExecutor(max_workers=api_concurrency) as executor: - futs = [] + # A different concurrency limit for bulk tenant+timeline creations: these do I/O and will + # start timing out on test nodes if we aren't a bit careful. + create_concurrency = 16 + + class Operation(str, Enum): + TIMELINE_OPS = "timeline_ops" + SHARD_MIGRATE = "shard_migrate" + TENANT_PASSTHROUGH = "tenant_passthrough" + + run_ops = api_concurrency * 4 + assert run_ops < len(tenants) + + # Creation phase: make a lot of tenants, and create timelines in a subset of them + # This executor has concurrency set modestly, to avoid overloading pageservers with timeline creations. + with concurrent.futures.ThreadPoolExecutor(max_workers=create_concurrency) as executor: + tenant_create_futs = [] t1 = time.time() - for tenant_id in tenants: + + for tenant_id, tenant in tenants.items(): + if tenant.large: + shard_count = large_tenant_shard_count + else: + shard_count = 1 + + # We will create tenants directly via API, not via neon_local, to avoid any false + # serialization of operations in neon_local (it e.g.
loads/saves a config file on each call) f = executor.submit( env.storage_controller.tenant_create, tenant_id, @@ -152,44 +203,106 @@ def test_storage_controller_many_tenants( tenant_config={"heatmap_period": "10s"}, placement_policy={"Attached": 1}, ) - futs.append(f) + tenant_create_futs.append(f) - # Wait for creations to finish - for f in futs: + # Wait for tenant creations to finish + for f in tenant_create_futs: f.result() log.info( f"Created {len(tenants)} tenants in {time.time() - t1}, {len(tenants) / (time.time() - t1)}/s" ) - run_ops = api_concurrency * 4 - assert run_ops < len(tenants) - op_tenants = list(tenants)[0:run_ops] + # Waiting for optimizer to stabilize, if it disagrees with scheduling (the correct behavior + # would be for original scheduling decisions to always match optimizer's preference) + # (workaround for https://github.com/neondatabase/neon/issues/8969) + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) + + # Create timelines in those tenants which are going to get one + t1 = time.time() + timeline_create_futs = [] + for tenant_id in tenants_with_timelines: + timeline_id = TimelineId.generate() + tenants[tenant_id].timeline_id = timeline_id + f = executor.submit( + env.storage_controller.pageserver_api().timeline_create, + PgVersion.NOT_SET, + tenant_id, + timeline_id, + ) + timeline_create_futs.append(f) + + for f in timeline_create_futs: + f.result() + log.info( + f"Created {len(tenants_with_timelines)} timelines in {time.time() - t1}, {len(tenants_with_timelines) / (time.time() - t1)}/s" + ) + + # Plan operations: ensure each tenant with a timeline gets at least + # one of each operation type. Then add other tenants to make up the + # numbers. 
+ ops_plan = [] + for tenant_id in tenants_with_timelines: + ops_plan.append((tenant_id, Operation.TIMELINE_OPS)) + ops_plan.append((tenant_id, Operation.SHARD_MIGRATE)) + ops_plan.append((tenant_id, Operation.TENANT_PASSTHROUGH)) + + # Fill up remaining run_ops with migrations of tenants without timelines + other_migrate_tenants = rng.sample(tenants_without_timelines, run_ops - len(ops_plan)) + + for tenant_id in other_migrate_tenants: + ops_plan.append( + ( + tenant_id, + rng.choice([Operation.SHARD_MIGRATE, Operation.TENANT_PASSTHROUGH]), + ) + ) + + # Exercise phase: pick pseudo-random operations to do on the tenants + timelines + # This executor has concurrency high enough to stress the storage controller API. + with concurrent.futures.ThreadPoolExecutor(max_workers=api_concurrency) as executor: + + def exercise_timeline_ops(tenant_id, timeline_id): + # A read operation: this requires looking up shard zero and routing there + detail = virtual_ps_http.timeline_detail(tenant_id, timeline_id) + assert detail["timeline_id"] == str(timeline_id) + + # A fan-out write operation to all shards in a tenant. + # - We use a metadata operation rather than something like a timeline create, because + # timeline creations are I/O intensive and this test isn't meant to be a stress test for + # doing lots of concurrent timeline creations. 
+ archival_state = rng.choice( + [TimelineArchivalState.ARCHIVED, TimelineArchivalState.UNARCHIVED] + ) + virtual_ps_http.timeline_archival_config(tenant_id, timeline_id, archival_state) # Generate a mixture of operations and dispatch them all concurrently futs = [] - for tenant_id in op_tenants: - op = rng.choice([0, 1, 2]) - if op == 0: - # A fan-out write operation to all shards in a tenant (timeline creation) + for tenant_id, op in ops_plan: + if op == Operation.TIMELINE_OPS: + op_timeline_id = tenants[tenant_id].timeline_id + assert op_timeline_id is not None + + # Exercise operations that modify tenant scheduling state but require traversing + # the fan-out-to-all-shards functionality. f = executor.submit( - virtual_ps_http.timeline_create, - PgVersion.NOT_SET, + exercise_timeline_ops, tenant_id, - TimelineId.generate(), + op_timeline_id, ) - elif op == 1: + elif op == Operation.SHARD_MIGRATE: # A reconciler operation: migrate a shard. - shard_number = rng.randint(0, shard_count - 1) - tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count) + desc = env.storage_controller.tenant_describe(tenant_id) + + shard_number = rng.randint(0, len(desc["shards"]) - 1) + tenant_shard_id = TenantShardId(tenant_id, shard_number, len(desc["shards"])) # Migrate it to its secondary location - desc = env.storage_controller.tenant_describe(tenant_id) dest_ps_id = desc["shards"][shard_number]["node_secondary"][0] f = executor.submit( env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id ) - elif op == 2: + elif op == Operation.TENANT_PASSTHROUGH: # A passthrough read to shard zero f = executor.submit(virtual_ps_http.tenant_status, tenant_id) @@ -199,10 +312,18 @@ def test_storage_controller_many_tenants( for f in futs: f.result() + log.info("Completed mixed operations phase") + # Some of the operations above (notably migrations) might leave the controller in a state where it has # some work to do, for example optimizing shard placement after we do 
a random migration. Wait for the system # to reach a quiescent state before doing following checks. - env.storage_controller.reconcile_until_idle() + # + # - Set max_interval low because we probably have a significant number of optimizations to complete and would like + # the test to run quickly. + # - Set timeout high because we might be waiting for optimizations that require a secondary + # to warm up, and if we just started a secondary in the previous step, it might wait some time + # before downloading its heatmap + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) env.storage_controller.consistency_check() check_memory() @@ -213,6 +334,7 @@ def test_storage_controller_many_tenants( # # We do not require that the system is quiescent already here, although at present in this point in the test # that may be the case. + log.info("Reconciling all & timing") while True: t1 = time.time() reconcilers = env.storage_controller.reconcile_all() @@ -225,6 +347,7 @@ def test_storage_controller_many_tenants( break # Restart the storage controller + log.info("Restarting controller") env.storage_controller.stop() env.storage_controller.start() @@ -246,7 +369,16 @@ def test_storage_controller_many_tenants( # Restart pageservers gracefully: this exercises the /re-attach pageserver API # and the storage controller drain and fill API + log.info("Restarting pageservers...") + + # Parameters for how long we expect it to take to migrate all of the tenants from/to + # a node during a drain/fill operation + DRAIN_FILL_TIMEOUT = 240 + DRAIN_FILL_BACKOFF = 5 + for ps in env.pageservers: + log.info(f"Draining pageserver {ps.id}") + t1 = time.time() env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 ) @@ -255,9 +387,10 @@ def test_storage_controller_many_tenants( ps.id, PageserverAvailability.ACTIVE, PageserverSchedulingPolicy.PAUSE_FOR_RESTART, - max_attempts=24, -
backoff=5, + max_attempts=DRAIN_FILL_TIMEOUT // DRAIN_FILL_BACKOFF, + backoff=DRAIN_FILL_BACKOFF, ) + log.info(f"Drained pageserver {ps.id} in {time.time() - t1}s") shard_counts = get_consistent_node_shard_counts(env, total_shards) log.info(f"Shard counts after draining node {ps.id}: {shard_counts}") @@ -275,6 +408,7 @@ def test_storage_controller_many_tenants( backoff=1, ) + log.info(f"Filling pageserver {ps.id}") env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 ) @@ -282,16 +416,23 @@ def test_storage_controller_many_tenants( ps.id, PageserverAvailability.ACTIVE, PageserverSchedulingPolicy.ACTIVE, - max_attempts=24, - backoff=5, + max_attempts=DRAIN_FILL_TIMEOUT // DRAIN_FILL_BACKOFF, + backoff=DRAIN_FILL_BACKOFF, ) + log.info(f"Filled pageserver {ps.id} in {time.time() - t1}s") + + # Waiting for optimizer to stabilize, if it disagrees with scheduling (the correct behavior + # would be for original scheduling decisions to always match optimizer's preference) + # (workaround for https://github.com/neondatabase/neon/issues/8969) + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) + shard_counts = get_consistent_node_shard_counts(env, total_shards) log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") assert_consistent_balanced_attachments(env, total_shards) - env.storage_controller.reconcile_until_idle() + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) env.storage_controller.consistency_check() # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn, diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 791e38383e..96ba3dd5a4 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -9,6 +9,7 @@ from dataclasses import dataclass from pathlib import Path 
from typing import TYPE_CHECKING +import fixtures.utils import pytest import toml from fixtures.common_types import TenantId, TimelineId @@ -93,6 +94,34 @@ if TYPE_CHECKING: # # Run forward compatibility test # ./scripts/pytest -k test_forward_compatibility # +# +# How to run `test_version_mismatch` locally: +# +# export DEFAULT_PG_VERSION=16 +# export BUILD_TYPE=release +# export CHECK_ONDISK_DATA_COMPATIBILITY=true +# export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE} +# export COMPATIBILITY_POSTGRES_DISTRIB_DIR=neon_previous/pg_install +# export NEON_BIN=target/release +# export POSTGRES_DISTRIB_DIR=pg_install +# +# # Build previous version of binaries and store them somewhere: +# rm -rf pg_install target +# git checkout +# CARGO_BUILD_FLAGS="--features=testing" make -s -j`nproc` +# mkdir -p neon_previous/target +# cp -a target/${BUILD_TYPE} ./neon_previous/target/${BUILD_TYPE} +# cp -a pg_install ./neon_previous/pg_install +# +# # Build current version of binaries and create a data snapshot: +# rm -rf pg_install target +# git checkout +# CARGO_BUILD_FLAGS="--features=testing" make -s -j`nproc` +# ./scripts/pytest -k test_create_snapshot +# +# # Run the version mismatch test +# ./scripts/pytest -k test_version_mismatch + check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif( os.environ.get("CHECK_ONDISK_DATA_COMPATIBILITY") is None, @@ -166,16 +195,11 @@ def test_backward_compatibility( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_version: PgVersion, + compatibility_snapshot_dir: Path, ): """ Test that the new binaries can read old data """ - compatibility_snapshot_dir_env = os.environ.get("COMPATIBILITY_SNAPSHOT_DIR") - assert ( - compatibility_snapshot_dir_env is not None - ), f"COMPATIBILITY_SNAPSHOT_DIR is not set. 
It should be set to `compatibility_snapshot_pg{pg_version.v_prefixed}` path generateted by test_create_snapshot (ideally generated by the previous version of Neon)" - compatibility_snapshot_dir = Path(compatibility_snapshot_dir_env).resolve() - breaking_changes_allowed = ( os.environ.get("ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true" ) @@ -214,27 +238,11 @@ def test_forward_compatibility( test_output_dir: Path, top_output_dir: Path, pg_version: PgVersion, + compatibility_snapshot_dir: Path, ): """ Test that the old binaries can read new data """ - compatibility_neon_bin_env = os.environ.get("COMPATIBILITY_NEON_BIN") - assert compatibility_neon_bin_env is not None, ( - "COMPATIBILITY_NEON_BIN is not set. It should be set to a path with Neon binaries " - "(ideally generated by the previous version of Neon)" - ) - compatibility_neon_bin = Path(compatibility_neon_bin_env).resolve() - - compatibility_postgres_distrib_dir_env = os.environ.get("COMPATIBILITY_POSTGRES_DISTRIB_DIR") - assert ( - compatibility_postgres_distrib_dir_env is not None - ), "COMPATIBILITY_POSTGRES_DISTRIB_DIR is not set. It should be set to a pg_install directrory (ideally generated by the previous version of Neon)" - compatibility_postgres_distrib_dir = Path(compatibility_postgres_distrib_dir_env).resolve() - - compatibility_snapshot_dir = ( - top_output_dir / f"compatibility_snapshot_pg{pg_version.v_prefixed}" - ) - breaking_changes_allowed = ( os.environ.get("ALLOW_FORWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true" ) @@ -245,9 +253,14 @@ def test_forward_compatibility( # Use previous version's production binaries (pageserver, safekeeper, pg_distrib_dir, etc.). # But always use the current version's neon_local binary. # This is because we want to test the compatibility of the data format, not the compatibility of the neon_local CLI. 
- neon_env_builder.neon_binpath = compatibility_neon_bin - neon_env_builder.pg_distrib_dir = compatibility_postgres_distrib_dir - neon_env_builder.neon_local_binpath = neon_env_builder.neon_local_binpath + assert ( + neon_env_builder.compatibility_neon_binpath is not None + ), "the environment variable COMPATIBILITY_NEON_BIN is required" + assert ( + neon_env_builder.compatibility_pg_distrib_dir is not None + ), "the environment variable COMPATIBILITY_POSTGRES_DISTRIB_DIR is required" + neon_env_builder.neon_binpath = neon_env_builder.compatibility_neon_binpath + neon_env_builder.pg_distrib_dir = neon_env_builder.compatibility_pg_distrib_dir env = neon_env_builder.from_repo_dir( compatibility_snapshot_dir / "repo", @@ -558,3 +571,29 @@ def test_historic_storage_formats( env.pageserver.http_client().timeline_compact( dataset.tenant_id, existing_timeline_id, force_image_layer_creation=True ) + + +@check_ondisk_data_compatibility_if_enabled +@pytest.mark.xdist_group("compatibility") +@pytest.mark.parametrize(**fixtures.utils.allpairs_versions()) +def test_versions_mismatch( + neon_env_builder: NeonEnvBuilder, + test_output_dir: Path, + pg_version: PgVersion, + compatibility_snapshot_dir, + combination, +): + """ + Checks compatibility of different combinations of versions of the components + """ + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.from_repo_dir( + compatibility_snapshot_dir / "repo", + ) + env.pageserver.allowed_errors.extend( + [".*ingesting record with timestamp lagging more than wait_lsn_timeout.+"] + ) + env.start() + check_neon_works( + env, test_output_dir, compatibility_snapshot_dir / "dump.sql", test_output_dir / "repo" + ) diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index 3a0a4b10bf..783fb813cf 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -162,6 +162,11 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder): 
env.neon_cli.pageserver_stop(env.BASE_PAGESERVER_ID) env.neon_cli.pageserver_stop(env.BASE_PAGESERVER_ID + 1) + # We will stop the storage controller while it may have requests in + # flight, and the pageserver complains when requests are abandoned. + for ps in env.pageservers: + ps.allowed_errors.append(".*request was dropped before completing.*") + # Keep NeonEnv state up to date, it usually owns starting/stopping services env.pageservers[0].running = False env.pageservers[1].running = False diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 7be4d2ce0c..1dcc37c407 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -9,6 +9,7 @@ from datetime import datetime, timezone from enum import Enum from typing import TYPE_CHECKING +import fixtures.utils import pytest from fixtures.auth_tokens import TokenScope from fixtures.common_types import TenantId, TenantShardId, TimelineId @@ -38,7 +39,11 @@ from fixtures.pg_version import PgVersion, run_only_on_default_postgres from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import RemoteStorageKind, s3_storage from fixtures.storage_controller_proxy import StorageControllerProxy -from fixtures.utils import run_pg_bench_small, subprocess_capture, wait_until +from fixtures.utils import ( + run_pg_bench_small, + subprocess_capture, + wait_until, +) from fixtures.workload import Workload from mypy_boto3_s3.type_defs import ( ObjectTypeDef, @@ -60,9 +65,8 @@ def get_node_shard_counts(env: NeonEnv, tenant_ids): return counts -def test_storage_controller_smoke( - neon_env_builder: NeonEnvBuilder, -): +@pytest.mark.parametrize(**fixtures.utils.allpairs_versions()) +def test_storage_controller_smoke(neon_env_builder: NeonEnvBuilder, combination): """ Test the basic lifecycle of a storage controller: - Restarting @@ -1300,11 +1304,11 @@ def test_storage_controller_heartbeats( 
node_to_tenants = build_node_to_tenants_map(env) log.info(f"Back online: {node_to_tenants=}") - # ... expecting the storage controller to reach a consistent state - def storage_controller_consistent(): - env.storage_controller.consistency_check() + # ... background reconciliation may need to run to clean up the location on the node that was offline + env.storage_controller.reconcile_until_idle() - wait_until(30, 1, storage_controller_consistent) + # ... expecting the storage controller to reach a consistent state + env.storage_controller.consistency_check() def test_storage_controller_re_attach(neon_env_builder: NeonEnvBuilder): diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 9ea09d10d7..b41f1709bd 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -479,9 +479,9 @@ def assert_size_approx_equal(size_a, size_b): """ # Determined empirically from examples of equality failures: they differ - # by page multiples of 8272, and usually by 1-3 pages. Tolerate 4 to avoid + # by page multiples of 8272, and usually by 1-3 pages. Tolerate 6 to avoid # failing on outliers from that observed range. 
- threshold = 4 * 8272 + threshold = 6 * 8272 assert size_a == pytest.approx(size_b, abs=threshold) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 95dc0fec78..03cb79fc1d 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -2,6 +2,7 @@ from __future__ import annotations import concurrent.futures import os +import threading import time from contextlib import closing from datetime import datetime @@ -10,7 +11,7 @@ from pathlib import Path import pytest import requests -from fixtures.common_types import Lsn, TenantId +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.metrics import ( PAGESERVER_GLOBAL_METRICS, @@ -18,6 +19,7 @@ from fixtures.metrics import ( parse_metrics, ) from fixtures.neon_fixtures import ( + Endpoint, NeonEnv, NeonEnvBuilder, wait_for_last_flush_lsn, @@ -476,3 +478,38 @@ def test_pageserver_metrics_many_relations(neon_env_builder: NeonEnvBuilder): assert counts log.info(f"directory counts: {counts}") assert counts[2] > COUNT_AT_LEAST_EXPECTED + + +def test_timelines_parallel_endpoints(neon_simple_env: NeonEnv): + """ + (Relaxed) regression test for issue that led to https://github.com/neondatabase/neon/pull/9268 + Create many endpoints in parallel and then restart them + """ + env = neon_simple_env + + # This param needs to be 200+ to reproduce the limit issue + n_threads = 16 + barrier = threading.Barrier(n_threads) + + def test_timeline(branch_name: str, timeline_id: TimelineId, endpoint: Endpoint): + endpoint.start() + endpoint.stop() + # Use a barrier to make sure we restart endpoints at the same time + barrier.wait() + endpoint.start() + + workers = [] + + for i in range(0, n_threads): + branch_name = f"branch_{i}" + timeline_id = env.create_branch(branch_name) + endpoint = env.endpoints.create(branch_name) + w = threading.Thread(target=test_timeline, args=[branch_name, timeline_id, endpoint]) + 
workers.append(w) + + # Only start the restarts once we're done creating all timelines & endpoints + for w in workers: + w.start() + + for w in workers: + w.join() diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 841707d32e..ffaed5e130 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -6,6 +6,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, ) from fixtures.pageserver.http import PageserverApiException +from fixtures.utils import wait_until @pytest.mark.parametrize("shard_count", [0, 4]) @@ -114,3 +115,103 @@ def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int): leaf_timeline_id, state=TimelineArchivalState.UNARCHIVED, ) + + +@pytest.mark.parametrize("manual_offload", [False, True]) +def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: bool): + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + + # Turn off gc and compaction loops: we want to issue them manually for better reliability + tenant_id, initial_timeline_id = env.create_tenant( + conf={ + "gc_period": "0s", + "compaction_period": "0s" if manual_offload else "1s", + } + ) + + # Create two branches and archive them + parent_timeline_id = env.create_branch("test_ancestor_branch_archive_parent", tenant_id) + leaf_timeline_id = env.create_branch( + "test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent" + ) + + with env.endpoints.create_start( + "test_ancestor_branch_archive_branch1", tenant_id=tenant_id + ) as endpoint: + endpoint.safe_psql_many( + [ + "CREATE TABLE foo(key serial primary key, t text default 'data_content')", + "INSERT INTO foo SELECT FROM generate_series(1,1000)", + ] + ) + sum = endpoint.safe_psql("SELECT sum(key) from foo where key > 50") + + ps_http.timeline_archival_config( + tenant_id, + leaf_timeline_id, + state=TimelineArchivalState.ARCHIVED, + 
) + leaf_detail = ps_http.timeline_detail( + tenant_id, + leaf_timeline_id, + ) + assert leaf_detail["is_archived"] is True + + ps_http.timeline_archival_config( + tenant_id, + parent_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + + def timeline_offloaded(timeline_id: TimelineId) -> bool: + return ( + env.pageserver.log_contains(f".*{timeline_id}.* offloading archived timeline.*") + is not None + ) + + if manual_offload: + with pytest.raises( + PageserverApiException, + match="timeline has attached children", + ): + # This only tests the (made for testing only) http handler, + # but still demonstrates the constraints we have. + ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=parent_timeline_id) + + def parent_offloaded(): + if manual_offload: + ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=parent_timeline_id) + assert timeline_offloaded(parent_timeline_id) + + def leaf_offloaded(): + if manual_offload: + ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=leaf_timeline_id) + assert timeline_offloaded(leaf_timeline_id) + + wait_until(30, 1, leaf_offloaded) + wait_until(30, 1, parent_offloaded) + + ps_http.timeline_archival_config( + tenant_id, + parent_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + ps_http.timeline_archival_config( + tenant_id, + leaf_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + leaf_detail = ps_http.timeline_detail( + tenant_id, + leaf_timeline_id, + ) + assert leaf_detail["is_archived"] is False + + with env.endpoints.create_start( + "test_ancestor_branch_archive_branch1", tenant_id=tenant_id + ) as endpoint: + sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key > 50") + assert sum == sum_again + + assert not timeline_offloaded(initial_timeline_id) diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 0a90b6b6f7..1347d6ddff 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -46,7 +46,8 @@ hmac = { version = "0.12", 
default-features = false, features = ["reset"] } hyper-582f2526e08bb6a0 = { package = "hyper", version = "0.14", features = ["full"] } hyper-dff4ba8e3ae991db = { package = "hyper", version = "1", features = ["full"] } hyper-util = { version = "0.1", features = ["client-legacy", "server-auto", "service"] } -indexmap = { version = "1", default-features = false, features = ["std"] } +indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } +indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } itertools = { version = "0.12" } lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } @@ -101,7 +102,8 @@ either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } half = { version = "2", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.14", features = ["raw"] } -indexmap = { version = "1", default-features = false, features = ["std"] } +indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } +indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } itertools = { version = "0.12" } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] }