Mirror of https://github.com/neondatabase/neon.git (synced 2026-02-11 06:30:37 +00:00)

Comparing commits: debug-chec...sk-migrate (1 commit, SHA1 22e9702525)
@@ -1,2 +0,0 @@
-[profile.default]
-slow-timeout = { period = "20s", terminate-after = 3 }
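The two removed lines are a cargo-nextest profile; the file header is cut off above, but a config of this shape normally lives in .config/nextest.toml (an assumption, since the name is not shown). A hedged sketch of what the setting did and how to restore it locally:

```bash
# Assumption: the deleted file was .config/nextest.toml. With this profile,
# nextest flags a test as SLOW after each 20s period and terminates it after
# 3 periods (60s total); without it, nextest falls back to its defaults.
mkdir -p .config
cat > .config/nextest.toml <<'EOF'
[profile.default]
slow-timeout = { period = "20s", terminate-after = 3 }
EOF
cargo nextest run   # requires cargo-nextest to be installed
```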
.github/workflows/build_and_push_docker_image.yml (vendored, 105 changed lines)
@@ -1,105 +0,0 @@
-name: Build and Push Docker Image
-
-on:
-  workflow_call:
-    inputs:
-      dockerfile-path:
-        required: true
-        type: string
-      image-name:
-        required: true
-        type: string
-    outputs:
-      build-tools-tag:
-        description: "tag generated for build tools"
-        value: ${{ jobs.tag.outputs.build-tools-tag }}
-
-jobs:
-  check-if-build-tools-dockerfile-changed:
-    runs-on: ubuntu-latest
-    outputs:
-      docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }}
-    steps:
-      - name: Check if Dockerfile.buildtools has changed
-        id: dockerfile
-        run: |
-          if [[ "$GITHUB_EVENT_NAME" != "pull_request" ]]; then
-            echo "docker_file_changed=false" >> $GITHUB_OUTPUT
-            exit
-          fi
-          updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only)
-          if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then
-            echo "docker_file_changed=true" >> $GITHUB_OUTPUT
-          fi
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-  tag:
-    runs-on: ubuntu-latest
-    needs: [ check-if-build-tools-dockerfile-changed ]
-    outputs:
-      build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}}
-
-    steps:
-      - name: Get buildtools tag
-        env:
-          DOCKERFILE_CHANGED: ${{ needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed }}
-        run: |
-          if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "${DOCKERFILE_CHANGED}" == "true" ]]; then
-            IMAGE_TAG=$GITHUB_RUN_ID
-          else
-            IMAGE_TAG=pinned
-          fi
-
-          echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
-        shell: bash
-        id: buildtools-tag
-
-  kaniko:
-    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
-    needs: [ tag, check-if-build-tools-dockerfile-changed ]
-    runs-on: [ self-hosted, dev, x64 ]
-    container: gcr.io/kaniko-project/executor:v1.7.0-debug
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v1
-
-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
-
-      - name: Kaniko build
-        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
-
-  kaniko-arm:
-    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
-    needs: [ tag, check-if-build-tools-dockerfile-changed ]
-    runs-on: [ self-hosted, dev, arm64 ]
-    container: gcr.io/kaniko-project/executor:v1.7.0-debug
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v1
-
-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
-
-      - name: Kaniko build
-        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
-
-  manifest:
-    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
-    name: 'manifest'
-    runs-on: [ self-hosted, dev, x64 ]
-    needs:
-      - tag
-      - kaniko
-      - kaniko-arm
-      - check-if-build-tools-dockerfile-changed
-
-    steps:
-      - name: Create manifest
-        run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
-
-      - name: Push manifest
-        run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}
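The deleted workflow implements a common multi-arch pattern: build one image per architecture with kaniko on arch-specific runners, then stitch the per-arch tags into a single manifest list. A minimal sketch of the manifest half, using a placeholder registry path instead of the real ECR repo:

```bash
REGISTRY=registry.example.com/build-tools   # placeholder, not the real ECR path
TAG=pinned

# Assumes ${REGISTRY}:${TAG}-amd64 and ${REGISTRY}:${TAG}-arm64 are already
# pushed (the kaniko and kaniko-arm jobs above did this). The manifest list
# lets clients pull one tag and get the right architecture automatically.
docker manifest create "${REGISTRY}:${TAG}" \
  --amend "${REGISTRY}:${TAG}-amd64" \
  --amend "${REGISTRY}:${TAG}-arm64"
docker manifest push "${REGISTRY}:${TAG}"
```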
.github/workflows/build_and_test.yml (vendored, 67 changed lines)
@@ -44,6 +44,7 @@ jobs:
 
           exit 1
 
+
   tag:
     needs: [ check-permissions ]
     runs-on: [ self-hosted, gen3, small ]
@@ -73,19 +74,11 @@
       shell: bash
       id: build-tag
 
-  build-buildtools-image:
-    needs: [ check-permissions ]
-    uses: ./.github/workflows/build_and_push_docker_image.yml
-    with:
-      dockerfile-path: Dockerfile.buildtools
-      image-name: build-tools
-    secrets: inherit
-
   check-codestyle-python:
-    needs: [ check-permissions, build-buildtools-image ]
+    needs: [ check-permissions ]
     runs-on: [ self-hosted, gen3, small ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
       options: --init
 
     steps:
@@ -105,20 +98,20 @@
       - name: Install Python deps
        run: ./scripts/pysync
 
-      - name: Run `ruff check` to ensure code format
-        run: poetry run ruff check .
+      - name: Run ruff to ensure code format
+        run: poetry run ruff .
 
-      - name: Run `ruff format` to ensure code format
-        run: poetry run ruff format --check .
+      - name: Run black to ensure code format
+        run: poetry run black --diff --check .
 
       - name: Run mypy to check types
         run: poetry run mypy .
 
   check-codestyle-rust:
-    needs: [ check-permissions, build-buildtools-image ]
+    needs: [ check-permissions ]
     runs-on: [ self-hosted, gen3, large ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
       options: --init
 
     steps:
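The removed side of this hunk checks lint and formatting through ruff alone, while the restored side splits them between ruff and black. All four invocations are taken verbatim from the diff; either set can be reproduced locally in the poetry environment that ./scripts/pysync sets up:

```bash
# Removed (newer) style: ruff handles both linting and format checking.
poetry run ruff check .             # lint
poetry run ruff format --check .    # format check, no rewriting

# Restored (older) style: ruff for linting, black for formatting.
poetry run ruff .                   # older ruff CLI, before the `check` subcommand
poetry run black --diff --check .   # show diffs, fail if reformatting is needed
```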
@@ -182,10 +175,10 @@
       run: cargo deny check --hide-inclusion-graph
 
   build-neon:
-    needs: [ check-permissions, tag, build-buildtools-image ]
+    needs: [ check-permissions, tag ]
     runs-on: [ self-hosted, gen3, large ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
       options: --init
     strategy:
       fail-fast: false
@@ -339,16 +332,16 @@
         run: |
           ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
 
-      - name: Run rust tests
+      - name: Run cargo test
         run: |
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
+          ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES
 
           # Run separate tests for real S3
           export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
-          export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
+          export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
           export REMOTE_STORAGE_S3_REGION=eu-central-1
           # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_s3)'
+          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
 
           # Run separate tests for real Azure Blob Storage
           # XXX: replace region with `eu-central-1`-like region
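The same hunk swaps nextest invocations for plain cargo test. The selection syntax differs: nextest takes -E filterset expressions, while cargo test takes package and test-target flags. Both lines below are copied from the diff ($CARGO_FLAGS comes from the workflow environment):

```bash
# nextest: select tests with filterset expressions.
cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_s3)'

# cargo test: the closest equivalent via package and integration-test target.
cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
```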
@@ -358,7 +351,7 @@
           export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
           export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
           # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_azure)'
+          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
 
       - name: Install rust binaries
         run: |
@@ -415,10 +408,10 @@
         uses: ./.github/actions/save-coverage-data
 
   regress-tests:
-    needs: [ check-permissions, build-neon, build-buildtools-image, tag ]
+    needs: [ check-permissions, build-neon, tag ]
     runs-on: [ self-hosted, gen3, large ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
       # Default shared memory is 64mb
       options: --init --shm-size=512mb
     strategy:
@@ -454,10 +447,10 @@
         uses: ./.github/actions/save-coverage-data
 
   benchmarks:
-    needs: [ check-permissions, build-neon, build-buildtools-image ]
+    needs: [ check-permissions, build-neon ]
     runs-on: [ self-hosted, gen3, small ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
       # Default shared memory is 64mb
       options: --init --shm-size=512mb
     if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
@@ -486,12 +479,12 @@
       # while coverage is currently collected for the debug ones
 
   create-test-report:
-    needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-buildtools-image ]
+    needs: [ check-permissions, regress-tests, coverage-report, benchmarks ]
     if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
 
     runs-on: [ self-hosted, gen3, small ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
       options: --init
 
     steps:
@@ -533,10 +526,11 @@
           })
 
   coverage-report:
-    needs: [ check-permissions, regress-tests, build-buildtools-image ]
+    needs: [ check-permissions, regress-tests ]
+
     runs-on: [ self-hosted, gen3, small ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
       options: --init
     strategy:
       fail-fast: false
@@ -700,7 +694,7 @@
           }"
 
   neon-image:
-    needs: [ check-permissions, build-buildtools-image, tag ]
+    needs: [ check-permissions, tag ]
     runs-on: [ self-hosted, gen3, large ]
     container: gcr.io/kaniko-project/executor:v1.9.2-debug
     defaults:
@@ -739,7 +733,6 @@
           --context .
           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
           --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
-          --build-arg TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }}
           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
           --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
@@ -750,7 +743,7 @@
 
   compute-tools-image:
     runs-on: [ self-hosted, gen3, large ]
-    needs: [ check-permissions, build-buildtools-image, tag ]
+    needs: [ check-permissions, tag ]
     container: gcr.io/kaniko-project/executor:v1.9.2-debug
     defaults:
       run:
@@ -785,7 +778,6 @@
           --context .
           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
-          --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
           --dockerfile Dockerfile.compute-tools
           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
@@ -796,7 +788,7 @@
         run: rm -rf ~/.ecr
 
   compute-node-image:
-    needs: [ check-permissions, build-buildtools-image, tag ]
+    needs: [ check-permissions, tag ]
     runs-on: [ self-hosted, gen3, large ]
     container:
       image: gcr.io/kaniko-project/executor:v1.9.2-debug
@@ -844,7 +836,6 @@
           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
           --build-arg PG_VERSION=${{ matrix.version }}
           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
-          --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
           --dockerfile Dockerfile.compute-node
           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
@@ -1131,7 +1122,7 @@
           # TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
           gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
         elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
-          gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}}
+          gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f disclamerAcknowledged=true
         else
           echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
           exit 1
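The deploy step at the end drives workflows in the neondatabase/aws repository through the GitHub CLI. The dispatch pattern, sketched with a placeholder tag value (the workflow passes the computed build tag):

```bash
# Dispatch a workflow_dispatch-enabled workflow in another repo, passing
# inputs as -f key=value pairs; requires a token authorized for that repo.
gh workflow --repo neondatabase/aws run deploy-dev.yml \
  --ref main \
  -f branch=main \
  -f dockerTag=build-tag-placeholder \
  -f deployPreprodRegion=true
```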
.github/workflows/neon_extra_builds.yml (vendored, 2 changed lines)
@@ -218,7 +218,7 @@
 
           # Run separate tests for real S3
           export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
-          export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
+          export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
           export REMOTE_STORAGE_S3_REGION=eu-central-1
           # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
           cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
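To run the real-S3 suite the way this job does, the environment variables act as the gate; judging by its value, any non-empty ENABLE_REAL_S3_REMOTE_STORAGE enables the tests. Values below are the ones from the diff; AWS credentials are assumed to come from the surrounding environment:

```bash
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
export REMOTE_STORAGE_S3_REGION=eu-central-1
cargo test --package remote_storage --test test_real_s3
```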
.github/workflows/update_build_tools_image.yml (vendored, 130 changed lines)
@@ -1,130 +0,0 @@
-name: 'Update build tools image tag'
-
-# This workflow it used to update tag of build tools in ECR.
-# The most common use case is adding/moving `pinned` tag to `${GITHUB_RUN_IT}` image.
-
-on:
-  workflow_dispatch:
-    inputs:
-      from-tag:
-        description: 'Source tag'
-        required: true
-        type: string
-      to-tag:
-        description: 'Destination tag'
-        required: true
-        type: string
-        default: 'pinned'
-
-defaults:
-  run:
-    shell: bash -euo pipefail {0}
-
-env:
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
-
-permissions: {}
-
-jobs:
-  tag-image:
-    runs-on: [ self-hosted, gen3, small ]
-    container: golang:1.19-bullseye
-
-    env:
-      IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
-      FROM_TAG: ${{ inputs.from-tag }}
-      TO_TAG: ${{ inputs.to-tag }}
-    outputs:
-      next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }}
-      prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }}
-
-    steps:
-      - name: Install Crane & ECR helper
-        run: |
-          go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
-          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
-
-      - name: Configure ECR login
-        run: |
-          mkdir /github/home/.docker/
-          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
-
-      - name: Get source image digest
-        id: next-digest
-        run: |
-          NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true)
-          if [ -z "${NEXT_DIGEST}" ]; then
-            echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist"
-            exit 1
-          fi
-
-          echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}"
-          echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT
-
-      - name: Get destination image digest (if already exists)
-        id: prev-digest
-        run: |
-          PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true)
-          if [ -z "${PREV_DIGEST}" ]; then
-            echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)"
-          else
-            echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}"
-
-            echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Tag image
-        run: |
-          crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}"
-
-  rollback-tag-image:
-    needs: tag-image
-    if: ${{ !success() }}
-
-    runs-on: [ self-hosted, gen3, small ]
-    container: golang:1.19-bullseye
-
-    env:
-      IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
-      FROM_TAG: ${{ inputs.from-tag }}
-      TO_TAG: ${{ inputs.to-tag }}
-
-    steps:
-      - name: Install Crane & ECR helper
-        run: |
-          go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
-          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
-
-      - name: Configure ECR login
-        run: |
-          mkdir /github/home/.docker/
-          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
-
-      - name: Restore previous tag if needed
-        run: |
-          NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}"
-          PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}"
-
-          if [ -z "${NEXT_DIGEST}" ]; then
-            echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback"
-            exit 0
-          fi
-
-          if [ -z "${PREV_DIGEST}" ]; then
-            # I guess we should delete the tag here/untag the image, but crane does not support it
-            # - https://github.com/google/go-containerregistry/issues/999
-
-            echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback"
-
-            exit 0
-          fi
-
-          CURRENT_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}")
-          if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then
-            crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}"
-
-            echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}"
-          else
-            echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored"
-          fi
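The heart of the deleted workflow is a registry-side retag with crane, plus digests captured up front so a failed move can be undone. A condensed sketch of the same sequence (placeholder repository; assumes crane is installed and logged in to the registry):

```bash
IMAGE=registry.example.com/build-tools   # placeholder, not the real ECR repo

# Record the digest currently behind the destination tag, if any.
PREV_DIGEST=$(crane digest "${IMAGE}:pinned" || true)

# Move `pinned` to the image tagged with a workflow run ID.
crane tag "${IMAGE}:6254913013" pinned

# Rollback path: digests are stable names, tags are not.
[ -n "${PREV_DIGEST}" ] && crane tag "${IMAGE}@${PREV_DIGEST}" pinned
```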
.gitignore (vendored, 1 changed line)
@@ -6,7 +6,6 @@ __pycache__/
 test_output/
 .vscode
 .idea
-neon.iml
 /.neon
 /integration_tests/.neon
 
@@ -70,17 +70,3 @@ We're using the following approach to make it work:
 - The label gets removed automatically, so to run CI again with new changes, the label should be added again (after the review)
 
 For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml)
-
-## How do I add the "pinned" tag to an buildtools image?
-
-We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup, currently adding the `pinned` tag is a manual operation.
-
-You can call it from GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml,
-or using GitHub CLI:
-
-```bash
-gh workflow -R neondatabase/neon run update_build_tools_image.yml \
-    -f from-tag=6254913013 \
-    -f to-tag=pinned \
-
-# Default `-f to-tag` is `pinned`, so the parameter can be omitted.
-```
Cargo.lock (generated, 368 changed lines)
@@ -30,8 +30,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cd7d5a2cecb58716e47d67d5703a249964b14c7be1ec3cad3affc295b2d1c35d"
 dependencies = [
  "cfg-if",
- "const-random",
- "getrandom 0.2.11",
  "once_cell",
  "version_check",
  "zerocopy",
@@ -52,12 +50,6 @@ version = "0.2.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
 
-[[package]]
-name = "android-tzdata"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
-
 [[package]]
 name = "android_system_properties"
 version = "0.1.5"
@@ -255,12 +247,6 @@ dependencies = [
  "syn 2.0.32",
 ]
 
-[[package]]
-name = "atomic"
-version = "0.5.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba"
-
 [[package]]
 name = "atomic-polyfill"
 version = "1.0.2"
@@ -270,32 +256,6 @@ dependencies = [
  "critical-section",
 ]
 
-[[package]]
-name = "attachment_service"
-version = "0.1.0"
-dependencies = [
- "anyhow",
- "camino",
- "clap",
- "control_plane",
- "futures",
- "git-version",
- "hyper",
- "metrics",
- "pageserver_api",
- "pageserver_client",
- "postgres_backend",
- "postgres_connection",
- "serde",
- "serde_json",
- "thiserror",
- "tokio",
- "tokio-util",
- "tracing",
- "utils",
- "workspace_hack",
-]
-
 [[package]]
 name = "autocfg"
 version = "1.1.0"
@@ -1051,17 +1011,17 @@ dependencies = [
 
 [[package]]
 name = "chrono"
-version = "0.4.31"
+version = "0.4.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38"
+checksum = "4e3c5919066adf22df73762e50cffcde3a758f2a848b113b586d1f86728b673b"
 dependencies = [
- "android-tzdata",
  "iana-time-zone",
  "js-sys",
+ "num-integer",
  "num-traits",
  "serde",
  "wasm-bindgen",
- "windows-targets 0.48.0",
+ "winapi",
 ]
 
 [[package]]
@@ -1160,20 +1120,6 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7"
 
-[[package]]
-name = "combine"
-version = "4.6.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "35ed6e9d84f0b51a7f52daf1c7d71dd136fd7a3f41a8462b8cdb8c78d920fad4"
-dependencies = [
- "bytes",
- "futures-core",
- "memchr",
- "pin-project-lite",
- "tokio",
- "tokio-util",
-]
-
 [[package]]
 name = "comfy-table"
 version = "6.1.4"
@@ -1215,7 +1161,6 @@ dependencies = [
  "flate2",
  "futures",
  "hyper",
- "nix 0.26.2",
  "notify",
  "num_cpus",
  "opentelemetry",
@@ -1223,10 +1168,8 @@ dependencies = [
  "regex",
  "remote_storage",
  "reqwest",
- "rust-ini",
  "serde",
  "serde_json",
- "signal-hook",
  "tar",
  "tokio",
  "tokio-postgres",
@@ -1258,26 +1201,6 @@ version = "0.9.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "28c122c3980598d243d63d9a704629a2d748d101f278052ff068be5a4423ab6f"
 
-[[package]]
-name = "const-random"
-version = "0.1.17"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5aaf16c9c2c612020bcfd042e170f6e32de9b9d75adb5277cdbbd2e2c8c8299a"
-dependencies = [
- "const-random-macro",
-]
-
-[[package]]
-name = "const-random-macro"
-version = "0.1.16"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e"
-dependencies = [
- "getrandom 0.2.11",
- "once_cell",
- "tiny-keccak",
-]
-
 [[package]]
 name = "const_fn"
 version = "0.4.9"
@@ -1510,12 +1433,6 @@ dependencies = [
  "winapi",
 ]
 
-[[package]]
-name = "crunchy"
-version = "0.2.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
-
 [[package]]
 name = "crypto-bigint"
 version = "0.4.9"
@@ -1658,15 +1575,6 @@ dependencies = [
  "syn 2.0.32",
 ]
 
-[[package]]
-name = "dlv-list"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "442039f5147480ba31067cb00ada1adae6892028e40e45fc5de7b7df6dcc1b5f"
-dependencies = [
- "const-random",
-]
-
 [[package]]
 name = "dyn-clone"
 version = "1.0.14"
@@ -1774,12 +1682,6 @@ dependencies = [
  "termcolor",
 ]
 
-[[package]]
-name = "equivalent"
-version = "1.0.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
-
 [[package]]
 name = "errno"
 version = "0.3.1"
@@ -2138,9 +2040,9 @@ dependencies = [
 
 [[package]]
 name = "h2"
-version = "0.3.24"
+version = "0.3.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bb2c4422095b67ee78da96fbb51a4cc413b3b25883c7717ff7ca1ab31022c9c9"
+checksum = "d357c7ae988e7d2182f7d7871d0b963962420b0678b0997ce7de72001aeab782"
 dependencies = [
  "bytes",
  "fnv",
@@ -2148,7 +2050,7 @@ dependencies = [
  "futures-sink",
  "futures-util",
  "http",
- "indexmap 2.0.1",
+ "indexmap",
  "slab",
  "tokio",
  "tokio-util",
@@ -2204,20 +2106,6 @@ dependencies = [
  "hashbrown 0.13.2",
 ]
 
-[[package]]
-name = "hdrhistogram"
-version = "7.5.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d"
-dependencies = [
- "base64 0.21.1",
- "byteorder",
- "crossbeam-channel",
- "flate2",
- "nom",
- "num-traits",
-]
-
 [[package]]
 name = "heapless"
 version = "0.8.0"
@@ -2484,16 +2372,6 @@ dependencies = [
  "serde",
 ]
 
-[[package]]
-name = "indexmap"
-version = "2.0.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e"
-dependencies = [
- "equivalent",
- "hashbrown 0.14.0",
-]
-
 [[package]]
 name = "infer"
 version = "0.2.3"
@@ -2545,12 +2423,6 @@ dependencies = [
  "web-sys",
 ]
 
-[[package]]
-name = "integer-encoding"
-version = "3.0.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02"
-
 [[package]]
 name = "io-lifetimes"
 version = "1.0.11"
@@ -2914,19 +2786,6 @@ dependencies = [
  "winapi",
 ]
 
-[[package]]
-name = "num"
-version = "0.4.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b05180d69e3da0e530ba2a1dae5110317e49e3b7f3d41be227dc5f92e49ee7af"
-dependencies = [
- "num-complex",
- "num-integer",
- "num-iter",
- "num-rational",
- "num-traits",
-]
-
 [[package]]
 name = "num-bigint"
 version = "0.4.3"
@@ -2938,15 +2797,6 @@ dependencies = [
  "num-traits",
 ]
 
-[[package]]
-name = "num-complex"
-version = "0.4.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1ba157ca0885411de85d6ca030ba7e2a83a28636056c7c699b07c8b6f7383214"
-dependencies = [
- "num-traits",
-]
-
 [[package]]
 name = "num-integer"
 version = "0.1.45"
@@ -2957,28 +2807,6 @@ dependencies = [
  "num-traits",
 ]
 
-[[package]]
-name = "num-iter"
-version = "0.1.43"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252"
-dependencies = [
- "autocfg",
- "num-integer",
- "num-traits",
-]
-
-[[package]]
-name = "num-rational"
-version = "0.4.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0"
-dependencies = [
- "autocfg",
- "num-integer",
- "num-traits",
-]
-
 [[package]]
 name = "num-traits"
 version = "0.2.15"
@@ -3172,7 +3000,7 @@ dependencies = [
  "fnv",
  "futures-channel",
  "futures-util",
- "indexmap 1.9.3",
+ "indexmap",
  "once_cell",
  "pin-project-lite",
  "thiserror",
@@ -3201,25 +3029,6 @@ dependencies = [
  "tokio-stream",
 ]
 
-[[package]]
-name = "ordered-float"
-version = "2.10.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c"
-dependencies = [
- "num-traits",
-]
-
-[[package]]
-name = "ordered-multimap"
-version = "0.7.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a4d6a8c22fc714f0c2373e6091bf6f5e9b37b1bc0b1184874b7e0a4e303d318f"
-dependencies = [
- "dlv-list",
- "hashbrown 0.14.0",
-]
-
 [[package]]
 name = "os_info"
 version = "3.7.0"
@@ -3248,29 +3057,6 @@ dependencies = [
  "sha2",
 ]
 
-[[package]]
-name = "pagebench"
-version = "0.1.0"
-dependencies = [
- "anyhow",
- "camino",
- "clap",
- "futures",
- "hdrhistogram",
- "humantime",
- "humantime-serde",
- "pageserver_api",
- "pageserver_client",
- "rand 0.8.5",
- "serde",
- "serde_json",
- "tokio",
- "tokio-util",
- "tracing",
- "utils",
- "workspace_hack",
-]
-
 [[package]]
 name = "pagectl"
 version = "0.1.0"
@@ -3382,7 +3168,6 @@ dependencies = [
  "const_format",
  "enum-map",
  "hex",
- "humantime-serde",
  "postgres_ffi",
  "rand 0.8.5",
  "serde",
@@ -3470,35 +3255,6 @@ dependencies = [
  "windows-targets 0.48.0",
 ]
 
-[[package]]
-name = "parquet"
-version = "49.0.0"
-source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
-dependencies = [
- "ahash",
- "bytes",
- "chrono",
- "hashbrown 0.14.0",
- "num",
- "num-bigint",
- "paste",
- "seq-macro",
- "thrift",
- "twox-hash",
- "zstd",
-]
-
-[[package]]
-name = "parquet_derive"
-version = "49.0.0"
-source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
-dependencies = [
- "parquet",
- "proc-macro2",
- "quote",
- "syn 2.0.32",
-]
-
 [[package]]
 name = "password-hash"
 version = "0.5.0"
@@ -3567,7 +3323,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4dd7d28ee937e54fe3080c91faa1c3a46c06de6252988a7f4592ba2310ef22a4"
 dependencies = [
  "fixedbitset",
- "indexmap 1.9.3",
+ "indexmap",
 ]
 
 [[package]]
@@ -3922,8 +3678,6 @@ dependencies = [
  "base64 0.13.1",
  "bstr",
  "bytes",
- "camino",
- "camino-tempfile",
  "chrono",
  "clap",
  "consumption_metrics",
@@ -3946,8 +3700,6 @@ dependencies = [
  "once_cell",
  "opentelemetry",
  "parking_lot 0.12.1",
- "parquet",
- "parquet_derive",
  "pbkdf2",
  "pin-project-lite",
  "postgres-native-tls",
@@ -3957,9 +3709,7 @@ dependencies = [
  "prometheus",
  "rand 0.8.5",
  "rcgen",
- "redis",
  "regex",
- "remote_storage",
  "reqwest",
  "reqwest-middleware",
  "reqwest-retry",
@@ -3991,7 +3741,6 @@ dependencies = [
  "url",
  "utils",
  "uuid",
- "walkdir",
  "webpki-roots 0.25.2",
  "workspace_hack",
  "x509-parser",
@@ -4121,32 +3870,6 @@ dependencies = [
  "yasna",
 ]
 
-[[package]]
-name = "redis"
-version = "0.24.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c580d9cbbe1d1b479e8d67cf9daf6a62c957e6846048408b80b43ac3f6af84cd"
-dependencies = [
- "async-trait",
- "bytes",
- "combine",
- "futures-util",
- "itoa",
- "percent-encoding",
- "pin-project-lite",
- "rustls",
- "rustls-native-certs",
- "rustls-pemfile",
- "rustls-webpki 0.101.7",
- "ryu",
- "sha1_smol",
- "socket2 0.4.9",
- "tokio",
- "tokio-rustls",
- "tokio-util",
- "url",
-]
-
 [[package]]
 name = "redox_syscall"
 version = "0.2.16"
@@ -4457,16 +4180,6 @@ dependencies = [
  "unicode-ident",
 ]
 
-[[package]]
-name = "rust-ini"
-version = "0.20.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3e0698206bcb8882bf2a9ecb4c1e7785db57ff052297085a6efd4fe42302068a"
-dependencies = [
- "cfg-if",
- "ordered-multimap",
-]
-
 [[package]]
 name = "rustc-demangle"
 version = "0.1.23"
@@ -4598,14 +4311,12 @@ dependencies = [
  "async-stream",
  "aws-config",
  "aws-sdk-s3",
- "aws-smithy-async",
  "bincode",
  "bytes",
  "chrono",
  "clap",
  "crc32c",
  "either",
- "futures",
  "futures-util",
  "hex",
  "histogram",
@@ -4644,7 +4355,6 @@ dependencies = [
  "clap",
  "const_format",
  "crc32c",
- "fail",
  "fs2",
  "futures",
  "git-version",
@@ -4668,7 +4378,6 @@ dependencies = [
  "serde",
  "serde_json",
  "serde_with",
- "sha2",
  "signal-hook",
  "storage_broker",
  "thiserror",
@@ -4875,12 +4584,6 @@ dependencies = [
  "uuid",
 ]
 
-[[package]]
-name = "seq-macro"
-version = "0.3.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
-
 [[package]]
 name = "serde"
 version = "1.0.183"
@@ -4973,7 +4676,7 @@ dependencies = [
  "base64 0.13.1",
  "chrono",
  "hex",
- "indexmap 1.9.3",
+ "indexmap",
  "serde",
  "serde_json",
  "serde_with_macros",
@@ -5003,12 +4706,6 @@ dependencies = [
  "digest",
 ]
 
-[[package]]
-name = "sha1_smol"
-version = "1.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012"
-
 [[package]]
 name = "sha2"
 version = "0.10.6"
@@ -5407,17 +5104,6 @@ dependencies = [
  "once_cell",
 ]
 
-[[package]]
-name = "thrift"
-version = "0.17.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09"
-dependencies = [
- "byteorder",
- "integer-encoding",
- "ordered-float",
-]
-
 [[package]]
 name = "time"
 version = "0.3.21"
@@ -5448,15 +5134,6 @@ dependencies = [
  "time-core",
 ]
 
-[[package]]
-name = "tiny-keccak"
-version = "2.0.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237"
-dependencies = [
- "crunchy",
-]
-
 [[package]]
 name = "tinytemplate"
 version = "1.2.1"
@@ -5674,7 +5351,7 @@ version = "0.19.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2380d56e8670370eee6566b0bfd4265f65b3f432e8c6d85623f728d4fa31f739"
 dependencies = [
- "indexmap 1.9.3",
+ "indexmap",
  "serde",
  "serde_spanned",
  "toml_datetime",
@@ -5766,7 +5443,7 @@ checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c"
 dependencies = [
  "futures-core",
  "futures-util",
- "indexmap 1.9.3",
+ "indexmap",
  "pin-project",
  "pin-project-lite",
  "rand 0.8.5",
@@ -5962,16 +5639,6 @@ dependencies = [
  "utf-8",
 ]
 
-[[package]]
-name = "twox-hash"
-version = "1.6.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675"
-dependencies = [
- "cfg-if",
- "static_assertions",
-]
-
 [[package]]
 name = "typenum"
 version = "1.16.0"
@@ -6110,7 +5777,6 @@ dependencies = [
  "chrono",
  "const_format",
  "criterion",
- "fail",
  "futures",
  "heapless",
  "hex",
@@ -6149,11 +5815,10 @@ dependencies = [
 
 [[package]]
 name = "uuid"
-version = "1.6.1"
+version = "1.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5e395fcf16a7a3d8127ec99782007af141946b4795001f876d54fb0d55978560"
+checksum = "345444e32442451b267fc254ae85a209c64be56d2890e601a0c37ff0c3c5ecd2"
 dependencies = [
- "atomic",
  "getrandom 0.2.11",
  "serde",
 ]
@@ -6636,12 +6301,9 @@ dependencies = [
  "futures-io",
  "futures-sink",
  "futures-util",
- "getrandom 0.2.11",
- "hashbrown 0.14.0",
  "hex",
  "hmac",
  "hyper",
- "indexmap 1.9.3",
  "itertools",
  "libc",
  "log",
@@ -6650,8 +6312,6 @@ dependencies = [
  "num-bigint",
  "num-integer",
  "num-traits",
- "once_cell",
- "parquet",
  "prost",
  "rand 0.8.5",
  "regex",
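Most of these Cargo.lock removals are transitive fallout from the Cargo.toml changes below: dropping redis, parquet/parquet_derive, and hdrhistogram prunes crates such as combine and thrift that, judging by the dependency blocks above, nothing else reached. Reverse-dependency queries make such chains visible:

```bash
# Inverted dependency trees: which crates pulled these in?
cargo tree -i combine   # before the change, only `redis` listed it
cargo tree -i thrift    # reached via `parquet`
```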
Cargo.toml (12 changed lines)
@@ -3,11 +3,9 @@ resolver = "2"
 members = [
 "compute_tools",
 "control_plane",
-"control_plane/attachment_service",
 "pageserver",
 "pageserver/ctl",
 "pageserver/client",
-"pageserver/pagebench",
 "proxy",
 "safekeeper",
 "storage_broker",
@@ -81,7 +79,6 @@ futures-util = "0.3"
 git-version = "0.3"
 hashbrown = "0.13"
 hashlink = "0.8.1"
-hdrhistogram = "7.5.2"
 hex = "0.4"
 hex-literal = "0.4"
 hmac = "0.12.1"
@@ -108,14 +105,11 @@ opentelemetry = "0.19.0"
 opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions = "0.11.0"
 parking_lot = "0.12"
-parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
-parquet_derive = "49.0.0"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
 rand = "0.8"
-redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] }
 regex = "1.10.2"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
 reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
@@ -165,7 +159,7 @@ tracing-error = "0.2.0"
 tracing-opentelemetry = "0.19.0"
 tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 url = "2.2"
-uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
+uuid = { version = "1.2", features = ["v4", "serde"] }
 walkdir = "2.3.2"
 webpki-roots = "0.25"
 x509-parser = "0.15"
@@ -219,10 +213,6 @@ tonic-build = "0.9"
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
 tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
 
-# bug fixes for UUID
-parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
-parquet_derive = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
-
 ################# Binary contents sections
 
 [profile.release]
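Note: the uuid pin moves from 1.6.1 with the v7 feature down to 1.2 with only v4, so anything relying on time-ordered UUIDv7 generation would fall back to random v4 IDs. A minimal sketch of that difference (illustrative usage only, not code from this diff; `Uuid::now_v7` is the API stabilized in uuid 1.6):

    use uuid::Uuid;

    fn main() {
        // Requires uuid >= 1.6 with the "v7" feature:
        // time-ordered, so values sort roughly by creation time.
        let v7 = Uuid::now_v7();

        // Available on both sides of this diff via the "v4" feature:
        // fully random, no ordering guarantees.
        let v4 = Uuid::new_v4();

        println!("{v7} {v4}");
    }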
@@ -3,7 +3,7 @@
 ### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used
 ### inside this image in the real deployments.
 ARG REPOSITORY=neondatabase
-ARG IMAGE=build-tools
+ARG IMAGE=rust
 ARG TAG=pinned
 
 # Build Postgres
@@ -1,166 +0,0 @@
-FROM debian:bullseye-slim
-
-# Add nonroot user
-RUN useradd -ms /bin/bash nonroot -b /home
-SHELL ["/bin/bash", "-c"]
-
-# System deps
-RUN set -e \
-&& apt update \
-&& apt install -y \
-autoconf \
-automake \
-bison \
-build-essential \
-ca-certificates \
-cmake \
-curl \
-flex \
-git \
-gnupg \
-gzip \
-jq \
-libcurl4-openssl-dev \
-libbz2-dev \
-libffi-dev \
-liblzma-dev \
-libncurses5-dev \
-libncursesw5-dev \
-libpq-dev \
-libreadline-dev \
-libseccomp-dev \
-libsqlite3-dev \
-libssl-dev \
-libstdc++-10-dev \
-libtool \
-libxml2-dev \
-libxmlsec1-dev \
-libxxhash-dev \
-lsof \
-make \
-netcat \
-net-tools \
-openssh-client \
-parallel \
-pkg-config \
-unzip \
-wget \
-xz-utils \
-zlib1g-dev \
-zstd \
-&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-
-# protobuf-compiler (protoc)
-ENV PROTOC_VERSION 25.1
-RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
-&& unzip -q protoc.zip -d protoc \
-&& mv protoc/bin/protoc /usr/local/bin/protoc \
-&& mv protoc/include/google /usr/local/include/google \
-&& rm -rf protoc.zip protoc
-
-# LLVM
-ENV LLVM_VERSION=17
-RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
-&& echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
-&& apt update \
-&& apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \
-&& bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
-&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-
-# PostgreSQL 14
-RUN curl -fsSL 'https://www.postgresql.org/media/keys/ACCC4CF8.asc' | apt-key add - \
-&& echo 'deb http://apt.postgresql.org/pub/repos/apt bullseye-pgdg main' > /etc/apt/sources.list.d/pgdg.list \
-&& apt update \
-&& apt install -y postgresql-client-14 \
-&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-
-# AWS CLI
-RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \
-&& unzip -q awscliv2.zip \
-&& ./aws/install \
-&& rm awscliv2.zip
-
-# Mold: A Modern Linker
-ENV MOLD_VERSION v2.4.0
-RUN set -e \
-&& git clone https://github.com/rui314/mold.git \
-&& mkdir mold/build \
-&& cd mold/build \
-&& git checkout ${MOLD_VERSION} \
-&& cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=clang++ .. \
-&& cmake --build . -j $(nproc) \
-&& cmake --install . \
-&& cd .. \
-&& rm -rf mold
-
-# LCOV
-# Build lcov from a fork:
-# It includes several bug fixes on top on v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master)
-# And patches from us:
-# - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz)
-RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \
-&& wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \
-&& echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992 lcov.tar.gz" | sha256sum --check \
-&& mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \
-&& cd lcov \
-&& make install \
-&& rm -rf ../lcov.tar.gz
-
-# Switch to nonroot user
-USER nonroot:nonroot
-WORKDIR /home/nonroot
-
-# Python
-ENV PYTHON_VERSION=3.9.2 \
-PYENV_ROOT=/home/nonroot/.pyenv \
-PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
-RUN set -e \
-&& cd $HOME \
-&& curl -sSO https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer \
-&& chmod +x pyenv-installer \
-&& ./pyenv-installer \
-&& export PYENV_ROOT=/home/nonroot/.pyenv \
-&& export PATH="$PYENV_ROOT/bin:$PATH" \
-&& export PATH="$PYENV_ROOT/shims:$PATH" \
-&& pyenv install ${PYTHON_VERSION} \
-&& pyenv global ${PYTHON_VERSION} \
-&& python --version \
-&& pip install --upgrade pip \
-&& pip --version \
-&& pip install pipenv wheel poetry
-
-# Switch to nonroot user (again)
-USER nonroot:nonroot
-WORKDIR /home/nonroot
-
-# Rust
-# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.75.0
-ENV RUSTUP_HOME="/home/nonroot/.rustup"
-ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
-RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
-chmod +x rustup-init && \
-./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \
-rm rustup-init && \
-export PATH="$HOME/.cargo/bin:$PATH" && \
-. "$HOME/.cargo/env" && \
-cargo --version && rustup --version && \
-rustup component add llvm-tools-preview rustfmt clippy && \
-cargo install --git https://github.com/paritytech/cachepot && \
-cargo install rustfilt && \
-cargo install cargo-hakari && \
-cargo install cargo-deny && \
-cargo install cargo-hack && \
-cargo install cargo-nextest && \
-rm -rf /home/nonroot/.cargo/registry && \
-rm -rf /home/nonroot/.cargo/git
-ENV RUSTC_WRAPPER=cachepot
-
-# Show versions
-RUN whoami \
-&& python --version \
-&& pip --version \
-&& cargo --version --verbose \
-&& rustup --version --verbose \
-&& rustc --version --verbose \
-&& clang --version
@@ -1,6 +1,6 @@
 ARG PG_VERSION
 ARG REPOSITORY=neondatabase
-ARG IMAGE=build-tools
+ARG IMAGE=rust
 ARG TAG=pinned
 ARG BUILD_TAG
 
@@ -48,29 +48,7 @@ RUN cd postgres && \
 echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
 echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
 echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \
-echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control && \
-# We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser.
-# In vanilla postgres this function is limited to Postgres role superuser.
-# In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases.
-# We could add the additional grant statements to the postgres repository but it would be hard to maintain,
-# whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
-# so we do it here.
-old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
-# the first loop is for pg_stat_statement extension version <= 1.6
-for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
-filename=$(basename "$file"); \
-if echo "$old_list" | grep -q -F "$filename"; then \
-echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
-fi; \
-done; \
-# the second loop is for pg_stat_statement extension versions >= 1.7,
-# where pg_stat_statement_reset() got 3 additional arguments
-for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
-filename=$(basename "$file"); \
-if ! echo "$old_list" | grep -q -F "$filename"; then \
-echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
-fi; \
-done
+echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control
 
 #########################################################################################
 #
@@ -883,10 +861,8 @@ FROM debian:bullseye-slim
 RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
 echo "postgres:test_console_pass" | chpasswd && \
 mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
-mkdir /var/db/postgres/pgbouncer && \
 chown -R postgres:postgres /var/db/postgres && \
 chmod 0750 /var/db/postgres/compute && \
-chmod 0750 /var/db/postgres/pgbouncer && \
 echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig && \
 # create folder for file cache
 mkdir -p -m 777 /neon/cache
@@ -1,7 +1,7 @@
 # First transient image to build compute_tools binaries
 # NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
 ARG REPOSITORY=neondatabase
-ARG IMAGE=build-tools
+ARG IMAGE=rust
 ARG TAG=pinned
 ARG BUILD_TAG
 
@@ -13,7 +13,6 @@ clap.workspace = true
 flate2.workspace = true
 futures.workspace = true
 hyper = { workspace = true, features = ["full"] }
-nix.workspace = true
 notify.workspace = true
 num_cpus.workspace = true
 opentelemetry.workspace = true
@@ -21,7 +20,6 @@ postgres.workspace = true
 regex.workspace = true
 serde.workspace = true
 serde_json.workspace = true
-signal-hook.workspace = true
 tar.workspace = true
 reqwest = { workspace = true, features = ["json"] }
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
@@ -41,4 +39,3 @@ remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
 vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
 zstd = "0.13"
 bytes = "1.0"
-rust-ini = "0.20.0"
@@ -31,29 +31,25 @@
 //! -C 'postgresql://cloud_admin@localhost/postgres' \
 //! -S /var/db/postgres/specs/current.json \
 //! -b /usr/local/bin/postgres \
-//! -r http://pg-ext-s3-gateway \
+//! -r http://pg-ext-s3-gateway
 //! ```
 //!
 use std::collections::HashMap;
 use std::fs::File;
 use std::path::Path;
 use std::process::exit;
-use std::sync::atomic::Ordering;
 use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
 use std::{thread, time::Duration};
 
 use anyhow::{Context, Result};
 use chrono::Utc;
 use clap::Arg;
-use nix::sys::signal::{kill, Signal};
-use signal_hook::consts::{SIGQUIT, SIGTERM};
-use signal_hook::{consts::SIGINT, iterator::Signals};
 use tracing::{error, info};
 use url::Url;
 
 use compute_api::responses::ComputeStatus;
 
-use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID};
+use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_tools::configurator::launch_configurator;
 use compute_tools::extension_server::get_pg_version;
 use compute_tools::http::api::launch_http_server;
@@ -69,13 +65,6 @@ const BUILD_TAG_DEFAULT: &str = "latest";
 fn main() -> Result<()> {
 init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
-
-let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
-thread::spawn(move || {
-for sig in signals.forever() {
-handle_exit_signal(sig);
-}
-});
 
 let build_tag = option_env!("BUILD_TAG")
 .unwrap_or(BUILD_TAG_DEFAULT)
 .to_string();
@@ -224,9 +213,9 @@ fn main() -> Result<()> {
 let compute = Arc::new(compute_node);
 
 // If this is a pooled VM, prewarm before starting HTTP server and becoming
-// available for binding. Prewarming helps Postgres start quicker later,
+// available for binding. Prewarming helps postgres start quicker later,
 // because QEMU will already have it's memory allocated from the host, and
-// the necessary binaries will already be cached.
+// the necessary binaries will alreaady be cached.
 if !spec_set {
 compute.prewarm_postgres()?;
 }
@@ -269,11 +258,6 @@ fn main() -> Result<()> {
 
 state.status = ComputeStatus::Init;
 compute.state_changed.notify_all();
-
-info!(
-"running compute with features: {:?}",
-state.pspec.as_ref().unwrap().spec.features
-);
 drop(state);
 
 // Launch remaining service threads
@@ -286,7 +270,7 @@ fn main() -> Result<()> {
 let pg = match compute.start_compute(extension_server_port) {
 Ok(pg) => Some(pg),
 Err(err) => {
-error!("could not start the compute node: {:#}", err);
+error!("could not start the compute node: {:?}", err);
 let mut state = compute.state.lock().unwrap();
 state.error = Some(format!("{:?}", err));
 state.status = ComputeStatus::Failed;
@@ -348,20 +332,13 @@ fn main() -> Result<()> {
 
 // Wait for the child Postgres process forever. In this state Ctrl+C will
 // propagate to Postgres and it will be shut down as well.
-if let Some((mut pg, logs_handle)) = pg {
+if let Some(mut pg) = pg {
 // Startup is finished, exit the startup tracing span
 drop(startup_context_guard);
 
 let ecode = pg
 .wait()
 .expect("failed to start waiting on Postgres process");
-PG_PID.store(0, Ordering::SeqCst);
-
-// Process has exited, so we can join the logs thread.
-let _ = logs_handle
-.join()
-.map_err(|e| tracing::error!("log thread panicked: {:?}", e));
-
 info!("Postgres exited with code {}, shutting down", ecode);
 exit_code = ecode.code()
 }
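Note: on the left-hand side, the caller receives both the Postgres child and the `JoinHandle` of the thread draining its piped stderr; waiting on the child first and then joining the reader guarantees the last log lines are flushed before shutdown. A minimal standalone sketch of that pattern (the names and the stand-in command here are illustrative, not the crate's API):

    use std::io::{BufRead, BufReader};
    use std::process::{Child, Command, Stdio};
    use std::thread::JoinHandle;

    // Spawn a child with piped stderr plus a thread that forwards each line.
    fn spawn_with_log_thread(mut cmd: Command) -> std::io::Result<(Child, JoinHandle<()>)> {
        let mut child = cmd.stderr(Stdio::piped()).spawn()?;
        let stderr = child.stderr.take().expect("stderr was piped above");
        let handle = std::thread::spawn(move || {
            for line in BufReader::new(stderr).lines().flatten() {
                eprintln!("PG:{line}"); // forward to our own stderr
            }
        });
        Ok((child, handle))
    }

    fn main() -> std::io::Result<()> {
        let mut cmd = Command::new("sh");
        cmd.args(["-c", "echo 'hello from the child' >&2"]);
        let (mut child, logs) = spawn_with_log_thread(cmd)?;
        let status = child.wait()?; // reap the child first...
        let _ = logs.join();        // ...then join the reader; EOF ends its loop
        println!("exited: {status}");
        Ok(())
    }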
@@ -518,24 +495,6 @@ fn cli() -> clap::Command {
 )
 }
-
-/// When compute_ctl is killed, send also termination signal to sync-safekeepers
-/// to prevent leakage. TODO: it is better to convert compute_ctl to async and
-/// wait for termination which would be easy then.
-fn handle_exit_signal(sig: i32) {
-info!("received {sig} termination signal");
-let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
-if ss_pid != 0 {
-let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
-kill(ss_pid, Signal::SIGTERM).ok();
-}
-let pg_pid = PG_PID.load(Ordering::SeqCst);
-if pg_pid != 0 {
-let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
-kill(pg_pid, Signal::SIGTERM).ok();
-}
-exit(1);
-}
 
 #[test]
 fn verify_cli() {
 cli().debug_assert()
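Note: the removed `handle_exit_signal` captures the old shutdown story: child PIDs are parked in atomics so a signal-handler thread can forward SIGTERM before exiting. A self-contained sketch of that mechanism, assuming the `signal-hook` and `nix` crates that the left-hand side pulls in:

    use std::sync::atomic::{AtomicU32, Ordering};
    use nix::sys::signal::{kill, Signal};
    use signal_hook::{consts::{SIGINT, SIGTERM}, iterator::Signals};

    static CHILD_PID: AtomicU32 = AtomicU32::new(0); // 0 means "no child running"

    fn main() -> anyhow::Result<()> {
        let mut signals = Signals::new([SIGINT, SIGTERM])?;
        std::thread::spawn(move || {
            for sig in signals.forever() {
                let pid = CHILD_PID.load(Ordering::SeqCst);
                if pid != 0 {
                    // Forward the signal so the child process is not leaked.
                    let _ = kill(nix::unistd::Pid::from_raw(pid as i32), Signal::SIGTERM);
                }
                std::process::exit(128 + sig);
            }
        });
        // ...spawn the child here and CHILD_PID.store(child.id(), Ordering::SeqCst)...
        Ok(())
    }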
@@ -6,10 +6,7 @@ use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::{Command, Stdio};
 use std::str::FromStr;
-use std::sync::atomic::AtomicU32;
-use std::sync::atomic::Ordering;
 use std::sync::{Condvar, Mutex, RwLock};
-use std::thread;
 use std::time::Instant;
 
 use anyhow::{Context, Result};
@@ -20,7 +17,7 @@ use futures::StreamExt;
 use postgres::{Client, NoTls};
 use tokio;
 use tokio_postgres;
-use tracing::{debug, error, info, instrument, warn};
+use tracing::{error, info, instrument, warn};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;
 
@@ -31,15 +28,11 @@ use utils::measured_stream::MeasuredReader;
 use remote_storage::{DownloadError, RemotePath};
 
 use crate::checker::create_availability_check_data;
-use crate::logger::inlinify;
 use crate::pg_helpers::*;
 use crate::spec::*;
 use crate::sync_sk::{check_if_synced, ping_safekeeper};
 use crate::{config, extension_server};
 
-pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0);
-pub static PG_PID: AtomicU32 = AtomicU32::new(0);
-
 /// Compute node info shared across several `compute_ctl` threads.
 pub struct ComputeNode {
 // Url type maintains proper escaping
@@ -276,7 +269,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
 $$;"#,
 roles_decl, database_decl,
 );
-info!("Neon superuser created: {}", inlinify(&query));
+info!("Neon superuser created:\n{}", &query);
 client
 .simple_query(&query)
 .map_err(|e| anyhow::anyhow!(e).context(query))?;
@@ -492,7 +485,7 @@ impl ComputeNode {
 pub fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
 let start_time = Utc::now();
 
-let mut sync_handle = maybe_cgexec(&self.pgbin)
+let sync_handle = maybe_cgexec(&self.pgbin)
 .args(["--sync-safekeepers"])
 .env("PGDATA", &self.pgdata) // we cannot use -D in this mode
 .envs(if let Some(storage_auth_token) = &storage_auth_token {
@@ -501,29 +494,15 @@ impl ComputeNode {
 vec![]
 })
 .stdout(Stdio::piped())
-.stderr(Stdio::piped())
 .spawn()
 .expect("postgres --sync-safekeepers failed to start");
-SYNC_SAFEKEEPERS_PID.store(sync_handle.id(), Ordering::SeqCst);
 
 // `postgres --sync-safekeepers` will print all log output to stderr and
-// final LSN to stdout. So we leave stdout to collect LSN, while stderr logs
-// will be collected in a child thread.
-let stderr = sync_handle
-.stderr
-.take()
-.expect("stderr should be captured");
-let logs_handle = handle_postgres_logs(stderr);
-
+// final LSN to stdout. So we pipe only stdout, while stderr will be automatically
+// redirected to the caller output.
 let sync_output = sync_handle
 .wait_with_output()
 .expect("postgres --sync-safekeepers failed");
-SYNC_SAFEKEEPERS_PID.store(0, Ordering::SeqCst);
-
-// Process has exited, so we can join the logs thread.
-let _ = logs_handle
-.join()
-.map_err(|e| tracing::error!("log thread panicked: {:?}", e));
 
 if !sync_output.status.success() {
 anyhow::bail!(
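Note: both sides of this hunk rely on the same contract: `postgres --sync-safekeepers` writes its logs to stderr and prints only the final LSN on stdout. A hedged sketch of collecting that LSN, using a stand-in shell command instead of the real binary (the actual code parses the string into an `Lsn`):

    use std::process::{Command, Stdio};

    fn main() -> anyhow::Result<()> {
        // Stand-in for `postgres --sync-safekeepers`: logs go to stderr,
        // the final LSN is the only thing printed on stdout.
        let child = Command::new("sh")
            .args(["-c", "echo 'syncing safekeepers...' >&2; echo '0/169C3C8'"])
            .stdout(Stdio::piped()) // capture stdout for the LSN
            .spawn()?;              // stderr stays inherited, as on the right-hand side
        let out = child.wait_with_output()?;
        anyhow::ensure!(out.status.success(), "sync-safekeepers failed");
        let lsn = String::from_utf8(out.stdout)?.trim().to_string();
        println!("synced at LSN {lsn}");
        Ok(())
    }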
@@ -661,12 +640,11 @@ impl ComputeNode {
 
 /// Start Postgres as a child process and manage DBs/roles.
 /// After that this will hang waiting on the postmaster process to exit.
-/// Returns a handle to the child process and a handle to the logs thread.
 #[instrument(skip_all)]
 pub fn start_postgres(
 &self,
 storage_auth_token: Option<String>,
-) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> {
+) -> Result<std::process::Child> {
 let pgdata_path = Path::new(&self.pgdata);
 
 // Run postgres as a child process.
@@ -677,18 +655,12 @@ impl ComputeNode {
 } else {
 vec![]
 })
-.stderr(Stdio::piped())
 .spawn()
 .expect("cannot start postgres process");
-PG_PID.store(pg.id(), Ordering::SeqCst);
 
-// Start a thread to collect logs from stderr.
-let stderr = pg.stderr.take().expect("stderr should be captured");
-let logs_handle = handle_postgres_logs(stderr);
-
 wait_for_postgres(&mut pg, pgdata_path)?;
 
-Ok((pg, logs_handle))
+Ok(pg)
 }
 
 /// Do initial configuration of the already started Postgres.
@@ -765,25 +737,6 @@ impl ComputeNode {
 pub fn reconfigure(&self) -> Result<()> {
 let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;
 
-if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings {
-info!("tuning pgbouncer");
-
-let rt = tokio::runtime::Builder::new_current_thread()
-.enable_all()
-.build()
-.expect("failed to create rt");
-
-// Spawn a thread to do the tuning,
-// so that we don't block the main thread that starts Postgres.
-let pgbouncer_settings = pgbouncer_settings.clone();
-let _handle = thread::spawn(move || {
-let res = rt.block_on(tune_pgbouncer(pgbouncer_settings));
-if let Err(err) = res {
-error!("error while tuning pgbouncer: {err:?}");
-}
-});
-}
-
 // Write new config
 let pgdata_path = Path::new(&self.pgdata);
 let postgresql_conf_path = pgdata_path.join("postgresql.conf");
@@ -827,10 +780,7 @@ impl ComputeNode {
 }
 
 #[instrument(skip_all)]
-pub fn start_compute(
-&self,
-extension_server_port: u16,
-) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> {
+pub fn start_compute(&self, extension_server_port: u16) -> Result<std::process::Child> {
 let compute_state = self.state.lock().unwrap().clone();
 let pspec = compute_state.pspec.as_ref().expect("spec must be set");
 info!(
@@ -841,26 +791,6 @@ impl ComputeNode {
 pspec.timeline_id,
 );
 
-// tune pgbouncer
-if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings {
-info!("tuning pgbouncer");
-
-let rt = tokio::runtime::Builder::new_current_thread()
-.enable_all()
-.build()
-.expect("failed to create rt");
-
-// Spawn a thread to do the tuning,
-// so that we don't block the main thread that starts Postgres.
-let pgbouncer_settings = pgbouncer_settings.clone();
-let _handle = thread::spawn(move || {
-let res = rt.block_on(tune_pgbouncer(pgbouncer_settings));
-if let Err(err) = res {
-error!("error while tuning pgbouncer: {err:?}");
-}
-});
-}
-
 info!(
 "start_compute spec.remote_extensions {:?}",
 pspec.spec.remote_extensions
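Note: the removed block shows the left side's idiom for calling async code from synchronous startup without blocking it: build a current-thread Tokio runtime and hand it to a detached thread. A small sketch of that idiom, assuming only a `tokio` dependency (the `tune` function here is a stand-in, not the real tuning code):

    async fn tune(settings: Vec<(String, String)>) -> anyhow::Result<()> {
        // stand-in for the real async tuning work
        for (k, v) in settings {
            println!("SET {k}={v}");
        }
        Ok(())
    }

    fn main() {
        let settings = vec![("pool_size".to_string(), "64".to_string())];
        // Current-thread runtime: cheap, and owned entirely by the helper
        // thread, so the synchronous caller is never blocked on async work.
        let rt = tokio::runtime::Builder::new_current_thread()
            .enable_all()
            .build()
            .expect("failed to create runtime");
        let handle = std::thread::spawn(move || {
            if let Err(err) = rt.block_on(tune(settings)) {
                eprintln!("error while tuning: {err:?}");
            }
        });
        handle.join().unwrap(); // the real code detaches instead of joining
    }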
@@ -895,7 +825,7 @@ impl ComputeNode {
 self.prepare_pgdata(&compute_state, extension_server_port)?;
 
 let start_time = Utc::now();
-let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?;
+let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
 
 let config_time = Utc::now();
 if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
@@ -945,17 +875,7 @@ impl ComputeNode {
 };
 info!(?metrics, "compute start finished");
 
-Ok(pg_process)
+Ok(pg)
-}
-
-/// Update the `last_active` in the shared state, but ensure that it's a more recent one.
-pub fn update_last_active(&self, last_active: Option<DateTime<Utc>>) {
-let mut state = self.state.lock().unwrap();
-// NB: `Some(<DateTime>)` is always greater than `None`.
-if last_active > state.last_active {
-state.last_active = last_active;
-debug!("set the last compute activity time to: {:?}", last_active);
-}
 }
 
 // Look for core dumps and collect backtraces.
@@ -38,9 +38,3 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
 
 Ok(())
 }
-
-/// Replace all newline characters with a special character to make it
-/// easier to grep for log messages.
-pub fn inlinify(s: &str) -> String {
-s.replace('\n', "\u{200B}")
-}
@@ -3,165 +3,97 @@ use std::{thread, time::Duration};
 
 use chrono::{DateTime, Utc};
 use postgres::{Client, NoTls};
-use tracing::{debug, error, info, warn};
+use tracing::{debug, info};
 
 use crate::compute::ComputeNode;
-use compute_api::responses::ComputeStatus;
-use compute_api::spec::ComputeFeature;
 
 const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
 
 // Spin in a loop and figure out the last activity time in the Postgres.
 // Then update it in the shared state. This function never errors out.
-// NB: the only expected panic is at `Mutex` unwrap(), all other errors
-// should be handled gracefully.
+// XXX: the only expected panic is at `RwLock` unwrap().
 fn watch_compute_activity(compute: &ComputeNode) {
 // Suppose that `connstr` doesn't change
 let connstr = compute.connstr.as_str();
 
-// During startup and configuration we connect to every Postgres database,
-// but we don't want to count this as some user activity. So wait until
-// the compute fully started before monitoring activity.
-wait_for_postgres_start(compute);
-
 // Define `client` outside of the loop to reuse existing connection if it's active.
 let mut client = Client::connect(connstr, NoTls);
 
-let mut sleep = false;
-let mut prev_active_time: Option<f64> = None;
-let mut prev_sessions: Option<i64> = None;
-
-if compute.has_feature(ComputeFeature::ActivityMonitorExperimental) {
-info!("starting experimental activity monitor for {}", connstr);
-} else {
-info!("starting activity monitor for {}", connstr);
-}
-
+info!("watching Postgres activity at {}", connstr);
 loop {
-// We use `continue` a lot, so it's more convenient to sleep at the top of the loop.
-// But skip the first sleep, so we can connect to Postgres immediately.
-if sleep {
-// Should be outside of the mutex lock to allow others to read while we sleep.
-thread::sleep(MONITOR_CHECK_INTERVAL);
-} else {
-sleep = true;
-}
-
+// Should be outside of the write lock to allow others to read while we sleep.
+thread::sleep(MONITOR_CHECK_INTERVAL);
 match &mut client {
 Ok(cli) => {
 if cli.is_closed() {
-info!("connection to Postgres is closed, trying to reconnect");
+info!("connection to postgres closed, trying to reconnect");
 
 // Connection is closed, reconnect and try again.
 client = Client::connect(connstr, NoTls);
 continue;
 }
 
-// This is a new logic, only enable if the feature flag is set.
-// TODO: remove this once we are sure that it works OR drop it altogether.
-if compute.has_feature(ComputeFeature::ActivityMonitorExperimental) {
-// First, check if the total active time or sessions across all databases has changed.
-// If it did, it means that user executed some queries. In theory, it can even go down if
-// some databases were dropped, but it's still a user activity.
-match get_database_stats(cli) {
-Ok((active_time, sessions)) => {
-let mut detected_activity = false;
-
-prev_active_time = match prev_active_time {
-Some(prev_active_time) => {
-if active_time != prev_active_time {
-detected_activity = true;
-}
-Some(active_time)
-}
-None => Some(active_time),
-};
-prev_sessions = match prev_sessions {
-Some(prev_sessions) => {
-if sessions != prev_sessions {
-detected_activity = true;
-}
-Some(sessions)
-}
-None => Some(sessions),
-};
-
-if detected_activity {
-// Update the last active time and continue, we don't need to
-// check backends state change.
-compute.update_last_active(Some(Utc::now()));
-continue;
-}
-}
-Err(e) => {
-error!("could not get database statistics: {}", e);
-continue;
-}
-}
-
-// Second, if database statistics is the same, check all backends state change,
-// maybe there is some with more recent activity. `get_backends_state_change()`
-// can return None or stale timestamp, so it's `compute.update_last_active()`
-// responsibility to check if the new timestamp is more recent than the current one.
-// This helps us to discover new sessions, that did nothing yet.
-match get_backends_state_change(cli) {
-Ok(last_active) => {
-compute.update_last_active(last_active);
-}
-Err(e) => {
-error!("could not get backends state change: {}", e);
-}
-}
-
-// Finally, if there are existing (logical) walsenders, do not suspend.
-//
-// walproposer doesn't currently show up in pg_stat_replication,
-// but protect if it will be
-let ws_count_query = "select count(*) from pg_stat_replication where application_name != 'walproposer';";
-match cli.query_one(ws_count_query, &[]) {
-Ok(r) => match r.try_get::<&str, i64>("count") {
-Ok(num_ws) => {
-if num_ws > 0 {
-compute.update_last_active(Some(Utc::now()));
-continue;
-}
-}
-Err(e) => {
-warn!("failed to parse walsenders count: {:?}", e);
-continue;
-}
-},
-Err(e) => {
-warn!("failed to get list of walsenders: {:?}", e);
-continue;
-}
-}
-//
-// Do not suspend compute if autovacuum is running
-//
-let autovacuum_count_query = "select count(*) from pg_stat_activity where backend_type = 'autovacuum worker'";
-match cli.query_one(autovacuum_count_query, &[]) {
-Ok(r) => match r.try_get::<&str, i64>("count") {
-Ok(num_workers) => {
-if num_workers > 0 {
-compute.update_last_active(Some(Utc::now()));
-continue;
-}
-}
-Err(e) => {
-warn!("failed to parse autovacuum workers count: {:?}", e);
-continue;
-}
-},
-Err(e) => {
-warn!("failed to get list of autovacuum workers: {:?}", e);
-continue;
-}
-}
-}
+// Get all running client backends except ourself, use RFC3339 DateTime format.
+let backends = cli
+.query(
+"SELECT state, to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"') AS state_change
+FROM pg_stat_activity
+WHERE backend_type = 'client backend'
+AND pid != pg_backend_pid()
+AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors?
+&[],
+);
+let mut last_active = compute.state.lock().unwrap().last_active;
+
+if let Ok(backs) = backends {
+let mut idle_backs: Vec<DateTime<Utc>> = vec![];
+
+for b in backs.into_iter() {
+let state: String = match b.try_get("state") {
+Ok(state) => state,
+Err(_) => continue,
+};
+
+if state == "idle" {
+let change: String = match b.try_get("state_change") {
+Ok(state_change) => state_change,
+Err(_) => continue,
+};
+let change = DateTime::parse_from_rfc3339(&change);
+match change {
+Ok(t) => idle_backs.push(t.with_timezone(&Utc)),
+Err(e) => {
+info!("cannot parse backend state_change DateTime: {}", e);
+continue;
+}
+}
+} else {
+// Found non-idle backend, so the last activity is NOW.
+// Save it and exit the for loop. Also clear the idle backend
+// `state_change` timestamps array as it doesn't matter now.
+last_active = Some(Utc::now());
+idle_backs.clear();
+break;
+}
+}
+
+// Get idle backend `state_change` with the max timestamp.
+if let Some(last) = idle_backs.iter().max() {
+last_active = Some(*last);
+}
+}
+
+// Update the last activity in the shared state if we got a more recent one.
+let mut state = compute.state.lock().unwrap();
+// NB: `Some(<DateTime>)` is always greater than `None`.
+if last_active > state.last_active {
+state.last_active = last_active;
+debug!("set the last compute activity time to: {:?}", last_active);
+}
 }
 Err(e) => {
-debug!("could not connect to Postgres: {}, retrying", e);
+debug!("cannot connect to postgres: {}, retrying", e);
 
 // Establish a new connection and try again.
 client = Client::connect(connstr, NoTls);
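Note: the left side's experimental monitor decides "activity happened" purely from deltas in `pg_stat_database`: if total `active_time` or `sessions` moved between two polls, a user did something. A compact sketch of that delta check, assuming the synchronous `postgres` and `anyhow` crates used elsewhere in this diff:

    use postgres::Client;

    /// Returns true when the totals moved since the previous poll.
    /// `prev` holds the (active_time, sessions) pair from the last call.
    fn detect_activity(cli: &mut Client, prev: &mut Option<(f64, i64)>) -> anyhow::Result<bool> {
        let row = cli.query_one(
            "SELECT coalesce(sum(active_time), 0.0)::float8 AS total_active_time,
                    coalesce(sum(sessions), 0)::bigint AS total_sessions
             FROM pg_stat_database
             WHERE datname NOT IN ('postgres', 'template0', 'template1');",
            &[],
        )?;
        let cur = (row.try_get("total_active_time")?, row.try_get("total_sessions")?);
        // Any change counts as activity, even a decrease (e.g. a dropped database).
        let changed = matches!(*prev, Some(p) if p != cur);
        *prev = Some(cur);
        Ok(changed)
    }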
@@ -170,124 +102,12 @@ fn watch_compute_activity(compute: &ComputeNode) {
 }
 }
 
-// Hang on condition variable waiting until the compute status is `Running`.
-fn wait_for_postgres_start(compute: &ComputeNode) {
-let mut state = compute.state.lock().unwrap();
-while state.status != ComputeStatus::Running {
-info!("compute is not running, waiting before monitoring activity");
-state = compute.state_changed.wait(state).unwrap();
-
-if state.status == ComputeStatus::Running {
-break;
-}
-}
-}
-
-// Figure out the total active time and sessions across all non-system databases.
-// Returned tuple is `(active_time, sessions)`.
-// It can return `0.0` active time or `0` sessions, which means no user databases exist OR
-// it was a start with skipped `pg_catalog` updates and user didn't do any queries
-// (or open any sessions) yet.
-fn get_database_stats(cli: &mut Client) -> anyhow::Result<(f64, i64)> {
-// Filter out `postgres` database as `compute_ctl` and other monitoring tools
-// like `postgres_exporter` use it to query Postgres statistics.
-// Use explicit 8 bytes type casts to match Rust types.
-let stats = cli.query_one(
-"SELECT coalesce(sum(active_time), 0.0)::float8 AS total_active_time,
-coalesce(sum(sessions), 0)::bigint AS total_sessions
-FROM pg_stat_database
-WHERE datname NOT IN (
-'postgres',
-'template0',
-'template1'
-);",
-&[],
-);
-let stats = match stats {
-Ok(stats) => stats,
-Err(e) => {
-return Err(anyhow::anyhow!("could not query active_time: {}", e));
-}
-};
-
-let active_time: f64 = match stats.try_get("total_active_time") {
-Ok(active_time) => active_time,
-Err(e) => return Err(anyhow::anyhow!("could not get total_active_time: {}", e)),
-};
-
-let sessions: i64 = match stats.try_get("total_sessions") {
-Ok(sessions) => sessions,
-Err(e) => return Err(anyhow::anyhow!("could not get total_sessions: {}", e)),
-};
-
-Ok((active_time, sessions))
-}
-
-// Figure out the most recent state change time across all client backends.
-// If there is currently active backend, timestamp will be `Utc::now()`.
-// It can return `None`, which means no client backends exist or we were
-// unable to parse the timestamp.
-fn get_backends_state_change(cli: &mut Client) -> anyhow::Result<Option<DateTime<Utc>>> {
-let mut last_active: Option<DateTime<Utc>> = None;
-// Get all running client backends except ourself, use RFC3339 DateTime format.
-let backends = cli.query(
-"SELECT state, to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"') AS state_change
-FROM pg_stat_activity
-WHERE backend_type = 'client backend'
-AND pid != pg_backend_pid()
-AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors?
-&[],
-);
-
-match backends {
-Ok(backs) => {
-let mut idle_backs: Vec<DateTime<Utc>> = vec![];
-
-for b in backs.into_iter() {
-let state: String = match b.try_get("state") {
-Ok(state) => state,
-Err(_) => continue,
-};
-
-if state == "idle" {
-let change: String = match b.try_get("state_change") {
-Ok(state_change) => state_change,
-Err(_) => continue,
-};
-let change = DateTime::parse_from_rfc3339(&change);
-match change {
-Ok(t) => idle_backs.push(t.with_timezone(&Utc)),
-Err(e) => {
-info!("cannot parse backend state_change DateTime: {}", e);
-continue;
-}
-}
-} else {
-// Found non-idle backend, so the last activity is NOW.
-// Return immediately, no need to check other backends.
-return Ok(Some(Utc::now()));
-}
-}
-
-// Get idle backend `state_change` with the max timestamp.
-if let Some(last) = idle_backs.iter().max() {
-last_active = Some(*last);
-}
-}
-Err(e) => {
-return Err(anyhow::anyhow!("could not query backends: {}", e));
-}
-}
-
-Ok(last_active)
-}
-
 /// Launch a separate compute monitor thread and return its `JoinHandle`.
-pub fn launch_monitor(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
-let compute = Arc::clone(compute);
+pub fn launch_monitor(state: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
+let state = Arc::clone(state);
 
 thread::Builder::new()
 .name("compute-monitor".into())
-.spawn(move || watch_compute_activity(&compute))
+.spawn(move || watch_compute_activity(&state))
 .expect("cannot launch compute monitor thread")
 }
@@ -6,17 +6,12 @@ use std::io::{BufRead, BufReader};
 use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::Child;
-use std::thread::JoinHandle;
 use std::time::{Duration, Instant};
 
 use anyhow::{bail, Result};
-use ini::Ini;
 use notify::{RecursiveMode, Watcher};
 use postgres::{Client, Transaction};
-use tokio::io::AsyncBufReadExt;
-use tokio::time::timeout;
-use tokio_postgres::NoTls;
-use tracing::{debug, error, info, instrument};
+use tracing::{debug, instrument};
 
 use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
 
@@ -364,172 +359,3 @@ pub fn create_pgdata(pgdata: &str) -> Result<()> {
|
|||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Update pgbouncer.ini with provided options
|
|
||||||
fn update_pgbouncer_ini(
|
|
||||||
pgbouncer_config: HashMap<String, String>,
|
|
||||||
pgbouncer_ini_path: &str,
|
|
||||||
) -> Result<()> {
|
|
||||||
let mut conf = Ini::load_from_file(pgbouncer_ini_path)?;
|
|
||||||
let section = conf.section_mut(Some("pgbouncer")).unwrap();
|
|
||||||
|
|
||||||
for (option_name, value) in pgbouncer_config.iter() {
|
|
||||||
section.insert(option_name, value);
|
|
||||||
debug!(
|
|
||||||
"Updating pgbouncer.ini with new values {}={}",
|
|
||||||
option_name, value
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
conf.write_to_file(pgbouncer_ini_path)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Tune pgbouncer.
|
|
||||||
/// 1. Apply new config using pgbouncer admin console
|
|
||||||
/// 2. Add new values to pgbouncer.ini to preserve them after restart
|
|
||||||
pub async fn tune_pgbouncer(pgbouncer_config: HashMap<String, String>) -> Result<()> {
|
|
||||||
let pgbouncer_connstr = if std::env::var_os("AUTOSCALING").is_some() {
|
|
||||||
// for VMs use pgbouncer specific way to connect to
|
|
||||||
// pgbouncer admin console without password
|
|
||||||
// when pgbouncer is running under the same user.
|
|
||||||
"host=/tmp port=6432 dbname=pgbouncer user=pgbouncer".to_string()
|
|
||||||
} else {
|
|
||||||
// for k8s use normal connection string with password
|
|
||||||
// to connect to pgbouncer admin console
|
|
||||||
let mut pgbouncer_connstr =
|
|
||||||
"host=localhost port=6432 dbname=pgbouncer user=postgres sslmode=disable".to_string();
|
|
||||||
if let Ok(pass) = std::env::var("PGBOUNCER_PASSWORD") {
|
|
||||||
pgbouncer_connstr.push_str(format!(" password={}", pass).as_str());
|
|
||||||
}
|
|
||||||
pgbouncer_connstr
|
|
||||||
};
|
|
||||||
|
|
||||||
info!(
|
|
||||||
"Connecting to pgbouncer with connection string: {}",
|
|
||||||
pgbouncer_connstr
|
|
||||||
);
|
|
||||||
|
|
||||||
// connect to pgbouncer, retrying several times
|
|
||||||
// because pgbouncer may not be ready yet
|
|
||||||
let mut retries = 3;
|
|
||||||
let client = loop {
|
|
||||||
match tokio_postgres::connect(&pgbouncer_connstr, NoTls).await {
|
|
            Ok((client, connection)) => {
                tokio::spawn(async move {
                    if let Err(e) = connection.await {
                        eprintln!("connection error: {}", e);
                    }
                });
                break client;
            }
            Err(e) => {
                if retries == 0 {
                    return Err(e.into());
                }
                error!("Failed to connect to pgbouncer: {}", e);
                retries -= 1;
                tokio::time::sleep(Duration::from_secs(1)).await;
            }
        }
    };

    // Apply new config
    for (option_name, value) in pgbouncer_config.iter() {
        let query = format!("SET {}={}", option_name, value);
        // keep this log line for debugging purposes
        info!("Applying pgbouncer setting change: {}", query);

        if let Err(err) = client.simple_query(&query).await {
            // Don't fail on error, just print it into the log
            error!(
                "Failed to apply pgbouncer setting change: {}, {}",
                query, err
            );
        };
    }

    // Save values to pgbouncer.ini so that they are preserved
    // after a pgbouncer restart.
    let pgbouncer_ini_path = if std::env::var_os("AUTOSCALING").is_some() {
        // in VMs we use /etc/pgbouncer.ini
        "/etc/pgbouncer.ini".to_string()
    } else {
        // In pods we use /var/db/postgres/pgbouncer/pgbouncer.ini;
        // this is a shared volume between the pgbouncer and postgres containers.
        // FIXME: fix permissions for this file
        "/var/db/postgres/pgbouncer/pgbouncer.ini".to_string()
    };
    update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?;

    Ok(())
}

/// Spawn a thread that will read Postgres logs from `stderr`, join multiline logs
/// and send them to the logger. In the future we may also want to add context to
/// these logs.
pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()> {
    std::thread::spawn(move || {
        let runtime = tokio::runtime::Builder::new_current_thread()
            .enable_all()
            .build()
            .expect("failed to build tokio runtime");

        let res = runtime.block_on(async move {
            let stderr = tokio::process::ChildStderr::from_std(stderr)?;
            handle_postgres_logs_async(stderr).await
        });
        if let Err(e) = res {
            tracing::error!("error while processing postgres logs: {}", e);
        }
    })
}

/// Read Postgres logs from `stderr` until EOF. The buffer is flushed when any of
/// the following holds:
/// - the next line starts with a timestamp (a new log record begins)
/// - EOF is reached
/// - no new line arrived within the flush timeout (100 ms)
async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Result<()> {
    let mut lines = tokio::io::BufReader::new(stderr).lines();
    let timeout_duration = Duration::from_millis(100);
    let ts_regex =
        regex::Regex::new(r"^\d+-\d{2}-\d{2} \d{2}:\d{2}:\d{2}").expect("regex is valid");

    let mut buf = vec![];
    loop {
        let next_line = timeout(timeout_duration, lines.next_line()).await;

        // We should flush lines from the buffer if we cannot continue reading a
        // multiline message.
        let should_flush_buf = match next_line {
            // Flush if the new line starts with a timestamp
            Ok(Ok(Some(ref line))) => ts_regex.is_match(line),
            // Flush on EOF, timeout or error
            _ => true,
        };

        if !buf.is_empty() && should_flush_buf {
            // Join the multiline message into a single line, separated by a unicode
            // Zero Width Space. The "PG:" prefix is used to distinguish postgres logs
            // from other logs.
            let combined = format!("PG:{}\n", buf.join("\u{200B}"));
            buf.clear();

            // Sync write to stderr to avoid interleaving with other logs
            use std::io::Write;
            let res = std::io::stderr().lock().write_all(combined.as_bytes());
            if let Err(e) = res {
                tracing::error!("error while writing to stderr: {}", e);
            }
        }

        // If this was not a timeout, append the line to the buffer
        if next_line.is_ok() {
            match next_line?? {
                Some(line) => buf.push(line),
                // EOF
                None => break,
            };
        }
    }

    Ok(())
}
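A hedged aside, not part of the diff: since handle_postgres_logs_async joins a
multiline Postgres record with U+200B and a "PG:" prefix, a downstream log
consumer can recover the original lines by reversing that framing. A minimal
sketch (the function name and sample input are illustrative):

/// Split a combined "PG:"-prefixed record back into its original lines.
fn split_pg_record(combined: &str) -> Option<Vec<&str>> {
    // Only records produced by the joiner above carry the "PG:" prefix.
    let body = combined.strip_prefix("PG:")?;
    // U+200B (Zero Width Space) was the join separator, so it is a safe split point.
    Some(body.trim_end_matches('\n').split('\u{200B}').collect())
}

fn main() {
    let rec = "PG:2024-01-01 00:00:00 ERROR: boom\u{200B}DETAIL: second line\n";
    assert_eq!(
        split_pg_record(rec).unwrap(),
        vec!["2024-01-01 00:00:00 ERROR: boom", "DETAIL: second line"]
    );
}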
@@ -9,7 +9,6 @@ use reqwest::StatusCode;
 use tracing::{error, info, info_span, instrument, span_enabled, warn, Level};

 use crate::config;
-use crate::logger::inlinify;
 use crate::params::PG_HBA_ALL_MD5;
 use crate::pg_helpers::*;

@@ -190,20 +189,18 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {

     // Print a list of existing Postgres roles (only in debug mode)
     if span_enabled!(Level::INFO) {
-        let mut vec = Vec::new();
+        info!("postgres roles:");
         for r in &existing_roles {
-            vec.push(format!(
-                "{}:{}",
+            info!(
+                " - {}:{}",
                 r.name,
                 if r.encrypted_password.is_some() {
                     "[FILTERED]"
                 } else {
                     "(null)"
                 }
-            ));
+            );
         }
-
-        info!("postgres roles (total {}): {:?}", vec.len(), vec);
     }

     // Process delta operations first

@@ -241,10 +238,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
     // Refresh Postgres roles info to handle possible roles renaming
     let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;

-    info!(
-        "handling cluster spec roles (total {})",
-        spec.cluster.roles.len()
-    );
+    info!("cluster spec roles:");
     for role in &spec.cluster.roles {
         let name = &role.name;
         // XXX: with a limited number of roles it is fine, but consider making it a HashMap

@@ -307,7 +301,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                 "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
                 name.pg_quote()
             );
-            info!("running role create query: '{}'", &query);
+            info!("role create query: '{}'", &query);
             query.push_str(&role.to_pg_options());
             xact.execute(query.as_str(), &[])?;
         }

@@ -324,7 +318,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                 RoleAction::Create => " -> create",
                 RoleAction::Update => " -> update",
             };
             info!(" - {}:{}{}", name, pwd, action_str);
         }
     }

@@ -433,11 +427,10 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {

     // Print a list of existing Postgres databases (only in debug mode)
     if span_enabled!(Level::INFO) {
-        let mut vec = Vec::new();
+        info!("postgres databases:");
         for (dbname, db) in &existing_dbs {
-            vec.push(format!("{}:{}", dbname, db.owner));
+            info!(" {}:{}", dbname, db.owner);
         }
-        info!("postgres databases (total {}): {:?}", vec.len(), vec);
     }

     // Process delta operations first

@@ -509,10 +502,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
     // Refresh Postgres databases info to handle possible renames
     let existing_dbs = get_existing_dbs(client)?;

-    info!(
-        "handling cluster spec databases (total {})",
-        spec.cluster.databases.len()
-    );
+    info!("cluster spec databases:");
     for db in &spec.cluster.databases {
         let name = &db.name;
         let pg_db = existing_dbs.get(name);

@@ -571,7 +561,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                 DatabaseAction::Create => " -> create",
                 DatabaseAction::Update => " -> update",
             };
             info!(" - {}:{}{}", db.name, db.owner, action_str);
         }
     }

@@ -672,11 +662,7 @@ pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) ->
             $$;"
             .to_string();

-            info!(
-                "grant query for db {} : {}",
-                &db.name,
-                inlinify(&grant_query)
-            );
+            info!("grant query for db {} : {}", &db.name, &grant_query);
             db_client.simple_query(&grant_query)?;
         }

@@ -1,32 +0,0 @@
[package]
name = "attachment_service"
version = "0.1.0"
edition.workspace = true
license.workspace = true

[dependencies]
anyhow.workspace = true
camino.workspace = true
clap.workspace = true
futures.workspace = true
git-version.workspace = true
hyper.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true
postgres_connection.workspace = true
serde.workspace = true
serde_json.workspace = true
thiserror.workspace = true
tokio.workspace = true
tokio-util.workspace = true
tracing.workspace = true

# TODO: remove this after DB persistence is added, it is only used for
# a parsing function when loading pageservers from neon_local LocalEnv
postgres_backend.workspace = true

utils = { path = "../../libs/utils/" }
metrics = { path = "../../libs/metrics/" }
control_plane = { path = ".." }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

@@ -1,116 +0,0 @@
use std::collections::HashMap;

use control_plane::endpoint::ComputeControlPlane;
use control_plane::local_env::LocalEnv;
use pageserver_api::shard::{ShardCount, ShardIndex, TenantShardId};
use postgres_connection::parse_host_port;
use utils::id::{NodeId, TenantId};

pub(super) struct ComputeHookTenant {
    shards: Vec<(ShardIndex, NodeId)>,
}

impl ComputeHookTenant {
    pub(super) async fn maybe_reconfigure(&mut self, tenant_id: TenantId) -> anyhow::Result<()> {
        // Find the highest shard count and drop any shards that aren't
        // for that shard count.
        let shard_count = self.shards.iter().map(|(k, _v)| k.shard_count).max();
        let Some(shard_count) = shard_count else {
            // No shards, nothing to do.
            tracing::info!("ComputeHookTenant::maybe_reconfigure: no shards");
            return Ok(());
        };

        self.shards.retain(|(k, _v)| k.shard_count == shard_count);
        self.shards
            .sort_by_key(|(shard, _node_id)| shard.shard_number);

        if self.shards.len() == shard_count.0 as usize || shard_count == ShardCount(0) {
            // We have pageservers for all the shards: proceed to reconfigure the compute
            let env = match LocalEnv::load_config() {
                Ok(e) => e,
                Err(e) => {
                    tracing::warn!(
                        "Couldn't load neon_local config, skipping compute update ({e})"
                    );
                    return Ok(());
                }
            };
            let cplane = ComputeControlPlane::load(env.clone())
                .expect("Error loading compute control plane");

            let compute_pageservers = self
                .shards
                .iter()
                .map(|(_shard, node_id)| {
                    let ps_conf = env
                        .get_pageserver_conf(*node_id)
                        .expect("Unknown pageserver");
                    let (pg_host, pg_port) = parse_host_port(&ps_conf.listen_pg_addr)
                        .expect("Unable to parse listen_pg_addr");
                    (pg_host, pg_port.unwrap_or(5432))
                })
                .collect::<Vec<_>>();

            for (endpoint_name, endpoint) in &cplane.endpoints {
                if endpoint.tenant_id == tenant_id && endpoint.status() == "running" {
                    tracing::info!("🔁 Reconfiguring endpoint {}", endpoint_name);
                    endpoint.reconfigure(compute_pageservers.clone()).await?;
                }
            }
        } else {
            tracing::info!(
                "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
                self.shards.len(),
                shard_count.0
            );
        }

        Ok(())
    }
}

/// The compute hook is a destination for notifications about changes to tenant:pageserver
/// mapping. It aggregates updates for the shards in a tenant, and when appropriate reconfigures
/// the compute connection string.
pub(super) struct ComputeHook {
    state: tokio::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
}

impl ComputeHook {
    pub(super) fn new() -> Self {
        Self {
            state: Default::default(),
        }
    }

    pub(super) async fn notify(
        &self,
        tenant_shard_id: TenantShardId,
        node_id: NodeId,
    ) -> anyhow::Result<()> {
        tracing::info!("ComputeHook::notify: {}->{}", tenant_shard_id, node_id);
        let mut locked = self.state.lock().await;
        let entry = locked
            .entry(tenant_shard_id.tenant_id)
            .or_insert_with(|| ComputeHookTenant { shards: Vec::new() });

        let shard_index = ShardIndex {
            shard_count: tenant_shard_id.shard_count,
            shard_number: tenant_shard_id.shard_number,
        };

        let mut set = false;
        for (existing_shard, existing_node) in &mut entry.shards {
            if *existing_shard == shard_index {
                *existing_node = node_id;
                set = true;
            }
        }
        if !set {
            entry.shards.push((shard_index, node_id));
        }

        entry.maybe_reconfigure(tenant_shard_id.tenant_id).await
    }
}
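An illustrative aside, not part of the diff: the aggregation rule in
ComputeHookTenant::maybe_reconfigure above can be stated in a few lines. This
self-contained toy (local types and hypothetical data, not the real ShardIndex
or NodeId) keeps only the shards of the highest shard count and signals
readiness once one entry per shard is present:

/// (shard_count, shard_number, node_id) as plain integers.
type ShardEntry = (u8, u8, u64);

/// Returns the node per shard if the tenant is ready to reconfigure, else None.
fn ready_to_reconfigure(shards: &mut Vec<ShardEntry>) -> Option<Vec<u64>> {
    let max_count = shards.iter().map(|(count, _, _)| *count).max()?;
    // Drop stale entries left over from before a shard split changed the count.
    shards.retain(|(count, _, _)| *count == max_count);
    shards.sort_by_key(|(_, number, _)| *number);
    // Unsharded tenants use shard_count == 0 and are always ready.
    if shards.len() == max_count as usize || max_count == 0 {
        Some(shards.iter().map(|(_, _, node)| *node).collect())
    } else {
        None
    }
}

fn main() {
    let mut shards = vec![(2, 0, 101), (2, 1, 102), (1, 0, 100)];
    // The old shard-count-1 entry is discarded; both new shards are present.
    assert_eq!(ready_to_reconfigure(&mut shards), Some(vec![101, 102]));
}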
@@ -1,218 +0,0 @@
use crate::reconciler::ReconcileError;
use crate::service::Service;
use hyper::{Body, Request, Response};
use hyper::{StatusCode, Uri};
use pageserver_api::models::{TenantCreateRequest, TimelineCreateRequest};
use pageserver_api::shard::TenantShardId;
use std::sync::Arc;
use utils::auth::SwappableJwtAuth;
use utils::http::endpoint::{auth_middleware, request_span};
use utils::http::request::parse_request_param;
use utils::id::TenantId;

use utils::{
    http::{
        endpoint::{self},
        error::ApiError,
        json::{json_request, json_response},
        RequestExt, RouterBuilder,
    },
    id::NodeId,
};

use pageserver_api::control_api::{ReAttachRequest, ValidateRequest};

use control_plane::attachment_service::{
    AttachHookRequest, InspectRequest, NodeConfigureRequest, NodeRegisterRequest,
    TenantShardMigrateRequest,
};

/// State available to HTTP request handlers
#[derive(Clone)]
pub struct HttpState {
    service: Arc<crate::service::Service>,
    auth: Option<Arc<SwappableJwtAuth>>,
    allowlist_routes: Vec<Uri>,
}

impl HttpState {
    pub fn new(service: Arc<crate::service::Service>, auth: Option<Arc<SwappableJwtAuth>>) -> Self {
        let allowlist_routes = ["/status"]
            .iter()
            .map(|v| v.parse().unwrap())
            .collect::<Vec<_>>();
        Self {
            service,
            auth,
            allowlist_routes,
        }
    }
}

#[inline(always)]
fn get_state(request: &Request<Body>) -> &HttpState {
    request
        .data::<Arc<HttpState>>()
        .expect("unknown state type")
        .as_ref()
}

/// Pageserver calls into this on startup, to learn which tenants it should attach
async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
    let state = get_state(&req);
    json_response(
        StatusCode::OK,
        state
            .service
            .re_attach(reattach_req)
            .await
            .map_err(ApiError::InternalServerError)?,
    )
}

/// Pageserver calls into this before doing deletions, to confirm that it still
/// holds the latest generation for the tenants with deletions enqueued
async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let validate_req = json_request::<ValidateRequest>(&mut req).await?;
    let state = get_state(&req);
    json_response(StatusCode::OK, state.service.validate(validate_req))
}

/// Call into this before attaching a tenant to a pageserver, to acquire a generation number
/// (in the real control plane this is unnecessary, because the same program is managing
/// generation numbers and doing attachments).
async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
    let state = get_state(&req);

    json_response(
        StatusCode::OK,
        state
            .service
            .attach_hook(attach_req)
            .await
            .map_err(ApiError::InternalServerError)?,
    )
}

async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let inspect_req = json_request::<InspectRequest>(&mut req).await?;

    let state = get_state(&req);

    json_response(StatusCode::OK, state.service.inspect(inspect_req))
}

async fn handle_tenant_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
    let state = get_state(&req);
    json_response(
        StatusCode::OK,
        state.service.tenant_create(create_req).await?,
    )
}

async fn handle_tenant_timeline_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;

    let state = get_state(&req);
    json_response(
        StatusCode::OK,
        state
            .service
            .tenant_timeline_create(tenant_id, create_req)
            .await?,
    )
}

async fn handle_tenant_locate(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    let state = get_state(&req);

    json_response(StatusCode::OK, state.service.tenant_locate(tenant_id)?)
}

async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let register_req = json_request::<NodeRegisterRequest>(&mut req).await?;
    let state = get_state(&req);
    state.service.node_register(register_req).await?;
    json_response(StatusCode::OK, ())
}

async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let node_id: NodeId = parse_request_param(&req, "node_id")?;
    let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
    if node_id != config_req.node_id {
        return Err(ApiError::BadRequest(anyhow::anyhow!(
            "Path and body node_id differ"
        )));
    }
    let state = get_state(&req);

    json_response(StatusCode::OK, state.service.node_configure(config_req)?)
}

async fn handle_tenant_shard_migrate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
    let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
    let state = get_state(&req);
    json_response(
        StatusCode::OK,
        state
            .service
            .tenant_shard_migrate(tenant_shard_id, migrate_req)
            .await?,
    )
}

/// Status endpoint is just used for checking that our HTTP listener is up
async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
    json_response(StatusCode::OK, ())
}

impl From<ReconcileError> for ApiError {
    fn from(value: ReconcileError) -> Self {
        ApiError::Conflict(format!("Reconciliation error: {}", value))
    }
}

pub fn make_router(
    service: Arc<Service>,
    auth: Option<Arc<SwappableJwtAuth>>,
) -> RouterBuilder<hyper::Body, ApiError> {
    let mut router = endpoint::make_router();
    if auth.is_some() {
        router = router.middleware(auth_middleware(|request| {
            let state = get_state(request);
            if state.allowlist_routes.contains(request.uri()) {
                None
            } else {
                state.auth.as_deref()
            }
        }))
    }

    router
        .data(Arc::new(HttpState::new(service, auth)))
        .get("/status", |r| request_span(r, handle_status))
        .post("/re-attach", |r| request_span(r, handle_re_attach))
        .post("/validate", |r| request_span(r, handle_validate))
        .post("/attach-hook", |r| request_span(r, handle_attach_hook))
        .post("/inspect", |r| request_span(r, handle_inspect))
        .post("/node", |r| request_span(r, handle_node_register))
        .put("/node/:node_id/config", |r| {
            request_span(r, handle_node_configure)
        })
        .post("/tenant", |r| request_span(r, handle_tenant_create))
        .post("/tenant/:tenant_id/timeline", |r| {
            request_span(r, handle_tenant_timeline_create)
        })
        .get("/tenant/:tenant_id/locate", |r| {
            request_span(r, handle_tenant_locate)
        })
        .put("/tenant/:tenant_shard_id/migrate", |r| {
            request_span(r, handle_tenant_shard_migrate)
        })
}
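A hedged usage note, not part of the diff: the router above allowlists
GET /status, so a liveness probe needs no JWT. A minimal client sketch (the
listen address is hypothetical; assumes the reqwest crate with its "blocking"
feature enabled):

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // /status is in allowlist_routes, so this succeeds even when auth is enabled.
    let resp = reqwest::blocking::get("http://127.0.0.1:1234/status")?;
    assert!(resp.status().is_success());
    println!("attachment service is up: {}", resp.status());
    Ok(())
}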
@@ -1,57 +0,0 @@
use serde::{Deserialize, Serialize};
use utils::seqwait::MonotonicCounter;

mod compute_hook;
pub mod http;
mod node;
pub mod persistence;
mod reconciler;
mod scheduler;
pub mod service;
mod tenant_state;

#[derive(Clone, Serialize, Deserialize)]
enum PlacementPolicy {
    /// Cheapest way to attach a tenant: just one pageserver, no secondary
    Single,
    /// Production-ready way to attach a tenant: one attached pageserver and
    /// some number of secondaries.
    Double(usize),
}

#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone)]
struct Sequence(u64);

impl Sequence {
    fn initial() -> Self {
        Self(0)
    }
}

impl std::fmt::Display for Sequence {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(f, "{}", self.0)
    }
}

impl MonotonicCounter<Sequence> for Sequence {
    fn cnt_advance(&mut self, v: Sequence) {
        assert!(*self <= v);
        *self = v;
    }
    fn cnt_value(&self) -> Sequence {
        *self
    }
}

impl Sequence {
    fn next(&self) -> Sequence {
        Sequence(self.0 + 1)
    }
}

impl Default for PlacementPolicy {
    fn default() -> Self {
        PlacementPolicy::Double(1)
    }
}
@@ -1,100 +0,0 @@
/// The attachment service mimics the aspects of the control plane API
/// that are required for a pageserver to operate.
///
/// This enables running & testing pageservers without a full-blown
/// deployment of the Neon cloud platform.
///
use anyhow::anyhow;
use attachment_service::http::make_router;
use attachment_service::persistence::Persistence;
use attachment_service::service::{Config, Service};
use camino::Utf8PathBuf;
use clap::Parser;
use metrics::launch_timestamp::LaunchTimestamp;
use std::sync::Arc;
use utils::auth::{JwtAuth, SwappableJwtAuth};
use utils::logging::{self, LogFormat};
use utils::signals::{ShutdownSignals, Signal};

use utils::{project_build_tag, project_git_version, tcp_listener};

project_git_version!(GIT_VERSION);
project_build_tag!(BUILD_TAG);

#[derive(Parser)]
#[command(author, version, about, long_about = None)]
#[command(arg_required_else_help(true))]
struct Cli {
    /// Host and port to listen on, like `127.0.0.1:1234`
    #[arg(short, long)]
    listen: std::net::SocketAddr,

    /// Path to public key for JWT authentication of clients
    #[arg(long)]
    public_key: Option<camino::Utf8PathBuf>,

    /// Token for authenticating this service with the pageservers it controls
    #[arg(short, long)]
    jwt_token: Option<String>,

    /// Path to the .json file to store state (will be created if it doesn't exist)
    #[arg(short, long)]
    path: Utf8PathBuf,
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let launch_ts = Box::leak(Box::new(LaunchTimestamp::generate()));

    logging::init(
        LogFormat::Plain,
        logging::TracingErrorLayerEnablement::Disabled,
        logging::Output::Stdout,
    )?;

    let args = Cli::parse();
    tracing::info!(
        "version: {}, launch_timestamp: {}, build_tag {}, state at {}, listening on {}",
        GIT_VERSION,
        launch_ts.to_string(),
        BUILD_TAG,
        args.path,
        args.listen
    );

    let config = Config {
        jwt_token: args.jwt_token,
    };

    let persistence = Arc::new(Persistence::new(&args.path).await);

    let service = Service::spawn(config, persistence).await?;

    let http_listener = tcp_listener::bind(args.listen)?;

    let auth = if let Some(public_key_path) = &args.public_key {
        let jwt_auth = JwtAuth::from_key_path(public_key_path)?;
        Some(Arc::new(SwappableJwtAuth::new(jwt_auth)))
    } else {
        None
    };
    let router = make_router(service, auth)
        .build()
        .map_err(|err| anyhow!(err))?;
    let service = utils::http::RouterService::new(router).unwrap();
    let server = hyper::Server::from_tcp(http_listener)?.serve(service);

    tracing::info!("Serving on {0}", args.listen);

    tokio::task::spawn(server);

    ShutdownSignals::handle(|signal| match signal {
        Signal::Interrupt | Signal::Terminate | Signal::Quit => {
            tracing::info!("Got {}. Terminating", signal.name());
            // We're just a test helper: no graceful shutdown.
            std::process::exit(0);
        }
    })?;

    Ok(())
}
@@ -1,37 +0,0 @@
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
use utils::id::NodeId;

#[derive(Clone)]
pub(crate) struct Node {
    pub(crate) id: NodeId,

    pub(crate) availability: NodeAvailability,
    pub(crate) scheduling: NodeSchedulingPolicy,

    pub(crate) listen_http_addr: String,
    pub(crate) listen_http_port: u16,

    pub(crate) listen_pg_addr: String,
    pub(crate) listen_pg_port: u16,
}

impl Node {
    pub(crate) fn base_url(&self) -> String {
        format!("http://{}:{}", self.listen_http_addr, self.listen_http_port)
    }

    /// Is this node eligible to have work scheduled onto it?
    pub(crate) fn may_schedule(&self) -> bool {
        match self.availability {
            NodeAvailability::Active => {}
            NodeAvailability::Offline => return false,
        }

        match self.scheduling {
            NodeSchedulingPolicy::Active => true,
            NodeSchedulingPolicy::Draining => false,
            NodeSchedulingPolicy::Filling => true,
            NodeSchedulingPolicy::Pause => false,
        }
    }
}
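A self-contained sketch, not part of the diff: the scheduling gate in
Node::may_schedule above reduces to "Active availability, and a policy of
Active or Filling". The enums here are local stand-ins for the control_plane
types, purely for illustration:

#[derive(Clone, Copy)]
enum Availability { Active, Offline }
#[derive(Clone, Copy)]
enum SchedulingPolicy { Active, Draining, Filling, Pause }

fn may_schedule(availability: Availability, policy: SchedulingPolicy) -> bool {
    // Offline nodes never take work; Draining and Pause nodes keep what they
    // have but accept nothing new.
    matches!(availability, Availability::Active)
        && matches!(policy, SchedulingPolicy::Active | SchedulingPolicy::Filling)
}

fn main() {
    assert!(may_schedule(Availability::Active, SchedulingPolicy::Filling));
    assert!(!may_schedule(Availability::Offline, SchedulingPolicy::Active));
    assert!(!may_schedule(Availability::Active, SchedulingPolicy::Pause));
}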
@@ -1,272 +0,0 @@
use std::{collections::HashMap, str::FromStr};

use camino::{Utf8Path, Utf8PathBuf};
use control_plane::{
    attachment_service::{NodeAvailability, NodeSchedulingPolicy},
    local_env::LocalEnv,
};
use pageserver_api::{
    models::TenantConfig,
    shard::{ShardCount, ShardNumber, TenantShardId},
};
use postgres_connection::parse_host_port;
use serde::{Deserialize, Serialize};
use utils::{
    generation::Generation,
    id::{NodeId, TenantId},
};

use crate::{node::Node, PlacementPolicy};

/// Placeholder for storage. This will be replaced with a database client.
pub struct Persistence {
    state: std::sync::Mutex<PersistentState>,
}

// Top level state available to all HTTP handlers
#[derive(Serialize, Deserialize)]
struct PersistentState {
    tenants: HashMap<TenantShardId, TenantShardPersistence>,

    #[serde(skip)]
    path: Utf8PathBuf,
}

/// A convenience for serializing the state inside a sync lock, and then
/// writing it to disk outside of the lock. This will go away when switching
/// to a database backend.
struct PendingWrite {
    bytes: Vec<u8>,
    path: Utf8PathBuf,
}

impl PendingWrite {
    async fn commit(&self) -> anyhow::Result<()> {
        tokio::fs::write(&self.path, &self.bytes).await?;

        Ok(())
    }
}

impl PersistentState {
    fn save(&self) -> PendingWrite {
        PendingWrite {
            bytes: serde_json::to_vec(self).expect("Serialization error"),
            path: self.path.clone(),
        }
    }

    async fn load(path: &Utf8Path) -> anyhow::Result<Self> {
        let bytes = tokio::fs::read(path).await?;
        let mut decoded = serde_json::from_slice::<Self>(&bytes)?;
        decoded.path = path.to_owned();

        for (tenant_id, tenant) in &mut decoded.tenants {
            // Backward compat: for an old attachments.json from before PR #6251, replace
            // empty strings with proper defaults.
            if tenant.tenant_id.is_empty() {
                tenant.tenant_id = format!("{}", tenant_id);
                tenant.config = serde_json::to_string(&TenantConfig::default())?;
                tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())?;
            }
        }

        Ok(decoded)
    }

    async fn load_or_new(path: &Utf8Path) -> Self {
        match Self::load(path).await {
            Ok(s) => {
                tracing::info!("Loaded state file at {}", path);
                s
            }
            Err(e)
                if e.downcast_ref::<std::io::Error>()
                    .map(|e| e.kind() == std::io::ErrorKind::NotFound)
                    .unwrap_or(false) =>
            {
                tracing::info!("Will create state file at {}", path);
                Self {
                    tenants: HashMap::new(),
                    path: path.to_owned(),
                }
            }
            Err(e) => {
                panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path)
            }
        }
    }
}

impl Persistence {
    pub async fn new(path: &Utf8Path) -> Self {
        let state = PersistentState::load_or_new(path).await;
        Self {
            state: std::sync::Mutex::new(state),
        }
    }

    /// When registering a node, persist it so that on next start we will be able to
    /// iterate over known nodes to synchronize their tenant shard states with our observed state.
    pub(crate) async fn insert_node(&self, _node: &Node) -> anyhow::Result<()> {
        // TODO: node persistence will come with the database backend
        Ok(())
    }

    /// At startup, we populate the service's list of nodes, and use this list to call into
    /// each node to do an initial reconciliation of the state of the world with our in-memory
    /// observed state.
    pub(crate) async fn list_nodes(&self) -> anyhow::Result<Vec<Node>> {
        let env = LocalEnv::load_config()?;
        // TODO: node persistence will come with the database backend

        // XXX hack: enable test_backward_compatibility to work by populating our list of
        // nodes from LocalEnv when it is not present in persistent storage. Otherwise at
        // first startup in the compat test, we may have shards but no nodes.
        let mut result = Vec::new();
        tracing::info!(
            "Loaded {} pageserver nodes from LocalEnv",
            env.pageservers.len()
        );
        for ps_conf in env.pageservers {
            let (pg_host, pg_port) =
                parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
            let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
                .expect("Unable to parse listen_http_addr");
            result.push(Node {
                id: ps_conf.id,
                listen_pg_addr: pg_host.to_string(),
                listen_pg_port: pg_port.unwrap_or(5432),
                listen_http_addr: http_host.to_string(),
                listen_http_port: http_port.unwrap_or(80),
                availability: NodeAvailability::Active,
                scheduling: NodeSchedulingPolicy::Active,
            });
        }

        Ok(result)
    }

    /// At startup, we populate our map of tenant shards from persistent storage.
    pub(crate) async fn list_tenant_shards(&self) -> anyhow::Result<Vec<TenantShardPersistence>> {
        let locked = self.state.lock().unwrap();
        Ok(locked.tenants.values().cloned().collect())
    }

    /// Tenants must be persisted before we schedule them for the first time. This enables us
    /// to correctly retain generation monotonicity, and the externally provided placement policy & config.
    pub(crate) async fn insert_tenant_shards(
        &self,
        shards: Vec<TenantShardPersistence>,
    ) -> anyhow::Result<()> {
        let write = {
            let mut locked = self.state.lock().unwrap();
            for shard in shards {
                let tenant_shard_id = TenantShardId {
                    tenant_id: TenantId::from_str(shard.tenant_id.as_str())?,
                    shard_number: ShardNumber(shard.shard_number as u8),
                    shard_count: ShardCount(shard.shard_count as u8),
                };

                locked.tenants.insert(tenant_shard_id, shard);
            }
            locked.save()
        };

        write.commit().await?;

        Ok(())
    }

    /// The reconciler calls this immediately before attaching to a new pageserver, to acquire a
    /// unique, monotonically advancing generation number. We also store the NodeId for which the
    /// generation was issued, so that in [`Self::re_attach`] we can do a bulk UPDATE on the
    /// generations for that node.
    pub(crate) async fn increment_generation(
        &self,
        tenant_shard_id: TenantShardId,
        node_id: Option<NodeId>,
    ) -> anyhow::Result<Generation> {
        let (write, gen) = {
            let mut locked = self.state.lock().unwrap();
            let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
                anyhow::bail!("Tried to increment generation of unknown shard");
            };

            // If we're called with a None pageserver, we only need to update the generation
            // record to disassociate it from this pageserver, not actually increment the number,
            // as the increment is guaranteed to happen the next time this tenant is attached.
            if node_id.is_some() {
                shard.generation += 1;
            }

            shard.generation_pageserver = node_id;
            let gen = Generation::new(shard.generation);
            (locked.save(), gen)
        };

        write.commit().await?;
        Ok(gen)
    }

    pub(crate) async fn re_attach(
        &self,
        node_id: NodeId,
    ) -> anyhow::Result<HashMap<TenantShardId, Generation>> {
        let (write, result) = {
            let mut result = HashMap::new();
            let mut locked = self.state.lock().unwrap();
            for (tenant_shard_id, shard) in locked.tenants.iter_mut() {
                if shard.generation_pageserver == Some(node_id) {
                    shard.generation += 1;
                    result.insert(*tenant_shard_id, Generation::new(shard.generation));
                }
            }

            (locked.save(), result)
        };

        write.commit().await?;
        Ok(result)
    }

    // TODO: when we start shard splitting, we must durably mark the tenant so that
    // on restart, we know that we must go through recovery (list shards that exist
    // and pick up where we left off and/or revert to parent shards).
    #[allow(dead_code)]
    pub(crate) async fn begin_shard_split(&self, _tenant_id: TenantId) -> anyhow::Result<()> {
        todo!();
    }

    // TODO: when we finish shard splitting, we must atomically clean up the old shards
    // and insert the new shards, and clear the splitting marker.
    #[allow(dead_code)]
    pub(crate) async fn complete_shard_split(&self, _tenant_id: TenantId) -> anyhow::Result<()> {
        todo!();
    }
}

/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
#[derive(Serialize, Deserialize, Clone)]
pub(crate) struct TenantShardPersistence {
    #[serde(default)]
    pub(crate) tenant_id: String,
    #[serde(default)]
    pub(crate) shard_number: i32,
    #[serde(default)]
    pub(crate) shard_count: i32,
    #[serde(default)]
    pub(crate) shard_stripe_size: i32,

    // Currently attached pageserver
    #[serde(rename = "pageserver")]
    pub(crate) generation_pageserver: Option<NodeId>,

    // Latest generation number: next time we attach, increment this
    // and use the incremented number when attaching
    pub(crate) generation: u32,

    #[serde(default)]
    pub(crate) placement_policy: String,
    #[serde(default)]
    pub(crate) config: String,
}
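A hedged illustration, not part of the diff: the generation rules above are
worth restating in isolation. increment_generation bumps the counter only when
a concrete node is supplied, and re_attach bulk-bumps every shard whose last
generation was issued to the restarting node. A self-contained toy model with
plain integers standing in for NodeId and Generation:

use std::collections::HashMap;

#[derive(Default)]
struct Shard {
    generation: u32,
    generation_pageserver: Option<u64>,
}

fn increment_generation(shard: &mut Shard, node_id: Option<u64>) -> u32 {
    // With node_id == None we only detach the generation record; the bump is
    // deferred to the next real attachment.
    if node_id.is_some() {
        shard.generation += 1;
    }
    shard.generation_pageserver = node_id;
    shard.generation
}

fn re_attach(shards: &mut HashMap<&'static str, Shard>, node_id: u64) -> HashMap<&'static str, u32> {
    let mut result = HashMap::new();
    for (id, shard) in shards.iter_mut() {
        if shard.generation_pageserver == Some(node_id) {
            shard.generation += 1;
            result.insert(*id, shard.generation);
        }
    }
    result
}

fn main() {
    let mut shards = HashMap::from([("t1-0001", Shard::default())]);
    let s = shards.get_mut("t1-0001").unwrap();
    assert_eq!(increment_generation(s, Some(7)), 1); // first attach: gen 1 on node 7
    assert_eq!(increment_generation(s, None), 1); // detach: generation unchanged
    assert_eq!(increment_generation(s, Some(7)), 2); // re-attach: gen 2
    // Node 7 restarts: every shard it held gets a fresh generation.
    assert_eq!(re_attach(&mut shards, 7), HashMap::from([("t1-0001", 3)]));
}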
@@ -1,495 +0,0 @@
|
|||||||
use crate::persistence::Persistence;
|
|
||||||
use crate::service;
|
|
||||||
use control_plane::attachment_service::NodeAvailability;
|
|
||||||
use pageserver_api::models::{
|
|
||||||
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
|
|
||||||
};
|
|
||||||
use pageserver_api::shard::{ShardIdentity, TenantShardId};
|
|
||||||
use pageserver_client::mgmt_api;
|
|
||||||
use std::collections::HashMap;
|
|
||||||
use std::sync::Arc;
|
|
||||||
use std::time::Duration;
|
|
||||||
use tokio_util::sync::CancellationToken;
|
|
||||||
use utils::generation::Generation;
|
|
||||||
use utils::id::{NodeId, TimelineId};
|
|
||||||
use utils::lsn::Lsn;
|
|
||||||
|
|
||||||
use crate::compute_hook::ComputeHook;
|
|
||||||
use crate::node::Node;
|
|
||||||
use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation};
|
|
||||||
|
|
||||||
/// Object with the lifetime of the background reconcile task that is created
|
|
||||||
/// for tenants which have a difference between their intent and observed states.
|
|
||||||
pub(super) struct Reconciler {
|
|
||||||
/// See [`crate::tenant_state::TenantState`] for the meanings of these fields: they are a snapshot
|
|
||||||
/// of a tenant's state from when we spawned a reconcile task.
|
|
||||||
pub(super) tenant_shard_id: TenantShardId,
|
|
||||||
pub(crate) shard: ShardIdentity,
|
|
||||||
pub(crate) generation: Generation,
|
|
||||||
pub(crate) intent: IntentState,
|
|
||||||
pub(crate) config: TenantConfig,
|
|
||||||
pub(crate) observed: ObservedState,
|
|
||||||
|
|
||||||
pub(crate) service_config: service::Config,
|
|
||||||
|
|
||||||
/// A snapshot of the pageservers as they were when we were asked
|
|
||||||
/// to reconcile.
|
|
||||||
pub(crate) pageservers: Arc<HashMap<NodeId, Node>>,
|
|
||||||
|
|
||||||
/// A hook to notify the running postgres instances when we change the location
|
|
||||||
/// of a tenant
|
|
||||||
pub(crate) compute_hook: Arc<ComputeHook>,
|
|
||||||
|
|
||||||
/// A means to abort background reconciliation: it is essential to
|
|
||||||
/// call this when something changes in the original TenantState that
|
|
||||||
/// will make this reconciliation impossible or unnecessary, for
|
|
||||||
/// example when a pageserver node goes offline, or the PlacementPolicy for
|
|
||||||
/// the tenant is changed.
|
|
||||||
pub(crate) cancel: CancellationToken,
|
|
||||||
|
|
||||||
/// Access to persistent storage for updating generation numbers
|
|
||||||
pub(crate) persistence: Arc<Persistence>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(thiserror::Error, Debug)]
|
|
||||||
pub enum ReconcileError {
|
|
||||||
#[error(transparent)]
|
|
||||||
Other(#[from] anyhow::Error),
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Reconciler {
|
|
||||||
async fn location_config(
|
|
||||||
&mut self,
|
|
||||||
node_id: NodeId,
|
|
||||||
config: LocationConfig,
|
|
||||||
flush_ms: Option<Duration>,
|
|
||||||
) -> anyhow::Result<()> {
|
|
||||||
let node = self
|
|
||||||
.pageservers
|
|
||||||
.get(&node_id)
|
|
||||||
.expect("Pageserver may not be removed while referenced");
|
|
||||||
|
|
||||||
self.observed
|
|
||||||
.locations
|
|
||||||
.insert(node.id, ObservedStateLocation { conf: None });
|
|
||||||
|
|
||||||
tracing::info!("location_config({}) calling: {:?}", node_id, config);
|
|
||||||
let client =
|
|
||||||
mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
|
|
||||||
client
|
|
||||||
.location_config(self.tenant_shard_id, config.clone(), flush_ms)
|
|
||||||
.await?;
|
|
||||||
tracing::info!("location_config({}) complete: {:?}", node_id, config);
|
|
||||||
|
|
||||||
self.observed
|
|
||||||
.locations
|
|
||||||
.insert(node.id, ObservedStateLocation { conf: Some(config) });
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn maybe_live_migrate(&mut self) -> Result<(), ReconcileError> {
|
|
||||||
let destination = if let Some(node_id) = self.intent.attached {
|
|
||||||
match self.observed.locations.get(&node_id) {
|
|
||||||
Some(conf) => {
|
|
||||||
// We will do a live migration only if the intended destination is not
|
|
||||||
// currently in an attached state.
|
|
||||||
match &conf.conf {
|
|
||||||
Some(conf) if conf.mode == LocationConfigMode::Secondary => {
|
|
||||||
// Fall through to do a live migration
|
|
||||||
node_id
|
|
||||||
}
|
|
||||||
None | Some(_) => {
|
|
||||||
// Attached or uncertain: don't do a live migration, proceed
|
|
||||||
// with a general-case reconciliation
|
|
||||||
tracing::info!("maybe_live_migrate: destination is None or attached");
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
None => {
|
|
||||||
// Our destination is not attached: maybe live migrate if some other
|
|
||||||
// node is currently attached. Fall through.
|
|
||||||
node_id
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// No intent to be attached
|
|
||||||
tracing::info!("maybe_live_migrate: no attached intent");
|
|
||||||
return Ok(());
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut origin = None;
|
|
||||||
for (node_id, state) in &self.observed.locations {
|
|
||||||
if let Some(observed_conf) = &state.conf {
|
|
||||||
if observed_conf.mode == LocationConfigMode::AttachedSingle {
|
|
||||||
let node = self
|
|
||||||
.pageservers
|
|
||||||
.get(node_id)
|
|
||||||
.expect("Nodes may not be removed while referenced");
|
|
||||||
// We will only attempt live migration if the origin is not offline: this
|
|
||||||
// avoids trying to do it while reconciling after responding to an HA failover.
|
|
||||||
if !matches!(node.availability, NodeAvailability::Offline) {
|
|
||||||
origin = Some(*node_id);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let Some(origin) = origin else {
|
|
||||||
tracing::info!("maybe_live_migrate: no origin found");
|
|
||||||
return Ok(());
|
|
||||||
};
|
|
||||||
|
|
||||||
// We have an origin and a destination: proceed to do the live migration
|
|
||||||
tracing::info!("Live migrating {}->{}", origin, destination);
|
|
||||||
self.live_migrate(origin, destination).await?;
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_lsns(
|
|
||||||
&self,
|
|
||||||
tenant_shard_id: TenantShardId,
|
|
||||||
node_id: &NodeId,
|
|
||||||
) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
|
|
||||||
let node = self
|
|
||||||
.pageservers
|
|
||||||
.get(node_id)
|
|
||||||
.expect("Pageserver may not be removed while referenced");
|
|
||||||
|
|
||||||
let client =
|
|
||||||
mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
|
|
||||||
|
|
||||||
let timelines = client.timeline_list(&tenant_shard_id).await?;
|
|
||||||
Ok(timelines
|
|
||||||
.into_iter()
|
|
||||||
.map(|t| (t.timeline_id, t.last_record_lsn))
|
|
||||||
.collect())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn secondary_download(&self, tenant_shard_id: TenantShardId, node_id: &NodeId) {
|
|
||||||
let node = self
|
|
||||||
.pageservers
|
|
||||||
.get(node_id)
|
|
||||||
.expect("Pageserver may not be removed while referenced");
|
|
||||||
|
|
||||||
let client =
|
|
||||||
mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
|
|
||||||
|
|
||||||
match client.tenant_secondary_download(tenant_shard_id).await {
|
|
||||||
Ok(()) => {}
|
|
||||||
Err(_) => {
|
|
||||||
tracing::info!(" (skipping, destination wasn't in secondary mode)")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn await_lsn(
|
|
||||||
&self,
|
|
||||||
tenant_shard_id: TenantShardId,
|
|
||||||
pageserver_id: &NodeId,
|
|
||||||
baseline: HashMap<TimelineId, Lsn>,
|
|
||||||
) -> anyhow::Result<()> {
|
|
||||||
loop {
|
|
||||||
let latest = match self.get_lsns(tenant_shard_id, pageserver_id).await {
|
|
||||||
Ok(l) => l,
|
|
||||||
Err(e) => {
|
|
||||||
println!(
|
|
||||||
"🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
|
|
||||||
pageserver_id
|
|
||||||
);
|
|
||||||
std::thread::sleep(Duration::from_millis(500));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut any_behind: bool = false;
|
|
||||||
for (timeline_id, baseline_lsn) in &baseline {
|
|
||||||
match latest.get(timeline_id) {
|
|
||||||
Some(latest_lsn) => {
|
|
||||||
println!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}");
|
|
||||||
if latest_lsn < baseline_lsn {
|
|
||||||
any_behind = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
None => {
|
|
||||||
// Expected timeline isn't yet visible on migration destination.
|
|
||||||
// (IRL we would have to account for timeline deletion, but this
|
|
||||||
// is just test helper)
|
|
||||||
any_behind = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if !any_behind {
|
|
||||||
println!("✅ LSN caught up. Proceeding...");
|
|
||||||
break;
|
|
||||||
} else {
|
|
||||||
std::thread::sleep(Duration::from_millis(500));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn live_migrate(
|
|
||||||
&mut self,
|
|
||||||
origin_ps_id: NodeId,
|
|
||||||
dest_ps_id: NodeId,
|
|
||||||
) -> anyhow::Result<()> {
|
|
||||||
// `maybe_live_migrate` is responsibble for sanity of inputs
|
|
||||||
assert!(origin_ps_id != dest_ps_id);
|
|
||||||
|
|
||||||
fn build_location_config(
|
|
||||||
shard: &ShardIdentity,
|
|
||||||
config: &TenantConfig,
|
|
||||||
mode: LocationConfigMode,
|
|
||||||
generation: Option<Generation>,
|
|
||||||
secondary_conf: Option<LocationConfigSecondary>,
|
|
||||||
) -> LocationConfig {
|
|
||||||
LocationConfig {
|
|
||||||
mode,
|
|
||||||
generation: generation.map(|g| g.into().unwrap()),
|
|
||||||
secondary_conf,
|
|
||||||
tenant_conf: config.clone(),
|
|
||||||
shard_number: shard.number.0,
|
|
||||||
shard_count: shard.count.0,
|
|
||||||
shard_stripe_size: shard.stripe_size.0,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
tracing::info!(
|
|
||||||
"🔁 Switching origin pageserver {} to stale mode",
|
|
||||||
origin_ps_id
|
|
||||||
);
|
|
||||||
|
|
||||||
// FIXME: it is incorrect to use self.generation here, we should use the generation
|
|
||||||
// from the ObservedState of the origin pageserver (it might be older than self.generation)
|
|
||||||
let stale_conf = build_location_config(
|
|
||||||
&self.shard,
|
|
||||||
&self.config,
|
|
||||||
LocationConfigMode::AttachedStale,
|
|
||||||
Some(self.generation),
|
|
||||||
None,
|
|
||||||
);
|
|
||||||
self.location_config(origin_ps_id, stale_conf, Some(Duration::from_secs(10)))
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
let baseline_lsns = Some(self.get_lsns(self.tenant_shard_id, &origin_ps_id).await?);
|
|
||||||
|
|
||||||
// If we are migrating to a destination that has a secondary location, warm it up first
|
|
||||||
if let Some(destination_conf) = self.observed.locations.get(&dest_ps_id) {
|
|
||||||
if let Some(destination_conf) = &destination_conf.conf {
|
|
||||||
if destination_conf.mode == LocationConfigMode::Secondary {
|
|
||||||
tracing::info!(
|
|
||||||
"🔁 Downloading latest layers to destination pageserver {}",
|
|
||||||
dest_ps_id,
|
|
||||||
);
|
|
||||||
self.secondary_download(self.tenant_shard_id, &dest_ps_id)
|
|
||||||
.await;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Increment generation before attaching to new pageserver
|
|
||||||
self.generation = self
|
|
||||||
.persistence
|
|
||||||
.increment_generation(self.tenant_shard_id, Some(dest_ps_id))
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
let dest_conf = build_location_config(
|
|
||||||
&self.shard,
|
|
||||||
&self.config,
|
|
||||||
LocationConfigMode::AttachedMulti,
|
|
||||||
Some(self.generation),
|
|
||||||
None,
|
|
||||||
);
|
|
||||||
|
|
||||||
tracing::info!("🔁 Attaching to pageserver {}", dest_ps_id);
|
|
||||||
self.location_config(dest_ps_id, dest_conf, None).await?;
|
|
||||||
|
|
||||||
if let Some(baseline) = baseline_lsns {
|
|
||||||
tracing::info!("🕑 Waiting for LSN to catch up...");
|
|
||||||
self.await_lsn(self.tenant_shard_id, &dest_ps_id, baseline)
|
|
||||||
.await?;
|
|
||||||
}
|
|
||||||
|
|
||||||
tracing::info!("🔁 Notifying compute to use pageserver {}", dest_ps_id);
|
|
||||||
self.compute_hook
|
|
||||||
.notify(self.tenant_shard_id, dest_ps_id)
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
// Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Single, then
|
|
||||||
// this location will be deleted in the general case reconciliation that runs after this.
|
|
||||||
let origin_secondary_conf = build_location_config(
|
|
||||||
&self.shard,
|
|
||||||
&self.config,
|
|
||||||
LocationConfigMode::Secondary,
|
|
||||||
None,
|
|
||||||
Some(LocationConfigSecondary { warm: true }),
|
|
||||||
);
|
|
||||||
self.location_config(origin_ps_id, origin_secondary_conf.clone(), None)
|
|
||||||
.await?;
|
|
||||||
// TODO: we should also be setting the ObservedState on earlier API calls, in case we fail
|
|
||||||
// partway through. In fact, all location conf API calls should be in a wrapper that sets
|
|
||||||
// the observed state to None, then runs, then sets it to what we wrote.
|
|
||||||
self.observed.locations.insert(
|
|
||||||
origin_ps_id,
|
|
||||||
ObservedStateLocation {
|
|
||||||
conf: Some(origin_secondary_conf),
|
|
||||||
},
|
|
||||||
);
|
|
||||||
|
|
||||||
println!(
|
|
||||||
"🔁 Switching to AttachedSingle mode on pageserver {}",
|
|
||||||
dest_ps_id
|
|
||||||
);
|
|
||||||
let dest_final_conf = build_location_config(
|
|
||||||
&self.shard,
|
|
||||||
&self.config,
|
|
||||||
LocationConfigMode::AttachedSingle,
|
|
||||||
Some(self.generation),
|
|
||||||
None,
|
|
||||||
);
|
|
||||||
self.location_config(dest_ps_id, dest_final_conf.clone(), None)
|
|
||||||
.await?;
|
|
||||||
self.observed.locations.insert(
|
|
||||||
dest_ps_id,
|
|
||||||
ObservedStateLocation {
|
|
||||||
conf: Some(dest_final_conf),
|
|
||||||
},
|
|
||||||
);
|
|
||||||
|
|
||||||
println!("✅ Migration complete");
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Reconciling a tenant makes API calls to pageservers until the observed state
|
|
||||||
/// matches the intended state.
|
|
||||||
///
|
|
||||||
/// First we apply special case handling (e.g. for live migrations), and then a
|
|
||||||
/// general case reconciliation where we walk through the intent by pageserver
|
|
||||||
/// and call out to the pageserver to apply the desired state.
|
|
||||||
pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> {
|
|
||||||
// TODO: if any of self.observed is None, call to remote pageservers
|
|
||||||
// to learn correct state.
|
|
||||||
|
|
||||||
// Special case: live migration
|
|
||||||
self.maybe_live_migrate().await?;
|
|
||||||
|
|
||||||
// If the attached pageserver is not attached, do so now.
|
|
||||||
if let Some(node_id) = self.intent.attached {
|
|
||||||
let mut wanted_conf =
|
|
||||||
attached_location_conf(self.generation, &self.shard, &self.config);
|
|
||||||
match self.observed.locations.get(&node_id) {
|
|
||||||
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
|
|
||||||
// Nothing to do
|
|
||||||
tracing::info!("Observed configuration already correct.")
|
|
||||||
}
|
|
||||||
_ => {
|
|
||||||
// In all cases other than a matching observed configuration, we will
|
|
||||||
// reconcile this location. This includes locations with different configurations, as well
|
|
||||||
// as locations with unknown (None) observed state.
|
|
||||||
self.generation = self
|
|
||||||
.persistence
|
|
||||||
.increment_generation(self.tenant_shard_id, Some(node_id))
|
|
||||||
.await?;
|
|
||||||
wanted_conf.generation = self.generation.into();
|
|
||||||
tracing::info!("Observed configuration requires update.");
|
|
||||||
self.location_config(node_id, wanted_conf, None).await?;
|
|
||||||
if let Err(e) = self
|
|
||||||
.compute_hook
|
|
||||||
.notify(self.tenant_shard_id, node_id)
|
|
||||||
.await
|
|
||||||
{
|
|
||||||
tracing::warn!(
|
|
||||||
"Failed to notify compute of newly attached pageserver {node_id}: {e}"
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}

        // Configure secondary locations: if these were previously attached this
        // implicitly downgrades them from attached to secondary.
        let mut changes = Vec::new();
        for node_id in &self.intent.secondary {
            let wanted_conf = secondary_location_conf(&self.shard, &self.config);
            match self.observed.locations.get(node_id) {
                Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
                    // Nothing to do
                    tracing::info!(%node_id, "Observed configuration already correct.")
                }
                _ => {
                    // In all cases other than a matching observed configuration, we will
                    // reconcile this location.
                    tracing::info!(%node_id, "Observed configuration requires update.");
                    changes.push((*node_id, wanted_conf))
                }
            }
        }

        // Detach any extraneous pageservers that are no longer referenced
        // by our intent.
        let all_pageservers = self.intent.all_pageservers();
        for node_id in self.observed.locations.keys() {
            if all_pageservers.contains(node_id) {
                // We are only detaching pageservers that aren't used at all.
                continue;
            }

            changes.push((
                *node_id,
                LocationConfig {
                    mode: LocationConfigMode::Detached,
                    generation: None,
                    secondary_conf: None,
                    shard_number: self.shard.number.0,
                    shard_count: self.shard.count.0,
                    shard_stripe_size: self.shard.stripe_size.0,
                    tenant_conf: self.config.clone(),
                },
            ));
        }

        for (node_id, conf) in changes {
            self.location_config(node_id, conf, None).await?;
        }

        Ok(())
    }
}

pub(crate) fn attached_location_conf(
    generation: Generation,
    shard: &ShardIdentity,
    config: &TenantConfig,
) -> LocationConfig {
    LocationConfig {
        mode: LocationConfigMode::AttachedSingle,
        generation: generation.into(),
        secondary_conf: None,
        shard_number: shard.number.0,
        shard_count: shard.count.0,
        shard_stripe_size: shard.stripe_size.0,
        tenant_conf: config.clone(),
    }
}

pub(crate) fn secondary_location_conf(
    shard: &ShardIdentity,
    config: &TenantConfig,
) -> LocationConfig {
    LocationConfig {
        mode: LocationConfigMode::Secondary,
        generation: None,
        secondary_conf: Some(LocationConfigSecondary { warm: true }),
        shard_number: shard.number.0,
        shard_count: shard.count.0,
        shard_stripe_size: shard.stripe_size.0,
        tenant_conf: config.clone(),
    }
}
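
// Illustrative note (not part of the original file): the two helpers above differ
// only in mode, generation and secondary_conf. A hypothetical caller building an
// attached config for an unsharded tenant might do:
//
//     let conf = attached_location_conf(Generation::new(1), &shard, &TenantConfig::default());
//     assert_eq!(conf.mode, LocationConfigMode::AttachedSingle);
//     assert!(conf.secondary_conf.is_none());
//
// whereas secondary_location_conf always carries a warm secondary config and no
// generation, since generations are only meaningful for attached locations.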
@@ -1,89 +0,0 @@
use pageserver_api::shard::TenantShardId;
use std::collections::{BTreeMap, HashMap};
use utils::{http::error::ApiError, id::NodeId};

use crate::{node::Node, tenant_state::TenantState};

/// Scenarios in which we cannot find a suitable location for a tenant shard
#[derive(thiserror::Error, Debug)]
pub enum ScheduleError {
    #[error("No pageservers found")]
    NoPageservers,
    #[error("No pageserver found matching constraint")]
    ImpossibleConstraint,
}

impl From<ScheduleError> for ApiError {
    fn from(value: ScheduleError) -> Self {
        ApiError::Conflict(format!("Scheduling error: {}", value))
    }
}

pub(crate) struct Scheduler {
    tenant_counts: HashMap<NodeId, usize>,
}

impl Scheduler {
    pub(crate) fn new(
        tenants: &BTreeMap<TenantShardId, TenantState>,
        nodes: &HashMap<NodeId, Node>,
    ) -> Self {
        let mut tenant_counts = HashMap::new();
        for node_id in nodes.keys() {
            tenant_counts.insert(*node_id, 0);
        }

        for tenant in tenants.values() {
            if let Some(ps) = tenant.intent.attached {
                let entry = tenant_counts.entry(ps).or_insert(0);
                *entry += 1;
            }
        }

        for (node_id, node) in nodes {
            if !node.may_schedule() {
                tenant_counts.remove(node_id);
            }
        }

        Self { tenant_counts }
    }

    pub(crate) fn schedule_shard(
        &mut self,
        hard_exclude: &[NodeId],
    ) -> Result<NodeId, ScheduleError> {
        if self.tenant_counts.is_empty() {
            return Err(ScheduleError::NoPageservers);
        }

        let mut tenant_counts: Vec<(NodeId, usize)> = self
            .tenant_counts
            .iter()
            .filter_map(|(k, v)| {
                if hard_exclude.contains(k) {
                    None
                } else {
                    Some((*k, *v))
                }
            })
            .collect();

        // Sort by tenant count. Nodes with the same tenant count are sorted by ID.
        tenant_counts.sort_by_key(|i| (i.1, i.0));

        if tenant_counts.is_empty() {
            // After applying constraints, no pageservers were left
            return Err(ScheduleError::ImpossibleConstraint);
        }

        for (node_id, count) in &tenant_counts {
            tracing::info!("tenant_counts[{node_id}]={count}");
        }

        let node_id = tenant_counts.first().unwrap().0;
        tracing::info!("scheduler selected node {node_id}");
        *self.tenant_counts.get_mut(&node_id).unwrap() += 1;
        Ok(node_id)
    }
}
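
// Illustrative sketch (not part of the original file): the scheduler is a simple
// least-loaded picker. Assuming hypothetical `tenants` and `nodes` maps, a caller
// places a new shard while excluding nodes the shard already uses:
//
//     let mut scheduler = Scheduler::new(&tenants, &nodes);
//     // Exclude node 1, e.g. because the shard already has a location there.
//     let node_id = scheduler.schedule_shard(&[NodeId(1)])?;
//
// Each successful call increments the in-memory count for the chosen node, so
// repeated calls spread shards across the remaining schedulable nodes.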
File diff suppressed because it is too large
@@ -1,455 +0,0 @@
use std::{collections::HashMap, sync::Arc, time::Duration};

use control_plane::attachment_service::NodeAvailability;
use pageserver_api::{
    models::{LocationConfig, LocationConfigMode, TenantConfig},
    shard::{ShardIdentity, TenantShardId},
};
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;
use utils::{
    generation::Generation,
    id::NodeId,
    seqwait::{SeqWait, SeqWaitError},
};

use crate::{
    compute_hook::ComputeHook,
    node::Node,
    persistence::Persistence,
    reconciler::{attached_location_conf, secondary_location_conf, ReconcileError, Reconciler},
    scheduler::{ScheduleError, Scheduler},
    service, PlacementPolicy, Sequence,
};

pub(crate) struct TenantState {
    pub(crate) tenant_shard_id: TenantShardId,

    pub(crate) shard: ShardIdentity,

    // Runtime only: sequence used to coordinate updates to this object while
    // background reconcilers may be running. A reconciler runs to a particular
    // sequence.
    pub(crate) sequence: Sequence,

    // Latest generation number: next time we attach, increment this
    // and use the incremented number when attaching
    pub(crate) generation: Generation,

    // High level description of how the tenant should be set up. Provided
    // externally.
    pub(crate) policy: PlacementPolicy,

    // Low level description of exactly which pageservers should fulfil
    // which role. Generated by `Self::schedule`.
    pub(crate) intent: IntentState,

    // Low level description of how the tenant is configured on pageservers:
    // if this does not match `Self::intent` then the tenant needs reconciliation
    // with `Self::reconcile`.
    pub(crate) observed: ObservedState,

    // Tenant configuration, passed through opaquely to the pageserver. Identical
    // for all shards in a tenant.
    pub(crate) config: TenantConfig,

    /// If a reconcile task is currently in flight, it may be joined here (it is
    /// only safe to join if either the result has been received or the reconciler's
    /// cancellation token has been fired)
    pub(crate) reconciler: Option<ReconcilerHandle>,

    /// Optionally wait for reconciliation to complete up to a particular
    /// sequence number.
    pub(crate) waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,

    /// Indicates sequence number for which we have encountered an error reconciling. If
    /// this advances ahead of [`Self::waiter`] then a reconciliation error has occurred,
    /// and callers should stop waiting for `waiter` and propagate the error.
    pub(crate) error_waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,

    /// The most recent error from a reconcile on this tenant
    /// TODO: generalize to an array of recent events
    /// TODO: use an ArcSwap instead of a mutex for faster reads?
    pub(crate) last_error: std::sync::Arc<std::sync::Mutex<String>>,
}

#[derive(Default, Clone, Debug)]
pub(crate) struct IntentState {
    pub(crate) attached: Option<NodeId>,
    pub(crate) secondary: Vec<NodeId>,
}

#[derive(Default, Clone)]
pub(crate) struct ObservedState {
    pub(crate) locations: HashMap<NodeId, ObservedStateLocation>,
}

/// Our latest knowledge of how this tenant is configured in the outside world.
///
/// Meaning:
/// * No instance of this type exists for a node: we are certain that we have nothing configured on that
///   node for this shard.
/// * Instance exists with conf==None: we *might* have some state on that node, but we don't know
///   what it is (e.g. we failed partway through configuring it)
/// * Instance exists with conf==Some: this tells us what we last successfully configured on this node,
///   and that configuration will still be present unless something external interfered.
#[derive(Clone)]
pub(crate) struct ObservedStateLocation {
    /// If None, it means we do not know the status of this shard's location on this node, but
    /// we know that we might have some state on this node.
    pub(crate) conf: Option<LocationConfig>,
}

pub(crate) struct ReconcilerWaiter {
    // For observability purposes, remember the ID of the shard we're
    // waiting for.
    pub(crate) tenant_shard_id: TenantShardId,

    seq_wait: std::sync::Arc<SeqWait<Sequence, Sequence>>,
    error_seq_wait: std::sync::Arc<SeqWait<Sequence, Sequence>>,
    error: std::sync::Arc<std::sync::Mutex<String>>,
    seq: Sequence,
}

#[derive(thiserror::Error, Debug)]
pub enum ReconcileWaitError {
    #[error("Timeout waiting for shard {0}")]
    Timeout(TenantShardId),
    #[error("shutting down")]
    Shutdown,
    #[error("Reconcile error on shard {0}: {1}")]
    Failed(TenantShardId, String),
}

impl ReconcilerWaiter {
    pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> {
        tokio::select! {
            result = self.seq_wait.wait_for_timeout(self.seq, timeout) => {
                result.map_err(|e| match e {
                    SeqWaitError::Timeout => ReconcileWaitError::Timeout(self.tenant_shard_id),
                    SeqWaitError::Shutdown => ReconcileWaitError::Shutdown
                })?;
            },
            result = self.error_seq_wait.wait_for(self.seq) => {
                result.map_err(|e| match e {
                    SeqWaitError::Shutdown => ReconcileWaitError::Shutdown,
                    SeqWaitError::Timeout => unreachable!()
                })?;

                return Err(ReconcileWaitError::Failed(self.tenant_shard_id, self.error.lock().unwrap().clone()))
            }
        }

        Ok(())
    }
}
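
// Illustrative sketch (not part of the original file): a caller that spawned a
// reconcile typically races completion against an error on the same sequence.
// Assuming a hypothetical `waiter: ReconcilerWaiter` returned by maybe_reconcile():
//
//     match waiter.wait_timeout(Duration::from_secs(30)).await {
//         Ok(()) => tracing::info!("reconcile of {} complete", waiter.tenant_shard_id),
//         Err(ReconcileWaitError::Timeout(id)) => tracing::warn!("reconcile of {id} timed out"),
//         Err(e) => tracing::warn!("reconcile failed: {e}"),
//     }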

/// Having spawned a reconciler task, the tenant shard's state will carry enough
/// information to optionally cancel & await it later.
pub(crate) struct ReconcilerHandle {
    sequence: Sequence,
    handle: JoinHandle<()>,
    cancel: CancellationToken,
}

/// When a reconcile task completes, it sends this result object
/// to be applied to the primary TenantState.
pub(crate) struct ReconcileResult {
    pub(crate) sequence: Sequence,
    /// On errors, `observed` should be treated as an incomplete description
    /// of state (i.e. any nodes present in the result should override nodes
    /// present in the parent tenant state, but any unmentioned nodes should
    /// not be removed from parent tenant state)
    pub(crate) result: Result<(), ReconcileError>,

    pub(crate) tenant_shard_id: TenantShardId,
    pub(crate) generation: Generation,
    pub(crate) observed: ObservedState,
}

impl IntentState {
    pub(crate) fn new() -> Self {
        Self {
            attached: None,
            secondary: vec![],
        }
    }

    pub(crate) fn all_pageservers(&self) -> Vec<NodeId> {
        let mut result = Vec::new();
        if let Some(p) = self.attached {
            result.push(p)
        }

        result.extend(self.secondary.iter().copied());

        result
    }

    /// When a node goes offline, we update intents to avoid using it
    /// as the attached pageserver.
    ///
    /// Returns true if a change was made
    pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
        if self.attached == Some(node_id) {
            self.attached = None;
            self.secondary.push(node_id);
            true
        } else {
            false
        }
    }
}

impl ObservedState {
    pub(crate) fn new() -> Self {
        Self {
            locations: HashMap::new(),
        }
    }
}

impl TenantState {
    pub(crate) fn new(
        tenant_shard_id: TenantShardId,
        shard: ShardIdentity,
        policy: PlacementPolicy,
    ) -> Self {
        Self {
            tenant_shard_id,
            policy,
            intent: IntentState::default(),
            generation: Generation::new(0),
            shard,
            observed: ObservedState::default(),
            config: TenantConfig::default(),
            reconciler: None,
            sequence: Sequence(1),
            waiter: Arc::new(SeqWait::new(Sequence(0))),
            error_waiter: Arc::new(SeqWait::new(Sequence(0))),
            last_error: Arc::default(),
        }
    }

    /// For use on startup when learning state from pageservers: generate my [`IntentState`] from my
    /// [`ObservedState`], even if it violates my [`PlacementPolicy`]. Call [`Self::schedule`] next,
    /// to get an intent state that complies with placement policy. The overall goal is to do scheduling
    /// in a way that makes use of any configured locations that already exist in the outside world.
    pub(crate) fn intent_from_observed(&mut self) {
        // Choose an attached location by filtering observed locations, and then sorting to get the highest
        // generation
        let mut attached_locs = self
            .observed
            .locations
            .iter()
            .filter_map(|(node_id, l)| {
                if let Some(conf) = &l.conf {
                    if conf.mode == LocationConfigMode::AttachedMulti
                        || conf.mode == LocationConfigMode::AttachedSingle
                        || conf.mode == LocationConfigMode::AttachedStale
                    {
                        Some((node_id, conf.generation))
                    } else {
                        None
                    }
                } else {
                    None
                }
            })
            .collect::<Vec<_>>();

        attached_locs.sort_by_key(|i| i.1);
        if let Some((node_id, _gen)) = attached_locs.into_iter().last() {
            self.intent.attached = Some(*node_id);
        }

        // All remaining observed locations generate secondary intents. This includes None
        // observations, as these may well have some local content on disk that is usable (this
        // is an edge case that might occur if we restarted during a migration or other change)
        self.observed.locations.keys().for_each(|node_id| {
            if Some(*node_id) != self.intent.attached {
                self.intent.secondary.push(*node_id);
            }
        });
    }
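
    // Illustrative sketch (not part of the original file): on service startup the
    // expected call order is observed-state first, then scheduling. Assuming a
    // hypothetical `scheduler`:
    //
    //     tenant.intent_from_observed();     // adopt whatever already exists
    //     tenant.schedule(&mut scheduler)?;  // then fill gaps per placement policy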

    pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> {
        // TODO: before scheduling new nodes, check if any existing content in
        // self.intent refers to pageservers that are offline, and pick other
        // pageservers if so.

        // Build the set of pageservers already in use by this tenant, to avoid scheduling
        // more work on the same pageservers we're already using.
        let mut used_pageservers = self.intent.all_pageservers();
        let mut modified = false;

        use PlacementPolicy::*;
        match self.policy {
            Single => {
                // Should have exactly one attached, and zero secondaries
                if self.intent.attached.is_none() {
                    let node_id = scheduler.schedule_shard(&used_pageservers)?;
                    self.intent.attached = Some(node_id);
                    used_pageservers.push(node_id);
                    modified = true;
                }
                if !self.intent.secondary.is_empty() {
                    self.intent.secondary.clear();
                    modified = true;
                }
            }
            Double(secondary_count) => {
                // Should have exactly one attached, and N secondaries
                if self.intent.attached.is_none() {
                    let node_id = scheduler.schedule_shard(&used_pageservers)?;
                    self.intent.attached = Some(node_id);
                    used_pageservers.push(node_id);
                    modified = true;
                }

                while self.intent.secondary.len() < secondary_count {
                    let node_id = scheduler.schedule_shard(&used_pageservers)?;
                    self.intent.secondary.push(node_id);
                    used_pageservers.push(node_id);
                    modified = true;
                }
            }
        }

        if modified {
            self.sequence.0 += 1;
        }

        Ok(())
    }
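
    // Illustrative note (not part of the original file): with PlacementPolicy::Double(1)
    // and an empty intent, schedule() makes two schedule_shard() calls, e.g.
    //
    //     intent.attached  == Some(NodeId(a))   // least-loaded node
    //     intent.secondary == vec![NodeId(b)]   // next node, excluding `a`
    //
    // and bumps the sequence so a new reconcile can be spawned for this change.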

    fn dirty(&self) -> bool {
        if let Some(node_id) = self.intent.attached {
            let wanted_conf = attached_location_conf(self.generation, &self.shard, &self.config);
            match self.observed.locations.get(&node_id) {
                Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
                Some(_) | None => {
                    return true;
                }
            }
        }

        for node_id in &self.intent.secondary {
            let wanted_conf = secondary_location_conf(&self.shard, &self.config);
            match self.observed.locations.get(node_id) {
                Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
                Some(_) | None => {
                    return true;
                }
            }
        }

        false
    }

    pub(crate) fn maybe_reconcile(
        &mut self,
        result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
        pageservers: &Arc<HashMap<NodeId, Node>>,
        compute_hook: &Arc<ComputeHook>,
        service_config: &service::Config,
        persistence: &Arc<Persistence>,
    ) -> Option<ReconcilerWaiter> {
        // If there are any ambiguous observed states, and the nodes they refer to are available,
        // we should reconcile to clean them up.
        let mut dirty_observed = false;
        for (node_id, observed_loc) in &self.observed.locations {
            let node = pageservers
                .get(node_id)
                .expect("Nodes may not be removed while referenced");
            if observed_loc.conf.is_none()
                && !matches!(node.availability, NodeAvailability::Offline)
            {
                dirty_observed = true;
                break;
            }
        }

        if !self.dirty() && !dirty_observed {
            tracing::info!("Not dirty, no reconciliation needed.");
            return None;
        }

        // Reconcile already in flight for the current sequence?
        if let Some(handle) = &self.reconciler {
            if handle.sequence == self.sequence {
                return Some(ReconcilerWaiter {
                    tenant_shard_id: self.tenant_shard_id,
                    seq_wait: self.waiter.clone(),
                    error_seq_wait: self.error_waiter.clone(),
                    error: self.last_error.clone(),
                    seq: self.sequence,
                });
            }
        }

        // Reconcile in flight for a stale sequence? Our sequence's task will wait for it before
        // doing our sequence's work.
        let old_handle = self.reconciler.take();

        let cancel = CancellationToken::new();
        let mut reconciler = Reconciler {
            tenant_shard_id: self.tenant_shard_id,
            shard: self.shard,
            generation: self.generation,
            intent: self.intent.clone(),
            config: self.config.clone(),
            observed: self.observed.clone(),
            pageservers: pageservers.clone(),
            compute_hook: compute_hook.clone(),
            service_config: service_config.clone(),
            cancel: cancel.clone(),
            persistence: persistence.clone(),
        };

        let reconcile_seq = self.sequence;

        tracing::info!("Spawning Reconciler for sequence {}", self.sequence);
        let join_handle = tokio::task::spawn(async move {
            // Wait for any previous reconcile task to complete before we start
            if let Some(old_handle) = old_handle {
                old_handle.cancel.cancel();
                if let Err(e) = old_handle.handle.await {
                    // We can't do much with this other than log it: the task is done, so
                    // we may proceed with our work.
                    tracing::error!("Unexpected join error waiting for reconcile task: {e}");
                }
            }

            // Early check for cancellation before doing any work
            // TODO: wrap all remote API operations in cancellation check
            // as well.
            if reconciler.cancel.is_cancelled() {
                return;
            }

            let result = reconciler.reconcile().await;
            result_tx
                .send(ReconcileResult {
                    sequence: reconcile_seq,
                    result,
                    tenant_shard_id: reconciler.tenant_shard_id,
                    generation: reconciler.generation,
                    observed: reconciler.observed,
                })
                .ok();
        });

        self.reconciler = Some(ReconcilerHandle {
            sequence: self.sequence,
            handle: join_handle,
            cancel,
        });

        Some(ReconcilerWaiter {
            tenant_shard_id: self.tenant_shard_id,
            seq_wait: self.waiter.clone(),
            error_seq_wait: self.error_waiter.clone(),
            error: self.last_error.clone(),
            seq: self.sequence,
        })
    }
}
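
// Illustrative sketch (not part of the original file): maybe_reconcile() is the
// fire-and-track entry point; its returned waiter pairs with wait_timeout() above.
// Assuming hypothetical service-level handles (result_tx, pageservers, compute_hook,
// config, persistence):
//
//     if let Some(waiter) = tenant.maybe_reconcile(
//         result_tx.clone(), &pageservers, &compute_hook, &config, &persistence)
//     {
//         waiter.wait_timeout(Duration::from_secs(30)).await?;
//     }
//
// Results also arrive asynchronously on `result_tx`, where the service applies the
// returned ObservedState back onto the authoritative TenantState.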
@@ -1,27 +1,14 @@
 use crate::{background_process, local_env::LocalEnv};
+use anyhow::anyhow;
 use camino::Utf8PathBuf;
-use hyper::Method;
-use pageserver_api::{
-    models::{ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo},
-    shard::TenantShardId,
-};
-use pageserver_client::mgmt_api::ResponseErrorMessageExt;
-use postgres_backend::AuthType;
-use postgres_connection::parse_host_port;
-use serde::{de::DeserializeOwned, Deserialize, Serialize};
-use std::{path::PathBuf, process::Child, str::FromStr};
-use tracing::instrument;
-use utils::{
-    auth::{Claims, Scope},
-    id::{NodeId, TenantId},
-};
+use serde::{Deserialize, Serialize};
+use std::{path::PathBuf, process::Child};
+use utils::id::{NodeId, TenantId};

 pub struct AttachmentService {
     env: LocalEnv,
     listen: String,
     path: PathBuf,
-    jwt_token: Option<String>,
-    public_key_path: Option<Utf8PathBuf>,
     client: reqwest::Client,
 }

@@ -29,7 +16,7 @@ const COMMAND: &str = "attachment_service";

 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
-    pub tenant_shard_id: TenantShardId,
+    pub tenant_id: TenantId,
     pub node_id: Option<NodeId>,
 }

@@ -40,7 +27,7 @@ pub struct AttachHookResponse {

 #[derive(Serialize, Deserialize)]
 pub struct InspectRequest {
-    pub tenant_shard_id: TenantShardId,
+    pub tenant_id: TenantId,
 }

 #[derive(Serialize, Deserialize)]
@@ -48,125 +35,6 @@ pub struct InspectResponse {
     pub attachment: Option<(u32, NodeId)>,
 }

-#[derive(Serialize, Deserialize)]
-pub struct TenantCreateResponseShard {
-    pub node_id: NodeId,
-    pub generation: u32,
-}
-
-#[derive(Serialize, Deserialize)]
-pub struct TenantCreateResponse {
-    pub shards: Vec<TenantCreateResponseShard>,
-}
-
-#[derive(Serialize, Deserialize)]
-pub struct NodeRegisterRequest {
-    pub node_id: NodeId,
-
-    pub listen_pg_addr: String,
-    pub listen_pg_port: u16,
-
-    pub listen_http_addr: String,
-    pub listen_http_port: u16,
-}
-
-#[derive(Serialize, Deserialize)]
-pub struct NodeConfigureRequest {
-    pub node_id: NodeId,
-
-    pub availability: Option<NodeAvailability>,
-    pub scheduling: Option<NodeSchedulingPolicy>,
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-pub struct TenantLocateResponseShard {
-    pub shard_id: TenantShardId,
-    pub node_id: NodeId,
-
-    pub listen_pg_addr: String,
-    pub listen_pg_port: u16,
-
-    pub listen_http_addr: String,
-    pub listen_http_port: u16,
-}
-
-#[derive(Serialize, Deserialize)]
-pub struct TenantLocateResponse {
-    pub shards: Vec<TenantLocateResponseShard>,
-    pub shard_params: ShardParameters,
-}
-
-/// Explicitly migrating a particular shard is a low level operation
-/// TODO: higher level "Reschedule tenant" operation where the request
-/// specifies some constraints, e.g. asking it to get off particular node(s)
-#[derive(Serialize, Deserialize, Debug)]
-pub struct TenantShardMigrateRequest {
-    pub tenant_shard_id: TenantShardId,
-    pub node_id: NodeId,
-}
-
-#[derive(Serialize, Deserialize, Clone, Copy)]
-pub enum NodeAvailability {
-    // Normal, happy state
-    Active,
-    // Offline: Tenants shouldn't try to attach here, but they may assume that their
-    // secondary locations on this node still exist. Newly added nodes are in this
-    // state until we successfully contact them.
-    Offline,
-}
-
-impl FromStr for NodeAvailability {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        match s {
-            "active" => Ok(Self::Active),
-            "offline" => Ok(Self::Offline),
-            _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
-        }
-    }
-}
-
-/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
-/// type needs to be defined with diesel traits in there.
-#[derive(Serialize, Deserialize, Clone, Copy)]
-pub enum NodeSchedulingPolicy {
-    Active,
-    Filling,
-    Pause,
-    Draining,
-}
-
-impl FromStr for NodeSchedulingPolicy {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        match s {
-            "active" => Ok(Self::Active),
-            "filling" => Ok(Self::Filling),
-            "pause" => Ok(Self::Pause),
-            "draining" => Ok(Self::Draining),
-            _ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
-        }
-    }
-}
-
-impl From<NodeSchedulingPolicy> for String {
-    fn from(value: NodeSchedulingPolicy) -> String {
-        use NodeSchedulingPolicy::*;
-        match value {
-            Active => "active",
-            Filling => "filling",
-            Pause => "pause",
-            Draining => "draining",
-        }
-        .to_string()
-    }
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-pub struct TenantShardMigrateResponse {}
-
 impl AttachmentService {
     pub fn from_env(env: &LocalEnv) -> Self {
         let path = env.base_data_dir.join("attachments.json");
@@ -181,34 +49,10 @@ impl AttachmentService {
             listen_url.port().unwrap()
         );

-        // Assume all pageservers have symmetric auth configuration: this service
-        // expects to use one JWT token to talk to all of them.
-        let ps_conf = env
-            .pageservers
-            .first()
-            .expect("Config is validated to contain at least one pageserver");
-        let (jwt_token, public_key_path) = match ps_conf.http_auth_type {
-            AuthType::Trust => (None, None),
-            AuthType::NeonJWT => {
-                let jwt_token = env
-                    .generate_auth_token(&Claims::new(None, Scope::PageServerApi))
-                    .unwrap();
-
-                // If pageserver auth is enabled, this implicitly enables auth for this service,
-                // using the same credentials.
-                let public_key_path =
-                    camino::Utf8PathBuf::try_from(env.base_data_dir.join("auth_public_key.pem"))
-                        .unwrap();
-                (Some(jwt_token), Some(public_key_path))
-            }
-        };
-
         Self {
             env: env.clone(),
             path,
             listen,
-            jwt_token,
-            public_key_path,
             client: reqwest::ClientBuilder::new()
                 .build()
                 .expect("Failed to construct http client"),
@@ -223,199 +67,72 @@ impl AttachmentService {
     pub async fn start(&self) -> anyhow::Result<Child> {
         let path_str = self.path.to_string_lossy();

-        let mut args = vec!["-l", &self.listen, "-p", &path_str]
-            .into_iter()
-            .map(|s| s.to_string())
-            .collect::<Vec<_>>();
-        if let Some(jwt_token) = &self.jwt_token {
-            args.push(format!("--jwt-token={jwt_token}"));
-        }
-
-        if let Some(public_key_path) = &self.public_key_path {
-            args.push(format!("--public-key={public_key_path}"));
-        }
-
-        let result = background_process::start_process(
+        background_process::start_process(
             COMMAND,
             &self.env.base_data_dir,
             &self.env.attachment_service_bin(),
-            args,
-            [(
-                "NEON_REPO_DIR".to_string(),
-                self.env.base_data_dir.to_string_lossy().to_string(),
-            )],
+            ["-l", &self.listen, "-p", &path_str],
+            [],
             background_process::InitialPidFile::Create(self.pid_file()),
-            || async {
-                // TODO: a real status check
-                match self.status().await {
-                    Ok(_) => Ok(true),
-                    Err(_) => Ok(false),
-                }
-            },
+            || async move { anyhow::Ok(true) },
         )
-        .await;
-
-        for ps_conf in &self.env.pageservers {
-            let (pg_host, pg_port) =
-                parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
-            let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
-                .expect("Unable to parse listen_http_addr");
-            self.node_register(NodeRegisterRequest {
-                node_id: ps_conf.id,
-                listen_pg_addr: pg_host.to_string(),
-                listen_pg_port: pg_port.unwrap_or(5432),
-                listen_http_addr: http_host.to_string(),
-                listen_http_port: http_port.unwrap_or(80),
-            })
-            .await?;
-        }
-
-        result
+        .await
     }

     pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
         background_process::stop_process(immediate, COMMAND, &self.pid_file())
     }
-    /// Simple HTTP request wrapper for calling into attachment service
-    async fn dispatch<RQ, RS>(
+    /// Call into the attach_hook API, for use before handing out attachments to pageservers
+    pub async fn attach_hook(
         &self,
-        method: hyper::Method,
-        path: String,
-        body: Option<RQ>,
-    ) -> anyhow::Result<RS>
-    where
-        RQ: Serialize + Sized,
-        RS: DeserializeOwned + Sized,
-    {
+        tenant_id: TenantId,
+        pageserver_id: NodeId,
+    ) -> anyhow::Result<Option<u32>> {
+        use hyper::StatusCode;
         let url = self
             .env
             .control_plane_api
             .clone()
             .unwrap()
-            .join(&path)
+            .join("attach-hook")
             .unwrap();

-        let mut builder = self.client.request(method, url);
-        if let Some(body) = body {
-            builder = builder.json(&body)
-        }
-        if let Some(jwt_token) = &self.jwt_token {
-            builder = builder.header(
-                reqwest::header::AUTHORIZATION,
-                format!("Bearer {jwt_token}"),
-            );
-        }
-
-        let response = builder.send().await?;
-        let response = response.error_from_body().await?;
-
-        Ok(response
-            .json()
-            .await
-            .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?)
-    }
-
-    /// Call into the attach_hook API, for use before handing out attachments to pageservers
-    #[instrument(skip(self))]
-    pub async fn attach_hook(
-        &self,
-        tenant_shard_id: TenantShardId,
-        pageserver_id: NodeId,
-    ) -> anyhow::Result<Option<u32>> {
         let request = AttachHookRequest {
-            tenant_shard_id,
+            tenant_id,
             node_id: Some(pageserver_id),
         };

-        let response = self
-            .dispatch::<_, AttachHookResponse>(
-                Method::POST,
-                "attach-hook".to_string(),
-                Some(request),
-            )
-            .await?;
+        let response = self.client.post(url).json(&request).send().await?;
+        if response.status() != StatusCode::OK {
+            return Err(anyhow!("Unexpected status {}", response.status()));
+        }

+        let response = response.json::<AttachHookResponse>().await?;
         Ok(response.gen)
     }

-    #[instrument(skip(self))]
-    pub async fn inspect(
-        &self,
-        tenant_shard_id: TenantShardId,
-    ) -> anyhow::Result<Option<(u32, NodeId)>> {
-        let request = InspectRequest { tenant_shard_id };
-
-        let response = self
-            .dispatch::<_, InspectResponse>(Method::POST, "inspect".to_string(), Some(request))
-            .await?;
-
+    pub async fn inspect(&self, tenant_id: TenantId) -> anyhow::Result<Option<(u32, NodeId)>> {
+        use hyper::StatusCode;
+
+        let url = self
+            .env
+            .control_plane_api
+            .clone()
+            .unwrap()
+            .join("inspect")
+            .unwrap();
+
+        let request = InspectRequest { tenant_id };
+
+        let response = self.client.post(url).json(&request).send().await?;
+        if response.status() != StatusCode::OK {
+            return Err(anyhow!("Unexpected status {}", response.status()));
+        }
+
+        let response = response.json::<InspectResponse>().await?;
         Ok(response.attachment)
     }

-    #[instrument(skip(self))]
-    pub async fn tenant_create(
-        &self,
-        req: TenantCreateRequest,
-    ) -> anyhow::Result<TenantCreateResponse> {
-        self.dispatch(Method::POST, "tenant".to_string(), Some(req))
-            .await
-    }
-
-    #[instrument(skip(self))]
-    pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
-        self.dispatch::<(), _>(Method::GET, format!("tenant/{tenant_id}/locate"), None)
-            .await
-    }
-
-    #[instrument(skip(self))]
-    pub async fn tenant_migrate(
-        &self,
-        tenant_shard_id: TenantShardId,
-        node_id: NodeId,
-    ) -> anyhow::Result<TenantShardMigrateResponse> {
-        self.dispatch(
-            Method::PUT,
-            format!("tenant/{tenant_shard_id}/migrate"),
-            Some(TenantShardMigrateRequest {
-                tenant_shard_id,
-                node_id,
-            }),
-        )
-        .await
-    }
-
-    #[instrument(skip_all, fields(node_id=%req.node_id))]
-    pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
-        self.dispatch::<_, ()>(Method::POST, "node".to_string(), Some(req))
-            .await
-    }
-
-    #[instrument(skip_all, fields(node_id=%req.node_id))]
-    pub async fn node_configure(&self, req: NodeConfigureRequest) -> anyhow::Result<()> {
-        self.dispatch::<_, ()>(
-            Method::PUT,
-            format!("node/{}/config", req.node_id),
-            Some(req),
-        )
-        .await
-    }
-
-    #[instrument(skip(self))]
-    pub async fn status(&self) -> anyhow::Result<()> {
-        self.dispatch::<(), ()>(Method::GET, "status".to_string(), None)
-            .await
-    }
-
-    #[instrument(skip_all, fields(%tenant_id, timeline_id=%req.new_timeline_id))]
-    pub async fn tenant_timeline_create(
-        &self,
-        tenant_id: TenantId,
-        req: TimelineCreateRequest,
-    ) -> anyhow::Result<TimelineInfo> {
-        self.dispatch(
-            Method::POST,
-            format!("tenant/{tenant_id}/timeline"),
-            Some(req),
-        )
-        .await
-    }
 }
337 control_plane/src/bin/attachment_service.rs Normal file
@@ -0,0 +1,337 @@
/// The attachment service mimics the aspects of the control plane API
/// that are required for a pageserver to operate.
///
/// This enables running & testing pageservers without a full-blown
/// deployment of the Neon cloud platform.
///
use anyhow::anyhow;
use clap::Parser;
use hex::FromHex;
use hyper::StatusCode;
use hyper::{Body, Request, Response};
use pageserver_api::shard::TenantShardId;
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
use std::{collections::HashMap, sync::Arc};
use utils::http::endpoint::request_span;
use utils::logging::{self, LogFormat};
use utils::signals::{ShutdownSignals, Signal};

use utils::{
    http::{
        endpoint::{self},
        error::ApiError,
        json::{json_request, json_response},
        RequestExt, RouterBuilder,
    },
    id::{NodeId, TenantId},
    tcp_listener,
};

use pageserver_api::control_api::{
    ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse,
    ValidateResponseTenant,
};

use control_plane::attachment_service::{
    AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
};

#[derive(Parser)]
#[command(author, version, about, long_about = None)]
#[command(arg_required_else_help(true))]
struct Cli {
    /// Host and port to listen on, like `127.0.0.1:1234`
    #[arg(short, long)]
    listen: std::net::SocketAddr,

    /// Path to the .json file to store state (will be created if it doesn't exist)
    #[arg(short, long)]
    path: PathBuf,
}

// The persistent state of each Tenant
#[derive(Serialize, Deserialize, Clone)]
struct TenantState {
    // Currently attached pageserver
    pageserver: Option<NodeId>,

    // Latest generation number: next time we attach, increment this
    // and use the incremented number when attaching
    generation: u32,
}

fn to_hex_map<S, V>(input: &HashMap<TenantId, V>, serializer: S) -> Result<S::Ok, S::Error>
where
    S: serde::Serializer,
    V: Clone + Serialize,
{
    let transformed = input.iter().map(|(k, v)| (hex::encode(k), v.clone()));

    transformed
        .collect::<HashMap<String, V>>()
        .serialize(serializer)
}

fn from_hex_map<'de, D, V>(deserializer: D) -> Result<HashMap<TenantId, V>, D::Error>
where
    D: serde::de::Deserializer<'de>,
    V: Deserialize<'de>,
{
    let hex_map = HashMap::<String, V>::deserialize(deserializer)?;
    hex_map
        .into_iter()
        .map(|(k, v)| {
            TenantId::from_hex(k)
                .map(|k| (k, v))
                .map_err(serde::de::Error::custom)
        })
        .collect()
}
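
// Illustrative note (not part of the original file): these two helpers let serde
// key the map by the tenant's hex representation. A sketch of the resulting JSON,
// with a hypothetical tenant id:
//
//     {"tenants": {"3aa8fcc61f6d357410b7de754b1d9001": {"pageserver": 1, "generation": 2}}}
//
// from_hex_map reverses the transformation on load, rejecting keys that are not
// valid hex tenant ids.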

// Top level state available to all HTTP handlers
#[derive(Serialize, Deserialize)]
struct PersistentState {
    #[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
    tenants: HashMap<TenantId, TenantState>,

    #[serde(skip)]
    path: PathBuf,
}

impl PersistentState {
    async fn save(&self) -> anyhow::Result<()> {
        let bytes = serde_json::to_vec(self)?;
        tokio::fs::write(&self.path, &bytes).await?;

        Ok(())
    }

    async fn load(path: &Path) -> anyhow::Result<Self> {
        let bytes = tokio::fs::read(path).await?;
        let mut decoded = serde_json::from_slice::<Self>(&bytes)?;
        decoded.path = path.to_owned();
        Ok(decoded)
    }

    async fn load_or_new(path: &Path) -> Self {
        match Self::load(path).await {
            Ok(s) => {
                tracing::info!("Loaded state file at {}", path.display());
                s
            }
            Err(e)
                if e.downcast_ref::<std::io::Error>()
                    .map(|e| e.kind() == std::io::ErrorKind::NotFound)
                    .unwrap_or(false) =>
            {
                tracing::info!("Will create state file at {}", path.display());
                Self {
                    tenants: HashMap::new(),
                    path: path.to_owned(),
                }
            }
            Err(e) => {
                panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path.display())
            }
        }
    }
}
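
// Illustrative note (not part of the original file): persistence here is a whole
// JSON file rewritten on every mutation, which is fine for a test helper. A sketch
// of typical use at startup and after a change:
//
//     let mut state = PersistentState::load_or_new(Path::new(".neon/attachments.json")).await;
//     state.tenants.entry(tenant_id).or_insert(TenantState { pageserver: None, generation: 0 });
//     state.save().await?;
//
// (".neon/attachments.json" and `tenant_id` are hypothetical values.)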

/// State available to HTTP request handlers
#[derive(Clone)]
struct State {
    inner: Arc<tokio::sync::RwLock<PersistentState>>,
}

impl State {
    fn new(persistent_state: PersistentState) -> State {
        Self {
            inner: Arc::new(tokio::sync::RwLock::new(persistent_state)),
        }
    }
}

#[inline(always)]
fn get_state(request: &Request<Body>) -> &State {
    request
        .data::<Arc<State>>()
        .expect("unknown state type")
        .as_ref()
}

/// Pageserver calls into this on startup, to learn which tenants it should attach
async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;

    let state = get_state(&req).inner.clone();
    let mut locked = state.write().await;

    let mut response = ReAttachResponse {
        tenants: Vec::new(),
    };
    for (t, state) in &mut locked.tenants {
        if state.pageserver == Some(reattach_req.node_id) {
            state.generation += 1;
            response.tenants.push(ReAttachResponseTenant {
                // TODO(sharding): make this shard-aware
                id: TenantShardId::unsharded(*t),
                gen: state.generation,
            });
        }
    }

    locked.save().await.map_err(ApiError::InternalServerError)?;

    json_response(StatusCode::OK, response)
}
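
// Illustrative note (not part of the original file): re-attach bumps the
// generation of every tenant previously attached to the calling node, so a
// restarted pageserver always comes back with fresher generations than any
// stale instance of itself. A sketch of the exchange, with hypothetical ids
// and an assumed JSON wire shape:
//
//     request:  {"node_id": 1}
//     response: {"tenants": [{"id": "3aa8...9001", "gen": 5}]}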

/// Pageserver calls into this before doing deletions, to confirm that it still
/// holds the latest generation for the tenants with deletions enqueued
async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let validate_req = json_request::<ValidateRequest>(&mut req).await?;

    let locked = get_state(&req).inner.read().await;

    let mut response = ValidateResponse {
        tenants: Vec::new(),
    };

    for req_tenant in validate_req.tenants {
        // TODO(sharding): make this shard-aware
        if let Some(tenant_state) = locked.tenants.get(&req_tenant.id.tenant_id) {
            let valid = tenant_state.generation == req_tenant.gen;
            tracing::info!(
                "handle_validate: {}(gen {}): valid={valid} (latest {})",
                req_tenant.id,
                req_tenant.gen,
                tenant_state.generation
            );
            response.tenants.push(ValidateResponseTenant {
                id: req_tenant.id,
                valid,
            });
        }
    }

    json_response(StatusCode::OK, response)
}

/// Call into this before attaching a tenant to a pageserver, to acquire a generation number
/// (in the real control plane this is unnecessary, because the same program is managing
/// generation numbers and doing attachments).
async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let attach_req = json_request::<AttachHookRequest>(&mut req).await?;

    let state = get_state(&req).inner.clone();
    let mut locked = state.write().await;

    let tenant_state = locked
        .tenants
        .entry(attach_req.tenant_id)
        .or_insert_with(|| TenantState {
            pageserver: attach_req.node_id,
            generation: 0,
        });

    if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
        tenant_state.generation += 1;
        tracing::info!(
            tenant_id = %attach_req.tenant_id,
            ps_id = %attaching_pageserver,
            generation = %tenant_state.generation,
            "issuing",
        );
    } else if let Some(ps_id) = tenant_state.pageserver {
        tracing::info!(
            tenant_id = %attach_req.tenant_id,
            %ps_id,
            generation = %tenant_state.generation,
            "dropping",
        );
    } else {
        tracing::info!(
            tenant_id = %attach_req.tenant_id,
            "no-op: tenant already has no pageserver");
    }
    tenant_state.pageserver = attach_req.node_id;
    let generation = tenant_state.generation;

    tracing::info!(
        "handle_attach_hook: tenant {} set generation {}, pageserver {}",
        attach_req.tenant_id,
        tenant_state.generation,
        attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
    );

    locked.save().await.map_err(ApiError::InternalServerError)?;

    json_response(
        StatusCode::OK,
        AttachHookResponse {
            gen: attach_req.node_id.map(|_| generation),
        },
    )
}
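
// Illustrative sketch (not part of the original file): exercising the hook by
// hand; the tenant id is hypothetical and the port is whatever `-l` was given:
//
//     curl -s -X POST localhost:1234/attach-hook \
//         -d '{"tenant_id": "3aa8fcc61f6d357410b7de754b1d9001", "node_id": 1}'
//     => {"gen": 1}
//
// Passing "node_id": null releases the attachment and returns {"gen": null}.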

async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let inspect_req = json_request::<InspectRequest>(&mut req).await?;

    let state = get_state(&req).inner.clone();
    let locked = state.write().await;
    let tenant_state = locked.tenants.get(&inspect_req.tenant_id);

    json_response(
        StatusCode::OK,
        InspectResponse {
            attachment: tenant_state.and_then(|s| s.pageserver.map(|ps| (s.generation, ps))),
        },
    )
}

fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
    endpoint::make_router()
        .data(Arc::new(State::new(persistent_state)))
        .post("/re-attach", |r| request_span(r, handle_re_attach))
        .post("/validate", |r| request_span(r, handle_validate))
        .post("/attach-hook", |r| request_span(r, handle_attach_hook))
        .post("/inspect", |r| request_span(r, handle_inspect))
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    logging::init(
        LogFormat::Plain,
        logging::TracingErrorLayerEnablement::Disabled,
        logging::Output::Stdout,
    )?;

    let args = Cli::parse();
    tracing::info!(
        "Starting, state at {}, listening on {}",
        args.path.to_string_lossy(),
        args.listen
    );

    let persistent_state = PersistentState::load_or_new(&args.path).await;

    let http_listener = tcp_listener::bind(args.listen)?;
    let router = make_router(persistent_state)
        .build()
        .map_err(|err| anyhow!(err))?;
    let service = utils::http::RouterService::new(router).unwrap();
    let server = hyper::Server::from_tcp(http_listener)?.serve(service);

    tracing::info!("Serving on {0}", args.listen);

    tokio::task::spawn(server);

    ShutdownSignals::handle(|signal| match signal {
        Signal::Interrupt | Signal::Terminate | Signal::Quit => {
            tracing::info!("Got {}. Terminating", signal.name());
            // We're just a test helper: no graceful shutdown.
            std::process::exit(0);
        }
    })?;

    Ok(())
}
@@ -6,26 +6,21 @@
 //! rely on `neon_local` to set up the environment for each test.
 //!
 use anyhow::{anyhow, bail, Context, Result};
-use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
+use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
 use compute_api::spec::ComputeMode;
-use control_plane::attachment_service::{
-    AttachmentService, NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy,
-};
+use control_plane::attachment_service::AttachmentService;
 use control_plane::endpoint::ComputeControlPlane;
-use control_plane::local_env::{InitForceMode, LocalEnv};
+use control_plane::local_env::LocalEnv;
 use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
 use control_plane::safekeeper::SafekeeperNode;
+use control_plane::tenant_migration::migrate_tenant;
 use control_plane::{broker, local_env};
-use pageserver_api::models::{
-    ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
-};
-use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
+use pageserver_api::models::TimelineInfo;
 use pageserver_api::{
     DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
     DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
 };
 use postgres_backend::AuthType;
-use postgres_connection::parse_host_port;
 use safekeeper_api::{
     DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
     DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
@@ -35,7 +30,6 @@ use std::path::PathBuf;
 use std::process::exit;
 use std::str::FromStr;
 use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
-use url::Host;
 use utils::{
     auth::{Claims, Scope},
     id::{NodeId, TenantId, TenantTimelineId, TimelineId},
@@ -282,10 +276,10 @@ fn print_timeline(
 /// Connects to the pageserver to query this information.
 async fn get_timeline_infos(
     env: &local_env::LocalEnv,
-    tenant_shard_id: &TenantShardId,
+    tenant_id: &TenantId,
 ) -> Result<HashMap<TimelineId, TimelineInfo>> {
     Ok(get_default_pageserver(env)
-        .timeline_list(tenant_shard_id)
+        .timeline_list(tenant_id)
         .await?
         .into_iter()
         .map(|timeline_info| (timeline_info.timeline_id, timeline_info))
@@ -303,20 +297,6 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R
     }
 }
 
-// Helper function to parse --tenant_id option, for commands that accept a shard suffix
-fn get_tenant_shard_id(
-    sub_match: &ArgMatches,
-    env: &local_env::LocalEnv,
-) -> anyhow::Result<TenantShardId> {
-    if let Some(tenant_id_from_arguments) = parse_tenant_shard_id(sub_match).transpose() {
-        tenant_id_from_arguments
-    } else if let Some(default_id) = env.default_tenant_id {
-        Ok(TenantShardId::unsharded(default_id))
-    } else {
-        anyhow::bail!("No tenant shard id. Use --tenant-id, or set a default tenant");
-    }
-}
-
 fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TenantId>> {
     sub_match
         .get_one::<String>("tenant-id")
@@ -325,14 +305,6 @@ fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TenantId>> {
         .context("Failed to parse tenant id from the argument string")
 }
 
-fn parse_tenant_shard_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TenantShardId>> {
-    sub_match
-        .get_one::<String>("tenant-id")
-        .map(|id_str| TenantShardId::from_str(id_str))
-        .transpose()
-        .context("Failed to parse tenant shard id from the argument string")
-}
-
 fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId>> {
     sub_match
         .get_one::<String>("timeline-id")
@@ -366,7 +338,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
 
     let mut env =
         LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
-    let force = init_match.get_one("force").expect("we set a default value");
+    let force = init_match.get_flag("force");
     env.init(pg_version, force)
         .context("Failed to initialize neon repository")?;
 
@@ -421,68 +393,47 @@ async fn handle_tenant(
         Some(("create", create_match)) => {
             let tenant_conf: HashMap<_, _> = create_match
                 .get_many::<String>("config")
-                .map(|vals: clap::parser::ValuesRef<'_, String>| {
-                    vals.flat_map(|c| c.split_once(':')).collect()
-                })
+                .map(|vals| vals.flat_map(|c| c.split_once(':')).collect())
                 .unwrap_or_default();
 
-            let shard_count: u8 = create_match
-                .get_one::<u8>("shard-count")
-                .cloned()
-                .unwrap_or(0);
-
-            let shard_stripe_size: Option<u32> =
-                create_match.get_one::<u32>("shard-stripe-size").cloned();
-
-            let tenant_conf = PageServerNode::parse_config(tenant_conf)?;
-
             // If tenant ID was not specified, generate one
             let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate);
 
-            // We must register the tenant with the attachment service, so
-            // that when the pageserver restarts, it will be re-attached.
-            let attachment_service = AttachmentService::from_env(env);
-            attachment_service
-                .tenant_create(TenantCreateRequest {
-                    // Note that ::unsharded here isn't actually because the tenant is unsharded, it's because the
-                    // attachment service expects a shard-naive tenant_id in this attribute, and the TenantCreateRequest
-                    // type is used both in attachment service (for creating tenants) and in pageserver (for creating shards)
-                    new_tenant_id: TenantShardId::unsharded(tenant_id),
-                    generation: None,
-                    shard_parameters: ShardParameters {
-                        count: ShardCount(shard_count),
-                        stripe_size: shard_stripe_size
-                            .map(ShardStripeSize)
-                            .unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE),
-                    },
-                    config: tenant_conf,
-                })
+            let generation = if env.control_plane_api.is_some() {
+                // We must register the tenant with the attachment service, so
+                // that when the pageserver restarts, it will be re-attached.
+                let attachment_service = AttachmentService::from_env(env);
+                attachment_service
+                    .attach_hook(tenant_id, pageserver.conf.id)
+                    .await?
+            } else {
+                None
+            };
+
+            pageserver
+                .tenant_create(tenant_id, generation, tenant_conf)
                 .await?;
             println!("tenant {tenant_id} successfully created on the pageserver");
 
             // Create an initial timeline for the new tenant
-            let new_timeline_id =
-                parse_timeline_id(create_match)?.unwrap_or(TimelineId::generate());
+            let new_timeline_id = parse_timeline_id(create_match)?;
             let pg_version = create_match
                 .get_one::<u32>("pg-version")
                 .copied()
                 .context("Failed to parse postgres version from the argument string")?;
 
-            // FIXME: passing None for ancestor_start_lsn is not kosher in a sharded world: we can't have
-            // different shards picking different start lsns. Maybe we have to teach attachment service
-            // to let shard 0 branch first and then propagate the chosen LSN to other shards.
-            attachment_service
-                .tenant_timeline_create(
-                    tenant_id,
-                    TimelineCreateRequest {
-                        new_timeline_id,
-                        ancestor_timeline_id: None,
-                        ancestor_start_lsn: None,
-                        existing_initdb_timeline_id: None,
-                        pg_version: Some(pg_version),
-                    },
-                )
+            let timeline_info = pageserver
+                .timeline_create(
+                    tenant_id,
+                    new_timeline_id,
+                    None,
+                    None,
+                    Some(pg_version),
+                    None,
+                )
                 .await?;
+            let new_timeline_id = timeline_info.timeline_id;
+            let last_record_lsn = timeline_info.last_record_lsn;
 
             env.register_branch_mapping(
                 DEFAULT_BRANCH_NAME.to_string(),
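
The generation handshake above is the heart of this hunk: `attach_hook` is consulted only when a control-plane API is configured. A minimal sketch of that flow, using only signatures visible in this diff (`create_with_generation` is a hypothetical wrapper, not code from this change; error handling trimmed):

```rust
// Sketch: obtain a generation from the attachment service when one is
// configured; otherwise create the tenant without a generation.
async fn create_with_generation(
    env: &LocalEnv,
    pageserver: &PageServerNode,
    tenant_id: TenantId,
    tenant_conf: HashMap<&str, &str>,
) -> anyhow::Result<()> {
    let generation = if env.control_plane_api.is_some() {
        // Registering via attach_hook means the pageserver can be
        // re-attached with the right generation after a restart.
        AttachmentService::from_env(env)
            .attach_hook(tenant_id, pageserver.conf.id)
            .await?
    } else {
        None
    };
    pageserver
        .tenant_create(tenant_id, generation, tenant_conf)
        .await?;
    Ok(())
}
```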
@@ -490,7 +441,9 @@ async fn handle_tenant(
                 new_timeline_id,
             )?;
 
-            println!("Created an initial timeline '{new_timeline_id}' for tenant: {tenant_id}",);
+            println!(
+                "Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {tenant_id}",
+            );
 
             if create_match.get_flag("set-default") {
                 println!("Setting tenant {tenant_id} as a default one");
@@ -517,64 +470,14 @@ async fn handle_tenant(
             println!("tenant {tenant_id} successfully configured on the pageserver");
         }
         Some(("migrate", matches)) => {
-            let tenant_shard_id = get_tenant_shard_id(matches, env)?;
+            let tenant_id = get_tenant_id(matches, env)?;
             let new_pageserver = get_pageserver(env, matches)?;
             let new_pageserver_id = new_pageserver.conf.id;
 
-            let attachment_service = AttachmentService::from_env(env);
-            attachment_service
-                .tenant_migrate(tenant_shard_id, new_pageserver_id)
-                .await?;
-
-            println!("tenant {tenant_shard_id} migrated to {}", new_pageserver_id);
+            migrate_tenant(env, tenant_id, new_pageserver).await?;
+            println!("tenant {tenant_id} migrated to {}", new_pageserver_id);
         }
-        Some(("status", matches)) => {
-            let tenant_id = get_tenant_id(matches, env)?;
-
-            let mut shard_table = comfy_table::Table::new();
-            shard_table.set_header(["Shard", "Pageserver", "Physical Size"]);
-
-            let mut tenant_synthetic_size = None;
-
-            let attachment_service = AttachmentService::from_env(env);
-            for shard in attachment_service.tenant_locate(tenant_id).await?.shards {
-                let pageserver =
-                    PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?);
-
-                let size = pageserver
-                    .http_client
-                    .tenant_details(shard.shard_id)
-                    .await?
-                    .tenant_info
-                    .current_physical_size
-                    .unwrap();
-
-                shard_table.add_row([
-                    format!("{}", shard.shard_id.shard_slug()),
-                    format!("{}", shard.node_id.0),
-                    format!("{} MiB", size / (1024 * 1024)),
-                ]);
-
-                if shard.shard_id.is_zero() {
-                    tenant_synthetic_size =
-                        Some(pageserver.tenant_synthetic_size(shard.shard_id).await?);
-                }
-            }
-
-            let Some(synthetic_size) = tenant_synthetic_size else {
-                bail!("Shard 0 not found")
-            };
-
-            let mut tenant_table = comfy_table::Table::new();
-            tenant_table.add_row(["Tenant ID".to_string(), tenant_id.to_string()]);
-            tenant_table.add_row([
-                "Synthetic size".to_string(),
-                format!("{} MiB", synthetic_size.size.unwrap_or(0) / (1024 * 1024)),
-            ]);
-
-            println!("{tenant_table}");
-            println!("{shard_table}");
-        }
         Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
         None => bail!("no tenant subcommand provided"),
     }
@@ -586,10 +489,8 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
 
     match timeline_match.subcommand() {
         Some(("list", list_match)) => {
-            // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service
-            // where shard 0 is attached, and query there.
-            let tenant_shard_id = get_tenant_shard_id(list_match, env)?;
-            let timelines = pageserver.timeline_list(&tenant_shard_id).await?;
+            let tenant_id = get_tenant_id(list_match, env)?;
+            let timelines = pageserver.timeline_list(&tenant_id).await?;
             print_timelines_tree(timelines, env.timeline_name_mappings())?;
         }
         Some(("create", create_match)) => {
@@ -604,19 +505,18 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
                 .context("Failed to parse postgres version from the argument string")?;
 
             let new_timeline_id_opt = parse_timeline_id(create_match)?;
-            let new_timeline_id = new_timeline_id_opt.unwrap_or(TimelineId::generate());
-
-            let attachment_service = AttachmentService::from_env(env);
-            let create_req = TimelineCreateRequest {
-                new_timeline_id,
-                ancestor_timeline_id: None,
-                existing_initdb_timeline_id: None,
-                ancestor_start_lsn: None,
-                pg_version: Some(pg_version),
-            };
-            let timeline_info = attachment_service
-                .tenant_timeline_create(tenant_id, create_req)
+            let timeline_info = pageserver
+                .timeline_create(
+                    tenant_id,
+                    new_timeline_id_opt,
+                    None,
+                    None,
+                    Some(pg_version),
+                    None,
+                )
                 .await?;
+            let new_timeline_id = timeline_info.timeline_id;
 
             let last_record_lsn = timeline_info.last_record_lsn;
             env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?;
@@ -674,6 +574,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
                 None,
                 pg_version,
                 ComputeMode::Primary,
+                DEFAULT_PAGESERVER_ID,
             )?;
             println!("Done");
         }
@@ -697,18 +598,17 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
                 .map(|lsn_str| Lsn::from_str(lsn_str))
                 .transpose()
                 .context("Failed to parse ancestor start Lsn from the request")?;
-            let new_timeline_id = TimelineId::generate();
-            let attachment_service = AttachmentService::from_env(env);
-            let create_req = TimelineCreateRequest {
-                new_timeline_id,
-                ancestor_timeline_id: Some(ancestor_timeline_id),
-                existing_initdb_timeline_id: None,
-                ancestor_start_lsn: start_lsn,
-                pg_version: None,
-            };
-            let timeline_info = attachment_service
-                .tenant_timeline_create(tenant_id, create_req)
+            let timeline_info = pageserver
+                .timeline_create(
+                    tenant_id,
+                    None,
+                    start_lsn,
+                    Some(ancestor_timeline_id),
+                    None,
+                    None,
+                )
                 .await?;
+            let new_timeline_id = timeline_info.timeline_id;
 
             let last_record_lsn = timeline_info.last_record_lsn;
 
@@ -735,10 +635,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
 
     match sub_name {
         "list" => {
-            // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service
-            // where shard 0 is attached, and query there.
-            let tenant_shard_id = get_tenant_shard_id(sub_args, env)?;
-            let timeline_infos = get_timeline_infos(env, &tenant_shard_id)
+            let tenant_id = get_tenant_id(sub_args, env)?;
+            let timeline_infos = get_timeline_infos(env, &tenant_id)
                 .await
                 .unwrap_or_else(|e| {
                     eprintln!("Failed to load timeline info: {}", e);
@@ -763,7 +661,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
             for (endpoint_id, endpoint) in cplane
                 .endpoints
                 .iter()
-                .filter(|(_, endpoint)| endpoint.tenant_id == tenant_shard_id.tenant_id)
+                .filter(|(_, endpoint)| endpoint.tenant_id == tenant_id)
             {
                 let lsn_str = match endpoint.mode {
                     ComputeMode::Static(lsn) => {
@@ -782,10 +680,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                 };
 
                 let branch_name = timeline_name_mappings
-                    .get(&TenantTimelineId::new(
-                        tenant_shard_id.tenant_id,
-                        endpoint.timeline_id,
-                    ))
+                    .get(&TenantTimelineId::new(tenant_id, endpoint.timeline_id))
                     .map(|name| name.as_str())
                     .unwrap_or("?");
 
@@ -833,6 +728,13 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                 .copied()
                 .unwrap_or(false);
 
+            let pageserver_id =
+                if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
+                    NodeId(id_str.parse().context("while parsing pageserver id")?)
+                } else {
+                    DEFAULT_PAGESERVER_ID
+                };
+
             let mode = match (lsn, hot_standby) {
                 (Some(lsn), false) => ComputeMode::Static(lsn),
                 (None, true) => ComputeMode::Replica,
@@ -860,6 +762,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                 http_port,
                 pg_version,
                 mode,
+                pageserver_id,
             )?;
         }
         "start" => {
@@ -869,11 +772,9 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
 
             let pageserver_id =
                 if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
-                    Some(NodeId(
-                        id_str.parse().context("while parsing pageserver id")?,
-                    ))
+                    NodeId(id_str.parse().context("while parsing pageserver id")?)
                 } else {
-                    None
+                    DEFAULT_PAGESERVER_ID
                 };
 
             let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
@@ -904,38 +805,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                 endpoint.timeline_id,
             )?;
 
-            let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
-                let conf = env.get_pageserver_conf(pageserver_id).unwrap();
-                let parsed = parse_host_port(&conf.listen_pg_addr).expect("Bad config");
-                (
-                    vec![(parsed.0, parsed.1.unwrap_or(5432))],
-                    // If caller is telling us what pageserver to use, this is not a tenant which is
-                    // fully managed by attachment service, therefore not sharded.
-                    ShardParameters::DEFAULT_STRIPE_SIZE,
-                )
-            } else {
-                // Look up the currently attached location of the tenant, and its striping metadata,
-                // to pass these on to postgres.
-                let attachment_service = AttachmentService::from_env(env);
-                let locate_result = attachment_service.tenant_locate(endpoint.tenant_id).await?;
-                let pageservers = locate_result
-                    .shards
-                    .into_iter()
-                    .map(|shard| {
-                        (
-                            Host::parse(&shard.listen_pg_addr)
-                                .expect("Attachment service reported bad hostname"),
-                            shard.listen_pg_port,
-                        )
-                    })
-                    .collect::<Vec<_>>();
-                let stripe_size = locate_result.shard_params.stripe_size;
-
-                (pageservers, stripe_size)
-            };
-            assert!(!pageservers.is_empty());
-
-            let ps_conf = env.get_pageserver_conf(DEFAULT_PAGESERVER_ID)?;
+            let ps_conf = env.get_pageserver_conf(pageserver_id)?;
             let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
                 let claims = Claims::new(Some(endpoint.tenant_id), Scope::Tenant);
 
@@ -946,13 +816,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
 
             println!("Starting existing endpoint {endpoint_id}...");
             endpoint
-                .start(
-                    &auth_token,
-                    safekeepers,
-                    pageservers,
-                    remote_ext_config,
-                    stripe_size.0 as usize,
-                )
+                .start(&auth_token, safekeepers, remote_ext_config)
                 .await?;
         }
         "reconfigure" => {
@@ -963,31 +827,15 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                 .endpoints
                 .get(endpoint_id.as_str())
                 .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
-            let pageservers =
+            let pageserver_id =
                 if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
-                    let ps_id = NodeId(id_str.parse().context("while parsing pageserver id")?);
-                    let pageserver = PageServerNode::from_env(env, env.get_pageserver_conf(ps_id)?);
-                    vec![(
-                        pageserver.pg_connection_config.host().clone(),
-                        pageserver.pg_connection_config.port(),
-                    )]
+                    Some(NodeId(
+                        id_str.parse().context("while parsing pageserver id")?,
+                    ))
                 } else {
-                    let attachment_service = AttachmentService::from_env(env);
-                    attachment_service
-                        .tenant_locate(endpoint.tenant_id)
-                        .await?
-                        .shards
-                        .into_iter()
-                        .map(|shard| {
-                            (
-                                Host::parse(&shard.listen_pg_addr)
-                                    .expect("Attachment service reported malformed host"),
-                                shard.listen_pg_port,
-                            )
-                        })
-                        .collect::<Vec<_>>()
+                    None
                 };
-            endpoint.reconfigure(pageservers).await?;
+            endpoint.reconfigure(pageserver_id).await?;
         }
         "stop" => {
             let endpoint_id = sub_args
@@ -1111,21 +959,6 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
             }
         }
 
-        Some(("set-state", subcommand_args)) => {
-            let pageserver = get_pageserver(env, subcommand_args)?;
-            let scheduling = subcommand_args.get_one("scheduling");
-            let availability = subcommand_args.get_one("availability");
-
-            let attachment_service = AttachmentService::from_env(env);
-            attachment_service
-                .node_configure(NodeConfigureRequest {
-                    node_id: pageserver.conf.id,
-                    scheduling: scheduling.cloned(),
-                    availability: availability.cloned(),
-                })
-                .await?;
-        }
-
         Some(("status", subcommand_args)) => {
             match get_pageserver(env, subcommand_args)?.check_status().await {
                 Ok(_) => println!("Page server is up and running"),
@@ -1433,15 +1266,9 @@ fn cli() -> Command {
         .required(false);
 
     let force_arg = Arg::new("force")
-        .value_parser(value_parser!(InitForceMode))
+        .value_parser(value_parser!(bool))
        .long("force")
-        .default_value(
-            InitForceMode::MustNotExist
-                .to_possible_value()
-                .unwrap()
-                .get_name()
-                .to_owned(),
-        )
+        .action(ArgAction::SetTrue)
         .help("Force initialization even if the repository is not empty")
         .required(false);
 
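
For reference, the new `--force` wiring above pairs `ArgAction::SetTrue` with `get_flag` in the handler. A self-contained sketch of that pattern (clap 4-style, matching the API used in this change):

```rust
use clap::{Arg, ArgAction, Command};

fn main() {
    let matches = Command::new("init")
        .arg(
            Arg::new("force")
                .long("force")
                .action(ArgAction::SetTrue)
                .help("Force initialization even if the repository is not empty"),
        )
        .get_matches_from(["init", "--force"]);
    // SetTrue stores `false` by default and `true` when the flag is present,
    // so no default_value/possible_value plumbing is needed.
    assert!(matches.get_flag("force"));
}
```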
@@ -1525,8 +1352,6 @@ fn cli() -> Command {
             .arg(pg_version_arg.clone())
             .arg(Arg::new("set-default").long("set-default").action(ArgAction::SetTrue).required(false)
                 .help("Use this tenant in future CLI commands where tenant_id is needed, but not specified"))
-            .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
-            .arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages"))
             )
             .subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true))
                 .about("Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified"))
@@ -1537,9 +1362,6 @@ fn cli() -> Command {
                 .about("Migrate a tenant from one pageserver to another")
                 .arg(tenant_id_arg.clone())
                 .arg(pageserver_id_arg.clone()))
-            .subcommand(Command::new("status")
-                .about("Human readable summary of the tenant's shards and attachment locations")
-                .arg(tenant_id_arg.clone()))
         )
         .subcommand(
             Command::new("pageserver")
@@ -1559,12 +1381,6 @@ fn cli() -> Command {
                 .about("Restart local pageserver")
                 .arg(pageserver_config_args.clone())
             )
-            .subcommand(Command::new("set-state")
-                .arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active"))
-                .arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active"))
-                .about("Set scheduling or availability state of pageserver node")
-                .arg(pageserver_config_args.clone())
-            )
         )
         .subcommand(
             Command::new("attachment_service")
control_plane/src/endpoint.rs
@@ -46,14 +46,11 @@ use std::time::Duration;
 
 use anyhow::{anyhow, bail, Context, Result};
 use compute_api::spec::RemoteExtSpec;
-use nix::sys::signal::kill;
-use nix::sys::signal::Signal;
 use serde::{Deserialize, Serialize};
-use url::Host;
 use utils::id::{NodeId, TenantId, TimelineId};
 
-use crate::attachment_service::AttachmentService;
 use crate::local_env::LocalEnv;
+use crate::pageserver::PageServerNode;
 use crate::postgresql_conf::PostgresConf;
 
 use compute_api::responses::{ComputeState, ComputeStatus};
@@ -70,6 +67,7 @@ pub struct EndpointConf {
     http_port: u16,
     pg_version: u32,
     skip_pg_catalog_updates: bool,
+    pageserver_id: NodeId,
 }
 
 //
@@ -121,14 +119,19 @@ impl ComputeControlPlane {
         http_port: Option<u16>,
         pg_version: u32,
         mode: ComputeMode,
+        pageserver_id: NodeId,
     ) -> Result<Arc<Endpoint>> {
         let pg_port = pg_port.unwrap_or_else(|| self.get_port());
         let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
+        let pageserver =
+            PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
+
         let ep = Arc::new(Endpoint {
             endpoint_id: endpoint_id.to_owned(),
             pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
             http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port),
             env: self.env.clone(),
+            pageserver,
             timeline_id,
             mode,
             tenant_id,
@@ -154,6 +157,7 @@ impl ComputeControlPlane {
                 pg_port,
                 pg_version,
                 skip_pg_catalog_updates: true,
+                pageserver_id,
             })?,
         )?;
         std::fs::write(
@@ -212,6 +216,7 @@ pub struct Endpoint {
     // These are not part of the endpoint as such, but the environment
     // the endpoint runs in.
     pub env: LocalEnv,
+    pageserver: PageServerNode,
 
     // Optimizations
     skip_pg_catalog_updates: bool,
@@ -234,11 +239,15 @@ impl Endpoint {
             let conf: EndpointConf =
                 serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;
 
+            let pageserver =
+                PageServerNode::from_env(env, env.get_pageserver_conf(conf.pageserver_id)?);
+
             Ok(Endpoint {
                 pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port),
                 http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port),
                 endpoint_id,
                 env: env.clone(),
+                pageserver,
                 timeline_id: conf.timeline_id,
                 mode: conf.mode,
                 tenant_id: conf.tenant_id,
@@ -430,14 +439,11 @@ impl Endpoint {
         Ok(())
     }
 
-    fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> {
+    fn wait_for_compute_ctl_to_exit(&self) -> Result<()> {
         // TODO use background_process::stop_process instead
         let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
         let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
         let pid = nix::unistd::Pid::from_raw(pid as i32);
-        if send_sigterm {
-            kill(pid, Signal::SIGTERM).ok();
-        }
         crate::background_process::wait_until_stopped("compute_ctl", pid)?;
         Ok(())
     }
@@ -458,21 +464,11 @@ impl Endpoint {
         }
     }
 
-    fn build_pageserver_connstr(pageservers: &[(Host, u16)]) -> String {
-        pageservers
-            .iter()
-            .map(|(host, port)| format!("postgresql://no_user@{host}:{port}"))
-            .collect::<Vec<_>>()
-            .join(",")
-    }
-
     pub async fn start(
         &self,
         auth_token: &Option<String>,
         safekeepers: Vec<NodeId>,
-        pageservers: Vec<(Host, u16)>,
         remote_ext_config: Option<&String>,
-        shard_stripe_size: usize,
     ) -> Result<()> {
         if self.status() == "running" {
             anyhow::bail!("The endpoint is already running");
@@ -486,9 +482,13 @@ impl Endpoint {
             std::fs::remove_dir_all(self.pgdata())?;
         }
 
-        let pageserver_connstring = Self::build_pageserver_connstr(&pageservers);
-        assert!(!pageserver_connstring.is_empty());
-
+        let pageserver_connstring = {
+            let config = &self.pageserver.pg_connection_config;
+            let (host, port) = (config.host(), config.port());
+
+            // NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
+            format!("postgresql://no_user@{host}:{port}")
+        };
         let mut safekeeper_connstrings = Vec::new();
         if self.mode == ComputeMode::Primary {
             for sk_id in safekeepers {
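
The single-pageserver connection string built above has a fixed shape; a small standalone illustration (host and port are made-up values, not from the diff):

```rust
fn main() {
    // Illustrative values only; the real ones come from the endpoint's
    // configured pageserver.
    let (host, port) = ("127.0.0.1", 64000);
    // No spaces: the string can be forwarded to other processes verbatim.
    let connstring = format!("postgresql://no_user@{host}:{port}");
    assert_eq!(connstring, "postgresql://no_user@127.0.0.1:64000");
}
```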
@@ -537,8 +537,6 @@ impl Endpoint {
             safekeeper_connstrings,
             storage_auth_token: auth_token.clone(),
             remote_extensions,
-            pgbouncer_settings: None,
-            shard_stripe_size: Some(shard_stripe_size),
         };
         let spec_path = self.endpoint_path().join("spec.json");
         std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -661,7 +659,7 @@ impl Endpoint {
         }
     }
 
-    pub async fn reconfigure(&self, mut pageservers: Vec<(Host, u16)>) -> Result<()> {
+    pub async fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
         let mut spec: ComputeSpec = {
             let spec_path = self.endpoint_path().join("spec.json");
             let file = std::fs::File::open(spec_path)?;
@@ -671,26 +669,24 @@ impl Endpoint {
         let postgresql_conf = self.read_postgresql_conf()?;
         spec.cluster.postgresql_conf = Some(postgresql_conf);
 
-        // If we weren't given explicit pageservers, query the attachment service
-        if pageservers.is_empty() {
-            let attachment_service = AttachmentService::from_env(&self.env);
-            let locate_result = attachment_service.tenant_locate(self.tenant_id).await?;
-            pageservers = locate_result
-                .shards
-                .into_iter()
-                .map(|shard| {
-                    (
-                        Host::parse(&shard.listen_pg_addr)
-                            .expect("Attachment service reported bad hostname"),
-                        shard.listen_pg_port,
-                    )
-                })
-                .collect::<Vec<_>>();
-        }
+        if let Some(pageserver_id) = pageserver_id {
+            let endpoint_config_path = self.endpoint_path().join("endpoint.json");
+            let mut endpoint_conf: EndpointConf = {
+                let file = std::fs::File::open(&endpoint_config_path)?;
+                serde_json::from_reader(file)?
+            };
+            endpoint_conf.pageserver_id = pageserver_id;
+            std::fs::write(
+                endpoint_config_path,
+                serde_json::to_string_pretty(&endpoint_conf)?,
+            )?;
 
-        let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
-        assert!(!pageserver_connstr.is_empty());
-        spec.pageserver_connstring = Some(pageserver_connstr);
+            let pageserver =
+                PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
+            let ps_http_conf = &pageserver.pg_connection_config;
+            let (host, port) = (ps_http_conf.host(), ps_http_conf.port());
+            spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}"));
+        }
 
         let client = reqwest::Client::new();
         let response = client
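
The reconfigure path above persists the new pageserver id by rewriting `endpoint.json` before patching the spec. A reduced, self-contained sketch of that round-trip (`EndpointConfLite` is a stand-in for the real struct, not a type from the codebase):

```rust
use serde::{Deserialize, Serialize};
use std::path::Path;

#[derive(Serialize, Deserialize)]
struct EndpointConfLite {
    pageserver_id: u64,
}

fn rewrite_pageserver_id(path: &Path, new_id: u64) -> anyhow::Result<()> {
    // Read-modify-write, mirroring the diff: open, deserialize, patch the
    // single field, and write the whole file back pretty-printed.
    let mut conf: EndpointConfLite = serde_json::from_reader(std::fs::File::open(path)?)?;
    conf.pageserver_id = new_id;
    std::fs::write(path, serde_json::to_string_pretty(&conf)?)?;
    Ok(())
}
```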
@@ -736,15 +732,10 @@ impl Endpoint {
             &None,
         )?;
 
-        // Also wait for the compute_ctl process to die. It might have some
-        // cleanup work to do after postgres stops, like syncing safekeepers,
-        // etc.
+        // Also wait for the compute_ctl process to die. It might have some cleanup
+        // work to do after postgres stops, like syncing safekeepers, etc.
         //
-        // If destroying, send it SIGTERM before waiting. Sometimes we do *not*
-        // want this cleanup: tests intentionally do stop when majority of
-        // safekeepers is down, so sync-safekeepers would hang otherwise. This
-        // could be a separate flag though.
-        self.wait_for_compute_ctl_to_exit(destroy)?;
+        self.wait_for_compute_ctl_to_exit()?;
         if destroy {
             println!(
                 "Destroying postgres data directory '{}'",
control_plane/src/lib.rs
@@ -14,3 +14,4 @@ pub mod local_env;
 pub mod pageserver;
 pub mod postgresql_conf;
 pub mod safekeeper;
+pub mod tenant_migration;
control_plane/src/local_env.rs
@@ -5,7 +5,6 @@
 
 use anyhow::{bail, ensure, Context};
 
-use clap::ValueEnum;
 use postgres_backend::AuthType;
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
@@ -163,31 +162,6 @@ impl Default for SafekeeperConf {
     }
 }
 
-#[derive(Clone, Copy)]
-pub enum InitForceMode {
-    MustNotExist,
-    EmptyDirOk,
-    RemoveAllContents,
-}
-
-impl ValueEnum for InitForceMode {
-    fn value_variants<'a>() -> &'a [Self] {
-        &[
-            Self::MustNotExist,
-            Self::EmptyDirOk,
-            Self::RemoveAllContents,
-        ]
-    }
-
-    fn to_possible_value(&self) -> Option<clap::builder::PossibleValue> {
-        Some(clap::builder::PossibleValue::new(match self {
-            InitForceMode::MustNotExist => "must-not-exist",
-            InitForceMode::EmptyDirOk => "empty-dir-ok",
-            InitForceMode::RemoveAllContents => "remove-all-contents",
-        }))
-    }
-}
-
 impl SafekeeperConf {
     /// Compute is served by port on which only tenant scoped tokens allowed, if
     /// it is configured.
@@ -251,13 +225,7 @@ impl LocalEnv {
         if let Some(conf) = self.pageservers.iter().find(|node| node.id == id) {
             Ok(conf)
         } else {
-            let have_ids = self
-                .pageservers
-                .iter()
-                .map(|node| format!("{}:{}", node.id, node.listen_http_addr))
-                .collect::<Vec<_>>();
-            let joined = have_ids.join(",");
-            bail!("could not find pageserver {id}, have ids {joined}")
+            bail!("could not find pageserver {id}")
         }
     }
 
@@ -416,7 +384,7 @@ impl LocalEnv {
     //
     // Initialize a new Neon repository
     //
-    pub fn init(&mut self, pg_version: u32, force: &InitForceMode) -> anyhow::Result<()> {
+    pub fn init(&mut self, pg_version: u32, force: bool) -> anyhow::Result<()> {
         // check if config already exists
         let base_path = &self.base_data_dir;
         ensure!(
@@ -425,34 +393,25 @@ impl LocalEnv {
         );
 
         if base_path.exists() {
-            match force {
-                InitForceMode::MustNotExist => {
-                    bail!(
-                        "directory '{}' already exists. Perhaps already initialized?",
-                        base_path.display()
-                    );
-                }
-                InitForceMode::EmptyDirOk => {
-                    if let Some(res) = std::fs::read_dir(base_path)?.next() {
-                        res.context("check if directory is empty")?;
-                        anyhow::bail!("directory not empty: {base_path:?}");
-                    }
-                }
-                InitForceMode::RemoveAllContents => {
-                    println!("removing all contents of '{}'", base_path.display());
-                    // instead of directly calling `remove_dir_all`, we keep the original dir but removing
-                    // all contents inside. This helps if the developer symbol links another directory (i.e.,
-                    // S3 local SSD) to the `.neon` base directory.
-                    for entry in std::fs::read_dir(base_path)? {
-                        let entry = entry?;
-                        let path = entry.path();
-                        if path.is_dir() {
-                            fs::remove_dir_all(&path)?;
-                        } else {
-                            fs::remove_file(&path)?;
-                        }
-                    }
-                }
+            if force {
+                println!("removing all contents of '{}'", base_path.display());
+                // instead of directly calling `remove_dir_all`, we keep the original dir but removing
+                // all contents inside. This helps if the developer symbol links another directory (i.e.,
+                // S3 local SSD) to the `.neon` base directory.
+                for entry in std::fs::read_dir(base_path)? {
+                    let entry = entry?;
+                    let path = entry.path();
+                    if path.is_dir() {
+                        fs::remove_dir_all(&path)?;
+                    } else {
+                        fs::remove_file(&path)?;
+                    }
+                }
+            } else {
+                bail!(
+                    "directory '{}' already exists. Perhaps already initialized? (Hint: use --force to remove all contents)",
+                    base_path.display()
+                );
             }
         }
 
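
The `--force` branch above deliberately empties the directory rather than removing it, so a symlinked `.neon` base directory survives. The same pattern in isolation:

```rust
use std::{fs, io, path::Path};

fn clear_dir_contents(base_path: &Path) -> io::Result<()> {
    // Keep `base_path` itself (it may be a symlink to another volume);
    // only its children are removed.
    for entry in fs::read_dir(base_path)? {
        let path = entry?.path();
        if path.is_dir() {
            fs::remove_dir_all(&path)?;
        } else {
            fs::remove_file(&path)?;
        }
    }
    Ok(())
}
```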
control_plane/src/pageserver.rs
@@ -17,9 +17,7 @@ use std::time::Duration;
 use anyhow::{bail, Context};
 use camino::Utf8PathBuf;
 use futures::SinkExt;
-use pageserver_api::models::{
-    self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
-};
+use pageserver_api::models::{self, LocationConfig, TenantInfo, TimelineInfo};
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
 use postgres_backend::AuthType;
@@ -108,16 +106,6 @@ impl PageServerNode {
                 "control_plane_api='{}'",
                 control_plane_api.as_str()
             ));
-
-            // Attachment service uses the same auth as pageserver: if JWT is enabled
-            // for us, we will also need it to talk to them.
-            if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
-                let jwt_token = self
-                    .env
-                    .generate_auth_token(&Claims::new(None, Scope::PageServerApi))
-                    .unwrap();
-                overrides.push(format!("control_plane_api_token='{}'", jwt_token));
-            }
         }
 
         if !cli_overrides
@@ -313,8 +301,16 @@ impl PageServerNode {
     pub async fn tenant_list(&self) -> mgmt_api::Result<Vec<TenantInfo>> {
         self.http_client.list_tenants().await
     }
-    pub fn parse_config(mut settings: HashMap<&str, &str>) -> anyhow::Result<models::TenantConfig> {
-        let result = models::TenantConfig {
+
+    pub async fn tenant_create(
+        &self,
+        new_tenant_id: TenantId,
+        generation: Option<u32>,
+        settings: HashMap<&str, &str>,
+    ) -> anyhow::Result<TenantId> {
+        let mut settings = settings.clone();
+
+        let config = models::TenantConfig {
             checkpoint_distance: settings
                 .remove("checkpoint_distance")
                 .map(|x| x.parse::<u64>())
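
Each config field here follows the same remove-parse-context pattern; factored out, it looks like the sketch below (a hypothetical helper for illustration, not one that exists in the codebase):

```rust
use anyhow::Context;
use std::collections::HashMap;

fn take_u64(settings: &mut HashMap<&str, &str>, key: &str) -> anyhow::Result<Option<u64>> {
    settings
        .remove(key) // consume the key so unrecognized leftovers can be detected later
        .map(|x| x.parse::<u64>()) // parse only if the key was present
        .transpose() // Option<Result<..>> -> Result<Option<..>>
        .with_context(|| format!("Failed to parse '{key}' as u64"))
}
```

Consuming keys with `remove` is what lets the code reject unknown settings afterwards with the `if !settings.is_empty()` check seen below.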
@@ -375,26 +371,11 @@ impl PageServerNode {
                 .context("Failed to parse 'gc_feedback' as bool")?,
             heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
         };
-        if !settings.is_empty() {
-            bail!("Unrecognized tenant settings: {settings:?}")
-        } else {
-            Ok(result)
-        }
-    }
-
-    pub async fn tenant_create(
-        &self,
-        new_tenant_id: TenantId,
-        generation: Option<u32>,
-        settings: HashMap<&str, &str>,
-    ) -> anyhow::Result<TenantId> {
-        let config = Self::parse_config(settings.clone())?;
-
         let request = models::TenantCreateRequest {
             new_tenant_id: TenantShardId::unsharded(new_tenant_id),
             generation,
             config,
-            shard_parameters: ShardParameters::default(),
         };
         if !settings.is_empty() {
             bail!("Unrecognized tenant settings: {settings:?}")
@@ -490,39 +471,31 @@ impl PageServerNode {
 
     pub async fn location_config(
         &self,
-        tenant_shard_id: TenantShardId,
+        tenant_id: TenantId,
         config: LocationConfig,
         flush_ms: Option<Duration>,
     ) -> anyhow::Result<()> {
         Ok(self
             .http_client
-            .location_config(tenant_shard_id, config, flush_ms)
+            .location_config(tenant_id, config, flush_ms)
             .await?)
     }
 
-    pub async fn timeline_list(
-        &self,
-        tenant_shard_id: &TenantShardId,
-    ) -> anyhow::Result<Vec<TimelineInfo>> {
-        Ok(self.http_client.list_timelines(*tenant_shard_id).await?)
-    }
-
-    pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> {
-        Ok(self
-            .http_client
-            .tenant_secondary_download(*tenant_id)
-            .await?)
+    pub async fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result<Vec<TimelineInfo>> {
+        Ok(self.http_client.list_timelines(*tenant_id).await?)
     }
 
     pub async fn timeline_create(
         &self,
-        tenant_shard_id: TenantShardId,
-        new_timeline_id: TimelineId,
+        tenant_id: TenantId,
+        new_timeline_id: Option<TimelineId>,
         ancestor_start_lsn: Option<Lsn>,
         ancestor_timeline_id: Option<TimelineId>,
         pg_version: Option<u32>,
         existing_initdb_timeline_id: Option<TimelineId>,
     ) -> anyhow::Result<TimelineInfo> {
+        // If timeline ID was not specified, generate one
+        let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
         let req = models::TimelineCreateRequest {
             new_timeline_id,
             ancestor_start_lsn,
@@ -530,10 +503,7 @@ impl PageServerNode {
             pg_version,
             existing_initdb_timeline_id,
         };
-        Ok(self
-            .http_client
-            .timeline_create(tenant_shard_id, &req)
-            .await?)
+        Ok(self.http_client.timeline_create(tenant_id, &req).await?)
     }
 
     /// Import a basebackup prepared using either:
@@ -611,14 +581,4 @@ impl PageServerNode {
 
         Ok(())
     }
-
-    pub async fn tenant_synthetic_size(
-        &self,
-        tenant_shard_id: TenantShardId,
-    ) -> anyhow::Result<TenantHistorySize> {
-        Ok(self
-            .http_client
-            .tenant_synthetic_size(tenant_shard_id)
-            .await?)
-    }
 }
205
control_plane/src/tenant_migration.rs
Normal file
205
control_plane/src/tenant_migration.rs
Normal file
@@ -0,0 +1,205 @@
|
|||||||
|
//!
|
||||||
|
//! Functionality for migrating tenants across pageservers: unlike most of neon_local, this code
|
||||||
|
//! isn't scoped to a particular physical service, as it needs to update compute endpoints to
|
||||||
|
//! point to the new pageserver.
|
||||||
|
//!
|
||||||
|
use crate::local_env::LocalEnv;
|
||||||
|
use crate::{
|
||||||
|
attachment_service::AttachmentService, endpoint::ComputeControlPlane,
|
||||||
|
pageserver::PageServerNode,
|
||||||
|
};
|
||||||
|
use pageserver_api::models::{
|
||||||
|
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
|
||||||
|
};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::time::Duration;
|
||||||
|
use utils::{
|
||||||
|
id::{TenantId, TimelineId},
|
||||||
|
lsn::Lsn,
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Given an attached pageserver, retrieve the LSN for all timelines
|
||||||
|
async fn get_lsns(
|
||||||
|
tenant_id: TenantId,
|
||||||
|
pageserver: &PageServerNode,
|
||||||
|
) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
|
||||||
|
let timelines = pageserver.timeline_list(&tenant_id).await?;
|
||||||
|
Ok(timelines
|
||||||
|
.into_iter()
|
||||||
|
.map(|t| (t.timeline_id, t.last_record_lsn))
|
||||||
|
.collect())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Wait for the timeline LSNs on `pageserver` to catch up with or overtake
|
||||||
|
/// `baseline`.
|
||||||
|
async fn await_lsn(
|
||||||
|
tenant_id: TenantId,
|
||||||
|
pageserver: &PageServerNode,
|
||||||
|
baseline: HashMap<TimelineId, Lsn>,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
loop {
|
||||||
|
let latest = match get_lsns(tenant_id, pageserver).await {
|
||||||
|
Ok(l) => l,
|
||||||
|
Err(e) => {
|
||||||
|
println!(
|
||||||
|
"🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
|
||||||
|
pageserver.conf.id
|
||||||
|
);
|
||||||
|
std::thread::sleep(Duration::from_millis(500));
|
||||||
|
continue;
|
||||||
|
}
|
        };

        let mut any_behind: bool = false;
        for (timeline_id, baseline_lsn) in &baseline {
            match latest.get(timeline_id) {
                Some(latest_lsn) => {
                    println!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}");
                    if latest_lsn < baseline_lsn {
                        any_behind = true;
                    }
                }
                None => {
                    // Expected timeline isn't yet visible on the migration destination.
                    // (In real life we would have to account for timeline deletion, but
                    // this is just a test helper.)
                    any_behind = true;
                }
            }
        }

        if !any_behind {
            println!("✅ LSN caught up. Proceeding...");
            break;
        } else {
            std::thread::sleep(Duration::from_millis(500));
        }
    }

    Ok(())
}

/// This function spans multiple services, to demonstrate live migration of a tenant
/// between pageservers:
/// - Coordinate attach/secondary/detach on pageservers
/// - Call into attachment_service for generations
/// - Reconfigure compute endpoints to point to the new attached pageserver
pub async fn migrate_tenant(
    env: &LocalEnv,
    tenant_id: TenantId,
    dest_ps: PageServerNode,
) -> anyhow::Result<()> {
    // Get a new generation
    let attachment_service = AttachmentService::from_env(env);

    fn build_location_config(
        mode: LocationConfigMode,
        generation: Option<u32>,
        secondary_conf: Option<LocationConfigSecondary>,
    ) -> LocationConfig {
        LocationConfig {
            mode,
            generation,
            secondary_conf,
            tenant_conf: TenantConfig::default(),
            shard_number: 0,
            shard_count: 0,
            shard_stripe_size: 0,
        }
    }

    let previous = attachment_service.inspect(tenant_id).await?;
    let mut baseline_lsns = None;
    if let Some((generation, origin_ps_id)) = &previous {
        let origin_ps = PageServerNode::from_env(env, env.get_pageserver_conf(*origin_ps_id)?);

        if origin_ps_id == &dest_ps.conf.id {
            println!("🔁 Already attached to {origin_ps_id}, freshening...");
            let gen = attachment_service
                .attach_hook(tenant_id, dest_ps.conf.id)
                .await?;
            let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
            dest_ps.location_config(tenant_id, dest_conf, None).await?;
            println!("✅ Migration complete");
            return Ok(());
        }

        println!("🔁 Switching origin pageserver {origin_ps_id} to stale mode");

        let stale_conf =
            build_location_config(LocationConfigMode::AttachedStale, Some(*generation), None);
        origin_ps
            .location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))
            .await?;

        baseline_lsns = Some(get_lsns(tenant_id, &origin_ps).await?);
    }

    let gen = attachment_service
        .attach_hook(tenant_id, dest_ps.conf.id)
        .await?;
    let dest_conf = build_location_config(LocationConfigMode::AttachedMulti, gen, None);

    println!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
    dest_ps.location_config(tenant_id, dest_conf, None).await?;

    if let Some(baseline) = baseline_lsns {
        println!("🕑 Waiting for LSN to catch up...");
        await_lsn(tenant_id, &dest_ps, baseline).await?;
    }

    let cplane = ComputeControlPlane::load(env.clone())?;
    for (endpoint_name, endpoint) in &cplane.endpoints {
        if endpoint.tenant_id == tenant_id {
            println!(
                "🔁 Reconfiguring endpoint {} to use pageserver {}",
                endpoint_name, dest_ps.conf.id
            );
            endpoint.reconfigure(Some(dest_ps.conf.id)).await?;
        }
    }

    for other_ps_conf in &env.pageservers {
        if other_ps_conf.id == dest_ps.conf.id {
            continue;
        }

        let other_ps = PageServerNode::from_env(env, other_ps_conf);
        let other_ps_tenants = other_ps.tenant_list().await?;

        // Check if this tenant is attached
        let found = other_ps_tenants
            .into_iter()
            .map(|t| t.id)
            .any(|i| i.tenant_id == tenant_id);
        if !found {
            continue;
        }

        // Downgrade to a secondary location
        let secondary_conf = build_location_config(
            LocationConfigMode::Secondary,
            None,
            Some(LocationConfigSecondary { warm: true }),
        );

        println!(
            "💤 Switching to secondary mode on pageserver {}",
            other_ps.conf.id
        );
        other_ps
            .location_config(tenant_id, secondary_conf, None)
            .await?;
    }

    println!(
        "🔁 Switching to AttachedSingle mode on pageserver {}",
        dest_ps.conf.id
    );
    let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
    dest_ps.location_config(tenant_id, dest_conf, None).await?;

    println!("✅ Migration complete");

    Ok(())
}
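For context, a hypothetical invocation of this helper from a `neon_local`-style command handler might look as follows; `dest_id` and the surrounding argument parsing are assumed for illustration and are not part of the code above:

```rust
// Hypothetical caller sketch (assumed names): migrate a tenant to the
// pageserver identified by `dest_id`.
let dest_ps = PageServerNode::from_env(&env, env.get_pageserver_conf(dest_id)?);
migrate_tenant(&env, tenant_id, dest_ps).await?;
```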
@@ -35,7 +35,6 @@ allow = [
 "Artistic-2.0",
 "BSD-2-Clause",
 "BSD-3-Clause",
-"CC0-1.0",
 "ISC",
 "MIT",
 "MPL-2.0",
@@ -1,142 +0,0 @@
# Vectored Timeline Get

Created on: 2024-01-02
Author: Christian Schwarz

# Summary

A brief RFC / GitHub Epic describing a vectored version of the `Timeline::get` method that is at the heart of Pageserver.

# Motivation

During basebackup, we issue many `Timeline::get` calls for SLRU pages that are *adjacent* in key space.
For an example, see
https://github.com/neondatabase/neon/blob/5c88213eaf1b1e29c610a078d0b380f69ed49a7e/pageserver/src/basebackup.rs#L281-L302.

Each of these `Timeline::get` calls must traverse the layer map to gather reconstruct data (`Timeline::get_reconstruct_data`) for the requested page number (`blknum` in the example).
For each layer visited by layer map traversal, we do a `DiskBtree` point lookup.
If it's negative (no entry), we resume layer map traversal.
If it's positive, we collect the result in our reconstruct data bag.
If the reconstruct data bag contents suffice to reconstruct the page, we're done with `get_reconstruct_data` and move on to walredo.
Otherwise, we resume layer map traversal.
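In pseudocode, that per-key read path looks roughly like the sketch below; `layer_map_traversal` and `ReconstructData` are illustrative stand-ins for the actual pageserver internals, not real names:

```rust
// Minimal sketch of the current single-key read path (assumed shapes).
fn get_reconstruct_data_sketch(key: Key, lsn: Lsn) -> anyhow::Result<ReconstructData> {
    let mut bag = ReconstructData::new();
    // Walk the layer map, newest to oldest, for this one key.
    for layer in layer_map_traversal(key, lsn) {
        // One DiskBtree point lookup per visited layer.
        if let Some(data) = layer.get_value_reconstruct_data(key, lsn)? {
            bag.extend(data);
        }
        if bag.is_complete() {
            break; // enough data to reconstruct the page; hand off to walredo
        }
    }
    Ok(bag)
}
```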
Doing this many `Timeline::get` calls is quite inefficient because:

1. We do the layer map traversal repeatedly, even if, e.g., all the data sits in the same image layer at the bottom of the stack.
2. We may visit many DiskBtree inner pages multiple times for point lookups of different keys.
   This is likely particularly bad for L0s, which span the whole key space and hence must be visited by layer map traversal, but may not contain the data we're looking for.
3. Anecdotally, keys adjacent in keyspace and written simultaneously also end up physically adjacent in the layer files [^1].
   So, to provide the reconstruct data for N adjacent keys, we would actually only _need_ to issue a single large read to the filesystem, instead of the N reads we currently do.
   The filesystem, in turn, ideally stores the layer file physically contiguously, so our large read will turn into one IOP toward the disk.

[^1]: https://www.notion.so/neondatabase/Christian-Investigation-Slow-Basebackups-Early-2023-12-34ea5c7dcdc1485d9ac3731da4d2a6fc?pvs=4#15ee4e143392461fa64590679c8f54c9

# Solution

We should have a vectored aka batched aka scatter-gather style alternative API for `Timeline::get`. Having such an API unlocks:

* more efficient basebackup
* batched IO during compaction (useful for strides of unchanged pages)
* page_service: expose vectored get_page_at_lsn for compute (=> good for seqscan / prefetch)
* if [on-demand SLRU downloads](https://github.com/neondatabase/neon/pull/6151) land before vectored Timeline::get, on-demand SLRU downloads will still benefit from this API

# DoD

There is a new variant of `Timeline::get`, called `Timeline::get_vectored`.
It takes as arguments an `lsn: Lsn` and a `src: &[KeyVec]` where `struct KeyVec { base: Key, count: usize }`.

It is up to the implementor to figure out a suitable and efficient way to return the reconstructed page images.
It is sufficient to simply return a `Vec<Bytes>`, but more efficient solutions can likely be found after studying all the callers of `Timeline::get`.

Functionally, the behavior of `Timeline::get_vectored` is equivalent to

```rust
// Pseudocode: the logically equivalent per-key loop.
let keys_iter = src
    .iter()
    .flat_map(|KeyVec { base, count }| (0..*count).map(move |i| base.add(i as u32))); // base + i
let mut out = Vec::new();
for key in keys_iter {
    let data = Timeline::get(key, lsn)?;
    out.push(data);
}
return out;
```

However, unlike the above, an ideal solution will

* Visit each `struct Layer` at most once.
* For each visited layer, call `Layer::get_value_reconstruct_data` at most once.
  * This means: read each `DiskBtree` page at most once.
* Facilitate merging of the reads we issue to the OS and eventually NVMe.
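Put together, the new entry point could look something like the sketch below. The `KeyVec` shape is taken from the DoD above; the `Vec<Bytes>` return type is just the simple placeholder option, and the method body is deliberately left open:

```rust
pub struct KeyVec {
    /// First key of the contiguous range.
    pub base: Key,
    /// Number of keys in the range, starting at `base`.
    pub count: usize,
}

impl Timeline {
    /// Vectored variant of `Timeline::get`: one layer map traversal, and at
    /// most one `get_value_reconstruct_data` call per visited layer, for all
    /// requested keys.
    pub async fn get_vectored(&self, src: &[KeyVec], lsn: Lsn) -> anyhow::Result<Vec<Bytes>> {
        todo!("see the Implementation section")
    }
}
```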
Each of these items above represents a significant amount of work.

## Performance

Ideally, the **base performance** of a vectored get of a single page should be identical to the current `Timeline::get`.
A reasonable constant overhead over the current `Timeline::get` is acceptable.

The performance improvement for the vectored use case is demonstrated in some way, e.g., using the `pagebench` basebackup benchmark against a tenant with a lot of SLRU segments.

# Implementation

High-level set of tasks / changes to be made:

- **Get clarity on API**:
  - Define a naive `Timeline::get_vectored` implementation & adopt it across pageserver.
  - The tricky thing here will be the return type (e.g. `Vec<Bytes>` vs `impl Stream`).
  - Start with something simple to explore the different usages of the API.
    Then iterate with peers until we have something that is good enough.
- **Vectored Layer Map traversal**
  - Vectored `LayerMap::search` (take 1 LSN and N `Key`s instead of just 1 LSN and 1 `Key`)
  - Refactor `Timeline::get_reconstruct_data` to hold & return state for N `Key`s instead of 1
    - The slightly tricky part here is what to do about `cont_lsn` [after we've found some reconstruct data for some keys](https://github.com/neondatabase/neon/blob/d066dad84b076daf3781cdf9a692098889d3974e/pageserver/src/tenant/timeline.rs#L2378-L2385) but need more.
      Likely we'll need to keep track of `cont_lsn` per key and continue the next iteration at `max(cont_lsn)` of all keys that still need data (see the sketch after this list).
- **Vectored `Layer::get_value_reconstruct_data` / `DiskBtree`**
  - Current code calls it [here](https://github.com/neondatabase/neon/blob/d066dad84b076daf3781cdf9a692098889d3974e/pageserver/src/tenant/timeline.rs#L2378-L2384).
  - Delta layers use `DiskBtreeReader::visit()` to collect the `(offset,len)` pairs for delta record blobs to load.
  - Image layers use `DiskBtreeReader::get` to get the offset of the image blob to load. Underneath, that's just a `::visit()` call.
  - What needs to happen to `DiskBtree::visit()`?
    * Minimally:
      * Take a single `KeyVec` instead of a single `Key` as argument, i.e., take a single contiguous key range to visit.
      * Change the visit code to invoke the callback for all values in the `KeyVec`'s key range.
      * This should be good enough for what we've seen when investigating basebackup slowness, because there, the key ranges are contiguous.
    * Ideally:
      * Take a `&[KeyVec]` and sort it;
      * during Btree traversal, peek at the next `KeyVec` range to determine whether we need to descend or back out.
      * NB: this should be a straightforward extension of the minimal solution above, as we'll already be checking for "is there more key range in the requested `KeyVec`".
- **Facilitate merging of the reads we issue to the OS and eventually NVMe.**
  - `DiskBtree::visit` produces a set of offsets which we then read from a `VirtualFile` [here](https://github.com/neondatabase/neon/blob/292281c9dfb24152b728b1a846cc45105dac7fe0/pageserver/src/tenant/storage_layer/delta_layer.rs#L772-L804).
  - [Delta layer reads](https://github.com/neondatabase/neon/blob/292281c9dfb24152b728b1a846cc45105dac7fe0/pageserver/src/tenant/storage_layer/delta_layer.rs#L772-L804)
    - We hit (and rely on) `PageCache` and `VirtualFile` here (not great under pressure).
  - [Image layer reads](https://github.com/neondatabase/neon/blob/292281c9dfb24152b728b1a846cc45105dac7fe0/pageserver/src/tenant/storage_layer/image_layer.rs#L429-L435)
  - What needs to happen is the **vectorization of the `blob_io` interface and then the `VirtualFile` API**.
  - That is tricky because
    - the `VirtualFile` API, which sits underneath `blob_io`, is being touched by the ongoing [io_uring work](https://github.com/neondatabase/neon/pull/5824);
    - there's the question of how IO buffers will be managed; currently this area relies heavily on `PageCache`, but there's controversy around the future of `PageCache`.
      - The guiding principle here should be to avoid coupling this work to the `PageCache`.
      - I.e., treat `PageCache` as an extra hop in the I/O chain, rather than as an integral part of buffer management.
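The per-key `cont_lsn` idea from the traversal bullet above can be made concrete with a small state-tracking sketch. This is an assumed shape for illustration only; `VectoredSearchState` and its fields are invented names, not pageserver types:

```rust
use std::collections::HashMap;

/// Sketch: traversal state for a vectored get. Each key that still needs
/// reconstruct data remembers its own `cont_lsn`, i.e. where its search
/// must continue on the LSN axis.
struct VectoredSearchState {
    pending: HashMap<Key, Lsn>,
}

impl VectoredSearchState {
    /// Continue the next layer map iteration at the max `cont_lsn` over all
    /// keys that still need data; keys with a lower `cont_lsn` simply ignore
    /// values found in layers above their own position.
    fn next_cont_lsn(&self) -> Option<Lsn> {
        self.pending.values().copied().max()
    }

    /// Record that `key` got some reconstruct data down to (exclusive) `lsn`.
    fn record_progress(&mut self, key: Key, lsn: Lsn) {
        self.pending.insert(key, lsn);
    }

    /// Drop a key once its reconstruct data is complete.
    fn mark_complete(&mut self, key: &Key) {
        self.pending.remove(key);
    }
}
```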
Let's see how we can improve by doing the first three items in the above list first, then revisit.

## Rollout / Feature Flags

No feature flags are required for this epic.

At the end of this epic, `Timeline::get` forwards to `Timeline::get_vectored`, i.e., it's an all-or-nothing type of change.

It is encouraged to deliver this feature incrementally, i.e., do many small PRs over multiple weeks.
That will help isolate performance regressions across weekly releases.

# Interaction With Sharding

[Sharding](https://github.com/neondatabase/neon/pull/5432) splits up the key space; see functions `is_key_local` / `key_to_shard_number`.

Just as with `Timeline::get`, callers of `Timeline::get_vectored` are responsible for ensuring that they only ask for blocks of the given `struct Timeline`'s shard.

Given that this is already the case, there shouldn't be significant interaction/interference with sharding.

However, let's have a safety check for this constraint (error or assertion), because there are currently few affordances at the higher layers of Pageserver for sharding<=>keyspace interaction.
For example, `KeySpace` is not broken up by shard stripe, so if someone naively converted the compaction code to issue a vectored get for a keyspace range, it would violate this constraint.
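A minimal form of that safety check, assuming access to the timeline's `ShardIdentity` and the `is_key_local` helper named above (the surrounding plumbing and the `base.add(i)` key arithmetic are illustrative):

```rust
for keyvec in src {
    for i in 0..keyvec.count {
        let key = keyvec.base.add(i as u32); // illustrative: i-th key of the range
        // Fail loudly instead of silently serving another shard's keys.
        anyhow::ensure!(
            self.shard_identity.is_key_local(&key),
            "get_vectored called with key {key} belonging to another shard"
        );
    }
}
```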
@@ -129,13 +129,13 @@ Run `poetry shell` to activate the virtual environment.
 Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`.

 ### Obligatory checks
-We force code formatting via `ruff`, and type hints via `mypy`.
+We force code formatting via `black`, `ruff`, and type hints via `mypy`.
 Run the following commands in the repository's root (next to `pyproject.toml`):

 ```bash
-poetry run ruff format . # All code is reformatted
-poetry run ruff check . # Python linter
+poetry run black . # All code is reformatted
+poetry run ruff . # Python linter
 poetry run mypy . # Ensure there are no typing errors
 ```

 **WARNING**: do not run `mypy` from a directory other than the root of the repository.
@@ -73,12 +73,6 @@ pub struct ComputeSpec {
     // information about available remote extensions
     pub remote_extensions: Option<RemoteExtSpec>,

-    pub pgbouncer_settings: Option<HashMap<String, String>>,
-
-    // Stripe size for pageserver sharding, in pages
-    #[serde(default)]
-    pub shard_stripe_size: Option<usize>,
 }

 /// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
@@ -86,13 +80,10 @@ pub struct ComputeSpec {
 #[serde(rename_all = "snake_case")]
 pub enum ComputeFeature {
     // XXX: Add more feature flags here.
-    /// Enable the experimental activity monitor logic, which uses `pg_stat_database` to
-    /// track short-lived connections as user activity.
-    ActivityMonitorExperimental,
-
-    /// This is a special feature flag that is used to represent unknown feature flags.
-    /// Basically all unknown to enum flags are represented as this one. See unit test
-    /// `parse_unknown_features()` for more details.
+    // This is a special feature flag that is used to represent unknown feature flags.
+    // Basically all unknown to enum flags are represented as this one. See unit test
+    // `parse_unknown_features()` for more details.
     #[serde(other)]
     UnknownFeature,
 }
@@ -289,23 +280,4 @@ mod tests {
         assert!(spec.features.contains(&ComputeFeature::UnknownFeature));
         assert_eq!(spec.features, vec![ComputeFeature::UnknownFeature; 2]);
     }
-
-    #[test]
-    fn parse_known_features() {
-        // Test that we can properly parse known feature flags.
-        let file = File::open("tests/cluster_spec.json").unwrap();
-        let mut json: serde_json::Value = serde_json::from_reader(file).unwrap();
-        let ob = json.as_object_mut().unwrap();
-
-        // Add known feature flags.
-        let features = vec!["activity_monitor_experimental"];
-        ob.insert("features".into(), features.into());
-
-        let spec: ComputeSpec = serde_json::from_value(json).unwrap();
-
-        assert_eq!(
-            spec.features,
-            vec![ComputeFeature::ActivityMonitorExperimental]
-        );
-    }
 }
@@ -243,9 +243,5 @@
     "public_extensions": [
       "postgis"
     ]
-  },
-  "pgbouncer_settings": {
-    "default_pool_size": "42",
-    "pool_mode": "session"
-  }
+  }
 }
@@ -19,7 +19,6 @@ strum.workspace = true
 strum_macros.workspace = true
 hex.workspace = true
 thiserror.workspace = true
-humantime-serde.workspace = true

 workspace_hack.workspace = true
@@ -3,8 +3,6 @@ use byteorder::{ByteOrder, BE};
 use serde::{Deserialize, Serialize};
 use std::fmt;

-use crate::reltag::{BlockNumber, RelTag};
-
 /// Key used in the Repository kv-store.
 ///
 /// The Repository treats this as an opaque struct, but see the code in pgdatadir_mapping.rs
@@ -143,25 +141,8 @@ impl Key {
     }
 }

-#[inline(always)]
 pub fn is_rel_block_key(key: &Key) -> bool {
-    key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff
+    key.field1 == 0x00 && key.field4 != 0
 }
-
-/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`.
-pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
-    Ok(match key.field1 {
-        0x00 => (
-            RelTag {
-                spcnode: key.field2,
-                dbnode: key.field3,
-                relnode: key.field4,
-                forknum: key.field5,
-            },
-            key.field6,
-        ),
-        _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
-    })
-}

 impl std::str::FromStr for Key {
@@ -114,21 +114,16 @@ impl KeySpaceAccum {
         }
     }

-    #[inline(always)]
     pub fn add_key(&mut self, key: Key) {
         self.add_range(singleton_range(key))
     }

-    #[inline(always)]
     pub fn add_range(&mut self, range: Range<Key>) {
         match self.accum.as_mut() {
             Some(accum) => {
                 if range.start == accum.end {
                     accum.end = range.end;
                 } else {
-                    // TODO: to efficiently support small sharding stripe sizes, we should avoid starting
-                    // a new range here if the skipped region was all keys that don't belong on this shard.
-                    // (https://github.com/neondatabase/neon/issues/6247)
                     assert!(range.start > accum.end);
                     self.ranges.push(accum.clone());
                     *accum = range;
@@ -2,9 +2,9 @@ pub mod partitioning;

 use std::{
     collections::HashMap,
-    io::{BufRead, Read},
+    io::Read,
     num::{NonZeroU64, NonZeroUsize},
-    time::{Duration, SystemTime},
+    time::SystemTime,
 };

 use byteorder::{BigEndian, ReadBytesExt};
@@ -18,10 +18,7 @@ use utils::{
     lsn::Lsn,
 };

-use crate::{
-    reltag::RelTag,
-    shard::{ShardCount, ShardStripeSize, TenantShardId},
-};
+use crate::{reltag::RelTag, shard::TenantShardId};
 use anyhow::bail;
 use bytes::{Buf, BufMut, Bytes, BytesMut};
@@ -191,31 +188,6 @@ pub struct TimelineCreateRequest {
     pub pg_version: Option<u32>,
 }

-/// Parameters that apply to all shards in a tenant. Used during tenant creation.
-#[derive(Serialize, Deserialize, Debug)]
-#[serde(deny_unknown_fields)]
-pub struct ShardParameters {
-    pub count: ShardCount,
-    pub stripe_size: ShardStripeSize,
-}
-
-impl ShardParameters {
-    pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
-
-    pub fn is_unsharded(&self) -> bool {
-        self.count == ShardCount(0)
-    }
-}
-
-impl Default for ShardParameters {
-    fn default() -> Self {
-        Self {
-            count: ShardCount(0),
-            stripe_size: Self::DEFAULT_STRIPE_SIZE,
-        }
-    }
-}

 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantCreateRequest {
@@ -223,12 +195,6 @@ pub struct TenantCreateRequest {
     #[serde(default)]
     #[serde(skip_serializing_if = "Option::is_none")]
     pub generation: Option<u32>,

-    // If omitted, create a single shard with TenantShardId::unsharded()
-    #[serde(default)]
-    #[serde(skip_serializing_if = "ShardParameters::is_unsharded")]
-    pub shard_parameters: ShardParameters,

     #[serde(flatten)]
     pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
 }
@@ -251,7 +217,7 @@ impl std::ops::Deref for TenantCreateRequest {

 /// An alternative representation of `pageserver::tenant::TenantConf` with
 /// simpler types.
-#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
+#[derive(Serialize, Deserialize, Debug, Default)]
 pub struct TenantConfig {
     pub checkpoint_distance: Option<u64>,
     pub checkpoint_timeout: Option<String>,
@@ -266,41 +232,21 @@ pub struct TenantConfig {
     pub lagging_wal_timeout: Option<String>,
     pub max_lsn_wal_lag: Option<NonZeroU64>,
     pub trace_read_requests: Option<bool>,
-    pub eviction_policy: Option<EvictionPolicy>,
+    // We defer the parsing of the eviction_policy field to the request handler.
+    // Otherwise we'd have to move the types for eviction policy into this package.
+    // We might do that once the eviction feature has stabilized.
+    // For now, this field is not even documented in the openapi_spec.yml.
+    pub eviction_policy: Option<serde_json::Value>,
     pub min_resident_size_override: Option<u64>,
     pub evictions_low_residence_duration_metric_threshold: Option<String>,
     pub gc_feedback: Option<bool>,
     pub heatmap_period: Option<String>,
 }

-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
-#[serde(tag = "kind")]
-pub enum EvictionPolicy {
-    NoEviction,
-    LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
-}
-
-impl EvictionPolicy {
-    pub fn discriminant_str(&self) -> &'static str {
-        match self {
-            EvictionPolicy::NoEviction => "NoEviction",
-            EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
-        }
-    }
-}
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
-pub struct EvictionPolicyLayerAccessThreshold {
-    #[serde(with = "humantime_serde")]
-    pub period: Duration,
-    #[serde(with = "humantime_serde")]
-    pub threshold: Duration,
-}

 /// A flattened analog of a `pageserver::tenant::LocationMode`, which
 /// lists out all possible states (and the virtual "Detached" state)
 /// in a flat form rather than using rust-style enums.
-#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
+#[derive(Serialize, Deserialize, Debug)]
 pub enum LocationConfigMode {
     AttachedSingle,
     AttachedMulti,
@@ -309,21 +255,19 @@ pub enum LocationConfigMode {
     Detached,
 }

-#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
+#[derive(Serialize, Deserialize, Debug)]
 pub struct LocationConfigSecondary {
     pub warm: bool,
 }

 /// An alternative representation of `pageserver::tenant::LocationConf`,
 /// for use in external-facing APIs.
-#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
+#[derive(Serialize, Deserialize, Debug)]
 pub struct LocationConfig {
     pub mode: LocationConfigMode,
     /// If attaching, in what generation?
     #[serde(default)]
     pub generation: Option<u32>,

-    // If requesting mode `Secondary`, configuration for that.
     #[serde(default)]
     pub secondary_conf: Option<LocationConfigSecondary>,
@@ -336,17 +280,11 @@ pub struct LocationConfig {
     #[serde(default)]
     pub shard_stripe_size: u32,

-    // This configuration only affects attached mode, but should be provided irrespective
-    // of the mode, as a secondary location might transition on startup if the response
-    // to the `/re-attach` control plane API requests it.
+    // If requesting mode `Secondary`, configuration for that.
+    // Custom storage configuration for the tenant, if any
     pub tenant_conf: TenantConfig,
 }

-#[derive(Serialize, Deserialize)]
-pub struct LocationConfigListResponse {
-    pub tenant_shards: Vec<(TenantShardId, Option<LocationConfig>)>,
-}

 #[derive(Serialize, Deserialize)]
 #[serde(transparent)]
 pub struct TenantCreateResponse(pub TenantId);
@@ -359,7 +297,7 @@ pub struct StatusResponse {
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantLocationConfigRequest {
-    pub tenant_id: TenantShardId,
+    pub tenant_id: TenantId,
     #[serde(flatten)]
     pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
 }
@@ -430,8 +368,6 @@ pub struct TenantInfo {
     /// If a layer is present in both local FS and S3, it counts only once.
     pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
     pub attachment_status: TenantAttachmentStatus,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub generation: Option<u32>,
 }

 #[derive(Serialize, Deserialize, Clone)]
@@ -621,6 +557,19 @@ pub enum DownloadRemoteLayersTaskState {
     ShutDown,
 }

+pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
+
+/// Information for configuring a single fail point
+#[derive(Debug, Serialize, Deserialize)]
+pub struct FailpointConfig {
+    /// Name of the fail point
+    pub name: String,
+    /// List of actions to take, using the format described in `fail::cfg`
+    ///
+    /// We also support `actions = "exit"` to cause the fail point to immediately exit.
+    pub actions: String,
+}
+
 #[derive(Debug, Serialize, Deserialize)]
 pub struct TimelineGcRequest {
     pub gc_horizon: Option<u64>,
@@ -722,17 +671,6 @@ pub struct PagestreamDbSizeResponse {
     pub db_size: i64,
 }

-// This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields
-// that require pageserver-internal types. It is sufficient to get the total size.
-#[derive(Serialize, Deserialize, Debug)]
-pub struct TenantHistorySize {
-    pub id: TenantId,
-    /// Size is a mixture of WAL and logical size, so the unit is bytes.
-    ///
-    /// Will be none if `?inputs_only=true` was given.
-    pub size: Option<u64>,
-}

 impl PagestreamFeMessage {
     pub fn serialize(&self) -> Bytes {
         let mut bytes = BytesMut::new();
@@ -888,10 +826,9 @@ impl PagestreamBeMessage {
                 PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() })
             }
             Tag::Error => {
-                let mut msg = Vec::new();
-                buf.read_until(0, &mut msg)?;
-                let cstring = std::ffi::CString::from_vec_with_nul(msg)?;
-                let rust_str = cstring.to_str()?;
+                let buf = buf.get_ref();
+                let cstr = std::ffi::CStr::from_bytes_until_nul(buf)?;
+                let rust_str = cstr.to_str()?;
                 PagestreamBeMessage::Error(PagestreamErrorResponse {
                     message: rust_str.to_owned(),
                 })
@@ -985,7 +922,6 @@ mod tests {
             state: TenantState::Active,
             current_physical_size: Some(42),
             attachment_status: TenantAttachmentStatus::Attached,
-            generation: None,
         };
         let expected_active = json!({
             "id": original_active.id.to_string(),
@@ -1006,7 +942,6 @@ mod tests {
             },
             current_physical_size: Some(42),
             attachment_status: TenantAttachmentStatus::Attached,
-            generation: None,
         };
         let expected_broken = json!({
             "id": original_broken.id.to_string(),
@@ -32,9 +32,6 @@ pub struct RelTag {
     pub relnode: Oid,
 }

-/// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type.
-pub type BlockNumber = u32;
-
 impl PartialOrd for RelTag {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         Some(self.cmp(other))
@@ -1,9 +1,6 @@
 use std::{ops::RangeInclusive, str::FromStr};

-use crate::{
-    key::{is_rel_block_key, Key},
-    models::ShardParameters,
-};
+use crate::key::{is_rel_block_key, Key};
 use hex::FromHex;
 use serde::{Deserialize, Serialize};
 use thiserror;
@@ -84,16 +81,6 @@ impl TenantShardId {
     pub fn is_zero(&self) -> bool {
         self.shard_number == ShardNumber(0)
     }

-    pub fn is_unsharded(&self) -> bool {
-        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
-    }
-
-    pub fn to_index(&self) -> ShardIndex {
-        ShardIndex {
-            shard_number: self.shard_number,
-            shard_count: self.shard_count,
-        }
-    }
 }

 /// Formatting helper
@@ -342,7 +329,7 @@ const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
 pub struct ShardIdentity {
     pub number: ShardNumber,
     pub count: ShardCount,
-    pub stripe_size: ShardStripeSize,
+    stripe_size: ShardStripeSize,
     layout: ShardLayout,
 }
@@ -412,17 +399,6 @@ impl ShardIdentity {
         }
     }

-    /// For use when creating ShardIdentity instances for new shards, where a creation request
-    /// specifies the ShardParameters that apply to all shards.
-    pub fn from_params(number: ShardNumber, params: &ShardParameters) -> Self {
-        Self {
-            number,
-            count: params.count,
-            layout: LAYOUT_V1,
-            stripe_size: params.stripe_size,
-        }
-    }

     fn is_broken(&self) -> bool {
         self.layout == LAYOUT_BROKEN
     }
@@ -442,21 +418,6 @@ impl ShardIdentity {
         }
     }

-    /// Return true if the key should be discarded if found in this shard's
-    /// data store, e.g. during compaction after a split
-    pub fn is_key_disposable(&self, key: &Key) -> bool {
-        if key_is_shard0(key) {
-            // Q: Why can't we dispose of shard0 content if we're not shard 0?
-            // A: because the WAL ingestion logic currently ingests some shard 0
-            // content on all shards, even though it's only read on shard 0. If we
-            // dropped it, then subsequent WAL ingest to these keys would encounter
-            // an error.
-            false
-        } else {
-            !self.is_key_local(key)
-        }
-    }

     pub fn shard_slug(&self) -> String {
         if self.count > ShardCount(0) {
             format!("-{:02x}{:02x}", self.number.0, self.count.0)
@@ -550,7 +511,12 @@ fn key_is_shard0(key: &Key) -> bool {
     // relation pages are distributed to shards other than shard zero. Everything else gets
     // stored on shard 0. This guarantees that shard 0 can independently serve basebackup
     // requests, and any request other than those for particular blocks in relations.
-    !is_rel_block_key(key)
+    //
+    // In this condition:
+    // - is_rel_block_key includes only relations, i.e. excludes SLRU data and
+    //   all metadata.
+    // - field6 is set to -1 for relation size pages.
+    !(is_rel_block_key(key) && key.field6 != 0xffffffff)
 }

 /// Provide the same result as the function in postgres `hashfn.h` with the same name
@@ -35,12 +35,6 @@ pub enum QueryError {
     /// We were instructed to shutdown while processing the query
     #[error("Shutting down")]
     Shutdown,
-    /// Query handler indicated that client should reconnect
-    #[error("Server requested reconnect")]
-    Reconnect,
-    /// Query named an entity that was not found
-    #[error("Not found: {0}")]
-    NotFound(std::borrow::Cow<'static, str>),
     /// Authentication failure
     #[error("Unauthorized: {0}")]
     Unauthorized(std::borrow::Cow<'static, str>),
@@ -60,9 +54,9 @@ impl From<io::Error> for QueryError {
 impl QueryError {
     pub fn pg_error_code(&self) -> &'static [u8; 5] {
         match self {
-            Self::Disconnected(_) | Self::SimulatedConnectionError | Self::Reconnect => b"08006", // connection failure
+            Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure
             Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
-            Self::Unauthorized(_) | Self::NotFound(_) => SQLSTATE_INTERNAL_ERROR,
+            Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR,
             Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
         }
     }
@@ -431,11 +425,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                 info!("Stopped due to shutdown");
                 Ok(())
             }
-            Err(QueryError::Reconnect) => {
-                // Dropping out of this loop implicitly disconnects
-                info!("Stopped due to handler reconnect request");
-                Ok(())
-            }
             Err(QueryError::Disconnected(e)) => {
                 info!("Disconnected ({e:#})");
                 // Disconnection is not an error: we just use it that way internally to drop
@@ -985,9 +974,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, IO> {
 pub fn short_error(e: &QueryError) -> String {
     match e {
         QueryError::Disconnected(connection_error) => connection_error.to_string(),
-        QueryError::Reconnect => "reconnect".to_string(),
         QueryError::Shutdown => "shutdown".to_string(),
-        QueryError::NotFound(_) => "not found".to_string(),
         QueryError::Unauthorized(_e) => "JWT authentication error".to_string(),
         QueryError::SimulatedConnectionError => "simulated connection error".to_string(),
         QueryError::Other(e) => format!("{e:#}"),
@@ -1009,15 +996,9 @@ fn log_query_error(query: &str, e: &QueryError) {
         QueryError::SimulatedConnectionError => {
             error!("query handler for query '{query}' failed due to a simulated connection error")
         }
-        QueryError::Reconnect => {
-            info!("query handler for '{query}' requested client to reconnect")
-        }
         QueryError::Shutdown => {
             info!("query handler for '{query}' cancelled during tenant shutdown")
         }
-        QueryError::NotFound(reason) => {
-            info!("query handler for '{query}' entity not found: {reason}")
-        }
         QueryError::Unauthorized(e) => {
             warn!("query handler for '{query}' failed with authentication error: {e}");
         }
@@ -5,9 +5,7 @@ use std::collections::HashMap;
 use std::env;
 use std::num::NonZeroU32;
 use std::pin::Pin;
-use std::str::FromStr;
 use std::sync::Arc;
-use std::time::Duration;

 use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
 use anyhow::Result;
@@ -15,14 +13,12 @@ use azure_core::request_options::{MaxResults, Metadata, Range};
 use azure_core::RetryOptions;
 use azure_identity::DefaultAzureCredential;
 use azure_storage::StorageCredentials;
-use azure_storage_blobs::blob::CopyStatus;
 use azure_storage_blobs::prelude::ClientBuilder;
 use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
 use bytes::Bytes;
 use futures::stream::Stream;
 use futures_util::StreamExt;
-use http_types::{StatusCode, Url};
+use http_types::StatusCode;
-use tokio::time::Instant;
 use tracing::debug;

 use crate::s3_bucket::RequestKind;
@@ -121,8 +117,6 @@ impl AzureBlobStorage {
     ) -> Result<Download, DownloadError> {
         let mut response = builder.into_stream();

-        let mut etag = None;
-        let mut last_modified = None;
         let mut metadata = HashMap::new();
         // TODO give proper streaming response instead of buffering into RAM
         // https://github.com/neondatabase/neon/issues/5563
@@ -130,13 +124,6 @@ impl AzureBlobStorage {
         let mut bufs = Vec::new();
         while let Some(part) = response.next().await {
             let part = part.map_err(to_download_error)?;
-            let etag_str: &str = part.blob.properties.etag.as_ref();
-            if etag.is_none() {
-                etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
-            }
-            if last_modified.is_none() {
-                last_modified = Some(part.blob.properties.last_modified.into());
-            }
             if let Some(blob_meta) = part.blob.metadata {
                 metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
             }
@@ -149,8 +136,6 @@ impl AzureBlobStorage {
         }
         Ok(Download {
             download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
-            etag,
-            last_modified,
             metadata: Some(StorageMetadata(metadata)),
         })
     }
@@ -326,51 +311,6 @@ impl RemoteStorage for AzureBlobStorage {
         }
         Ok(())
     }
-
-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
-        let _permit = self.permit(RequestKind::Copy).await;
-        let blob_client = self.client.blob_client(self.relative_path_to_name(to));
-
-        let source_url = format!(
-            "{}/{}",
-            self.client.url()?,
-            self.relative_path_to_name(from)
-        );
-        let builder = blob_client.copy(Url::from_str(&source_url)?);
-
-        let result = builder.into_future().await?;
-
-        let mut copy_status = result.copy_status;
-        let start_time = Instant::now();
-        const MAX_WAIT_TIME: Duration = Duration::from_secs(60);
-        loop {
-            match copy_status {
-                CopyStatus::Aborted => {
-                    anyhow::bail!("Received abort for copy from {from} to {to}.");
-                }
-                CopyStatus::Failed => {
-                    anyhow::bail!("Received failure response for copy from {from} to {to}.");
-                }
-                CopyStatus::Success => return Ok(()),
-                CopyStatus::Pending => (),
-            }
-            // The copy is taking longer. Wait a second and then re-check.
-            // TODO: estimate time based on copy_progress and adjust the wait based on that
-            tokio::time::sleep(Duration::from_millis(1000)).await;
-            let properties = blob_client.get_properties().into_future().await?;
-            let Some(status) = properties.blob.properties.copy_status else {
-                tracing::warn!("copy_status for copy is None!, from={from}, to={to}");
-                return Ok(());
-            };
-            if start_time.elapsed() > MAX_WAIT_TIME {
-                anyhow::bail!("Copy from {from} to {to} took longer than limit MAX_WAIT_TIME={}s. copy_progress={:?}.",
-                    MAX_WAIT_TIME.as_secs_f32(),
-                    properties.blob.properties.copy_progress,
-                );
-            }
-            copy_status = status;
-        }
-    }
 }

 pin_project_lite::pin_project! {
@@ -14,9 +14,7 @@ mod local_fs;
 mod s3_bucket;
 mod simulate_failures;

-use std::{
-    collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc, time::SystemTime,
-};
+use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc};

 use anyhow::{bail, Context};
 use camino::{Utf8Path, Utf8PathBuf};
@@ -207,18 +205,10 @@ pub trait RemoteStorage: Send + Sync + 'static {
     async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;

     async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
-
-    /// Copy a remote object inside a bucket from one path to another.
-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>;
 }

-pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
 pub struct Download {
-    pub download_stream: DownloadStream,
+    pub download_stream: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>,
-    /// The last time the file was modified (`last-modified` HTTP header)
-    pub last_modified: Option<SystemTime>,
-    /// A way to identify this specific version of the resource (`etag` HTTP header)
-    pub etag: Option<String>,
     /// Extra key-value data, associated with the current remote file.
     pub metadata: Option<StorageMetadata>,
 }
@@ -377,15 +367,6 @@ impl GenericRemoteStorage {
             Self::Unreliable(s) => s.delete_objects(paths).await,
         }
     }
-
-    pub async fn copy_object(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
-        match self {
-            Self::LocalFs(s) => s.copy(from, to).await,
-            Self::AwsS3(s) => s.copy(from, to).await,
-            Self::AzureBlob(s) => s.copy(from, to).await,
-            Self::Unreliable(s) => s.copy(from, to).await,
-        }
-    }
 }

 impl GenericRemoteStorage {
@@ -672,7 +653,6 @@ impl ConcurrencyLimiter {
             RequestKind::Put => &self.write,
             RequestKind::List => &self.read,
             RequestKind::Delete => &self.write,
-            RequestKind::Copy => &self.write,
         }
     }
@@ -18,7 +18,7 @@ use tokio_util::io::ReaderStream;
 use tracing::*;
 use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};

-use crate::{Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath};
+use crate::{Download, DownloadError, Listing, ListingMode, RemotePath};

 use super::{RemoteStorage, StorageMetadata};
@@ -331,8 +331,6 @@ impl RemoteStorage for LocalFs {
                 .map_err(DownloadError::Other)?;
             Ok(Download {
                 metadata,
-                last_modified: None,
-                etag: None,
                 download_stream: Box::pin(source),
             })
         } else {
@@ -374,17 +372,17 @@ impl RemoteStorage for LocalFs {
                 .await
                 .map_err(DownloadError::Other)?;

-            let download_stream: DownloadStream = match end_exclusive {
-                Some(end_exclusive) => Box::pin(ReaderStream::new(
-                    source.take(end_exclusive - start_inclusive),
-                )),
-                None => Box::pin(ReaderStream::new(source)),
-            };
-            Ok(Download {
-                metadata,
-                last_modified: None,
-                etag: None,
-                download_stream,
-            })
+            Ok(match end_exclusive {
+                Some(end_exclusive) => Download {
+                    metadata,
+                    download_stream: Box::pin(ReaderStream::new(
+                        source.take(end_exclusive - start_inclusive),
+                    )),
+                },
+                None => Download {
+                    metadata,
+                    download_stream: Box::pin(ReaderStream::new(source)),
+                },
+            })
         } else {
             Err(DownloadError::NotFound)
@@ -409,20 +407,6 @@ impl RemoteStorage for LocalFs {
         }
         Ok(())
     }
-
-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
-        let from_path = from.with_base(&self.storage_root);
-        let to_path = to.with_base(&self.storage_root);
-        create_target_directory(&to_path).await?;
-        fs::copy(&from_path, &to_path).await.with_context(|| {
-            format!(
-                "Failed to copy file from '{from_path}' to '{to_path}'",
-                from_path = from_path,
-                to_path = to_path
-            )
-        })?;
-        Ok(())
-    }
 }

 fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
@@ -16,7 +16,6 @@ use aws_config::{
     environment::credentials::EnvironmentVariableCredentialsProvider,
     imds::credentials::ImdsCredentialsProvider,
     meta::credentials::CredentialsProviderChain,
-    profile::ProfileFileCredentialsProvider,
     provider_config::ProviderConfig,
     retry::{RetryConfigBuilder, RetryMode},
     web_identity_token::WebIdentityTokenCredentialsProvider,
@@ -75,29 +74,20 @@ impl S3Bucket {

         let region = Some(Region::new(aws_config.bucket_region.clone()));

-        let provider_conf = ProviderConfig::without_region().with_region(region.clone());
-
         let credentials_provider = {
             // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
             CredentialsProviderChain::first_try(
                 "env",
                 EnvironmentVariableCredentialsProvider::new(),
             )
-            // uses "AWS_PROFILE" / `aws sso login --profile <profile>`
-            .or_else(
-                "profile-sso",
-                ProfileFileCredentialsProvider::builder()
-                    .configure(&provider_conf)
-                    .build(),
-            )
             // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
             // needed to access remote extensions bucket
-            .or_else(
-                "token",
+            .or_else("token", {
+                let provider_conf = ProviderConfig::without_region().with_region(region.clone());
                 WebIdentityTokenCredentialsProvider::builder()
                     .configure(&provider_conf)
-                    .build(),
-            )
+                    .build()
+            })
             // uses imds v2
             .or_else("imds", ImdsCredentialsProvider::builder().build())
         };
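
The hunk above narrows the credentials fallback order to: environment variables, then web-identity token (with the `ProviderConfig` now scoped to that arm), then IMDSv2; the profile-SSO provider is dropped. A reduced sketch of the same chain-building pattern, using only providers that appear in the hunk (web identity omitted here to skip the `ProviderConfig` plumbing):

    use aws_config::environment::credentials::EnvironmentVariableCredentialsProvider;
    use aws_config::imds::credentials::ImdsCredentialsProvider;
    use aws_config::meta::credentials::CredentialsProviderChain;

    // The first provider that yields credentials wins; the string names are
    // just labels used in logging.
    fn env_then_imds() -> CredentialsProviderChain {
        CredentialsProviderChain::first_try("env", EnvironmentVariableCredentialsProvider::new())
            .or_else("imds", ImdsCredentialsProvider::builder().build())
    }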
@@ -231,8 +221,6 @@ impl S3Bucket {
         match get_object {
             Ok(object_output) => {
                 let metadata = object_output.metadata().cloned().map(StorageMetadata);
-                let etag = object_output.e_tag.clone();
-                let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());

                 let body = object_output.body;
                 let body = ByteStreamAsStream::from(body);

@@ -241,8 +229,6 @@ impl S3Bucket {

                 Ok(Download {
                     metadata,
-                    etag,
-                    last_modified,
                     download_stream: Box::pin(body),
                 })
             }
@@ -493,38 +479,6 @@ impl RemoteStorage for S3Bucket {
         Ok(())
     }

-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
-        let kind = RequestKind::Copy;
-        let _guard = self.permit(kind).await;
-
-        let started_at = start_measuring_requests(kind);
-
-        // we need to specify bucket_name as a prefix
-        let copy_source = format!(
-            "{}/{}",
-            self.bucket_name,
-            self.relative_path_to_s3_object(from)
-        );
-
-        let res = self
-            .client
-            .copy_object()
-            .bucket(self.bucket_name.clone())
-            .key(self.relative_path_to_s3_object(to))
-            .copy_source(copy_source)
-            .send()
-            .await;
-
-        let started_at = ScopeGuard::into_inner(started_at);
-        metrics::BUCKET_METRICS
-            .req_seconds
-            .observe_elapsed(kind, &res, started_at);
-
-        res?;
-
-        Ok(())
-    }
-
     async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
         // if prefix is not none then download file `prefix/from`
         // if prefix is none then download file `from`
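
One detail worth keeping from the removed `copy`: S3's CopyObject call takes the source as `"{bucket}/{key}"`, which is why the deleted code prefixed the bucket name. A hypothetical helper (not in the codebase) capturing just that formatting rule:

    // e.g. s3_copy_source("my-bucket", "tenant/timeline/layer")
    //   == "my-bucket/tenant/timeline/layer"
    fn s3_copy_source(bucket: &str, key: &str) -> String {
        format!("{bucket}/{key}")
    }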
@@ -11,7 +11,6 @@ pub(crate) enum RequestKind {
     Put = 1,
     Delete = 2,
     List = 3,
-    Copy = 4,
 }

 use RequestKind::*;

@@ -23,7 +22,6 @@ impl RequestKind {
             Put => "put_object",
             Delete => "delete_object",
             List => "list_objects",
-            Copy => "copy_object",
         }
     }
     const fn as_index(&self) -> usize {

@@ -31,7 +29,7 @@ impl RequestKind {
     }
 }

-pub(super) struct RequestTyped<C>([C; 5]);
+pub(super) struct RequestTyped<C>([C; 4]);

 impl<C> RequestTyped<C> {
     pub(super) fn get(&self, kind: RequestKind) -> &C {

@@ -40,8 +38,8 @@ impl<C> RequestTyped<C> {

     fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
         use RequestKind::*;
-        let mut it = [Get, Put, Delete, List, Copy].into_iter();
-        let arr = std::array::from_fn::<C, 5, _>(|index| {
+        let mut it = [Get, Put, Delete, List].into_iter();
+        let arr = std::array::from_fn::<C, 4, _>(|index| {
             let next = it.next().unwrap();
             assert_eq!(index, next.as_index());
             f(next)
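
The `RequestTyped` change above is mechanical fallout from dropping the `Copy` variant: the container is an array indexed by the enum's discriminant, and `build_with` asserts that the iteration order matches `as_index`. A self-contained model of the same pattern (the `Kind`/`PerKind` names are illustrative, not the crate's):

    #[derive(Clone, Copy)]
    enum Kind { Get = 0, Put = 1, Delete = 2, List = 3 }

    // One slot per enum variant; lookups are a plain array index, no HashMap.
    struct PerKind<C>([C; 4]);

    impl<C> PerKind<C> {
        fn get(&self, kind: Kind) -> &C {
            &self.0[kind as usize]
        }
        fn build_with(mut f: impl FnMut(Kind) -> C) -> Self {
            use Kind::*;
            let mut it = [Get, Put, Delete, List].into_iter();
            PerKind(std::array::from_fn(|index| {
                let next = it.next().unwrap();
                // Guards against reordering variants without updating this list.
                assert_eq!(index, next as usize);
                f(next)
            }))
        }
    }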
@@ -162,11 +162,4 @@ impl RemoteStorage for UnreliableWrapper {
         }
         Ok(())
     }
-
-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
-        // copy is equivalent to download + upload
-        self.attempt(RemoteOp::Download(from.clone()))?;
-        self.attempt(RemoteOp::Upload(to.clone()))?;
-        self.inner.copy_object(from, to).await
-    }
 }
@@ -1,288 +0,0 @@
-use anyhow::Context;
-use camino::Utf8Path;
-use remote_storage::RemotePath;
-use std::collections::HashSet;
-use std::sync::Arc;
-use test_context::test_context;
-use tracing::debug;
-
-use crate::common::{download_to_vec, upload_stream, wrap_stream};
-
-use super::{
-    MaybeEnabledStorage, MaybeEnabledStorageWithSimpleTestBlobs, MaybeEnabledStorageWithTestBlobs,
-};
-
-/// Tests that the S3 client can list all prefixes, even if the response comes paginated and requires multiple S3 queries.
-/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
-/// See the client creation in [`create_s3_client`] for details on the required env vars.
-/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored at runtime with the
-/// default test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
-///
-/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`]
-/// where
-/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid interference between test runs
-/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
-///
-/// Then, verifies that the client does return correct prefixes when queried:
-/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be the `${base_prefix_str}` value only
-/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
-///
-/// With the real S3 enabled and the `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys.
-/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
-/// since the current default AWS S3 pagination limit is 1000.
-/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax)
-///
-/// Lastly, the test attempts to clean up and remove all uploaded S3 files.
-/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
-#[test_context(MaybeEnabledStorageWithTestBlobs)]
-#[tokio::test]
-async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledStorageWithTestBlobs::Enabled(ctx) => ctx,
-        MaybeEnabledStorageWithTestBlobs::Disabled => return Ok(()),
-        MaybeEnabledStorageWithTestBlobs::UploadsFailed(e, _) => {
-            anyhow::bail!("S3 init failed: {e:?}")
-        }
-    };
-
-    let test_client = Arc::clone(&ctx.enabled.client);
-    let expected_remote_prefixes = ctx.remote_prefixes.clone();
-
-    let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
-        .context("common_prefix construction")?;
-    let root_remote_prefixes = test_client
-        .list_prefixes(None)
-        .await
-        .context("client list root prefixes failure")?
-        .into_iter()
-        .collect::<HashSet<_>>();
-    assert_eq!(
-        root_remote_prefixes, HashSet::from([base_prefix.clone()]),
-        "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
-    );
-
-    let nested_remote_prefixes = test_client
-        .list_prefixes(Some(&base_prefix))
-        .await
-        .context("client list nested prefixes failure")?
-        .into_iter()
-        .collect::<HashSet<_>>();
-    let remote_only_prefixes = nested_remote_prefixes
-        .difference(&expected_remote_prefixes)
-        .collect::<HashSet<_>>();
-    let missing_uploaded_prefixes = expected_remote_prefixes
-        .difference(&nested_remote_prefixes)
-        .collect::<HashSet<_>>();
-    assert_eq!(
-        remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
-        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
-    );
-
-    Ok(())
-}
-
-/// Tests that the S3 client can list all files in a folder, even if the response comes paginated and requires multiple S3 queries.
-/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. Test will skip real code and pass if env vars not set.
-/// See `s3_pagination_should_work` for more information.
-///
-/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
-/// Then performs the following queries:
-///    1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
-///    2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
-#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
-#[tokio::test]
-async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
-        MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
-        MaybeEnabledStorageWithSimpleTestBlobs::UploadsFailed(e, _) => {
-            anyhow::bail!("S3 init failed: {e:?}")
-        }
-    };
-    let test_client = Arc::clone(&ctx.enabled.client);
-    let base_prefix =
-        RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
-    let root_files = test_client
-        .list_files(None)
-        .await
-        .context("client list root files failure")?
-        .into_iter()
-        .collect::<HashSet<_>>();
-    assert_eq!(
-        root_files,
-        ctx.remote_blobs.clone(),
-        "remote storage list_files on root mismatches with the uploads."
-    );
-    let nested_remote_files = test_client
-        .list_files(Some(&base_prefix))
-        .await
-        .context("client list nested files failure")?
-        .into_iter()
-        .collect::<HashSet<_>>();
-    let trim_remote_blobs: HashSet<_> = ctx
-        .remote_blobs
-        .iter()
-        .map(|x| x.get_path())
-        .filter(|x| x.starts_with("folder1"))
-        .map(|x| RemotePath::new(x).expect("must be valid path"))
-        .collect();
-    assert_eq!(
-        nested_remote_files, trim_remote_blobs,
-        "remote storage list_files on subdirectory mismatches with the uploads."
-    );
-    Ok(())
-}
-
-#[test_context(MaybeEnabledStorage)]
-#[tokio::test]
-async fn delete_non_exising_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledStorage::Enabled(ctx) => ctx,
-        MaybeEnabledStorage::Disabled => return Ok(()),
-    };
-
-    let path = RemotePath::new(Utf8Path::new(
-        format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
-    ))
-    .with_context(|| "RemotePath conversion")?;
-
-    ctx.client.delete(&path).await.expect("should succeed");
-
-    Ok(())
-}
-
-#[test_context(MaybeEnabledStorage)]
-#[tokio::test]
-async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledStorage::Enabled(ctx) => ctx,
-        MaybeEnabledStorage::Disabled => return Ok(()),
-    };
-
-    let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
-        .with_context(|| "RemotePath conversion")?;
-
-    let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
-        .with_context(|| "RemotePath conversion")?;
-
-    let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
-        .with_context(|| "RemotePath conversion")?;
-
-    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
-    ctx.client.upload(data, len, &path1, None).await?;
-
-    let (data, len) = upload_stream("remote blob data2".as_bytes().into());
-    ctx.client.upload(data, len, &path2, None).await?;
-
-    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
-    ctx.client.upload(data, len, &path3, None).await?;
-
-    ctx.client.delete_objects(&[path1, path2]).await?;
-
-    let prefixes = ctx.client.list_prefixes(None).await?;
-
-    assert_eq!(prefixes.len(), 1);
-
-    ctx.client.delete_objects(&[path3]).await?;
-
-    Ok(())
-}
-
-#[test_context(MaybeEnabledStorage)]
-#[tokio::test]
-async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
-    let MaybeEnabledStorage::Enabled(ctx) = ctx else {
-        return Ok(());
-    };
-
-    let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
-        .with_context(|| "RemotePath conversion")?;
-
-    let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());
-
-    let (data, len) = wrap_stream(orig.clone());
-
-    ctx.client.upload(data, len, &path, None).await?;
-
-    // Normal download request
-    let dl = ctx.client.download(&path).await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig);
-
-    // Full range (end specified)
-    let dl = ctx
-        .client
-        .download_byte_range(&path, 0, Some(len as u64))
-        .await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig);
-
-    // Partial range (end specified)
-    let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig[4..10]);
-
-    // Partial range (end beyond real end)
-    let dl = ctx
-        .client
-        .download_byte_range(&path, 8, Some(len as u64 * 100))
-        .await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig[8..]);
-
-    // Partial range (end unspecified)
-    let dl = ctx.client.download_byte_range(&path, 4, None).await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig[4..]);
-
-    // Full range (end unspecified)
-    let dl = ctx.client.download_byte_range(&path, 0, None).await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig);
-
-    debug!("Cleanup: deleting file at path {path:?}");
-    ctx.client
-        .delete(&path)
-        .await
-        .with_context(|| format!("{path:?} removal"))?;
-
-    Ok(())
-}
-
-#[test_context(MaybeEnabledStorage)]
-#[tokio::test]
-async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
-    let MaybeEnabledStorage::Enabled(ctx) = ctx else {
-        return Ok(());
-    };
-
-    let path = RemotePath::new(Utf8Path::new(
-        format!("{}/file_to_copy", ctx.base_prefix).as_str(),
-    ))
-    .with_context(|| "RemotePath conversion")?;
-    let path_dest = RemotePath::new(Utf8Path::new(
-        format!("{}/file_dest", ctx.base_prefix).as_str(),
-    ))
-    .with_context(|| "RemotePath conversion")?;
-
-    let orig = bytes::Bytes::from_static("remote blob data content".as_bytes());
-
-    let (data, len) = wrap_stream(orig.clone());
-
-    ctx.client.upload(data, len, &path, None).await?;
-
-    // Copy the uploaded blob, then download the copy and compare
-    ctx.client.copy_object(&path, &path_dest).await?;
-
-    let dl = ctx.client.download(&path_dest).await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig);
-
-    debug!("Cleanup: deleting file at path {path:?}");
-    ctx.client
-        .delete_objects(&[path.clone(), path_dest.clone()])
-        .await
-        .with_context(|| format!("{path:?} removal"))?;
-
-    Ok(())
-}
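
The deleted test file above is the main record of the byte-range contract: `download_byte_range(path, start, end)` treats `start` as inclusive, `end` as exclusive, and tolerates an `end` past the real end. A local model of that contract over a plain slice (the helper and its names are illustrative only):

    fn byte_range(data: &[u8], start_inclusive: u64, end_exclusive: Option<u64>) -> &[u8] {
        let start = start_inclusive as usize;
        // An end past EOF is clamped rather than rejected, matching the
        // "end beyond real end" case the deleted test exercised.
        let end = end_exclusive
            .map(|e| (e as usize).min(data.len()))
            .unwrap_or(data.len());
        &data[start..end]
    }
    // byte_range(b"remote blob data here", 4, Some(10)) covers the same bytes
    // as &orig[4..10] in the deleted test.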
@@ -6,23 +6,263 @@ use std::sync::Arc;
 use std::time::UNIX_EPOCH;

 use anyhow::Context;
+use camino::Utf8Path;
 use remote_storage::{
     AzureConfig, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
 };
-use test_context::AsyncTestContext;
-use tracing::info;
+use test_context::{test_context, AsyncTestContext};
+use tracing::{debug, info};

 mod common;

-#[path = "common/tests.rs"]
-mod tests_azure;
-use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_remote_data};
+use common::{
+    cleanup, download_to_vec, ensure_logging_ready, upload_remote_data, upload_simple_remote_data,
+    upload_stream, wrap_stream,
+};

 const ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_AZURE_REMOTE_STORAGE";

 const BASE_PREFIX: &str = "test";

+/// Tests that the Azure client can list all prefixes, even if the response comes paginated and requires multiple HTTP queries.
+/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified.
+/// See the client creation in [`create_azure_client`] for details on the required env vars.
+/// If real Azure tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored at runtime with the
+/// default test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
+///
+/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`]
+/// where
+/// * `random_prefix_part` is set for the entire Azure client during the Azure client creation in [`create_azure_client`], to avoid interference between test runs
+/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
+///
+/// Then, verifies that the client does return correct prefixes when queried:
+/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be the `${base_prefix_str}` value only
+/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
+///
+/// With the real Azure enabled and the `#[cfg(test)]` Rust configuration used, the Azure client test adds a `max-keys` param to limit the response keys.
+/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to Azure.
+///
+/// Lastly, the test attempts to clean up and remove all uploaded Azure files.
+/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
+#[test_context(MaybeEnabledAzureWithTestBlobs)]
+#[tokio::test]
+async fn azure_pagination_should_work(
+    ctx: &mut MaybeEnabledAzureWithTestBlobs,
+) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledAzureWithTestBlobs::Enabled(ctx) => ctx,
+        MaybeEnabledAzureWithTestBlobs::Disabled => return Ok(()),
+        MaybeEnabledAzureWithTestBlobs::UploadsFailed(e, _) => {
+            anyhow::bail!("Azure init failed: {e:?}")
+        }
+    };
+
+    let test_client = Arc::clone(&ctx.enabled.client);
+    let expected_remote_prefixes = ctx.remote_prefixes.clone();
+
+    let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
+        .context("common_prefix construction")?;
+    let root_remote_prefixes = test_client
+        .list_prefixes(None)
+        .await
+        .context("client list root prefixes failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        root_remote_prefixes, HashSet::from([base_prefix.clone()]),
+        "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
+    );
+
+    let nested_remote_prefixes = test_client
+        .list_prefixes(Some(&base_prefix))
+        .await
+        .context("client list nested prefixes failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    let remote_only_prefixes = nested_remote_prefixes
+        .difference(&expected_remote_prefixes)
+        .collect::<HashSet<_>>();
+    let missing_uploaded_prefixes = expected_remote_prefixes
+        .difference(&nested_remote_prefixes)
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
+        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
+    );
+
+    Ok(())
+}
+
+/// Tests that the Azure client can list all files in a folder, even if the response comes paginated and requires multiple Azure queries.
+/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified. Test will skip real code and pass if env vars not set.
+/// See `azure_pagination_should_work` for more information.
+///
+/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
+/// Then performs the following queries:
+///    1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
+///    2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
+#[test_context(MaybeEnabledAzureWithSimpleTestBlobs)]
+#[tokio::test]
+async fn azure_list_files_works(
+    ctx: &mut MaybeEnabledAzureWithSimpleTestBlobs,
+) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledAzureWithSimpleTestBlobs::Enabled(ctx) => ctx,
+        MaybeEnabledAzureWithSimpleTestBlobs::Disabled => return Ok(()),
+        MaybeEnabledAzureWithSimpleTestBlobs::UploadsFailed(e, _) => {
+            anyhow::bail!("Azure init failed: {e:?}")
+        }
+    };
+    let test_client = Arc::clone(&ctx.enabled.client);
+    let base_prefix =
+        RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
+    let root_files = test_client
+        .list_files(None)
+        .await
+        .context("client list root files failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        root_files,
+        ctx.remote_blobs.clone(),
+        "remote storage list_files on root mismatches with the uploads."
+    );
+    let nested_remote_files = test_client
+        .list_files(Some(&base_prefix))
+        .await
+        .context("client list nested files failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    let trim_remote_blobs: HashSet<_> = ctx
+        .remote_blobs
+        .iter()
+        .map(|x| x.get_path())
+        .filter(|x| x.starts_with("folder1"))
+        .map(|x| RemotePath::new(x).expect("must be valid path"))
+        .collect();
+    assert_eq!(
+        nested_remote_files, trim_remote_blobs,
+        "remote storage list_files on subdirectory mismatches with the uploads."
+    );
+    Ok(())
+}
+
+#[test_context(MaybeEnabledAzure)]
+#[tokio::test]
+async fn azure_delete_non_exising_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledAzure::Enabled(ctx) => ctx,
+        MaybeEnabledAzure::Disabled => return Ok(()),
+    };
+
+    let path = RemotePath::new(Utf8Path::new(
+        format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
+    ))
+    .with_context(|| "RemotePath conversion")?;
+
+    ctx.client.delete(&path).await.expect("should succeed");
+
+    Ok(())
+}
+
+#[test_context(MaybeEnabledAzure)]
+#[tokio::test]
+async fn azure_delete_objects_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledAzure::Enabled(ctx) => ctx,
+        MaybeEnabledAzure::Disabled => return Ok(()),
+    };
+
+    let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
+    ctx.client.upload(data, len, &path1, None).await?;
+
+    let (data, len) = upload_stream("remote blob data2".as_bytes().into());
+    ctx.client.upload(data, len, &path2, None).await?;
+
+    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
+    ctx.client.upload(data, len, &path3, None).await?;
+
+    ctx.client.delete_objects(&[path1, path2]).await?;
+
+    let prefixes = ctx.client.list_prefixes(None).await?;
+
+    assert_eq!(prefixes.len(), 1);
+
+    ctx.client.delete_objects(&[path3]).await?;
+
+    Ok(())
+}
+
+#[test_context(MaybeEnabledAzure)]
+#[tokio::test]
+async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
+    let MaybeEnabledAzure::Enabled(ctx) = ctx else {
+        return Ok(());
+    };
+
+    let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());
+
+    let (data, len) = wrap_stream(orig.clone());
+
+    ctx.client.upload(data, len, &path, None).await?;
+
+    // Normal download request
+    let dl = ctx.client.download(&path).await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig);
+
+    // Full range (end specified)
+    let dl = ctx
+        .client
+        .download_byte_range(&path, 0, Some(len as u64))
+        .await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig);
+
+    // Partial range (end specified)
+    let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig[4..10]);
+
+    // Partial range (end beyond real end)
+    let dl = ctx
+        .client
+        .download_byte_range(&path, 8, Some(len as u64 * 100))
+        .await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig[8..]);
+
+    // Partial range (end unspecified)
+    let dl = ctx.client.download_byte_range(&path, 4, None).await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig[4..]);
+
+    // Full range (end unspecified)
+    let dl = ctx.client.download_byte_range(&path, 0, None).await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig);
+
+    debug!("Cleanup: deleting file at path {path:?}");
+    ctx.client
+        .delete(&path)
+        .await
+        .with_context(|| format!("{path:?} removal"))?;
+
+    Ok(())
+}
+
 struct EnabledAzure {
     client: Arc<GenericRemoteStorage>,
     base_prefix: &'static str,

@@ -41,13 +281,13 @@ impl EnabledAzure {
     }
 }

-enum MaybeEnabledStorage {
+enum MaybeEnabledAzure {
     Enabled(EnabledAzure),
     Disabled,
 }

 #[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledStorage {
+impl AsyncTestContext for MaybeEnabledAzure {
     async fn setup() -> Self {
         ensure_logging_ready();

@@ -63,7 +303,7 @@ impl AsyncTestContext for MaybeEnabledStorage {
     }
 }

-enum MaybeEnabledStorageWithTestBlobs {
+enum MaybeEnabledAzureWithTestBlobs {
     Enabled(AzureWithTestBlobs),
     Disabled,
     UploadsFailed(anyhow::Error, AzureWithTestBlobs),

@@ -76,7 +316,7 @@ struct AzureWithTestBlobs {
 }

 #[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
+impl AsyncTestContext for MaybeEnabledAzureWithTestBlobs {
     async fn setup() -> Self {
         ensure_logging_ready();
         if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {

@@ -127,7 +367,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
 // However, they are not identical. The list_prefixes function is concerned with listing prefixes,
 // whereas the list_files function is concerned with listing files.
 // See `RemoteStorage::list_files` documentation for more details
-enum MaybeEnabledStorageWithSimpleTestBlobs {
+enum MaybeEnabledAzureWithSimpleTestBlobs {
     Enabled(AzureWithSimpleTestBlobs),
     Disabled,
     UploadsFailed(anyhow::Error, AzureWithSimpleTestBlobs),

@@ -138,7 +378,7 @@ struct AzureWithSimpleTestBlobs {
 }

 #[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
+impl AsyncTestContext for MaybeEnabledAzureWithSimpleTestBlobs {
     async fn setup() -> Self {
         ensure_logging_ready();
         if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
@@ -6,23 +6,259 @@ use std::sync::Arc;
 use std::time::UNIX_EPOCH;

 use anyhow::Context;
+use camino::Utf8Path;
 use remote_storage::{
     GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
 };
-use test_context::AsyncTestContext;
-use tracing::info;
+use test_context::{test_context, AsyncTestContext};
+use tracing::{debug, info};

 mod common;

-#[path = "common/tests.rs"]
-mod tests_s3;
-use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_remote_data};
+use common::{
+    cleanup, download_to_vec, ensure_logging_ready, upload_remote_data, upload_simple_remote_data,
+    upload_stream, wrap_stream,
+};

 const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";

 const BASE_PREFIX: &str = "test";

+/// Tests that the S3 client can list all prefixes, even if the response comes paginated and requires multiple S3 queries.
+/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
+/// See the client creation in [`create_s3_client`] for details on the required env vars.
+/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored at runtime with the
+/// default test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
+///
+/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`]
+/// where
+/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid interference between test runs
+/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
+///
+/// Then, verifies that the client does return correct prefixes when queried:
+/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be the `${base_prefix_str}` value only
+/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
+///
+/// With the real S3 enabled and the `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys.
+/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
+/// since the current default AWS S3 pagination limit is 1000.
+/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax)
+///
+/// Lastly, the test attempts to clean up and remove all uploaded S3 files.
+/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
+#[test_context(MaybeEnabledS3WithTestBlobs)]
+#[tokio::test]
+async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledS3WithTestBlobs::Enabled(ctx) => ctx,
+        MaybeEnabledS3WithTestBlobs::Disabled => return Ok(()),
+        MaybeEnabledS3WithTestBlobs::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"),
+    };
+
+    let test_client = Arc::clone(&ctx.enabled.client);
+    let expected_remote_prefixes = ctx.remote_prefixes.clone();
+
+    let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
+        .context("common_prefix construction")?;
+    let root_remote_prefixes = test_client
+        .list_prefixes(None)
+        .await
+        .context("client list root prefixes failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        root_remote_prefixes, HashSet::from([base_prefix.clone()]),
+        "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
+    );
+
+    let nested_remote_prefixes = test_client
+        .list_prefixes(Some(&base_prefix))
+        .await
+        .context("client list nested prefixes failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    let remote_only_prefixes = nested_remote_prefixes
+        .difference(&expected_remote_prefixes)
+        .collect::<HashSet<_>>();
+    let missing_uploaded_prefixes = expected_remote_prefixes
+        .difference(&nested_remote_prefixes)
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
+        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
+    );
+
+    Ok(())
+}
+
+/// Tests that the S3 client can list all files in a folder, even if the response comes paginated and requires multiple S3 queries.
+/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. Test will skip real code and pass if env vars not set.
+/// See `s3_pagination_should_work` for more information.
+///
+/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
+/// Then performs the following queries:
+///    1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
+///    2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
+#[test_context(MaybeEnabledS3WithSimpleTestBlobs)]
+#[tokio::test]
+async fn s3_list_files_works(ctx: &mut MaybeEnabledS3WithSimpleTestBlobs) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledS3WithSimpleTestBlobs::Enabled(ctx) => ctx,
+        MaybeEnabledS3WithSimpleTestBlobs::Disabled => return Ok(()),
+        MaybeEnabledS3WithSimpleTestBlobs::UploadsFailed(e, _) => {
+            anyhow::bail!("S3 init failed: {e:?}")
+        }
+    };
+    let test_client = Arc::clone(&ctx.enabled.client);
+    let base_prefix =
+        RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
+    let root_files = test_client
+        .list_files(None)
+        .await
+        .context("client list root files failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        root_files,
+        ctx.remote_blobs.clone(),
+        "remote storage list_files on root mismatches with the uploads."
+    );
+    let nested_remote_files = test_client
+        .list_files(Some(&base_prefix))
+        .await
+        .context("client list nested files failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    let trim_remote_blobs: HashSet<_> = ctx
+        .remote_blobs
+        .iter()
+        .map(|x| x.get_path())
+        .filter(|x| x.starts_with("folder1"))
+        .map(|x| RemotePath::new(x).expect("must be valid path"))
+        .collect();
+    assert_eq!(
+        nested_remote_files, trim_remote_blobs,
+        "remote storage list_files on subdirectory mismatches with the uploads."
+    );
+    Ok(())
+}
+
+#[test_context(MaybeEnabledS3)]
+#[tokio::test]
+async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledS3::Enabled(ctx) => ctx,
+        MaybeEnabledS3::Disabled => return Ok(()),
+    };
+
+    let path = RemotePath::new(Utf8Path::new(
+        format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
+    ))
+    .with_context(|| "RemotePath conversion")?;
+
+    ctx.client.delete(&path).await.expect("should succeed");
+
+    Ok(())
+}
+
+#[test_context(MaybeEnabledS3)]
+#[tokio::test]
+async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledS3::Enabled(ctx) => ctx,
+        MaybeEnabledS3::Disabled => return Ok(()),
+    };
+
+    let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
+    ctx.client.upload(data, len, &path1, None).await?;
+
+    let (data, len) = upload_stream("remote blob data2".as_bytes().into());
+    ctx.client.upload(data, len, &path2, None).await?;
+
+    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
+    ctx.client.upload(data, len, &path3, None).await?;
+
+    ctx.client.delete_objects(&[path1, path2]).await?;
+
+    let prefixes = ctx.client.list_prefixes(None).await?;
+
+    assert_eq!(prefixes.len(), 1);
+
+    ctx.client.delete_objects(&[path3]).await?;
+
+    Ok(())
+}
+
+#[test_context(MaybeEnabledS3)]
+#[tokio::test]
+async fn s3_upload_download_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
+    let MaybeEnabledS3::Enabled(ctx) = ctx else {
+        return Ok(());
+    };
+
+    let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());
+
+    let (data, len) = wrap_stream(orig.clone());
+
+    ctx.client.upload(data, len, &path, None).await?;
+
+    // Normal download request
+    let dl = ctx.client.download(&path).await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig);
+
+    // Full range (end specified)
+    let dl = ctx
+        .client
+        .download_byte_range(&path, 0, Some(len as u64))
+        .await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig);
+
+    // Partial range (end specified)
+    let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig[4..10]);
+
+    // Partial range (end beyond real end)
+    let dl = ctx
+        .client
+        .download_byte_range(&path, 8, Some(len as u64 * 100))
+        .await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig[8..]);
+
+    // Partial range (end unspecified)
+    let dl = ctx.client.download_byte_range(&path, 4, None).await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig[4..]);
+
+    // Full range (end unspecified)
+    let dl = ctx.client.download_byte_range(&path, 0, None).await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig);
+
+    debug!("Cleanup: deleting file at path {path:?}");
+    ctx.client
+        .delete(&path)
+        .await
+        .with_context(|| format!("{path:?} removal"))?;
+
+    Ok(())
+}
+
 struct EnabledS3 {
     client: Arc<GenericRemoteStorage>,
     base_prefix: &'static str,

@@ -41,13 +277,13 @@ impl EnabledS3 {
     }
 }

-enum MaybeEnabledStorage {
+enum MaybeEnabledS3 {
     Enabled(EnabledS3),
     Disabled,
 }

 #[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledStorage {
+impl AsyncTestContext for MaybeEnabledS3 {
     async fn setup() -> Self {
         ensure_logging_ready();

@@ -63,7 +299,7 @@ impl AsyncTestContext for MaybeEnabledStorage {
     }
 }

-enum MaybeEnabledStorageWithTestBlobs {
+enum MaybeEnabledS3WithTestBlobs {
     Enabled(S3WithTestBlobs),
     Disabled,
     UploadsFailed(anyhow::Error, S3WithTestBlobs),

@@ -76,7 +312,7 @@ struct S3WithTestBlobs {
 }

 #[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
+impl AsyncTestContext for MaybeEnabledS3WithTestBlobs {
     async fn setup() -> Self {
         ensure_logging_ready();
         if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {

@@ -127,7 +363,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
 // However, they are not identical. The list_prefixes function is concerned with listing prefixes,
 // whereas the list_files function is concerned with listing files.
 // See `RemoteStorage::list_files` documentation for more details
-enum MaybeEnabledStorageWithSimpleTestBlobs {
+enum MaybeEnabledS3WithSimpleTestBlobs {
     Enabled(S3WithSimpleTestBlobs),
     Disabled,
     UploadsFailed(anyhow::Error, S3WithSimpleTestBlobs),

@@ -138,7 +374,7 @@ struct S3WithSimpleTestBlobs {
 }

 #[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
+impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {
     async fn setup() -> Self {
         ensure_logging_ready();
         if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
@@ -51,9 +51,3 @@ pub struct SkTimelineInfo {
     #[serde(default)]
     pub http_connstr: Option<String>,
 }
-
-#[derive(Debug, Clone, Deserialize, Serialize)]
-pub struct TimelineCopyRequest {
-    pub target_timeline_id: TimelineId,
-    pub until_lsn: Lsn,
-}
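
The removed `TimelineCopyRequest` derives Serialize/Deserialize, so its wire form is a two-field JSON object. A hedged sketch of a matching request body; the textual forms of `TimelineId` (hex string) and `Lsn` ("X/Y") are assumptions about their serde impls, and the concrete values are made up:

    fn example_copy_request() -> serde_json::Value {
        serde_json::json!({
            "target_timeline_id": "11223344556677881122334455667788", // hypothetical id
            "until_lsn": "0/16B5A50",                                 // hypothetical LSN
        })
    }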
@@ -4,12 +4,6 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true

-[features]
-default = []
-# Enables test-only APIs, including failpoints. In particular, enables the `fail_point!` macro,
-# which adds some runtime cost to run tests on outage conditions
-testing = ["fail/failpoints"]
-
 [dependencies]
 arc-swap.workspace = true
 sentry.workspace = true

@@ -22,7 +16,6 @@ chrono.workspace = true
 heapless.workspace = true
 hex = { workspace = true, features = ["serde"] }
 hyper = { workspace = true, features = ["full"] }
-fail.workspace = true
 futures = { workspace = true}
 jsonwebtoken.workspace = true
 nix.workspace = true
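
Context for the removed `testing` feature: `fail/failpoints` is what makes `fail_point!` active; without it the macro compiles to nothing, so release builds pay no failpoint cost. A minimal sketch of the kind of call site the feature gates (the function and failpoint names here are invented):

    fn do_work() -> Result<(), String> {
        // With the "failpoints" feature off this line is a no-op; with it on,
        // configuring `fail::cfg("work", "return(boom)")` in a test makes this
        // function return early with the injected payload.
        fail::fail_point!("work", |payload| {
            Err(payload.unwrap_or_else(|| "injected failure".to_string()))
        });
        Ok(())
    }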
@@ -1,177 +0,0 @@
-//! Failpoint support code shared between pageserver and safekeepers.
-
-use crate::http::{
-    error::ApiError,
-    json::{json_request, json_response},
-};
-use hyper::{Body, Request, Response, StatusCode};
-use serde::{Deserialize, Serialize};
-use tokio_util::sync::CancellationToken;
-use tracing::*;
-
-/// use with fail::cfg("$name", "return(2000)")
-///
-/// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
-/// specified time (in milliseconds). The main difference is that we use async
-/// tokio sleep function. Another difference is that we print lines to the log,
-/// which can be useful in tests to check that the failpoint was hit.
-///
-/// Optionally pass a cancellation token, and this failpoint will drop out of
-/// its sleep when the cancellation token fires. This is useful for testing
-/// cases where we would like to block something, but test its clean shutdown behavior.
-#[macro_export]
-macro_rules! __failpoint_sleep_millis_async {
-    ($name:literal) => {{
-        // If the failpoint is used with a "return" action, set should_sleep to the
-        // returned value (as string). Otherwise it's set to None.
-        let should_sleep = (|| {
-            ::fail::fail_point!($name, |x| x);
-            ::std::option::Option::None
-        })();
-
-        // Sleep if the action was a returned value
-        if let ::std::option::Option::Some(duration_str) = should_sleep {
-            $crate::failpoint_support::failpoint_sleep_helper($name, duration_str).await
-        }
-    }};
-    ($name:literal, $cancel:expr) => {{
-        // If the failpoint is used with a "return" action, set should_sleep to the
-        // returned value (as string). Otherwise it's set to None.
-        let should_sleep = (|| {
-            ::fail::fail_point!($name, |x| x);
-            ::std::option::Option::None
-        })();
-
-        // Sleep if the action was a returned value
-        if let ::std::option::Option::Some(duration_str) = should_sleep {
-            $crate::failpoint_support::failpoint_sleep_cancellable_helper(
-                $name,
-                duration_str,
-                $cancel,
-            )
-            .await
-        }
-    }};
-}
-pub use __failpoint_sleep_millis_async as sleep_millis_async;
-
-// Helper function used by the macro. (A function has nicer scoping so we
-// don't need to decorate everything with "::")
-#[doc(hidden)]
-pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
-    let millis = duration_str.parse::<u64>().unwrap();
-    let d = std::time::Duration::from_millis(millis);
-
-    tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
-    tokio::time::sleep(d).await;
-    tracing::info!("failpoint {:?}: sleep done", name);
-}
-
-// Helper function used by the macro. (A function has nicer scoping so we
-// don't need to decorate everything with "::")
-#[doc(hidden)]
-pub async fn failpoint_sleep_cancellable_helper(
-    name: &'static str,
-    duration_str: String,
-    cancel: &CancellationToken,
-) {
-    let millis = duration_str.parse::<u64>().unwrap();
-    let d = std::time::Duration::from_millis(millis);
-
-    tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
-    tokio::time::timeout(d, cancel.cancelled()).await.ok();
-    tracing::info!("failpoint {:?}: sleep done", name);
-}
-
-pub fn init() -> fail::FailScenario<'static> {
-    // The failpoints lib provides support for parsing the `FAILPOINTS` env var.
-    // We want non-default behavior for `exit`, though, so, we handle it separately.
-    //
-    // Format for FAILPOINTS is "name=actions" separated by ";".
-    let actions = std::env::var("FAILPOINTS");
-    if actions.is_ok() {
-        std::env::remove_var("FAILPOINTS");
-    } else {
-        // let the library handle non-utf8, or nothing for not present
-    }
-
-    let scenario = fail::FailScenario::setup();
-
-    if let Ok(val) = actions {
-        val.split(';')
-            .enumerate()
-            .map(|(i, s)| s.split_once('=').ok_or((i, s)))
-            .for_each(|res| {
-                let (name, actions) = match res {
-                    Ok(t) => t,
-                    Err((i, s)) => {
-                        panic!(
-                            "startup failpoints: missing action on the {}th failpoint; try `{s}=return`",
-                            i + 1,
-                        );
-                    }
-                };
-                if let Err(e) = apply_failpoint(name, actions) {
-                    panic!("startup failpoints: failed to apply failpoint {name}={actions}: {e}");
-                }
-            });
-    }
-
-    scenario
-}
-
-pub fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> {
-    if actions == "exit" {
-        fail::cfg_callback(name, exit_failpoint)
-    } else {
-        fail::cfg(name, actions)
-    }
-}
-
-#[inline(never)]
-fn exit_failpoint() {
-    tracing::info!("Exit requested by failpoint");
-    std::process::exit(1);
-}
-
-pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
-
-/// Information for configuring a single fail point
-#[derive(Debug, Serialize, Deserialize)]
-pub struct FailpointConfig {
-    /// Name of the fail point
-    pub name: String,
-    /// List of actions to take, using the format described in `fail::cfg`
-    ///
-    /// We also support `actions = "exit"` to cause the fail point to immediately exit.
-    pub actions: String,
-}
-
-/// Configure failpoints through http.
-pub async fn failpoints_handler(
-    mut request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    if !fail::has_failpoints() {
-        return Err(ApiError::BadRequest(anyhow::anyhow!(
-            "Cannot manage failpoints because storage was compiled without failpoints support"
-        )));
-    }
-
-    let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
-    for fp in failpoints {
-        info!("cfg failpoint: {} {}", fp.name, fp.actions);
-
-        // We recognize one extra "action" that's not natively recognized
-        // by the failpoints crate: exit, to immediately kill the process
-        let cfg_result = apply_failpoint(&fp.name, &fp.actions);
-
-        if let Err(err_msg) = cfg_result {
-            return Err(ApiError::BadRequest(anyhow::anyhow!(
-                "Failed to configure failpoints: {err_msg}"
-            )));
-        }
-    }
-
-    json_response(StatusCode::OK, ())
-}
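A note on the module deleted above: `init()` documents the `FAILPOINTS` format as "name=actions" pairs separated by ";". A minimal sketch of how a test binary would drive it, assuming the module's `utils::failpoint_support` path; the failpoint name "fp-example" is made up for illustration:

// Sketch only: drives the deleted init() above via the documented
// FAILPOINTS env var format. "fp-example" is a hypothetical failpoint name.
fn main() {
    // Equivalent to launching with FAILPOINTS='fp-example=sleep(100)'.
    std::env::set_var("FAILPOINTS", "fp-example=sleep(100)");
    let scenario = utils::failpoint_support::init();
    // ... code that hits fail::fail_point!("fp-example") now sleeps 100ms ...
    scenario.teardown();
}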
@@ -31,9 +31,6 @@ pub enum ApiError {
     #[error("Shutting down")]
     ShuttingDown,
 
-    #[error("Timeout")]
-    Timeout(Cow<'static, str>),
-
     #[error(transparent)]
     InternalServerError(anyhow::Error),
 }
@@ -70,10 +67,6 @@ impl ApiError {
                 err.to_string(),
                 StatusCode::SERVICE_UNAVAILABLE,
             ),
-            ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status(
-                err.to_string(),
-                StatusCode::REQUEST_TIMEOUT,
-            ),
             ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
                 err.to_string(),
                 StatusCode::INTERNAL_SERVER_ERROR,
@@ -1,4 +1,3 @@
-use std::num::ParseIntError;
 use std::{fmt, str::FromStr};
 
 use anyhow::Context;
@@ -375,13 +374,6 @@ impl fmt::Display for NodeId {
     }
 }
 
-impl FromStr for NodeId {
-    type Err = ParseIntError;
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        Ok(NodeId(u64::from_str(s)?))
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use serde_assert::{Deserializer, Serializer, Token, Tokens};
@@ -83,10 +83,6 @@ pub mod timeout;
 
 pub mod sync;
 
-pub mod failpoint_support;
-
-pub mod yielding_loop;
-
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
@@ -366,49 +366,6 @@ impl MonotonicCounter<Lsn> for RecordLsn {
     }
 }
 
-/// Implements [`rand::distributions::uniform::UniformSampler`] so we can sample [`Lsn`]s.
-///
-/// This is used by the `pagebench` pageserver benchmarking tool.
-pub struct LsnSampler(<u64 as rand::distributions::uniform::SampleUniform>::Sampler);
-
-impl rand::distributions::uniform::SampleUniform for Lsn {
-    type Sampler = LsnSampler;
-}
-
-impl rand::distributions::uniform::UniformSampler for LsnSampler {
-    type X = Lsn;
-
-    fn new<B1, B2>(low: B1, high: B2) -> Self
-    where
-        B1: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
-        B2: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
-    {
-        Self(
-            <u64 as rand::distributions::uniform::SampleUniform>::Sampler::new(
-                low.borrow().0,
-                high.borrow().0,
-            ),
-        )
-    }
-
-    fn new_inclusive<B1, B2>(low: B1, high: B2) -> Self
-    where
-        B1: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
-        B2: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
-    {
-        Self(
-            <u64 as rand::distributions::uniform::SampleUniform>::Sampler::new_inclusive(
-                low.borrow().0,
-                high.borrow().0,
-            ),
-        )
-    }
-
-    fn sample<R: rand::prelude::Rng + ?Sized>(&self, rng: &mut R) -> Self::X {
-        Lsn(self.0.sample(rng))
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use crate::bin_ser::BeSer;
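The removed sampler above exists so that `rand`'s range syntax works directly on `Lsn`; a minimal sketch of the call site it enables (the bounds are illustrative):

// Sketch of what the removed SampleUniform impl enables: uniform sampling
// of Lsn values via gen_range, delegating to the u64 sampler underneath.
// The bounds here are made up for illustration.
use rand::Rng;

fn random_lsn() -> Lsn {
    let mut rng = rand::thread_rng();
    rng.gen_range(Lsn(0x1000)..Lsn(0x2000))
}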
@@ -15,12 +15,6 @@ pub struct Gate {
     name: String,
 }
 
-impl std::fmt::Debug for Gate {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "Gate<{}>", self.name)
-    }
-}
-
 /// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
 /// not complete.
 #[derive(Debug)]
@@ -1,35 +0,0 @@
-use tokio_util::sync::CancellationToken;
-
-#[derive(thiserror::Error, Debug)]
-pub enum YieldingLoopError {
-    #[error("Cancelled")]
-    Cancelled,
-}
-
-/// Helper for long synchronous loops, e.g. over all tenants in the system. Periodically
-/// yields to avoid blocking the executor, and after resuming checks the provided
-/// cancellation token to drop out promptly on shutdown.
-#[inline(always)]
-pub async fn yielding_loop<I, T, F>(
-    interval: usize,
-    cancel: &CancellationToken,
-    iter: I,
-    mut visitor: F,
-) -> Result<(), YieldingLoopError>
-where
-    I: Iterator<Item = T>,
-    F: FnMut(T),
-{
-    for (i, item) in iter.enumerate() {
-        visitor(item);
-
-        if i + 1 % interval == 0 {
-            tokio::task::yield_now().await;
-            if cancel.is_cancelled() {
-                return Err(YieldingLoopError::Cancelled);
-            }
-        }
-    }
-
-    Ok(())
-}
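One detail worth flagging in the deleted helper above: by Rust operator precedence, `i + 1 % interval == 0` parses as `i + (1 % interval) == 0`, so for any `interval > 1` the yield branch can never trigger. The presumed intent needs parentheses; a sketch of the corrected condition:

// Presumed intent of the yield check in the deleted loop body; note the
// parentheses around (i + 1), which the original expression was missing.
if (i + 1) % interval == 0 {
    tokio::task::yield_now().await;
    if cancel.is_cancelled() {
        return Err(YieldingLoopError::Cancelled);
    }
}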
@@ -446,11 +446,12 @@ impl Runner {
         if let Some(t) = self.last_upscale_request_at {
             let elapsed = t.elapsed();
             if elapsed < Duration::from_secs(1) {
-                // *Ideally* we'd like to log here that we're ignoring the fact the
-                // memory stats are too high, but in practice this can result in
-                // spamming the logs with repetitive messages about ignoring the signal
-                //
-                // See https://github.com/neondatabase/neon/issues/5865 for more.
+                info!(
+                    elapsed_millis = elapsed.as_millis(),
+                    avg_non_reclaimable = bytes_to_mebibytes(cgroup_mem_stat.avg_non_reclaimable),
+                    threshold = bytes_to_mebibytes(cgroup.threshold),
+                    "cgroup memory stats are high enough to upscale but too soon to forward the request, ignoring",
+                );
                 continue;
             }
         }
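The hunk above swaps a silent debounce for one that logs what it is skipping. Reduced to its generic shape, independent of the monitor's types (all names here are illustrative):

// Generic form of the cooldown check in the hunk above: suppress events
// that arrive within the window, but leave a trace in the logs.
use std::time::{Duration, Instant};

struct Cooldown {
    last_request_at: Option<Instant>,
    window: Duration,
}

impl Cooldown {
    fn should_forward(&mut self) -> bool {
        if let Some(t) = self.last_request_at {
            if t.elapsed() < self.window {
                // Too soon since the previous request: log and skip.
                return false;
            }
        }
        self.last_request_at = Some(Instant::now());
        true
    }
}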
@@ -8,12 +8,12 @@ use std::ffi::CString;
 
 use crate::bindings::uint32;
 use crate::bindings::walproposer_api;
-use crate::bindings::NeonWALReadResult;
 use crate::bindings::PGAsyncReadResult;
 use crate::bindings::PGAsyncWriteResult;
 use crate::bindings::Safekeeper;
 use crate::bindings::Size;
 use crate::bindings::StringInfoData;
+use crate::bindings::TimeLineID;
 use crate::bindings::TimestampTz;
 use crate::bindings::WalProposer;
 use crate::bindings::WalProposerConnStatusType;
@@ -178,11 +178,31 @@ extern "C" fn conn_blocking_write(
     }
 }
 
-extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool {
+extern "C" fn recovery_download(
+    sk: *mut Safekeeper,
+    _timeline: TimeLineID,
+    startpos: XLogRecPtr,
+    endpos: XLogRecPtr,
+) -> bool {
     unsafe {
         let callback_data = (*(*(*sk).wp).config).callback_data;
         let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).recovery_download(&mut (*wp), &mut (*sk))
+        (*api).recovery_download(&mut (*sk), startpos, endpos)
+    }
+}
+
+#[allow(clippy::unnecessary_cast)]
+extern "C" fn wal_read(
+    sk: *mut Safekeeper,
+    buf: *mut ::std::os::raw::c_char,
+    startptr: XLogRecPtr,
+    count: Size,
+) {
+    unsafe {
+        let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count);
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).wal_read(&mut (*sk), buf, startptr)
     }
 }
 
@@ -194,28 +214,11 @@ extern "C" fn wal_reader_allocate(sk: *mut Safekeeper) {
     }
 }
 
-#[allow(clippy::unnecessary_cast)]
-extern "C" fn wal_read(
-    sk: *mut Safekeeper,
-    buf: *mut ::std::os::raw::c_char,
-    startptr: XLogRecPtr,
-    count: Size,
-    _errmsg: *mut *mut ::std::os::raw::c_char,
-) -> NeonWALReadResult {
+extern "C" fn free_event_set(wp: *mut WalProposer) {
     unsafe {
-        let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count);
-        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let callback_data = (*(*wp).config).callback_data;
         let api = callback_data as *mut Box<dyn ApiImpl>;
-        // TODO: errmsg is not forwarded
-        (*api).wal_read(&mut (*sk), buf, startptr)
-    }
-}
-
-extern "C" fn wal_reader_events(sk: *mut Safekeeper) -> uint32 {
-    unsafe {
-        let callback_data = (*(*(*sk).wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).wal_reader_events(&mut (*sk))
+        (*api).free_event_set(&mut (*wp));
     }
 }
 
@@ -235,14 +238,6 @@ extern "C" fn update_event_set(sk: *mut Safekeeper, events: uint32) {
     }
 }
 
-extern "C" fn active_state_update_event_set(sk: *mut Safekeeper) {
-    unsafe {
-        let callback_data = (*(*(*sk).wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).active_state_update_event_set(&mut (*sk));
-    }
-}
-
 extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) {
     unsafe {
         let callback_data = (*(*(*sk).wp).config).callback_data;
@@ -251,14 +246,6 @@ extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) {
     }
 }
 
-extern "C" fn rm_safekeeper_event_set(sk: *mut Safekeeper) {
-    unsafe {
-        let callback_data = (*(*(*sk).wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).rm_safekeeper_event_set(&mut (*sk));
-    }
-}
-
 extern "C" fn wait_event_set(
     wp: *mut WalProposer,
     timeout: ::std::os::raw::c_long,
@@ -326,6 +313,14 @@ extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLog
     }
 }
 
+extern "C" fn confirm_wal_streamed(wp: *mut WalProposer, lsn: XLogRecPtr) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).confirm_wal_streamed(&mut (*wp), lsn)
+    }
+}
+
 extern "C" fn log_internal(
     wp: *mut WalProposer,
     level: ::std::os::raw::c_int,
@@ -340,6 +335,14 @@ extern "C" fn log_internal(
     }
 }
 
+extern "C" fn after_election(wp: *mut WalProposer) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).after_election(&mut (*wp))
+    }
+}
+
 #[derive(Debug)]
 pub enum Level {
     Debug5,
@@ -398,20 +401,20 @@ pub(crate) fn create_api() -> walproposer_api {
         conn_async_write: Some(conn_async_write),
         conn_blocking_write: Some(conn_blocking_write),
         recovery_download: Some(recovery_download),
-        wal_reader_allocate: Some(wal_reader_allocate),
         wal_read: Some(wal_read),
-        wal_reader_events: Some(wal_reader_events),
+        wal_reader_allocate: Some(wal_reader_allocate),
+        free_event_set: Some(free_event_set),
         init_event_set: Some(init_event_set),
         update_event_set: Some(update_event_set),
-        active_state_update_event_set: Some(active_state_update_event_set),
         add_safekeeper_event_set: Some(add_safekeeper_event_set),
-        rm_safekeeper_event_set: Some(rm_safekeeper_event_set),
         wait_event_set: Some(wait_event_set),
        strong_random: Some(strong_random),
         get_redo_start_lsn: Some(get_redo_start_lsn),
         finish_sync_safekeepers: Some(finish_sync_safekeepers),
         process_safekeeper_feedback: Some(process_safekeeper_feedback),
+        confirm_wal_streamed: Some(confirm_wal_streamed),
         log_internal: Some(log_internal),
+        after_election: Some(after_election),
     }
 }
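Every callback registered in `create_api` above follows the same trampoline shape: a C-visible function recovers the `Box<dyn ApiImpl>` stashed behind `config.callback_data` and forwards to the trait. A reduced model of that shape, with the bindgen-generated types simplified away:

// Reduced model of the trampoline pattern used throughout this file.
// Types are simplified stand-ins for the bindgen structs.
trait ApiImpl {
    fn after_election(&self);
}

#[repr(C)]
struct Config {
    callback_data: *mut std::ffi::c_void,
}

// The host side stores a *mut Box<dyn ApiImpl> in callback_data; each
// extern "C" callback casts it back and dispatches through the vtable.
extern "C" fn after_election_trampoline(config: *mut Config) {
    unsafe {
        let api = (*config).callback_data as *mut Box<dyn ApiImpl>;
        (*api).after_election();
    }
}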
@@ -6,8 +6,8 @@ use utils::id::TenantTimelineId;
 use crate::{
     api_bindings::{create_api, take_vec_u8, Level},
     bindings::{
-        NeonWALReadResult, Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate,
-        WalProposerFree, WalProposerStart,
+        Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, WalProposerFree,
+        WalProposerStart,
     },
 };
 
@@ -86,19 +86,19 @@ pub trait ApiImpl {
         todo!()
     }
 
-    fn recovery_download(&self, _wp: &mut WalProposer, _sk: &mut Safekeeper) -> bool {
+    fn recovery_download(&self, _sk: &mut Safekeeper, _startpos: u64, _endpos: u64) -> bool {
         todo!()
     }
 
-    fn wal_reader_allocate(&self, _sk: &mut Safekeeper) -> NeonWALReadResult {
+    fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) {
         todo!()
     }
 
-    fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) -> NeonWALReadResult {
+    fn wal_reader_allocate(&self, _sk: &mut Safekeeper) {
         todo!()
     }
 
-    fn wal_reader_events(&self, _sk: &mut Safekeeper) -> u32 {
+    fn free_event_set(&self, _wp: &mut WalProposer) {
         todo!()
     }
 
@@ -110,18 +110,10 @@ pub trait ApiImpl {
         todo!()
     }
 
-    fn active_state_update_event_set(&self, _sk: &mut Safekeeper) {
-        todo!()
-    }
-
     fn add_safekeeper_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) {
         todo!()
     }
 
-    fn rm_safekeeper_event_set(&self, _sk: &mut Safekeeper) {
-        todo!()
-    }
-
     fn wait_event_set(&self, _wp: &mut WalProposer, _timeout_millis: i64) -> WaitResult {
         todo!()
     }
@@ -142,6 +134,10 @@ pub trait ApiImpl {
         todo!()
     }
 
+    fn confirm_wal_streamed(&self, _wp: &mut WalProposer, _lsn: u64) {
+        todo!()
+    }
+
     fn log_internal(&self, _wp: &mut WalProposer, _level: Level, _msg: &str) {
         todo!()
     }
@@ -244,7 +240,6 @@ impl Drop for Wrapper {
 
 #[cfg(test)]
 mod tests {
-    use core::panic;
     use std::{
         cell::Cell,
         sync::{atomic::AtomicUsize, mpsc::sync_channel},
@@ -252,7 +247,7 @@ mod tests {
 
     use utils::id::TenantTimelineId;
 
-    use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper};
+    use crate::{api_bindings::Level, walproposer::Wrapper};
 
     use super::ApiImpl;
 
@@ -360,17 +355,12 @@ mod tests {
             true
         }
 
-        fn recovery_download(
-            &self,
-            _wp: &mut crate::bindings::WalProposer,
-            _sk: &mut crate::bindings::Safekeeper,
-        ) -> bool {
-            true
+        fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) {
+            println!("wal_reader_allocate")
         }
 
-        fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) -> NeonWALReadResult {
-            println!("wal_reader_allocate");
-            crate::bindings::NeonWALReadResult_NEON_WALREAD_SUCCESS
+        fn free_event_set(&self, _: &mut crate::bindings::WalProposer) {
+            println!("free_event_set")
         }
 
         fn init_event_set(&self, _: &mut crate::bindings::WalProposer) {
@@ -393,13 +383,6 @@ mod tests {
             self.wait_events.set(WaitEventsData { sk, event_mask });
         }
 
-        fn rm_safekeeper_event_set(&self, sk: &mut crate::bindings::Safekeeper) {
-            println!(
-                "rm_safekeeper_event_set, sk={:?}",
-                sk as *mut crate::bindings::Safekeeper
-            );
-        }
-
         fn wait_event_set(
             &self,
             _: &mut crate::bindings::WalProposer,
@@ -425,7 +408,7 @@ mod tests {
         }
 
         fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) {
-            println!("wp_log[{}] {}", level, msg);
+            println!("walprop_log[{}] {}", level, msg);
         }
 
         fn after_election(&self, _wp: &mut crate::bindings::WalProposer) {
@@ -13,7 +13,6 @@ use bytes::{Buf, Bytes};
 use pageserver::{
     config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
 };
-use pageserver_api::shard::TenantShardId;
 use utils::{id::TenantId, lsn::Lsn};
 
 use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
@@ -27,9 +26,9 @@ fn redo_scenarios(c: &mut Criterion) {
 
     let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
     let conf = Box::leak(Box::new(conf));
-    let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
+    let tenant_id = TenantId::generate();
 
-    let manager = PostgresRedoManager::new(conf, tenant_shard_id);
+    let manager = PostgresRedoManager::new(conf, tenant_id);
 
     let manager = Arc::new(manager);
 
@@ -1,12 +1,10 @@
-use pageserver_api::{models::*, shard::TenantShardId};
-use reqwest::{IntoUrl, Method, StatusCode};
+use pageserver_api::models::*;
+use reqwest::{IntoUrl, Method};
 use utils::{
     http::error::HttpErrorBody,
     id::{TenantId, TimelineId},
 };
 
-pub mod util;
-
 #[derive(Debug)]
 pub struct Client {
     mgmt_api_endpoint: String,
@@ -22,18 +20,20 @@ pub enum Error {
     #[error("receive error body: {0}")]
     ReceiveErrorBody(String),
 
-    #[error("pageserver API: {1}")]
-    ApiError(StatusCode, String),
+    #[error("pageserver API: {0}")]
+    ApiError(String),
 }
 
 pub type Result<T> = std::result::Result<T, Error>;
 
+#[async_trait::async_trait]
 pub trait ResponseErrorMessageExt: Sized {
-    fn error_from_body(self) -> impl std::future::Future<Output = Result<Self>> + Send;
+    async fn error_from_body(self) -> Result<Self>;
 }
 
+#[async_trait::async_trait]
 impl ResponseErrorMessageExt for reqwest::Response {
-    async fn error_from_body(self) -> Result<Self> {
+    async fn error_from_body(mut self) -> Result<Self> {
         let status = self.status();
         if !(status.is_client_error() || status.is_server_error()) {
             return Ok(self);
@@ -41,7 +41,7 @@ impl ResponseErrorMessageExt for reqwest::Response {
 
         let url = self.url().to_owned();
         Err(match self.json::<HttpErrorBody>().await {
-            Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
+            Ok(HttpErrorBody { msg }) => Error::ApiError(msg),
             Err(_) => {
                 Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), url))
             }
@@ -49,11 +49,6 @@ impl ResponseErrorMessageExt for reqwest::Response {
     }
 }
 
-pub enum ForceAwaitLogicalSize {
-    Yes,
-    No,
-}
-
 impl Client {
     pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
         Self {
@@ -71,9 +66,9 @@ impl Client {
 
     pub async fn tenant_details(
         &self,
-        tenant_shard_id: TenantShardId,
+        tenant_id: TenantId,
     ) -> Result<pageserver_api::models::TenantDetails> {
-        let uri = format!("{}/v1/tenant/{tenant_shard_id}", self.mgmt_api_endpoint);
+        let uri = format!("{}/v1/tenant/{tenant_id}", self.mgmt_api_endpoint);
         self.get(uri)
             .await?
             .json()
@@ -83,12 +78,9 @@ impl Client {
 
     pub async fn list_timelines(
         &self,
-        tenant_shard_id: TenantShardId,
+        tenant_id: TenantId,
     ) -> Result<Vec<pageserver_api::models::TimelineInfo>> {
-        let uri = format!(
-            "{}/v1/tenant/{tenant_shard_id}/timeline",
-            self.mgmt_api_endpoint
-        );
+        let uri = format!("{}/v1/tenant/{tenant_id}/timeline", self.mgmt_api_endpoint);
         self.get(&uri)
             .await?
             .json()
@@ -100,18 +92,11 @@ impl Client {
         &self,
         tenant_id: TenantId,
         timeline_id: TimelineId,
-        force_await_logical_size: ForceAwaitLogicalSize,
     ) -> Result<pageserver_api::models::TimelineInfo> {
         let uri = format!(
             "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
             self.mgmt_api_endpoint
         );
 
-        let uri = match force_await_logical_size {
-            ForceAwaitLogicalSize::Yes => format!("{}?force-await-logical-size={}", uri, true),
-            ForceAwaitLogicalSize::No => uri,
-        };
-
         self.get(&uri)
             .await?
             .json()
@@ -177,28 +162,16 @@ impl Client {
         Ok(())
     }
 
-    pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> {
-        let uri = format!(
-            "{}/v1/tenant/{}/secondary/download",
-            self.mgmt_api_endpoint, tenant_id
-        );
-        self.request(Method::POST, &uri, ()).await?;
-        Ok(())
-    }
-
     pub async fn location_config(
         &self,
-        tenant_shard_id: TenantShardId,
+        tenant_id: TenantId,
         config: LocationConfig,
         flush_ms: Option<std::time::Duration>,
     ) -> Result<()> {
-        let req_body = TenantLocationConfigRequest {
-            tenant_id: tenant_shard_id,
-            config,
-        };
+        let req_body = TenantLocationConfigRequest { tenant_id, config };
         let path = format!(
             "{}/v1/tenant/{}/location_config",
-            self.mgmt_api_endpoint, tenant_shard_id
+            self.mgmt_api_endpoint, tenant_id
         );
         let path = if let Some(flush_ms) = flush_ms {
             format!("{}?flush_ms={}", path, flush_ms.as_millis())
@@ -209,23 +182,14 @@ impl Client {
         Ok(())
     }
 
-    pub async fn list_location_config(&self) -> Result<LocationConfigListResponse> {
-        let path = format!("{}/v1/location_config", self.mgmt_api_endpoint);
-        self.request(Method::GET, &path, ())
-            .await?
-            .json()
-            .await
-            .map_err(Error::ReceiveBody)
-    }
-
     pub async fn timeline_create(
         &self,
-        tenant_shard_id: TenantShardId,
+        tenant_id: TenantId,
         req: &TimelineCreateRequest,
     ) -> Result<TimelineInfo> {
         let uri = format!(
             "{}/v1/tenant/{}/timeline",
-            self.mgmt_api_endpoint, tenant_shard_id
+            self.mgmt_api_endpoint, tenant_id
         );
         self.request(Method::POST, &uri, req)
             .await?
@@ -233,46 +197,4 @@ impl Client {
             .await
             .map_err(Error::ReceiveBody)
     }
-
-    pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
-        let uri = format!(
-            "{}/v1/tenant/{}/reset",
-            self.mgmt_api_endpoint, tenant_shard_id
-        );
-        self.request(Method::POST, &uri, ())
-            .await?
-            .json()
-            .await
-            .map_err(Error::ReceiveBody)
-    }
-
-    pub async fn timeline_list(
-        &self,
-        tenant_shard_id: &TenantShardId,
-    ) -> Result<Vec<TimelineInfo>> {
-        let uri = format!(
-            "{}/v1/tenant/{}/timeline",
-            self.mgmt_api_endpoint, tenant_shard_id
-        );
-        self.get(&uri)
-            .await?
-            .json()
-            .await
-            .map_err(Error::ReceiveBody)
-    }
-
-    pub async fn tenant_synthetic_size(
-        &self,
-        tenant_shard_id: TenantShardId,
-    ) -> Result<TenantHistorySize> {
-        let uri = format!(
-            "{}/v1/tenant/{}/synthetic_size",
-            self.mgmt_api_endpoint, tenant_shard_id
-        );
-        self.get(&uri)
-            .await?
-            .json()
-            .await
-            .map_err(Error::ReceiveBody)
-    }
 }
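The `Error::ApiError` hunks above drop the `StatusCode` field, so callers on the left-hand side that branch on the HTTP status lose that signal. A sketch of the kind of call site this affects; the retry decision is illustrative, not from the diff:

// Sketch against the left-hand (two-field) variant; with the right-hand
// ApiError(String), the status is no longer available to branch on.
// The retry policy shown here is made up for illustration.
match client.tenant_details(tenant_shard_id).await {
    Ok(details) => println!("tenant has {} timelines", details.timelines.len()),
    Err(Error::ApiError(status, msg)) if status == StatusCode::SERVICE_UNAVAILABLE => {
        eprintln!("retryable: {msg}"); // e.g. back off and try again
    }
    Err(other) => return Err(other.into()),
}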
@@ -1,53 +0,0 @@
-//! Helpers to do common higher-level tasks with the [`Client`].
-
-use std::sync::Arc;
-
-use pageserver_api::shard::TenantShardId;
-use tokio::task::JoinSet;
-use utils::id::{TenantId, TenantTimelineId};
-
-use super::Client;
-
-/// Retrieve a list of all of the pageserver's timelines.
-///
-/// Fails if there are sharded tenants present on the pageserver.
-pub async fn get_pageserver_tenant_timelines_unsharded(
-    api_client: &Arc<Client>,
-) -> anyhow::Result<Vec<TenantTimelineId>> {
-    let mut timelines: Vec<TenantTimelineId> = Vec::new();
-    let mut tenants: Vec<TenantId> = Vec::new();
-    for ti in api_client.list_tenants().await? {
-        if !ti.id.is_unsharded() {
-            anyhow::bail!(
-                "only unsharded tenants are supported at this time: {}",
-                ti.id
-            );
-        }
-        tenants.push(ti.id.tenant_id)
-    }
-    let mut js = JoinSet::new();
-    for tenant_id in tenants {
-        js.spawn({
-            let mgmt_api_client = Arc::clone(api_client);
-            async move {
-                (
-                    tenant_id,
-                    mgmt_api_client
-                        .tenant_details(TenantShardId::unsharded(tenant_id))
-                        .await
-                        .unwrap(),
-                )
-            }
-        });
-    }
-    while let Some(res) = js.join_next().await {
-        let (tenant_id, details) = res.unwrap();
-        for timeline_id in details.timelines {
-            timelines.push(TenantTimelineId {
-                tenant_id,
-                timeline_id,
-            });
-        }
-    }
-    Ok(timelines)
-}
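A caller of the deleted helper above would look roughly like this; the endpoint matches the `mgmt_api_endpoint` default used elsewhere in this diff, and the printing is illustrative:

// Hypothetical call site for the deleted helper.
let client = Arc::new(Client::new("http://localhost:9898".to_string(), None));
let timelines = get_pageserver_tenant_timelines_unsharded(&client).await?;
for ttid in timelines {
    println!("{ttid}");
}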
@@ -108,38 +108,22 @@ pub struct RelTagBlockNo {
 }
 
 impl PagestreamClient {
-    pub async fn shutdown(self) {
-        let Self {
-            copy_both,
-            cancel_on_client_drop: cancel_conn_task,
-            conn_task,
-        } = self;
-        // The `copy_both` contains internal channel sender, the receiver of which is polled by `conn_task`.
-        // When `conn_task` observes the sender has been dropped, it sends a `FeMessage::CopyFail` into the connection.
-        // (see https://github.com/neondatabase/rust-postgres/blob/2005bf79573b8add5cf205b52a2b208e356cc8b0/tokio-postgres/src/copy_both.rs#L56).
-        //
-        // If we drop(copy_both) first, but then immediately drop the `cancel_on_client_drop`,
-        // the CopyFail mesage only makes it to the socket sometimes (i.e., it's a race).
-        //
-        // Further, the pageserver makes a lot of noise when it receives CopyFail.
-        // Computes don't send it in practice, they just hard-close the connection.
-        //
-        // So, let's behave like the computes and suppress the CopyFail as follows:
-        // kill the socket first, then drop copy_both.
-        //
-        // See also: https://www.postgresql.org/docs/current/protocol-flow.html#PROTOCOL-COPY
-        //
-        // NB: page_service doesn't have a use case to exit the `pagestream` mode currently.
-        // => https://github.com/neondatabase/neon/issues/6390
-        let _ = cancel_conn_task.unwrap();
-        conn_task.await.unwrap();
-        drop(copy_both);
+    pub async fn shutdown(mut self) {
+        let _ = self.cancel_on_client_drop.take();
+        self.conn_task.await.unwrap();
     }
 
     pub async fn getpage(
         &mut self,
-        req: PagestreamGetPageRequest,
+        key: RelTagBlockNo,
+        lsn: Lsn,
     ) -> anyhow::Result<PagestreamGetPageResponse> {
+        let req = PagestreamGetPageRequest {
+            latest: false,
+            rel: key.rel_tag,
+            blkno: key.block_no,
+            lsn,
+        };
         let req = PagestreamFeMessage::GetPage(req);
         let req: bytes::Bytes = req.serialize();
         // let mut req = tokio_util::io::ReaderStream::new(&req);
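The left-hand side of the hunk above encodes a specific shutdown order: cancel the connection task (killing the socket) and wait for it before dropping `copy_both`, so the `CopyFail` that dropping `copy_both` would otherwise trigger never reaches a live connection. The skeleton of that ordering, with the diff's own field names and the rationale as comments:

// Ordering skeleton of the old shutdown path above.
let _ = cancel_conn_task.unwrap(); // drop the canceller: the socket is killed first
conn_task.await.unwrap();          // the connection task has fully exited
drop(copy_both);                   // CopyFail now has no live socket to reach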
@@ -1,27 +0,0 @@
-[package]
-name = "pagebench"
-version = "0.1.0"
-edition.workspace = true
-license.workspace = true
-
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
-
-[dependencies]
-anyhow.workspace = true
-camino.workspace = true
-clap.workspace = true
-futures.workspace = true
-hdrhistogram.workspace = true
-humantime.workspace = true
-humantime-serde.workspace = true
-rand.workspace = true
-serde.workspace = true
-serde_json.workspace = true
-tracing.workspace = true
-tokio.workspace = true
-tokio-util.workspace = true
-
-pageserver_client.workspace = true
-pageserver_api.workspace = true
-utils = { path = "../../libs/utils/" }
-workspace_hack = { version = "0.1", path = "../../workspace_hack" }
@@ -1,275 +0,0 @@
-use anyhow::Context;
-use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
-use pageserver_client::page_service::BasebackupRequest;
-
-use utils::id::TenantTimelineId;
-use utils::lsn::Lsn;
-
-use rand::prelude::*;
-use tokio::sync::Barrier;
-use tokio::task::JoinSet;
-use tracing::{debug, info, instrument};
-
-use std::collections::HashMap;
-use std::num::NonZeroUsize;
-use std::ops::Range;
-use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
-use std::sync::{Arc, Mutex};
-use std::time::Instant;
-
-use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
-use crate::util::{request_stats, tokio_thread_local_stats};
-
-/// basebackup@LatestLSN
-#[derive(clap::Parser)]
-pub(crate) struct Args {
-    #[clap(long, default_value = "http://localhost:9898")]
-    mgmt_api_endpoint: String,
-    #[clap(long, default_value = "localhost:64000")]
-    page_service_host_port: String,
-    #[clap(long)]
-    pageserver_jwt: Option<String>,
-    #[clap(long, default_value = "1")]
-    num_clients: NonZeroUsize,
-    #[clap(long, default_value = "1.0")]
-    gzip_probability: f64,
-    #[clap(long)]
-    runtime: Option<humantime::Duration>,
-    #[clap(long)]
-    limit_to_first_n_targets: Option<usize>,
-    targets: Option<Vec<TenantTimelineId>>,
-}
-
-#[derive(Debug, Default)]
-struct LiveStats {
-    completed_requests: AtomicU64,
-}
-
-impl LiveStats {
-    fn inc(&self) {
-        self.completed_requests.fetch_add(1, Ordering::Relaxed);
-    }
-}
-
-struct Target {
-    timeline: TenantTimelineId,
-    lsn_range: Option<Range<Lsn>>,
-}
-
-#[derive(serde::Serialize)]
-struct Output {
-    total: request_stats::Output,
-}
-
-tokio_thread_local_stats::declare!(STATS: request_stats::Stats);
-
-pub(crate) fn main(args: Args) -> anyhow::Result<()> {
-    tokio_thread_local_stats::main!(STATS, move |thread_local_stats| {
-        main_impl(args, thread_local_stats)
-    })
-}
-
-async fn main_impl(
-    args: Args,
-    all_thread_local_stats: AllThreadLocalStats<request_stats::Stats>,
-) -> anyhow::Result<()> {
-    let args: &'static Args = Box::leak(Box::new(args));
-
-    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
-        args.mgmt_api_endpoint.clone(),
-        args.pageserver_jwt.as_deref(),
-    ));
-
-    // discover targets
-    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
-        &mgmt_api_client,
-        crate::util::cli::targets::Spec {
-            limit_to_first_n_targets: args.limit_to_first_n_targets,
-            targets: args.targets.clone(),
-        },
-    )
-    .await?;
-    let mut js = JoinSet::new();
-    for timeline in &timelines {
-        js.spawn({
-            let timeline = *timeline;
-            let info = mgmt_api_client
-                .timeline_info(
-                    timeline.tenant_id,
-                    timeline.timeline_id,
-                    ForceAwaitLogicalSize::No,
-                )
-                .await
-                .unwrap();
-            async move {
-                anyhow::Ok(Target {
-                    timeline,
-                    // TODO: support lsn_range != latest LSN
-                    lsn_range: Some(info.last_record_lsn..(info.last_record_lsn + 1)),
-                })
-            }
-        });
-    }
-    let mut all_targets: Vec<Target> = Vec::new();
-    while let Some(res) = js.join_next().await {
-        all_targets.push(res.unwrap().unwrap());
-    }
-
-    let live_stats = Arc::new(LiveStats::default());
-
-    let num_client_tasks = timelines.len();
-    let num_live_stats_dump = 1;
-    let num_work_sender_tasks = 1;
-
-    let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
-        num_client_tasks + num_live_stats_dump + num_work_sender_tasks,
-    ));
-    let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks));
-
-    tokio::spawn({
-        let stats = Arc::clone(&live_stats);
-        let start_work_barrier = Arc::clone(&start_work_barrier);
-        async move {
-            start_work_barrier.wait().await;
-            loop {
-                let start = std::time::Instant::now();
-                tokio::time::sleep(std::time::Duration::from_secs(1)).await;
-                let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
-                let elapsed = start.elapsed();
-                info!(
-                    "RPS: {:.0}",
-                    completed_requests as f64 / elapsed.as_secs_f64()
-                );
-            }
-        }
-    });
-
-    let mut work_senders = HashMap::new();
-    let mut tasks = Vec::new();
-    for tl in &timelines {
-        let (sender, receiver) = tokio::sync::mpsc::channel(1); // TODO: not sure what the implications of this are
-        work_senders.insert(tl, sender);
-        tasks.push(tokio::spawn(client(
-            args,
-            *tl,
-            Arc::clone(&start_work_barrier),
-            receiver,
-            Arc::clone(&all_work_done_barrier),
-            Arc::clone(&live_stats),
-        )));
-    }
-
-    let work_sender = async move {
-        start_work_barrier.wait().await;
-        loop {
-            let (timeline, work) = {
-                let mut rng = rand::thread_rng();
-                let target = all_targets.choose(&mut rng).unwrap();
-                let lsn = target.lsn_range.clone().map(|r| rng.gen_range(r));
-                (
-                    target.timeline,
-                    Work {
-                        lsn,
-                        gzip: rng.gen_bool(args.gzip_probability),
-                    },
-                )
-            };
-            let sender = work_senders.get(&timeline).unwrap();
-            // TODO: what if this blocks?
-            sender.send(work).await.ok().unwrap();
-        }
-    };
-
-    if let Some(runtime) = args.runtime {
-        match tokio::time::timeout(runtime.into(), work_sender).await {
-            Ok(()) => unreachable!("work sender never terminates"),
-            Err(_timeout) => {
-                // this implicitly drops the work_senders, making all the clients exit
-            }
-        }
-    } else {
-        work_sender.await;
-        unreachable!("work sender never terminates");
-    }
-
-    for t in tasks {
-        t.await.unwrap();
-    }
-
-    let output = Output {
-        total: {
-            let mut agg_stats = request_stats::Stats::new();
-            for stats in all_thread_local_stats.lock().unwrap().iter() {
-                let stats = stats.lock().unwrap();
-                agg_stats.add(&stats);
-            }
-            agg_stats.output()
-        },
-    };
-
-    let output = serde_json::to_string_pretty(&output).unwrap();
-    println!("{output}");
-
-    anyhow::Ok(())
-}
-
-#[derive(Copy, Clone)]
-struct Work {
-    lsn: Option<Lsn>,
-    gzip: bool,
-}
-
-#[instrument(skip_all)]
-async fn client(
-    args: &'static Args,
-    timeline: TenantTimelineId,
-    start_work_barrier: Arc<Barrier>,
-    mut work: tokio::sync::mpsc::Receiver<Work>,
-    all_work_done_barrier: Arc<Barrier>,
-    live_stats: Arc<LiveStats>,
-) {
-    start_work_barrier.wait().await;
-
-    let client = pageserver_client::page_service::Client::new(crate::util::connstring::connstring(
-        &args.page_service_host_port,
-        args.pageserver_jwt.as_deref(),
-    ))
-    .await
-    .unwrap();
-
-    while let Some(Work { lsn, gzip }) = work.recv().await {
-        let start = Instant::now();
-        let copy_out_stream = client
-            .basebackup(&BasebackupRequest {
-                tenant_id: timeline.tenant_id,
-                timeline_id: timeline.timeline_id,
-                lsn,
-                gzip,
-            })
-            .await
-            .with_context(|| format!("start basebackup for {timeline}"))
-            .unwrap();
-
-        use futures::StreamExt;
-        let size = Arc::new(AtomicUsize::new(0));
-        copy_out_stream
-            .for_each({
-                |r| {
-                    let size = Arc::clone(&size);
-                    async move {
-                        let size = Arc::clone(&size);
-                        size.fetch_add(r.unwrap().len(), Ordering::Relaxed);
-                    }
-                }
-            })
-            .await;
-        debug!("basebackup size is {} bytes", size.load(Ordering::Relaxed));
-        let elapsed = start.elapsed();
-        live_stats.inc();
-        STATS.with(|stats| {
-            stats.borrow().lock().unwrap().observe(elapsed).unwrap();
-        });
-    }
-
-    all_work_done_barrier.wait().await;
-}
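The deleted benchmark above gates its start on a single `Barrier` sized to every participant (client tasks, the stats dumper, and the work sender), so the RPS clock starts only once everyone is ready. A reduced, runnable sketch of that startup pattern:

// Reduced model of the start_work_barrier pattern in the deleted file:
// all workers plus one driver rendezvous before any work begins.
use std::sync::Arc;

#[tokio::main]
async fn main() {
    let num_workers = 4;
    let barrier = Arc::new(tokio::sync::Barrier::new(num_workers + 1)); // +1: the driver
    for i in 0..num_workers {
        let barrier = Arc::clone(&barrier);
        tokio::spawn(async move {
            barrier.wait().await; // every worker starts at the same instant
            println!("worker {i} running");
        });
    }
    barrier.wait().await; // releasing the barrier starts all workers together
    tokio::time::sleep(std::time::Duration::from_millis(50)).await; // let workers print
}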
@@ -1,430 +0,0 @@
-use anyhow::Context;
-use camino::Utf8PathBuf;
-use futures::future::join_all;
-use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key};
-use pageserver_api::keyspace::KeySpaceAccum;
-use pageserver_api::models::PagestreamGetPageRequest;
-
-use tokio_util::sync::CancellationToken;
-use utils::id::TenantTimelineId;
-use utils::lsn::Lsn;
-
-use rand::prelude::*;
-use tokio::sync::Barrier;
-use tokio::task::JoinSet;
-use tracing::{info, instrument};
-
-use std::collections::{HashMap, HashSet};
-use std::future::Future;
-use std::num::NonZeroUsize;
-use std::pin::Pin;
-use std::sync::atomic::{AtomicU64, Ordering};
-use std::sync::{Arc, Mutex};
-use std::time::{Duration, Instant};
-
-use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
-use crate::util::{request_stats, tokio_thread_local_stats};
-
-/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
-#[derive(clap::Parser)]
-pub(crate) struct Args {
-    #[clap(long, default_value = "http://localhost:9898")]
-    mgmt_api_endpoint: String,
-    #[clap(long, default_value = "postgres://postgres@localhost:64000")]
-    page_service_connstring: String,
-    #[clap(long)]
-    pageserver_jwt: Option<String>,
-    #[clap(long, default_value = "1")]
-    num_clients: NonZeroUsize,
-    #[clap(long)]
-    runtime: Option<humantime::Duration>,
-    #[clap(long)]
-    per_target_rate_limit: Option<usize>,
-    /// Probability for sending `latest=true` in the request (uniform distribution).
-    #[clap(long, default_value = "1")]
-    req_latest_probability: f64,
-    #[clap(long)]
-    limit_to_first_n_targets: Option<usize>,
-    /// For large pageserver installations, enumerating the keyspace takes a lot of time.
-    /// If specified, the specified path is used to maintain a cache of the keyspace enumeration result.
-    /// The cache is tagged and auto-invalided by the tenant/timeline ids only.
-    /// It doesn't get invalidated if the keyspace changes under the hood, e.g., due to new ingested data or compaction.
-    #[clap(long)]
-    keyspace_cache: Option<Utf8PathBuf>,
-    targets: Option<Vec<TenantTimelineId>>,
-}
-
-#[derive(Debug, Default)]
-struct LiveStats {
-    completed_requests: AtomicU64,
-}
-
-impl LiveStats {
-    fn inc(&self) {
-        self.completed_requests.fetch_add(1, Ordering::Relaxed);
-    }
-}
-
-#[derive(Clone, serde::Serialize, serde::Deserialize)]
-struct KeyRange {
-    timeline: TenantTimelineId,
-    timeline_lsn: Lsn,
-    start: i128,
-    end: i128,
-}
-
-impl KeyRange {
-    fn len(&self) -> i128 {
-        self.end - self.start
-    }
-}
-
-#[derive(serde::Serialize)]
-struct Output {
-    total: request_stats::Output,
-}
-
-tokio_thread_local_stats::declare!(STATS: request_stats::Stats);
-
-pub(crate) fn main(args: Args) -> anyhow::Result<()> {
-    tokio_thread_local_stats::main!(STATS, move |thread_local_stats| {
-        main_impl(args, thread_local_stats)
-    })
-}
-
-async fn main_impl(
-    args: Args,
-    all_thread_local_stats: AllThreadLocalStats<request_stats::Stats>,
-) -> anyhow::Result<()> {
-    let args: &'static Args = Box::leak(Box::new(args));
-
-    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
-        args.mgmt_api_endpoint.clone(),
-        args.pageserver_jwt.as_deref(),
-    ));
-
-    // discover targets
-    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
-        &mgmt_api_client,
-        crate::util::cli::targets::Spec {
-            limit_to_first_n_targets: args.limit_to_first_n_targets,
-            targets: args.targets.clone(),
-        },
-    )
-    .await?;
-
-    #[derive(serde::Deserialize)]
-    struct KeyspaceCacheDe {
-        tag: Vec<TenantTimelineId>,
-        data: Vec<KeyRange>,
-    }
-    #[derive(serde::Serialize)]
-    struct KeyspaceCacheSer<'a> {
-        tag: &'a [TenantTimelineId],
-        data: &'a [KeyRange],
-    }
-    let cache = args
-        .keyspace_cache
-        .as_ref()
-        .map(|keyspace_cache_file| {
-            let contents = match std::fs::read(keyspace_cache_file) {
-                Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
-                    return anyhow::Ok(None);
-                }
-                x => x.context("read keyspace cache file")?,
-            };
-            let cache: KeyspaceCacheDe =
-                serde_json::from_slice(&contents).context("deserialize cache file")?;
-            let tag_ok = HashSet::<TenantTimelineId>::from_iter(cache.tag.into_iter())
-                == HashSet::from_iter(timelines.iter().cloned());
-            info!("keyspace cache file matches tag: {tag_ok}");
-            anyhow::Ok(if tag_ok { Some(cache.data) } else { None })
-        })
-        .transpose()?
-        .flatten();
-    let all_ranges: Vec<KeyRange> = if let Some(cached) = cache {
-        info!("using keyspace cache file");
-        cached
-    } else {
-        let mut js = JoinSet::new();
-        for timeline in &timelines {
-            js.spawn({
-                let mgmt_api_client = Arc::clone(&mgmt_api_client);
-                let timeline = *timeline;
-                async move {
-                    let partitioning = mgmt_api_client
-                        .keyspace(timeline.tenant_id, timeline.timeline_id)
-                        .await?;
-                    let lsn = partitioning.at_lsn;
-                    let start = Instant::now();
-                    let mut filtered = KeySpaceAccum::new();
-                    // let's hope this is inlined and vectorized...
-                    // TODO: turn this loop into a is_rel_block_range() function.
-                    for r in partitioning.keys.ranges.iter() {
-                        let mut i = r.start;
-                        while i != r.end {
-                            if is_rel_block_key(&i) {
-                                filtered.add_key(i);
-                            }
-                            i = i.next();
-                        }
-                    }
-                    let filtered = filtered.to_keyspace();
-                    let filter_duration = start.elapsed();
-
-                    anyhow::Ok((
-                        filter_duration,
-                        filtered.ranges.into_iter().map(move |r| KeyRange {
-                            timeline,
-                            timeline_lsn: lsn,
-                            start: r.start.to_i128(),
-                            end: r.end.to_i128(),
-                        }),
-                    ))
-                }
-            });
-        }
-        let mut total_filter_duration = Duration::from_secs(0);
-        let mut all_ranges: Vec<KeyRange> = Vec::new();
-        while let Some(res) = js.join_next().await {
-            let (filter_duration, range) = res.unwrap().unwrap();
-            all_ranges.extend(range);
-            total_filter_duration += filter_duration;
-        }
-        info!("filter duration: {}", total_filter_duration.as_secs_f64());
-        if let Some(cachefile) = args.keyspace_cache.as_ref() {
-            let cache = KeyspaceCacheSer {
-                tag: &timelines,
-                data: &all_ranges,
-            };
-            let bytes = serde_json::to_vec(&cache).context("serialize keyspace for cache file")?;
-            std::fs::write(cachefile, bytes).context("write keyspace cache file to disk")?;
-            info!("successfully wrote keyspace cache file");
-        }
-        all_ranges
-    };
-
-    let live_stats = Arc::new(LiveStats::default());
-
-    let num_client_tasks = timelines.len();
-    let num_live_stats_dump = 1;
-    let num_work_sender_tasks = 1;
-    let num_main_impl = 1;
-
-    let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
-        num_client_tasks + num_live_stats_dump + num_work_sender_tasks + num_main_impl,
-    ));
-
-    tokio::spawn({
-        let stats = Arc::clone(&live_stats);
-        let start_work_barrier = Arc::clone(&start_work_barrier);
-        async move {
-            start_work_barrier.wait().await;
-            loop {
-                let start = std::time::Instant::now();
-                tokio::time::sleep(std::time::Duration::from_secs(1)).await;
-                let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
-                let elapsed = start.elapsed();
-                info!(
-                    "RPS: {:.0}",
-                    completed_requests as f64 / elapsed.as_secs_f64()
-                );
-            }
-        }
-    });
-
-    let cancel = CancellationToken::new();
-
-    let mut work_senders: HashMap<TenantTimelineId, _> = HashMap::new();
-    let mut tasks = Vec::new();
-    for tl in &timelines {
-        let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are
-        work_senders.insert(*tl, sender);
-        tasks.push(tokio::spawn(client(
-            args,
-            *tl,
-            Arc::clone(&start_work_barrier),
-            receiver,
-            Arc::clone(&live_stats),
-            cancel.clone(),
-        )));
-    }
-
-    let work_sender: Pin<Box<dyn Send + Future<Output = ()>>> = {
-        let start_work_barrier = start_work_barrier.clone();
-        let cancel = cancel.clone();
-        match args.per_target_rate_limit {
-            None => Box::pin(async move {
-                let weights = rand::distributions::weighted::WeightedIndex::new(
-                    all_ranges.iter().map(|v| v.len()),
-                )
-                .unwrap();
-
-                start_work_barrier.wait().await;
-
-                while !cancel.is_cancelled() {
-                    let (timeline, req) = {
-                        let mut rng = rand::thread_rng();
-                        let r = &all_ranges[weights.sample(&mut rng)];
-                        let key: i128 = rng.gen_range(r.start..r.end);
-                        let key = Key::from_i128(key);
-                        let (rel_tag, block_no) =
-                            key_to_rel_block(key).expect("we filter non-rel-block keys out above");
-                        (
-                            r.timeline,
-                            PagestreamGetPageRequest {
-                                latest: rng.gen_bool(args.req_latest_probability),
-                                lsn: r.timeline_lsn,
|
|
||||||
rel: rel_tag,
|
|
||||||
blkno: block_no,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
};
|
|
||||||
let sender = work_senders.get(&timeline).unwrap();
|
|
||||||
// TODO: what if this blocks?
|
|
||||||
if sender.send(req).await.is_err() {
|
|
||||||
assert!(cancel.is_cancelled(), "client has gone away unexpectedly");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}),
|
|
||||||
Some(rps_limit) => Box::pin(async move {
|
|
||||||
let period = Duration::from_secs_f64(1.0 / (rps_limit as f64));
|
|
||||||
let make_timeline_task: &dyn Fn(
|
|
||||||
TenantTimelineId,
|
|
||||||
)
|
|
||||||
-> Pin<Box<dyn Send + Future<Output = ()>>> = &|timeline| {
|
|
||||||
let sender = work_senders.get(&timeline).unwrap();
|
|
||||||
let ranges: Vec<KeyRange> = all_ranges
|
|
||||||
.iter()
|
|
||||||
.filter(|r| r.timeline == timeline)
|
|
||||||
.cloned()
|
|
||||||
.collect();
|
|
||||||
let weights = rand::distributions::weighted::WeightedIndex::new(
|
|
||||||
ranges.iter().map(|v| v.len()),
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let cancel = cancel.clone();
|
|
||||||
Box::pin(async move {
|
|
||||||
let mut ticker = tokio::time::interval(period);
|
|
||||||
ticker.set_missed_tick_behavior(
|
|
||||||
/* TODO review this choice */
|
|
||||||
tokio::time::MissedTickBehavior::Burst,
|
|
||||||
);
|
|
||||||
while !cancel.is_cancelled() {
|
|
||||||
ticker.tick().await;
|
|
||||||
let req = {
|
|
||||||
let mut rng = rand::thread_rng();
|
|
||||||
let r = &ranges[weights.sample(&mut rng)];
|
|
||||||
let key: i128 = rng.gen_range(r.start..r.end);
|
|
||||||
let key = Key::from_i128(key);
|
|
||||||
assert!(is_rel_block_key(&key));
|
|
||||||
let (rel_tag, block_no) = key_to_rel_block(key)
|
|
||||||
.expect("we filter non-rel-block keys out above");
|
|
||||||
PagestreamGetPageRequest {
|
|
||||||
latest: rng.gen_bool(args.req_latest_probability),
|
|
||||||
lsn: r.timeline_lsn,
|
|
||||||
rel: rel_tag,
|
|
||||||
blkno: block_no,
|
|
||||||
}
|
|
||||||
};
|
|
||||||
if sender.send(req).await.is_err() {
|
|
||||||
assert!(cancel.is_cancelled(), "client has gone away unexpectedly");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
})
|
|
||||||
};
|
|
||||||
|
|
||||||
let tasks: Vec<_> = work_senders
|
|
||||||
.keys()
|
|
||||||
.map(|tl| make_timeline_task(*tl))
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
start_work_barrier.wait().await;
|
|
||||||
|
|
||||||
join_all(tasks).await;
|
|
||||||
}),
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let work_sender_task = tokio::spawn(work_sender);
|
|
||||||
|
|
||||||
info!("waiting for everything to become ready");
|
|
||||||
start_work_barrier.wait().await;
|
|
||||||
info!("work started");
|
|
||||||
if let Some(runtime) = args.runtime {
|
|
||||||
tokio::time::sleep(runtime.into()).await;
|
|
||||||
info!("runtime over, signalling cancellation");
|
|
||||||
cancel.cancel();
|
|
||||||
work_sender_task.await.unwrap();
|
|
||||||
info!("work sender exited");
|
|
||||||
} else {
|
|
||||||
work_sender_task.await.unwrap();
|
|
||||||
unreachable!("work sender never terminates");
|
|
||||||
}
|
|
||||||
|
|
||||||
info!("joining clients");
|
|
||||||
for t in tasks {
|
|
||||||
t.await.unwrap();
|
|
||||||
}
|
|
||||||
|
|
||||||
info!("all clients stopped");
|
|
||||||
|
|
||||||
let output = Output {
|
|
||||||
total: {
|
|
||||||
let mut agg_stats = request_stats::Stats::new();
|
|
||||||
for stats in all_thread_local_stats.lock().unwrap().iter() {
|
|
||||||
let stats = stats.lock().unwrap();
|
|
||||||
agg_stats.add(&stats);
|
|
||||||
}
|
|
||||||
agg_stats.output()
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
let output = serde_json::to_string_pretty(&output).unwrap();
|
|
||||||
println!("{output}");
|
|
||||||
|
|
||||||
anyhow::Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[instrument(skip_all)]
|
|
||||||
async fn client(
|
|
||||||
args: &'static Args,
|
|
||||||
timeline: TenantTimelineId,
|
|
||||||
start_work_barrier: Arc<Barrier>,
|
|
||||||
mut work: tokio::sync::mpsc::Receiver<PagestreamGetPageRequest>,
|
|
||||||
live_stats: Arc<LiveStats>,
|
|
||||||
cancel: CancellationToken,
|
|
||||||
) {
|
|
||||||
let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
|
|
||||||
.await
|
|
||||||
.unwrap();
|
|
||||||
let mut client = client
|
|
||||||
.pagestream(timeline.tenant_id, timeline.timeline_id)
|
|
||||||
.await
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let do_requests = async {
|
|
||||||
start_work_barrier.wait().await;
|
|
||||||
while let Some(req) = work.recv().await {
|
|
||||||
let start = Instant::now();
|
|
||||||
client
|
|
||||||
.getpage(req)
|
|
||||||
.await
|
|
||||||
.with_context(|| format!("getpage for {timeline}"))
|
|
||||||
.unwrap();
|
|
||||||
let elapsed = start.elapsed();
|
|
||||||
live_stats.inc();
|
|
||||||
STATS.with(|stats| {
|
|
||||||
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
|
|
||||||
});
|
|
||||||
}
|
|
||||||
};
|
|
||||||
tokio::select! {
|
|
||||||
res = do_requests => { res },
|
|
||||||
_ = cancel.cancelled() => {
|
|
||||||
client.shutdown().await;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
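The work-sender loop above samples keys uniformly across the whole keyspace in two steps: pick a range with probability proportional to its length, then pick a uniform key inside it. A minimal standalone sketch of that draw, using a simplified Range stand-in for the crate's KeyRange (the real code builds the WeightedIndex once, outside the loop):

use rand::distributions::{Distribution, WeightedIndex};
use rand::Rng;

struct Range {
    start: i128,
    end: i128,
}

fn sample_key(ranges: &[Range], rng: &mut impl Rng) -> i128 {
    // Weight each range by its length so the two-step draw is uniform over keys.
    let weights = WeightedIndex::new(ranges.iter().map(|r| (r.end - r.start) as u64))
        .expect("at least one non-empty range");
    let r = &ranges[weights.sample(rng)];
    rng.gen_range(r.start..r.end)
}

fn main() {
    let ranges = vec![Range { start: 0, end: 10 }, Range { start: 100, end: 400 }];
    let mut rng = rand::thread_rng();
    // The second range covers 30x more keys, so it is chosen ~30x as often.
    let key = sample_key(&ranges, &mut rng);
    assert!((0..10).contains(&key) || (100..400).contains(&key));
}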
@@ -1,88 +0,0 @@
use std::sync::Arc;

use humantime::Duration;
use tokio::task::JoinSet;
use utils::id::TenantTimelineId;

use pageserver_client::mgmt_api::ForceAwaitLogicalSize;

#[derive(clap::Parser)]
pub(crate) struct Args {
    #[clap(long, default_value = "http://localhost:9898")]
    mgmt_api_endpoint: String,
    #[clap(long, default_value = "localhost:64000")]
    page_service_host_port: String,
    #[clap(long)]
    pageserver_jwt: Option<String>,
    #[clap(
        long,
        help = "if specified, poll mgmt api to check whether init logical size calculation has completed"
    )]
    poll_for_completion: Option<Duration>,
    #[clap(long)]
    limit_to_first_n_targets: Option<usize>,
    targets: Option<Vec<TenantTimelineId>>,
}

pub(crate) fn main(args: Args) -> anyhow::Result<()> {
    let rt = tokio::runtime::Builder::new_multi_thread()
        .enable_all()
        .build()
        .unwrap();

    let main_task = rt.spawn(main_impl(args));
    rt.block_on(main_task).unwrap()
}

async fn main_impl(args: Args) -> anyhow::Result<()> {
    let args: &'static Args = Box::leak(Box::new(args));

    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
        args.mgmt_api_endpoint.clone(),
        args.pageserver_jwt.as_deref(),
    ));

    // discover targets
    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
        &mgmt_api_client,
        crate::util::cli::targets::Spec {
            limit_to_first_n_targets: args.limit_to_first_n_targets,
            targets: args.targets.clone(),
        },
    )
    .await?;

    // kick it off

    let mut js = JoinSet::new();
    for tl in timelines {
        let mgmt_api_client = Arc::clone(&mgmt_api_client);
        js.spawn(async move {
            let info = mgmt_api_client
                .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
                .await
                .unwrap();

            // Polling should not be strictly required here since we await
            // the initial logical size; however, it's possible for the request
            // to land before the timeline is initialised, which results in an
            // approximate logical size.
            if let Some(period) = args.poll_for_completion {
                let mut ticker = tokio::time::interval(period.into());
                ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
                let mut info = info;
                while !info.current_logical_size_is_accurate {
                    ticker.tick().await;
                    info = mgmt_api_client
                        .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
                        .await
                        .unwrap();
                }
            }
        });
    }
    while let Some(res) = js.join_next().await {
        let _: () = res.unwrap();
    }
    Ok(())
}
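The function above is an instance of the usual tokio fan-out/fan-in shape: spawn one JoinSet task per timeline, then drain join_next() until the set is empty, surfacing panics through the JoinError. A self-contained sketch of just that shape (tokio with the rt-multi-thread and macros features assumed):

use tokio::task::JoinSet;

#[tokio::main]
async fn main() {
    let mut js = JoinSet::new();
    for i in 0..4u32 {
        // One task per work item; tasks run concurrently on the runtime.
        js.spawn(async move { i * i });
    }
    let mut results = Vec::new();
    // join_next() yields results in completion order, not spawn order.
    while let Some(res) = js.join_next().await {
        results.push(res.unwrap());
    }
    results.sort();
    assert_eq!(results, vec![0, 1, 4, 9]);
}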
@@ -1,49 +0,0 @@
use clap::Parser;
use utils::logging;

/// Re-usable pieces of code that aren't CLI-specific.
mod util {
    pub(crate) mod connstring;
    pub(crate) mod request_stats;
    #[macro_use]
    pub(crate) mod tokio_thread_local_stats;
    /// Re-usable pieces of CLI-specific code.
    pub(crate) mod cli {
        pub(crate) mod targets;
    }
}

/// The pagebench CLI sub-commands, dispatched in [`main`] below.
mod cmd {
    pub(super) mod basebackup;
    pub(super) mod getpage_latest_lsn;
    pub(super) mod trigger_initial_size_calculation;
}

/// Component-level performance test for pageserver.
#[derive(clap::Parser)]
enum Args {
    Basebackup(cmd::basebackup::Args),
    GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
    TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
}

fn main() {
    logging::init(
        logging::LogFormat::Plain,
        logging::TracingErrorLayerEnablement::Disabled,
        logging::Output::Stderr,
    )
    .unwrap();
    logging::replace_panic_hook_with_tracing_panic_hook().forget();

    let args = Args::parse();
    match args {
        Args::Basebackup(args) => cmd::basebackup::main(args),
        Args::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args),
        Args::TriggerInitialSizeCalculation(args) => {
            cmd::trigger_initial_size_calculation::main(args)
        }
    }
    .unwrap()
}
@@ -1,34 +0,0 @@
use std::sync::Arc;

use pageserver_client::mgmt_api;
use tracing::info;
use utils::id::TenantTimelineId;

pub(crate) struct Spec {
    pub(crate) limit_to_first_n_targets: Option<usize>,
    pub(crate) targets: Option<Vec<TenantTimelineId>>,
}

pub(crate) async fn discover(
    api_client: &Arc<mgmt_api::Client>,
    spec: Spec,
) -> anyhow::Result<Vec<TenantTimelineId>> {
    let mut timelines = if let Some(targets) = spec.targets {
        targets
    } else {
        mgmt_api::util::get_pageserver_tenant_timelines_unsharded(api_client).await?
    };

    if let Some(limit) = spec.limit_to_first_n_targets {
        timelines.sort(); // for determinism
        timelines.truncate(limit);
        if timelines.len() < limit {
            anyhow::bail!("pageserver has fewer than limit_to_first_n_targets={limit} tenants");
        }
    }

    info!("timelines:\n{:?}", timelines);
    info!("number of timelines:\n{:?}", timelines.len());

    Ok(timelines)
}
@@ -1,8 +0,0 @@
pub(crate) fn connstring(host_port: &str, jwt: Option<&str>) -> String {
    let colon_and_jwt = if let Some(jwt) = jwt {
        format!(":{jwt}") // TODO: urlescape
    } else {
        String::new()
    };
    format!("postgres://postgres{colon_and_jwt}@{host_port}")
}
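A hypothetical unit test for connstring(), not part of the original file, showing that the JWT, when present, lands in the password slot of the libpq-style URL (unescaped, per the TODO above):

#[cfg(test)]
mod tests {
    use super::connstring;

    #[test]
    fn formats_with_and_without_jwt() {
        assert_eq!(
            connstring("localhost:64000", None),
            "postgres://postgres@localhost:64000"
        );
        assert_eq!(
            connstring("localhost:64000", Some("secret")),
            "postgres://postgres:secret@localhost:64000"
        );
    }
}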
@@ -1,88 +0,0 @@
use std::time::Duration;

use anyhow::Context;

pub(crate) struct Stats {
    latency_histo: hdrhistogram::Histogram<u64>,
}

impl Stats {
    pub(crate) fn new() -> Self {
        Self {
            // Initialize with fixed bounds so that we panic at runtime instead of resizing the histogram,
            // which would skew the benchmark results.
            latency_histo: hdrhistogram::Histogram::new_with_bounds(1, 1_000_000_000, 3).unwrap(),
        }
    }
    pub(crate) fn observe(&mut self, latency: Duration) -> anyhow::Result<()> {
        let micros: u64 = latency
            .as_micros()
            .try_into()
            .context("latency greater than u64")?;
        self.latency_histo
            .record(micros)
            .context("add to histogram")?;
        Ok(())
    }
    pub(crate) fn output(&self) -> Output {
        let latency_percentiles = std::array::from_fn(|idx| {
            let micros = self
                .latency_histo
                .value_at_percentile(LATENCY_PERCENTILES[idx]);
            Duration::from_micros(micros)
        });
        Output {
            request_count: self.latency_histo.len(),
            latency_mean: Duration::from_micros(self.latency_histo.mean() as u64),
            latency_percentiles: LatencyPercentiles {
                latency_percentiles,
            },
        }
    }
    pub(crate) fn add(&mut self, other: &Self) {
        let Self {
            ref mut latency_histo,
        } = self;
        latency_histo.add(&other.latency_histo).unwrap();
    }
}

impl Default for Stats {
    fn default() -> Self {
        Self::new()
    }
}

const LATENCY_PERCENTILES: [f64; 4] = [95.0, 99.00, 99.90, 99.99];

struct LatencyPercentiles {
    latency_percentiles: [Duration; 4],
}

impl serde::Serialize for LatencyPercentiles {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::SerializeMap;
        let mut ser = serializer.serialize_map(Some(LATENCY_PERCENTILES.len()))?;
        // Pair each percentile label with its own measured value
        // (a fixed index here would report the p95 duration for every entry).
        for (i, p) in LATENCY_PERCENTILES.iter().enumerate() {
            ser.serialize_entry(
                &format!("p{p}"),
                &format!(
                    "{}",
                    &humantime::format_duration(self.latency_percentiles[i])
                ),
            )?;
        }
        ser.end()
    }
}

#[derive(serde::Serialize)]
pub(crate) struct Output {
    request_count: u64,
    #[serde(with = "humantime_serde")]
    latency_mean: Duration,
    latency_percentiles: LatencyPercentiles,
}
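A minimal sketch of how this type is exercised end to end — record a few latencies, merge one histogram into another, then serialize the Output; demo() is an illustrative helper, not part of the file:

fn demo() -> anyhow::Result<()> {
    let mut a = Stats::new();
    let mut b = Stats::new();
    a.observe(std::time::Duration::from_micros(250))?;
    b.observe(std::time::Duration::from_millis(3))?;
    // Merge per-thread histograms into one aggregate, as main_impl does.
    a.add(&b);
    let json = serde_json::to_string_pretty(&a.output()).unwrap();
    println!("{json}");
    Ok(())
}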
@@ -1,45 +0,0 @@
pub(crate) type ThreadLocalStats<T> = Arc<Mutex<T>>;
pub(crate) type AllThreadLocalStats<T> = Arc<Mutex<Vec<ThreadLocalStats<T>>>>;

macro_rules! declare {
    ($THREAD_LOCAL_NAME:ident: $T:ty) => {
        thread_local! {
            pub static $THREAD_LOCAL_NAME: std::cell::RefCell<crate::util::tokio_thread_local_stats::ThreadLocalStats<$T>> = std::cell::RefCell::new(
                std::sync::Arc::new(std::sync::Mutex::new(Default::default()))
            );
        }
    };
}

use std::sync::{Arc, Mutex};

pub(crate) use declare;

macro_rules! main {
    ($THREAD_LOCAL_NAME:ident, $main_impl:expr) => {{
        let main_impl = $main_impl;
        let all = Arc::new(Mutex::new(Vec::new()));

        let rt = tokio::runtime::Builder::new_multi_thread()
            .on_thread_start({
                let all = Arc::clone(&all);
                move || {
                    // pre-initialize the thread-local stats by accessing them
                    // (some stats like request_stats::Stats are quite costly to initialize,
                    // we don't want to pay that cost during the measurement period)
                    $THREAD_LOCAL_NAME.with(|stats| {
                        let stats: Arc<_> = Arc::clone(&*stats.borrow());
                        all.lock().unwrap().push(stats);
                    });
                }
            })
            .enable_all()
            .build()
            .unwrap();

        let main_task = rt.spawn(main_impl(all));
        rt.block_on(main_task).unwrap()
    }};
}

pub(crate) use main;
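Hypothetical wiring of the two macros, mirroring how the getpage_latest_lsn command above consumes them; the exact invocation site is illustrative:

crate::util::tokio_thread_local_stats::declare!(STATS: crate::util::request_stats::Stats);

pub(crate) fn main(args: Args) -> anyhow::Result<()> {
    // Each runtime worker thread registers its Arc<Mutex<Stats>> in `all` on
    // start; main_impl later aggregates them without touching hot-path locks.
    crate::util::tokio_thread_local_stats::main!(STATS, move |all_thread_local_stats| {
        main_impl(args, all_thread_local_stats)
    })
}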
@@ -23,7 +23,6 @@ use tracing::*;
 use tokio_tar::{Builder, EntryType, Header};
 
 use crate::context::RequestContext;
-use crate::pgdatadir_mapping::Version;
 use crate::tenant::Timeline;
 use pageserver_api::reltag::{RelTag, SlruKind};
 
@@ -175,7 +174,7 @@ where
         ] {
             for segno in self
                 .timeline
-                .list_slru_segments(kind, Version::Lsn(self.lsn), self.ctx)
+                .list_slru_segments(kind, self.lsn, self.ctx)
                 .await?
             {
                 self.add_slru_segment(kind, segno).await?;
@@ -193,7 +192,7 @@ where
         // Otherwise only include init forks of unlogged relations.
         let rels = self
             .timeline
-            .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
+            .list_rels(spcnode, dbnode, self.lsn, self.ctx)
             .await?;
         for &rel in rels.iter() {
             // Send init fork as main fork to provide well formed empty
@@ -268,7 +267,7 @@ where
     async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
         let nblocks = self
             .timeline
-            .get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx)
+            .get_rel_size(src, self.lsn, false, self.ctx)
             .await?;
 
         // If the relation is empty, create an empty file
@@ -289,7 +288,7 @@ where
         for blknum in startblk..endblk {
             let img = self
                 .timeline
-                .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx)
+                .get_rel_page_at_lsn(src, blknum, self.lsn, false, self.ctx)
                 .await?;
             segment_data.extend_from_slice(&img[..]);
         }
@@ -311,7 +310,7 @@ where
     async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
         let nblocks = self
             .timeline
-            .get_slru_segment_size(slru, segno, Version::Lsn(self.lsn), self.ctx)
+            .get_slru_segment_size(slru, segno, self.lsn, self.ctx)
             .await?;
 
         let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
@@ -353,7 +352,7 @@ where
         let relmap_img = if has_relmap_file {
             let img = self
                 .timeline
-                .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
+                .get_relmap_file(spcnode, dbnode, self.lsn, self.ctx)
                 .await?;
 
             ensure!(
@@ -400,7 +399,7 @@ where
         if !has_relmap_file
             && self
                 .timeline
-                .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
+                .list_rels(spcnode, dbnode, self.lsn, self.ctx)
                 .await?
                 .is_empty()
         {
@@ -31,7 +31,6 @@ use pageserver::{
     virtual_file,
 };
 use postgres_backend::AuthType;
-use utils::failpoint_support;
 use utils::logging::TracingErrorLayerEnablement;
 use utils::signals::ShutdownSignals;
 use utils::{
@@ -127,7 +126,7 @@ fn main() -> anyhow::Result<()> {
     }
 
     // Initialize up failpoints support
-    let scenario = failpoint_support::init();
+    let scenario = pageserver::failpoint_support::init();
 
     // Basic initialization of things that don't change after startup
     virtual_file::init(conf.max_file_descriptors);
@@ -527,7 +526,6 @@ fn start_pageserver(
         conf,
         remote_storage.clone(),
         disk_usage_eviction_state.clone(),
-        tenant_manager.clone(),
         background_jobs_barrier.clone(),
     )?;
 }
@@ -37,8 +37,8 @@ use crate::tenant::{
     TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
 };
 use crate::{
-    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
-    TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
+    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME,
+    TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
 };
 
 use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
@@ -75,9 +75,6 @@ pub mod defaults {
     pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
 
     pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
-    pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1;
-
-    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
 
     ///
     /// Default built-in configuration file.
@@ -91,7 +88,6 @@ pub mod defaults {
 #wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}'
 #wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}'
 
-#page_cache_size = {DEFAULT_PAGE_CACHE_SIZE}
 #max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS}
 
 # initial superuser role name to use when creating a new tenant
@@ -112,8 +108,6 @@ pub mod defaults {
 
 #background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}'
 
-#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE}
-
 [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -131,7 +125,6 @@ pub mod defaults {
 #gc_feedback = false
 
 #heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
-#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}
 
 [remote_storage]
 
@@ -240,13 +233,6 @@ pub struct PageServerConf {
     /// How many heatmap uploads may be done concurrency: lower values implicitly deprioritize
     /// heatmap uploads vs. other remote storage operations.
     pub heatmap_upload_concurrency: usize,
-
-    /// How many remote storage downloads may be done for secondary tenants concurrently. Implicitly
-    /// deprioritises secondary downloads vs. remote storage operations for attached tenants.
-    pub secondary_download_concurrency: usize,
-
-    /// Maximum number of WAL records to be ingested and committed at the same time
-    pub ingest_batch_size: u64,
 }
 
 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -328,9 +314,6 @@ struct PageServerConfigBuilder {
     control_plane_emergency_mode: BuilderValue<bool>,
 
     heatmap_upload_concurrency: BuilderValue<usize>,
-    secondary_download_concurrency: BuilderValue<usize>,
-
-    ingest_batch_size: BuilderValue<u64>,
 }
 
 impl Default for PageServerConfigBuilder {
@@ -403,9 +386,6 @@ impl Default for PageServerConfigBuilder {
             control_plane_emergency_mode: Set(false),
 
             heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
-            secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
-
-            ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
         }
     }
 }
@@ -554,14 +534,6 @@ impl PageServerConfigBuilder {
         self.heatmap_upload_concurrency = BuilderValue::Set(value)
     }
 
-    pub fn secondary_download_concurrency(&mut self, value: usize) {
-        self.secondary_download_concurrency = BuilderValue::Set(value)
-    }
-
-    pub fn ingest_batch_size(&mut self, ingest_batch_size: u64) {
-        self.ingest_batch_size = BuilderValue::Set(ingest_batch_size)
-    }
-
     pub fn build(self) -> anyhow::Result<PageServerConf> {
         let concurrent_tenant_warmup = self
             .concurrent_tenant_warmup
@@ -660,15 +632,10 @@ impl PageServerConfigBuilder {
             control_plane_emergency_mode: self
                 .control_plane_emergency_mode
                 .ok_or(anyhow!("missing control_plane_emergency_mode"))?,
 
             heatmap_upload_concurrency: self
                 .heatmap_upload_concurrency
                 .ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
-            secondary_download_concurrency: self
-                .secondary_download_concurrency
-                .ok_or(anyhow!("missing secondary_download_concurrency"))?,
-            ingest_batch_size: self
-                .ingest_batch_size
-                .ok_or(anyhow!("missing ingest_batch_size"))?,
         })
     }
 }
@@ -726,11 +693,6 @@ impl PageServerConf {
             .join(TENANT_LOCATION_CONFIG_NAME)
     }
 
-    pub(crate) fn tenant_heatmap_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
-        self.tenant_path(tenant_shard_id)
-            .join(TENANT_HEATMAP_BASENAME)
-    }
-
     pub fn timelines_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
         self.tenant_path(tenant_shard_id)
             .join(TIMELINES_SEGMENT_NAME)
@@ -916,10 +878,6 @@ impl PageServerConf {
             "heatmap_upload_concurrency" => {
                 builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize)
             },
-            "secondary_download_concurrency" => {
-                builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize)
-            },
-            "ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?),
             _ => bail!("unrecognized pageserver option '{key}'"),
         }
     }
@@ -991,8 +949,6 @@ impl PageServerConf {
             control_plane_api_token: None,
             control_plane_emergency_mode: false,
             heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
-            secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
-            ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
         }
     }
 }
@@ -1126,12 +1082,11 @@ mod tests {
     };
 
     use camino_tempfile::{tempdir, Utf8TempDir};
-    use pageserver_api::models::EvictionPolicy;
     use remote_storage::{RemoteStorageKind, S3Config};
     use utils::serde_percent::Percent;
 
     use super::*;
-    use crate::DEFAULT_PG_VERSION;
+    use crate::{tenant::config::EvictionPolicy, DEFAULT_PG_VERSION};
 
     const ALL_BASE_VALUES_TOML: &str = r#"
 # Initial configuration file created by 'pageserver --init'
@@ -1222,9 +1177,7 @@ background_task_maximum_delay = '334 s'
                 control_plane_api: None,
                 control_plane_api_token: None,
                 control_plane_emergency_mode: false,
-                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
-                secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
-                ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
+                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
             },
             "Correct defaults should be used when no config values are provided"
         );
@@ -1285,9 +1238,7 @@ background_task_maximum_delay = '334 s'
                 control_plane_api: None,
                 control_plane_api_token: None,
                 control_plane_emergency_mode: false,
-                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
-                secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
-                ingest_batch_size: 100,
+                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
             },
             "Should be able to parse all basic config values correctly"
         );
@@ -1517,7 +1468,6 @@ threshold = "20m"
             period: Duration::from_secs(10),
             #[cfg(feature = "testing")]
             mock_statvfs: None,
-            eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed,
         })
     );
     match &conf.default_tenant_conf.eviction_policy {
@@ -267,7 +267,7 @@ async fn calculate_synthetic_size_worker(
         }
     };
 
-    for (tenant_shard_id, tenant_state, _gen) in tenants {
+    for (tenant_shard_id, tenant_state) in tenants {
         if tenant_state != TenantState::Active {
             continue;
         }
@@ -196,7 +196,7 @@ pub(super) async fn collect_all_metrics(
         }
     };
 
-    let tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move {
+    let tenants = futures::stream::iter(tenants).filter_map(|(id, state)| async move {
         if state != TenantState::Active || !id.is_zero() {
             None
         } else {
@@ -1,6 +1,5 @@
 use std::collections::HashMap;
 
-use futures::Future;
 use pageserver_api::{
     control_api::{
         ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
@@ -29,14 +28,13 @@ pub enum RetryForeverError {
 ShuttingDown,
 }
 
+#[async_trait::async_trait]
 pub trait ControlPlaneGenerationsApi {
-    fn re_attach(
-        &self,
-    ) -> impl Future<Output = Result<HashMap<TenantShardId, Generation>, RetryForeverError>> + Send;
-    fn validate(
+    async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError>;
+    async fn validate(
         &self,
         tenants: Vec<(TenantShardId, Generation)>,
-    ) -> impl Future<Output = Result<HashMap<TenantShardId, bool>, RetryForeverError>> + Send;
+    ) -> Result<HashMap<TenantShardId, bool>, RetryForeverError>;
 }
 
 impl ControlPlaneClient {
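The hunk converts between two ways of writing the same async trait: an explicit return-position impl Future (rustc 1.75+) and the #[async_trait] macro, which boxes the future and keeps the trait object-safe. A compiling side-by-side sketch (async-trait crate assumed; names are illustrative):

use std::future::Future;

trait DesugaredApi {
    // Desugared form: the future type is an opaque impl Trait, no boxing.
    fn re_attach(&self) -> impl Future<Output = u32> + Send;
}

#[async_trait::async_trait]
trait BoxedApi {
    // Macro form: expands to fn re_attach(&self) -> Pin<Box<dyn Future ...>>.
    async fn re_attach(&self) -> u32;
}

struct Client;

impl DesugaredApi for Client {
    fn re_attach(&self) -> impl Future<Output = u32> + Send {
        async { 42 }
    }
}

#[async_trait::async_trait]
impl BoxedApi for Client {
    async fn re_attach(&self) -> u32 {
        42
    }
}

fn main() {}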
@@ -125,6 +123,7 @@ impl ControlPlaneClient {
         }
     }
 
+#[async_trait::async_trait]
 impl ControlPlaneGenerationsApi for ControlPlaneClient {
     /// Block until we get a successful response, or error out if we are shut down
     async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
@@ -831,6 +831,7 @@ mod test {
         }
     }
 
+#[async_trait::async_trait]
 impl ControlPlaneGenerationsApi for MockControlPlane {
     #[allow(clippy::diverging_sub_expression)] // False positive via async_trait
     async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
@@ -47,24 +47,21 @@ use std::{
 };
 
 use anyhow::Context;
-use pageserver_api::shard::TenantShardId;
+use camino::Utf8Path;
 use remote_storage::GenericRemoteStorage;
 use serde::{Deserialize, Serialize};
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, instrument, warn, Instrument};
+use utils::completion;
 use utils::serde_percent::Percent;
-use utils::{completion, id::TimelineId};
 
 use crate::{
     config::PageServerConf,
     task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
     tenant::{
         self,
-        mgr::TenantManager,
-        remote_timeline_client::LayerFileMetadata,
-        secondary::SecondaryTenant,
-        storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName},
+        storage_layer::{AsLayerDesc, EvictionError, Layer},
         Timeline,
     },
 };
@@ -77,45 +74,6 @@ pub struct DiskUsageEvictionTaskConfig {
     pub period: Duration,
     #[cfg(feature = "testing")]
     pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
-    /// Select sorting for evicted layers
-    #[serde(default)]
-    pub eviction_order: EvictionOrder,
-}
-
-/// Selects the sort order for eviction candidates *after* per tenant `min_resident_size`
-/// partitioning.
-#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
-#[serde(tag = "type", content = "args")]
-pub enum EvictionOrder {
-    /// Order the layers to be evicted by how recently they have been accessed in absolute
-    /// time.
-    ///
-    /// This strategy is unfair when some tenants grow faster than others towards the slower
-    /// growing.
-    #[default]
-    AbsoluteAccessed,
-
-    /// Order the layers to be evicted by how recently they have been accessed relatively within
-    /// the set of resident layers of a tenant.
-    ///
-    /// This strategy will evict layers more fairly but is untested.
-    RelativeAccessed {
-        #[serde(default)]
-        highest_layer_count_loses_first: bool,
-    },
-}
-
-impl EvictionOrder {
-    /// Return true, if with [`Self::RelativeAccessed`] order the tenants with the highest layer
-    /// counts should be the first ones to have their layers evicted.
-    fn highest_layer_count_loses_first(&self) -> bool {
-        match self {
-            EvictionOrder::AbsoluteAccessed => false,
-            EvictionOrder::RelativeAccessed {
-                highest_layer_count_loses_first,
-            } => *highest_layer_count_loses_first,
-        }
-    }
 }
 
 #[derive(Default)]
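The deleted EvictionOrder enum used serde's adjacently tagged representation (tag = "type", content = "args"). A standalone sketch of the wire format it round-trips, with a trimmed copy of the enum (serde derive and serde_json assumed):

#[derive(Debug, PartialEq, serde::Serialize, serde::Deserialize)]
#[serde(tag = "type", content = "args")]
enum EvictionOrder {
    AbsoluteAccessed,
    RelativeAccessed { highest_layer_count_loses_first: bool },
}

fn main() {
    // Adjacent tagging puts the variant name under "type" and its fields under "args".
    let v: EvictionOrder = serde_json::from_str(
        r#"{"type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": true}}"#,
    )
    .unwrap();
    assert_eq!(
        v,
        EvictionOrder::RelativeAccessed { highest_layer_count_loses_first: true }
    );
}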
@@ -128,7 +86,6 @@ pub fn launch_disk_usage_global_eviction_task(
     conf: &'static PageServerConf,
     storage: GenericRemoteStorage,
     state: Arc<State>,
-    tenant_manager: Arc<TenantManager>,
     background_jobs_barrier: completion::Barrier,
 ) -> anyhow::Result<()> {
     let Some(task_config) = &conf.disk_usage_based_eviction else {
@@ -154,7 +111,8 @@ pub fn launch_disk_usage_global_eviction_task(
             _ = background_jobs_barrier.wait() => { }
         };
 
-        disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel).await;
+        disk_usage_eviction_task(&state, task_config, &storage, &conf.tenants_path(), cancel)
+            .await;
         Ok(())
     },
 );
@@ -167,7 +125,7 @@ async fn disk_usage_eviction_task(
     state: &State,
     task_config: &DiskUsageEvictionTaskConfig,
     storage: &GenericRemoteStorage,
-    tenant_manager: Arc<TenantManager>,
+    tenants_dir: &Utf8Path,
     cancel: CancellationToken,
 ) {
     scopeguard::defer! {
@@ -194,7 +152,7 @@ async fn disk_usage_eviction_task(
         state,
         task_config,
         storage,
-        &tenant_manager,
+        tenants_dir,
         &cancel,
     )
     .await;
@@ -229,21 +187,12 @@ async fn disk_usage_eviction_task_iteration(
     state: &State,
     task_config: &DiskUsageEvictionTaskConfig,
     storage: &GenericRemoteStorage,
-    tenant_manager: &Arc<TenantManager>,
+    tenants_dir: &Utf8Path,
     cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
-    let tenants_dir = tenant_manager.get_conf().tenants_path();
-    let usage_pre = filesystem_level_usage::get(&tenants_dir, task_config)
+    let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
         .context("get filesystem-level disk usage before evictions")?;
-    let res = disk_usage_eviction_task_iteration_impl(
-        state,
-        storage,
-        usage_pre,
-        tenant_manager,
-        task_config.eviction_order,
-        cancel,
-    )
-    .await;
+    let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
     match res {
         Ok(outcome) => {
             debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -253,7 +202,7 @@ async fn disk_usage_eviction_task_iteration(
         }
         IterationOutcome::Finished(outcome) => {
             // Verify with statvfs whether we made any real progress
-            let after = filesystem_level_usage::get(&tenants_dir, task_config)
+            let after = filesystem_level_usage::get(tenants_dir, task_config)
                 // It's quite unlikely to hit the error here. Keep the code simple and bail out.
                 .context("get filesystem-level disk usage after evictions")?;
 
@@ -329,8 +278,6 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
     state: &State,
     _storage: &GenericRemoteStorage,
     usage_pre: U,
-    tenant_manager: &Arc<TenantManager>,
-    eviction_order: EvictionOrder,
     cancel: &CancellationToken,
 ) -> anyhow::Result<IterationOutcome<U>> {
     // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex)
@@ -350,29 +297,29 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
         "running disk usage based eviction due to pressure"
     );
 
-    let candidates =
-        match collect_eviction_candidates(tenant_manager, eviction_order, cancel).await? {
-            EvictionCandidates::Cancelled => {
-                return Ok(IterationOutcome::Cancelled);
-            }
-            EvictionCandidates::Finished(partitioned) => partitioned,
-        };
+    let candidates = match collect_eviction_candidates(cancel).await? {
+        EvictionCandidates::Cancelled => {
+            return Ok(IterationOutcome::Cancelled);
+        }
+        EvictionCandidates::Finished(partitioned) => partitioned,
+    };
 
     // Debug-log the list of candidates
     let now = SystemTime::now();
     for (i, (partition, candidate)) in candidates.iter().enumerate() {
-        let nth = i + 1;
-        let total_candidates = candidates.len();
-        let size = candidate.layer.get_file_size();
-        let rel = candidate.relative_last_activity;
+        let desc = candidate.layer.layer_desc();
         debug!(
-            "cand {nth}/{total_candidates}: size={size}, rel_last_activity={rel}, no_access_for={}us, partition={partition:?}, {}/{}/{}",
+            "cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}",
+            i + 1,
+            candidates.len(),
+            desc.file_size,
             now.duration_since(candidate.last_activity_ts)
                 .unwrap()
                 .as_micros(),
-            candidate.layer.get_tenant_shard_id(),
-            candidate.layer.get_timeline_id(),
-            candidate.layer.get_name(),
+            partition,
+            desc.tenant_shard_id,
+            desc.timeline_id,
+            candidate.layer,
         );
     }
 
@@ -386,56 +333,39 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
     // If we get far enough in the list that we start to evict layers that are below
     // the tenant's min-resident-size threshold, print a warning, and memorize the disk
     // usage at that point, in 'usage_planned_min_resident_size_respecting'.
-    let selection = select_victims(&candidates, usage_pre);
-
-    let mut candidates = candidates;
-
-    let selection = if matches!(eviction_order, EvictionOrder::RelativeAccessed { .. }) {
-        // we currently have the layers ordered by AbsoluteAccessed so that we can get the summary
-        // for comparison here. this is a temporary measure to develop alternatives.
-        use std::fmt::Write;
-
-        let mut summary_buf = String::with_capacity(256);
-
-        {
-            let absolute_summary = candidates
-                .iter()
-                .take(selection.amount)
-                .map(|(_, candidate)| candidate)
-                .collect::<summary::EvictionSummary>();
-
-            write!(summary_buf, "{absolute_summary}").expect("string grows");
-
-            info!("absolute accessed selection summary: {summary_buf}");
-        }
-
-        candidates.sort_unstable_by_key(|(partition, candidate)| {
-            (*partition, candidate.relative_last_activity)
-        });
-
-        let selection = select_victims(&candidates, usage_pre);
-
-        {
-            summary_buf.clear();
-
-            let relative_summary = candidates
-                .iter()
-                .take(selection.amount)
-                .map(|(_, candidate)| candidate)
-                .collect::<summary::EvictionSummary>();
-
-            write!(summary_buf, "{relative_summary}").expect("string grows");
-
-            info!("relative accessed selection summary: {summary_buf}");
-        }
-
-        selection
-    } else {
-        selection
-    };
-
-    let (evicted_amount, usage_planned) = selection.into_amount_and_planned();
+    let mut warned = None;
+    let mut usage_planned = usage_pre;
+    let mut evicted_amount = 0;
+
+    for (i, (partition, candidate)) in candidates.iter().enumerate() {
+        if !usage_planned.has_pressure() {
+            debug!(
+                no_candidates_evicted = i,
+                "took enough candidates for pressure to be relieved"
+            );
+            break;
+        }
+
+        if partition == &MinResidentSizePartition::Below && warned.is_none() {
+            warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
+            warned = Some(usage_planned);
+        }
+
+        usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
+        evicted_amount += 1;
+    }
+
+    let usage_planned = match warned {
+        Some(respecting_tenant_min_resident_size) => PlannedUsage {
+            respecting_tenant_min_resident_size,
+            fallback_to_global_lru: Some(usage_planned),
+        },
+        None => PlannedUsage {
+            respecting_tenant_min_resident_size: usage_planned,
+            fallback_to_global_lru: None,
+        },
+    };
+    debug!(?usage_planned, "usage planned");
 
     // phase2: evict layers
 
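The + side of this hunk plans evictions by walking candidates in order until the projected usage no longer has pressure. A generic sketch of that accounting, with a simplified Usage stand-in for the module's real trait:

#[derive(Clone, Copy, Debug)]
struct Usage {
    used: u64,
    target: u64,
}

impl Usage {
    fn has_pressure(&self) -> bool {
        self.used > self.target
    }
    fn add_available_bytes(&mut self, bytes: u64) {
        // Planning only: pretend the candidate's bytes were already freed.
        self.used = self.used.saturating_sub(bytes);
    }
}

fn main() {
    let candidate_sizes = [100u64, 400, 250, 50];
    let mut usage = Usage { used: 1000, target: 600 };
    let mut evicted = 0;
    for size in candidate_sizes {
        if !usage.has_pressure() {
            break;
        }
        usage.add_available_bytes(size);
        evicted += 1;
    }
    assert_eq!(evicted, 2); // freeing 100 + 400 brings used to 500 <= 600
    println!("planned to evict {evicted} layers, final {usage:?}");
}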
@@ -486,30 +416,19 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
             continue;
         };
 
-        match candidate.layer {
-            EvictionLayer::Attached(layer) => {
-                let file_size = layer.layer_desc().file_size;
-                js.spawn(async move {
-                    layer
-                        .evict_and_wait()
-                        .await
-                        .map(|()| file_size)
-                        .map_err(|e| (file_size, e))
-                });
-            }
-            EvictionLayer::Secondary(layer) => {
-                let file_size = layer.metadata.file_size();
-                let tenant_manager = tenant_manager.clone();
-
-                js.spawn(async move {
-                    layer
-                        .secondary_tenant
-                        .evict_layer(tenant_manager.get_conf(), layer.timeline_id, layer.name)
-                        .await;
-                    Ok(file_size)
-                });
-            }
-        }
+        js.spawn(async move {
+            let rtc = candidate.timeline.remote_client.as_ref().expect(
+                "holding the witness, all timelines must have a remote timeline client",
+            );
+            let file_size = candidate.layer.layer_desc().file_size;
+            candidate
+                .layer
+                .evict_and_wait(rtc)
+                .await
+                .map(|()| file_size)
+                .map_err(|e| (file_size, e))
+        });
         tokio::task::yield_now().await;
     }
 
@@ -536,100 +455,10 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
 }

 #[derive(Clone)]
-pub(crate) struct EvictionSecondaryLayer {
-    pub(crate) secondary_tenant: Arc<SecondaryTenant>,
-    pub(crate) timeline_id: TimelineId,
-    pub(crate) name: LayerFileName,
-    pub(crate) metadata: LayerFileMetadata,
-}
-
-/// Full [`Layer`] objects are specific to tenants in attached mode. This type is a layer
-/// of indirection to store either a `Layer`, or a reference to a secondary tenant and a layer name.
-#[derive(Clone)]
-pub(crate) enum EvictionLayer {
-    Attached(Layer),
-    #[allow(dead_code)]
-    Secondary(EvictionSecondaryLayer),
-}
-
-impl From<Layer> for EvictionLayer {
-    fn from(value: Layer) -> Self {
-        Self::Attached(value)
-    }
-}
-
-impl EvictionLayer {
-    pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId {
-        match self {
-            Self::Attached(l) => &l.layer_desc().tenant_shard_id,
-            Self::Secondary(sl) => sl.secondary_tenant.get_tenant_shard_id(),
-        }
-    }
-
-    pub(crate) fn get_timeline_id(&self) -> &TimelineId {
-        match self {
-            Self::Attached(l) => &l.layer_desc().timeline_id,
-            Self::Secondary(sl) => &sl.timeline_id,
-        }
-    }
-
-    pub(crate) fn get_name(&self) -> LayerFileName {
-        match self {
-            Self::Attached(l) => l.layer_desc().filename(),
-            Self::Secondary(sl) => sl.name.clone(),
-        }
-    }
-
-    pub(crate) fn get_file_size(&self) -> u64 {
-        match self {
-            Self::Attached(l) => l.layer_desc().file_size,
-            Self::Secondary(sl) => sl.metadata.file_size(),
-        }
-    }
-}
-
-#[derive(Clone)]
-pub(crate) struct EvictionCandidate {
-    pub(crate) layer: EvictionLayer,
-    pub(crate) last_activity_ts: SystemTime,
-    pub(crate) relative_last_activity: finite_f32::FiniteF32,
-}
-
-impl std::fmt::Display for EvictionLayer {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        match self {
-            Self::Attached(l) => l.fmt(f),
-            Self::Secondary(sl) => {
-                write!(f, "{}/{}", sl.timeline_id, sl.name)
-            }
-        }
-    }
-}
-
-pub(crate) struct DiskUsageEvictionInfo {
-    /// Timeline's largest layer (remote or resident)
-    pub max_layer_size: Option<u64>,
-    /// Timeline's resident layers
-    pub resident_layers: Vec<EvictionCandidate>,
-}
-
-impl std::fmt::Debug for EvictionCandidate {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        // format the tv_sec, tv_nsec into rfc3339 in case someone is looking at it
-        // having to allocate a string for this is bad, but it will rarely be formatted
-        let ts = chrono::DateTime::<chrono::Utc>::from(self.last_activity_ts);
-        let ts = ts.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true);
-        struct DisplayIsDebug<'a, T>(&'a T);
-        impl<'a, T: std::fmt::Display> std::fmt::Debug for DisplayIsDebug<'a, T> {
-            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-                write!(f, "{}", self.0)
-            }
-        }
-        f.debug_struct("LocalLayerInfoForDiskUsageEviction")
-            .field("layer", &DisplayIsDebug(&self.layer))
-            .field("last_activity", &ts)
-            .finish()
-    }
-}
+struct EvictionCandidate {
+    timeline: Arc<Timeline>,
+    layer: Layer,
+    last_activity_ts: SystemTime,
 }

 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
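The deleted `EvictionLayer` enum above is an indirection so the eviction planner can treat attached-mode layers and secondary-location layers uniformly. A simplified stand-in (types invented for the sketch, not the pageserver's real ones):

```rust
// One candidate type that can point at either an attached layer object or a
// (layer name, size) pair recorded for a secondary location.
enum LayerRef {
    Attached { file_size: u64 },
    Secondary { name: String, file_size: u64 },
}

impl LayerRef {
    // planners only need uniform accessors, regardless of tenant mode
    fn file_size(&self) -> u64 {
        match self {
            LayerRef::Attached { file_size } => *file_size,
            LayerRef::Secondary { file_size, .. } => *file_size,
        }
    }
}

fn main() {
    let candidates = [
        LayerRef::Attached { file_size: 4096 },
        LayerRef::Secondary { name: "layer-a".to_string(), file_size: 8192 },
    ];
    let total: u64 = candidates.iter().map(LayerRef::file_size).sum();
    assert_eq!(total, 12288);
}
```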
@@ -649,24 +478,24 @@ enum EvictionCandidates {
 /// order. A caller that evicts in that order, until pressure is relieved, implements
 /// the eviction policy outlined in the module comment.
 ///
-/// # Example with EvictionOrder::AbsoluteAccessed
+/// # Example
 ///
 /// Imagine that there are two tenants, A and B, with five layers each, a-e.
 /// Each layer has size 100, and both tenants' min_resident_size is 150.
 /// The eviction order would be
 ///
 /// ```text
 /// partition  last_activity_ts  tenant/layer
 /// Above      18:30             A/c
 /// Above      19:00             A/b
 /// Above      18:29             B/c
 /// Above      19:05             B/b
 /// Above      20:00             B/a
 /// Above      20:03             A/a
 /// Below      20:30             A/d
 /// Below      20:40             B/d
 /// Below      20:45             B/e
 /// Below      20:58             A/e
 /// ```
 ///
 /// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`.
@@ -676,78 +505,7 @@ enum EvictionCandidates {
 /// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition
 /// after exhausting the `Above` partition.
 /// So, we did not respect each tenant's min_resident_size.
-///
-/// # Example with EvictionOrder::RelativeAccessed
-///
-/// ```text
-/// partition  relative_age  last_activity_ts  tenant/layer
-/// Above      0/4           18:30             A/c
-/// Above      0/4           18:29             B/c
-/// Above      1/4           19:00             A/b
-/// Above      1/4           19:05             B/b
-/// Above      2/4           20:00             B/a
-/// Above      2/4           20:03             A/a
-/// Below      3/4           20:30             A/d
-/// Below      3/4           20:40             B/d
-/// Below      4/4           20:45             B/e
-/// Below      4/4           20:58             A/e
-/// ```
-///
-/// With tenants having the same number of layers the picture does not change much. The same with
-/// A having many more layers **resident** (not all of them listed):
-///
-/// ```text
-/// Above      0/100         18:30             A/c
-/// Above      0/4           18:29             B/c
-/// Above      1/100         19:00             A/b
-/// Above      2/100         20:03             A/a
-/// Above      3/100         20:03             A/nth_3
-/// Above      4/100         20:03             A/nth_4
-/// ...
-/// Above      1/4           19:05             B/b
-/// Above      25/100        20:04             A/nth_25
-/// ...
-/// Above      2/4           20:00             B/a
-/// Above      50/100        20:10             A/nth_50
-/// ...
-/// Below      3/4           20:40             B/d
-/// Below      99/100        20:30             A/nth_99
-/// Below      4/4           20:45             B/e
-/// Below      100/100       20:58             A/nth_100
-/// ```
-///
-/// Now it's easier to see that, because A has grown fast, it has more layers that will get
-/// evicted. What is more difficult to see is what happens on the next round, assuming that
-/// evicting the 23 layers above relieves the pressure (22 A layers gone, 1 B layer gone) but a
-/// new fast-growing tenant C has appeared:
-///
-/// ```text
-/// Above      0/87          20:04             A/nth_23
-/// Above      0/3           19:05             B/b
-/// Above      0/50          20:59             C/nth_0
-/// Above      1/87          20:04             A/nth_24
-/// Above      1/50          21:00             C/nth_1
-/// Above      2/87          20:04             A/nth_25
-/// ...
-/// Above      16/50         21:02             C/nth_16
-/// Above      1/3           20:00             B/a
-/// Above      27/87         20:10             A/nth_50
-/// ...
-/// Below      2/3           20:40             B/d
-/// Below      49/50         21:05             C/nth_49
-/// Below      86/87         20:30             A/nth_99
-/// Below      3/3           20:45             B/e
-/// Below      50/50         21:05             C/nth_50
-/// Below      87/87         20:58             A/nth_100
-/// ```
-///
-/// Now relieving pressure with 23 layers would cost:
-/// - tenant A 14 layers
-/// - tenant B 1 layer
-/// - tenant C 8 layers
 async fn collect_eviction_candidates(
-    tenant_manager: &Arc<TenantManager>,
-    eviction_order: EvictionOrder,
     cancel: &CancellationToken,
 ) -> anyhow::Result<EvictionCandidates> {
     // get a snapshot of the list of tenants
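To make the partitioning described above concrete: layers are walked most-recently-used first, and a layer falls into the `Above` (evict-first) partition once the running size of more-recent layers already exceeds the tenant's min_resident_size. A small sketch under those assumptions (`partition_layers` is an invented name):

```rust
fn partition_layers(sizes_mru_first: &[u64], min_resident_size: u64) -> Vec<bool> {
    let mut cumsum: u64 = 0;
    sizes_mru_first
        .iter()
        .map(|size| {
            let above = cumsum > min_resident_size; // true => evictable first
            cumsum += size;
            above
        })
        .collect()
}

fn main() {
    // the A/B example from the doc comment: five layers of size 100 and
    // min_resident_size = 150 protect the two most recently used layers
    assert_eq!(
        partition_layers(&[100; 5], 150),
        vec![false, false, true, true, true]
    );
}
```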
@@ -755,16 +513,13 @@ async fn collect_eviction_candidates(
         .await
         .context("get list of tenants")?;

-    // TODO: avoid listing every layer in every tenant: this loop can block the executor,
-    // and the resulting data structure can be huge.
-    // (https://github.com/neondatabase/neon/issues/6224)
     let mut candidates = Vec::new();

-    for (tenant_id, _state, _gen) in tenants {
+    for (tenant_id, _state) in &tenants {
         if cancel.is_cancelled() {
             return Ok(EvictionCandidates::Cancelled);
         }
-        let tenant = match tenant::mgr::get_tenant(tenant_id, true) {
+        let tenant = match tenant::mgr::get_tenant(*tenant_id, true) {
             Ok(tenant) => tenant,
             Err(e) => {
                 // this can happen if the tenant has a lifecycle transition after we fetched it
@@ -792,7 +547,11 @@ async fn collect_eviction_candidates(
         }
         let info = tl.get_local_layers_for_disk_usage_eviction().await;
         debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
-        tenant_candidates.extend(info.resident_layers.into_iter());
+        tenant_candidates.extend(
+            info.resident_layers
+                .into_iter()
+                .map(|layer_infos| (tl.clone(), layer_infos)),
+        );
         max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));

         if cancel.is_cancelled() {
@@ -813,16 +572,14 @@ async fn collect_eviction_candidates(
         // A default override can be put in the default tenant conf in the pageserver.toml.
         let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
             debug!(
-                tenant_id=%tenant.tenant_shard_id().tenant_id,
-                shard_id=%tenant.tenant_shard_id().shard_slug(),
+                tenant_id=%tenant.tenant_id(),
                 overridden_size=s,
                 "using overridden min resident size for tenant"
             );
             s
         } else {
             debug!(
-                tenant_id=%tenant.tenant_shard_id().tenant_id,
-                shard_id=%tenant.tenant_shard_id().shard_slug(),
+                tenant_id=%tenant.tenant_id(),
                 max_layer_size,
                 "using max layer size as min_resident_size for tenant",
             );
@@ -832,175 +589,33 @@ async fn collect_eviction_candidates(
         // Sort layers most-recently-used first, then partition by
         // cumsum above/below min_resident_size.
         tenant_candidates
-            .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
+            .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts));
         let mut cumsum: i128 = 0;

-        // keeping the -1 or not decides if every tenant should lose their least recently accessed
-        // layer OR if this should happen in the order of having the highest layer count:
-        let fudge = if eviction_order.highest_layer_count_loses_first() {
-            // relative_age vs. tenant layer count:
-            // - 0.1..=1.0 (10 layers)
-            // - 0.01..=1.0 (100 layers)
-            // - 0.001..=1.0 (1000 layers)
-            //
-            // leading to evicting less of the smallest tenants.
-            0
-        } else {
-            // use the full 0.0..=1.0 range, which means even the smallest tenants could always lose a
-            // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
-            // be that less than 10k layer evictions is enough, so we would not need to evict from
-            // all tenants.
-            //
-            // as the tenant ordering is now deterministic this could hit the same tenants
-            // disproportionately on multiple invocations. an alternative could be to remember how many
-            // layers we evicted last time from this tenant, and inject that as an additional
-            // fudge here.
-            1
-        };
-
-        let total = tenant_candidates
-            .len()
-            .checked_sub(fudge)
-            .filter(|&x| x > 0)
-            // support 0 or 1 resident layer tenants as well
-            .unwrap_or(1);
-        let divider = total as f32;
-
-        for (i, mut candidate) in tenant_candidates.into_iter().enumerate() {
-            // as we iterate this reverse sorted list, the most recently accessed layer will always
-            // be 1.0; this is for us to evict it last.
-            candidate.relative_last_activity = if matches!(
-                eviction_order,
-                EvictionOrder::RelativeAccessed { .. }
-            ) {
-                // another possibility: use buckets, like (256.0 * relative_last_activity) as u8 or
-                // similarly for u16. unsure how it would help.
-                finite_f32::FiniteF32::try_from_normalized((total - i) as f32 / divider)
-                    .unwrap_or_else(|val| {
-                        tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={i}, total={total}: {val}");
-                        finite_f32::FiniteF32::ZERO
-                    })
-            } else {
-                finite_f32::FiniteF32::ZERO
+        for (timeline, layer_info) in tenant_candidates.into_iter() {
+            let file_size = layer_info.file_size();
+            let candidate = EvictionCandidate {
+                timeline,
+                last_activity_ts: layer_info.last_activity_ts,
+                layer: layer_info.layer,
             };

             let partition = if cumsum > min_resident_size as i128 {
                 MinResidentSizePartition::Above
             } else {
                 MinResidentSizePartition::Below
             };
-            cumsum += i128::from(candidate.layer.get_file_size());
             candidates.push((partition, candidate));
+            cumsum += i128::from(file_size);
         }
     }

-    // Note: the same tenant ID might be hit twice, if it transitions from attached to
-    // secondary while we run. That is okay: when we eventually try and run the eviction,
-    // the `Gate` on the object will ensure that whichever one has already been shut down
-    // will not delete anything.
-
-    let mut secondary_tenants = Vec::new();
-    tenant_manager.foreach_secondary_tenants(
-        |_tenant_shard_id: &TenantShardId, state: &Arc<SecondaryTenant>| {
-            secondary_tenants.push(state.clone());
-        },
-    );
-
-    for secondary_tenant in secondary_tenants {
-        let mut layer_info = secondary_tenant.get_layers_for_eviction();
-
-        layer_info
-            .resident_layers
-            .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
-
-        candidates.extend(layer_info.resident_layers.into_iter().map(|candidate| {
-            (
-                // Secondary locations' layers are always considered above the min resident size,
-                // i.e. secondary locations are permitted to be trimmed to zero layers if all
-                // the layers have sufficiently old access times.
-                MinResidentSizePartition::Above,
-                candidate,
-            )
-        }));
-    }
-
     debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
         "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");

-    // always behave as if AbsoluteAccessed was selected. if RelativeAccessed is in use, we
-    // will sort later by candidate.relative_last_activity to compare evictions.
     candidates
         .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));

     Ok(EvictionCandidates::Finished(candidates))
 }

-/// Given a pre-sorted vec of all layers in the system, select the first N which are enough to
-/// relieve pressure.
-///
-/// Returns the number of candidates selected, along with the planned usage.
-fn select_victims<U: Usage>(
-    candidates: &[(MinResidentSizePartition, EvictionCandidate)],
-    usage_pre: U,
-) -> VictimSelection<U> {
-    let mut usage_when_switched = None;
-    let mut usage_planned = usage_pre;
-    let mut evicted_amount = 0;
-
-    for (i, (partition, candidate)) in candidates.iter().enumerate() {
-        if !usage_planned.has_pressure() {
-            break;
-        }
-
-        if partition == &MinResidentSizePartition::Below && usage_when_switched.is_none() {
-            usage_when_switched = Some((usage_planned, i));
-        }
-
-        usage_planned.add_available_bytes(candidate.layer.get_file_size());
-        evicted_amount += 1;
-    }
-
-    VictimSelection {
-        amount: evicted_amount,
-        usage_pre,
-        usage_when_switched,
-        usage_planned,
-    }
-}
-
-struct VictimSelection<U> {
-    amount: usize,
-    usage_pre: U,
-    usage_when_switched: Option<(U, usize)>,
-    usage_planned: U,
-}
-
-impl<U: Usage> VictimSelection<U> {
-    fn into_amount_and_planned(self) -> (usize, PlannedUsage<U>) {
-        debug!(
-            evicted_amount=%self.amount,
-            "took enough candidates for pressure to be relieved"
-        );
-
-        if let Some((usage_planned, candidate_no)) = self.usage_when_switched.as_ref() {
-            warn!(usage_pre=?self.usage_pre, ?usage_planned, candidate_no, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
-        }
-
-        let planned = match self.usage_when_switched {
-            Some((respecting_tenant_min_resident_size, _)) => PlannedUsage {
-                respecting_tenant_min_resident_size,
-                fallback_to_global_lru: Some(self.usage_planned),
-            },
-            None => PlannedUsage {
-                respecting_tenant_min_resident_size: self.usage_planned,
-                fallback_to_global_lru: None,
-            },
-        };
-
-        (self.amount, planned)
-    }
-}
-
 struct TimelineKey(Arc<Timeline>);

 impl PartialEq for TimelineKey {
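The removed `select_victims` above is a two-phase walk: consume candidates in order until pressure is relieved, and remember whether the walk had to cross from the `Above` partition into the protected `Below` partition (the global-LRU fallback that the `warn!` reports). A hedged, self-contained sketch of that control flow (names invented):

```rust
// candidates: (is_below_partition, file_size), already sorted Above-first
fn plan_eviction(candidates: &[(bool, u64)], mut need: u64) -> (usize, bool) {
    let mut used_fallback = false;
    let mut amount = 0;
    for (below, size) in candidates {
        if need == 0 {
            break; // pressure relieved
        }
        if *below {
            used_fallback = true; // had to dip into protected layers
        }
        need = need.saturating_sub(*size);
        amount += 1;
    }
    (amount, used_fallback)
}

fn main() {
    // three evict-first layers, then two protected ones, 100 bytes each
    let c = [(false, 100), (false, 100), (false, 100), (true, 100), (true, 100)];
    assert_eq!(plan_eviction(&c, 250), (3, false)); // satisfied within `Above`
    assert_eq!(plan_eviction(&c, 450), (5, true)); // needed the fallback
}
```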
@@ -1025,197 +640,6 @@ impl std::ops::Deref for TimelineKey {
     }
 }

-/// A totally ordered f32 subset we can use with sorting functions.
-pub(crate) mod finite_f32 {
-
-    /// A totally ordered f32 subset we can use with sorting functions.
-    #[derive(Clone, Copy, PartialEq)]
-    pub struct FiniteF32(f32);
-
-    impl std::fmt::Debug for FiniteF32 {
-        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            std::fmt::Debug::fmt(&self.0, f)
-        }
-    }
-
-    impl std::fmt::Display for FiniteF32 {
-        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            std::fmt::Display::fmt(&self.0, f)
-        }
-    }
-
-    impl std::cmp::Eq for FiniteF32 {}
-
-    impl std::cmp::PartialOrd for FiniteF32 {
-        fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
-            Some(self.cmp(other))
-        }
-    }
-
-    impl std::cmp::Ord for FiniteF32 {
-        fn cmp(&self, other: &Self) -> std::cmp::Ordering {
-            self.0.total_cmp(&other.0)
-        }
-    }
-
-    impl TryFrom<f32> for FiniteF32 {
-        type Error = f32;
-
-        fn try_from(value: f32) -> Result<Self, Self::Error> {
-            if value.is_finite() {
-                Ok(FiniteF32(value))
-            } else {
-                Err(value)
-            }
-        }
-    }
-
-    impl FiniteF32 {
-        pub const ZERO: FiniteF32 = FiniteF32(0.0);
-
-        pub fn try_from_normalized(value: f32) -> Result<Self, f32> {
-            if (0.0..=1.0).contains(&value) {
-                // -0.0 is within the range, make sure it is assumed 0.0..=1.0
-                let value = value.abs();
-                Ok(FiniteF32(value))
-            } else {
-                Err(value)
-            }
-        }
-    }
-}
-
-mod summary {
-    use super::finite_f32::FiniteF32;
-    use super::{EvictionCandidate, LayerCount};
-    use pageserver_api::shard::TenantShardId;
-    use std::collections::{BTreeMap, HashMap};
-    use std::time::SystemTime;
-
-    #[derive(Debug, Default)]
-    pub(super) struct EvictionSummary {
-        evicted_per_tenant: HashMap<TenantShardId, LayerCount>,
-        total: LayerCount,
-
-        last_absolute: Option<SystemTime>,
-        last_relative: Option<FiniteF32>,
-    }
-
-    impl<'a> FromIterator<&'a EvictionCandidate> for EvictionSummary {
-        fn from_iter<T: IntoIterator<Item = &'a EvictionCandidate>>(iter: T) -> Self {
-            let mut summary = EvictionSummary::default();
-            for item in iter {
-                let counts = summary
-                    .evicted_per_tenant
-                    .entry(*item.layer.get_tenant_shard_id())
-                    .or_default();
-
-                let sz = item.layer.get_file_size();
-
-                counts.file_sizes += sz;
-                counts.count += 1;
-
-                summary.total.file_sizes += sz;
-                summary.total.count += 1;
-
-                summary.last_absolute = Some(item.last_activity_ts);
-                summary.last_relative = Some(item.relative_last_activity);
-            }
-
-            summary
-        }
-    }
-
-    struct SiBytesAmount(u64);
-
-    impl std::fmt::Display for SiBytesAmount {
-        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            if self.0 < 1024 {
-                return write!(f, "{}B", self.0);
-            }
-
-            let mut tmp = self.0;
-            let mut ch = 0;
-            let suffixes = b"KMGTPE";
-
-            while tmp > 1024 * 1024 && ch < suffixes.len() - 1 {
-                tmp /= 1024;
-                ch += 1;
-            }
-
-            let ch = suffixes[ch] as char;
-
-            write!(f, "{:.1}{ch}iB", tmp as f64 / 1024.0)
-        }
-    }
-
-    impl std::fmt::Display for EvictionSummary {
-        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            // wasteful, but it's for testing
-
-            let mut sorted: BTreeMap<usize, Vec<(TenantShardId, u64)>> = BTreeMap::new();
-
-            for (tenant_shard_id, count) in &self.evicted_per_tenant {
-                sorted
-                    .entry(count.count)
-                    .or_default()
-                    .push((*tenant_shard_id, count.file_sizes));
-            }
-
-            let total_file_sizes = SiBytesAmount(self.total.file_sizes);
-
-            writeln!(
-                f,
-                "selected {} layers of {total_file_sizes} up to ({:?}, {:.2?}):",
-                self.total.count, self.last_absolute, self.last_relative,
-            )?;
-
-            for (count, per_tenant) in sorted.iter().rev().take(10) {
-                write!(f, "- {count} layers: ")?;
-
-                if per_tenant.len() < 3 {
-                    for (i, (tenant_shard_id, bytes)) in per_tenant.iter().enumerate() {
-                        if i > 0 {
-                            write!(f, ", ")?;
-                        }
-                        let bytes = SiBytesAmount(*bytes);
-                        write!(f, "{tenant_shard_id} ({bytes})")?;
-                    }
-                } else {
-                    let num_tenants = per_tenant.len();
-                    let total_bytes = per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>();
-                    let total_bytes = SiBytesAmount(total_bytes);
-                    let layers = num_tenants * count;
-
-                    write!(
-                        f,
-                        "{num_tenants} tenants {total_bytes} in total {layers} layers",
-                    )?;
-                }
-
-                writeln!(f)?;
-            }
-
-            if sorted.len() > 10 {
-                let (rem_count, rem_bytes) = sorted
-                    .iter()
-                    .rev()
-                    .map(|(count, per_tenant)| {
-                        (
-                            count,
-                            per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>(),
-                        )
-                    })
-                    .fold((0, 0), |acc, next| (acc.0 + next.0, acc.1 + next.1));
-                let rem_bytes = SiBytesAmount(rem_bytes);
-                writeln!(f, "- rest of tenants ({}) not shown ({rem_count} layers or {:.1}%, {rem_bytes} or {:.1}% bytes)", sorted.len() - 10, 100.0 * rem_count as f64 / self.total.count as f64, 100.0 * rem_bytes.0 as f64 / self.total.file_sizes as f64)?;
-            }
-
-            Ok(())
-        }
-    }
-}
-
 mod filesystem_level_usage {
     use anyhow::Context;
     use camino::Utf8Path;
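Why the deleted `finite_f32` module existed: `f32` is not `Ord` (NaN breaks totality), so it cannot be used directly as a sort key for the relative-age ordering. A finite-only newtype over `f32::total_cmp` restores a total order. A minimal standalone version of the same idea:

```rust
#[derive(PartialEq)]
struct Finite(f32);

impl Eq for Finite {} // sound only because construction is restricted to finite values

impl PartialOrd for Finite {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for Finite {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.0.total_cmp(&other.0) // total order over all f32 bit patterns
    }
}

fn main() {
    let mut xs = vec![Finite(0.75), Finite(0.0), Finite(0.25)];
    xs.sort(); // would not compile with plain f32 keys
    assert!(xs.windows(2).all(|w| w[0] <= w[1]));
}
```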
@@ -1297,7 +721,6 @@ mod filesystem_level_usage {

     #[test]
     fn max_usage_pct_pressure() {
-        use super::EvictionOrder;
         use super::Usage as _;
         use std::time::Duration;
         use utils::serde_percent::Percent;
@@ -1309,7 +732,6 @@ mod filesystem_level_usage {
                 period: Duration::MAX,
                 #[cfg(feature = "testing")]
                 mock_statvfs: None,
-                eviction_order: EvictionOrder::default(),
             },
             total_bytes: 100_000,
             avail_bytes: 0,
86 pageserver/src/failpoint_support.rs Normal file
@@ -0,0 +1,86 @@
+/// use with fail::cfg("$name", "return(2000)")
+///
+/// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
+/// specified time (in milliseconds). The main difference is that we use the
+/// async tokio sleep function. Another difference is that we print lines to
+/// the log, which can be useful in tests to check that the failpoint was hit.
+#[macro_export]
+macro_rules! __failpoint_sleep_millis_async {
+    ($name:literal) => {{
+        // If the failpoint is used with a "return" action, set should_sleep to the
+        // returned value (as string). Otherwise it's set to None.
+        let should_sleep = (|| {
+            ::fail::fail_point!($name, |x| x);
+            ::std::option::Option::None
+        })();
+
+        // Sleep if the action was a returned value
+        if let ::std::option::Option::Some(duration_str) = should_sleep {
+            $crate::failpoint_support::failpoint_sleep_helper($name, duration_str).await
+        }
+    }};
+}
+pub use __failpoint_sleep_millis_async as sleep_millis_async;
+
+// Helper function used by the macro. (A function has nicer scoping so we
+// don't need to decorate everything with "::")
+#[doc(hidden)]
+pub(crate) async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
+    let millis = duration_str.parse::<u64>().unwrap();
+    let d = std::time::Duration::from_millis(millis);
+
+    tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
+    tokio::time::sleep(d).await;
+    tracing::info!("failpoint {:?}: sleep done", name);
+}
+
+pub fn init() -> fail::FailScenario<'static> {
+    // The failpoints lib provides support for parsing the `FAILPOINTS` env var.
+    // We want non-default behavior for `exit`, though, so we handle it separately.
+    //
+    // Format for FAILPOINTS is "name=actions" separated by ";".
+    let actions = std::env::var("FAILPOINTS");
+    if actions.is_ok() {
+        std::env::remove_var("FAILPOINTS");
+    } else {
+        // let the library handle non-utf8, or nothing for not present
+    }
+
+    let scenario = fail::FailScenario::setup();
+
+    if let Ok(val) = actions {
+        val.split(';')
+            .enumerate()
+            .map(|(i, s)| s.split_once('=').ok_or((i, s)))
+            .for_each(|res| {
+                let (name, actions) = match res {
+                    Ok(t) => t,
+                    Err((i, s)) => {
+                        panic!(
+                            "startup failpoints: missing action on the {}th failpoint; try `{s}=return`",
+                            i + 1,
+                        );
+                    }
+                };
+                if let Err(e) = apply_failpoint(name, actions) {
+                    panic!("startup failpoints: failed to apply failpoint {name}={actions}: {e}");
+                }
+            });
+    }
+
+    scenario
+}
+
+pub(crate) fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> {
+    if actions == "exit" {
+        fail::cfg_callback(name, exit_failpoint)
+    } else {
+        fail::cfg(name, actions)
+    }
+}
+
+#[inline(never)]
+fn exit_failpoint() {
+    tracing::info!("Exit requested by failpoint");
+    std::process::exit(1);
+}
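How the sleep failpoint in this new file is meant to be used: a sketch, with invented failpoint and function names; inside the pageserver crate the macro is reachable via `crate::failpoint_support::sleep_millis_async!`.

```rust
// Run with FAILPOINTS="my-failpoint=return(2000)", or configure at runtime
// with fail::cfg("my-failpoint", "return(2000)").
async fn do_work() {
    // With the failpoint armed this awaits a 2000 ms tokio sleep and logs
    // before/after; with the failpoint unset it does nothing.
    crate::failpoint_support::sleep_millis_async!("my-failpoint");
    // ... the actual work would follow here ...
}
```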
@@ -159,12 +159,6 @@ paths:
             application/json:
               schema:
                 $ref: "#/components/schemas/ConflictError"
-        "412":
-          description: Deletion may not proceed, tenant is not in Active state
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/PreconditionFailedError"
         "500":
           description: Generic operation error
           content:
@@ -14,10 +14,7 @@ use hyper::header;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
-use pageserver_api::models::LocationConfigListResponse;
-use pageserver_api::models::ShardParameters;
 use pageserver_api::models::TenantDetails;
-use pageserver_api::models::TenantState;
 use pageserver_api::models::{
     DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
     TenantLoadRequest, TenantLocationConfigRequest,
@@ -28,7 +25,6 @@ use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::auth::JwtAuth;
-use utils::failpoint_support::failpoints_handler;
 use utils::http::endpoint::request_span;
 use utils::http::json::json_request_or_empty_body;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
@@ -44,14 +40,12 @@ use crate::tenant::mgr::{
     GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
     TenantSlotError, TenantSlotUpsertError, TenantStateError,
 };
-use crate::tenant::mgr::{TenantSlot, UpsertLocationError};
 use crate::tenant::secondary::SecondaryController;
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::timeline::CompactFlags;
 use crate::tenant::timeline::Timeline;
-use crate::tenant::SpawnMode;
-use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
+use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
 use crate::{config::PageServerConf, tenant::mgr};
 use crate::{disk_usage_eviction_task, tenant};
 use pageserver_api::models::{
@@ -72,6 +66,9 @@ use utils::{
     lsn::Lsn,
 };

+// Imports only used for testing APIs
+use pageserver_api::models::ConfigureFailpointsRequest;
+
 // For APIs that require an Active tenant, how long should we block waiting for that state?
 // This is not functionally necessary (clients will retry), but avoids generating a lot of
 // failed API calls while tenants are activating.
@@ -117,6 +114,14 @@ impl State {
             secondary_controller,
         })
     }
+
+    fn tenant_resources(&self) -> TenantSharedResources {
+        TenantSharedResources {
+            broker_client: self.broker_client.clone(),
+            remote_storage: self.remote_storage.clone(),
+            deletion_queue_client: self.deletion_queue_client.clone(),
+        }
+    }
 }

 #[inline(always)]
@@ -149,7 +154,6 @@ impl From<PageReconstructError> for ApiError {
             PageReconstructError::AncestorStopping(_) => {
                 ApiError::ResourceUnavailable(format!("{pre}").into())
             }
-            PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
             PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
         }
     }
@@ -172,7 +176,7 @@ impl From<TenantSlotError> for ApiError {
             NotFound(tenant_id) => {
                 ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into())
             }
-            e @ AlreadyExists(_, _) => ApiError::Conflict(format!("{e}")),
+            e @ (AlreadyExists(_, _) | Conflict(_)) => ApiError::Conflict(format!("{e}")),
             InProgress => {
                 ApiError::ResourceUnavailable("Tenant is being modified concurrently".into())
             }
@@ -191,18 +195,6 @@ impl From<TenantSlotUpsertError> for ApiError {
     }
 }

-impl From<UpsertLocationError> for ApiError {
-    fn from(e: UpsertLocationError) -> ApiError {
-        use UpsertLocationError::*;
-        match e {
-            BadRequest(e) => ApiError::BadRequest(e),
-            Unavailable(_) => ApiError::ShuttingDown,
-            e @ InProgress => ApiError::Conflict(format!("{e}")),
-            Flush(e) | Other(e) => ApiError::InternalServerError(e),
-        }
-    }
-}
-
 impl From<TenantMapError> for ApiError {
     fn from(e: TenantMapError) -> ApiError {
         use TenantMapError::*;
@@ -267,7 +259,7 @@ impl From<SetNewTenantConfigError> for ApiError {
             SetNewTenantConfigError::GetTenant(tid) => {
                 ApiError::NotFound(anyhow!("tenant {}", tid).into())
             }
-            e @ (SetNewTenantConfigError::Persist(_) | SetNewTenantConfigError::Other(_)) => {
+            e @ SetNewTenantConfigError::Persist(_) => {
                 ApiError::InternalServerError(anyhow::Error::new(e))
             }
         }
@@ -316,7 +308,6 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
             SlotUpsertError(e) => e.into(),
             Other(o) => ApiError::InternalServerError(o),
             e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
-            Cancelled => ApiError::ShuttingDown,
         }
     }
 }
@@ -325,21 +316,11 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
 async fn build_timeline_info(
     timeline: &Arc<Timeline>,
     include_non_incremental_logical_size: bool,
-    force_await_initial_logical_size: bool,
     ctx: &RequestContext,
 ) -> anyhow::Result<TimelineInfo> {
     crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();

-    if force_await_initial_logical_size {
-        timeline.clone().await_initial_logical_size().await
-    }
-
-    let mut info = build_timeline_info_common(
-        timeline,
-        ctx,
-        tenant::timeline::GetLogicalSizePriority::Background,
-    )
-    .await?;
+    let mut info = build_timeline_info_common(timeline, ctx).await?;
     if include_non_incremental_logical_size {
         // XXX we should be using spawn_ondemand_logical_size_calculation here.
         // Otherwise, if someone deletes the timeline / detaches the tenant while
@@ -356,7 +337,6 @@ async fn build_timeline_info(
 async fn build_timeline_info_common(
     timeline: &Arc<Timeline>,
     ctx: &RequestContext,
-    logical_size_task_priority: tenant::timeline::GetLogicalSizePriority,
 ) -> anyhow::Result<TimelineInfo> {
     crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
     let initdb_lsn = timeline.initdb_lsn;
@@ -379,7 +359,8 @@ async fn build_timeline_info_common(
         Lsn(0) => None,
         lsn @ Lsn(_) => Some(lsn),
     };
-    let current_logical_size = timeline.get_current_logical_size(logical_size_task_priority, ctx);
+    let current_logical_size =
+        timeline.get_current_logical_size(tenant::timeline::GetLogicalSizePriority::User, ctx);
     let current_physical_size = Some(timeline.layer_size_sum().await);
     let state = timeline.current_state();
     let remote_consistent_lsn_projected = timeline
@@ -490,7 +471,7 @@ async fn timeline_create_handler(
         .await {
             Ok(new_timeline) => {
                 // Created. Construct a TimelineInfo for it.
-                let timeline_info = build_timeline_info_common(&new_timeline, &ctx, tenant::timeline::GetLogicalSizePriority::User)
+                let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
                     .await
                     .map_err(ApiError::InternalServerError)?;
                 json_response(StatusCode::CREATED, timeline_info)
@@ -526,8 +507,6 @@ async fn timeline_list_handler(
     let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
     let include_non_incremental_logical_size: Option<bool> =
         parse_query_param(&request, "include-non-incremental-logical-size")?;
-    let force_await_initial_logical_size: Option<bool> =
-        parse_query_param(&request, "force-await-initial-logical-size")?;
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;

     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
@@ -541,7 +520,6 @@ async fn timeline_list_handler(
             let timeline_info = build_timeline_info(
                 &timeline,
                 include_non_incremental_logical_size.unwrap_or(false),
-                force_await_initial_logical_size.unwrap_or(false),
                 &ctx,
             )
             .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
@@ -569,8 +547,6 @@ async fn timeline_detail_handler(
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     let include_non_incremental_logical_size: Option<bool> =
         parse_query_param(&request, "include-non-incremental-logical-size")?;
-    let force_await_initial_logical_size: Option<bool> =
-        parse_query_param(&request, "force-await-initial-logical-size")?;
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;

     // Logical size calculation needs downloading.
@@ -586,7 +562,6 @@ async fn timeline_detail_handler(
         let timeline_info = build_timeline_info(
             &timeline,
             include_non_incremental_logical_size.unwrap_or(false),
-            force_await_initial_logical_size.unwrap_or(false),
            &ctx,
         )
         .await
@@ -705,39 +680,16 @@ async fn tenant_attach_handler(
         )));
     }

-    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-    let shard_params = ShardParameters::default();
-    let location_conf = LocationConf::attached_single(tenant_conf, generation, &shard_params);
-
-    let tenant = state
-        .tenant_manager
-        .upsert_location(
-            tenant_shard_id,
-            location_conf,
-            None,
-            SpawnMode::Normal,
-            &ctx,
-        )
-        .await?;
-
-    let Some(tenant) = tenant else {
-        // This should never happen: indicates a bug in upsert_location
-        return Err(ApiError::InternalServerError(anyhow::anyhow!(
-            "Upsert succeeded but didn't return tenant!"
-        )));
-    };
-
-    // We might have successfully constructed a Tenant, but it could still
-    // end up in a broken state:
-    if let TenantState::Broken {
-        reason,
-        backtrace: _,
-    } = tenant.current_state()
-    {
-        return Err(ApiError::InternalServerError(anyhow::anyhow!(
-            "Tenant state is Broken: {reason}"
-        )));
-    }
+    mgr::attach_tenant(
+        state.conf,
+        tenant_id,
+        generation,
+        tenant_conf,
+        state.tenant_resources(),
+        &ctx,
+    )
+    .instrument(info_span!("tenant_attach", %tenant_id))
+    .await?;

     json_response(StatusCode::ACCEPTED, ())
 }
@@ -878,12 +830,11 @@ async fn tenant_list_handler(
             ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into())
         })?
         .iter()
-        .map(|(id, state, gen)| TenantInfo {
+        .map(|(id, state)| TenantInfo {
             id: *id,
             state: state.clone(),
             current_physical_size: None,
             attachment_status: state.attachment_status(),
-            generation: (*gen).into(),
         })
         .collect::<Vec<TenantInfo>>();
@@ -913,7 +864,6 @@ async fn tenant_status(
             state: state.clone(),
             current_physical_size: Some(current_physical_size),
             attachment_status: state.attachment_status(),
-            generation: tenant.generation().into(),
         },
         timelines: tenant.list_timeline_ids(),
     })
@@ -936,9 +886,7 @@ async fn tenant_delete_handler(

     let state = get_state(&request);

-    state
-        .tenant_manager
-        .delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT)
+    mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_shard_id)
         .instrument(info_span!("tenant_delete_handler",
             tenant_id = %tenant_shard_id.tenant_id,
             shard = %tenant_shard_id.shard_slug()
@@ -1198,26 +1146,17 @@ async fn tenant_create_handler(

     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

-    let location_conf =
-        LocationConf::attached_single(tenant_conf, generation, &request_data.shard_parameters);
-
-    let new_tenant = state
-        .tenant_manager
-        .upsert_location(
-            target_tenant_id,
-            location_conf,
-            None,
-            SpawnMode::Create,
-            &ctx,
-        )
-        .await?;
-
-    let Some(new_tenant) = new_tenant else {
-        // This should never happen: indicates a bug in upsert_location
-        return Err(ApiError::InternalServerError(anyhow::anyhow!(
-            "Upsert succeeded but didn't return tenant!"
-        )));
-    };
+    let new_tenant = mgr::create_tenant(
+        state.conf,
+        tenant_conf,
+        target_tenant_id,
+        generation,
+        state.tenant_resources(),
+        &ctx,
+    )
+    .instrument(info_span!("tenant_create", tenant_id = %target_tenant_id))
+    .await?;
+
     // We created the tenant. Existing API semantics are that the tenant
     // is Active when this function returns.
     if let res @ Err(_) = new_tenant
@@ -1225,7 +1164,7 @@ async fn tenant_create_handler(
         .await
     {
         // This shouldn't happen because we just created the tenant directory
-        // in upsert_location, and there aren't any remote timelines
+        // in tenant::mgr::create_tenant, and there aren't any remote timelines
         // to load, so, nothing can really fail during load.
         // Don't do cleanup because we don't know how we got here.
         // The tenant will likely be in `Broken` state and subsequent
@@ -1236,7 +1175,7 @@ async fn tenant_create_handler(

     json_response(
         StatusCode::CREATED,
-        TenantCreateResponse(new_tenant.tenant_shard_id().tenant_id),
+        TenantCreateResponse(new_tenant.tenant_id()),
     )
 }
@@ -1326,57 +1265,16 @@ async fn put_tenant_location_config_handler(

     state
         .tenant_manager
-        .upsert_location(
-            tenant_shard_id,
-            location_conf,
-            flush,
-            tenant::SpawnMode::Normal,
-            &ctx,
-        )
-        .await?;
-
-    if let Some(_flush_ms) = flush {
-        match state
-            .secondary_controller
-            .upload_tenant(tenant_shard_id)
-            .await
-        {
-            Ok(()) => {
-                tracing::info!("Uploaded heatmap during flush");
-            }
-            Err(e) => {
-                tracing::warn!("Failed to flush heatmap: {e}");
-            }
-        }
-    } else {
-        tracing::info!("No flush requested when configuring");
-    }
+        .upsert_location(tenant_shard_id, location_conf, flush, &ctx)
+        .await
+        // TODO: badrequest assumes the caller was asking for something unreasonable, but in
+        // principle we might have hit something like concurrent API calls to the same tenant,
+        // which is not a 400 but a 409.
+        .map_err(ApiError::BadRequest)?;

     json_response(StatusCode::OK, ())
 }

-async fn list_location_config_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let state = get_state(&request);
-    let slots = state.tenant_manager.list();
-    let result = LocationConfigListResponse {
-        tenant_shards: slots
-            .into_iter()
-            .map(|(tenant_shard_id, slot)| {
-                let v = match slot {
-                    TenantSlot::Attached(t) => Some(t.get_location_conf()),
-                    TenantSlot::Secondary(s) => Some(s.get_location_conf()),
-                    TenantSlot::InProgress(_) => None,
-                };
-                (tenant_shard_id, v)
-            })
-            .collect(),
-    };
-    json_response(StatusCode::OK, result)
-}
-
 /// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`].
 async fn handle_tenant_break(
     r: Request<Body>,
@@ -1392,6 +1290,34 @@ async fn handle_tenant_break(
     json_response(StatusCode::OK, ())
 }

+async fn failpoints_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    if !fail::has_failpoints() {
+        return Err(ApiError::BadRequest(anyhow!(
+            "Cannot manage failpoints because pageserver was compiled without failpoints support"
+        )));
+    }
+
+    let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
+    for fp in failpoints {
+        info!("cfg failpoint: {} {}", fp.name, fp.actions);
+
+        // We recognize one extra "action" that the failpoints crate does not
+        // natively support: exit, to immediately kill the process
+        let cfg_result = crate::failpoint_support::apply_failpoint(&fp.name, &fp.actions);
+
+        if let Err(err_msg) = cfg_result {
+            return Err(ApiError::BadRequest(anyhow!(
+                "Failed to configure failpoints: {err_msg}"
+            )));
+        }
+    }
+
+    json_response(StatusCode::OK, ())
+}
+
 // Run GC immediately on given timeline.
 async fn timeline_gc_handler(
     mut request: Request<Body>,
@@ -1640,22 +1566,19 @@ async fn disk_usage_eviction_run(
     struct Config {
         /// How many bytes to evict before reporting that pressure is relieved.
         evict_bytes: u64,
-
-        #[serde(default)]
-        eviction_order: crate::disk_usage_eviction_task::EvictionOrder,
     }

     #[derive(Debug, Clone, Copy, serde::Serialize)]
     struct Usage {
         // remains unchanged after instantiation of the struct
-        evict_bytes: u64,
+        config: Config,
         // updated by `add_available_bytes`
         freed_bytes: u64,
     }

     impl crate::disk_usage_eviction_task::Usage for Usage {
         fn has_pressure(&self) -> bool {
-            self.evict_bytes > self.freed_bytes
+            self.config.evict_bytes > self.freed_bytes
         }

         fn add_available_bytes(&mut self, bytes: u64) {
@@ -1666,7 +1589,7 @@ async fn disk_usage_eviction_run(
|
|||||||
let config = json_request::<Config>(&mut r).await?;
|
let config = json_request::<Config>(&mut r).await?;
|
||||||
|
|
||||||
let usage = Usage {
|
let usage = Usage {
|
||||||
evict_bytes: config.evict_bytes,
|
config,
|
||||||
freed_bytes: 0,
|
freed_bytes: 0,
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -1678,15 +1601,10 @@ async fn disk_usage_eviction_run(
|
|||||||
)));
|
)));
|
||||||
};
|
};
|
||||||
|
|
||||||
let eviction_state = state.disk_usage_eviction_state.clone();
|
let state = state.disk_usage_eviction_state.clone();
|
||||||
|
|
||||||
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
|
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
|
||||||
&eviction_state,
|
&state, storage, usage, &cancel,
|
||||||
storage,
|
|
||||||
usage,
|
|
||||||
&state.tenant_manager,
|
|
||||||
config.eviction_order,
|
|
||||||
&cancel,
|
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
|
|
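The `Usage` trait shown in these hunks is the contract the eviction iteration runs against: keep freeing bytes until `has_pressure()` turns false. A self-contained sketch of that accounting, using only what the hunks themselves show:

```rust
/// The contract implied by the hunks above: the eviction loop frees bytes
/// until pressure is relieved.
trait Usage {
    fn has_pressure(&self) -> bool;
    fn add_available_bytes(&mut self, bytes: u64);
}

struct EvictBytes {
    /// how many bytes to evict before reporting that pressure is relieved
    evict_bytes: u64,
    /// updated as layers are evicted
    freed_bytes: u64,
}

impl Usage for EvictBytes {
    fn has_pressure(&self) -> bool {
        self.evict_bytes > self.freed_bytes
    }
    fn add_available_bytes(&mut self, bytes: u64) {
        self.freed_bytes += bytes;
    }
}

fn main() {
    let mut usage = EvictBytes { evict_bytes: 100, freed_bytes: 0 };
    // Pretend each candidate layer we evict frees 40 bytes.
    for layer_size in [40u64, 40, 40, 40] {
        if !usage.has_pressure() {
            break; // stop as soon as the requested amount has been freed
        }
        usage.add_available_bytes(layer_size);
    }
    assert_eq!(usage.freed_bytes, 120);
    assert!(!usage.has_pressure());
}
```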
@@ -1712,21 +1630,6 @@ async fn secondary_upload_handler(
     json_response(StatusCode::OK, ())
 }

-async fn secondary_download_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let state = get_state(&request);
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    state
-        .secondary_controller
-        .download_tenant(tenant_shard_id)
-        .await
-        .map_err(ApiError::InternalServerError)?;
-
-    json_response(StatusCode::OK, ())
-}
-
 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
     json_response(
         StatusCode::NOT_FOUND,
@@ -1919,9 +1822,6 @@ pub fn make_router(
         .put("/v1/tenant/:tenant_shard_id/location_config", |r| {
             api_handler(r, put_tenant_location_config_handler)
         })
-        .get("/v1/location_config", |r| {
-            api_handler(r, list_location_config_handler)
-        })
        .get("/v1/tenant/:tenant_shard_id/timeline", |r| {
             api_handler(r, timeline_list_handler)
         })
@@ -1998,9 +1898,6 @@ pub fn make_router(
         .put("/v1/deletion_queue/flush", |r| {
             api_handler(r, deletion_queue_flush)
         })
-        .post("/v1/tenant/:tenant_shard_id/secondary/download", |r| {
-            api_handler(r, secondary_download_handler)
-        })
         .put("/v1/tenant/:tenant_shard_id/break", |r| {
             testing_api_handler("set tenant state to broken", r, handle_tenant_break)
         })
@@ -21,7 +21,6 @@ use tracing::*;
 use walkdir::WalkDir;

 use crate::context::RequestContext;
-use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
 use crate::tenant::remote_timeline_client::INITDB_PATH;
 use crate::tenant::Timeline;
@@ -313,16 +312,13 @@ async fn import_wal(
         waldecoder.feed_bytes(&buf);

         let mut nrecords = 0;
-        let mut modification = tline.begin_modification(last_lsn);
+        let mut modification = tline.begin_modification(endpoint);
         let mut decoded = DecodedWALRecord::default();
         while last_lsn <= endpoint {
             if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                 walingest
                     .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
                     .await?;
-                WAL_INGEST.records_committed.inc();
-
-                modification.commit(ctx).await?;
                 last_lsn = lsn;

                 nrecords += 1;
@@ -452,14 +448,13 @@ pub async fn import_wal_from_tar(

         waldecoder.feed_bytes(&bytes[offset..]);

-        let mut modification = tline.begin_modification(last_lsn);
+        let mut modification = tline.begin_modification(end_lsn);
         let mut decoded = DecodedWALRecord::default();
         while last_lsn <= end_lsn {
             if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                 walingest
                     .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
                     .await?;
-                modification.commit(ctx).await?;
                 last_lsn = lsn;

                 debug!("imported record at {} (end {})", lsn, end_lsn);
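Both hunks make the same change: the `-` side opens the modification at `last_lsn` and commits after every ingested record, while the `+` side opens it at the end LSN and leaves commit out of the loop (the final commit is not visible in the hunks, but opening at the end LSN implies a single commit after the loop). A hypothetical sketch contrasting the two commit strategies; the types are stand-ins, not the pageserver's real API:

```rust
// Stand-in types only; the real code ingests WAL records into a pageserver
// timeline modification.
struct Modification {
    commits: usize,
}

impl Modification {
    fn ingest_record(&mut self, _lsn: u64) {}
    fn commit(&mut self) {
        self.commits += 1;
    }
}

fn import(records: &[u64], commit_per_record: bool) -> usize {
    let mut modification = Modification { commits: 0 };
    for &lsn in records {
        modification.ingest_record(lsn);
        if commit_per_record {
            // `-` side: one commit (and one metric bump) per record.
            modification.commit();
        }
    }
    if !commit_per_record {
        // `+` side (assumed): the modification spans the whole import and
        // is committed once, at the end LSN.
        modification.commit();
    }
    modification.commits
}

fn main() {
    assert_eq!(import(&[1, 2, 3], true), 3);
    assert_eq!(import(&[1, 2, 3], false), 1);
}
```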
@@ -25,6 +25,8 @@ pub mod walingest;
 pub mod walrecord;
 pub mod walredo;

+pub mod failpoint_support;
+
 use crate::task_mgr::TaskKind;
 use camino::Utf8Path;
 use deletion_queue::DeletionQueue;
@@ -117,10 +119,6 @@ pub const TENANT_CONFIG_NAME: &str = "config";
 /// Full path: `tenants/<tenant_id>/config`.
 pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";

-/// Per-tenant copy of their remote heatmap, downloaded into the local
-/// tenant path while in secondary mode.
-pub const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
-
 /// A suffix used for various temporary files. Any temporary files found in the
 /// data directory at pageserver startup can be automatically removed.
 pub const TEMP_FILE_SUFFIX: &str = "___temp";
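`TEMP_FILE_SUFFIX` is the marker for the startup cleanup its doc comment describes. A minimal sketch of such a sweep, assuming a hypothetical data-directory path; the real pageserver's cleanup logic is more involved:

```rust
use std::fs;
use std::path::Path;

// Any file whose name ends in this suffix is a leftover from an interrupted
// operation and can be deleted safely at startup.
const TEMP_FILE_SUFFIX: &str = "___temp";

fn remove_temp_files(dir: &Path) -> std::io::Result<()> {
    for entry in fs::read_dir(dir)? {
        let entry = entry?;
        if entry.file_name().to_string_lossy().ends_with(TEMP_FILE_SUFFIX) {
            // Ignore individual failures; a later cleanup pass can retry.
            let _ = fs::remove_file(entry.path());
        }
    }
    Ok(())
}

fn main() -> std::io::Result<()> {
    // Hypothetical data directory, for illustration only.
    remove_temp_files(Path::new("/tmp/pageserver-data"))?;
    Ok(())
}
```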
@@ -11,7 +11,7 @@ use once_cell::sync::Lazy;
 use pageserver_api::shard::TenantShardId;
 use strum::{EnumCount, IntoEnumIterator, VariantNames};
 use strum_macros::{EnumVariantNames, IntoStaticStr};
-use utils::id::TimelineId;
+use utils::id::{TenantId, TimelineId};

 /// Prometheus histogram buckets (in seconds) for operations in the critical
 /// path. In other words, operations that directly affect that latency of user
@@ -29,7 +29,7 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[
 // Metrics collected on operations on the storage repository.
 #[derive(Debug, EnumVariantNames, IntoStaticStr)]
 #[strum(serialize_all = "kebab_case")]
-pub(crate) enum StorageTimeOperation {
+pub enum StorageTimeOperation {
     #[strum(serialize = "layer flush")]
     LayerFlush,

@@ -55,20 +55,20 @@ pub(crate) enum StorageTimeOperation {
     CreateTenant,
 }

-pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
+pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
     register_counter_vec!(
         "pageserver_storage_operations_seconds_sum",
         "Total time spent on storage operations with operation, tenant and timeline dimensions",
-        &["operation", "tenant_id", "shard_id", "timeline_id"],
+        &["operation", "tenant_id", "timeline_id"],
     )
     .expect("failed to define a metric")
 });

-pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
+pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
     register_int_counter_vec!(
         "pageserver_storage_operations_seconds_count",
         "Count of storage operations with operation, tenant and timeline dimensions",
-        &["operation", "tenant_id", "shard_id", "timeline_id"],
+        &["operation", "tenant_id", "timeline_id"],
     )
     .expect("failed to define a metric")
 });
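Most of the remaining metrics hunks are this same mechanical change: one side registers a `shard_id` label, the other does not. Because prometheus label values are positional, a label-set change has to be applied to every `get_metric_with_label_values` and `remove_label_values` call site in one sweep. A sketch with the `prometheus` and `once_cell` crates; the metric name and label values here are examples, not the pageserver's:

```rust
use once_cell::sync::Lazy;
use prometheus::{register_int_counter_vec, IntCounterVec};

// Example counter with the three-label shape used on the `-` side.
static OPS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "example_storage_operations_total",
        "Example counter with tenant, shard and timeline dimensions",
        &["tenant_id", "shard_id", "timeline_id"]
    )
    .expect("failed to define a metric")
});

fn main() {
    // Values are positional: three label names require exactly three values.
    OPS.get_metric_with_label_values(&["tenant-a", "0001", "tl-1"])
        .unwrap()
        .inc();

    // Passing the two-label shape against a three-label vec fails, which is
    // why dropping (or adding) `shard_id` must touch every call site at once.
    assert!(OPS
        .get_metric_with_label_values(&["tenant-a", "tl-1"])
        .is_err());

    // Deleting a series uses the same positional values.
    OPS.remove_label_values(&["tenant-a", "0001", "tl-1"]).unwrap();
}
```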
@@ -150,7 +150,7 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
     .expect("failed to define a metric")
 });

-pub(crate) struct PageCacheMetricsForTaskKind {
+pub struct PageCacheMetricsForTaskKind {
     pub read_accesses_materialized_page: IntCounter,
     pub read_accesses_immutable: IntCounter,

@@ -159,7 +159,7 @@ pub(crate) struct PageCacheMetricsForTaskKind {
     pub read_hits_materialized_page_older_lsn: IntCounter,
 }

-pub(crate) struct PageCacheMetrics {
+pub struct PageCacheMetrics {
     map: EnumMap<TaskKind, EnumMap<PageContentKind, PageCacheMetricsForTaskKind>>,
 }

@@ -181,7 +181,7 @@ static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });

-pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
+pub static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
     map: EnumMap::from_array(std::array::from_fn(|task_kind| {
         let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind);
         let task_kind: &'static str = task_kind.into();
@@ -243,9 +243,10 @@ impl PageCacheMetrics {
     }
 }

-pub(crate) struct PageCacheSizeMetrics {
+pub struct PageCacheSizeMetrics {
     pub max_bytes: UIntGauge,

+    pub current_bytes_ephemeral: UIntGauge,
     pub current_bytes_immutable: UIntGauge,
     pub current_bytes_materialized_page: UIntGauge,
 }
@@ -259,26 +260,31 @@ static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });

-pub(crate) static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> =
-    Lazy::new(|| PageCacheSizeMetrics {
-        max_bytes: {
-            register_uint_gauge!(
-                "pageserver_page_cache_size_max_bytes",
-                "Maximum size of the page cache in bytes"
-            )
-            .expect("failed to define a metric")
-        },
-        current_bytes_immutable: {
-            PAGE_CACHE_SIZE_CURRENT_BYTES
-                .get_metric_with_label_values(&["immutable"])
-                .unwrap()
-        },
-        current_bytes_materialized_page: {
-            PAGE_CACHE_SIZE_CURRENT_BYTES
-                .get_metric_with_label_values(&["materialized_page"])
-                .unwrap()
-        },
-    });
+pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheSizeMetrics {
+    max_bytes: {
+        register_uint_gauge!(
+            "pageserver_page_cache_size_max_bytes",
+            "Maximum size of the page cache in bytes"
+        )
+        .expect("failed to define a metric")
+    },
+    current_bytes_ephemeral: {
+        PAGE_CACHE_SIZE_CURRENT_BYTES
+            .get_metric_with_label_values(&["ephemeral"])
+            .unwrap()
+    },
+    current_bytes_immutable: {
+        PAGE_CACHE_SIZE_CURRENT_BYTES
+            .get_metric_with_label_values(&["immutable"])
+            .unwrap()
+    },
+    current_bytes_materialized_page: {
+        PAGE_CACHE_SIZE_CURRENT_BYTES
+            .get_metric_with_label_values(&["materialized_page"])
+            .unwrap()
+    },
+});

 pub(crate) mod page_cache_eviction_metrics {
     use std::num::NonZeroUsize;
@@ -337,6 +343,15 @@ pub(crate) mod page_cache_eviction_metrics {
     }
 }

+pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_page_cache_acquire_pinned_slot_seconds",
+        "Time spent acquiring a pinned slot in the page cache",
+        CRITICAL_OP_BUCKETS.into(),
+    )
+    .expect("failed to define a metric")
+});
+
 static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
     register_int_counter_vec!(
         "page_cache_errors_total",
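The histogram added here is driven by a timer in `try_get_pinned_slot_permit` (see the page_cache.rs hunks at the end of this diff): started before the wait, discarded on timeout. With the `prometheus` crate, a `HistogramTimer` records its elapsed time on drop unless explicitly discarded, which is the behavior the timeout path relies on. A sketch with an example metric name:

```rust
use once_cell::sync::Lazy;
use prometheus::{register_histogram, Histogram};

static ACQUIRE_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "example_acquire_seconds",
        "Time spent acquiring an example resource"
    )
    .expect("failed to define a metric")
});

fn main() {
    {
        // Success path: the timer observes its elapsed time when dropped.
        let _timer = ACQUIRE_TIME.start_timer();
    }

    // Timeout path: discard the measurement instead of recording it, so
    // aborted waits don't pollute the latency distribution.
    let timer = ACQUIRE_TIME.start_timer();
    timer.stop_and_discard();

    // Only the success path produced an observation.
    assert_eq!(ACQUIRE_TIME.get_sample_count(), 1);
}
```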
@@ -373,7 +388,7 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
     register_int_gauge_vec!(
         "pageserver_last_record_lsn",
         "Last record LSN grouped by timeline",
-        &["tenant_id", "shard_id", "timeline_id"]
+        &["tenant_id", "timeline_id"]
     )
     .expect("failed to define a metric")
 });
@@ -382,7 +397,7 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     register_uint_gauge_vec!(
         "pageserver_resident_physical_size",
         "The size of the layer files present in the pageserver's filesystem.",
-        &["tenant_id", "shard_id", "timeline_id"]
+        &["tenant_id", "timeline_id"]
     )
     .expect("failed to define a metric")
 });
@@ -400,7 +415,7 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
         "pageserver_remote_physical_size",
         "The size of the layer files present in the remote storage that are listed in the the remote index_part.json.",
         // Corollary: If any files are missing from the index part, they won't be included here.
-        &["tenant_id", "shard_id", "timeline_id"]
+        &["tenant_id", "timeline_id"]
     )
     .expect("failed to define a metric")
 });
@@ -433,7 +448,7 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     register_uint_gauge_vec!(
         "pageserver_current_logical_size",
         "Current logical size grouped by timeline",
-        &["tenant_id", "shard_id", "timeline_id"]
+        &["tenant_id", "timeline_id"]
     )
     .expect("failed to define current logical size metric")
 });
@@ -582,7 +597,7 @@ pub(crate) static BROKEN_TENANTS_SET: Lazy<UIntGaugeVec> = Lazy::new(|| {
     register_uint_gauge_vec!(
         "pageserver_broken_tenants_count",
         "Set of broken tenants",
-        &["tenant_id", "shard_id"]
+        &["tenant_id"]
     )
     .expect("Failed to register pageserver_tenant_states_count metric")
 });
@@ -602,7 +617,7 @@ static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounterVec> = Lazy::new(|| {
     register_int_counter_vec!(
         "pageserver_created_persistent_files_total",
         "Number of files created that are meant to be uploaded to cloud storage",
-        &["tenant_id", "shard_id", "timeline_id"]
+        &["tenant_id", "timeline_id"]
     )
     .expect("failed to define a metric")
 });
@@ -611,7 +626,7 @@ static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
     register_int_counter_vec!(
         "pageserver_written_persistent_bytes_total",
         "Total bytes written that are meant to be uploaded to cloud storage",
-        &["tenant_id", "shard_id", "timeline_id"]
+        &["tenant_id", "timeline_id"]
     )
     .expect("failed to define a metric")
 });
@@ -630,7 +645,7 @@ static EVICTIONS: Lazy<IntCounterVec> = Lazy::new(|| {
     register_int_counter_vec!(
         "pageserver_evictions",
         "Number of layers evicted from the pageserver",
-        &["tenant_id", "shard_id", "timeline_id"]
+        &["tenant_id", "timeline_id"]
     )
     .expect("failed to define a metric")
 });
@@ -725,13 +740,13 @@ pub(crate) static TENANT: Lazy<TenantMetrics> = Lazy::new(|| {

 /// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
 #[derive(Debug)]
-pub(crate) struct EvictionsWithLowResidenceDuration {
+pub struct EvictionsWithLowResidenceDuration {
     data_source: &'static str,
     threshold: Duration,
     counter: Option<IntCounter>,
 }

-pub(crate) struct EvictionsWithLowResidenceDurationBuilder {
+pub struct EvictionsWithLowResidenceDurationBuilder {
     data_source: &'static str,
     threshold: Duration,
 }
@@ -927,7 +942,7 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
     register_int_gauge_vec!(
         "pageserver_io_operations_bytes_total",
         "Total amount of bytes read/written in IO operations",
-        &["operation", "tenant_id", "shard_id", "timeline_id"]
+        &["operation", "tenant_id", "timeline_id"]
     )
     .expect("failed to define a metric")
 });
@@ -994,7 +1009,7 @@ pub enum SmgrQueryType {
 }

 #[derive(Debug)]
-pub(crate) struct SmgrQueryTimePerTimeline {
+pub struct SmgrQueryTimePerTimeline {
     metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT],
 }

@@ -1002,7 +1017,7 @@ static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
     register_histogram_vec!(
         "pageserver_smgr_query_seconds",
         "Time spent on smgr query handling, aggegated by query type and tenant/timeline.",
-        &["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
+        &["smgr_query_type", "tenant_id", "timeline_id"],
         CRITICAL_OP_BUCKETS.into(),
     )
     .expect("failed to define a metric")
@@ -1069,9 +1084,8 @@ static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
 });

 impl SmgrQueryTimePerTimeline {
-    pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
-        let tenant_id = tenant_shard_id.tenant_id.to_string();
-        let shard_slug = format!("{}", tenant_shard_id.shard_slug());
+    pub(crate) fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
+        let tenant_id = tenant_id.to_string();
         let timeline_id = timeline_id.to_string();
         let metrics = std::array::from_fn(|i| {
             let op = SmgrQueryType::from_repr(i).unwrap();
@@ -1079,7 +1093,7 @@ impl SmgrQueryTimePerTimeline {
                 .get_metric_with_label_values(&[op.into()])
                 .unwrap();
             let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
-                .get_metric_with_label_values(&[op.into(), &tenant_id, &shard_slug, &timeline_id])
+                .get_metric_with_label_values(&[op.into(), &tenant_id, &timeline_id])
                 .unwrap();
             GlobalAndPerTimelineHistogram {
                 global,
@@ -1099,7 +1113,6 @@ impl SmgrQueryTimePerTimeline {

 #[cfg(test)]
 mod smgr_query_time_tests {
-    use pageserver_api::shard::TenantShardId;
     use strum::IntoEnumIterator;
     use utils::id::{TenantId, TimelineId};

@@ -1126,10 +1139,7 @@ mod smgr_query_time_tests {
         for op in &ops {
             let tenant_id = TenantId::generate();
             let timeline_id = TimelineId::generate();
-            let metrics = super::SmgrQueryTimePerTimeline::new(
-                &TenantShardId::unsharded(tenant_id),
-                &timeline_id,
-            );
+            let metrics = super::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);

             let get_counts = || {
                 let global: u64 = ops
@@ -1171,8 +1181,8 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
         .map(|ms| (ms as f64) / 1000.0)
 });

-pub(crate) struct BasebackupQueryTime(HistogramVec);
-pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
+pub struct BasebackupQueryTime(HistogramVec);
+pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
     BasebackupQueryTime({
         register_histogram_vec!(
             "pageserver_basebackup_query_seconds",
@@ -1192,7 +1202,7 @@ impl DurationResultObserver for BasebackupQueryTime {
     }
 }

-pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
+pub static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
     register_int_gauge_vec!(
         "pageserver_live_connections",
         "Number of live network connections",
@@ -1210,13 +1220,7 @@ static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy<IntGaugeVec> = Lazy::
         "Number of ongoing calls to remote timeline client. \
          Used to populate pageserver_remote_timeline_client_calls_started. \
          This metric is not useful for sampling from Prometheus, but useful in tests.",
-        &[
-            "tenant_id",
-            "shard_id",
-            "timeline_id",
-            "file_kind",
-            "op_kind"
-        ],
+        &["tenant_id", "timeline_id", "file_kind", "op_kind"],
     )
     .expect("failed to define a metric")
 });
@@ -1237,23 +1241,22 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
     .expect("failed to define a metric")
 });

-static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> =
-    Lazy::new(|| {
-        register_int_counter_vec!(
+static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
         "pageserver_remote_timeline_client_bytes_started",
         "Incremented by the number of bytes associated with a remote timeline client operation. \
          The increment happens when the operation is scheduled.",
-        &["tenant_id", "shard_id", "timeline_id", "file_kind", "op_kind"],
+        &["tenant_id", "timeline_id", "file_kind", "op_kind"],
     )
     .expect("failed to define a metric")
 });

 static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
     register_int_counter_vec!(
         "pageserver_remote_timeline_client_bytes_finished",
         "Incremented by the number of bytes associated with a remote timeline client operation. \
          The increment happens when the operation finishes (regardless of success/failure/shutdown).",
-        &["tenant_id", "shard_id", "timeline_id", "file_kind", "op_kind"],
+        &["tenant_id", "timeline_id", "file_kind", "op_kind"],
     )
     .expect("failed to define a metric")
 });
@@ -1366,8 +1369,6 @@ pub(crate) struct SecondaryModeMetrics {
     pub(crate) upload_heatmap: IntCounter,
     pub(crate) upload_heatmap_errors: IntCounter,
     pub(crate) upload_heatmap_duration: Histogram,
-    pub(crate) download_heatmap: IntCounter,
-    pub(crate) download_layer: IntCounter,
 }
 pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| SecondaryModeMetrics {
     upload_heatmap: register_int_counter!(
@@ -1385,16 +1386,6 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| Seco
         "Time to build and upload a heatmap, including any waiting inside the S3 client"
     )
     .expect("failed to define a metric"),
-    download_heatmap: register_int_counter!(
-        "pageserver_secondary_download_heatmap",
-        "Number of downloads of heatmaps by secondary mode locations"
-    )
-    .expect("failed to define a metric"),
-    download_layer: register_int_counter!(
-        "pageserver_secondary_download_layer",
-        "Number of downloads of layers by secondary mode locations"
-    )
-    .expect("failed to define a metric"),
 });

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -1664,7 +1655,7 @@ pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
     Lazy::new(WalRedoProcessCounters::default);

 /// Similar to `prometheus::HistogramTimer` but does not record on drop.
-pub(crate) struct StorageTimeMetricsTimer {
+pub struct StorageTimeMetricsTimer {
     metrics: StorageTimeMetrics,
     start: Instant,
 }
@@ -1689,7 +1680,7 @@ impl StorageTimeMetricsTimer {
 /// Timing facilities for an globally histogrammed metric, which is supported by per tenant and
 /// timeline total sum and count.
 #[derive(Clone, Debug)]
-pub(crate) struct StorageTimeMetrics {
+pub struct StorageTimeMetrics {
     /// Sum of f64 seconds, per operation, tenant_id and timeline_id
     timeline_sum: Counter,
     /// Number of oeprations, per operation, tenant_id and timeline_id
@@ -1699,19 +1690,14 @@ pub(crate) struct StorageTimeMetrics {
 }

 impl StorageTimeMetrics {
-    pub fn new(
-        operation: StorageTimeOperation,
-        tenant_id: &str,
-        shard_id: &str,
-        timeline_id: &str,
-    ) -> Self {
+    pub fn new(operation: StorageTimeOperation, tenant_id: &str, timeline_id: &str) -> Self {
         let operation: &'static str = operation.into();

         let timeline_sum = STORAGE_TIME_SUM_PER_TIMELINE
-            .get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id])
+            .get_metric_with_label_values(&[operation, tenant_id, timeline_id])
             .unwrap();
         let timeline_count = STORAGE_TIME_COUNT_PER_TIMELINE
-            .get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id])
+            .get_metric_with_label_values(&[operation, tenant_id, timeline_id])
             .unwrap();
         let global_histogram = STORAGE_TIME_GLOBAL
             .get_metric_with_label_values(&[operation])
@@ -1733,7 +1719,7 @@ impl StorageTimeMetrics {
 }

 #[derive(Debug)]
-pub(crate) struct TimelineMetrics {
+pub struct TimelineMetrics {
     tenant_id: String,
     shard_id: String,
     timeline_id: String,
@@ -1763,66 +1749,40 @@ impl TimelineMetrics {
         let tenant_id = tenant_shard_id.tenant_id.to_string();
         let shard_id = format!("{}", tenant_shard_id.shard_slug());
         let timeline_id = timeline_id.to_string();
-        let flush_time_histo = StorageTimeMetrics::new(
-            StorageTimeOperation::LayerFlush,
-            &tenant_id,
-            &shard_id,
-            &timeline_id,
-        );
-        let compact_time_histo = StorageTimeMetrics::new(
-            StorageTimeOperation::Compact,
-            &tenant_id,
-            &shard_id,
-            &timeline_id,
-        );
-        let create_images_time_histo = StorageTimeMetrics::new(
-            StorageTimeOperation::CreateImages,
-            &tenant_id,
-            &shard_id,
-            &timeline_id,
-        );
-        let logical_size_histo = StorageTimeMetrics::new(
-            StorageTimeOperation::LogicalSize,
-            &tenant_id,
-            &shard_id,
-            &timeline_id,
-        );
+        let flush_time_histo =
+            StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
+        let compact_time_histo =
+            StorageTimeMetrics::new(StorageTimeOperation::Compact, &tenant_id, &timeline_id);
+        let create_images_time_histo =
+            StorageTimeMetrics::new(StorageTimeOperation::CreateImages, &tenant_id, &timeline_id);
+        let logical_size_histo =
+            StorageTimeMetrics::new(StorageTimeOperation::LogicalSize, &tenant_id, &timeline_id);
         let imitate_logical_size_histo = StorageTimeMetrics::new(
             StorageTimeOperation::ImitateLogicalSize,
             &tenant_id,
-            &shard_id,
             &timeline_id,
         );
-        let load_layer_map_histo = StorageTimeMetrics::new(
-            StorageTimeOperation::LoadLayerMap,
-            &tenant_id,
-            &shard_id,
-            &timeline_id,
-        );
-        let garbage_collect_histo = StorageTimeMetrics::new(
-            StorageTimeOperation::Gc,
-            &tenant_id,
-            &shard_id,
-            &timeline_id,
-        );
+        let load_layer_map_histo =
+            StorageTimeMetrics::new(StorageTimeOperation::LoadLayerMap, &tenant_id, &timeline_id);
+        let garbage_collect_histo =
+            StorageTimeMetrics::new(StorageTimeOperation::Gc, &tenant_id, &timeline_id);
         let last_record_gauge = LAST_RECORD_LSN
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
             .unwrap();
         let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
             .unwrap();
-        // TODO: we shouldn't expose this metric
         let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
             .unwrap();
         let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
             .unwrap();
         let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
             .unwrap();
         let evictions = EVICTIONS
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
             .unwrap();
         let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder
             .build(&tenant_id, &shard_id, &timeline_id);
@@ -1876,17 +1836,15 @@ impl Drop for TimelineMetrics {
         let tenant_id = &self.tenant_id;
         let timeline_id = &self.timeline_id;
         let shard_id = &self.shard_id;
-        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
+        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
         {
             RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
-            let _ =
-                RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
+            let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
         }
-        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
-        let _ =
-            NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
-        let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
-        let _ = EVICTIONS.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
+        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);

         self.evictions_with_low_residence_duration
             .write()
@@ -1899,42 +1857,29 @@ impl Drop for TimelineMetrics {
         // outlive an individual smgr connection, but not the timeline.

         for op in StorageTimeOperation::VARIANTS {
-            let _ = STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[
-                op,
-                tenant_id,
-                shard_id,
-                timeline_id,
-            ]);
-            let _ = STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[
-                op,
-                tenant_id,
-                shard_id,
-                timeline_id,
-            ]);
+            let _ =
+                STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
+            let _ =
+                STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
         }

         for op in STORAGE_IO_SIZE_OPERATIONS {
-            let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
+            let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]);
         }

         for op in SmgrQueryType::iter() {
             let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
                 op.into(),
                 tenant_id,
-                shard_id,
                 timeline_id,
             ]);
         }
     }
 }

-pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
-    // Only shard zero deals in synthetic sizes
-    if tenant_shard_id.is_zero() {
-        let tid = tenant_shard_id.tenant_id.to_string();
-        let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
-    }
+pub fn remove_tenant_metrics(tenant_id: &TenantId) {
+    let tid = tenant_id.to_string();
+    let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);

     // we leave the BROKEN_TENANTS_SET entry if any
 }

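The Drop implementations above exist so that per-timeline label sets are deleted from the registry when a timeline goes away; otherwise every detached timeline would keep exporting stale series forever. A minimal sketch of the remove-on-Drop pattern, with an example metric name:

```rust
use once_cell::sync::Lazy;
use prometheus::{register_int_gauge_vec, IntGaugeVec};

// Example metric; the real code has one such vec per per-timeline metric.
static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "example_last_record_lsn",
        "Last record LSN grouped by timeline",
        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric")
});

struct TimelineMetrics {
    tenant_id: String,
    timeline_id: String,
}

impl TimelineMetrics {
    fn new(tenant_id: &str, timeline_id: &str) -> Self {
        // Creating the series up front pins the label values this instance owns.
        LAST_RECORD_LSN
            .get_metric_with_label_values(&[tenant_id, timeline_id])
            .unwrap()
            .set(0);
        Self {
            tenant_id: tenant_id.to_owned(),
            timeline_id: timeline_id.to_owned(),
        }
    }
}

impl Drop for TimelineMetrics {
    fn drop(&mut self) {
        // Ignore the result, matching the `let _ =` pattern above: the
        // series may already have been removed.
        let _ = LAST_RECORD_LSN.remove_label_values(&[&self.tenant_id, &self.timeline_id]);
    }
}

fn main() {
    let metrics = TimelineMetrics::new("tenant-a", "tl-1");
    drop(metrics); // the series is gone; the registry no longer exports it
}
```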
@@ -1982,9 +1927,8 @@ impl Drop for PerTimelineRemotePhysicalSizeGauge {
     }
 }

-pub(crate) struct RemoteTimelineClientMetrics {
+pub struct RemoteTimelineClientMetrics {
     tenant_id: String,
-    shard_id: String,
     timeline_id: String,
     remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
     calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
@@ -1996,7 +1940,6 @@ impl RemoteTimelineClientMetrics {
     pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
         RemoteTimelineClientMetrics {
             tenant_id: tenant_shard_id.tenant_id.to_string(),
-            shard_id: format!("{}", tenant_shard_id.shard_slug()),
             timeline_id: timeline_id.to_string(),
             calls_unfinished_gauge: Mutex::new(HashMap::default()),
             bytes_started_counter: Mutex::new(HashMap::default()),
@@ -2011,9 +1954,8 @@ impl RemoteTimelineClientMetrics {
             PerTimelineRemotePhysicalSizeGauge::new(
                 REMOTE_PHYSICAL_SIZE
                     .get_metric_with_label_values(&[
-                        &self.tenant_id,
-                        &self.shard_id,
-                        &self.timeline_id,
+                        &self.tenant_id.to_string(),
+                        &self.timeline_id.to_string(),
                     ])
                     .unwrap(),
             )
@@ -2048,9 +1990,8 @@ impl RemoteTimelineClientMetrics {
         let metric = guard.entry(key).or_insert_with(move || {
             REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE
                 .get_metric_with_label_values(&[
-                    &self.tenant_id,
-                    &self.shard_id,
-                    &self.timeline_id,
+                    &self.tenant_id.to_string(),
+                    &self.timeline_id.to_string(),
                     key.0,
                     key.1,
                 ])
@@ -2080,9 +2021,8 @@ impl RemoteTimelineClientMetrics {
         let metric = guard.entry(key).or_insert_with(move || {
             REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER
                 .get_metric_with_label_values(&[
-                    &self.tenant_id,
-                    &self.shard_id,
-                    &self.timeline_id,
+                    &self.tenant_id.to_string(),
+                    &self.timeline_id.to_string(),
                     key.0,
                     key.1,
                 ])
@@ -2101,9 +2041,8 @@ impl RemoteTimelineClientMetrics {
         let metric = guard.entry(key).or_insert_with(move || {
             REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER
                 .get_metric_with_label_values(&[
-                    &self.tenant_id,
-                    &self.shard_id,
-                    &self.timeline_id,
+                    &self.tenant_id.to_string(),
+                    &self.timeline_id.to_string(),
                     key.0,
                     key.1,
                 ])
@@ -2247,7 +2186,6 @@ impl Drop for RemoteTimelineClientMetrics {
     fn drop(&mut self) {
         let RemoteTimelineClientMetrics {
             tenant_id,
-            shard_id,
             timeline_id,
             remote_physical_size_gauge,
             calls_unfinished_gauge,
@@ -2257,7 +2195,6 @@ impl Drop for RemoteTimelineClientMetrics {
         for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
             let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
                 tenant_id,
-                shard_id,
                 timeline_id,
                 a,
                 b,
@@ -2266,7 +2203,6 @@ impl Drop for RemoteTimelineClientMetrics {
         for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
             let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
                 tenant_id,
-                shard_id,
                 timeline_id,
                 a,
                 b,
@@ -2275,7 +2211,6 @@ impl Drop for RemoteTimelineClientMetrics {
         for ((a, b), _) in bytes_finished_counter.get_mut().unwrap().drain() {
             let _ = REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER.remove_label_values(&[
                 tenant_id,
-                shard_id,
                 timeline_id,
                 a,
                 b,
@@ -2283,16 +2218,18 @@ impl Drop for RemoteTimelineClientMetrics {
         }
         {
             let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above
-            let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
+            let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
         }
     }
 }

 /// Wrapper future that measures the time spent by a remote storage operation,
 /// and records the time and success/failure as a prometheus metric.
-pub(crate) trait MeasureRemoteOp: Sized {
+pub trait MeasureRemoteOp: Sized {
     fn measure_remote_op(
         self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
         file_kind: RemoteOpFileKind,
         op: RemoteOpKind,
         metrics: Arc<RemoteTimelineClientMetrics>,
@@ -2300,6 +2237,8 @@ pub(crate) trait MeasureRemoteOp: Sized {
         let start = Instant::now();
         MeasuredRemoteOp {
             inner: self,
+            tenant_id,
+            timeline_id,
             file_kind,
             op,
             start,
@@ -2311,10 +2250,12 @@ pub(crate) trait MeasureRemoteOp: Sized {
 impl<T: Sized> MeasureRemoteOp for T {}

 pin_project! {
-    pub(crate) struct MeasuredRemoteOp<F>
+    pub struct MeasuredRemoteOp<F>
     {
         #[pin]
         inner: F,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
         file_kind: RemoteOpFileKind,
         op: RemoteOpKind,
         start: Instant,
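`MeasuredRemoteOp` is a pinned wrapper future: it snapshots `Instant::now()` when the wrapper is built and reports the elapsed time once the inner future completes. A minimal sketch of the same shape with `pin_project_lite`, using a plain callback where the real code updates prometheus metrics:

```rust
use pin_project_lite::pin_project;
use std::future::Future;
use std::pin::Pin;
use std::task::{Context, Poll};
use std::time::{Duration, Instant};

pin_project! {
    /// Wrapper future: measures wall-clock time from construction until the
    /// inner future completes, then hands the duration to a callback.
    pub struct Measured<F, C> {
        #[pin]
        inner: F,
        start: Instant,
        on_done: C,
    }
}

impl<F: Future, C: FnMut(Duration)> Future for Measured<F, C> {
    type Output = F::Output;

    fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
        let this = self.project();
        match this.inner.poll(cx) {
            Poll::Ready(output) => {
                // The real code records the duration plus success/failure
                // into prometheus at this point.
                (this.on_done)(this.start.elapsed());
                Poll::Ready(output)
            }
            Poll::Pending => Poll::Pending,
        }
    }
}

#[tokio::main]
async fn main() {
    let measured = Measured {
        inner: async { 42u32 },
        start: Instant::now(),
        on_done: |elapsed: Duration| println!("operation took {elapsed:?}"),
    };
    assert_eq!(measured.await, 42);
}
```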
@@ -550,6 +550,7 @@ impl PageCache {
     // not require changes.

     async fn try_get_pinned_slot_permit(&self) -> anyhow::Result<PinnedSlotsPermit> {
+        let timer = crate::metrics::PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME.start_timer();
         match tokio::time::timeout(
             // Choose small timeout, neon_smgr does its own retries.
             // https://neondb.slack.com/archives/C04DGM6SMTM/p1694786876476869
@@ -562,6 +563,7 @@ impl PageCache {
                 res.expect("this semaphore is never closed"),
             )),
             Err(_timeout) => {
+                timer.stop_and_discard();
                 crate::metrics::page_cache_errors_inc(
                     crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout,
                 );
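These two hunks bracket a `tokio::time::timeout` around a semaphore acquisition: the timer starts before the wait and is discarded rather than recorded when the wait times out. A sketch of that control flow; the 50ms budget is a hypothetical stand-in for the real, deliberately small timeout:

```rust
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::Semaphore;

#[tokio::main]
async fn main() {
    let slots = Arc::new(Semaphore::new(1));
    // Hold the only permit so the acquisition below must wait and time out.
    let _held = slots.clone().acquire_owned().await.unwrap();

    match tokio::time::timeout(Duration::from_millis(50), slots.acquire()).await {
        Ok(permit) => {
            let _permit = permit.expect("this semaphore is never closed");
            // Fast path: the histogram timer would record its observation here.
        }
        Err(_timeout) => {
            // Timeout path: the real code calls timer.stop_and_discard() and
            // increments an AcquirePinnedSlotTimeout error counter.
            eprintln!("timed out waiting for a pinned slot");
        }
    }
}
```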
Some files were not shown because too many files have changed in this diff.