mirror of
https://github.com/neondatabase/neon.git
synced 2026-02-11 06:30:37 +00:00
Compare commits
90 Commits
jcsp/issue
...
arpad/musl
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
85593264b5 | ||
|
|
5e4111699a | ||
|
|
a356cf2a56 | ||
|
|
4eb9c4eff3 | ||
|
|
a2026dd5a1 | ||
|
|
a5003bdee9 | ||
|
|
71a7fd983e | ||
|
|
a3f5b83677 | ||
|
|
1455f5a261 | ||
|
|
3860bc9c6c | ||
|
|
c1f4028fc0 | ||
|
|
0e4f182680 | ||
|
|
ea2e830707 | ||
|
|
7cf726e36e | ||
|
|
6b3164269c | ||
|
|
75a52ac7fd | ||
|
|
e28e46f20b | ||
|
|
d5d15eb6eb | ||
|
|
49d7f9b5a4 | ||
|
|
95a49f0075 | ||
|
|
545f7e8cd7 | ||
|
|
cd6d811213 | ||
|
|
8f3c316bae | ||
|
|
58e31fe098 | ||
|
|
a43a1ad1df | ||
|
|
eb0c026aac | ||
|
|
ff560a1113 | ||
|
|
4a278cce7c | ||
|
|
f98fdd20e3 | ||
|
|
014f822a78 | ||
|
|
ddd8ebd253 | ||
|
|
9cfe08e3d9 | ||
|
|
64577cfddc | ||
|
|
37f81289c2 | ||
|
|
9217564026 | ||
|
|
3404e76a51 | ||
|
|
62aac6c8ad | ||
|
|
e015b2bf3e | ||
|
|
a7f31f1a59 | ||
|
|
325f3784f9 | ||
|
|
900f391115 | ||
|
|
8901ce9c99 | ||
|
|
ce44dfe353 | ||
|
|
d1d55bbd9f | ||
|
|
df9ab1b5e3 | ||
|
|
ef96c82c9f | ||
|
|
b43f6daa48 | ||
|
|
664f92dc6e | ||
|
|
bd5cb9e86b | ||
|
|
00d66e8012 | ||
|
|
679e031cf6 | ||
|
|
e3f6a07ca3 | ||
|
|
a8a88ba7bc | ||
|
|
353afe4fe7 | ||
|
|
1988ad8db7 | ||
|
|
e3415706b7 | ||
|
|
9d081851ec | ||
|
|
781352bd8e | ||
|
|
8030b8e4c5 | ||
|
|
9a4b896636 | ||
|
|
e8b8ebfa1d | ||
|
|
d9d471e3c4 | ||
|
|
d43dcceef9 | ||
|
|
f2771a99b7 | ||
|
|
f54c3b96e0 | ||
|
|
478cc37a70 | ||
|
|
4ce6e2d2fc | ||
|
|
baeb58432f | ||
|
|
6f3e043a76 | ||
|
|
6810d2aa53 | ||
|
|
2d7091871f | ||
|
|
7701ca45dd | ||
|
|
de8dfee4bd | ||
|
|
e3f51abadf | ||
|
|
a7b84cca5a | ||
|
|
291fcb9e4f | ||
|
|
a5ecca976e | ||
|
|
5caee4ca54 | ||
|
|
e1a9669d05 | ||
|
|
aaf60819fa | ||
|
|
c84656a53e | ||
|
|
af99c959ef | ||
|
|
a8e6d259cb | ||
|
|
c1390bfc3b | ||
|
|
6d951e69d6 | ||
|
|
4b8809b280 | ||
|
|
4c5afb7b10 | ||
|
|
ec069dc45e | ||
|
|
790c05d675 | ||
|
|
923cf91aa4 |
@@ -17,6 +17,7 @@
|
||||
!libs/
|
||||
!neon_local/
|
||||
!pageserver/
|
||||
!patches/
|
||||
!pgxn/
|
||||
!proxy/
|
||||
!s3_scrubber/
|
||||
|
||||
@@ -3,13 +3,13 @@ description: 'Create Branch using API'
|
||||
|
||||
inputs:
|
||||
api_key:
|
||||
desctiption: 'Neon API key'
|
||||
description: 'Neon API key'
|
||||
required: true
|
||||
project_id:
|
||||
desctiption: 'ID of the Project to create Branch in'
|
||||
description: 'ID of the Project to create Branch in'
|
||||
required: true
|
||||
api_host:
|
||||
desctiption: 'Neon API host'
|
||||
description: 'Neon API host'
|
||||
default: console-stage.neon.build
|
||||
outputs:
|
||||
dsn:
|
||||
|
||||
@@ -3,16 +3,16 @@ description: 'Delete Branch using API'
|
||||
|
||||
inputs:
|
||||
api_key:
|
||||
desctiption: 'Neon API key'
|
||||
description: 'Neon API key'
|
||||
required: true
|
||||
project_id:
|
||||
desctiption: 'ID of the Project which should be deleted'
|
||||
description: 'ID of the Project which should be deleted'
|
||||
required: true
|
||||
branch_id:
|
||||
desctiption: 'ID of the branch to delete'
|
||||
description: 'ID of the branch to delete'
|
||||
required: true
|
||||
api_host:
|
||||
desctiption: 'Neon API host'
|
||||
description: 'Neon API host'
|
||||
default: console-stage.neon.build
|
||||
|
||||
runs:
|
||||
|
||||
14
.github/actions/neon-project-create/action.yml
vendored
14
.github/actions/neon-project-create/action.yml
vendored
@@ -3,22 +3,22 @@ description: 'Create Neon Project using API'
|
||||
|
||||
inputs:
|
||||
api_key:
|
||||
desctiption: 'Neon API key'
|
||||
description: 'Neon API key'
|
||||
required: true
|
||||
region_id:
|
||||
desctiption: 'Region ID, if not set the project will be created in the default region'
|
||||
description: 'Region ID, if not set the project will be created in the default region'
|
||||
default: aws-us-east-2
|
||||
postgres_version:
|
||||
desctiption: 'Postgres version; default is 15'
|
||||
default: 15
|
||||
description: 'Postgres version; default is 15'
|
||||
default: '15'
|
||||
api_host:
|
||||
desctiption: 'Neon API host'
|
||||
description: 'Neon API host'
|
||||
default: console-stage.neon.build
|
||||
provisioner:
|
||||
desctiption: 'k8s-pod or k8s-neonvm'
|
||||
description: 'k8s-pod or k8s-neonvm'
|
||||
default: 'k8s-pod'
|
||||
compute_units:
|
||||
desctiption: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
|
||||
description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
|
||||
default: '[1, 1]'
|
||||
|
||||
outputs:
|
||||
|
||||
@@ -3,13 +3,13 @@ description: 'Delete Neon Project using API'
|
||||
|
||||
inputs:
|
||||
api_key:
|
||||
desctiption: 'Neon API key'
|
||||
description: 'Neon API key'
|
||||
required: true
|
||||
project_id:
|
||||
desctiption: 'ID of the Project to delete'
|
||||
description: 'ID of the Project to delete'
|
||||
required: true
|
||||
api_host:
|
||||
desctiption: 'Neon API host'
|
||||
description: 'Neon API host'
|
||||
default: console-stage.neon.build
|
||||
|
||||
runs:
|
||||
|
||||
244
.github/workflows/build_and_test.yml
vendored
244
.github/workflows/build_and_test.yml
vendored
@@ -548,7 +548,7 @@ jobs:
|
||||
|
||||
report-benchmarks-failures:
|
||||
needs: [ benchmarks, create-test-report ]
|
||||
if: github.ref_name == 'main' && needs.benchmarks.result == 'failure'
|
||||
if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure'
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
@@ -723,9 +723,13 @@ jobs:
|
||||
uses: ./.github/workflows/trigger-e2e-tests.yml
|
||||
secrets: inherit
|
||||
|
||||
neon-image:
|
||||
neon-image-arch:
|
||||
needs: [ check-permissions, build-build-tools-image, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
strategy:
|
||||
matrix:
|
||||
arch: [ x64, arm64 ]
|
||||
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -747,12 +751,6 @@ jobs:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
- uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
@@ -764,25 +762,52 @@ jobs:
|
||||
push: true
|
||||
pull: true
|
||||
file: Dockerfile
|
||||
cache-from: type=registry,ref=neondatabase/neon:cache
|
||||
cache-to: type=registry,ref=neondatabase/neon:cache,mode=max
|
||||
cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }}
|
||||
cache-to: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }},mode=max
|
||||
tags: |
|
||||
369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
|
||||
neondatabase/neon:${{needs.tag.outputs.build-tag}}
|
||||
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
||||
|
||||
- name: Remove custom docker config directory
|
||||
if: always()
|
||||
run: |
|
||||
rm -rf .docker-custom
|
||||
|
||||
compute-node-image:
|
||||
needs: [ check-permissions, build-build-tools-image, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
neon-image:
|
||||
needs: [ neon-image-arch, tag ]
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Create multi-arch image
|
||||
run: |
|
||||
docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-x64 \
|
||||
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-arm64
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
- name: Push multi-arch image to ECR
|
||||
run: |
|
||||
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/neon:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
compute-node-image-arch:
|
||||
needs: [ check-permissions, build-build-tools-image, tag ]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
version: [ v14, v15, v16 ]
|
||||
arch: [ x64, arm64 ]
|
||||
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -829,15 +854,14 @@ jobs:
|
||||
push: true
|
||||
pull: true
|
||||
file: Dockerfile.compute-node
|
||||
cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache
|
||||
cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache,mode=max
|
||||
cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }}
|
||||
cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
|
||||
tags: |
|
||||
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
||||
|
||||
- name: Build compute-tools image
|
||||
# compute-tools are Postgres independent, so build it only once
|
||||
if: ${{ matrix.version == 'v16' }}
|
||||
if: matrix.version == 'v16'
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
target: compute-tools-image
|
||||
@@ -851,14 +875,57 @@ jobs:
|
||||
pull: true
|
||||
file: Dockerfile.compute-node
|
||||
tags: |
|
||||
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }}
|
||||
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
|
||||
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
||||
|
||||
- name: Remove custom docker config directory
|
||||
if: always()
|
||||
run: |
|
||||
rm -rf .docker-custom
|
||||
|
||||
compute-node-image:
|
||||
needs: [ compute-node-image-arch, tag ]
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
version: [ v14, v15, v16 ]
|
||||
|
||||
steps:
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Create multi-arch compute-node image
|
||||
run: |
|
||||
docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
|
||||
|
||||
- name: Create multi-arch compute-tools image
|
||||
if: matrix.version == 'v16'
|
||||
run: |
|
||||
docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \
|
||||
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-arm64
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
- name: Push multi-arch compute-node-${{ matrix.version }} image to ECR
|
||||
run: |
|
||||
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
- name: Push multi-arch compute-tools image to ECR
|
||||
if: matrix.version == 'v16'
|
||||
run: |
|
||||
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
vm-compute-node-image:
|
||||
needs: [ check-permissions, tag, compute-node-image ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
@@ -866,11 +933,8 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
version: [ v14, v15, v16 ]
|
||||
defaults:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
env:
|
||||
VM_BUILDER_VERSION: v0.28.1
|
||||
VM_BUILDER_VERSION: v0.29.3
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -883,26 +947,48 @@ jobs:
|
||||
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
|
||||
chmod +x vm-builder
|
||||
|
||||
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
||||
# The default value is ~/.docker
|
||||
- name: Set custom docker config directory
|
||||
run: |
|
||||
mkdir -p .docker-custom
|
||||
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
# Note: we need a separate pull step here because otherwise vm-builder will try to pull, and
|
||||
# it won't have the proper authentication (written at v0.6.0)
|
||||
- name: Pulling compute-node image
|
||||
run: |
|
||||
docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
docker pull neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
- name: Build vm image
|
||||
run: |
|
||||
./vm-builder \
|
||||
-spec=vm-image-spec.yaml \
|
||||
-src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
|
||||
-dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
-src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
|
||||
-dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
- name: Pushing vm-compute-node image
|
||||
run: |
|
||||
docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
- name: Remove custom docker config directory
|
||||
if: always()
|
||||
run: |
|
||||
rm -rf .docker-custom
|
||||
|
||||
test-images:
|
||||
needs: [ check-permissions, tag, neon-image, compute-node-image ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
arch: [ x64, arm64 ]
|
||||
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -920,7 +1006,7 @@ jobs:
|
||||
- name: Verify image versions
|
||||
shell: bash # ensure no set -e for better error messages
|
||||
run: |
|
||||
pageserver_version=$(docker run --rm 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
|
||||
pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.tag.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
|
||||
|
||||
echo "Pageserver version string: $pageserver_version"
|
||||
|
||||
@@ -946,78 +1032,48 @@ jobs:
|
||||
|
||||
promote-images:
|
||||
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container: golang:1.19-bullseye
|
||||
# Don't add if-condition here.
|
||||
# The job should always be run because we have dependant other jobs that shouldn't be skipped
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
env:
|
||||
VERSIONS: v14 v15 v16
|
||||
|
||||
steps:
|
||||
- name: Install Crane & ECR helper
|
||||
run: |
|
||||
go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
|
||||
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Configure ECR login
|
||||
run: |
|
||||
mkdir /github/home/.docker/
|
||||
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
- name: Copy vm-compute-node images to Docker Hub
|
||||
- name: Copy vm-compute-node images to ECR
|
||||
run: |
|
||||
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
|
||||
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
|
||||
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16
|
||||
for version in ${VERSIONS}; do
|
||||
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
|
||||
done
|
||||
|
||||
- name: Add latest tag to images
|
||||
if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
|
||||
if: github.ref_name == 'main'
|
||||
run: |
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
|
||||
for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do
|
||||
docker buildx imagetools create -t $repo/neon:latest \
|
||||
$repo/neon:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
- name: Push images to production ECR
|
||||
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
||||
run: |
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:latest
|
||||
docker buildx imagetools create -t $repo/compute-tools:latest \
|
||||
$repo/compute-tools:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
- name: Configure Docker Hub login
|
||||
run: |
|
||||
# ECR Credential Helper & Docker Hub don't work together in config, hence reset
|
||||
echo "" > /github/home/.docker/config.json
|
||||
crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io
|
||||
for version in ${VERSIONS}; do
|
||||
docker buildx imagetools create -t $repo/compute-node-${version}:latest \
|
||||
$repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
- name: Push vm-compute-node to Docker Hub
|
||||
run: |
|
||||
crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
|
||||
crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
|
||||
crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Push latest tags to Docker Hub
|
||||
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
||||
run: |
|
||||
crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
|
||||
|
||||
- name: Cleanup ECR folder
|
||||
run: rm -rf ~/.ecr
|
||||
docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \
|
||||
$repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
|
||||
done
|
||||
done
|
||||
|
||||
trigger-custom-extensions-build-and-wait:
|
||||
needs: [ check-permissions, tag ]
|
||||
|
||||
2
.github/workflows/release.yml
vendored
2
.github/workflows/release.yml
vendored
@@ -53,7 +53,7 @@ jobs:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
cat << EOF > body.md
|
||||
## Release ${RELEASE_DATE}
|
||||
## Storage & Compute release ${RELEASE_DATE}
|
||||
|
||||
**Please merge this Pull Request using 'Create a merge commit' button**
|
||||
EOF
|
||||
|
||||
327
Cargo.lock
generated
327
Cargo.lock
generated
@@ -708,7 +708,7 @@ dependencies = [
|
||||
"sha1",
|
||||
"sync_wrapper",
|
||||
"tokio",
|
||||
"tokio-tungstenite 0.20.0",
|
||||
"tokio-tungstenite",
|
||||
"tower",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
@@ -734,8 +734,6 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "azure_core"
|
||||
version = "0.19.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "70fd680c0d0424a518229b1150922f92653ba2ac933aa000abc8bf1ca08105f7"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"base64 0.21.1",
|
||||
@@ -764,8 +762,6 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "azure_identity"
|
||||
version = "0.19.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a6d2060f5b2e1c664026ca4edd561306c473be887c1f7a81f10bf06f9b71c63f"
|
||||
dependencies = [
|
||||
"async-lock",
|
||||
"async-trait",
|
||||
@@ -784,8 +780,6 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "azure_storage"
|
||||
version = "0.19.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "15d3da73bfa09350e1bd6ae2a260806fcf90048c7e78cd2d8f88be60b19a7266"
|
||||
dependencies = [
|
||||
"RustyXML",
|
||||
"async-lock",
|
||||
@@ -803,8 +797,6 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "azure_storage_blobs"
|
||||
version = "0.19.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "149c21834a4105d761e3dd33d91c2a3064acc05a3c978848ea8089102ae45c94"
|
||||
dependencies = [
|
||||
"RustyXML",
|
||||
"azure_core",
|
||||
@@ -824,8 +816,6 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "azure_svc_blobstorage"
|
||||
version = "0.19.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "88c888b7bf522d5405218b8613bf0fae7ddaae6ef3bf4ad42ae005993c96ab8b"
|
||||
dependencies = [
|
||||
"azure_core",
|
||||
"bytes",
|
||||
@@ -979,6 +969,12 @@ version = "3.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"
|
||||
|
||||
[[package]]
|
||||
name = "bytemuck"
|
||||
version = "1.16.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78834c15cb5d5efe3452d58b1e8ba890dd62d21907f867f383358198e56ebca5"
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
version = "1.4.3"
|
||||
@@ -1066,9 +1062,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "chrono"
|
||||
version = "0.4.31"
|
||||
version = "0.4.38"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38"
|
||||
checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
|
||||
dependencies = [
|
||||
"android-tzdata",
|
||||
"iana-time-zone",
|
||||
@@ -1076,7 +1072,7 @@ dependencies = [
|
||||
"num-traits",
|
||||
"serde",
|
||||
"wasm-bindgen",
|
||||
"windows-targets 0.48.0",
|
||||
"windows-targets 0.52.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1103,7 +1099,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b"
|
||||
dependencies = [
|
||||
"ciborium-io",
|
||||
"half",
|
||||
"half 1.8.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1233,8 +1229,10 @@ dependencies = [
|
||||
"serde_json",
|
||||
"signal-hook",
|
||||
"tar",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"tracing",
|
||||
@@ -1463,26 +1461,21 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-deque"
|
||||
version = "0.8.3"
|
||||
version = "0.8.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
|
||||
checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"crossbeam-epoch",
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-epoch"
|
||||
version = "0.9.14"
|
||||
version = "0.9.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695"
|
||||
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"cfg-if",
|
||||
"crossbeam-utils",
|
||||
"memoffset 0.8.0",
|
||||
"scopeguard",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1596,7 +1589,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"hashbrown 0.14.0",
|
||||
"hashbrown 0.14.5",
|
||||
"lock_api",
|
||||
"once_cell",
|
||||
"parking_lot_core 0.9.8",
|
||||
@@ -1973,21 +1966,6 @@ version = "1.0.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
|
||||
|
||||
[[package]]
|
||||
name = "foreign-types"
|
||||
version = "0.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
|
||||
dependencies = [
|
||||
"foreign-types-shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "foreign-types-shared"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
|
||||
|
||||
[[package]]
|
||||
name = "form_urlencoded"
|
||||
version = "1.1.0"
|
||||
@@ -1997,6 +1975,27 @@ dependencies = [
|
||||
"percent-encoding",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "framed-websockets"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/neondatabase/framed-websockets#34eff3d6f8cfccbc5f35e4f65314ff7328621127"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"bytemuck",
|
||||
"bytes",
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
"http-body-util",
|
||||
"hyper 1.2.0",
|
||||
"hyper-util",
|
||||
"pin-project",
|
||||
"rand 0.8.5",
|
||||
"sha1",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fs2"
|
||||
version = "0.4.3"
|
||||
@@ -2249,6 +2248,17 @@ version = "1.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"
|
||||
|
||||
[[package]]
|
||||
name = "half"
|
||||
version = "2.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"crunchy",
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hash32"
|
||||
version = "0.3.1"
|
||||
@@ -2275,9 +2285,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.14.0"
|
||||
version = "0.14.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
|
||||
checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"allocator-api2",
|
||||
@@ -2285,11 +2295,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "hashlink"
|
||||
version = "0.8.4"
|
||||
version = "0.9.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7"
|
||||
checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af"
|
||||
dependencies = [
|
||||
"hashbrown 0.14.0",
|
||||
"hashbrown 0.14.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2585,34 +2595,6 @@ dependencies = [
|
||||
"tokio-io-timeout",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper-tls"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"hyper 0.14.26",
|
||||
"native-tls",
|
||||
"tokio",
|
||||
"tokio-native-tls",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper-tungstenite"
|
||||
version = "0.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a343d17fe7885302ed7252767dc7bb83609a874b6ff581142241ec4b73957ad"
|
||||
dependencies = [
|
||||
"http-body-util",
|
||||
"hyper 1.2.0",
|
||||
"hyper-util",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
"tokio-tungstenite 0.21.0",
|
||||
"tungstenite 0.21.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper-util"
|
||||
version = "0.1.3"
|
||||
@@ -2690,7 +2672,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e"
|
||||
dependencies = [
|
||||
"equivalent",
|
||||
"hashbrown 0.14.0",
|
||||
"hashbrown 0.14.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2952,7 +2934,7 @@ version = "0.12.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d3262e75e648fce39813cb56ac41f3c3e3f65217ebf3844d818d1f9398cfb0dc"
|
||||
dependencies = [
|
||||
"hashbrown 0.14.0",
|
||||
"hashbrown 0.14.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3005,7 +2987,7 @@ checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"crossbeam-utils",
|
||||
"hashbrown 0.14.0",
|
||||
"hashbrown 0.14.5",
|
||||
"itoa",
|
||||
"lasso",
|
||||
"measured-derive",
|
||||
@@ -3148,24 +3130,6 @@ version = "0.8.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
|
||||
|
||||
[[package]]
|
||||
name = "native-tls"
|
||||
version = "0.2.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
"libc",
|
||||
"log",
|
||||
"openssl",
|
||||
"openssl-probe",
|
||||
"openssl-sys",
|
||||
"schannel",
|
||||
"security-framework",
|
||||
"security-framework-sys",
|
||||
"tempfile",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nix"
|
||||
version = "0.25.1"
|
||||
@@ -3394,55 +3358,15 @@ version = "11.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
|
||||
|
||||
[[package]]
|
||||
name = "openssl"
|
||||
version = "0.10.60"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "79a4c6c3a2b158f7f8f2a2fc5a969fa3a068df6fc9dbb4a43845436e3af7c800"
|
||||
dependencies = [
|
||||
"bitflags 2.4.1",
|
||||
"cfg-if",
|
||||
"foreign-types",
|
||||
"libc",
|
||||
"once_cell",
|
||||
"openssl-macros",
|
||||
"openssl-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "openssl-macros"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "openssl-probe"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
|
||||
|
||||
[[package]]
|
||||
name = "openssl-sys"
|
||||
version = "0.9.96"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3812c071ba60da8b5677cc12bcb1d42989a65553772897a7e0355545a819838f"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
"pkg-config",
|
||||
"vcpkg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry"
|
||||
version = "0.20.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9591d937bc0e6d2feb6f71a559540ab300ea49955229c347a517a28d27784c54"
|
||||
dependencies = [
|
||||
"opentelemetry_api",
|
||||
"opentelemetry_sdk",
|
||||
@@ -3451,8 +3375,6 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "opentelemetry-http"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c7594ec0e11d8e33faf03530a4c49af7064ebba81c1480e01be67d90b356508b"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
@@ -3464,8 +3386,6 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "opentelemetry-otlp"
|
||||
version = "0.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7e5e5a5c4135864099f3faafbe939eb4d7f9b80ebf68a8448da961b32a7c1275"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"futures-core",
|
||||
@@ -3485,8 +3405,6 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "opentelemetry-proto"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b1e3f814aa9f8c905d0ee4bde026afd3b2577a97c10e1699912e3e44f0c4cbeb"
|
||||
dependencies = [
|
||||
"opentelemetry_api",
|
||||
"opentelemetry_sdk",
|
||||
@@ -3497,8 +3415,6 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "opentelemetry-semantic-conventions"
|
||||
version = "0.12.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "73c9f9340ad135068800e7f1b24e9e09ed9e7143f5bf8518ded3d3ec69789269"
|
||||
dependencies = [
|
||||
"opentelemetry",
|
||||
]
|
||||
@@ -3506,8 +3422,6 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "opentelemetry_api"
|
||||
version = "0.20.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8a81f725323db1b1206ca3da8bb19874bbd3f57c3bcd59471bfb04525b265b9b"
|
||||
dependencies = [
|
||||
"futures-channel",
|
||||
"futures-util",
|
||||
@@ -3522,8 +3436,6 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "opentelemetry_sdk"
|
||||
version = "0.20.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fa8e705a0612d48139799fcbaba0d4a90f06277153e43dd2bdc16c6f0edd8026"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"crossbeam-channel",
|
||||
@@ -3567,7 +3479,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79"
|
||||
dependencies = [
|
||||
"dlv-list",
|
||||
"hashbrown 0.14.0",
|
||||
"hashbrown 0.14.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3888,13 +3800,14 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "parquet"
|
||||
version = "49.0.0"
|
||||
source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
|
||||
version = "51.0.0"
|
||||
source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"hashbrown 0.14.0",
|
||||
"half 2.4.1",
|
||||
"hashbrown 0.14.5",
|
||||
"num",
|
||||
"num-bigint",
|
||||
"paste",
|
||||
@@ -3902,12 +3815,13 @@ dependencies = [
|
||||
"thrift",
|
||||
"twox-hash",
|
||||
"zstd",
|
||||
"zstd-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parquet_derive"
|
||||
version = "49.0.0"
|
||||
source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
|
||||
version = "51.0.0"
|
||||
source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
|
||||
dependencies = [
|
||||
"parquet",
|
||||
"proc-macro2",
|
||||
@@ -3934,9 +3848,9 @@ checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c"
|
||||
|
||||
[[package]]
|
||||
name = "pbkdf2"
|
||||
version = "0.12.1"
|
||||
version = "0.12.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31"
|
||||
checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2"
|
||||
dependencies = [
|
||||
"digest",
|
||||
"hmac",
|
||||
@@ -4083,17 +3997,6 @@ dependencies = [
|
||||
"tokio-postgres",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postgres-native-tls"
|
||||
version = "0.5.0"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
dependencies = [
|
||||
"native-tls",
|
||||
"tokio",
|
||||
"tokio-native-tls",
|
||||
"tokio-postgres",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.4"
|
||||
@@ -4359,6 +4262,7 @@ dependencies = [
|
||||
name = "proxy"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"anyhow",
|
||||
"async-compression",
|
||||
"async-trait",
|
||||
@@ -4375,12 +4279,14 @@ dependencies = [
|
||||
"chrono",
|
||||
"clap",
|
||||
"consumption_metrics",
|
||||
"crossbeam-deque",
|
||||
"dashmap",
|
||||
"env_logger",
|
||||
"fallible-iterator",
|
||||
"framed-websockets",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hashbrown 0.13.2",
|
||||
"hashbrown 0.14.5",
|
||||
"hashlink",
|
||||
"hex",
|
||||
"hmac",
|
||||
@@ -4390,7 +4296,6 @@ dependencies = [
|
||||
"humantime",
|
||||
"hyper 0.14.26",
|
||||
"hyper 1.2.0",
|
||||
"hyper-tungstenite",
|
||||
"hyper-util",
|
||||
"indexmap 2.0.1",
|
||||
"ipnet",
|
||||
@@ -4399,7 +4304,6 @@ dependencies = [
|
||||
"md5",
|
||||
"measured",
|
||||
"metrics",
|
||||
"native-tls",
|
||||
"once_cell",
|
||||
"opentelemetry",
|
||||
"parking_lot 0.12.1",
|
||||
@@ -4407,7 +4311,6 @@ dependencies = [
|
||||
"parquet_derive",
|
||||
"pbkdf2",
|
||||
"pin-project-lite",
|
||||
"postgres-native-tls",
|
||||
"postgres-protocol",
|
||||
"postgres_backend",
|
||||
"pq_proto",
|
||||
@@ -4435,7 +4338,6 @@ dependencies = [
|
||||
"smol_str",
|
||||
"socket2 0.5.5",
|
||||
"subtle",
|
||||
"sync_wrapper",
|
||||
"task-local-extensions",
|
||||
"thiserror",
|
||||
"tikv-jemalloc-ctl",
|
||||
@@ -4444,6 +4346,7 @@ dependencies = [
|
||||
"tokio-postgres",
|
||||
"tokio-postgres-rustls",
|
||||
"tokio-rustls 0.25.0",
|
||||
"tokio-tungstenite",
|
||||
"tokio-util",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
@@ -4455,7 +4358,7 @@ dependencies = [
|
||||
"utils",
|
||||
"uuid",
|
||||
"walkdir",
|
||||
"webpki-roots 0.25.2",
|
||||
"webpki-roots 0.26.1",
|
||||
"workspace_hack",
|
||||
"x509-parser",
|
||||
]
|
||||
@@ -4762,20 +4665,21 @@ dependencies = [
|
||||
"http 0.2.9",
|
||||
"http-body 0.4.5",
|
||||
"hyper 0.14.26",
|
||||
"hyper-tls",
|
||||
"hyper-rustls 0.24.0",
|
||||
"ipnet",
|
||||
"js-sys",
|
||||
"log",
|
||||
"mime",
|
||||
"native-tls",
|
||||
"once_cell",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"rustls 0.21.11",
|
||||
"rustls-pemfile 1.0.2",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_urlencoded",
|
||||
"tokio",
|
||||
"tokio-native-tls",
|
||||
"tokio-rustls 0.24.0",
|
||||
"tokio-util",
|
||||
"tower-service",
|
||||
"url",
|
||||
@@ -4783,6 +4687,7 @@ dependencies = [
|
||||
"wasm-bindgen-futures",
|
||||
"wasm-streams 0.3.0",
|
||||
"web-sys",
|
||||
"webpki-roots 0.25.2",
|
||||
"winreg 0.50.0",
|
||||
]
|
||||
|
||||
@@ -5208,20 +5113,20 @@ dependencies = [
|
||||
"hex",
|
||||
"histogram",
|
||||
"itertools",
|
||||
"native-tls",
|
||||
"pageserver",
|
||||
"pageserver_api",
|
||||
"postgres-native-tls",
|
||||
"postgres_ffi",
|
||||
"rand 0.8.5",
|
||||
"remote_storage",
|
||||
"reqwest 0.12.4",
|
||||
"rustls 0.22.4",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-postgres-rustls",
|
||||
"tokio-rustls 0.25.0",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
@@ -5229,6 +5134,7 @@ dependencies = [
|
||||
"tracing-appender",
|
||||
"tracing-subscriber",
|
||||
"utils",
|
||||
"webpki-roots 0.26.1",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
@@ -5819,6 +5725,15 @@ version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
|
||||
|
||||
[[package]]
|
||||
name = "statx-sys"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "69c325f46f705b7a66fb87f0ebb999524a7363f30f05d373277b4ef7f409fe87"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "storage_broker"
|
||||
version = "0.1.0"
|
||||
@@ -6242,7 +6157,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-epoll-uring"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=arpad/statx_sys#ca8446b8edb5e0aef88520f2fc209a13a834fd25"
|
||||
dependencies = [
|
||||
"futures",
|
||||
"nix 0.26.4",
|
||||
@@ -6276,16 +6191,6 @@ dependencies = [
|
||||
"syn 2.0.52",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-native-tls"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2"
|
||||
dependencies = [
|
||||
"native-tls",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.7"
|
||||
@@ -6312,10 +6217,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-postgres-rustls"
|
||||
version = "0.11.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677"
|
||||
dependencies = [
|
||||
"futures",
|
||||
"ring 0.17.6",
|
||||
"rustls 0.22.4",
|
||||
"tokio",
|
||||
@@ -6380,19 +6282,7 @@ dependencies = [
|
||||
"futures-util",
|
||||
"log",
|
||||
"tokio",
|
||||
"tungstenite 0.20.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-tungstenite"
|
||||
version = "0.21.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c83b561d025642014097b66e6c1bb422783339e0909e4429cde4749d1990bc38"
|
||||
dependencies = [
|
||||
"futures-util",
|
||||
"log",
|
||||
"tokio",
|
||||
"tungstenite 0.21.0",
|
||||
"tungstenite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6406,7 +6296,7 @@ dependencies = [
|
||||
"futures-io",
|
||||
"futures-sink",
|
||||
"futures-util",
|
||||
"hashbrown 0.14.0",
|
||||
"hashbrown 0.14.5",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
"tracing",
|
||||
@@ -6688,25 +6578,6 @@ dependencies = [
|
||||
"utf-8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tungstenite"
|
||||
version = "0.21.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9ef1a641ea34f399a848dea702823bbecfb4c486f911735368f1f137cb8257e1"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"data-encoding",
|
||||
"http 1.1.0",
|
||||
"httparse",
|
||||
"log",
|
||||
"rand 0.8.5",
|
||||
"sha1",
|
||||
"thiserror",
|
||||
"url",
|
||||
"utf-8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "twox-hash"
|
||||
version = "1.6.3"
|
||||
@@ -6804,11 +6675,12 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "uring-common"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=arpad/statx_sys#ca8446b8edb5e0aef88520f2fc209a13a834fd25"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"io-uring",
|
||||
"libc",
|
||||
"statx-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7477,6 +7349,7 @@ dependencies = [
|
||||
name = "workspace_hack"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"anyhow",
|
||||
"aws-config",
|
||||
"aws-runtime",
|
||||
@@ -7502,7 +7375,7 @@ dependencies = [
|
||||
"futures-sink",
|
||||
"futures-util",
|
||||
"getrandom 0.2.11",
|
||||
"hashbrown 0.14.0",
|
||||
"hashbrown 0.14.5",
|
||||
"hex",
|
||||
"hmac",
|
||||
"hyper 0.14.26",
|
||||
@@ -7635,9 +7508,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "zeroize"
|
||||
version = "1.6.0"
|
||||
version = "1.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9"
|
||||
checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d"
|
||||
dependencies = [
|
||||
"zeroize_derive",
|
||||
]
|
||||
|
||||
41
Cargo.toml
41
Cargo.toml
@@ -41,14 +41,15 @@ license = "Apache-2.0"
|
||||
|
||||
## All dependency versions, used in the project
|
||||
[workspace.dependencies]
|
||||
ahash = "0.8"
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
arc-swap = "1.6"
|
||||
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
|
||||
atomic-take = "1.1.0"
|
||||
azure_core = "0.19"
|
||||
azure_identity = "0.19"
|
||||
azure_storage = "0.19"
|
||||
azure_storage_blobs = "0.19"
|
||||
azure_core = { path = "../azure-sdk-for-rust/sdk/core" } # "0.19"
|
||||
azure_identity = { path = "../azure-sdk-for-rust/sdk/identity" } # "0.19"
|
||||
azure_storage = { path = "../azure-sdk-for-rust/sdk/storage" } # "0.19"
|
||||
azure_storage_blobs = { path = "../azure-sdk-for-rust/sdk/storage_blobs" } # "0.19"
|
||||
flate2 = "1.0.26"
|
||||
async-stream = "0.3"
|
||||
async-trait = "0.1"
|
||||
@@ -74,6 +75,7 @@ clap = { version = "4.0", features = ["derive"] }
|
||||
comfy-table = "6.1"
|
||||
const_format = "0.2"
|
||||
crc32c = "0.6"
|
||||
crossbeam-deque = "0.8.5"
|
||||
crossbeam-utils = "0.8.5"
|
||||
dashmap = { version = "5.5.0", features = ["raw-api"] }
|
||||
either = "1.8"
|
||||
@@ -81,13 +83,14 @@ enum-map = "2.4.2"
|
||||
enumset = "1.0.12"
|
||||
fail = "0.5.0"
|
||||
fallible-iterator = "0.2"
|
||||
framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" }
|
||||
fs2 = "0.4.3"
|
||||
futures = "0.3"
|
||||
futures-core = "0.3"
|
||||
futures-util = "0.3"
|
||||
git-version = "0.3"
|
||||
hashbrown = "0.13"
|
||||
hashlink = "0.8.4"
|
||||
hashbrown = "0.14"
|
||||
hashlink = "0.9.1"
|
||||
hdrhistogram = "7.5.2"
|
||||
hex = "0.4"
|
||||
hex-literal = "0.4"
|
||||
@@ -98,7 +101,7 @@ http-types = { version = "2", default-features = false }
|
||||
humantime = "2.1"
|
||||
humantime-serde = "1.1.1"
|
||||
hyper = "0.14"
|
||||
hyper-tungstenite = "0.13.0"
|
||||
tokio-tungstenite = "0.20.0"
|
||||
indexmap = "2"
|
||||
inotify = "0.10.2"
|
||||
ipnet = "2.9.0"
|
||||
@@ -111,18 +114,17 @@ md5 = "0.7.0"
|
||||
measured = { version = "0.0.21", features=["lasso"] }
|
||||
measured-process = { version = "0.0.21" }
|
||||
memoffset = "0.8"
|
||||
native-tls = "0.2"
|
||||
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
|
||||
notify = "6.0.0"
|
||||
num_cpus = "1.15"
|
||||
num-traits = "0.2.15"
|
||||
once_cell = "1.13"
|
||||
opentelemetry = "0.20.0"
|
||||
opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-otlp = { path="../opentelemetry-rust/opentelemetry-otlp", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-semantic-conventions = "0.12.0"
|
||||
parking_lot = "0.12"
|
||||
parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
|
||||
parquet_derive = "49.0.0"
|
||||
parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
|
||||
parquet_derive = "51.0.0"
|
||||
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
|
||||
pin-project-lite = "0.2"
|
||||
procfs = "0.14"
|
||||
@@ -168,9 +170,9 @@ thiserror = "1.0"
|
||||
tikv-jemallocator = "0.5"
|
||||
tikv-jemalloc-ctl = "0.5"
|
||||
tokio = { version = "1.17", features = ["macros"] }
|
||||
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
|
||||
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "arpad/statx_sys" }
|
||||
tokio-io-timeout = "1.2.0"
|
||||
tokio-postgres-rustls = "0.11.0"
|
||||
#tokio-postgres-rustls = "0.11.0"
|
||||
tokio-rustls = "0.25"
|
||||
tokio-stream = "0.1"
|
||||
tokio-tar = "0.3"
|
||||
@@ -188,7 +190,7 @@ url = "2.2"
|
||||
urlencoding = "2.1"
|
||||
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
|
||||
walkdir = "2.3.2"
|
||||
webpki-roots = "0.25"
|
||||
webpki-roots = "0.26"
|
||||
x509-parser = "0.15"
|
||||
|
||||
## TODO replace this with tracing
|
||||
@@ -197,7 +199,7 @@ log = "0.4"
|
||||
|
||||
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
tokio-postgres-rustls = {path = "../tokio-postgres-rustls"}
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
@@ -238,13 +240,18 @@ tonic-build = "0.9"
|
||||
|
||||
[patch.crates-io]
|
||||
|
||||
opentelemetry_api = { path = "../opentelemetry-rust/opentelemetry-api" }
|
||||
opentelemetry_sdk = { path = "../opentelemetry-rust/opentelemetry-sdk" }
|
||||
opentelemetry-semantic-conventions = { path = "../opentelemetry-rust/opentelemetry-semantic-conventions" }
|
||||
opentelemetry = { path = "../opentelemetry-rust/opentelemetry" }
|
||||
|
||||
# This is only needed for proxy's tests.
|
||||
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
|
||||
# bug fixes for UUID
|
||||
parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
|
||||
parquet_derive = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
|
||||
parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" }
|
||||
parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" }
|
||||
|
||||
################# Binary contents sections
|
||||
|
||||
|
||||
@@ -241,11 +241,17 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
|
||||
FROM build-deps AS vector-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
|
||||
echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
|
||||
COPY patches/pgvector.patch /pgvector.patch
|
||||
|
||||
# By default, pgvector Makefile uses `-march=native`. We don't want that,
|
||||
# because we build the images on different machines than where we run them.
|
||||
# Pass OPTFLAGS="" to remove it.
|
||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \
|
||||
echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \
|
||||
mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
patch -p1 < /pgvector.patch && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
|
||||
|
||||
#########################################################################################
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
[](https://neon.tech)
|
||||
[](https://neon.tech)
|
||||
|
||||
|
||||
|
||||
# Neon
|
||||
|
||||
|
||||
@@ -23,14 +23,16 @@ serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
signal-hook.workspace = true
|
||||
tar.workspace = true
|
||||
reqwest = { workspace = true, features = ["json"] }
|
||||
reqwest = { workspace = true, features = ["json", "rustls-tls"] }
|
||||
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
||||
tokio-postgres.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-opentelemetry.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
tracing-utils.workspace = true
|
||||
thiserror.workspace = true
|
||||
url.workspace = true
|
||||
|
||||
compute_api.workspace = true
|
||||
|
||||
116
compute_tools/src/catalog.rs
Normal file
116
compute_tools/src/catalog.rs
Normal file
@@ -0,0 +1,116 @@
|
||||
use compute_api::{
|
||||
responses::CatalogObjects,
|
||||
spec::{Database, Role},
|
||||
};
|
||||
use futures::Stream;
|
||||
use postgres::{Client, NoTls};
|
||||
use std::{path::Path, process::Stdio, result::Result, sync::Arc};
|
||||
use tokio::{
|
||||
io::{AsyncBufReadExt, BufReader},
|
||||
process::Command,
|
||||
task,
|
||||
};
|
||||
use tokio_stream::{self as stream, StreamExt};
|
||||
use tokio_util::codec::{BytesCodec, FramedRead};
|
||||
use tracing::warn;
|
||||
|
||||
use crate::{
|
||||
compute::ComputeNode,
|
||||
pg_helpers::{get_existing_dbs, get_existing_roles},
|
||||
};
|
||||
|
||||
pub async fn get_dbs_and_roles(compute: &Arc<ComputeNode>) -> anyhow::Result<CatalogObjects> {
|
||||
let connstr = compute.connstr.clone();
|
||||
task::spawn_blocking(move || {
|
||||
let mut client = Client::connect(connstr.as_str(), NoTls)?;
|
||||
let roles: Vec<Role>;
|
||||
{
|
||||
let mut xact = client.transaction()?;
|
||||
roles = get_existing_roles(&mut xact)?;
|
||||
}
|
||||
let databases: Vec<Database> = get_existing_dbs(&mut client)?.values().cloned().collect();
|
||||
|
||||
Ok(CatalogObjects { roles, databases })
|
||||
})
|
||||
.await?
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum SchemaDumpError {
|
||||
#[error("Database does not exist.")]
|
||||
DatabaseDoesNotExist,
|
||||
#[error("Failed to execute pg_dump.")]
|
||||
IO(#[from] std::io::Error),
|
||||
}
|
||||
|
||||
// It uses the pg_dump utility to dump the schema of the specified database.
|
||||
// The output is streamed back to the caller and supposed to be streamed via HTTP.
|
||||
//
|
||||
// Before return the result with the output, it checks that pg_dump produced any output.
|
||||
// If not, it tries to parse the stderr output to determine if the database does not exist
|
||||
// and special error is returned.
|
||||
//
|
||||
// To make sure that the process is killed when the caller drops the stream, we use tokio kill_on_drop feature.
|
||||
pub async fn get_database_schema(
|
||||
compute: &Arc<ComputeNode>,
|
||||
dbname: &str,
|
||||
) -> Result<impl Stream<Item = Result<bytes::Bytes, std::io::Error>>, SchemaDumpError> {
|
||||
let pgbin = &compute.pgbin;
|
||||
let basepath = Path::new(pgbin).parent().unwrap();
|
||||
let pgdump = basepath.join("pg_dump");
|
||||
let mut connstr = compute.connstr.clone();
|
||||
connstr.set_path(dbname);
|
||||
let mut cmd = Command::new(pgdump)
|
||||
.arg("--schema-only")
|
||||
.arg(connstr.as_str())
|
||||
.stdout(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.kill_on_drop(true)
|
||||
.spawn()?;
|
||||
|
||||
let stdout = cmd.stdout.take().ok_or_else(|| {
|
||||
std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stdout.")
|
||||
})?;
|
||||
|
||||
let stderr = cmd.stderr.take().ok_or_else(|| {
|
||||
std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stderr.")
|
||||
})?;
|
||||
|
||||
let mut stdout_reader = FramedRead::new(stdout, BytesCodec::new());
|
||||
let stderr_reader = BufReader::new(stderr);
|
||||
|
||||
let first_chunk = match stdout_reader.next().await {
|
||||
Some(Ok(bytes)) if !bytes.is_empty() => bytes,
|
||||
Some(Err(e)) => {
|
||||
return Err(SchemaDumpError::IO(e));
|
||||
}
|
||||
_ => {
|
||||
let mut lines = stderr_reader.lines();
|
||||
if let Some(line) = lines.next_line().await? {
|
||||
if line.contains(&format!("FATAL: database \"{}\" does not exist", dbname)) {
|
||||
return Err(SchemaDumpError::DatabaseDoesNotExist);
|
||||
}
|
||||
warn!("pg_dump stderr: {}", line)
|
||||
}
|
||||
tokio::spawn(async move {
|
||||
while let Ok(Some(line)) = lines.next_line().await {
|
||||
warn!("pg_dump stderr: {}", line)
|
||||
}
|
||||
});
|
||||
|
||||
return Err(SchemaDumpError::IO(std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
"failed to start pg_dump",
|
||||
)));
|
||||
}
|
||||
};
|
||||
let initial_stream = stream::once(Ok(first_chunk.freeze()));
|
||||
// Consume stderr and log warnings
|
||||
tokio::spawn(async move {
|
||||
let mut lines = stderr_reader.lines();
|
||||
while let Ok(Some(line)) = lines.next_line().await {
|
||||
warn!("pg_dump stderr: {}", line)
|
||||
}
|
||||
});
|
||||
Ok(initial_stream.chain(stdout_reader.map(|res| res.map(|b| b.freeze()))))
|
||||
}
|
||||
@@ -5,17 +5,21 @@ use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
||||
use crate::catalog::SchemaDumpError;
|
||||
use crate::catalog::{get_database_schema, get_dbs_and_roles};
|
||||
use crate::compute::forward_termination_signal;
|
||||
use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
|
||||
use compute_api::requests::ConfigurationRequest;
|
||||
use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
|
||||
|
||||
use anyhow::Result;
|
||||
use hyper::header::CONTENT_TYPE;
|
||||
use hyper::service::{make_service_fn, service_fn};
|
||||
use hyper::{Body, Method, Request, Response, Server, StatusCode};
|
||||
use tokio::task;
|
||||
use tracing::{error, info, warn};
|
||||
use tracing_utils::http::OtelName;
|
||||
use utils::http::request::must_get_query_param;
|
||||
|
||||
fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
|
||||
ComputeStatusResponse {
|
||||
@@ -133,6 +137,34 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
||||
}
|
||||
}
|
||||
|
||||
(&Method::GET, "/dbs_and_roles") => {
|
||||
info!("serving /dbs_and_roles GET request",);
|
||||
match get_dbs_and_roles(compute).await {
|
||||
Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
|
||||
Err(_) => {
|
||||
render_json_error("can't get dbs and roles", StatusCode::INTERNAL_SERVER_ERROR)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
(&Method::GET, "/database_schema") => {
|
||||
let database = match must_get_query_param(&req, "database") {
|
||||
Err(e) => return e.into_response(),
|
||||
Ok(database) => database,
|
||||
};
|
||||
info!("serving /database_schema GET request with database: {database}",);
|
||||
match get_database_schema(compute, &database).await {
|
||||
Ok(res) => render_plain(Body::wrap_stream(res)),
|
||||
Err(SchemaDumpError::DatabaseDoesNotExist) => {
|
||||
render_json_error("database does not exist", StatusCode::NOT_FOUND)
|
||||
}
|
||||
Err(e) => {
|
||||
error!("can't get schema dump: {}", e);
|
||||
render_json_error("can't get schema dump", StatusCode::INTERNAL_SERVER_ERROR)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// download extension files from remote extension storage on demand
|
||||
(&Method::POST, route) if route.starts_with("/extension_server/") => {
|
||||
info!("serving {:?} POST request", route);
|
||||
@@ -303,10 +335,25 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
|
||||
};
|
||||
Response::builder()
|
||||
.status(status)
|
||||
.header(CONTENT_TYPE, "application/json")
|
||||
.body(Body::from(serde_json::to_string(&error).unwrap()))
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn render_json(body: Body) -> Response<Body> {
|
||||
Response::builder()
|
||||
.header(CONTENT_TYPE, "application/json")
|
||||
.body(body)
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn render_plain(body: Body) -> Response<Body> {
|
||||
Response::builder()
|
||||
.header(CONTENT_TYPE, "text/plain")
|
||||
.body(body)
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
|
||||
{
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
|
||||
@@ -68,6 +68,51 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Info"
|
||||
|
||||
/dbs_and_roles:
|
||||
get:
|
||||
tags:
|
||||
- Info
|
||||
summary: Get databases and roles in the catalog.
|
||||
description: ""
|
||||
operationId: getDbsAndRoles
|
||||
responses:
|
||||
200:
|
||||
description: Compute schema objects
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/DbsAndRoles"
|
||||
|
||||
/database_schema:
|
||||
get:
|
||||
tags:
|
||||
- Info
|
||||
summary: Get schema dump
|
||||
parameters:
|
||||
- name: database
|
||||
in: query
|
||||
description: Database name to dump.
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
example: "postgres"
|
||||
description: Get schema dump in SQL format.
|
||||
operationId: getDatabaseSchema
|
||||
responses:
|
||||
200:
|
||||
description: Schema dump
|
||||
content:
|
||||
text/plain:
|
||||
schema:
|
||||
type: string
|
||||
description: Schema dump in SQL format.
|
||||
404:
|
||||
description: Non existing database.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
|
||||
/check_writability:
|
||||
post:
|
||||
tags:
|
||||
@@ -229,6 +274,73 @@ components:
|
||||
num_cpus:
|
||||
type: integer
|
||||
|
||||
DbsAndRoles:
|
||||
type: object
|
||||
description: Databases and Roles
|
||||
required:
|
||||
- roles
|
||||
- databases
|
||||
properties:
|
||||
roles:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/Role"
|
||||
databases:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/Database"
|
||||
|
||||
Database:
|
||||
type: object
|
||||
description: Database
|
||||
required:
|
||||
- name
|
||||
- owner
|
||||
- restrict_conn
|
||||
- invalid
|
||||
properties:
|
||||
name:
|
||||
type: string
|
||||
owner:
|
||||
type: string
|
||||
options:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/GenericOption"
|
||||
restrict_conn:
|
||||
type: boolean
|
||||
invalid:
|
||||
type: boolean
|
||||
|
||||
Role:
|
||||
type: object
|
||||
description: Role
|
||||
required:
|
||||
- name
|
||||
properties:
|
||||
name:
|
||||
type: string
|
||||
encrypted_password:
|
||||
type: string
|
||||
options:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/GenericOption"
|
||||
|
||||
GenericOption:
|
||||
type: object
|
||||
description: Schema Generic option
|
||||
required:
|
||||
- name
|
||||
- vartype
|
||||
properties:
|
||||
name:
|
||||
type: string
|
||||
value:
|
||||
type: string
|
||||
vartype:
|
||||
type: string
|
||||
|
||||
ComputeState:
|
||||
type: object
|
||||
required:
|
||||
|
||||
@@ -8,6 +8,7 @@ pub mod configurator;
|
||||
pub mod http;
|
||||
#[macro_use]
|
||||
pub mod logger;
|
||||
pub mod catalog;
|
||||
pub mod compute;
|
||||
pub mod extension_server;
|
||||
pub mod monitor;
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use tracing::warn;
|
||||
|
||||
@@ -17,17 +19,24 @@ pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> {
|
||||
.arg(size_bytes.to_string())
|
||||
.spawn();
|
||||
|
||||
if matches!(&child_result, Err(e) if e.kind() == std::io::ErrorKind::NotFound) {
|
||||
warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
child_result
|
||||
.context("spawn() failed")
|
||||
.and_then(|mut child| child.wait().context("wait() failed"))
|
||||
.and_then(|status| match status.success() {
|
||||
true => Ok(()),
|
||||
false => Err(anyhow!("process exited with {status}")),
|
||||
false => {
|
||||
// The command failed. Maybe it was because the resize-swap file doesn't exist?
|
||||
// The --once flag causes it to delete itself on success so we don't disable swap
|
||||
// while postgres is running; maybe this is fine.
|
||||
match Path::new(RESIZE_SWAP_BIN).try_exists() {
|
||||
Err(_) | Ok(true) => Err(anyhow!("process exited with {status}")),
|
||||
// The path doesn't exist; we're actually ok
|
||||
Ok(false) => {
|
||||
warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
|
||||
Ok(())
|
||||
},
|
||||
}
|
||||
}
|
||||
})
|
||||
// wrap any prior error with the overall context that we couldn't run the command
|
||||
.with_context(|| {
|
||||
|
||||
@@ -20,7 +20,7 @@ hex.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
hyper.workspace = true
|
||||
regex.workspace = true
|
||||
reqwest = { workspace = true, features = ["blocking", "json"] }
|
||||
reqwest = { workspace = true, features = ["blocking", "json", "rustls-tls"] }
|
||||
scopeguard.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
|
||||
@@ -152,6 +152,9 @@ pub struct NeonStorageControllerConf {
|
||||
/// Heartbeat timeout before marking a node offline
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub max_unavailable: Duration,
|
||||
|
||||
/// Threshold for auto-splitting a tenant into shards
|
||||
pub split_threshold: Option<u64>,
|
||||
}
|
||||
|
||||
impl NeonStorageControllerConf {
|
||||
@@ -164,6 +167,7 @@ impl Default for NeonStorageControllerConf {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL,
|
||||
split_threshold: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -243,9 +243,13 @@ impl StorageController {
|
||||
anyhow::bail!("initdb failed with status {status}");
|
||||
}
|
||||
|
||||
// Write a minimal config file:
|
||||
// - Specify the port, since this is chosen dynamically
|
||||
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
|
||||
// the storage controller we don't want a slow local disk to interfere with that.
|
||||
tokio::fs::write(
|
||||
&pg_data_path.join("postgresql.conf"),
|
||||
format!("port = {}", self.postgres_port),
|
||||
format!("port = {}\nfsync=off\n", self.postgres_port),
|
||||
)
|
||||
.await?;
|
||||
};
|
||||
@@ -305,6 +309,10 @@ impl StorageController {
|
||||
));
|
||||
}
|
||||
|
||||
if let Some(split_threshold) = self.config.split_threshold.as_ref() {
|
||||
args.push(format!("--split-threshold={split_threshold}"))
|
||||
}
|
||||
|
||||
background_process::start_process(
|
||||
COMMAND,
|
||||
&self.env.base_data_dir,
|
||||
|
||||
@@ -12,7 +12,7 @@ comfy-table.workspace = true
|
||||
hyper.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_client.workspace = true
|
||||
reqwest.workspace = true
|
||||
reqwest = { workspace = true }
|
||||
serde.workspace = true
|
||||
serde_json = { workspace = true, features = ["raw_value"] }
|
||||
thiserror.workspace = true
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
ARG REPOSITORY=neondatabase
|
||||
ARG COMPUTE_IMAGE=compute-node-v14
|
||||
ARG TAG=latest
|
||||
|
||||
|
||||
@@ -8,8 +8,6 @@
|
||||
# Their defaults point at DockerHub `neondatabase/neon:latest` image.`,
|
||||
# to verify custom image builds (e.g pre-published ones).
|
||||
|
||||
# XXX: Current does not work on M1 macs due to x86_64 Docker images compiled only, and no seccomp support in M1 Docker emulation layer.
|
||||
|
||||
set -eux -o pipefail
|
||||
|
||||
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
use chrono::{DateTime, Utc};
|
||||
use serde::{Deserialize, Serialize, Serializer};
|
||||
|
||||
use crate::spec::ComputeSpec;
|
||||
use crate::spec::{ComputeSpec, Database, Role};
|
||||
|
||||
#[derive(Serialize, Debug, Deserialize)]
|
||||
pub struct GenericAPIError {
|
||||
@@ -113,6 +113,12 @@ pub struct ComputeMetrics {
|
||||
pub total_ext_download_size: u64,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default, Serialize)]
|
||||
pub struct CatalogObjects {
|
||||
pub roles: Vec<Role>,
|
||||
pub databases: Vec<Database>,
|
||||
}
|
||||
|
||||
/// Response of the `/computes/{compute_id}/spec` control-plane API.
|
||||
/// This is not actually a compute API response, so consider moving
|
||||
/// to a different place.
|
||||
|
||||
@@ -307,7 +307,7 @@ impl KeySpace {
|
||||
}
|
||||
|
||||
/// Merge another keyspace into the current one.
|
||||
/// Note: the keyspaces must not ovelap (enforced via assertions)
|
||||
/// Note: the keyspaces must not overlap (enforced via assertions). To merge overlapping key ranges, use `KeySpaceRandomAccum`.
|
||||
pub fn merge(&mut self, other: &KeySpace) {
|
||||
let all_ranges = self
|
||||
.ranges
|
||||
|
||||
@@ -9,7 +9,7 @@ use std::{
|
||||
collections::HashMap,
|
||||
io::{BufRead, Read},
|
||||
num::{NonZeroU64, NonZeroUsize},
|
||||
str::FromStr,
|
||||
sync::atomic::AtomicUsize,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
|
||||
@@ -161,6 +161,22 @@ impl std::fmt::Debug for TenantState {
|
||||
}
|
||||
}
|
||||
|
||||
/// A temporary lease to a specific lsn inside a timeline.
|
||||
/// Access to the lsn is guaranteed by the pageserver until the expiration indicated by `valid_until`.
|
||||
#[serde_as]
|
||||
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
|
||||
pub struct LsnLease {
|
||||
#[serde_as(as = "SystemTimeAsRfc3339Millis")]
|
||||
pub valid_until: SystemTime,
|
||||
}
|
||||
|
||||
serde_with::serde_conv!(
|
||||
SystemTimeAsRfc3339Millis,
|
||||
SystemTime,
|
||||
|time: &SystemTime| humantime::format_rfc3339_millis(*time).to_string(),
|
||||
|value: String| -> Result<_, humantime::TimestampError> { humantime::parse_rfc3339(&value) }
|
||||
);
|
||||
|
||||
/// The only [`TenantState`] variants we could be `TenantState::Activating` from.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub enum ActivatingFrom {
|
||||
@@ -289,7 +305,7 @@ pub struct TenantConfig {
|
||||
pub compaction_period: Option<String>,
|
||||
pub compaction_threshold: Option<usize>,
|
||||
// defer parsing compaction_algorithm, like eviction_policy
|
||||
pub compaction_algorithm: Option<CompactionAlgorithm>,
|
||||
pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
|
||||
pub gc_horizon: Option<u64>,
|
||||
pub gc_period: Option<String>,
|
||||
pub image_creation_threshold: Option<usize>,
|
||||
@@ -308,28 +324,100 @@ pub struct TenantConfig {
|
||||
pub switch_aux_file_policy: Option<AuxFilePolicy>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy`
|
||||
/// tenant config. When the first aux file written, the policy will be persisted in the
|
||||
/// `index_part.json` file and has a limited migration path.
|
||||
///
|
||||
/// Currently, we only allow the following migration path:
|
||||
///
|
||||
/// Unset -> V1
|
||||
/// -> V2
|
||||
/// -> CrossValidation -> V2
|
||||
#[derive(
|
||||
Eq,
|
||||
PartialEq,
|
||||
Debug,
|
||||
Copy,
|
||||
Clone,
|
||||
strum_macros::EnumString,
|
||||
strum_macros::Display,
|
||||
serde_with::DeserializeFromStr,
|
||||
serde_with::SerializeDisplay,
|
||||
)]
|
||||
#[strum(serialize_all = "kebab-case")]
|
||||
pub enum AuxFilePolicy {
|
||||
/// V1 aux file policy: store everything in AUX_FILE_KEY
|
||||
#[strum(ascii_case_insensitive)]
|
||||
V1,
|
||||
/// V2 aux file policy: store in the AUX_FILE keyspace
|
||||
#[strum(ascii_case_insensitive)]
|
||||
V2,
|
||||
/// Cross validation runs both formats on the write path and does validation
|
||||
/// on the read path.
|
||||
#[strum(ascii_case_insensitive)]
|
||||
CrossValidation,
|
||||
}
|
||||
|
||||
impl FromStr for AuxFilePolicy {
|
||||
type Err = anyhow::Error;
|
||||
impl AuxFilePolicy {
|
||||
pub fn is_valid_migration_path(from: Option<Self>, to: Self) -> bool {
|
||||
matches!(
|
||||
(from, to),
|
||||
(None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2)
|
||||
)
|
||||
}
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
let s = s.to_lowercase();
|
||||
if s == "v1" {
|
||||
Ok(Self::V1)
|
||||
} else if s == "v2" {
|
||||
Ok(Self::V2)
|
||||
} else if s == "crossvalidation" || s == "cross_validation" {
|
||||
Ok(Self::CrossValidation)
|
||||
} else {
|
||||
anyhow::bail!("cannot parse {} to aux file policy", s)
|
||||
/// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
|
||||
pub fn default_tenant_config() -> Self {
|
||||
Self::V1
|
||||
}
|
||||
}
|
||||
|
||||
/// The aux file policy memory flag. Users can store `Option<AuxFilePolicy>` into this atomic flag. 0 == unspecified.
|
||||
pub struct AtomicAuxFilePolicy(AtomicUsize);
|
||||
|
||||
impl AtomicAuxFilePolicy {
|
||||
pub fn new(policy: Option<AuxFilePolicy>) -> Self {
|
||||
Self(AtomicUsize::new(
|
||||
policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
|
||||
))
|
||||
}
|
||||
|
||||
pub fn load(&self) -> Option<AuxFilePolicy> {
|
||||
match self.0.load(std::sync::atomic::Ordering::Acquire) {
|
||||
0 => None,
|
||||
other => Some(AuxFilePolicy::from_usize(other)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn store(&self, policy: Option<AuxFilePolicy>) {
|
||||
self.0.store(
|
||||
policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
|
||||
std::sync::atomic::Ordering::Release,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
impl AuxFilePolicy {
|
||||
pub fn to_usize(self) -> usize {
|
||||
match self {
|
||||
Self::V1 => 1,
|
||||
Self::CrossValidation => 2,
|
||||
Self::V2 => 3,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn try_from_usize(this: usize) -> Option<Self> {
|
||||
match this {
|
||||
1 => Some(Self::V1),
|
||||
2 => Some(Self::CrossValidation),
|
||||
3 => Some(Self::V2),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_usize(this: usize) -> Self {
|
||||
Self::try_from_usize(this).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
@@ -350,13 +438,28 @@ impl EvictionPolicy {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(tag = "kind")]
|
||||
#[derive(
|
||||
Eq,
|
||||
PartialEq,
|
||||
Debug,
|
||||
Copy,
|
||||
Clone,
|
||||
strum_macros::EnumString,
|
||||
strum_macros::Display,
|
||||
serde_with::DeserializeFromStr,
|
||||
serde_with::SerializeDisplay,
|
||||
)]
|
||||
#[strum(serialize_all = "kebab-case")]
|
||||
pub enum CompactionAlgorithm {
|
||||
Legacy,
|
||||
Tiered,
|
||||
}
|
||||
|
||||
#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct CompactionAlgorithmSettings {
|
||||
pub kind: CompactionAlgorithm,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct EvictionPolicyLayerAccessThreshold {
|
||||
#[serde(with = "humantime_serde")]
|
||||
@@ -604,6 +707,9 @@ pub struct TimelineInfo {
|
||||
pub state: TimelineState,
|
||||
|
||||
pub walreceiver_status: String,
|
||||
|
||||
/// The last aux file policy being used on this timeline
|
||||
pub last_aux_file_policy: Option<AuxFilePolicy>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
@@ -710,6 +816,8 @@ pub enum HistoricLayerInfo {
|
||||
lsn_end: Lsn,
|
||||
remote: bool,
|
||||
access_stats: LayerAccessStats,
|
||||
|
||||
l0: bool,
|
||||
},
|
||||
Image {
|
||||
layer_file_name: String,
|
||||
@@ -762,6 +870,16 @@ pub struct DownloadRemoteLayersTaskSpawnRequest {
|
||||
pub max_concurrent_downloads: NonZeroUsize,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct IngestAuxFilesRequest {
|
||||
pub aux_files: HashMap<String, String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct ListAuxFilesRequest {
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct DownloadRemoteLayersTaskInfo {
|
||||
pub task_id: String,
|
||||
@@ -824,6 +942,55 @@ pub struct TenantScanRemoteStorageResponse {
|
||||
pub shards: Vec<TenantScanRemoteStorageShard>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum TenantSorting {
|
||||
ResidentSize,
|
||||
MaxLogicalSize,
|
||||
}
|
||||
|
||||
impl Default for TenantSorting {
|
||||
fn default() -> Self {
|
||||
Self::ResidentSize
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||
pub struct TopTenantShardsRequest {
|
||||
// How would you like to sort the tenants?
|
||||
pub order_by: TenantSorting,
|
||||
|
||||
// How many results?
|
||||
pub limit: usize,
|
||||
|
||||
// Omit tenants with more than this many shards (e.g. if this is the max number of shards
|
||||
// that the caller would ever split to)
|
||||
pub where_shards_lt: Option<ShardCount>,
|
||||
|
||||
// Omit tenants where the ordering metric is less than this (this is an optimization to
|
||||
// let us quickly exclude numerous tiny shards)
|
||||
pub where_gt: Option<u64>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
|
||||
pub struct TopTenantShardItem {
|
||||
pub id: TenantShardId,
|
||||
|
||||
/// Total size of layers on local disk for all timelines in this tenant
|
||||
pub resident_size: u64,
|
||||
|
||||
/// Total size of layers in remote storage for all timelines in this tenant
|
||||
pub physical_size: u64,
|
||||
|
||||
/// The largest logical size of a timeline within this tenant
|
||||
pub max_logical_size: u64,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Default)]
|
||||
pub struct TopTenantShardsResponse {
|
||||
pub shards: Vec<TopTenantShardItem>,
|
||||
}
|
||||
|
||||
pub mod virtual_file {
|
||||
#[derive(
|
||||
Copy,
|
||||
@@ -1249,6 +1416,7 @@ impl PagestreamBeMessage {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use serde_json::json;
|
||||
use std::str::FromStr;
|
||||
|
||||
use super::*;
|
||||
|
||||
@@ -1456,4 +1624,69 @@ mod tests {
|
||||
assert_eq!(actual, expected, "example on {line}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_aux_file_migration_path() {
|
||||
assert!(AuxFilePolicy::is_valid_migration_path(
|
||||
None,
|
||||
AuxFilePolicy::V1
|
||||
));
|
||||
assert!(AuxFilePolicy::is_valid_migration_path(
|
||||
None,
|
||||
AuxFilePolicy::V2
|
||||
));
|
||||
assert!(AuxFilePolicy::is_valid_migration_path(
|
||||
None,
|
||||
AuxFilePolicy::CrossValidation
|
||||
));
|
||||
// Self-migration is not a valid migration path, and the caller should handle it by itself.
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V1),
|
||||
AuxFilePolicy::V1
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V2),
|
||||
AuxFilePolicy::V2
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::CrossValidation),
|
||||
AuxFilePolicy::CrossValidation
|
||||
));
|
||||
// Migrations not allowed
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::CrossValidation),
|
||||
AuxFilePolicy::V1
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V1),
|
||||
AuxFilePolicy::V2
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V2),
|
||||
AuxFilePolicy::V1
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V2),
|
||||
AuxFilePolicy::CrossValidation
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V1),
|
||||
AuxFilePolicy::CrossValidation
|
||||
));
|
||||
// Migrations allowed
|
||||
assert!(AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::CrossValidation),
|
||||
AuxFilePolicy::V2
|
||||
));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_aux_parse() {
|
||||
assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2);
|
||||
assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2);
|
||||
assert_eq!(
|
||||
AuxFilePolicy::from_str("cross-validation").unwrap(),
|
||||
AuxFilePolicy::CrossValidation
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -125,7 +125,7 @@ impl ShardCount {
|
||||
|
||||
/// `v` may be zero, or the number of shards in the tenant. `v` is what
|
||||
/// [`Self::literal`] would return.
|
||||
pub fn new(val: u8) -> Self {
|
||||
pub const fn new(val: u8) -> Self {
|
||||
Self(val)
|
||||
}
|
||||
}
|
||||
@@ -559,6 +559,14 @@ impl ShardIdentity {
|
||||
}
|
||||
}
|
||||
|
||||
/// Obtains the shard number and count combined into a `ShardIndex`.
|
||||
pub fn shard_index(&self) -> ShardIndex {
|
||||
ShardIndex {
|
||||
shard_count: self.count,
|
||||
shard_number: self.number,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn shard_slug(&self) -> String {
|
||||
if self.count > ShardCount(0) {
|
||||
format!("-{:02x}{:02x}", self.number.0, self.count.0)
|
||||
|
||||
@@ -820,10 +820,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
Ok(ProcessMsgResult::Continue)
|
||||
}
|
||||
|
||||
/// Log as info/error result of handling COPY stream and send back
|
||||
/// ErrorResponse if that makes sense. Shutdown the stream if we got
|
||||
/// Terminate. TODO: transition into waiting for Sync msg if we initiate the
|
||||
/// close.
|
||||
/// - Log as info/error result of handling COPY stream and send back
|
||||
/// ErrorResponse if that makes sense.
|
||||
/// - Shutdown the stream if we got Terminate.
|
||||
/// - Then close the connection because we don't handle exiting from COPY
|
||||
/// stream normally.
|
||||
pub async fn handle_copy_stream_end(&mut self, end: CopyStreamHandlerEnd) {
|
||||
use CopyStreamHandlerEnd::*;
|
||||
|
||||
@@ -849,10 +850,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
}
|
||||
}
|
||||
|
||||
if let Terminate = &end {
|
||||
self.state = ProtoState::Closed;
|
||||
}
|
||||
|
||||
let err_to_send_and_errcode = match &end {
|
||||
ServerInitiated(_) => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)),
|
||||
Other(_) => Some((format!("{end:#}"), SQLSTATE_INTERNAL_ERROR)),
|
||||
@@ -882,6 +879,12 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
error!("failed to send ErrorResponse: {}", ee);
|
||||
}
|
||||
}
|
||||
|
||||
// Proper COPY stream finishing to continue using the connection is not
|
||||
// implemented at the server side (we don't need it so far). To prevent
|
||||
// further usages of the connection, close it.
|
||||
self.framed.shutdown().await.ok();
|
||||
self.state = ProtoState::Closed;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -178,6 +178,13 @@ impl PgConnectionConfig {
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for PgConnectionConfig {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
// The password is intentionally hidden and not part of this display string.
|
||||
write!(f, "postgresql://{}:{}", self.host, self.port)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for PgConnectionConfig {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
// We want `password: Some(REDACTED-STRING)`, not `password: Some("REDACTED-STRING")`
|
||||
|
||||
@@ -359,7 +359,7 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
|
||||
// Is there enough space on the page for another logical message and an
|
||||
// XLOG_SWITCH? If not, start over.
|
||||
let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64;
|
||||
if page_remain < base_size - XLOG_SIZE_OF_XLOG_RECORD as u64 {
|
||||
if page_remain < base_size + XLOG_SIZE_OF_XLOG_RECORD as u64 {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -373,31 +373,29 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
|
||||
"SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
|
||||
&[&(repeats as i32)],
|
||||
)?;
|
||||
break;
|
||||
}
|
||||
info!(
|
||||
"current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
|
||||
client.pg_current_wal_insert_lsn()?,
|
||||
XLOG_SIZE_OF_XLOG_RECORD
|
||||
);
|
||||
info!(
|
||||
"current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
|
||||
client.pg_current_wal_insert_lsn()?,
|
||||
XLOG_SIZE_OF_XLOG_RECORD
|
||||
);
|
||||
|
||||
// Emit the XLOG_SWITCH
|
||||
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
|
||||
let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
|
||||
let next_segment = PgLsn::from(0x0200_0000);
|
||||
ensure!(
|
||||
xlog_switch_record_end < next_segment,
|
||||
"XLOG_SWITCH record ended on or after the expected segment boundary: {} > {}",
|
||||
xlog_switch_record_end,
|
||||
next_segment
|
||||
);
|
||||
ensure!(
|
||||
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
|
||||
"XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
|
||||
xlog_switch_record_end,
|
||||
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
|
||||
);
|
||||
Ok(vec![before_xlog_switch, xlog_switch_record_end])
|
||||
// Emit the XLOG_SWITCH
|
||||
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
|
||||
let xlog_switch_record_end: PgLsn =
|
||||
client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
|
||||
|
||||
if u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
|
||||
!= XLOG_SIZE_OF_XLOG_SHORT_PHD
|
||||
{
|
||||
warn!(
|
||||
"XLOG_SWITCH message ended not on page boundary: {}, offset = {}, repeating",
|
||||
xlog_switch_record_end,
|
||||
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
|
||||
);
|
||||
continue;
|
||||
}
|
||||
return Ok(vec![before_xlog_switch, xlog_switch_record_end]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -29,7 +29,6 @@ use http_types::{StatusCode, Url};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::RemoteStorageActivity;
|
||||
use crate::{
|
||||
error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download,
|
||||
DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
|
||||
@@ -526,10 +525,6 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
// https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
|
||||
Err(TimeTravelError::Unimplemented)
|
||||
}
|
||||
|
||||
fn activity(&self) -> RemoteStorageActivity {
|
||||
self.concurrency_limiter.activity()
|
||||
}
|
||||
}
|
||||
|
||||
pin_project_lite::pin_project! {
|
||||
|
||||
@@ -263,17 +263,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
done_if_after: SystemTime,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), TimeTravelError>;
|
||||
|
||||
/// Query how busy we currently are: may be used by callers which wish to politely
|
||||
/// back off if there are already a lot of operations underway.
|
||||
fn activity(&self) -> RemoteStorageActivity;
|
||||
}
|
||||
|
||||
pub struct RemoteStorageActivity {
|
||||
pub read_available: usize,
|
||||
pub read_total: usize,
|
||||
pub write_available: usize,
|
||||
pub write_total: usize,
|
||||
}
|
||||
|
||||
/// DownloadStream is sensitive to the timeout and cancellation used with the original
|
||||
@@ -455,15 +444,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn activity(&self) -> RemoteStorageActivity {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.activity(),
|
||||
Self::AwsS3(s) => s.activity(),
|
||||
Self::AzureBlob(s) => s.activity(),
|
||||
Self::Unreliable(s) => s.activity(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
@@ -794,9 +774,6 @@ struct ConcurrencyLimiter {
|
||||
// The helps to ensure we don't exceed the thresholds.
|
||||
write: Arc<Semaphore>,
|
||||
read: Arc<Semaphore>,
|
||||
|
||||
write_total: usize,
|
||||
read_total: usize,
|
||||
}
|
||||
|
||||
impl ConcurrencyLimiter {
|
||||
@@ -825,21 +802,10 @@ impl ConcurrencyLimiter {
|
||||
Arc::clone(self.for_kind(kind)).acquire_owned().await
|
||||
}
|
||||
|
||||
fn activity(&self) -> RemoteStorageActivity {
|
||||
RemoteStorageActivity {
|
||||
read_available: self.read.available_permits(),
|
||||
read_total: self.read_total,
|
||||
write_available: self.write.available_permits(),
|
||||
write_total: self.write_total,
|
||||
}
|
||||
}
|
||||
|
||||
fn new(limit: usize) -> ConcurrencyLimiter {
|
||||
Self {
|
||||
read: Arc::new(Semaphore::new(limit)),
|
||||
write: Arc::new(Semaphore::new(limit)),
|
||||
read_total: limit,
|
||||
write_total: limit,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,8 +23,8 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken};
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
|
||||
use crate::{
|
||||
Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorageActivity,
|
||||
TimeTravelError, TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
|
||||
REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
};
|
||||
|
||||
use super::{RemoteStorage, StorageMetadata};
|
||||
@@ -605,16 +605,6 @@ impl RemoteStorage for LocalFs {
|
||||
) -> Result<(), TimeTravelError> {
|
||||
Err(TimeTravelError::Unimplemented)
|
||||
}
|
||||
|
||||
fn activity(&self) -> RemoteStorageActivity {
|
||||
// LocalFS has no concurrency limiting: give callers the impression that plenty of units are available
|
||||
RemoteStorageActivity {
|
||||
read_available: 16,
|
||||
read_total: 16,
|
||||
write_available: 16,
|
||||
write_total: 16,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
|
||||
|
||||
@@ -47,8 +47,8 @@ use utils::backoff;
|
||||
use super::StorageMetadata;
|
||||
use crate::{
|
||||
error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError,
|
||||
Listing, ListingMode, RemotePath, RemoteStorage, RemoteStorageActivity, S3Config,
|
||||
TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel,
|
||||
MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
};
|
||||
|
||||
pub(super) mod metrics;
|
||||
@@ -975,10 +975,6 @@ impl RemoteStorage for S3Bucket {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn activity(&self) -> RemoteStorageActivity {
|
||||
self.concurrency_limiter.activity()
|
||||
}
|
||||
}
|
||||
|
||||
/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
|
||||
|
||||
@@ -12,7 +12,7 @@ use tokio_util::sync::CancellationToken;
|
||||
|
||||
use crate::{
|
||||
Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
|
||||
RemoteStorageActivity, StorageMetadata, TimeTravelError,
|
||||
StorageMetadata, TimeTravelError,
|
||||
};
|
||||
|
||||
pub struct UnreliableWrapper {
|
||||
@@ -213,8 +213,4 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
.await
|
||||
}
|
||||
|
||||
fn activity(&self) -> RemoteStorageActivity {
|
||||
self.inner.activity()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -50,6 +50,9 @@ pub struct SkTimelineInfo {
|
||||
pub safekeeper_connstr: Option<String>,
|
||||
#[serde(default)]
|
||||
pub http_connstr: Option<String>,
|
||||
// Minimum of all active RO replicas flush LSN
|
||||
#[serde(default = "lsn_invalid")]
|
||||
pub standby_horizon: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
|
||||
@@ -135,7 +135,8 @@ impl Gate {
|
||||
let started_at = std::time::Instant::now();
|
||||
let mut do_close = std::pin::pin!(self.do_close());
|
||||
|
||||
let nag_after = Duration::from_secs(1);
|
||||
// with 1s we rarely saw anything, let's try if we get more gate closing reasons with 100ms
|
||||
let nag_after = Duration::from_millis(100);
|
||||
|
||||
let Err(_timeout) = tokio::time::timeout(nag_after, &mut do_close).await else {
|
||||
return;
|
||||
|
||||
@@ -496,9 +496,9 @@ mod tests {
|
||||
// TODO: When updating Postgres versions, this test will cause
|
||||
// problems. Postgres version in message needs updating.
|
||||
//
|
||||
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160002, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
|
||||
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
|
||||
vec![
|
||||
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
|
||||
147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
|
||||
188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
|
||||
|
||||
@@ -84,7 +84,7 @@ storage_broker.workspace = true
|
||||
tenant_size_model.workspace = true
|
||||
utils.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
reqwest.workspace = true
|
||||
reqwest = { workspace = true }
|
||||
rpds.workspace = true
|
||||
enum-map.workspace = true
|
||||
enumset = { workspace = true, features = ["serde"]}
|
||||
|
||||
@@ -8,7 +8,7 @@ license.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
thiserror.workspace = true
|
||||
async-trait.workspace = true
|
||||
reqwest.workspace = true
|
||||
reqwest = { workspace = true }
|
||||
utils.workspace = true
|
||||
serde.workspace = true
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
@@ -1,8 +1,12 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::{models::*, shard::TenantShardId};
|
||||
use reqwest::{IntoUrl, Method, StatusCode};
|
||||
use utils::{
|
||||
http::error::HttpErrorBody,
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
pub mod util;
|
||||
@@ -486,6 +490,18 @@ impl Client {
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn top_tenant_shards(
|
||||
&self,
|
||||
request: TopTenantShardsRequest,
|
||||
) -> Result<TopTenantShardsResponse> {
|
||||
let uri = format!("{}/v1/top_tenants", self.mgmt_api_endpoint);
|
||||
self.request(Method::POST, uri, request)
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn layer_map_info(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
@@ -549,4 +565,57 @@ impl Client {
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn ingest_aux_files(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
aux_files: HashMap<String, String>,
|
||||
) -> Result<bool> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{}/timeline/{}/ingest_aux_files",
|
||||
self.mgmt_api_endpoint, tenant_shard_id, timeline_id
|
||||
);
|
||||
let resp = self
|
||||
.request_noerror(Method::POST, &uri, IngestAuxFilesRequest { aux_files })
|
||||
.await?;
|
||||
match resp.status() {
|
||||
StatusCode::OK => Ok(true),
|
||||
status => Err(match resp.json::<HttpErrorBody>().await {
|
||||
Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
|
||||
Err(_) => {
|
||||
Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
|
||||
}
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn list_aux_files(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<HashMap<String, Bytes>> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{}/timeline/{}/list_aux_files",
|
||||
self.mgmt_api_endpoint, tenant_shard_id, timeline_id
|
||||
);
|
||||
let resp = self
|
||||
.request_noerror(Method::POST, &uri, ListAuxFilesRequest { lsn })
|
||||
.await?;
|
||||
match resp.status() {
|
||||
StatusCode::OK => {
|
||||
let resp: HashMap<String, Bytes> = resp.json().await.map_err(|e| {
|
||||
Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, format!("{e}"))
|
||||
})?;
|
||||
Ok(resp)
|
||||
}
|
||||
status => Err(match resp.json::<HttpErrorBody>().await {
|
||||
Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
|
||||
Err(_) => {
|
||||
Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
|
||||
}
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use clap::{Parser, Subcommand};
|
||||
use pageserver_compaction::helpers::PAGE_SZ;
|
||||
use pageserver_compaction::simulator::MockTimeline;
|
||||
use rand::Rng;
|
||||
use std::io::Write;
|
||||
@@ -51,7 +52,7 @@ async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()>
|
||||
let mut executor = MockTimeline::new();
|
||||
|
||||
// Convert the logical size in MB into a key range.
|
||||
let key_range = 0..((cmd.logical_size * 1024 * 1024) / 8192);
|
||||
let key_range = 0..((cmd.logical_size * 1024 * 1024) / PAGE_SZ);
|
||||
//let key_range = u64::MIN..u64::MAX;
|
||||
println!(
|
||||
"starting simulation with key range {:016X}-{:016X}",
|
||||
|
||||
@@ -25,7 +25,7 @@ use std::collections::{HashSet, VecDeque};
|
||||
use std::ops::Range;
|
||||
|
||||
use crate::helpers::{
|
||||
accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with,
|
||||
accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, PAGE_SZ,
|
||||
};
|
||||
use crate::interface::*;
|
||||
use utils::lsn::Lsn;
|
||||
@@ -379,7 +379,7 @@ where
|
||||
.get_keyspace(&job.key_range, job.lsn_range.end, ctx)
|
||||
.await?,
|
||||
&self.shard_identity,
|
||||
) * 8192;
|
||||
) * PAGE_SZ;
|
||||
|
||||
let wal_size = job
|
||||
.input_layers
|
||||
@@ -441,7 +441,7 @@ where
|
||||
let mut window = KeyspaceWindow::new(
|
||||
E::Key::MIN..E::Key::MAX,
|
||||
keyspace,
|
||||
self.target_file_size / 8192,
|
||||
self.target_file_size / PAGE_SZ,
|
||||
);
|
||||
while let Some(key_range) = window.choose_next_image(&self.shard_identity) {
|
||||
new_jobs.push(CompactionJob::<E> {
|
||||
@@ -663,8 +663,8 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
// Sliding window through keyspace and values
|
||||
// This is used by over_with_images to decide on good split points
|
||||
/// Sliding window through keyspace and values for image layer
|
||||
/// This is used by [`LevelCompactionState::cover_with_images`] to decide on good split points
|
||||
struct KeyspaceWindow<K> {
|
||||
head: KeyspaceWindowHead<K>,
|
||||
|
||||
@@ -804,9 +804,9 @@ struct WindowElement<K> {
|
||||
accum_size: u64,
|
||||
}
|
||||
|
||||
// Sliding window through keyspace and values
|
||||
//
|
||||
// This is used to decide what layer to write next, from the beginning of the window.
|
||||
/// Sliding window through keyspace and values for delta layer tiling
|
||||
///
|
||||
/// This is used to decide which delta layer to write next.
|
||||
struct Window<K> {
|
||||
elems: VecDeque<WindowElement<K>>,
|
||||
|
||||
@@ -830,11 +830,13 @@ where
|
||||
fn feed(&mut self, key: K, size: u64) {
|
||||
let last_size;
|
||||
if let Some(last) = self.elems.back_mut() {
|
||||
assert!(last.last_key <= key);
|
||||
if key == last.last_key {
|
||||
last.accum_size += size;
|
||||
return;
|
||||
}
|
||||
// We require the keys to be strictly increasing for the window.
|
||||
// Keys should already have been deduplicated by `accum_key_values`
|
||||
assert!(
|
||||
last.last_key < key,
|
||||
"last_key(={}) >= key(={key})",
|
||||
last.last_key
|
||||
);
|
||||
last_size = last.accum_size;
|
||||
} else {
|
||||
last_size = 0;
|
||||
@@ -922,7 +924,7 @@ where
|
||||
// If we're willing to stretch it up to 1.25 target size, could we
|
||||
// gobble up the rest of the work? This avoids creating very small
|
||||
// "tail" layers at the end of the keyspace
|
||||
if !has_more && self.remain_size() < target_size * 5 / 3 {
|
||||
if !has_more && self.remain_size() < target_size * 5 / 4 {
|
||||
self.commit_upto(self.elems.len());
|
||||
} else {
|
||||
let delta_split_at = self.find_size_split(target_size);
|
||||
|
||||
@@ -16,6 +16,8 @@ use std::pin::Pin;
|
||||
use std::task::{ready, Poll};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
pub const PAGE_SZ: u64 = 8192;
|
||||
|
||||
pub fn keyspace_total_size<K>(
|
||||
keyspace: &CompactionKeySpace<K>,
|
||||
shard_identity: &ShardIdentity,
|
||||
|
||||
@@ -14,6 +14,7 @@ use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use std::sync::Mutex;
|
||||
|
||||
use crate::helpers::PAGE_SZ;
|
||||
use crate::helpers::{merge_delta_keys, overlaps_with};
|
||||
|
||||
use crate::interface;
|
||||
@@ -379,8 +380,8 @@ impl interface::CompactionLayer<Key> for MockLayer {
|
||||
}
|
||||
fn file_size(&self) -> u64 {
|
||||
match self {
|
||||
MockLayer::Delta(this) => this.file_size(),
|
||||
MockLayer::Image(this) => this.file_size(),
|
||||
MockLayer::Delta(this) => this.file_size,
|
||||
MockLayer::Image(this) => this.file_size,
|
||||
}
|
||||
}
|
||||
fn short_id(&self) -> String {
|
||||
@@ -509,7 +510,7 @@ impl interface::CompactionJobExecutor for MockTimeline {
|
||||
let new_layer = Arc::new(MockImageLayer {
|
||||
key_range: key_range.clone(),
|
||||
lsn_range: lsn..lsn,
|
||||
file_size: accum_size * 8192,
|
||||
file_size: accum_size * PAGE_SZ,
|
||||
deleted: Mutex::new(false),
|
||||
});
|
||||
info!(
|
||||
|
||||
@@ -2,7 +2,7 @@ use std::collections::HashMap;
|
||||
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
|
||||
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use pageserver::tenant::storage_layer::LayerName;
|
||||
use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
|
||||
use utils::lsn::Lsn;
|
||||
@@ -19,7 +19,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
|
||||
let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
|
||||
#[derive(serde::Serialize)]
|
||||
struct Output<'a> {
|
||||
layer_metadata: &'a HashMap<LayerName, IndexLayerMetadata>,
|
||||
layer_metadata: &'a HashMap<LayerName, LayerFileMetadata>,
|
||||
disk_consistent_lsn: Lsn,
|
||||
timeline_metadata: &'a TimelineMetadata,
|
||||
}
|
||||
|
||||
@@ -219,6 +219,7 @@ fn handle_metadata(
|
||||
let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
|
||||
println!("Current metadata:\n{meta:?}");
|
||||
let mut update_meta = false;
|
||||
// TODO: simplify this part
|
||||
if let Some(disk_consistent_lsn) = disk_consistent_lsn {
|
||||
meta = TimelineMetadata::new(
|
||||
*disk_consistent_lsn,
|
||||
|
||||
98
pageserver/pagebench/src/cmd/aux_files.rs
Normal file
98
pageserver/pagebench/src/cmd/aux_files.rs
Normal file
@@ -0,0 +1,98 @@
|
||||
use pageserver_api::models::{AuxFilePolicy, TenantConfig, TenantConfigRequest};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
/// Ingest aux files into the pageserver.
|
||||
#[derive(clap::Parser)]
|
||||
pub(crate) struct Args {
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
mgmt_api_endpoint: String,
|
||||
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
|
||||
page_service_connstring: String,
|
||||
#[clap(long)]
|
||||
pageserver_jwt: Option<String>,
|
||||
|
||||
targets: Option<Vec<TenantTimelineId>>,
|
||||
}
|
||||
|
||||
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
|
||||
let rt = tokio::runtime::Builder::new_multi_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let main_task = rt.spawn(main_impl(args));
|
||||
rt.block_on(main_task).unwrap()
|
||||
}
|
||||
|
||||
async fn main_impl(args: Args) -> anyhow::Result<()> {
|
||||
let args: &'static Args = Box::leak(Box::new(args));
|
||||
|
||||
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
|
||||
args.mgmt_api_endpoint.clone(),
|
||||
args.pageserver_jwt.as_deref(),
|
||||
));
|
||||
|
||||
// discover targets
|
||||
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
||||
&mgmt_api_client,
|
||||
crate::util::cli::targets::Spec {
|
||||
limit_to_first_n_targets: None,
|
||||
targets: {
|
||||
if let Some(targets) = &args.targets {
|
||||
if targets.len() != 1 {
|
||||
anyhow::bail!("must specify exactly one target");
|
||||
}
|
||||
Some(targets.clone())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
},
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
let timeline = timelines[0];
|
||||
let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
|
||||
let timeline_id = timeline.timeline_id;
|
||||
|
||||
println!("operating on timeline {}", timeline);
|
||||
|
||||
mgmt_api_client
|
||||
.tenant_config(&TenantConfigRequest {
|
||||
tenant_id: timeline.tenant_id,
|
||||
config: TenantConfig {
|
||||
switch_aux_file_policy: Some(AuxFilePolicy::V2),
|
||||
..Default::default()
|
||||
},
|
||||
})
|
||||
.await?;
|
||||
|
||||
for batch in 0..100 {
|
||||
let items = (0..100)
|
||||
.map(|id| {
|
||||
(
|
||||
format!("pg_logical/mappings/{:03}.{:03}", batch, id),
|
||||
format!("{:08}", id),
|
||||
)
|
||||
})
|
||||
.collect::<HashMap<_, _>>();
|
||||
let file_cnt = items.len();
|
||||
mgmt_api_client
|
||||
.ingest_aux_files(tenant_shard_id, timeline_id, items)
|
||||
.await?;
|
||||
println!("ingested {file_cnt} files");
|
||||
}
|
||||
|
||||
let files = mgmt_api_client
|
||||
.list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
|
||||
.await?;
|
||||
|
||||
println!("{} files found", files.len());
|
||||
|
||||
anyhow::Ok(())
|
||||
}
|
||||
@@ -14,6 +14,7 @@ mod util {
|
||||
|
||||
/// The pagebench CLI sub-commands, dispatched in [`main`] below.
|
||||
mod cmd {
|
||||
pub(super) mod aux_files;
|
||||
pub(super) mod basebackup;
|
||||
pub(super) mod getpage_latest_lsn;
|
||||
pub(super) mod ondemand_download_churn;
|
||||
@@ -27,6 +28,7 @@ enum Args {
|
||||
GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
|
||||
TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
|
||||
OndemandDownloadChurn(cmd::ondemand_download_churn::Args),
|
||||
AuxFiles(cmd::aux_files::Args),
|
||||
}
|
||||
|
||||
fn main() {
|
||||
@@ -46,6 +48,7 @@ fn main() {
|
||||
cmd::trigger_initial_size_calculation::main(args)
|
||||
}
|
||||
Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
|
||||
Args::AuxFiles(args) => cmd::aux_files::main(args),
|
||||
}
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
@@ -534,18 +534,12 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
});
|
||||
}
|
||||
EvictionLayer::Secondary(layer) => {
|
||||
let file_size = layer.metadata.file_size();
|
||||
let tenant_manager = tenant_manager.clone();
|
||||
let file_size = layer.metadata.file_size;
|
||||
|
||||
js.spawn(async move {
|
||||
layer
|
||||
.secondary_tenant
|
||||
.evict_layer(
|
||||
tenant_manager.get_conf(),
|
||||
layer.timeline_id,
|
||||
layer.name,
|
||||
layer.metadata,
|
||||
)
|
||||
.evict_layer(layer.timeline_id, layer.name)
|
||||
.await;
|
||||
Ok(file_size)
|
||||
});
|
||||
@@ -647,7 +641,7 @@ impl EvictionLayer {
|
||||
pub(crate) fn get_file_size(&self) -> u64 {
|
||||
match self {
|
||||
Self::Attached(l) => l.layer_desc().file_size,
|
||||
Self::Secondary(sl) => sl.metadata.file_size(),
|
||||
Self::Secondary(sl) => sl.metadata.file_size,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -257,6 +257,37 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/LsnByTimestampResponse"
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease:
|
||||
parameters:
|
||||
- name: tenant_shard_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
post:
|
||||
description: Obtain lease for the given LSN
|
||||
parameters:
|
||||
- name: lsn
|
||||
in: query
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
description: A LSN to obtain the lease for
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/LsnLease"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -581,6 +612,80 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor:
|
||||
parameters:
|
||||
- name: tenant_shard_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: timeline_id
|
||||
in: path
|
||||
ŕequired: true
|
||||
schema:
|
||||
type: string
|
||||
|
||||
put:
|
||||
description: |
|
||||
Detach a timeline from its ancestor and reparent all ancestors timelines with lower `ancestor_lsn`.
|
||||
Current implementation might not be retryable across failure cases, but will be enhanced in future.
|
||||
Detaching should be expected to be expensive operation. Timeouts should be retried.
|
||||
responses:
|
||||
"200":
|
||||
description: |
|
||||
The timeline has been detached from it's ancestor (now or earlier), and at least the returned timelines have been reparented.
|
||||
If any timelines were deleted after reparenting, they might not be on this list.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/AncestorDetached"
|
||||
|
||||
"400":
|
||||
description: |
|
||||
Number of early checks meaning the timeline cannot be detached now:
|
||||
- the ancestor of timeline has an ancestor: not supported, see RFC
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
"404":
|
||||
description: Tenant or timeline not found.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
|
||||
"409":
|
||||
description: |
|
||||
The timeline can never be detached:
|
||||
- timeline has no ancestor, implying that the timeline has never had an ancestor
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ConflictError"
|
||||
|
||||
"500":
|
||||
description: |
|
||||
Transient error, for example, pageserver shutdown happened while
|
||||
processing the request but we were unable to distinguish that. Must
|
||||
be retried.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
"503":
|
||||
description: |
|
||||
Temporarily unavailable, please retry. Possible reasons:
|
||||
- another timeline detach for the same tenant is underway, please retry later
|
||||
- detected shutdown error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/:
|
||||
get:
|
||||
description: Get tenants list
|
||||
@@ -980,6 +1085,15 @@ components:
|
||||
type: string
|
||||
enum: [past, present, future, nodata]
|
||||
|
||||
LsnLease:
|
||||
type: object
|
||||
required:
|
||||
- valid_until
|
||||
properties:
|
||||
valid_until:
|
||||
type: string
|
||||
format: date-time
|
||||
|
||||
PageserverUtilization:
|
||||
type: object
|
||||
required:
|
||||
@@ -1037,6 +1151,19 @@ components:
|
||||
format: int64
|
||||
description: How many bytes of layer content were in the latest layer heatmap
|
||||
|
||||
AncestorDetached:
|
||||
type: object
|
||||
required:
|
||||
- reparented_timelines
|
||||
properties:
|
||||
reparented_timelines:
|
||||
type: array
|
||||
description: Set of reparented timeline ids
|
||||
properties:
|
||||
type: string
|
||||
format: hex
|
||||
description: TimelineId
|
||||
|
||||
|
||||
Error:
|
||||
type: object
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
//!
|
||||
//! Management HTTP API
|
||||
//!
|
||||
use std::cmp::Reverse;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::collections::HashMap;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
@@ -14,6 +16,9 @@ use hyper::header;
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::models::IngestAuxFilesRequest;
|
||||
use pageserver_api::models::ListAuxFilesRequest;
|
||||
use pageserver_api::models::LocationConfig;
|
||||
use pageserver_api::models::LocationConfigListResponse;
|
||||
use pageserver_api::models::ShardParameters;
|
||||
@@ -24,7 +29,11 @@ use pageserver_api::models::TenantScanRemoteStorageShard;
|
||||
use pageserver_api::models::TenantShardLocation;
|
||||
use pageserver_api::models::TenantShardSplitRequest;
|
||||
use pageserver_api::models::TenantShardSplitResponse;
|
||||
use pageserver_api::models::TenantSorting;
|
||||
use pageserver_api::models::TenantState;
|
||||
use pageserver_api::models::TopTenantShardItem;
|
||||
use pageserver_api::models::TopTenantShardsRequest;
|
||||
use pageserver_api::models::TopTenantShardsResponse;
|
||||
use pageserver_api::models::{
|
||||
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
|
||||
TenantLoadRequest, TenantLocationConfigRequest,
|
||||
@@ -66,6 +75,7 @@ use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||
use crate::tenant::storage_layer::LayerName;
|
||||
use crate::tenant::timeline::CompactFlags;
|
||||
use crate::tenant::timeline::Timeline;
|
||||
use crate::tenant::GetTimelineError;
|
||||
use crate::tenant::SpawnMode;
|
||||
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
|
||||
use crate::{config::PageServerConf, tenant::mgr};
|
||||
@@ -271,6 +281,13 @@ impl From<GetTenantError> for ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<GetTimelineError> for ApiError {
|
||||
fn from(gte: GetTimelineError) -> Self {
|
||||
// Rationale: tenant is activated only after eligble timelines activate
|
||||
ApiError::NotFound(gte.into())
|
||||
}
|
||||
}
|
||||
|
||||
impl From<GetActiveTenantError> for ApiError {
|
||||
fn from(e: GetActiveTenantError) -> ApiError {
|
||||
match e {
|
||||
@@ -378,7 +395,7 @@ async fn build_timeline_info_common(
|
||||
let guard = timeline.last_received_wal.lock().unwrap();
|
||||
if let Some(info) = guard.as_ref() {
|
||||
(
|
||||
Some(format!("{:?}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only.
|
||||
Some(format!("{}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only.
|
||||
Some(info.last_received_msg_lsn),
|
||||
Some(info.last_received_msg_ts),
|
||||
)
|
||||
@@ -433,6 +450,8 @@ async fn build_timeline_info_common(
|
||||
state,
|
||||
|
||||
walreceiver_status,
|
||||
|
||||
last_aux_file_policy: timeline.last_aux_file_policy.load(),
|
||||
};
|
||||
Ok(info)
|
||||
}
|
||||
@@ -633,9 +652,7 @@ async fn timeline_preserve_initdb_handler(
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, false)
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
let timeline = tenant.get_timeline(timeline_id, false)?;
|
||||
|
||||
timeline
|
||||
.preserve_initdb_archive()
|
||||
@@ -677,9 +694,7 @@ async fn timeline_detail_handler(
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, false)
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
let timeline = tenant.get_timeline(timeline_id, false)?;
|
||||
|
||||
let timeline_info = build_timeline_info(
|
||||
&timeline,
|
||||
@@ -1691,6 +1706,32 @@ async fn handle_tenant_break(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
// Obtains an lsn lease on the given timeline.
|
||||
async fn lsn_lease_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let lsn: Lsn = parse_query_param(&request, "lsn")?
|
||||
.ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
let result = timeline
|
||||
.make_lsn_lease(lsn, &ctx)
|
||||
.map_err(|e| ApiError::InternalServerError(e.context("lsn lease http handler")))?;
|
||||
|
||||
json_response(StatusCode::OK, result)
|
||||
}
|
||||
|
||||
// Run GC immediately on given timeline.
|
||||
async fn timeline_gc_handler(
|
||||
mut request: Request<Body>,
|
||||
@@ -1726,6 +1767,8 @@ async fn timeline_compact_handler(
|
||||
if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
|
||||
flags |= CompactFlags::ForceImageLayerCreation;
|
||||
}
|
||||
let wait_until_uploaded =
|
||||
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
|
||||
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
@@ -1734,6 +1777,9 @@ async fn timeline_compact_handler(
|
||||
.compact(&cancel, flags, &ctx)
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
if wait_until_uploaded {
|
||||
timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
|
||||
}
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
.instrument(info_span!("manual_compaction", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
|
||||
@@ -1758,6 +1804,8 @@ async fn timeline_checkpoint_handler(
|
||||
if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
|
||||
flags |= CompactFlags::ForceImageLayerCreation;
|
||||
}
|
||||
let wait_until_uploaded =
|
||||
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
|
||||
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
@@ -1771,6 +1819,10 @@ async fn timeline_checkpoint_handler(
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
|
||||
if wait_until_uploaded {
|
||||
timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
|
||||
}
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
.instrument(info_span!("manual_checkpoint", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
|
||||
@@ -1854,14 +1906,11 @@ async fn timeline_detach_ancestor_handler(
|
||||
let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download);
|
||||
let ctx = &ctx;
|
||||
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
let timeline = tenant.get_timeline(timeline_id, true)?;
|
||||
|
||||
let (_guard, prepared) = timeline
|
||||
.prepare_to_detach_from_ancestor(&tenant, options, ctx)
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
.await?;
|
||||
|
||||
let res = state
|
||||
.tenant_manager
|
||||
@@ -1995,9 +2044,7 @@ async fn active_timeline_of_active_tenant(
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| ApiError::NotFound(e.into()))
|
||||
Ok(tenant.get_timeline(timeline_id, true)?)
|
||||
}
|
||||
|
||||
async fn always_panic_handler(
|
||||
@@ -2261,6 +2308,31 @@ async fn post_tracing_event_handler(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn force_aux_policy_switch_handler(
|
||||
mut r: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&r, None)?;
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&r, "timeline_id")?;
|
||||
let policy: AuxFilePolicy = json_request(&mut r).await?;
|
||||
|
||||
let state = get_state(&r);
|
||||
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
timeline
|
||||
.do_switch_aux_policy(policy)
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn put_io_engine_handler(
|
||||
mut r: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -2323,6 +2395,150 @@ async fn get_utilization(
|
||||
.map_err(ApiError::InternalServerError)
|
||||
}
|
||||
|
||||
async fn list_aux_files(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let body: ListAuxFilesRequest = json_request(&mut request).await?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let files = timeline.list_aux_files(body.lsn, &ctx).await?;
|
||||
json_response(StatusCode::OK, files)
|
||||
}
|
||||
|
||||
async fn ingest_aux_files(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let body: IngestAuxFilesRequest = json_request(&mut request).await?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
|
||||
let mut modification = timeline.begin_modification(
|
||||
Lsn(timeline.get_last_record_lsn().0 + 8), /* advance LSN by 8 */
|
||||
);
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
for (fname, content) in body.aux_files {
|
||||
modification
|
||||
.put_file(&fname, content.as_bytes(), &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
}
|
||||
modification
|
||||
.commit(&ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
/// Report on the largest tenants on this pageserver, for the storage controller to identify
|
||||
/// candidates for splitting
|
||||
async fn post_top_tenants(
|
||||
mut r: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&r, None)?;
|
||||
let request: TopTenantShardsRequest = json_request(&mut r).await?;
|
||||
let state = get_state(&r);
|
||||
|
||||
fn get_size_metric(sizes: &TopTenantShardItem, order_by: &TenantSorting) -> u64 {
|
||||
match order_by {
|
||||
TenantSorting::ResidentSize => sizes.resident_size,
|
||||
TenantSorting::MaxLogicalSize => sizes.max_logical_size,
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Eq, PartialEq)]
|
||||
struct HeapItem {
|
||||
metric: u64,
|
||||
sizes: TopTenantShardItem,
|
||||
}
|
||||
|
||||
impl PartialOrd for HeapItem {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
/// Heap items have reverse ordering on their metric: this enables using BinaryHeap, which
|
||||
/// supports popping the greatest item but not the smallest.
|
||||
impl Ord for HeapItem {
|
||||
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||
Reverse(self.metric).cmp(&Reverse(other.metric))
|
||||
}
|
||||
}
|
||||
|
||||
let mut top_n: BinaryHeap<HeapItem> = BinaryHeap::with_capacity(request.limit);
|
||||
|
||||
// FIXME: this is a lot of clones to take this tenant list
|
||||
for (tenant_shard_id, tenant_slot) in state.tenant_manager.list() {
|
||||
if let Some(shards_lt) = request.where_shards_lt {
|
||||
// Ignore tenants which already have >= this many shards
|
||||
if tenant_shard_id.shard_count >= shards_lt {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
let sizes = match tenant_slot {
|
||||
TenantSlot::Attached(tenant) => tenant.get_sizes(),
|
||||
TenantSlot::Secondary(_) | TenantSlot::InProgress(_) => {
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let metric = get_size_metric(&sizes, &request.order_by);
|
||||
|
||||
if let Some(gt) = request.where_gt {
|
||||
// Ignore tenants whose metric is <= the lower size threshold, to do less sorting work
|
||||
if metric <= gt {
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
match top_n.peek() {
|
||||
None => {
|
||||
// Top N list is empty: candidate becomes first member
|
||||
top_n.push(HeapItem { metric, sizes });
|
||||
}
|
||||
Some(i) if i.metric > metric && top_n.len() < request.limit => {
|
||||
// Lowest item in list is greater than our candidate, but we aren't at limit yet: push to end
|
||||
top_n.push(HeapItem { metric, sizes });
|
||||
}
|
||||
Some(i) if i.metric > metric => {
|
||||
// List is at limit and lowest value is greater than our candidate, drop it.
|
||||
}
|
||||
Some(_) => top_n.push(HeapItem { metric, sizes }),
|
||||
}
|
||||
|
||||
while top_n.len() > request.limit {
|
||||
top_n.pop();
|
||||
}
|
||||
}
|
||||
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
TopTenantShardsResponse {
|
||||
shards: top_n.into_iter().map(|i| i.sizes).collect(),
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
/// Common functionality of all the HTTP API handlers.
|
||||
///
|
||||
/// - Adds a tracing span to each request (by `request_span`)
|
||||
@@ -2535,6 +2751,10 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn",
|
||||
|r| api_handler(r, get_timestamp_of_lsn_handler),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease",
|
||||
|r| api_handler(r, lsn_lease_handler),
|
||||
)
|
||||
.put(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc",
|
||||
|r| api_handler(r, timeline_gc_handler),
|
||||
@@ -2608,6 +2828,19 @@ pub fn make_router(
|
||||
|r| api_handler(r, timeline_collect_keyspace),
|
||||
)
|
||||
.put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
|
||||
.put(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
|
||||
|r| api_handler(r, force_aux_policy_switch_handler),
|
||||
)
|
||||
.get("/v1/utilization", |r| api_handler(r, get_utilization))
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files",
|
||||
|r| testing_api_handler("ingest_aux_files", r, ingest_aux_files),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/list_aux_files",
|
||||
|r| testing_api_handler("list_aux_files", r, list_aux_files),
|
||||
)
|
||||
.post("/v1/top_tenants", |r| api_handler(r, post_top_tenants))
|
||||
.any(handler_404))
|
||||
}
|
||||
|
||||
@@ -525,6 +525,15 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_standby_horizon",
|
||||
"Standby apply LSN for which GC is hold off, by timeline.",
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_resident_physical_size",
|
||||
@@ -1858,7 +1867,6 @@ pub(crate) struct WalIngestMetrics {
|
||||
pub(crate) records_received: IntCounter,
|
||||
pub(crate) records_committed: IntCounter,
|
||||
pub(crate) records_filtered: IntCounter,
|
||||
pub(crate) time_spent_on_ingest: Histogram,
|
||||
}
|
||||
|
||||
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
|
||||
@@ -1882,12 +1890,6 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
|
||||
"Number of WAL records filtered out due to sharding"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
time_spent_on_ingest: register_histogram!(
|
||||
"pageserver_wal_ingest_put_value_seconds",
|
||||
"Actual time spent on ingesting a record",
|
||||
redo_histogram_time_buckets!(),
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
});
|
||||
|
||||
pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
@@ -2098,7 +2100,8 @@ pub(crate) struct TimelineMetrics {
|
||||
pub garbage_collect_histo: StorageTimeMetrics,
|
||||
pub find_gc_cutoffs_histo: StorageTimeMetrics,
|
||||
pub last_record_gauge: IntGauge,
|
||||
resident_physical_size_gauge: UIntGauge,
|
||||
pub standby_horizon_gauge: IntGauge,
|
||||
pub resident_physical_size_gauge: UIntGauge,
|
||||
/// copy of LayeredTimeline.current_logical_size
|
||||
pub current_logical_size_gauge: UIntGauge,
|
||||
pub aux_file_size_gauge: IntGauge,
|
||||
@@ -2167,6 +2170,9 @@ impl TimelineMetrics {
|
||||
let last_record_gauge = LAST_RECORD_LSN
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
let standby_horizon_gauge = STANDBY_HORIZON
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
@@ -2212,6 +2218,7 @@ impl TimelineMetrics {
|
||||
find_gc_cutoffs_histo,
|
||||
load_layer_map_histo,
|
||||
last_record_gauge,
|
||||
standby_horizon_gauge,
|
||||
resident_physical_size_gauge,
|
||||
current_logical_size_gauge,
|
||||
aux_file_size_gauge,
|
||||
@@ -2246,6 +2253,7 @@ impl TimelineMetrics {
|
||||
let timeline_id = &self.timeline_id;
|
||||
let shard_id = &self.shard_id;
|
||||
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
{
|
||||
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
|
||||
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
@@ -2312,6 +2320,7 @@ use pin_project_lite::pin_project;
|
||||
use std::collections::HashMap;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::pin::Pin;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::task::{Context, Poll};
|
||||
use std::time::{Duration, Instant};
|
||||
@@ -2321,35 +2330,35 @@ use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::mgr::TenantSlot;
|
||||
|
||||
/// Maintain a per timeline gauge in addition to the global gauge.
|
||||
struct PerTimelineRemotePhysicalSizeGauge {
|
||||
last_set: u64,
|
||||
pub(crate) struct PerTimelineRemotePhysicalSizeGauge {
|
||||
last_set: AtomicU64,
|
||||
gauge: UIntGauge,
|
||||
}
|
||||
|
||||
impl PerTimelineRemotePhysicalSizeGauge {
|
||||
fn new(per_timeline_gauge: UIntGauge) -> Self {
|
||||
Self {
|
||||
last_set: per_timeline_gauge.get(),
|
||||
last_set: AtomicU64::new(0),
|
||||
gauge: per_timeline_gauge,
|
||||
}
|
||||
}
|
||||
fn set(&mut self, sz: u64) {
|
||||
pub(crate) fn set(&self, sz: u64) {
|
||||
self.gauge.set(sz);
|
||||
if sz < self.last_set {
|
||||
REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set - sz);
|
||||
let prev = self.last_set.swap(sz, std::sync::atomic::Ordering::Relaxed);
|
||||
if sz < prev {
|
||||
REMOTE_PHYSICAL_SIZE_GLOBAL.sub(prev - sz);
|
||||
} else {
|
||||
REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - self.last_set);
|
||||
REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - prev);
|
||||
};
|
||||
self.last_set = sz;
|
||||
}
|
||||
fn get(&self) -> u64 {
|
||||
pub(crate) fn get(&self) -> u64 {
|
||||
self.gauge.get()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PerTimelineRemotePhysicalSizeGauge {
|
||||
fn drop(&mut self) {
|
||||
REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set);
|
||||
REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set.load(std::sync::atomic::Ordering::Relaxed));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2357,7 +2366,7 @@ pub(crate) struct RemoteTimelineClientMetrics {
|
||||
tenant_id: String,
|
||||
shard_id: String,
|
||||
timeline_id: String,
|
||||
remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
|
||||
pub(crate) remote_physical_size_gauge: PerTimelineRemotePhysicalSizeGauge,
|
||||
calls: Mutex<HashMap<(&'static str, &'static str), IntCounterPair>>,
|
||||
bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
|
||||
bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
|
||||
@@ -2365,38 +2374,27 @@ pub(crate) struct RemoteTimelineClientMetrics {
|
||||
|
||||
impl RemoteTimelineClientMetrics {
|
||||
pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
|
||||
let tenant_id_str = tenant_shard_id.tenant_id.to_string();
|
||||
let shard_id_str = format!("{}", tenant_shard_id.shard_slug());
|
||||
let timeline_id_str = timeline_id.to_string();
|
||||
|
||||
let remote_physical_size_gauge = PerTimelineRemotePhysicalSizeGauge::new(
|
||||
REMOTE_PHYSICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str])
|
||||
.unwrap(),
|
||||
);
|
||||
|
||||
RemoteTimelineClientMetrics {
|
||||
tenant_id: tenant_shard_id.tenant_id.to_string(),
|
||||
shard_id: format!("{}", tenant_shard_id.shard_slug()),
|
||||
timeline_id: timeline_id.to_string(),
|
||||
tenant_id: tenant_id_str,
|
||||
shard_id: shard_id_str,
|
||||
timeline_id: timeline_id_str,
|
||||
calls: Mutex::new(HashMap::default()),
|
||||
bytes_started_counter: Mutex::new(HashMap::default()),
|
||||
bytes_finished_counter: Mutex::new(HashMap::default()),
|
||||
remote_physical_size_gauge: Mutex::new(None),
|
||||
remote_physical_size_gauge,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn remote_physical_size_set(&self, sz: u64) {
|
||||
let mut guard = self.remote_physical_size_gauge.lock().unwrap();
|
||||
let gauge = guard.get_or_insert_with(|| {
|
||||
PerTimelineRemotePhysicalSizeGauge::new(
|
||||
REMOTE_PHYSICAL_SIZE
|
||||
.get_metric_with_label_values(&[
|
||||
&self.tenant_id,
|
||||
&self.shard_id,
|
||||
&self.timeline_id,
|
||||
])
|
||||
.unwrap(),
|
||||
)
|
||||
});
|
||||
gauge.set(sz);
|
||||
}
|
||||
|
||||
pub(crate) fn remote_physical_size_get(&self) -> u64 {
|
||||
let guard = self.remote_physical_size_gauge.lock().unwrap();
|
||||
guard.as_ref().map(|gauge| gauge.get()).unwrap_or(0)
|
||||
}
|
||||
|
||||
pub fn remote_operation_time(
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
|
||||
@@ -19,6 +19,7 @@ use pageserver_api::models::{
|
||||
};
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
use pageserver_api::shard::ShardNumber;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError};
|
||||
use pq_proto::framed::ConnectionError;
|
||||
use pq_proto::FeStartupPacket;
|
||||
@@ -33,6 +34,7 @@ use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
use std::time::SystemTime;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_util::io::StreamReader;
|
||||
@@ -258,6 +260,8 @@ async fn page_service_conn_main(
|
||||
socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms)));
|
||||
let socket = std::pin::pin!(socket);
|
||||
|
||||
fail::fail_point!("ps::connection-start::pre-login");
|
||||
|
||||
// XXX: pgbackend.run() should take the connection_ctx,
|
||||
// and create a child per-query context when it invokes process_query.
|
||||
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
|
||||
@@ -601,6 +605,7 @@ impl PageServerHandler {
|
||||
};
|
||||
|
||||
trace!("query: {copy_data_bytes:?}");
|
||||
fail::fail_point!("ps::handle-pagerequest-message");
|
||||
|
||||
// Trace request if needed
|
||||
if let Some(t) = tracer.as_mut() {
|
||||
@@ -615,6 +620,7 @@ impl PageServerHandler {
|
||||
|
||||
let (response, span) = match neon_fe_msg {
|
||||
PagestreamFeMessage::Exists(req) => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::exists");
|
||||
let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn);
|
||||
(
|
||||
self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
|
||||
@@ -624,6 +630,7 @@ impl PageServerHandler {
|
||||
)
|
||||
}
|
||||
PagestreamFeMessage::Nblocks(req) => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::nblocks");
|
||||
let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
|
||||
(
|
||||
self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
|
||||
@@ -633,6 +640,7 @@ impl PageServerHandler {
|
||||
)
|
||||
}
|
||||
PagestreamFeMessage::GetPage(req) => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::getpage");
|
||||
// shard_id is filled in by the handler
|
||||
let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn);
|
||||
(
|
||||
@@ -643,6 +651,7 @@ impl PageServerHandler {
|
||||
)
|
||||
}
|
||||
PagestreamFeMessage::DbSize(req) => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::dbsize");
|
||||
let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
|
||||
(
|
||||
self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
|
||||
@@ -652,6 +661,7 @@ impl PageServerHandler {
|
||||
)
|
||||
}
|
||||
PagestreamFeMessage::GetSlruSegment(req) => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
|
||||
let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
|
||||
(
|
||||
self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx)
|
||||
@@ -905,6 +915,39 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(shard_id, %lsn))]
|
||||
async fn handle_make_lsn_lease<IO>(
|
||||
&self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
let shard_selector = ShardSelector::Known(tenant_shard_id.to_index());
|
||||
let timeline = self
|
||||
.get_active_tenant_timeline(tenant_shard_id.tenant_id, timeline_id, shard_selector)
|
||||
.await?;
|
||||
let lease = timeline.make_lsn_lease(lsn, ctx)?;
|
||||
let valid_until = lease
|
||||
.valid_until
|
||||
.duration_since(SystemTime::UNIX_EPOCH)
|
||||
.map_err(|e| QueryError::Other(e.into()))?;
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
|
||||
b"valid_until",
|
||||
)]))?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[Some(
|
||||
&valid_until.as_millis().to_be_bytes(),
|
||||
)]))?
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(shard_id))]
|
||||
async fn handle_get_rel_exists_request(
|
||||
&mut self,
|
||||
@@ -1470,6 +1513,7 @@ where
|
||||
_pgb: &mut PostgresBackend<IO>,
|
||||
_sm: &FeStartupPacket,
|
||||
) -> Result<(), QueryError> {
|
||||
fail::fail_point!("ps::connection-start::startup-packet");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1484,11 +1528,12 @@ where
|
||||
Err(QueryError::SimulatedConnectionError)
|
||||
});
|
||||
|
||||
fail::fail_point!("ps::connection-start::process-query");
|
||||
|
||||
let ctx = self.connection_ctx.attached_child();
|
||||
debug!("process query {query_string:?}");
|
||||
if query_string.starts_with("pagestream_v2 ") {
|
||||
let (_, params_raw) = query_string.split_at("pagestream_v2 ".len());
|
||||
let params = params_raw.split(' ').collect::<Vec<_>>();
|
||||
let parts = query_string.split_whitespace().collect::<Vec<_>>();
|
||||
if let Some(params) = parts.strip_prefix(&["pagestream_v2"]) {
|
||||
if params.len() != 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for pagestream command"
|
||||
@@ -1513,9 +1558,7 @@ where
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
} else if query_string.starts_with("pagestream ") {
|
||||
let (_, params_raw) = query_string.split_at("pagestream ".len());
|
||||
let params = params_raw.split(' ').collect::<Vec<_>>();
|
||||
} else if let Some(params) = parts.strip_prefix(&["pagestream"]) {
|
||||
if params.len() != 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for pagestream command"
|
||||
@@ -1540,10 +1583,7 @@ where
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
} else if query_string.starts_with("basebackup ") {
|
||||
let (_, params_raw) = query_string.split_at("basebackup ".len());
|
||||
let params = params_raw.split_whitespace().collect::<Vec<_>>();
|
||||
|
||||
} else if let Some(params) = parts.strip_prefix(&["basebackup"]) {
|
||||
if params.len() < 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for basebackup command"
|
||||
@@ -1561,26 +1601,23 @@ where
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
let lsn = if params.len() >= 3 {
|
||||
let lsn = if let Some(lsn_str) = params.get(2) {
|
||||
Some(
|
||||
Lsn::from_str(params[2])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
|
||||
Lsn::from_str(lsn_str)
|
||||
.with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let gzip = if params.len() >= 4 {
|
||||
if params[3] == "--gzip" {
|
||||
true
|
||||
} else {
|
||||
let gzip = match params.get(3) {
|
||||
Some(&"--gzip") => true,
|
||||
None => false,
|
||||
Some(third_param) => {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"Parameter in position 3 unknown {}",
|
||||
params[3],
|
||||
)));
|
||||
"Parameter in position 3 unknown {third_param}",
|
||||
)))
|
||||
}
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
|
||||
@@ -1604,10 +1641,7 @@ where
|
||||
res?;
|
||||
}
|
||||
// return pair of prev_lsn and last_lsn
|
||||
else if query_string.starts_with("get_last_record_rlsn ") {
|
||||
let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len());
|
||||
let params = params_raw.split_whitespace().collect::<Vec<_>>();
|
||||
|
||||
else if let Some(params) = parts.strip_prefix(&["get_last_record_rlsn"]) {
|
||||
if params.len() != 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for get_last_record_rlsn command"
|
||||
@@ -1649,10 +1683,7 @@ where
|
||||
.await?;
|
||||
}
|
||||
// same as basebackup, but result includes relational data as well
|
||||
else if query_string.starts_with("fullbackup ") {
|
||||
let (_, params_raw) = query_string.split_at("fullbackup ".len());
|
||||
let params = params_raw.split_whitespace().collect::<Vec<_>>();
|
||||
|
||||
else if let Some(params) = parts.strip_prefix(&["fullbackup"]) {
|
||||
if params.len() < 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for fullbackup command"
|
||||
@@ -1669,18 +1700,18 @@ where
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
// The caller is responsible for providing correct lsn and prev_lsn.
|
||||
let lsn = if params.len() > 2 {
|
||||
let lsn = if let Some(lsn_str) = params.get(2) {
|
||||
Some(
|
||||
Lsn::from_str(params[2])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
|
||||
Lsn::from_str(lsn_str)
|
||||
.with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let prev_lsn = if params.len() > 3 {
|
||||
let prev_lsn = if let Some(prev_lsn_str) = params.get(3) {
|
||||
Some(
|
||||
Lsn::from_str(params[3])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[3]))?,
|
||||
Lsn::from_str(prev_lsn_str)
|
||||
.with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
@@ -1713,8 +1744,7 @@ where
|
||||
// 2. Run:
|
||||
// cat my_backup/base.tar | psql -h $PAGESERVER \
|
||||
// -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION"
|
||||
let (_, params_raw) = query_string.split_at("import basebackup ".len());
|
||||
let params = params_raw.split_whitespace().collect::<Vec<_>>();
|
||||
let params = &parts[2..];
|
||||
if params.len() != 5 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for import basebackup command"
|
||||
@@ -1763,8 +1793,7 @@ where
|
||||
//
|
||||
// Files are scheduled to be persisted to remote storage, and the
|
||||
// caller should poll the http api to check when that is done.
|
||||
let (_, params_raw) = query_string.split_at("import wal ".len());
|
||||
let params = params_raw.split_whitespace().collect::<Vec<_>>();
|
||||
let params = &parts[2..];
|
||||
if params.len() != 4 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for import wal command"
|
||||
@@ -1802,10 +1831,45 @@ where
|
||||
// important because psycopg2 executes "SET datestyle TO 'ISO'"
|
||||
// on connect
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("show ") {
|
||||
} else if query_string.starts_with("lease lsn ") {
|
||||
let params = &parts[2..];
|
||||
if params.len() != 3 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number {} for lease lsn command",
|
||||
params.len()
|
||||
)));
|
||||
}
|
||||
|
||||
let tenant_shard_id = TenantShardId::from_str(params[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_shard_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
// The caller is responsible for providing correct lsn.
|
||||
let lsn = Lsn::from_str(params[2])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
|
||||
|
||||
match self
|
||||
.handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
|
||||
.await
|
||||
{
|
||||
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
|
||||
Err(e) => {
|
||||
error!("error obtaining lsn lease for {lsn}: {e:?}");
|
||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(
|
||||
&e.to_string(),
|
||||
Some(e.pg_error_code()),
|
||||
))?
|
||||
}
|
||||
};
|
||||
} else if let Some(params) = parts.strip_prefix(&["show"]) {
|
||||
// show <tenant_id>
|
||||
let (_, params_raw) = query_string.split_at("show ".len());
|
||||
let params = params_raw.split(' ').collect::<Vec<_>>();
|
||||
if params.len() != 1 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for config command"
|
||||
|
||||
@@ -9,7 +9,6 @@
|
||||
use super::tenant::{PageReconstructError, Timeline};
|
||||
use crate::context::RequestContext;
|
||||
use crate::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use crate::metrics::WAL_INGEST;
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use crate::{aux_file, repository::*};
|
||||
@@ -35,12 +34,16 @@ use std::ops::ControlFlow;
|
||||
use std::ops::Range;
|
||||
use strum::IntoEnumIterator;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, trace, warn};
|
||||
use tracing::{debug, info, trace, warn};
|
||||
use utils::bin_ser::DeserializeError;
|
||||
use utils::vec_map::{VecMap, VecMapOrdering};
|
||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
|
||||
const MAX_AUX_FILE_DELTAS: usize = 1024;
|
||||
/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
|
||||
pub const MAX_AUX_FILE_DELTAS: usize = 1024;
|
||||
|
||||
/// Max number of aux-file-related delta layers. The compaction will create a new image layer once this threshold is reached.
|
||||
pub const MAX_AUX_FILE_V2_DELTAS: usize = 64;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum LsnForTimestamp {
|
||||
@@ -718,10 +721,11 @@ impl Timeline {
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
||||
match self.get_switch_aux_file_policy() {
|
||||
AuxFilePolicy::V1 => self.list_aux_files_v1(lsn, ctx).await,
|
||||
AuxFilePolicy::V2 => self.list_aux_files_v2(lsn, ctx).await,
|
||||
AuxFilePolicy::CrossValidation => {
|
||||
let current_policy = self.last_aux_file_policy.load();
|
||||
match current_policy {
|
||||
Some(AuxFilePolicy::V1) | None => self.list_aux_files_v1(lsn, ctx).await,
|
||||
Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await,
|
||||
Some(AuxFilePolicy::CrossValidation) => {
|
||||
let v1_result = self.list_aux_files_v1(lsn, ctx).await;
|
||||
let v2_result = self.list_aux_files_v2(lsn, ctx).await;
|
||||
match (v1_result, v2_result) {
|
||||
@@ -1469,7 +1473,40 @@ impl<'a> DatadirModification<'a> {
|
||||
content: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let policy = self.tline.get_switch_aux_file_policy();
|
||||
let switch_policy = self.tline.get_switch_aux_file_policy();
|
||||
|
||||
let policy = {
|
||||
let current_policy = self.tline.last_aux_file_policy.load();
|
||||
// Allowed switch path:
|
||||
// * no aux files -> v1/v2/cross-validation
|
||||
// * cross-validation->v2
|
||||
|
||||
let current_policy = if current_policy.is_none() {
|
||||
// This path will only be hit once per tenant: we will decide the final policy in this code block.
|
||||
// The next call to `put_file` will always have `last_aux_file_policy != None`.
|
||||
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
|
||||
let aux_files_key_v1 = self.tline.list_aux_files_v1(lsn, ctx).await?;
|
||||
if aux_files_key_v1.is_empty() {
|
||||
None
|
||||
} else {
|
||||
self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
|
||||
Some(AuxFilePolicy::V1)
|
||||
}
|
||||
} else {
|
||||
current_policy
|
||||
};
|
||||
|
||||
if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) {
|
||||
self.tline.do_switch_aux_policy(switch_policy)?;
|
||||
info!(current=?current_policy, next=?switch_policy, "switching aux file policy");
|
||||
switch_policy
|
||||
} else {
|
||||
// This branch handles non-valid migration path, and the case that switch_policy == current_policy.
|
||||
// And actually, because the migration path always allow unspecified -> *, this unwrap_or will never be hit.
|
||||
current_policy.unwrap_or(AuxFilePolicy::default_tenant_config())
|
||||
}
|
||||
};
|
||||
|
||||
if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy {
|
||||
let key = aux_file::encode_aux_file_key(path);
|
||||
// retrieve the key from the engine
|
||||
@@ -1677,8 +1714,6 @@ impl<'a> DatadirModification<'a> {
|
||||
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
let mut writer = self.tline.writer().await;
|
||||
|
||||
let timer = WAL_INGEST.time_spent_on_ingest.start_timer();
|
||||
|
||||
let pending_nblocks = self.pending_nblocks;
|
||||
self.pending_nblocks = 0;
|
||||
|
||||
@@ -1718,8 +1753,6 @@ impl<'a> DatadirModification<'a> {
|
||||
writer.update_directory_entries_count(kind, count as u64);
|
||||
}
|
||||
|
||||
timer.observe_duration();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1755,6 +1788,12 @@ impl<'a> DatadirModification<'a> {
|
||||
self.tline.get(key, lsn, ctx).await
|
||||
}
|
||||
|
||||
/// Only used during unit tests, force putting a key into the modification.
|
||||
#[cfg(test)]
|
||||
pub(crate) fn put_for_test(&mut self, key: Key, val: Value) {
|
||||
self.put(key, val);
|
||||
}
|
||||
|
||||
fn put(&mut self, key: Key, val: Value) {
|
||||
let values = self.pending_updates.entry(key).or_default();
|
||||
// Replace the previous value if it exists at the same lsn
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -238,10 +238,13 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
io_buf,
|
||||
Err(Error::new(
|
||||
ErrorKind::Other,
|
||||
format!("blob too large ({} bytes)", len),
|
||||
format!("blob too large ({len} bytes)"),
|
||||
)),
|
||||
);
|
||||
}
|
||||
if len > 0x0fff_ffff {
|
||||
tracing::warn!("writing blob above future limit ({len} bytes)");
|
||||
}
|
||||
let mut len_buf = (len as u32).to_be_bytes();
|
||||
len_buf[0] |= 0x80;
|
||||
io_buf.extend_from_slice(&len_buf[..]);
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
use anyhow::bail;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::models::CompactionAlgorithm;
|
||||
use pageserver_api::models::CompactionAlgorithmSettings;
|
||||
use pageserver_api::models::EvictionPolicy;
|
||||
use pageserver_api::models::{self, ThrottleConfig};
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
|
||||
@@ -320,7 +321,7 @@ pub struct TenantConf {
|
||||
pub compaction_period: Duration,
|
||||
// Level0 delta layer threshold for compaction.
|
||||
pub compaction_threshold: usize,
|
||||
pub compaction_algorithm: CompactionAlgorithm,
|
||||
pub compaction_algorithm: CompactionAlgorithmSettings,
|
||||
// Determines how much history is retained, to allow
|
||||
// branching and read replicas at an older point in time.
|
||||
// The unit is #of bytes of WAL.
|
||||
@@ -373,6 +374,8 @@ pub struct TenantConf {
|
||||
|
||||
/// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into
|
||||
/// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
|
||||
/// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
|
||||
/// file is written.
|
||||
pub switch_aux_file_policy: AuxFilePolicy,
|
||||
}
|
||||
|
||||
@@ -404,7 +407,7 @@ pub struct TenantConfOpt {
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub compaction_algorithm: Option<CompactionAlgorithm>,
|
||||
pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
@@ -495,7 +498,9 @@ impl TenantConfOpt {
|
||||
.unwrap_or(global_conf.compaction_threshold),
|
||||
compaction_algorithm: self
|
||||
.compaction_algorithm
|
||||
.unwrap_or(global_conf.compaction_algorithm),
|
||||
.as_ref()
|
||||
.unwrap_or(&global_conf.compaction_algorithm)
|
||||
.clone(),
|
||||
gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon),
|
||||
gc_period: self.gc_period.unwrap_or(global_conf.gc_period),
|
||||
image_creation_threshold: self
|
||||
@@ -548,7 +553,9 @@ impl Default for TenantConf {
|
||||
compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
|
||||
.expect("cannot parse default compaction period"),
|
||||
compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
|
||||
compaction_algorithm: DEFAULT_COMPACTION_ALGORITHM,
|
||||
compaction_algorithm: CompactionAlgorithmSettings {
|
||||
kind: DEFAULT_COMPACTION_ALGORITHM,
|
||||
},
|
||||
gc_horizon: DEFAULT_GC_HORIZON,
|
||||
gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
|
||||
.expect("cannot parse default gc period"),
|
||||
@@ -574,7 +581,7 @@ impl Default for TenantConf {
|
||||
lazy_slru_download: false,
|
||||
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
|
||||
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
|
||||
switch_aux_file_policy: AuxFilePolicy::V1,
|
||||
switch_aux_file_policy: AuxFilePolicy::default_tenant_config(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,7 @@ use itertools::Itertools;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::models::LocationConfigMode;
|
||||
use pageserver_api::shard::{
|
||||
ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
|
||||
ShardCount, ShardIdentity, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId,
|
||||
};
|
||||
use pageserver_api::upcall_api::ReAttachResponseTenant;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
@@ -127,6 +127,8 @@ pub(crate) enum ShardSelector {
|
||||
First,
|
||||
/// Pick the shard that holds this key
|
||||
Page(Key),
|
||||
/// The shard ID is known: pick the given shard
|
||||
Known(ShardIndex),
|
||||
}
|
||||
|
||||
/// A convenience for use with the re_attach ControlPlaneClient function: rather
|
||||
@@ -2067,6 +2069,11 @@ impl TenantManager {
|
||||
return ShardResolveResult::Found(tenant.clone());
|
||||
}
|
||||
}
|
||||
ShardSelector::Known(shard)
|
||||
if tenant.shard_identity.shard_index() == shard =>
|
||||
{
|
||||
return ShardResolveResult::Found(tenant.clone());
|
||||
}
|
||||
_ => continue,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -189,6 +189,7 @@ use camino::Utf8Path;
|
||||
use chrono::{NaiveDateTime, Utc};
|
||||
|
||||
pub(crate) use download::download_initdb_tar_zst;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::shard::{ShardIndex, TenantShardId};
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -317,7 +318,7 @@ pub struct RemoteTimelineClient {
|
||||
|
||||
upload_queue: Mutex<UploadQueue>,
|
||||
|
||||
metrics: Arc<RemoteTimelineClientMetrics>,
|
||||
pub(crate) metrics: Arc<RemoteTimelineClientMetrics>,
|
||||
|
||||
storage_impl: GenericRemoteStorage,
|
||||
|
||||
@@ -461,11 +462,11 @@ impl RemoteTimelineClient {
|
||||
} else {
|
||||
0
|
||||
};
|
||||
self.metrics.remote_physical_size_set(size);
|
||||
self.metrics.remote_physical_size_gauge.set(size);
|
||||
}
|
||||
|
||||
pub fn get_remote_physical_size(&self) -> u64 {
|
||||
self.metrics.remote_physical_size_get()
|
||||
self.metrics.remote_physical_size_gauge.get()
|
||||
}
|
||||
|
||||
//
|
||||
@@ -518,6 +519,7 @@ impl RemoteTimelineClient {
|
||||
&self,
|
||||
layer_file_name: &LayerName,
|
||||
layer_metadata: &LayerFileMetadata,
|
||||
local_path: &Utf8Path,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<u64> {
|
||||
@@ -536,6 +538,7 @@ impl RemoteTimelineClient {
|
||||
self.timeline_id,
|
||||
layer_file_name,
|
||||
layer_metadata,
|
||||
local_path,
|
||||
cancel,
|
||||
ctx,
|
||||
)
|
||||
@@ -609,6 +612,17 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Launch an index-file upload operation in the background, with only aux_file_policy flag updated.
|
||||
pub(crate) fn schedule_index_upload_for_aux_file_policy_update(
|
||||
self: &Arc<Self>,
|
||||
last_aux_file_policy: Option<AuxFilePolicy>,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
upload_queue.last_aux_file_policy = last_aux_file_policy;
|
||||
self.schedule_index_upload(upload_queue);
|
||||
Ok(())
|
||||
}
|
||||
///
|
||||
/// Launch an index-file upload operation in the background, if necessary.
|
||||
///
|
||||
@@ -1178,7 +1192,7 @@ impl RemoteTimelineClient {
|
||||
&self.storage_impl,
|
||||
uploaded.local_path(),
|
||||
&remote_path,
|
||||
uploaded.metadata().file_size(),
|
||||
uploaded.metadata().file_size,
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
@@ -1559,7 +1573,7 @@ impl RemoteTimelineClient {
|
||||
&self.storage_impl,
|
||||
local_path,
|
||||
&remote_path,
|
||||
layer_metadata.file_size(),
|
||||
layer_metadata.file_size,
|
||||
&self.cancel,
|
||||
)
|
||||
.measure_remote_op(
|
||||
@@ -1754,7 +1768,7 @@ impl RemoteTimelineClient {
|
||||
UploadOp::UploadLayer(_, m) => (
|
||||
RemoteOpFileKind::Layer,
|
||||
RemoteOpKind::Upload,
|
||||
RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size()),
|
||||
RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size),
|
||||
),
|
||||
UploadOp::UploadMetadata(_, _) => (
|
||||
RemoteOpFileKind::Index,
|
||||
@@ -1849,6 +1863,7 @@ impl RemoteTimelineClient {
|
||||
dangling_files: HashMap::default(),
|
||||
shutting_down: false,
|
||||
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
|
||||
last_aux_file_policy: initialized.last_aux_file_policy,
|
||||
};
|
||||
|
||||
let upload_queue = std::mem::replace(
|
||||
|
||||
@@ -21,7 +21,6 @@ use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
|
||||
use crate::tenant::storage_layer::layer::local_layer_path;
|
||||
use crate::tenant::storage_layer::LayerName;
|
||||
use crate::tenant::Generation;
|
||||
use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
|
||||
@@ -50,19 +49,13 @@ pub async fn download_layer_file<'a>(
|
||||
timeline_id: TimelineId,
|
||||
layer_file_name: &'a LayerName,
|
||||
layer_metadata: &'a LayerFileMetadata,
|
||||
local_path: &Utf8Path,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<u64, DownloadError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id);
|
||||
let local_path = local_layer_path(
|
||||
conf,
|
||||
&tenant_shard_id,
|
||||
&timeline_id,
|
||||
layer_file_name,
|
||||
&layer_metadata.generation,
|
||||
);
|
||||
|
||||
let remote_path = remote_layer_path(
|
||||
&tenant_shard_id.tenant_id,
|
||||
@@ -82,7 +75,7 @@ pub async fn download_layer_file<'a>(
|
||||
// For more context about durable_rename check this email from postgres mailing list:
|
||||
// https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
|
||||
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
|
||||
let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
|
||||
let temp_file_path = path_with_suffix_extension(local_path, TEMP_DOWNLOAD_EXTENSION);
|
||||
|
||||
let bytes_amount = download_retry(
|
||||
|| async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await },
|
||||
@@ -91,7 +84,7 @@ pub async fn download_layer_file<'a>(
|
||||
)
|
||||
.await?;
|
||||
|
||||
let expected = layer_metadata.file_size();
|
||||
let expected = layer_metadata.file_size;
|
||||
if expected != bytes_amount {
|
||||
return Err(DownloadError::Other(anyhow!(
|
||||
"According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file {temp_file_path:?}",
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use chrono::NaiveDateTime;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::TimelineId;
|
||||
|
||||
@@ -16,46 +17,6 @@ use pageserver_api::shard::ShardIndex;
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// Metadata gathered for each of the layer files.
|
||||
///
|
||||
/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
|
||||
/// might have less or more metadata depending if upgrading or rolling back an upgrade.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
|
||||
//#[cfg_attr(test, derive(Default))]
|
||||
pub struct LayerFileMetadata {
|
||||
file_size: u64,
|
||||
|
||||
pub(crate) generation: Generation,
|
||||
|
||||
pub(crate) shard: ShardIndex,
|
||||
}
|
||||
|
||||
impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
|
||||
fn from(other: &IndexLayerMetadata) -> Self {
|
||||
LayerFileMetadata {
|
||||
file_size: other.file_size,
|
||||
generation: other.generation,
|
||||
shard: other.shard,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl LayerFileMetadata {
|
||||
pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
|
||||
LayerFileMetadata {
|
||||
file_size,
|
||||
generation,
|
||||
shard,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn file_size(&self) -> u64 {
|
||||
self.file_size
|
||||
}
|
||||
}
|
||||
|
||||
// TODO seems like another part of the remote storage file format
|
||||
// compatibility issue, see https://github.com/neondatabase/neon/issues/3072
|
||||
/// In-memory representation of an `index_part.json` file
|
||||
///
|
||||
/// Contains the data about all files in the timeline, present remotely and its metadata.
|
||||
@@ -76,7 +37,7 @@ pub struct IndexPart {
|
||||
///
|
||||
/// Older versions of `IndexPart` will not have this property or have only a part of metadata
|
||||
/// that latest version stores.
|
||||
pub layer_metadata: HashMap<LayerName, IndexLayerMetadata>,
|
||||
pub layer_metadata: HashMap<LayerName, LayerFileMetadata>,
|
||||
|
||||
// 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
|
||||
// It's duplicated for convenience when reading the serialized structure, but is
|
||||
@@ -88,6 +49,16 @@ pub struct IndexPart {
|
||||
|
||||
#[serde(default)]
|
||||
pub(crate) lineage: Lineage,
|
||||
|
||||
/// Describes the kind of aux files stored in the timeline.
|
||||
///
|
||||
/// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable.
|
||||
/// A V1 setting after V2 files have been committed is not accepted.
|
||||
///
|
||||
/// None means no aux files have been written to the storage before the point
|
||||
/// when this flag is introduced.
|
||||
#[serde(skip_serializing_if = "Option::is_none", default)]
|
||||
pub(crate) last_aux_file_policy: Option<AuxFilePolicy>,
|
||||
}
|
||||
|
||||
impl IndexPart {
|
||||
@@ -101,10 +72,11 @@ impl IndexPart {
|
||||
/// is always generated from the keys of `layer_metadata`)
|
||||
/// - 4: timeline_layers is fully removed.
|
||||
/// - 5: lineage was added
|
||||
const LATEST_VERSION: usize = 5;
|
||||
/// - 6: last_aux_file_policy is added.
|
||||
const LATEST_VERSION: usize = 6;
|
||||
|
||||
// Versions we may see when reading from a bucket.
|
||||
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5];
|
||||
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6];
|
||||
|
||||
pub const FILE_NAME: &'static str = "index_part.json";
|
||||
|
||||
@@ -113,11 +85,9 @@ impl IndexPart {
|
||||
disk_consistent_lsn: Lsn,
|
||||
metadata: TimelineMetadata,
|
||||
lineage: Lineage,
|
||||
last_aux_file_policy: Option<AuxFilePolicy>,
|
||||
) -> Self {
|
||||
let layer_metadata = layers_and_metadata
|
||||
.iter()
|
||||
.map(|(k, v)| (k.to_owned(), IndexLayerMetadata::from(v)))
|
||||
.collect();
|
||||
let layer_metadata = layers_and_metadata.clone();
|
||||
|
||||
Self {
|
||||
version: Self::LATEST_VERSION,
|
||||
@@ -126,6 +96,7 @@ impl IndexPart {
|
||||
metadata,
|
||||
deleted_at: None,
|
||||
lineage,
|
||||
last_aux_file_policy,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -155,8 +126,13 @@ impl IndexPart {
|
||||
example_metadata.disk_consistent_lsn(),
|
||||
example_metadata,
|
||||
Default::default(),
|
||||
Some(AuxFilePolicy::V1),
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn last_aux_file_policy(&self) -> Option<AuxFilePolicy> {
|
||||
self.last_aux_file_policy
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&UploadQueueInitialized> for IndexPart {
|
||||
@@ -165,13 +141,22 @@ impl From<&UploadQueueInitialized> for IndexPart {
|
||||
let metadata = uq.latest_metadata.clone();
|
||||
let lineage = uq.latest_lineage.clone();
|
||||
|
||||
Self::new(&uq.latest_files, disk_consistent_lsn, metadata, lineage)
|
||||
Self::new(
|
||||
&uq.latest_files,
|
||||
disk_consistent_lsn,
|
||||
metadata,
|
||||
lineage,
|
||||
uq.last_aux_file_policy,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Serialized form of [`LayerFileMetadata`].
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
|
||||
pub struct IndexLayerMetadata {
|
||||
/// Metadata gathered for each of the layer files.
|
||||
///
|
||||
/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
|
||||
/// might have less or more metadata depending if upgrading or rolling back an upgrade.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
|
||||
pub struct LayerFileMetadata {
|
||||
pub file_size: u64,
|
||||
|
||||
#[serde(default = "Generation::none")]
|
||||
@@ -183,12 +168,12 @@ pub struct IndexLayerMetadata {
|
||||
pub shard: ShardIndex,
|
||||
}
|
||||
|
||||
impl From<&LayerFileMetadata> for IndexLayerMetadata {
|
||||
fn from(other: &LayerFileMetadata) -> Self {
|
||||
IndexLayerMetadata {
|
||||
file_size: other.file_size,
|
||||
generation: other.generation,
|
||||
shard: other.shard,
|
||||
impl LayerFileMetadata {
|
||||
pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
|
||||
LayerFileMetadata {
|
||||
file_size,
|
||||
generation,
|
||||
shard,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -282,12 +267,12 @@ mod tests {
|
||||
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
|
||||
version: 1,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
@@ -299,6 +284,7 @@ mod tests {
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: None,
|
||||
lineage: Lineage::default(),
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||
@@ -323,12 +309,12 @@ mod tests {
|
||||
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
|
||||
version: 1,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
@@ -340,6 +326,7 @@ mod tests {
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: None,
|
||||
lineage: Lineage::default(),
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||
@@ -365,12 +352,12 @@ mod tests {
|
||||
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
|
||||
version: 2,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
@@ -383,6 +370,7 @@ mod tests {
|
||||
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
|
||||
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
|
||||
lineage: Lineage::default(),
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||
@@ -428,6 +416,7 @@ mod tests {
|
||||
.unwrap(),
|
||||
deleted_at: None,
|
||||
lineage: Lineage::default(),
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap();
|
||||
@@ -451,12 +440,12 @@ mod tests {
|
||||
let expected = IndexPart {
|
||||
version: 4,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
@@ -468,6 +457,7 @@ mod tests {
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
lineage: Lineage::default(),
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||
@@ -492,12 +482,12 @@ mod tests {
|
||||
let expected = IndexPart {
|
||||
version: 5,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), IndexLayerMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 23289856,
|
||||
generation: Generation::new(1),
|
||||
shard: ShardIndex::unsharded(),
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), IndexLayerMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 1015808,
|
||||
generation: Generation::new(1),
|
||||
shard: ShardIndex::unsharded(),
|
||||
@@ -511,6 +501,57 @@ mod tests {
|
||||
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
|
||||
original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
|
||||
},
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn v6_indexpart_is_parsed() {
|
||||
let example = r#"{
|
||||
"version":6,
|
||||
"layer_metadata":{
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
|
||||
},
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
|
||||
"deleted_at": "2023-07-31T09:00:00.123",
|
||||
"lineage":{
|
||||
"original_ancestor":["e2bfd8c633d713d279e6fcd2bcc15b6d","0/15A7618","2024-05-07T18:52:36.322426563"],
|
||||
"reparenting_history":["e1bfd8c633d713d279e6fcd2bcc15b6d"]
|
||||
},
|
||||
"last_aux_file_policy": "V2"
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
version: 6,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
|
||||
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
|
||||
lineage: Lineage {
|
||||
reparenting_history_truncated: false,
|
||||
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
|
||||
original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
|
||||
},
|
||||
last_aux_file_policy: Some(AuxFilePolicy::V2),
|
||||
};
|
||||
|
||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||
|
||||
@@ -6,11 +6,9 @@ mod scheduler;
|
||||
use std::{sync::Arc, time::SystemTime};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
context::RequestContext,
|
||||
disk_usage_eviction_task::DiskUsageEvictionInfo,
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
virtual_file::MaybeFatalIo,
|
||||
};
|
||||
|
||||
use self::{
|
||||
@@ -21,9 +19,8 @@ use self::{
|
||||
use super::{
|
||||
config::{SecondaryLocationConfig, TenantConfOpt},
|
||||
mgr::TenantManager,
|
||||
remote_timeline_client::LayerFileMetadata,
|
||||
span::debug_assert_current_span_has_tenant_id,
|
||||
storage_layer::{layer::local_layer_path, LayerName},
|
||||
storage_layer::LayerName,
|
||||
};
|
||||
|
||||
use pageserver_api::{
|
||||
@@ -178,13 +175,7 @@ impl SecondaryTenant {
|
||||
|
||||
/// Cancellation safe, but on cancellation the eviction will go through
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))]
|
||||
pub(crate) async fn evict_layer(
|
||||
self: &Arc<Self>,
|
||||
conf: &PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
name: LayerName,
|
||||
metadata: LayerFileMetadata,
|
||||
) {
|
||||
pub(crate) async fn evict_layer(self: &Arc<Self>, timeline_id: TimelineId, name: LayerName) {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let guard = match self.gate.enter() {
|
||||
@@ -197,41 +188,11 @@ impl SecondaryTenant {
|
||||
|
||||
let now = SystemTime::now();
|
||||
|
||||
let local_path = local_layer_path(
|
||||
conf,
|
||||
&self.tenant_shard_id,
|
||||
&timeline_id,
|
||||
&name,
|
||||
&metadata.generation,
|
||||
);
|
||||
|
||||
let this = self.clone();
|
||||
|
||||
// spawn it to be cancellation safe
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _guard = guard;
|
||||
// We tolerate ENOENT, because between planning eviction and executing
|
||||
// it, the secondary downloader could have seen an updated heatmap that
|
||||
// resulted in a layer being deleted.
|
||||
// Other local I/O errors are process-fatal: these should never happen.
|
||||
let deleted = std::fs::remove_file(local_path);
|
||||
|
||||
let not_found = deleted
|
||||
.as_ref()
|
||||
.is_err_and(|x| x.kind() == std::io::ErrorKind::NotFound);
|
||||
|
||||
let deleted = if not_found {
|
||||
false
|
||||
} else {
|
||||
deleted
|
||||
.map(|()| true)
|
||||
.fatal_err("Deleting layer during eviction")
|
||||
};
|
||||
|
||||
if !deleted {
|
||||
// skip updating accounting and putting perhaps later timestamp
|
||||
return;
|
||||
}
|
||||
|
||||
// Update the timeline's state. This does not have to be synchronized with
|
||||
// the download process, because:
|
||||
@@ -250,8 +211,15 @@ impl SecondaryTenant {
|
||||
// of the cache.
|
||||
let mut detail = this.detail.lock().unwrap();
|
||||
if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
|
||||
timeline_detail.on_disk_layers.remove(&name);
|
||||
timeline_detail.evicted_at.insert(name, now);
|
||||
let removed = timeline_detail.on_disk_layers.remove(&name);
|
||||
|
||||
// We might race with removal of the same layer during downloads, if it was removed
|
||||
// from the heatmap. If we see that the OnDiskState is gone, then no need to
|
||||
// do a physical deletion or store in evicted_at.
|
||||
if let Some(removed) = removed {
|
||||
removed.remove_blocking();
|
||||
timeline_detail.evicted_at.insert(name, now);
|
||||
}
|
||||
}
|
||||
})
|
||||
.await
|
||||
|
||||
@@ -45,10 +45,10 @@ use crate::tenant::{
|
||||
|
||||
use camino::Utf8PathBuf;
|
||||
use chrono::format::{DelayedFormat, StrftimeItems};
|
||||
use futures::{Future, StreamExt};
|
||||
use futures::Future;
|
||||
use pageserver_api::models::SecondaryProgress;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::{DownloadError, Etag, GenericRemoteStorage, RemoteStorageActivity};
|
||||
use remote_storage::{DownloadError, Etag, GenericRemoteStorage};
|
||||
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{info_span, instrument, warn, Instrument};
|
||||
@@ -62,20 +62,10 @@ use super::{
|
||||
CommandRequest, DownloadCommand,
|
||||
};
|
||||
|
||||
/// For each tenant, how long must have passed since the last download_tenant call before
|
||||
/// calling it again. This is approximately the time by which local data is allowed
|
||||
/// to fall behind remote data.
|
||||
///
|
||||
/// TODO: this should just be a default, and the actual period should be controlled
|
||||
/// via the heatmap itself
|
||||
/// `<ttps://github.com/neondatabase/neon/issues/6200>`
|
||||
const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000);
|
||||
|
||||
/// Range of concurrency we may use when downloading layers within a timeline. This is independent
|
||||
/// for each tenant we're downloading: the concurrency of _tenants_ is defined separately in
|
||||
/// `PageServerConf::secondary_download_concurrency`
|
||||
const MAX_LAYER_CONCURRENCY: usize = 16;
|
||||
const MIN_LAYER_CONCURRENCY: usize = 1;
|
||||
/// For each tenant, default period for how long must have passed since the last download_tenant call before
|
||||
/// calling it again. This default is replaced with the value of [`HeatMapTenant::upload_period_ms`] after first
|
||||
/// download, if the uploader populated it.
|
||||
const DEFAULT_DOWNLOAD_INTERVAL: Duration = Duration::from_millis(60000);
|
||||
|
||||
pub(super) async fn downloader_task(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
@@ -85,19 +75,18 @@ pub(super) async fn downloader_task(
|
||||
cancel: CancellationToken,
|
||||
root_ctx: RequestContext,
|
||||
) {
|
||||
// How many tenants' secondary download operations we will run concurrently
|
||||
let tenant_concurrency = tenant_manager.get_conf().secondary_download_concurrency;
|
||||
let concurrency = tenant_manager.get_conf().secondary_download_concurrency;
|
||||
|
||||
let generator = SecondaryDownloader {
|
||||
tenant_manager,
|
||||
remote_storage,
|
||||
root_ctx,
|
||||
};
|
||||
let mut scheduler = Scheduler::new(generator, tenant_concurrency);
|
||||
let mut scheduler = Scheduler::new(generator, concurrency);
|
||||
|
||||
scheduler
|
||||
.run(command_queue, background_jobs_can_start, cancel)
|
||||
.instrument(info_span!("secondary_downloads"))
|
||||
.instrument(info_span!("secondary_download_scheduler"))
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -111,6 +100,7 @@ struct SecondaryDownloader {
|
||||
pub(super) struct OnDiskState {
|
||||
metadata: LayerFileMetadata,
|
||||
access_time: SystemTime,
|
||||
local_path: Utf8PathBuf,
|
||||
}
|
||||
|
||||
impl OnDiskState {
|
||||
@@ -121,12 +111,26 @@ impl OnDiskState {
|
||||
_ame: LayerName,
|
||||
metadata: LayerFileMetadata,
|
||||
access_time: SystemTime,
|
||||
local_path: Utf8PathBuf,
|
||||
) -> Self {
|
||||
Self {
|
||||
metadata,
|
||||
access_time,
|
||||
local_path,
|
||||
}
|
||||
}
|
||||
|
||||
// This is infallible, because all errors are either acceptable (ENOENT), or totally
|
||||
// unexpected (fatal).
|
||||
pub(super) fn remove_blocking(&self) {
|
||||
// We tolerate ENOENT, because between planning eviction and executing
|
||||
// it, the secondary downloader could have seen an updated heatmap that
|
||||
// resulted in a layer being deleted.
|
||||
// Other local I/O errors are process-fatal: these should never happen.
|
||||
std::fs::remove_file(&self.local_path)
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.fatal_err("Deleting secondary layer")
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
@@ -137,14 +141,22 @@ pub(super) struct SecondaryDetailTimeline {
|
||||
pub(super) evicted_at: HashMap<LayerName, SystemTime>,
|
||||
}
|
||||
|
||||
// Aspects of a heatmap that we remember after downloading it
|
||||
#[derive(Clone, Debug)]
|
||||
struct DownloadSummary {
|
||||
etag: Etag,
|
||||
#[allow(unused)]
|
||||
mtime: SystemTime,
|
||||
upload_period: Duration,
|
||||
}
|
||||
|
||||
/// This state is written by the secondary downloader, it is opaque
|
||||
/// to TenantManager
|
||||
#[derive(Debug)]
|
||||
pub(super) struct SecondaryDetail {
|
||||
pub(super) config: SecondaryLocationConfig,
|
||||
|
||||
last_download: Option<Instant>,
|
||||
last_etag: Option<Etag>,
|
||||
last_download: Option<DownloadSummary>,
|
||||
next_download: Option<Instant>,
|
||||
pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
|
||||
}
|
||||
@@ -174,7 +186,6 @@ impl SecondaryDetail {
|
||||
Self {
|
||||
config,
|
||||
last_download: None,
|
||||
last_etag: None,
|
||||
next_download: None,
|
||||
timelines: HashMap::new(),
|
||||
}
|
||||
@@ -228,9 +239,8 @@ impl SecondaryDetail {
|
||||
|
||||
struct PendingDownload {
|
||||
secondary_state: Arc<SecondaryTenant>,
|
||||
last_download: Option<Instant>,
|
||||
last_download: Option<DownloadSummary>,
|
||||
target_time: Option<Instant>,
|
||||
period: Option<Duration>,
|
||||
}
|
||||
|
||||
impl scheduler::PendingJob for PendingDownload {
|
||||
@@ -280,10 +290,17 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
|
||||
tracing::debug!("Secondary tenant download completed");
|
||||
|
||||
// Update freshened_at even if there was an error: we don't want errored tenants to implicitly
|
||||
// take priority to run again.
|
||||
let mut detail = secondary_state.detail.lock().unwrap();
|
||||
detail.next_download = Some(Instant::now() + period_jitter(DOWNLOAD_FRESHEN_INTERVAL, 5));
|
||||
|
||||
let period = detail
|
||||
.last_download
|
||||
.as_ref()
|
||||
.map(|d| d.upload_period)
|
||||
.unwrap_or(DEFAULT_DOWNLOAD_INTERVAL);
|
||||
|
||||
// We advance next_download irrespective of errors: we don't want error cases to result in
|
||||
// expensive busy-polling.
|
||||
detail.next_download = Some(Instant::now() + period_jitter(period, 5));
|
||||
}
|
||||
|
||||
async fn schedule(&mut self) -> SchedulingResult<PendingDownload> {
|
||||
@@ -316,11 +333,11 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
if detail.next_download.is_none() {
|
||||
// Initialize randomly in the range from 0 to our interval: this uniformly spreads the start times. Subsequent
|
||||
// rounds will use a smaller jitter to avoid accidentally synchronizing later.
|
||||
detail.next_download = Some(now.checked_add(period_warmup(DOWNLOAD_FRESHEN_INTERVAL)).expect(
|
||||
detail.next_download = Some(now.checked_add(period_warmup(DEFAULT_DOWNLOAD_INTERVAL)).expect(
|
||||
"Using our constant, which is known to be small compared with clock range",
|
||||
));
|
||||
}
|
||||
(detail.last_download, detail.next_download.unwrap())
|
||||
(detail.last_download.clone(), detail.next_download.unwrap())
|
||||
};
|
||||
|
||||
if now > next_download {
|
||||
@@ -328,7 +345,6 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
secondary_state: secondary_tenant,
|
||||
last_download,
|
||||
target_time: Some(next_download),
|
||||
period: Some(DOWNLOAD_FRESHEN_INTERVAL),
|
||||
})
|
||||
} else {
|
||||
None
|
||||
@@ -354,7 +370,6 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
|
||||
Ok(PendingDownload {
|
||||
target_time: None,
|
||||
period: None,
|
||||
last_download: None,
|
||||
secondary_state: tenant,
|
||||
})
|
||||
@@ -371,7 +386,6 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
secondary_state,
|
||||
last_download,
|
||||
target_time,
|
||||
period,
|
||||
} = job;
|
||||
|
||||
let (completion, barrier) = utils::completion::channel();
|
||||
@@ -393,7 +407,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
tracing::warn!("Insufficient space while downloading. Will retry later.");
|
||||
}
|
||||
Err(UpdateError::Cancelled) => {
|
||||
tracing::debug!("Shut down while downloading");
|
||||
tracing::info!("Shut down while downloading");
|
||||
},
|
||||
Err(UpdateError::Deserialize(e)) => {
|
||||
tracing::error!("Corrupt content while downloading tenant: {e}");
|
||||
@@ -408,20 +422,15 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
|
||||
// If the job had a target execution time, we may check our final execution
|
||||
// time against that for observability purposes.
|
||||
if let (Some(target_time), Some(period)) = (target_time, period) {
|
||||
// Only track execution lag if this isn't our first download: otherwise, it is expected
|
||||
// that execution will have taken longer than our configured interval, for example
|
||||
// when starting up a pageserver and
|
||||
if last_download.is_some() {
|
||||
// Elapsed time includes any scheduling lag as well as the execution of the job
|
||||
let elapsed = Instant::now().duration_since(target_time);
|
||||
if let (Some(target_time), Some(last_download)) = (target_time, last_download) {
|
||||
// Elapsed time includes any scheduling lag as well as the execution of the job
|
||||
let elapsed = Instant::now().duration_since(target_time);
|
||||
|
||||
warn_when_period_overrun(
|
||||
elapsed,
|
||||
period,
|
||||
BackgroundLoopKind::SecondaryDownload,
|
||||
);
|
||||
}
|
||||
warn_when_period_overrun(
|
||||
elapsed,
|
||||
last_download.upload_period,
|
||||
BackgroundLoopKind::SecondaryDownload,
|
||||
);
|
||||
}
|
||||
|
||||
CompleteDownload {
|
||||
@@ -510,12 +519,12 @@ impl<'a> TenantDownloader<'a> {
|
||||
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
|
||||
|
||||
// We will use the etag from last successful download to make the download conditional on changes
|
||||
let last_etag = self
|
||||
let last_download = self
|
||||
.secondary_state
|
||||
.detail
|
||||
.lock()
|
||||
.unwrap()
|
||||
.last_etag
|
||||
.last_download
|
||||
.clone();
|
||||
|
||||
// Download the tenant's heatmap
|
||||
@@ -524,7 +533,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
etag: heatmap_etag,
|
||||
bytes: heatmap_bytes,
|
||||
} = match tokio::select!(
|
||||
bytes = self.download_heatmap(last_etag.as_ref()) => {bytes?},
|
||||
bytes = self.download_heatmap(last_download.as_ref().map(|d| &d.etag)) => {bytes?},
|
||||
_ = self.secondary_state.cancel.cancelled() => return Ok(())
|
||||
) {
|
||||
HeatMapDownload::Unmodified => {
|
||||
@@ -553,6 +562,39 @@ impl<'a> TenantDownloader<'a> {
|
||||
heatmap.timelines.len()
|
||||
);
|
||||
|
||||
// Get or initialize the local disk state for the timelines we will update
|
||||
let mut timeline_states = HashMap::new();
|
||||
for timeline in &heatmap.timelines {
|
||||
let timeline_state = self
|
||||
.secondary_state
|
||||
.detail
|
||||
.lock()
|
||||
.unwrap()
|
||||
.timelines
|
||||
.get(&timeline.timeline_id)
|
||||
.cloned();
|
||||
|
||||
let timeline_state = match timeline_state {
|
||||
Some(t) => t,
|
||||
None => {
|
||||
// We have no existing state: need to scan local disk for layers first.
|
||||
let timeline_state =
|
||||
init_timeline_state(self.conf, tenant_shard_id, timeline).await;
|
||||
|
||||
// Re-acquire detail lock now that we're done with async load from local FS
|
||||
self.secondary_state
|
||||
.detail
|
||||
.lock()
|
||||
.unwrap()
|
||||
.timelines
|
||||
.insert(timeline.timeline_id, timeline_state.clone());
|
||||
timeline_state
|
||||
}
|
||||
};
|
||||
|
||||
timeline_states.insert(timeline.timeline_id, timeline_state);
|
||||
}
|
||||
|
||||
// Clean up any local layers that aren't in the heatmap. We do this first for all timelines, on the general
|
||||
// principle that deletions should be done before writes wherever possible, and so that we can use this
|
||||
// phase to initialize our SecondaryProgress.
|
||||
@@ -563,6 +605,10 @@ impl<'a> TenantDownloader<'a> {
|
||||
|
||||
// Download the layers in the heatmap
|
||||
for timeline in heatmap.timelines {
|
||||
let timeline_state = timeline_states
|
||||
.remove(&timeline.timeline_id)
|
||||
.expect("Just populated above");
|
||||
|
||||
if self.secondary_state.cancel.is_cancelled() {
|
||||
tracing::debug!(
|
||||
"Cancelled before downloading timeline {}",
|
||||
@@ -572,7 +618,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
}
|
||||
|
||||
let timeline_id = timeline.timeline_id;
|
||||
self.download_timeline(timeline, ctx)
|
||||
self.download_timeline(timeline, timeline_state, ctx)
|
||||
.instrument(tracing::info_span!(
|
||||
"secondary_download_timeline",
|
||||
tenant_id=%tenant_shard_id.tenant_id,
|
||||
@@ -584,7 +630,30 @@ impl<'a> TenantDownloader<'a> {
|
||||
|
||||
// Only update last_etag after a full successful download: this way will not skip
|
||||
// the next download, even if the heatmap's actual etag is unchanged.
|
||||
self.secondary_state.detail.lock().unwrap().last_etag = Some(heatmap_etag);
|
||||
self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary {
|
||||
etag: heatmap_etag,
|
||||
mtime: heatmap_mtime,
|
||||
upload_period: heatmap
|
||||
.upload_period_ms
|
||||
.map(|ms| Duration::from_millis(ms as u64))
|
||||
.unwrap_or(DEFAULT_DOWNLOAD_INTERVAL),
|
||||
});
|
||||
|
||||
// Robustness: we should have updated progress properly, but in case we didn't, make sure
|
||||
// we don't leave the tenant in a state where we claim to have successfully downloaded
|
||||
// everything, but our progress is incomplete. The invariant here should be that if
|
||||
// we have set `last_download` to this heatmap's etag, then the next time we see that
|
||||
// etag we can safely do no work (i.e. we must be complete).
|
||||
let mut progress = self.secondary_state.progress.lock().unwrap();
|
||||
debug_assert!(progress.layers_downloaded == progress.layers_total);
|
||||
debug_assert!(progress.bytes_downloaded == progress.bytes_total);
|
||||
if progress.layers_downloaded != progress.layers_total
|
||||
|| progress.bytes_downloaded != progress.bytes_total
|
||||
{
|
||||
tracing::warn!("Correcting drift in progress stats ({progress:?})");
|
||||
progress.layers_downloaded = progress.layers_total;
|
||||
progress.bytes_downloaded = progress.bytes_total;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -640,7 +709,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
let mut layer_byte_count: u64 = timeline_state
|
||||
.on_disk_layers
|
||||
.values()
|
||||
.map(|l| l.metadata.file_size())
|
||||
.map(|l| l.metadata.file_size)
|
||||
.sum();
|
||||
|
||||
// Remove on-disk layers that are no longer present in heatmap
|
||||
@@ -651,7 +720,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
.get(layer_file_name)
|
||||
.unwrap()
|
||||
.metadata
|
||||
.file_size();
|
||||
.file_size;
|
||||
|
||||
let local_path = local_layer_path(
|
||||
self.conf,
|
||||
@@ -761,6 +830,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
async fn download_timeline(
|
||||
&self,
|
||||
timeline: HeatMapTimeline,
|
||||
timeline_state: SecondaryDetailTimeline,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), UpdateError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
@@ -769,38 +839,8 @@ impl<'a> TenantDownloader<'a> {
|
||||
// Accumulate updates to the state
|
||||
let mut touched = Vec::new();
|
||||
|
||||
// Clone a view of what layers already exist on disk
|
||||
let timeline_state = self
|
||||
.secondary_state
|
||||
.detail
|
||||
.lock()
|
||||
.unwrap()
|
||||
.timelines
|
||||
.get(&timeline.timeline_id)
|
||||
.cloned();
|
||||
|
||||
let timeline_state = match timeline_state {
|
||||
Some(t) => t,
|
||||
None => {
|
||||
// We have no existing state: need to scan local disk for layers first.
|
||||
let timeline_state =
|
||||
init_timeline_state(self.conf, tenant_shard_id, &timeline).await;
|
||||
|
||||
// Re-acquire detail lock now that we're done with async load from local FS
|
||||
self.secondary_state
|
||||
.detail
|
||||
.lock()
|
||||
.unwrap()
|
||||
.timelines
|
||||
.insert(timeline.timeline_id, timeline_state.clone());
|
||||
timeline_state
|
||||
}
|
||||
};
|
||||
|
||||
tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
|
||||
|
||||
let mut download_futs = Vec::new();
|
||||
|
||||
// Download heatmap layers that are not present on local disk, or update their
|
||||
// access time if they are already present.
|
||||
for layer in timeline.layers {
|
||||
@@ -816,20 +856,12 @@ impl<'a> TenantDownloader<'a> {
|
||||
if cfg!(debug_assertions) {
|
||||
// Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
|
||||
// are already present on disk are really there.
|
||||
let local_path = local_layer_path(
|
||||
self.conf,
|
||||
tenant_shard_id,
|
||||
&timeline.timeline_id,
|
||||
&layer.name,
|
||||
&layer.metadata.generation,
|
||||
);
|
||||
|
||||
match tokio::fs::metadata(&local_path).await {
|
||||
match tokio::fs::metadata(&on_disk.local_path).await {
|
||||
Ok(meta) => {
|
||||
tracing::debug!(
|
||||
"Layer {} present at {}, size {}",
|
||||
layer.name,
|
||||
local_path,
|
||||
on_disk.local_path,
|
||||
meta.len(),
|
||||
);
|
||||
}
|
||||
@@ -837,7 +869,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
tracing::warn!(
|
||||
"Layer {} not found at {} ({})",
|
||||
layer.name,
|
||||
local_path,
|
||||
on_disk.local_path,
|
||||
e
|
||||
);
|
||||
debug_assert!(false);
|
||||
@@ -845,9 +877,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
|
||||
|| on_disk.access_time != layer.access_time
|
||||
{
|
||||
if on_disk.metadata != layer.metadata || on_disk.access_time != layer.access_time {
|
||||
// We already have this layer on disk. Update its access time.
|
||||
tracing::debug!(
|
||||
"Access time updated for layer {}: {} -> {}",
|
||||
@@ -883,31 +913,14 @@ impl<'a> TenantDownloader<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
download_futs.push(self.download_layer(
|
||||
tenant_shard_id,
|
||||
&timeline.timeline_id,
|
||||
layer,
|
||||
ctx,
|
||||
));
|
||||
}
|
||||
|
||||
// Break up layer downloads into chunks, so that for each chunk we can re-check how much
|
||||
// concurrency to use based on activity level of remote storage.
|
||||
while !download_futs.is_empty() {
|
||||
let chunk =
|
||||
download_futs.split_off(download_futs.len().saturating_sub(MAX_LAYER_CONCURRENCY));
|
||||
|
||||
let concurrency = Self::layer_concurrency(self.remote_storage.activity());
|
||||
|
||||
let mut result_stream = futures::stream::iter(chunk).buffered(concurrency);
|
||||
let mut result_stream = std::pin::pin!(result_stream);
|
||||
while let Some(result) = result_stream.next().await {
|
||||
match result {
|
||||
Err(e) => return Err(e),
|
||||
Ok(None) => {
|
||||
// No error, but we didn't download the layer. Don't mark it touched
|
||||
}
|
||||
Ok(Some(layer)) => touched.push(layer),
|
||||
match self
|
||||
.download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx)
|
||||
.await?
|
||||
{
|
||||
Some(layer) => touched.push(layer),
|
||||
None => {
|
||||
// Not an error but we didn't download it: remote layer is missing. Don't add it to the list of
|
||||
// things to consider touched.
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -926,13 +939,21 @@ impl<'a> TenantDownloader<'a> {
|
||||
v.get_mut().access_time = t.access_time;
|
||||
}
|
||||
Entry::Vacant(e) => {
|
||||
let local_path = local_layer_path(
|
||||
self.conf,
|
||||
tenant_shard_id,
|
||||
&timeline.timeline_id,
|
||||
&t.name,
|
||||
&t.metadata.generation,
|
||||
);
|
||||
e.insert(OnDiskState::new(
|
||||
self.conf,
|
||||
tenant_shard_id,
|
||||
&timeline.timeline_id,
|
||||
t.name,
|
||||
LayerFileMetadata::from(&t.metadata),
|
||||
t.metadata.clone(),
|
||||
t.access_time,
|
||||
local_path,
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -955,14 +976,28 @@ impl<'a> TenantDownloader<'a> {
|
||||
&self.secondary_state.cancel
|
||||
);
|
||||
|
||||
let local_path = local_layer_path(
|
||||
self.conf,
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
&layer.name,
|
||||
&layer.metadata.generation,
|
||||
);
|
||||
|
||||
// Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
|
||||
tracing::info!(
|
||||
"Starting download of layer {}, size {}",
|
||||
layer.name,
|
||||
layer.metadata.file_size
|
||||
);
|
||||
let downloaded_bytes = match download_layer_file(
|
||||
self.conf,
|
||||
self.remote_storage,
|
||||
*tenant_shard_id,
|
||||
*timeline_id,
|
||||
&layer.name,
|
||||
&LayerFileMetadata::from(&layer.metadata),
|
||||
&layer.metadata,
|
||||
&local_path,
|
||||
&self.secondary_state.cancel,
|
||||
ctx,
|
||||
)
|
||||
@@ -977,6 +1012,14 @@ impl<'a> TenantDownloader<'a> {
|
||||
"Skipped downloading missing layer {}, raced with compaction/gc?",
|
||||
layer.name
|
||||
);
|
||||
|
||||
// If the layer is 404, adjust the progress statistics to reflect that we will not download it.
|
||||
let mut progress = self.secondary_state.progress.lock().unwrap();
|
||||
progress.layers_total = progress.layers_total.saturating_sub(1);
|
||||
progress.bytes_total = progress
|
||||
.bytes_total
|
||||
.saturating_sub(layer.metadata.file_size);
|
||||
|
||||
return Ok(None);
|
||||
}
|
||||
Err(e) => return Err(e.into()),
|
||||
@@ -1012,19 +1055,6 @@ impl<'a> TenantDownloader<'a> {
|
||||
|
||||
Ok(Some(layer))
|
||||
}
|
||||
|
||||
/// Calculate the currently allowed parallelism of layer download tasks, based on activity level of the remote storage
|
||||
fn layer_concurrency(activity: RemoteStorageActivity) -> usize {
|
||||
// When less than 75% of units are available, use minimum concurrency. Else, do a linear mapping
|
||||
// of our concurrency range to the units available within the remaining 25%.
|
||||
let clamp_at = (activity.read_total * 3) / 4;
|
||||
if activity.read_available > clamp_at {
|
||||
(MAX_LAYER_CONCURRENCY * (activity.read_available - clamp_at))
|
||||
/ (activity.read_total - clamp_at)
|
||||
} else {
|
||||
MIN_LAYER_CONCURRENCY
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline
|
||||
@@ -1114,8 +1144,9 @@ async fn init_timeline_state(
|
||||
tenant_shard_id,
|
||||
&heatmap.timeline_id,
|
||||
name,
|
||||
LayerFileMetadata::from(&remote_meta.metadata),
|
||||
remote_meta.metadata.clone(),
|
||||
remote_meta.access_time,
|
||||
file_path,
|
||||
),
|
||||
);
|
||||
}
|
||||
@@ -1147,58 +1178,3 @@ async fn init_timeline_state(
|
||||
|
||||
detail
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn layer_concurrency() {
|
||||
// Totally idle
|
||||
assert_eq!(
|
||||
TenantDownloader::layer_concurrency(RemoteStorageActivity {
|
||||
read_available: 16,
|
||||
read_total: 16,
|
||||
write_available: 16,
|
||||
write_total: 16
|
||||
}),
|
||||
MAX_LAYER_CONCURRENCY
|
||||
);
|
||||
|
||||
// Totally busy
|
||||
assert_eq!(
|
||||
TenantDownloader::layer_concurrency(RemoteStorageActivity {
|
||||
read_available: 0,
|
||||
read_total: 16,
|
||||
|
||||
write_available: 16,
|
||||
write_total: 16
|
||||
}),
|
||||
MIN_LAYER_CONCURRENCY
|
||||
);
|
||||
|
||||
// Edge of the range at which we interpolate
|
||||
assert_eq!(
|
||||
TenantDownloader::layer_concurrency(RemoteStorageActivity {
|
||||
read_available: 12,
|
||||
read_total: 16,
|
||||
|
||||
write_available: 16,
|
||||
write_total: 16
|
||||
}),
|
||||
MIN_LAYER_CONCURRENCY
|
||||
);
|
||||
|
||||
// Midpoint of the range in which we interpolate
|
||||
assert_eq!(
|
||||
TenantDownloader::layer_concurrency(RemoteStorageActivity {
|
||||
read_available: 14,
|
||||
read_total: 16,
|
||||
|
||||
write_available: 16,
|
||||
write_total: 16
|
||||
}),
|
||||
MAX_LAYER_CONCURRENCY / 2
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::time::SystemTime;
|
||||
|
||||
use crate::tenant::{remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerName};
|
||||
use crate::tenant::{remote_timeline_client::index::LayerFileMetadata, storage_layer::LayerName};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr, TimestampSeconds};
|
||||
@@ -38,7 +38,7 @@ pub(crate) struct HeatMapTimeline {
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub(crate) struct HeatMapLayer {
|
||||
pub(super) name: LayerName,
|
||||
pub(super) metadata: IndexLayerMetadata,
|
||||
pub(super) metadata: LayerFileMetadata,
|
||||
|
||||
#[serde_as(as = "TimestampSeconds<i64>")]
|
||||
pub(super) access_time: SystemTime,
|
||||
@@ -49,7 +49,7 @@ pub(crate) struct HeatMapLayer {
|
||||
impl HeatMapLayer {
|
||||
pub(crate) fn new(
|
||||
name: LayerName,
|
||||
metadata: IndexLayerMetadata,
|
||||
metadata: LayerFileMetadata,
|
||||
access_time: SystemTime,
|
||||
) -> Self {
|
||||
Self {
|
||||
|
||||
@@ -53,7 +53,7 @@ pub(super) async fn heatmap_uploader_task(
|
||||
|
||||
scheduler
|
||||
.run(command_queue, background_jobs_can_start, cancel)
|
||||
.instrument(info_span!("heatmap_uploader"))
|
||||
.instrument(info_span!("heatmap_upload_scheduler"))
|
||||
.await
|
||||
}
|
||||
|
||||
|
||||
@@ -179,6 +179,13 @@ where
|
||||
// Schedule some work, if concurrency limit permits it
|
||||
self.spawn_pending();
|
||||
|
||||
// This message is printed every scheduling iteration as proof of liveness when looking at logs
|
||||
tracing::info!(
|
||||
"Status: {} tasks running, {} pending",
|
||||
self.running.len(),
|
||||
self.pending.len()
|
||||
);
|
||||
|
||||
// Between scheduling iterations, we will:
|
||||
// - Drain any complete tasks and spawn pending tasks
|
||||
// - Handle incoming administrative commands
|
||||
@@ -258,7 +265,11 @@ where
|
||||
|
||||
self.tasks.spawn(fut);
|
||||
|
||||
self.running.insert(tenant_shard_id, in_progress);
|
||||
let replaced = self.running.insert(tenant_shard_id, in_progress);
|
||||
debug_assert!(replaced.is_none());
|
||||
if replaced.is_some() {
|
||||
tracing::warn!(%tenant_shard_id, "Unexpectedly spawned a task when one was already running")
|
||||
}
|
||||
}
|
||||
|
||||
/// For all pending tenants that are elegible for execution, spawn their task.
|
||||
@@ -268,7 +279,9 @@ where
|
||||
while !self.pending.is_empty() && self.running.len() < self.concurrency {
|
||||
// unwrap: loop condition includes !is_empty()
|
||||
let pending = self.pending.pop_front().unwrap();
|
||||
self.do_spawn(pending);
|
||||
if !self.running.contains_key(pending.get_tenant_shard_id()) {
|
||||
self.do_spawn(pending);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -321,7 +334,8 @@ where
|
||||
|
||||
let tenant_shard_id = job.get_tenant_shard_id();
|
||||
let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
|
||||
tracing::info!("Command already running, waiting for it");
|
||||
tracing::info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
|
||||
"Command already running, waiting for it");
|
||||
barrier
|
||||
} else {
|
||||
let running = self.spawn_now(job);
|
||||
|
||||
@@ -113,12 +113,20 @@ impl From<VectoredValueReconstructState> for ValueReconstructState {
|
||||
}
|
||||
}
|
||||
|
||||
/// Bag of data accumulated during a vectored get
|
||||
/// Bag of data accumulated during a vectored get..
|
||||
pub(crate) struct ValuesReconstructState {
|
||||
/// The keys will be removed after `get_vectored` completes. The caller outside `Timeline`
|
||||
/// should not expect to get anything from this hashmap.
|
||||
pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,
|
||||
|
||||
/// The keys which are already retrieved
|
||||
keys_done: KeySpaceRandomAccum,
|
||||
|
||||
/// The keys covered by the image layers
|
||||
keys_with_image_coverage: Option<Range<Key>>,
|
||||
|
||||
// Statistics that are still accessible as a caller of `get_vectored_impl`.
|
||||
layers_visited: u32,
|
||||
delta_layers_visited: u32,
|
||||
}
|
||||
|
||||
impl ValuesReconstructState {
|
||||
@@ -126,7 +134,9 @@ impl ValuesReconstructState {
|
||||
Self {
|
||||
keys: HashMap::new(),
|
||||
keys_done: KeySpaceRandomAccum::new(),
|
||||
keys_with_image_coverage: None,
|
||||
layers_visited: 0,
|
||||
delta_layers_visited: 0,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -140,8 +150,17 @@ impl ValuesReconstructState {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn on_layer_visited(&mut self) {
|
||||
pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) {
|
||||
self.layers_visited += 1;
|
||||
if let ReadableLayer::PersistentLayer(layer) = layer {
|
||||
if layer.layer_desc().is_delta() {
|
||||
self.delta_layers_visited += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_delta_layers_visited(&self) -> u32 {
|
||||
self.delta_layers_visited
|
||||
}
|
||||
|
||||
pub(crate) fn get_layers_visited(&self) -> u32 {
|
||||
@@ -171,6 +190,16 @@ impl ValuesReconstructState {
|
||||
}
|
||||
}
|
||||
|
||||
/// On hitting image layer, we can mark all keys in this range as done, because
|
||||
/// if the image layer does not contain a key, it is deleted/never added.
|
||||
pub(crate) fn on_image_layer_visited(&mut self, key_range: &Range<Key>) {
|
||||
let prev_val = self.keys_with_image_coverage.replace(key_range.clone());
|
||||
assert_eq!(
|
||||
prev_val, None,
|
||||
"should consume the keyspace before the next iteration"
|
||||
);
|
||||
}
|
||||
|
||||
/// Update the state collected for a given key.
|
||||
/// Returns true if this was the last value needed for the key and false otherwise.
|
||||
///
|
||||
@@ -233,8 +262,12 @@ impl ValuesReconstructState {
|
||||
|
||||
/// Returns the key space describing the keys that have
|
||||
/// been marked as completed since the last call to this function.
|
||||
pub(crate) fn consume_done_keys(&mut self) -> KeySpace {
|
||||
self.keys_done.consume_keyspace()
|
||||
/// Returns individual keys done, and the image layer coverage.
|
||||
pub(crate) fn consume_done_keys(&mut self) -> (KeySpace, Option<Range<Key>>) {
|
||||
(
|
||||
self.keys_done.consume_keyspace(),
|
||||
self.keys_with_image_coverage.take(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -47,7 +47,7 @@ use hex;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::LayerAccessKind;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_api::shard::{ShardIdentity, TenantShardId};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fs::File;
|
||||
@@ -158,6 +158,7 @@ pub struct ImageLayerInner {
|
||||
index_start_blk: u32,
|
||||
index_root_blk: u32,
|
||||
|
||||
key_range: Range<Key>,
|
||||
lsn: Lsn,
|
||||
|
||||
file: VirtualFile,
|
||||
@@ -419,6 +420,7 @@ impl ImageLayerInner {
|
||||
file,
|
||||
file_id,
|
||||
max_vectored_read_bytes,
|
||||
key_range: actual_summary.key_range,
|
||||
}))
|
||||
}
|
||||
|
||||
@@ -471,19 +473,27 @@ impl ImageLayerInner {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
let reads = self
|
||||
.plan_reads(keyspace, ctx)
|
||||
.plan_reads(keyspace, None, ctx)
|
||||
.await
|
||||
.map_err(GetVectoredError::Other)?;
|
||||
|
||||
self.do_reads_and_update_state(reads, reconstruct_state, ctx)
|
||||
.await;
|
||||
|
||||
reconstruct_state.on_image_layer_visited(&self.key_range);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Traverse the layer's index to build read operations on the overlap of the input keyspace
|
||||
/// and the keys in this layer.
|
||||
///
|
||||
/// If shard_identity is provided, it will be used to filter keys down to those stored on
|
||||
/// this shard.
|
||||
async fn plan_reads(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
shard_identity: Option<&ShardIdentity>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<VectoredRead>> {
|
||||
let mut planner = VectoredReadPlanner::new(
|
||||
@@ -503,7 +513,6 @@ impl ImageLayerInner {
|
||||
|
||||
for range in keyspace.ranges.iter() {
|
||||
let mut range_end_handled = false;
|
||||
|
||||
let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
range.start.write_to_byte_slice(&mut search_key);
|
||||
|
||||
@@ -516,12 +525,22 @@ impl ImageLayerInner {
|
||||
let key = Key::from_slice(&raw_key[..KEY_SIZE]);
|
||||
assert!(key >= range.start);
|
||||
|
||||
let flag = if let Some(shard_identity) = shard_identity {
|
||||
if shard_identity.is_key_disposable(&key) {
|
||||
BlobFlag::Ignore
|
||||
} else {
|
||||
BlobFlag::None
|
||||
}
|
||||
} else {
|
||||
BlobFlag::None
|
||||
};
|
||||
|
||||
if key >= range.end {
|
||||
planner.handle_range_end(offset);
|
||||
range_end_handled = true;
|
||||
break;
|
||||
} else {
|
||||
planner.handle(key, self.lsn, offset, BlobFlag::None);
|
||||
planner.handle(key, self.lsn, offset, flag);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -534,6 +553,50 @@ impl ImageLayerInner {
|
||||
Ok(planner.finish())
|
||||
}
|
||||
|
||||
/// Given a key range, select the parts of that range that should be retained by the ShardIdentity,
|
||||
/// then execute vectored GET operations, passing the results of all read keys into the writer.
|
||||
pub(super) async fn filter(
|
||||
&self,
|
||||
shard_identity: &ShardIdentity,
|
||||
writer: &mut ImageLayerWriter,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<usize> {
|
||||
// Fragment the range into the regions owned by this ShardIdentity
|
||||
let plan = self
|
||||
.plan_reads(
|
||||
KeySpace {
|
||||
// If asked for the total key space, plan_reads will give us all the keys in the layer
|
||||
ranges: vec![Key::MIN..Key::MAX],
|
||||
},
|
||||
Some(shard_identity),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&self.file);
|
||||
let mut key_count = 0;
|
||||
for read in plan.into_iter() {
|
||||
let buf_size = read.size();
|
||||
|
||||
let buf = BytesMut::with_capacity(buf_size);
|
||||
let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?;
|
||||
|
||||
let frozen_buf = blobs_buf.buf.freeze();
|
||||
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let img_buf = frozen_buf.slice(meta.start..meta.end);
|
||||
|
||||
key_count += 1;
|
||||
writer
|
||||
.put_image(meta.meta.key, img_buf, ctx)
|
||||
.await
|
||||
.context(format!("Storing key {}", meta.meta.key))?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(key_count)
|
||||
}
|
||||
|
||||
async fn do_reads_and_update_state(
|
||||
&self,
|
||||
reads: Vec<VectoredRead>,
|
||||
@@ -646,7 +709,7 @@ impl ImageLayerWriterInner {
|
||||
lsn,
|
||||
},
|
||||
);
|
||||
info!("new image layer {path}");
|
||||
trace!("creating image layer {}", path);
|
||||
let mut file = {
|
||||
VirtualFile::open_with_options(
|
||||
&path,
|
||||
@@ -766,7 +829,7 @@ impl ImageLayerWriterInner {
|
||||
// FIXME: why not carry the virtualfile here, it supports renaming?
|
||||
let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
|
||||
|
||||
trace!("created image layer {}", layer.local_path());
|
||||
info!("created image layer {}", layer.local_path());
|
||||
|
||||
Ok(layer)
|
||||
}
|
||||
@@ -851,3 +914,136 @@ impl Drop for ImageLayerWriter {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::{
|
||||
key::Key,
|
||||
shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize},
|
||||
};
|
||||
use utils::{id::TimelineId, lsn::Lsn};
|
||||
|
||||
use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION};
|
||||
|
||||
use super::ImageLayerWriter;
|
||||
|
||||
#[tokio::test]
|
||||
async fn image_layer_rewrite() {
|
||||
let harness = TenantHarness::create("test_image_layer_rewrite").unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
// The LSN at which we will create an image layer to filter
|
||||
let lsn = Lsn(0xdeadbeef0000);
|
||||
|
||||
let timeline_id = TimelineId::generate();
|
||||
let timeline = tenant
|
||||
.create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// This key range contains several 0x8000 page stripes, only one of which belongs to shard zero
|
||||
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
|
||||
let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
|
||||
let range = input_start..input_end;
|
||||
|
||||
// Build an image layer to filter
|
||||
let resident = {
|
||||
let mut writer = ImageLayerWriter::new(
|
||||
harness.conf,
|
||||
timeline_id,
|
||||
harness.tenant_shard_id,
|
||||
&range,
|
||||
lsn,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let foo_img = Bytes::from_static(&[1, 2, 3, 4]);
|
||||
let mut key = range.start;
|
||||
while key < range.end {
|
||||
writer.put_image(key, foo_img.clone(), &ctx).await.unwrap();
|
||||
|
||||
key = key.next();
|
||||
}
|
||||
writer.finish(&timeline, &ctx).await.unwrap()
|
||||
};
|
||||
let original_size = resident.metadata().file_size;
|
||||
|
||||
// Filter for various shards: this exercises cases like values at start of key range, end of key
|
||||
// range, middle of key range.
|
||||
for shard_number in 0..4 {
|
||||
let mut filtered_writer = ImageLayerWriter::new(
|
||||
harness.conf,
|
||||
timeline_id,
|
||||
harness.tenant_shard_id,
|
||||
&range,
|
||||
lsn,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// TenantHarness gave us an unsharded tenant, but we'll use a sharded ShardIdentity
|
||||
// to exercise filter()
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(shard_number),
|
||||
ShardCount::new(4),
|
||||
ShardStripeSize(0x8000),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let wrote_keys = resident
|
||||
.filter(&shard_identity, &mut filtered_writer, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let replacement = if wrote_keys > 0 {
|
||||
Some(filtered_writer.finish(&timeline, &ctx).await.unwrap())
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// This exact size and those below will need updating as/when the layer encoding changes, but
|
||||
// should be deterministic for a given version of the format, as we used no randomness generating the input.
|
||||
assert_eq!(original_size, 1597440);
|
||||
|
||||
match shard_number {
|
||||
0 => {
|
||||
// We should have written out just one stripe for our shard identity
|
||||
assert_eq!(wrote_keys, 0x8000);
|
||||
let replacement = replacement.unwrap();
|
||||
|
||||
// We should have dropped some of the data
|
||||
assert!(replacement.metadata().file_size < original_size);
|
||||
assert!(replacement.metadata().file_size > 0);
|
||||
|
||||
// Assert that we dropped ~3/4 of the data.
|
||||
assert_eq!(replacement.metadata().file_size, 417792);
|
||||
}
|
||||
1 => {
|
||||
// Shard 1 has no keys in our input range
|
||||
assert_eq!(wrote_keys, 0x0);
|
||||
assert!(replacement.is_none());
|
||||
}
|
||||
2 => {
|
||||
// Shard 2 has one stripes in the input range
|
||||
assert_eq!(wrote_keys, 0x8000);
|
||||
let replacement = replacement.unwrap();
|
||||
assert!(replacement.metadata().file_size < original_size);
|
||||
assert!(replacement.metadata().file_size > 0);
|
||||
assert_eq!(replacement.metadata().file_size, 417792);
|
||||
}
|
||||
3 => {
|
||||
// Shard 3 has two stripes in the input range
|
||||
assert_eq!(wrote_keys, 0x10000);
|
||||
let replacement = replacement.unwrap();
|
||||
assert!(replacement.metadata().file_size < original_size);
|
||||
assert!(replacement.metadata().file_size > 0);
|
||||
assert_eq!(replacement.metadata().file_size, 811008);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@ use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::{
|
||||
HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
|
||||
};
|
||||
use pageserver_api::shard::{ShardIndex, TenantShardId};
|
||||
use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId};
|
||||
use std::ops::Range;
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||
use std::sync::{Arc, Weak};
|
||||
@@ -12,7 +12,7 @@ use std::time::{Duration, SystemTime};
|
||||
use tracing::Instrument;
|
||||
use utils::id::TimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::sync::heavier_once_cell;
|
||||
use utils::sync::{gate, heavier_once_cell};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
@@ -23,10 +23,10 @@ use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};
|
||||
|
||||
use super::delta_layer::{self, DeltaEntry};
|
||||
use super::image_layer;
|
||||
use super::image_layer::{self};
|
||||
use super::{
|
||||
AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerName, PersistentLayerDesc,
|
||||
ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
|
||||
AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
|
||||
PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
|
||||
};
|
||||
|
||||
use utils::generation::Generation;
|
||||
@@ -161,7 +161,7 @@ impl Layer {
|
||||
timeline.tenant_shard_id,
|
||||
timeline.timeline_id,
|
||||
file_name,
|
||||
metadata.file_size(),
|
||||
metadata.file_size,
|
||||
);
|
||||
|
||||
let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted);
|
||||
@@ -194,7 +194,7 @@ impl Layer {
|
||||
timeline.tenant_shard_id,
|
||||
timeline.timeline_id,
|
||||
file_name,
|
||||
metadata.file_size(),
|
||||
metadata.file_size,
|
||||
);
|
||||
|
||||
let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident);
|
||||
@@ -227,7 +227,7 @@ impl Layer {
|
||||
|
||||
timeline
|
||||
.metrics
|
||||
.resident_physical_size_add(metadata.file_size());
|
||||
.resident_physical_size_add(metadata.file_size);
|
||||
|
||||
ResidentLayer { downloaded, owner }
|
||||
}
|
||||
@@ -1108,6 +1108,7 @@ impl LayerInner {
|
||||
.download_layer_file(
|
||||
&self.desc.layer_name(),
|
||||
&self.metadata(),
|
||||
&self.path,
|
||||
&timeline.cancel,
|
||||
ctx,
|
||||
)
|
||||
@@ -1263,6 +1264,7 @@ impl LayerInner {
|
||||
lsn_end: lsn_range.end,
|
||||
remote: !resident,
|
||||
access_stats,
|
||||
l0: crate::tenant::layer_map::LayerMap::is_l0(self.layer_desc()),
|
||||
}
|
||||
} else {
|
||||
let lsn = self.desc.image_layer_lsn();
|
||||
@@ -1331,7 +1333,7 @@ impl LayerInner {
|
||||
|
||||
is_good_to_continue(&rx.borrow_and_update())?;
|
||||
|
||||
let Ok(_gate) = timeline.gate.enter() else {
|
||||
let Ok(gate) = timeline.gate.enter() else {
|
||||
return Err(EvictionCancelled::TimelineGone);
|
||||
};
|
||||
|
||||
@@ -1419,7 +1421,7 @@ impl LayerInner {
|
||||
Self::spawn_blocking(move || {
|
||||
let _span = span.entered();
|
||||
|
||||
let res = self.evict_blocking(&timeline, &permit);
|
||||
let res = self.evict_blocking(&timeline, &gate, &permit);
|
||||
|
||||
let waiters = self.inner.initializer_count();
|
||||
|
||||
@@ -1445,6 +1447,7 @@ impl LayerInner {
|
||||
fn evict_blocking(
|
||||
&self,
|
||||
timeline: &Timeline,
|
||||
_gate: &gate::GateGuard,
|
||||
_permit: &heavier_once_cell::InitPermit,
|
||||
) -> Result<(), EvictionCancelled> {
|
||||
// now accesses to `self.inner.get_or_init*` wait on the semaphore or the `_permit`
|
||||
@@ -1799,16 +1802,15 @@ impl ResidentLayer {
|
||||
use LayerKind::*;
|
||||
|
||||
let owner = &self.owner.0;
|
||||
|
||||
match self.downloaded.get(owner, ctx).await? {
|
||||
Delta(ref d) => {
|
||||
// this is valid because the DownloadedLayer::kind is a OnceCell, not a
|
||||
// Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
|
||||
// while it's being held.
|
||||
owner
|
||||
.access_stats
|
||||
.record_access(LayerAccessKind::KeyIter, ctx);
|
||||
|
||||
// this is valid because the DownloadedLayer::kind is a OnceCell, not a
|
||||
// Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
|
||||
// while it's being held.
|
||||
delta_layer::DeltaLayerInner::load_keys(d, ctx)
|
||||
.await
|
||||
.with_context(|| format!("Layer index is corrupted for {self}"))
|
||||
@@ -1817,6 +1819,23 @@ impl ResidentLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Read all they keys in this layer which match the ShardIdentity, and write them all to
|
||||
/// the provided writer. Return the number of keys written.
|
||||
#[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))]
|
||||
pub(crate) async fn filter<'a>(
|
||||
&'a self,
|
||||
shard_identity: &ShardIdentity,
|
||||
writer: &mut ImageLayerWriter,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<usize> {
|
||||
use LayerKind::*;
|
||||
|
||||
match self.downloaded.get(&self.owner.0, ctx).await? {
|
||||
Delta(_) => anyhow::bail!(format!("cannot filter() on a delta layer {self}")),
|
||||
Image(i) => i.filter(shard_identity, writer, ctx).await,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the amount of keys and values written to the writer.
|
||||
pub(crate) async fn copy_delta_prefix(
|
||||
&self,
|
||||
|
||||
@@ -347,37 +347,33 @@ impl<'de> serde::de::Visitor<'de> for LayerNameVisitor {
|
||||
mod test {
|
||||
use super::*;
|
||||
#[test]
|
||||
fn image_layer_parse() -> anyhow::Result<()> {
|
||||
fn image_layer_parse() {
|
||||
let expected = LayerName::Image(ImageLayerName {
|
||||
key_range: Key::from_i128(0)
|
||||
..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
|
||||
lsn: Lsn::from_hex("00000000014FED58").unwrap(),
|
||||
});
|
||||
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").map_err(|s| anyhow::anyhow!(s))?;
|
||||
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").unwrap();
|
||||
assert_eq!(parsed, expected,);
|
||||
|
||||
// Omitting generation suffix is valid
|
||||
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").map_err(|s| anyhow::anyhow!(s))?;
|
||||
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").unwrap();
|
||||
assert_eq!(parsed, expected,);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn delta_layer_parse() -> anyhow::Result<()> {
|
||||
fn delta_layer_parse() {
|
||||
let expected = LayerName::Delta(DeltaLayerName {
|
||||
key_range: Key::from_i128(0)
|
||||
..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
|
||||
lsn_range: Lsn::from_hex("00000000014FED58").unwrap()
|
||||
..Lsn::from_hex("000000000154C481").unwrap(),
|
||||
});
|
||||
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").map_err(|s| anyhow::anyhow!(s))?;
|
||||
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").unwrap();
|
||||
assert_eq!(parsed, expected);
|
||||
|
||||
// Omitting generation suffix is valid
|
||||
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").map_err(|s| anyhow::anyhow!(s))?;
|
||||
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").unwrap();
|
||||
assert_eq!(parsed, expected);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,14 +18,14 @@ use fail::fail_point;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::{
|
||||
key::{
|
||||
AUX_FILES_KEY, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
|
||||
NON_INHERITED_SPARSE_RANGE,
|
||||
AUX_FILES_KEY, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX,
|
||||
NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE,
|
||||
},
|
||||
keyspace::{KeySpaceAccum, SparseKeyPartitioning},
|
||||
keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
|
||||
models::{
|
||||
AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo,
|
||||
DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo,
|
||||
TimelineState,
|
||||
AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, CompactionAlgorithmSettings,
|
||||
DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
|
||||
InMemoryLayerInfo, LayerMapInfo, LsnLease, TimelineState,
|
||||
},
|
||||
reltag::BlockNumber,
|
||||
shard::{ShardIdentity, ShardNumber, TenantShardId},
|
||||
@@ -41,6 +41,7 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
fs_ext,
|
||||
sync::gate::{Gate, GateGuard},
|
||||
vec_map::VecMap,
|
||||
};
|
||||
@@ -60,7 +61,7 @@ use std::{
|
||||
ops::ControlFlow,
|
||||
};
|
||||
|
||||
use crate::tenant::timeline::init::LocalLayerFileMetadata;
|
||||
use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS;
|
||||
use crate::{
|
||||
aux_file::AuxFileSizeEstimator,
|
||||
tenant::{
|
||||
@@ -267,6 +268,8 @@ pub struct Timeline {
|
||||
// Atomic would be more appropriate here.
|
||||
last_freeze_ts: RwLock<Instant>,
|
||||
|
||||
pub(crate) standby_horizon: AtomicLsn,
|
||||
|
||||
// WAL redo manager. `None` only for broken tenants.
|
||||
walredo_mgr: Option<Arc<super::WalRedoManager>>,
|
||||
|
||||
@@ -346,8 +349,8 @@ pub struct Timeline {
|
||||
// though let's keep them both for better error visibility.
|
||||
pub initdb_lsn: Lsn,
|
||||
|
||||
/// When did we last calculate the partitioning?
|
||||
partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,
|
||||
/// When did we last calculate the partitioning? Make it pub to test cases.
|
||||
pub(super) partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,
|
||||
|
||||
/// Configuration: how often should the partitioning be recalculated.
|
||||
repartition_threshold: u64,
|
||||
@@ -413,7 +416,11 @@ pub struct Timeline {
|
||||
/// Keep aux directory cache to avoid it's reconstruction on each update
|
||||
pub(crate) aux_files: tokio::sync::Mutex<AuxFilesState>,
|
||||
|
||||
/// Size estimator for aux file v2
|
||||
pub(crate) aux_file_size_estimator: AuxFileSizeEstimator,
|
||||
|
||||
/// Indicate whether aux file v2 storage is enabled.
|
||||
pub(crate) last_aux_file_policy: AtomicAuxFilePolicy,
|
||||
}
|
||||
|
||||
pub struct WalReceiverInfo {
|
||||
@@ -477,6 +484,11 @@ impl GcCutoffs {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct TimelineVisitOutcome {
|
||||
completed_keyspace: KeySpace,
|
||||
image_covered_keyspace: KeySpace,
|
||||
}
|
||||
|
||||
/// An error happened in a get() operation.
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum PageReconstructError {
|
||||
@@ -501,6 +513,13 @@ pub(crate) enum PageReconstructError {
|
||||
MissingKey(MissingKeyError),
|
||||
}
|
||||
|
||||
impl GetVectoredError {
|
||||
#[cfg(test)]
|
||||
pub(crate) fn is_missing_key_error(&self) -> bool {
|
||||
matches!(self, Self::MissingKey(_))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct MissingKeyError {
|
||||
key: Key,
|
||||
@@ -778,6 +797,11 @@ pub(crate) enum ShutdownMode {
|
||||
Hard,
|
||||
}
|
||||
|
||||
struct ImageLayerCreationOutcome {
|
||||
image: Option<ResidentLayer>,
|
||||
next_start_key: Key,
|
||||
}
|
||||
|
||||
/// Public interface functions
|
||||
impl Timeline {
|
||||
/// Get the LSN where this branch was created
|
||||
@@ -879,7 +903,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
let vectored_res = self
|
||||
.get_vectored_impl(keyspace.clone(), lsn, reconstruct_state, ctx)
|
||||
.get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
|
||||
.await;
|
||||
|
||||
if self.conf.validate_vectored_get {
|
||||
@@ -1024,7 +1048,12 @@ impl Timeline {
|
||||
}
|
||||
GetVectoredImpl::Vectored => {
|
||||
let vectored_res = self
|
||||
.get_vectored_impl(keyspace.clone(), lsn, ValuesReconstructState::new(), ctx)
|
||||
.get_vectored_impl(
|
||||
keyspace.clone(),
|
||||
lsn,
|
||||
&mut ValuesReconstructState::new(),
|
||||
ctx,
|
||||
)
|
||||
.await;
|
||||
|
||||
if self.conf.validate_vectored_get {
|
||||
@@ -1112,7 +1141,7 @@ impl Timeline {
|
||||
.get_vectored_impl(
|
||||
keyspace.clone(),
|
||||
lsn,
|
||||
ValuesReconstructState::default(),
|
||||
&mut ValuesReconstructState::default(),
|
||||
ctx,
|
||||
)
|
||||
.await;
|
||||
@@ -1189,7 +1218,7 @@ impl Timeline {
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
lsn: Lsn,
|
||||
mut reconstruct_state: ValuesReconstructState,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
|
||||
let get_kind = if keyspace.total_raw_size() == 1 {
|
||||
@@ -1201,7 +1230,7 @@ impl Timeline {
|
||||
let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME
|
||||
.for_get_kind(get_kind)
|
||||
.start_timer();
|
||||
self.get_vectored_reconstruct_data(keyspace, lsn, &mut reconstruct_state, ctx)
|
||||
self.get_vectored_reconstruct_data(keyspace, lsn, reconstruct_state, ctx)
|
||||
.await?;
|
||||
get_data_timer.stop_and_record();
|
||||
|
||||
@@ -1210,7 +1239,8 @@ impl Timeline {
|
||||
.start_timer();
|
||||
let mut results: BTreeMap<Key, Result<Bytes, PageReconstructError>> = BTreeMap::new();
|
||||
let layers_visited = reconstruct_state.get_layers_visited();
|
||||
for (key, res) in reconstruct_state.keys {
|
||||
|
||||
for (key, res) in std::mem::take(&mut reconstruct_state.keys) {
|
||||
match res {
|
||||
Err(err) => {
|
||||
results.insert(key, Err(err));
|
||||
@@ -1393,7 +1423,7 @@ impl Timeline {
|
||||
let layer_map = guard.layer_map();
|
||||
let mut size = 0;
|
||||
for l in layer_map.iter_historic_layers() {
|
||||
size += l.file_size();
|
||||
size += l.file_size;
|
||||
}
|
||||
size
|
||||
}
|
||||
@@ -1501,6 +1531,20 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Obtains a temporary lease blocking garbage collection for the given LSN
|
||||
pub(crate) fn make_lsn_lease(
|
||||
&self,
|
||||
_lsn: Lsn,
|
||||
_ctx: &RequestContext,
|
||||
) -> anyhow::Result<LsnLease> {
|
||||
const LEASE_LENGTH: Duration = Duration::from_secs(5 * 60);
|
||||
let lease = LsnLease {
|
||||
valid_until: SystemTime::now() + LEASE_LENGTH,
|
||||
};
|
||||
// TODO: dummy implementation
|
||||
Ok(lease)
|
||||
}
|
||||
|
||||
/// Flush to disk all data that was written with the put_* functions
|
||||
#[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
|
||||
pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> {
|
||||
@@ -1655,7 +1699,7 @@ impl Timeline {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
match self.get_compaction_algorithm() {
|
||||
match self.get_compaction_algorithm_settings().kind {
|
||||
CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await,
|
||||
CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await,
|
||||
}
|
||||
@@ -2051,12 +2095,14 @@ impl Timeline {
|
||||
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
|
||||
}
|
||||
|
||||
fn get_compaction_algorithm(&self) -> CompactionAlgorithm {
|
||||
fn get_compaction_algorithm_settings(&self) -> CompactionAlgorithmSettings {
|
||||
let tenant_conf = &self.tenant_conf.load();
|
||||
tenant_conf
|
||||
.tenant_conf
|
||||
.compaction_algorithm
|
||||
.unwrap_or(self.conf.default_tenant_conf.compaction_algorithm)
|
||||
.as_ref()
|
||||
.unwrap_or(&self.conf.default_tenant_conf.compaction_algorithm)
|
||||
.clone()
|
||||
}
|
||||
|
||||
fn get_eviction_policy(&self) -> EvictionPolicy {
|
||||
@@ -2133,6 +2179,7 @@ impl Timeline {
|
||||
resources: TimelineResources,
|
||||
pg_version: u32,
|
||||
state: TimelineState,
|
||||
aux_file_policy: Option<AuxFilePolicy>,
|
||||
cancel: CancellationToken,
|
||||
) -> Arc<Self> {
|
||||
let disk_consistent_lsn = metadata.disk_consistent_lsn();
|
||||
@@ -2249,6 +2296,8 @@ impl Timeline {
|
||||
compaction_lock: tokio::sync::Mutex::default(),
|
||||
gc_lock: tokio::sync::Mutex::default(),
|
||||
|
||||
standby_horizon: AtomicLsn::new(0),
|
||||
|
||||
timeline_get_throttle: resources.timeline_get_throttle,
|
||||
|
||||
aux_files: tokio::sync::Mutex::new(AuxFilesState {
|
||||
@@ -2257,6 +2306,8 @@ impl Timeline {
|
||||
}),
|
||||
|
||||
aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics),
|
||||
|
||||
last_aux_file_policy: AtomicAuxFilePolicy::new(aux_file_policy),
|
||||
};
|
||||
result.repartition_threshold =
|
||||
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
|
||||
@@ -2402,8 +2453,6 @@ impl Timeline {
|
||||
let span = tracing::Span::current();
|
||||
|
||||
// Copy to move into the task we're about to spawn
|
||||
let generation = self.generation;
|
||||
let shard = self.get_shard_index();
|
||||
let this = self.myself.upgrade().expect("&self method holds the arc");
|
||||
|
||||
let (loaded_layers, needs_cleanup, total_physical_size) = tokio::task::spawn_blocking({
|
||||
@@ -2417,11 +2466,14 @@ impl Timeline {
|
||||
|
||||
for discovered in discovered {
|
||||
let (name, kind) = match discovered {
|
||||
Discovered::Layer(layer_file_name, local_path, file_size) => {
|
||||
discovered_layers.push((layer_file_name, local_path, file_size));
|
||||
Discovered::Layer(layer_file_name, local_metadata) => {
|
||||
discovered_layers.push((layer_file_name, local_metadata));
|
||||
continue;
|
||||
}
|
||||
Discovered::IgnoredBackup => {
|
||||
Discovered::IgnoredBackup(path) => {
|
||||
std::fs::remove_file(path)
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.fatal_err("Removing .old file");
|
||||
continue;
|
||||
}
|
||||
Discovered::Unknown(file_name) => {
|
||||
@@ -2447,13 +2499,8 @@ impl Timeline {
|
||||
);
|
||||
}
|
||||
|
||||
let decided = init::reconcile(
|
||||
discovered_layers,
|
||||
index_part.as_ref(),
|
||||
disk_consistent_lsn,
|
||||
generation,
|
||||
shard,
|
||||
);
|
||||
let decided =
|
||||
init::reconcile(discovered_layers, index_part.as_ref(), disk_consistent_lsn);
|
||||
|
||||
let mut loaded_layers = Vec::new();
|
||||
let mut needs_cleanup = Vec::new();
|
||||
@@ -2461,21 +2508,6 @@ impl Timeline {
|
||||
|
||||
for (name, decision) in decided {
|
||||
let decision = match decision {
|
||||
Ok(UseRemote { local, remote }) => {
|
||||
// Remote is authoritative, but we may still choose to retain
|
||||
// the local file if the contents appear to match
|
||||
if local.metadata.file_size() == remote.file_size() {
|
||||
// Use the local file, but take the remote metadata so that we pick up
|
||||
// the correct generation.
|
||||
UseLocal(LocalLayerFileMetadata {
|
||||
metadata: remote,
|
||||
local_path: local.local_path,
|
||||
})
|
||||
} else {
|
||||
init::cleanup_local_file_for_remote(&local, &remote)?;
|
||||
UseRemote { local, remote }
|
||||
}
|
||||
}
|
||||
Ok(decision) => decision,
|
||||
Err(DismissedLayer::Future { local }) => {
|
||||
if let Some(local) = local {
|
||||
@@ -2493,6 +2525,11 @@ impl Timeline {
|
||||
// this file never existed remotely, we will have to do rework
|
||||
continue;
|
||||
}
|
||||
Err(DismissedLayer::BadMetadata(local)) => {
|
||||
init::cleanup_local_file_for_remote(&local)?;
|
||||
// this file never existed remotely, we will have to do rework
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
match &name {
|
||||
@@ -2503,14 +2540,12 @@ impl Timeline {
|
||||
tracing::debug!(layer=%name, ?decision, "applied");
|
||||
|
||||
let layer = match decision {
|
||||
UseLocal(local) => {
|
||||
total_physical_size += local.metadata.file_size();
|
||||
Layer::for_resident(conf, &this, local.local_path, name, local.metadata)
|
||||
Resident { local, remote } => {
|
||||
total_physical_size += local.file_size;
|
||||
Layer::for_resident(conf, &this, local.local_path, name, remote)
|
||||
.drop_eviction_guard()
|
||||
}
|
||||
Evicted(remote) | UseRemote { remote, .. } => {
|
||||
Layer::for_evicted(conf, &this, name, remote)
|
||||
}
|
||||
Evicted(remote) => Layer::for_evicted(conf, &this, name, remote),
|
||||
};
|
||||
|
||||
loaded_layers.push(layer);
|
||||
@@ -3019,7 +3054,7 @@ impl Timeline {
|
||||
|
||||
HeatMapLayer::new(
|
||||
layer.layer_desc().layer_name(),
|
||||
(&layer.metadata()).into(),
|
||||
layer.metadata(),
|
||||
last_activity_ts,
|
||||
)
|
||||
});
|
||||
@@ -3280,12 +3315,15 @@ impl Timeline {
|
||||
|
||||
let mut cont_lsn = Lsn(request_lsn.0 + 1);
|
||||
|
||||
loop {
|
||||
let missing_keyspace = loop {
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(GetVectoredError::Cancelled);
|
||||
}
|
||||
|
||||
let completed = Self::get_vectored_reconstruct_data_timeline(
|
||||
let TimelineVisitOutcome {
|
||||
completed_keyspace: completed,
|
||||
image_covered_keyspace,
|
||||
} = Self::get_vectored_reconstruct_data_timeline(
|
||||
timeline,
|
||||
keyspace.clone(),
|
||||
cont_lsn,
|
||||
@@ -3304,12 +3342,31 @@ impl Timeline {
|
||||
ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE],
|
||||
});
|
||||
|
||||
// Keyspace is fully retrieved, no ancestor timeline, or metadata scan (where we do not look
|
||||
// into ancestor timelines). TODO: is there any other metadata which we want to inherit?
|
||||
if keyspace.total_raw_size() == 0 || timeline.ancestor_timeline.is_none() {
|
||||
break;
|
||||
// Keyspace is fully retrieved
|
||||
if keyspace.is_empty() {
|
||||
break None;
|
||||
}
|
||||
|
||||
// Not fully retrieved but no ancestor timeline.
|
||||
if timeline.ancestor_timeline.is_none() {
|
||||
break Some(keyspace);
|
||||
}
|
||||
|
||||
// Now we see if there are keys covered by the image layer but does not exist in the
|
||||
// image layer, which means that the key does not exist.
|
||||
|
||||
// The block below will stop the vectored search if any of the keys encountered an image layer
|
||||
// which did not contain a snapshot for said key. Since we have already removed all completed
|
||||
// keys from `keyspace`, we expect there to be no overlap between it and the image covered key
|
||||
// space. If that's not the case, we had at least one key encounter a gap in the image layer
|
||||
// and stop the search as a result of that.
|
||||
let removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
|
||||
if !removed.is_empty() {
|
||||
break Some(removed);
|
||||
}
|
||||
// If we reached this point, `remove_overlapping_with` should not have made any change to the
|
||||
// keyspace.
|
||||
|
||||
// Take the min to avoid reconstructing a page with data newer than request Lsn.
|
||||
cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1));
|
||||
timeline_owned = timeline
|
||||
@@ -3317,14 +3374,14 @@ impl Timeline {
|
||||
.await
|
||||
.map_err(GetVectoredError::GetReadyAncestorError)?;
|
||||
timeline = &*timeline_owned;
|
||||
}
|
||||
};
|
||||
|
||||
if keyspace.total_raw_size() != 0 {
|
||||
if let Some(missing_keyspace) = missing_keyspace {
|
||||
return Err(GetVectoredError::MissingKey(MissingKeyError {
|
||||
key: keyspace.start().unwrap(), /* better if we can store the full keyspace */
|
||||
key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */
|
||||
shard: self
|
||||
.shard_identity
|
||||
.get_shard_number(&keyspace.start().unwrap()),
|
||||
.get_shard_number(&missing_keyspace.start().unwrap()),
|
||||
cont_lsn,
|
||||
request_lsn,
|
||||
ancestor_lsn: Some(timeline.ancestor_lsn),
|
||||
@@ -3349,6 +3406,9 @@ impl Timeline {
|
||||
///
|
||||
/// At each iteration pop the top of the fringe (the layer with the highest Lsn)
|
||||
/// and get all the required reconstruct data from the layer in one go.
|
||||
///
|
||||
/// Returns the completed keyspace and the keyspaces with image coverage. The caller
|
||||
/// decides how to deal with these two keyspaces.
|
||||
async fn get_vectored_reconstruct_data_timeline(
|
||||
timeline: &Timeline,
|
||||
keyspace: KeySpace,
|
||||
@@ -3356,20 +3416,27 @@ impl Timeline {
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<KeySpace, GetVectoredError> {
|
||||
) -> Result<TimelineVisitOutcome, GetVectoredError> {
|
||||
let mut unmapped_keyspace = keyspace.clone();
|
||||
let mut fringe = LayerFringe::new();
|
||||
|
||||
let mut completed_keyspace = KeySpace::default();
|
||||
let mut image_covered_keyspace = KeySpaceRandomAccum::new();
|
||||
|
||||
loop {
|
||||
if cancel.is_cancelled() {
|
||||
return Err(GetVectoredError::Cancelled);
|
||||
}
|
||||
|
||||
let keys_done_last_step = reconstruct_state.consume_done_keys();
|
||||
let (keys_done_last_step, keys_with_image_coverage) =
|
||||
reconstruct_state.consume_done_keys();
|
||||
unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
|
||||
completed_keyspace.merge(&keys_done_last_step);
|
||||
if let Some(keys_with_image_coverage) = keys_with_image_coverage {
|
||||
unmapped_keyspace
|
||||
.remove_overlapping_with(&KeySpace::single(keys_with_image_coverage.clone()));
|
||||
image_covered_keyspace.add_range(keys_with_image_coverage);
|
||||
}
|
||||
|
||||
// Do not descent any further if the last layer we visited
|
||||
// completed all keys in the keyspace it inspected. This is not
|
||||
@@ -3441,13 +3508,16 @@ impl Timeline {
|
||||
unmapped_keyspace = keyspace_to_read;
|
||||
cont_lsn = next_cont_lsn;
|
||||
|
||||
reconstruct_state.on_layer_visited();
|
||||
reconstruct_state.on_layer_visited(&layer_to_read);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(completed_keyspace)
|
||||
Ok(TimelineVisitOutcome {
|
||||
completed_keyspace,
|
||||
image_covered_keyspace: image_covered_keyspace.consume_keyspace(),
|
||||
})
|
||||
}
|
||||
|
||||
/// # Cancel-safety
|
||||
@@ -4127,6 +4197,176 @@ impl Timeline {
|
||||
false
|
||||
}
|
||||
|
||||
/// Create image layers for Postgres data. Assumes the caller passes a partition that is not too large,
|
||||
/// so that at most one image layer will be produced from this function.
|
||||
async fn create_image_layer_for_rel_blocks(
|
||||
self: &Arc<Self>,
|
||||
partition: &KeySpace,
|
||||
mut image_layer_writer: ImageLayerWriter,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
img_range: Range<Key>,
|
||||
start: Key,
|
||||
) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
|
||||
let mut wrote_keys = false;
|
||||
|
||||
let mut key_request_accum = KeySpaceAccum::new();
|
||||
for range in &partition.ranges {
|
||||
let mut key = range.start;
|
||||
while key < range.end {
|
||||
// Decide whether to retain this key: usually we do, but sharded tenants may
|
||||
// need to drop keys that don't belong to them. If we retain the key, add it
|
||||
// to `key_request_accum` for later issuing a vectored get
|
||||
if self.shard_identity.is_key_disposable(&key) {
|
||||
debug!(
|
||||
"Dropping key {} during compaction (it belongs on shard {:?})",
|
||||
key,
|
||||
self.shard_identity.get_shard_number(&key)
|
||||
);
|
||||
} else {
|
||||
key_request_accum.add_key(key);
|
||||
}
|
||||
|
||||
let last_key_in_range = key.next() == range.end;
|
||||
key = key.next();
|
||||
|
||||
// Maybe flush `key_rest_accum`
|
||||
if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS
|
||||
|| (last_key_in_range && key_request_accum.raw_size() > 0)
|
||||
{
|
||||
let results = self
|
||||
.get_vectored(key_request_accum.consume_keyspace(), lsn, ctx)
|
||||
.await?;
|
||||
|
||||
for (img_key, img) in results {
|
||||
let img = match img {
|
||||
Ok(img) => img,
|
||||
Err(err) => {
|
||||
// If we fail to reconstruct a VM or FSM page, we can zero the
|
||||
// page without losing any actual user data. That seems better
|
||||
// than failing repeatedly and getting stuck.
|
||||
//
|
||||
// We had a bug at one point, where we truncated the FSM and VM
|
||||
// in the pageserver, but the Postgres didn't know about that
|
||||
// and continued to generate incremental WAL records for pages
|
||||
// that didn't exist in the pageserver. Trying to replay those
|
||||
// WAL records failed to find the previous image of the page.
|
||||
// This special case allows us to recover from that situation.
|
||||
// See https://github.com/neondatabase/neon/issues/2601.
|
||||
//
|
||||
// Unfortunately we cannot do this for the main fork, or for
|
||||
// any metadata keys, keys, as that would lead to actual data
|
||||
// loss.
|
||||
if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key) {
|
||||
warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
|
||||
ZERO_PAGE.clone()
|
||||
} else {
|
||||
return Err(CreateImageLayersError::PageReconstructError(err));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Write all the keys we just read into our new image layer.
|
||||
image_layer_writer.put_image(img_key, img, ctx).await?;
|
||||
wrote_keys = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if wrote_keys {
|
||||
// Normal path: we have written some data into the new image layer for this
|
||||
// partition, so flush it to disk.
|
||||
let image_layer = image_layer_writer.finish(self, ctx).await?;
|
||||
Ok(ImageLayerCreationOutcome {
|
||||
image: Some(image_layer),
|
||||
next_start_key: img_range.end,
|
||||
})
|
||||
} else {
|
||||
// Special case: the image layer may be empty if this is a sharded tenant and the
|
||||
// partition does not cover any keys owned by this shard. In this case, to ensure
|
||||
// we don't leave gaps between image layers, leave `start` where it is, so that the next
|
||||
// layer we write will cover the key range that we just scanned.
|
||||
tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
|
||||
Ok(ImageLayerCreationOutcome {
|
||||
image: None,
|
||||
next_start_key: start,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Create an image layer for metadata keys. This function produces one image layer for all metadata
|
||||
/// keys for now. Because metadata keys cannot exceed basebackup size limit, the image layer for it
|
||||
/// would not be too large to fit in a single image layer.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn create_image_layer_for_metadata_keys(
|
||||
self: &Arc<Self>,
|
||||
partition: &KeySpace,
|
||||
mut image_layer_writer: ImageLayerWriter,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
img_range: Range<Key>,
|
||||
mode: ImageLayerCreationMode,
|
||||
) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
|
||||
assert!(!matches!(mode, ImageLayerCreationMode::Initial));
|
||||
|
||||
// Metadata keys image layer creation.
|
||||
let mut reconstruct_state = ValuesReconstructState::default();
|
||||
let data = self
|
||||
.get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx)
|
||||
.await?;
|
||||
let (data, total_kb_retrieved, total_key_retrieved) = {
|
||||
let mut new_data = BTreeMap::new();
|
||||
let mut total_kb_retrieved = 0;
|
||||
let mut total_key_retrieved = 0;
|
||||
for (k, v) in data {
|
||||
let v = v.map_err(CreateImageLayersError::PageReconstructError)?;
|
||||
total_kb_retrieved += KEY_SIZE + v.len();
|
||||
total_key_retrieved += 1;
|
||||
new_data.insert(k, v);
|
||||
}
|
||||
(new_data, total_kb_retrieved / 1024, total_key_retrieved)
|
||||
};
|
||||
let delta_file_accessed = reconstruct_state.get_delta_layers_visited();
|
||||
|
||||
let trigger_generation = delta_file_accessed as usize >= MAX_AUX_FILE_V2_DELTAS;
|
||||
debug!(
|
||||
"generate image layers for metadata keys: trigger_generation={trigger_generation}, \
|
||||
delta_file_accessed={delta_file_accessed}, total_kb_retrieved={total_kb_retrieved}, \
|
||||
total_key_retrieved={total_key_retrieved}"
|
||||
);
|
||||
if !trigger_generation && mode == ImageLayerCreationMode::Try {
|
||||
return Ok(ImageLayerCreationOutcome {
|
||||
image: None,
|
||||
next_start_key: img_range.end,
|
||||
});
|
||||
}
|
||||
let has_keys = !data.is_empty();
|
||||
for (k, v) in data {
|
||||
// Even if the value is empty (deleted), we do not delete it for now until we can ensure vectored get
|
||||
// considers this situation properly.
|
||||
// if v.is_empty() {
|
||||
// continue;
|
||||
// }
|
||||
|
||||
// No need to handle sharding b/c metadata keys are always on the 0-th shard.
|
||||
|
||||
// TODO: split image layers to avoid too large layer files. Too large image files are not handled
|
||||
// on the normal data path either.
|
||||
image_layer_writer.put_image(k, v, ctx).await?;
|
||||
}
|
||||
Ok(ImageLayerCreationOutcome {
|
||||
image: if has_keys {
|
||||
let image_layer = image_layer_writer.finish(self, ctx).await?;
|
||||
Some(image_layer)
|
||||
} else {
|
||||
tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
|
||||
None
|
||||
},
|
||||
next_start_key: img_range.end,
|
||||
})
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all, fields(%lsn, %mode))]
|
||||
async fn create_image_layers(
|
||||
self: &Arc<Timeline>,
|
||||
@@ -4168,19 +4408,17 @@ impl Timeline {
|
||||
|
||||
for partition in partitioning.parts.iter() {
|
||||
let img_range = start..partition.ranges.last().unwrap().end;
|
||||
|
||||
if partition.overlaps(&Key::metadata_key_range()) {
|
||||
// TODO(chi): The next patch will correctly create image layers for metadata keys, and it would be a
|
||||
// rather big change. Keep this patch small for now.
|
||||
match mode {
|
||||
ImageLayerCreationMode::Force | ImageLayerCreationMode::Try => {
|
||||
// skip image layer creation anyways for metadata keys.
|
||||
start = img_range.end;
|
||||
continue;
|
||||
}
|
||||
ImageLayerCreationMode::Initial => {
|
||||
return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
|
||||
}
|
||||
let compact_metadata = partition.overlaps(&Key::metadata_key_range());
|
||||
if compact_metadata {
|
||||
for range in &partition.ranges {
|
||||
assert!(
|
||||
range.start.field1 >= METADATA_KEY_BEGIN_PREFIX
|
||||
&& range.end.field1 <= METADATA_KEY_END_PREFIX,
|
||||
"metadata keys must be partitioned separately"
|
||||
);
|
||||
}
|
||||
if mode == ImageLayerCreationMode::Initial {
|
||||
return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
|
||||
}
|
||||
} else if let ImageLayerCreationMode::Try = mode {
|
||||
// check_for_image_layers = false -> skip
|
||||
@@ -4191,7 +4429,7 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
let mut image_layer_writer = ImageLayerWriter::new(
|
||||
let image_layer_writer = ImageLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
@@ -4207,87 +4445,39 @@ impl Timeline {
|
||||
)))
|
||||
});
|
||||
|
||||
let mut wrote_keys = false;
|
||||
if !compact_metadata {
|
||||
let ImageLayerCreationOutcome {
|
||||
image,
|
||||
next_start_key,
|
||||
} = self
|
||||
.create_image_layer_for_rel_blocks(
|
||||
partition,
|
||||
image_layer_writer,
|
||||
lsn,
|
||||
ctx,
|
||||
img_range,
|
||||
start,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let mut key_request_accum = KeySpaceAccum::new();
|
||||
for range in &partition.ranges {
|
||||
let mut key = range.start;
|
||||
while key < range.end {
|
||||
// Decide whether to retain this key: usually we do, but sharded tenants may
|
||||
// need to drop keys that don't belong to them. If we retain the key, add it
|
||||
// to `key_request_accum` for later issuing a vectored get
|
||||
if self.shard_identity.is_key_disposable(&key) {
|
||||
debug!(
|
||||
"Dropping key {} during compaction (it belongs on shard {:?})",
|
||||
key,
|
||||
self.shard_identity.get_shard_number(&key)
|
||||
);
|
||||
} else {
|
||||
key_request_accum.add_key(key);
|
||||
}
|
||||
|
||||
let last_key_in_range = key.next() == range.end;
|
||||
key = key.next();
|
||||
|
||||
// Maybe flush `key_rest_accum`
|
||||
if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS
|
||||
|| (last_key_in_range && key_request_accum.raw_size() > 0)
|
||||
{
|
||||
let results = self
|
||||
.get_vectored(key_request_accum.consume_keyspace(), lsn, ctx)
|
||||
.await?;
|
||||
|
||||
for (img_key, img) in results {
|
||||
let img = match img {
|
||||
Ok(img) => img,
|
||||
Err(err) => {
|
||||
// If we fail to reconstruct a VM or FSM page, we can zero the
|
||||
// page without losing any actual user data. That seems better
|
||||
// than failing repeatedly and getting stuck.
|
||||
//
|
||||
// We had a bug at one point, where we truncated the FSM and VM
|
||||
// in the pageserver, but the Postgres didn't know about that
|
||||
// and continued to generate incremental WAL records for pages
|
||||
// that didn't exist in the pageserver. Trying to replay those
|
||||
// WAL records failed to find the previous image of the page.
|
||||
// This special case allows us to recover from that situation.
|
||||
// See https://github.com/neondatabase/neon/issues/2601.
|
||||
//
|
||||
// Unfortunately we cannot do this for the main fork, or for
|
||||
// any metadata keys, keys, as that would lead to actual data
|
||||
// loss.
|
||||
if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key)
|
||||
{
|
||||
warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
|
||||
ZERO_PAGE.clone()
|
||||
} else {
|
||||
return Err(CreateImageLayersError::PageReconstructError(
|
||||
err,
|
||||
));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Write all the keys we just read into our new image layer.
|
||||
image_layer_writer.put_image(img_key, img, ctx).await?;
|
||||
wrote_keys = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if wrote_keys {
|
||||
// Normal path: we have written some data into the new image layer for this
|
||||
// partition, so flush it to disk.
|
||||
start = img_range.end;
|
||||
let image_layer = image_layer_writer.finish(self, ctx).await?;
|
||||
image_layers.push(image_layer);
|
||||
start = next_start_key;
|
||||
image_layers.extend(image);
|
||||
} else {
|
||||
// Special case: the image layer may be empty if this is a sharded tenant and the
|
||||
// partition does not cover any keys owned by this shard. In this case, to ensure
|
||||
// we don't leave gaps between image layers, leave `start` where it is, so that the next
|
||||
// layer we write will cover the key range that we just scanned.
|
||||
tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
|
||||
let ImageLayerCreationOutcome {
|
||||
image,
|
||||
next_start_key,
|
||||
} = self
|
||||
.create_image_layer_for_metadata_keys(
|
||||
partition,
|
||||
image_layer_writer,
|
||||
lsn,
|
||||
ctx,
|
||||
img_range,
|
||||
mode,
|
||||
)
|
||||
.await?;
|
||||
start = next_start_key;
|
||||
image_layers.extend(image);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4401,6 +4591,14 @@ impl Timeline {
|
||||
) -> Result<Vec<TimelineId>, anyhow::Error> {
|
||||
detach_ancestor::complete(self, tenant, prepared, ctx).await
|
||||
}
|
||||
|
||||
/// Switch aux file policy and schedule upload to the index part.
|
||||
pub(crate) fn do_switch_aux_policy(&self, policy: AuxFilePolicy) -> anyhow::Result<()> {
|
||||
self.last_aux_file_policy.store(Some(policy));
|
||||
self.remote_client
|
||||
.schedule_index_upload_for_aux_file_policy_update(Some(policy))?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Top-level failure to compact.
|
||||
@@ -4510,11 +4708,16 @@ impl Timeline {
|
||||
|
||||
async fn rewrite_layers(
|
||||
self: &Arc<Self>,
|
||||
replace_layers: Vec<(Layer, ResidentLayer)>,
|
||||
drop_layers: Vec<Layer>,
|
||||
mut replace_layers: Vec<(Layer, ResidentLayer)>,
|
||||
mut drop_layers: Vec<Layer>,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut guard = self.layers.write().await;
|
||||
|
||||
// Trim our lists in case our caller (compaction) raced with someone else (GC) removing layers: we want
|
||||
// to avoid double-removing, and avoid rewriting something that was removed.
|
||||
replace_layers.retain(|(l, _)| guard.contains(l));
|
||||
drop_layers.retain(|l| guard.contains(l));
|
||||
|
||||
guard.rewrite_layers(&replace_layers, &drop_layers, &self.metrics);
|
||||
|
||||
let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect();
|
||||
@@ -4657,7 +4860,32 @@ impl Timeline {
|
||||
(horizon_cutoff, pitr_cutoff, retain_lsns)
|
||||
};
|
||||
|
||||
let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);
|
||||
let mut new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);
|
||||
let standby_horizon = self.standby_horizon.load();
|
||||
// Hold GC for the standby, but as a safety guard do it only within some
|
||||
// reasonable lag.
|
||||
if standby_horizon != Lsn::INVALID {
|
||||
if let Some(standby_lag) = new_gc_cutoff.checked_sub(standby_horizon) {
|
||||
const MAX_ALLOWED_STANDBY_LAG: u64 = 10u64 << 30; // 10 GB
|
||||
if standby_lag.0 < MAX_ALLOWED_STANDBY_LAG {
|
||||
new_gc_cutoff = Lsn::min(standby_horizon, new_gc_cutoff);
|
||||
trace!("holding off GC for standby apply LSN {}", standby_horizon);
|
||||
} else {
|
||||
warn!(
|
||||
"standby is lagging for more than {}MB, not holding gc for it",
|
||||
MAX_ALLOWED_STANDBY_LAG / 1024 / 1024
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Reset standby horizon to ignore it if it is not updated till next GC.
|
||||
// It is an easy way to unset it when standby disappears without adding
|
||||
// more conf options.
|
||||
self.standby_horizon.store(Lsn::INVALID);
|
||||
self.metrics
|
||||
.standby_horizon_gauge
|
||||
.set(Lsn::INVALID.0 as i64);
|
||||
|
||||
let res = self
|
||||
.gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff)
|
||||
@@ -5364,26 +5592,6 @@ fn is_send() {
|
||||
_assert_send::<TimelineWriter<'_>>();
|
||||
}
|
||||
|
||||
/// Add a suffix to a layer file's name: .{num}.old
|
||||
/// Uses the first available num (starts at 0)
|
||||
fn rename_to_backup(path: &Utf8Path) -> anyhow::Result<()> {
|
||||
let filename = path
|
||||
.file_name()
|
||||
.ok_or_else(|| anyhow!("Path {path} don't have a file name"))?;
|
||||
let mut new_path = path.to_owned();
|
||||
|
||||
for i in 0u32.. {
|
||||
new_path.set_file_name(format!("{filename}.{i}.old"));
|
||||
if !new_path.exists() {
|
||||
std::fs::rename(path, &new_path)
|
||||
.with_context(|| format!("rename {path:?} to {new_path:?}"))?;
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
bail!("couldn't find an unused backup number for {:?}", path)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use utils::{id::TimelineId, lsn::Lsn};
|
||||
|
||||
@@ -9,7 +9,10 @@ use std::ops::{Deref, Range};
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::layer_manager::LayerManager;
|
||||
use super::{CompactFlags, DurationRecorder, ImageLayerCreationMode, RecordedDuration, Timeline};
|
||||
use super::{
|
||||
CompactFlags, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
|
||||
RecordedDuration, Timeline,
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use enumset::EnumSet;
|
||||
@@ -22,14 +25,13 @@ use tracing::{debug, info, info_span, trace, warn, Instrument};
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
|
||||
use crate::page_cache;
|
||||
use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
|
||||
use crate::tenant::timeline::{drop_rlock, is_rel_fsm_block_key, is_rel_vm_block_key, Hole};
|
||||
use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome};
|
||||
use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter};
|
||||
use crate::tenant::timeline::{Layer, ResidentLayer};
|
||||
use crate::tenant::DeltaLayer;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
|
||||
use crate::{page_cache, ZERO_PAGE};
|
||||
|
||||
use crate::keyspace::KeySpace;
|
||||
use crate::repository::Key;
|
||||
@@ -116,9 +118,13 @@ impl Timeline {
|
||||
|
||||
// 3. Create new image layers for partitions that have been modified
|
||||
// "enough".
|
||||
let dense_layers = self
|
||||
let mut partitioning = dense_partitioning;
|
||||
partitioning
|
||||
.parts
|
||||
.extend(sparse_partitioning.into_dense().parts);
|
||||
let image_layers = self
|
||||
.create_image_layers(
|
||||
&dense_partitioning,
|
||||
&partitioning,
|
||||
lsn,
|
||||
if flags.contains(CompactFlags::ForceImageLayerCreation) {
|
||||
ImageLayerCreationMode::Force
|
||||
@@ -130,24 +136,8 @@ impl Timeline {
|
||||
.await
|
||||
.map_err(anyhow::Error::from)?;
|
||||
|
||||
// For now, nothing will be produced...
|
||||
let sparse_layers = self
|
||||
.create_image_layers(
|
||||
&sparse_partitioning.clone().into_dense(),
|
||||
lsn,
|
||||
if flags.contains(CompactFlags::ForceImageLayerCreation) {
|
||||
ImageLayerCreationMode::Force
|
||||
} else {
|
||||
ImageLayerCreationMode::Try
|
||||
},
|
||||
&image_ctx,
|
||||
)
|
||||
.await
|
||||
.map_err(anyhow::Error::from)?;
|
||||
assert!(sparse_layers.is_empty());
|
||||
|
||||
self.upload_new_image_layers(dense_layers)?;
|
||||
dense_partitioning.parts.len()
|
||||
self.upload_new_image_layers(image_layers)?;
|
||||
partitioning.parts.len()
|
||||
}
|
||||
Err(err) => {
|
||||
// no partitioning? This is normal, if the timeline was just created
|
||||
@@ -186,13 +176,24 @@ impl Timeline {
|
||||
async fn compact_shard_ancestors(
|
||||
self: &Arc<Self>,
|
||||
rewrite_max: usize,
|
||||
_ctx: &RequestContext,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut drop_layers = Vec::new();
|
||||
let layers_to_rewrite: Vec<Layer> = Vec::new();
|
||||
let mut layers_to_rewrite: Vec<Layer> = Vec::new();
|
||||
|
||||
// We will use the PITR cutoff as a condition for rewriting layers.
|
||||
let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.pitr;
|
||||
// We will use the Lsn cutoff of the last GC as a threshold for rewriting layers: if a
|
||||
// layer is behind this Lsn, it indicates that the layer is being retained beyond the
|
||||
// pitr_interval, for example because a branchpoint references it.
|
||||
//
|
||||
// Holding this read guard also blocks [`Self::gc_timeline`] from entering while we
|
||||
// are rewriting layers.
|
||||
let latest_gc_cutoff = self.get_latest_gc_cutoff_lsn();
|
||||
|
||||
tracing::info!(
|
||||
"latest_gc_cutoff: {}, pitr cutoff {}",
|
||||
*latest_gc_cutoff,
|
||||
self.gc_info.read().unwrap().cutoffs.pitr
|
||||
);
|
||||
|
||||
let layers = self.layers.read().await;
|
||||
for layer_desc in layers.layer_map().iter_historic_layers() {
|
||||
@@ -251,9 +252,9 @@ impl Timeline {
|
||||
|
||||
// Don't bother re-writing a layer if it is within the PITR window: it will age-out eventually
|
||||
// without incurring the I/O cost of a rewrite.
|
||||
if layer_desc.get_lsn_range().end >= pitr_cutoff {
|
||||
debug!(%layer, "Skipping rewrite of layer still in PITR window ({} >= {})",
|
||||
layer_desc.get_lsn_range().end, pitr_cutoff);
|
||||
if layer_desc.get_lsn_range().end >= *latest_gc_cutoff {
|
||||
debug!(%layer, "Skipping rewrite of layer still in GC window ({} >= {})",
|
||||
layer_desc.get_lsn_range().end, *latest_gc_cutoff);
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -263,13 +264,10 @@ impl Timeline {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Only rewrite layers if they would have different remote paths: either they belong to this
|
||||
// shard but an old generation, or they belonged to another shard. This also implicitly
|
||||
// guarantees that the layer is persistent in remote storage (as only remote persistent
|
||||
// layers are carried across shard splits, any local-only layer would be in the current generation)
|
||||
if layer.metadata().generation == self.generation
|
||||
&& layer.metadata().shard.shard_count == self.shard_identity.count
|
||||
{
|
||||
// Only rewrite layers if their generations differ. This guarantees:
|
||||
// - that local rewrite is safe, as local layer paths will differ between existing layer and rewritten one
|
||||
// - that the layer is persistent in remote storage, as we only see old-generation'd layer via loading from remote storage
|
||||
if layer.metadata().generation == self.generation {
|
||||
debug!(%layer, "Skipping rewrite, is not from old generation");
|
||||
continue;
|
||||
}
|
||||
@@ -282,18 +280,69 @@ impl Timeline {
|
||||
}
|
||||
|
||||
// Fall through: all our conditions for doing a rewrite passed.
|
||||
// TODO: implement rewriting
|
||||
tracing::debug!(%layer, "Would rewrite layer");
|
||||
layers_to_rewrite.push(layer);
|
||||
}
|
||||
|
||||
// Drop the layers read lock: we will acquire it for write in [`Self::rewrite_layers`]
|
||||
// Drop read lock on layer map before we start doing time-consuming I/O
|
||||
drop(layers);
|
||||
|
||||
// TODO: collect layers to rewrite
|
||||
let replace_layers = Vec::new();
|
||||
let mut replace_image_layers = Vec::new();
|
||||
|
||||
for layer in layers_to_rewrite {
|
||||
tracing::info!(layer=%layer, "Rewriting layer after shard split...");
|
||||
let mut image_layer_writer = ImageLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
&layer.layer_desc().key_range,
|
||||
layer.layer_desc().image_layer_lsn(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Safety of layer rewrites:
|
||||
// - We are writing to a different local file path than we are reading from, so the old Layer
|
||||
// cannot interfere with the new one.
|
||||
// - In the page cache, contents for a particular VirtualFile are stored with a file_id that
|
||||
// is different for two layers with the same name (in `ImageLayerInner::new` we always
|
||||
// acquire a fresh id from [`crate::page_cache::next_file_id`]. So readers do not risk
|
||||
// reading the index from one layer file, and then data blocks from the rewritten layer file.
|
||||
// - Any readers that have a reference to the old layer will keep it alive until they are done
|
||||
// with it. If they are trying to promote from remote storage, that will fail, but this is the same
|
||||
// as for compaction generally: compaction is allowed to delete layers that readers might be trying to use.
|
||||
// - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are:
|
||||
// - GC, which at worst witnesses us "undelete" a layer that they just deleted.
|
||||
// - ingestion, which only inserts layers, therefore cannot collide with us.
|
||||
let resident = layer.download_and_keep_resident().await?;
|
||||
|
||||
let keys_written = resident
|
||||
.filter(&self.shard_identity, &mut image_layer_writer, ctx)
|
||||
.await?;
|
||||
|
||||
if keys_written > 0 {
|
||||
let new_layer = image_layer_writer.finish(self, ctx).await?;
|
||||
tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
|
||||
layer.metadata().file_size,
|
||||
new_layer.metadata().file_size);
|
||||
|
||||
replace_image_layers.push((layer, new_layer));
|
||||
} else {
|
||||
// Drop the old layer. Usually for this case we would already have noticed that
|
||||
// the layer has no data for us with the ShardedRange check above, but
|
||||
drop_layers.push(layer);
|
||||
}
|
||||
}
|
||||
|
||||
// At this point, we have replaced local layer files with their rewritten form, but not yet uploaded
|
||||
// metadata to reflect that. If we restart here, the replaced layer files will look invalid (size mismatch
|
||||
// to remote index) and be removed. This is inefficient but safe.
|
||||
fail::fail_point!("compact-shard-ancestors-localonly");
|
||||
|
||||
// Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage
|
||||
self.rewrite_layers(replace_layers, drop_layers).await?;
|
||||
self.rewrite_layers(replace_image_layers, drop_layers)
|
||||
.await?;
|
||||
|
||||
fail::fail_point!("compact-shard-ancestors-enqueued");
|
||||
|
||||
// We wait for all uploads to complete before finishing this compaction stage. This is not
|
||||
// necessary for correctness, but it simplifies testing, and avoids proceeding with another
|
||||
@@ -301,6 +350,8 @@ impl Timeline {
|
||||
// load.
|
||||
self.remote_client.wait_completion().await?;
|
||||
|
||||
fail::fail_point!("compact-shard-ancestors-persistent");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -499,8 +550,11 @@ impl Timeline {
|
||||
|
||||
for &DeltaEntry { key: next_key, .. } in all_keys.iter() {
|
||||
if let Some(prev_key) = prev {
|
||||
// just first fast filter
|
||||
if next_key.to_i128() - prev_key.to_i128() >= min_hole_range {
|
||||
// just first fast filter, do not create hole entries for metadata keys. The last hole in the
|
||||
// compaction is the gap between data key and metadata keys.
|
||||
if next_key.to_i128() - prev_key.to_i128() >= min_hole_range
|
||||
&& !Key::is_metadata_key(&prev_key)
|
||||
{
|
||||
let key_range = prev_key..next_key;
|
||||
// Measuring hole by just subtraction of i128 representation of key range boundaries
|
||||
// has not so much sense, because largest holes will corresponds field1/field2 changes.
|
||||
@@ -1159,10 +1213,10 @@ impl TimelineAdaptor {
|
||||
lsn: Lsn,
|
||||
key_range: &Range<Key>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), PageReconstructError> {
|
||||
) -> Result<(), CreateImageLayersError> {
|
||||
let timer = self.timeline.metrics.create_images_time_histo.start_timer();
|
||||
|
||||
let mut image_layer_writer = ImageLayerWriter::new(
|
||||
let image_layer_writer = ImageLayerWriter::new(
|
||||
self.timeline.conf,
|
||||
self.timeline.timeline_id,
|
||||
self.timeline.tenant_shard_id,
|
||||
@@ -1173,47 +1227,34 @@ impl TimelineAdaptor {
|
||||
.await?;
|
||||
|
||||
fail_point!("image-layer-writer-fail-before-finish", |_| {
|
||||
Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||
Err(CreateImageLayersError::Other(anyhow::anyhow!(
|
||||
"failpoint image-layer-writer-fail-before-finish"
|
||||
)))
|
||||
});
|
||||
let keyspace_ranges = self.get_keyspace(key_range, lsn, ctx).await?;
|
||||
for range in &keyspace_ranges {
|
||||
let mut key = range.start;
|
||||
while key < range.end {
|
||||
let img = match self.timeline.get(key, lsn, ctx).await {
|
||||
Ok(img) => img,
|
||||
Err(err) => {
|
||||
// If we fail to reconstruct a VM or FSM page, we can zero the
|
||||
// page without losing any actual user data. That seems better
|
||||
// than failing repeatedly and getting stuck.
|
||||
//
|
||||
// We had a bug at one point, where we truncated the FSM and VM
|
||||
// in the pageserver, but the Postgres didn't know about that
|
||||
// and continued to generate incremental WAL records for pages
|
||||
// that didn't exist in the pageserver. Trying to replay those
|
||||
// WAL records failed to find the previous image of the page.
|
||||
// This special case allows us to recover from that situation.
|
||||
// See https://github.com/neondatabase/neon/issues/2601.
|
||||
//
|
||||
// Unfortunately we cannot do this for the main fork, or for
|
||||
// any metadata keys, keys, as that would lead to actual data
|
||||
// loss.
|
||||
if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) {
|
||||
warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}");
|
||||
ZERO_PAGE.clone()
|
||||
} else {
|
||||
return Err(err);
|
||||
}
|
||||
}
|
||||
};
|
||||
image_layer_writer.put_image(key, img, ctx).await?;
|
||||
key = key.next();
|
||||
}
|
||||
}
|
||||
let image_layer = image_layer_writer.finish(&self.timeline, ctx).await?;
|
||||
|
||||
self.new_images.push(image_layer);
|
||||
let keyspace = KeySpace {
|
||||
ranges: self.get_keyspace(key_range, lsn, ctx).await?,
|
||||
};
|
||||
// TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly
|
||||
let start = Key::MIN;
|
||||
let ImageLayerCreationOutcome {
|
||||
image,
|
||||
next_start_key: _,
|
||||
} = self
|
||||
.timeline
|
||||
.create_image_layer_for_rel_blocks(
|
||||
&keyspace,
|
||||
image_layer_writer,
|
||||
lsn,
|
||||
ctx,
|
||||
key_range.clone(),
|
||||
start,
|
||||
)
|
||||
.await?;
|
||||
|
||||
if let Some(image_layer) = image {
|
||||
self.new_images.push(image_layer);
|
||||
}
|
||||
|
||||
timer.stop_and_record();
|
||||
|
||||
|
||||
@@ -280,6 +280,8 @@ impl DeleteTimelineFlow {
|
||||
// Important. We dont pass ancestor above because it can be missing.
|
||||
// Thus we need to skip the validation here.
|
||||
CreateTimelineCause::Delete,
|
||||
// Aux file policy is not needed for deletion, assuming deletion does not read aux keyspace
|
||||
None,
|
||||
)
|
||||
.context("create_timeline_struct")?;
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ use crate::{
|
||||
};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::Instrument;
|
||||
use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn};
|
||||
use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn};
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum Error {
|
||||
@@ -41,6 +41,27 @@ pub(crate) enum Error {
|
||||
Unexpected(#[source] anyhow::Error),
|
||||
}
|
||||
|
||||
impl From<Error> for ApiError {
|
||||
fn from(value: Error) -> Self {
|
||||
match value {
|
||||
e @ Error::NoAncestor => ApiError::Conflict(e.to_string()),
|
||||
// TODO: ApiError converts the anyhow using debug formatting ... just stop using ApiError?
|
||||
e @ Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", e)),
|
||||
Error::ShuttingDown => ApiError::ShuttingDown,
|
||||
Error::OtherTimelineDetachOngoing(_) => {
|
||||
ApiError::ResourceUnavailable("other timeline detach is already ongoing".into())
|
||||
}
|
||||
// All of these contain shutdown errors, in fact, it's the most common
|
||||
e @ Error::FlushAncestor(_)
|
||||
| e @ Error::RewrittenDeltaDownloadFailed(_)
|
||||
| e @ Error::CopyDeltaPrefix(_)
|
||||
| e @ Error::UploadRewritten(_)
|
||||
| e @ Error::CopyFailed(_)
|
||||
| e @ Error::Unexpected(_) => ApiError::InternalServerError(e.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct PreparedTimelineDetach {
|
||||
layers: Vec<Layer>,
|
||||
}
|
||||
@@ -56,7 +77,7 @@ impl Default for Options {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
rewrite_concurrency: std::num::NonZeroUsize::new(2).unwrap(),
|
||||
copy_concurrency: std::num::NonZeroUsize::new(10).unwrap(),
|
||||
copy_concurrency: std::num::NonZeroUsize::new(100).unwrap(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -75,6 +96,11 @@ pub(super) async fn prepare(
|
||||
.as_ref()
|
||||
.map(|tl| (tl.clone(), detached.ancestor_lsn))
|
||||
else {
|
||||
// TODO: check if we have already been detached; for this we need to read the stored data
|
||||
// on remote client, for that we need a follow-up which makes uploads cheaper and maintains
|
||||
// a projection of the commited data.
|
||||
//
|
||||
// the error is wrong per openapi
|
||||
return Err(NoAncestor);
|
||||
};
|
||||
|
||||
@@ -84,7 +110,7 @@ pub(super) async fn prepare(
|
||||
|
||||
if ancestor.ancestor_timeline.is_some() {
|
||||
// non-technical requirement; we could flatten N ancestors just as easily but we chose
|
||||
// not to
|
||||
// not to, at least initially
|
||||
return Err(TooManyAncestors);
|
||||
}
|
||||
|
||||
|
||||
@@ -7,19 +7,20 @@ use crate::{
|
||||
index::{IndexPart, LayerFileMetadata},
|
||||
},
|
||||
storage_layer::LayerName,
|
||||
Generation,
|
||||
},
|
||||
};
|
||||
use anyhow::Context;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
use std::{collections::HashMap, str::FromStr};
|
||||
use std::{
|
||||
collections::{hash_map, HashMap},
|
||||
str::FromStr,
|
||||
};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// Identified files in the timeline directory.
|
||||
pub(super) enum Discovered {
|
||||
/// The only one we care about
|
||||
Layer(LayerName, Utf8PathBuf, u64),
|
||||
Layer(LayerName, LocalLayerFileMetadata),
|
||||
/// Old ephmeral files from previous launches, should be removed
|
||||
Ephemeral(String),
|
||||
/// Old temporary timeline files, unsure what these really are, should be removed
|
||||
@@ -27,7 +28,7 @@ pub(super) enum Discovered {
|
||||
/// Temporary on-demand download files, should be removed
|
||||
TemporaryDownload(String),
|
||||
/// Backup file from previously future layers
|
||||
IgnoredBackup,
|
||||
IgnoredBackup(Utf8PathBuf),
|
||||
/// Unrecognized, warn about these
|
||||
Unknown(String),
|
||||
}
|
||||
@@ -43,12 +44,15 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
|
||||
let discovered = match LayerName::from_str(&file_name) {
|
||||
Ok(file_name) => {
|
||||
let file_size = direntry.metadata()?.len();
|
||||
Discovered::Layer(file_name, direntry.path().to_owned(), file_size)
|
||||
Discovered::Layer(
|
||||
file_name,
|
||||
LocalLayerFileMetadata::new(direntry.path().to_owned(), file_size),
|
||||
)
|
||||
}
|
||||
Err(_) => {
|
||||
if file_name.ends_with(".old") {
|
||||
// ignore these
|
||||
Discovered::IgnoredBackup
|
||||
Discovered::IgnoredBackup(direntry.path().to_owned())
|
||||
} else if remote_timeline_client::is_temp_download_file(direntry.path()) {
|
||||
Discovered::TemporaryDownload(file_name)
|
||||
} else if is_ephemeral_file(&file_name) {
|
||||
@@ -71,37 +75,32 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
|
||||
/// this structure extends it with metadata describing the layer's presence in local storage.
|
||||
#[derive(Clone, Debug)]
|
||||
pub(super) struct LocalLayerFileMetadata {
|
||||
pub(super) metadata: LayerFileMetadata,
|
||||
pub(super) file_size: u64,
|
||||
pub(super) local_path: Utf8PathBuf,
|
||||
}
|
||||
|
||||
impl LocalLayerFileMetadata {
|
||||
pub fn new(
|
||||
local_path: Utf8PathBuf,
|
||||
file_size: u64,
|
||||
generation: Generation,
|
||||
shard: ShardIndex,
|
||||
) -> Self {
|
||||
pub fn new(local_path: Utf8PathBuf, file_size: u64) -> Self {
|
||||
Self {
|
||||
local_path,
|
||||
metadata: LayerFileMetadata::new(file_size, generation, shard),
|
||||
file_size,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Decision on what to do with a layer file after considering its local and remote metadata.
|
||||
/// For a layer that is present in remote metadata, this type describes how to handle
|
||||
/// it during startup: it is either Resident (and we have some metadata about a local file),
|
||||
/// or it is Evicted (and we only have remote metadata).
|
||||
#[derive(Clone, Debug)]
|
||||
pub(super) enum Decision {
|
||||
/// The layer is not present locally.
|
||||
Evicted(LayerFileMetadata),
|
||||
/// The layer is present locally, but local metadata does not match remote; we must
|
||||
/// delete it and treat it as evicted.
|
||||
UseRemote {
|
||||
/// The layer is present locally, and metadata matches: we may hook up this layer to the
|
||||
/// existing file in local storage.
|
||||
Resident {
|
||||
local: LocalLayerFileMetadata,
|
||||
remote: LayerFileMetadata,
|
||||
},
|
||||
/// The layer is present locally, and metadata matches.
|
||||
UseLocal(LocalLayerFileMetadata),
|
||||
}
|
||||
|
||||
/// A layer needs to be left out of the layer map.
|
||||
@@ -117,77 +116,81 @@ pub(super) enum DismissedLayer {
|
||||
/// In order to make crash safe updates to layer map, we must dismiss layers which are only
|
||||
/// found locally or not yet included in the remote `index_part.json`.
|
||||
LocalOnly(LocalLayerFileMetadata),
|
||||
|
||||
/// The layer exists in remote storage but the local layer's metadata (e.g. file size)
|
||||
/// does not match it
|
||||
BadMetadata(LocalLayerFileMetadata),
|
||||
}
|
||||
|
||||
/// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
|
||||
pub(super) fn reconcile(
|
||||
discovered: Vec<(LayerName, Utf8PathBuf, u64)>,
|
||||
local_layers: Vec<(LayerName, LocalLayerFileMetadata)>,
|
||||
index_part: Option<&IndexPart>,
|
||||
disk_consistent_lsn: Lsn,
|
||||
generation: Generation,
|
||||
shard: ShardIndex,
|
||||
) -> Vec<(LayerName, Result<Decision, DismissedLayer>)> {
|
||||
use Decision::*;
|
||||
let Some(index_part) = index_part else {
|
||||
// If we have no remote metadata, no local layer files are considered valid to load
|
||||
return local_layers
|
||||
.into_iter()
|
||||
.map(|(layer_name, local_metadata)| {
|
||||
(layer_name, Err(DismissedLayer::LocalOnly(local_metadata)))
|
||||
})
|
||||
.collect();
|
||||
};
|
||||
|
||||
// name => (local_metadata, remote_metadata)
|
||||
type Collected =
|
||||
HashMap<LayerName, (Option<LocalLayerFileMetadata>, Option<LayerFileMetadata>)>;
|
||||
let mut result = Vec::new();
|
||||
|
||||
let mut discovered = discovered
|
||||
.into_iter()
|
||||
.map(|(layer_name, local_path, file_size)| {
|
||||
(
|
||||
layer_name,
|
||||
// The generation and shard here will be corrected to match IndexPart in the merge below, unless
|
||||
// it is not in IndexPart, in which case using our current generation makes sense
|
||||
// because it will be uploaded in this generation.
|
||||
(
|
||||
Some(LocalLayerFileMetadata::new(
|
||||
local_path, file_size, generation, shard,
|
||||
)),
|
||||
None,
|
||||
),
|
||||
)
|
||||
})
|
||||
.collect::<Collected>();
|
||||
let mut remote_layers = HashMap::new();
|
||||
|
||||
// merge any index_part information, when available
|
||||
// Construct Decisions for layers that are found locally, if they're in remote metadata. Otherwise
|
||||
// construct DismissedLayers to get rid of them.
|
||||
for (layer_name, local_metadata) in local_layers {
|
||||
let Some(remote_metadata) = index_part.layer_metadata.get(&layer_name) else {
|
||||
result.push((layer_name, Err(DismissedLayer::LocalOnly(local_metadata))));
|
||||
continue;
|
||||
};
|
||||
|
||||
if remote_metadata.file_size != local_metadata.file_size {
|
||||
result.push((layer_name, Err(DismissedLayer::BadMetadata(local_metadata))));
|
||||
continue;
|
||||
}
|
||||
|
||||
remote_layers.insert(
|
||||
layer_name,
|
||||
Decision::Resident {
|
||||
local: local_metadata,
|
||||
remote: remote_metadata.clone(),
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
// Construct Decision for layers that were not found locally
|
||||
index_part
|
||||
.as_ref()
|
||||
.map(|ip| ip.layer_metadata.iter())
|
||||
.into_iter()
|
||||
.flatten()
|
||||
.map(|(name, metadata)| (name, LayerFileMetadata::from(metadata)))
|
||||
.layer_metadata
|
||||
.iter()
|
||||
.for_each(|(name, metadata)| {
|
||||
if let Some(existing) = discovered.get_mut(name) {
|
||||
existing.1 = Some(metadata);
|
||||
} else {
|
||||
discovered.insert(name.to_owned(), (None, Some(metadata)));
|
||||
if let hash_map::Entry::Vacant(entry) = remote_layers.entry(name.clone()) {
|
||||
entry.insert(Decision::Evicted(metadata.clone()));
|
||||
}
|
||||
});
|
||||
|
||||
discovered
|
||||
.into_iter()
|
||||
.map(|(name, (local, remote))| {
|
||||
let decision = if name.is_in_future(disk_consistent_lsn) {
|
||||
Err(DismissedLayer::Future { local })
|
||||
} else {
|
||||
match (local, remote) {
|
||||
(Some(local), Some(remote)) if local.metadata != remote => {
|
||||
Ok(UseRemote { local, remote })
|
||||
}
|
||||
(Some(x), Some(_)) => Ok(UseLocal(x)),
|
||||
(None, Some(x)) => Ok(Evicted(x)),
|
||||
(Some(x), None) => Err(DismissedLayer::LocalOnly(x)),
|
||||
(None, None) => {
|
||||
unreachable!("there must not be any non-local non-remote files")
|
||||
}
|
||||
}
|
||||
};
|
||||
// For layers that were found in authoritative remote metadata, apply a final check that they are within
|
||||
// the disk_consistent_lsn.
|
||||
result.extend(remote_layers.into_iter().map(|(name, decision)| {
|
||||
if name.is_in_future(disk_consistent_lsn) {
|
||||
match decision {
|
||||
Decision::Evicted(_remote) => (name, Err(DismissedLayer::Future { local: None })),
|
||||
Decision::Resident {
|
||||
local,
|
||||
remote: _remote,
|
||||
} => (name, Err(DismissedLayer::Future { local: Some(local) })),
|
||||
}
|
||||
} else {
|
||||
(name, Ok(decision))
|
||||
}
|
||||
}));
|
||||
|
||||
(name, decision)
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
result
|
||||
}
|
||||
|
||||
pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> {
|
||||
@@ -196,25 +199,15 @@ pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> {
|
||||
std::fs::remove_file(path).with_context(|| format!("failed to remove {kind} at {path}"))
|
||||
}
|
||||
|
||||
pub(super) fn cleanup_local_file_for_remote(
|
||||
local: &LocalLayerFileMetadata,
|
||||
remote: &LayerFileMetadata,
|
||||
) -> anyhow::Result<()> {
|
||||
let local_size = local.metadata.file_size();
|
||||
let remote_size = remote.file_size();
|
||||
pub(super) fn cleanup_local_file_for_remote(local: &LocalLayerFileMetadata) -> anyhow::Result<()> {
|
||||
let local_size = local.file_size;
|
||||
let path = &local.local_path;
|
||||
|
||||
let file_name = path.file_name().expect("must be file path");
|
||||
tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
|
||||
if let Err(err) = crate::tenant::timeline::rename_to_backup(path) {
|
||||
assert!(
|
||||
path.exists(),
|
||||
"we would leave the local_layer without a file if this does not hold: {path}",
|
||||
);
|
||||
Err(err)
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
tracing::warn!(
|
||||
"removing local file {file_name:?} because it has unexpected length {local_size};"
|
||||
);
|
||||
|
||||
std::fs::remove_file(path).with_context(|| format!("failed to remove layer at {path}"))
|
||||
}
|
||||
|
||||
pub(super) fn cleanup_future_layer(
|
||||
@@ -236,8 +229,8 @@ pub(super) fn cleanup_local_only_file(
|
||||
) -> anyhow::Result<()> {
|
||||
let kind = name.kind();
|
||||
tracing::info!(
|
||||
"found local-only {kind} layer {name}, metadata {:?}",
|
||||
local.metadata
|
||||
"found local-only {kind} layer {name} size {}",
|
||||
local.file_size
|
||||
);
|
||||
std::fs::remove_file(&local.local_path)?;
|
||||
Ok(())
|
||||
|
||||
@@ -212,13 +212,34 @@ impl LayerManager {
|
||||
&mut self,
|
||||
rewrite_layers: &[(Layer, ResidentLayer)],
|
||||
drop_layers: &[Layer],
|
||||
_metrics: &TimelineMetrics,
|
||||
metrics: &TimelineMetrics,
|
||||
) {
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
for (old_layer, new_layer) in rewrite_layers {
|
||||
debug_assert_eq!(
|
||||
old_layer.layer_desc().key_range,
|
||||
new_layer.layer_desc().key_range
|
||||
);
|
||||
debug_assert_eq!(
|
||||
old_layer.layer_desc().lsn_range,
|
||||
new_layer.layer_desc().lsn_range
|
||||
);
|
||||
|
||||
// TODO: implement rewrites (currently this code path only used for drops)
|
||||
assert!(rewrite_layers.is_empty());
|
||||
// Safety: we may never rewrite the same file in-place. Callers are responsible
|
||||
// for ensuring that they only rewrite layers after something changes the path,
|
||||
// such as an increment in the generation number.
|
||||
assert_ne!(old_layer.local_path(), new_layer.local_path());
|
||||
|
||||
Self::delete_historic_layer(old_layer, &mut updates, &mut self.layer_fmgr);
|
||||
|
||||
Self::insert_historic_layer(
|
||||
new_layer.as_ref().clone(),
|
||||
&mut updates,
|
||||
&mut self.layer_fmgr,
|
||||
);
|
||||
|
||||
metrics.record_new_file_metrics(new_layer.layer_desc().file_size);
|
||||
}
|
||||
for l in drop_layers {
|
||||
Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr);
|
||||
}
|
||||
|
||||
@@ -705,6 +705,7 @@ impl ConnectionManagerState {
|
||||
commit_lsn: info.commit_lsn,
|
||||
safekeeper_connstr: info.safekeeper_connstr,
|
||||
availability_zone: info.availability_zone,
|
||||
standby_horizon: info.standby_horizon,
|
||||
}
|
||||
}
|
||||
MessageType::SafekeeperDiscoveryResponse => {
|
||||
@@ -725,6 +726,21 @@ impl ConnectionManagerState {
|
||||
|
||||
WALRECEIVER_BROKER_UPDATES.inc();
|
||||
|
||||
trace!(
|
||||
"safekeeper info update: standby_horizon(cutoff)={}",
|
||||
timeline_update.standby_horizon
|
||||
);
|
||||
if timeline_update.standby_horizon != 0 {
|
||||
// ignore reports from safekeepers not connected to replicas
|
||||
self.timeline
|
||||
.standby_horizon
|
||||
.store(Lsn(timeline_update.standby_horizon));
|
||||
self.timeline
|
||||
.metrics
|
||||
.standby_horizon_gauge
|
||||
.set(timeline_update.standby_horizon as i64);
|
||||
}
|
||||
|
||||
let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
|
||||
let old_entry = self.wal_stream_candidates.insert(
|
||||
new_safekeeper_id,
|
||||
@@ -1094,6 +1110,7 @@ mod tests {
|
||||
commit_lsn,
|
||||
safekeeper_connstr: safekeeper_connstr.to_owned(),
|
||||
availability_zone: None,
|
||||
standby_horizon: 0,
|
||||
},
|
||||
latest_update,
|
||||
}
|
||||
|
||||
@@ -8,6 +8,7 @@ use std::collections::{HashMap, VecDeque};
|
||||
use std::fmt::Debug;
|
||||
|
||||
use chrono::NaiveDateTime;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use std::sync::Arc;
|
||||
use tracing::info;
|
||||
use utils::lsn::AtomicLsn;
|
||||
@@ -60,6 +61,9 @@ pub(crate) struct UploadQueueInitialized {
|
||||
/// Part of the flattened "next" `index_part.json`.
|
||||
pub(crate) latest_lineage: Lineage,
|
||||
|
||||
/// The last aux file policy used on this timeline.
|
||||
pub(crate) last_aux_file_policy: Option<AuxFilePolicy>,
|
||||
|
||||
/// `disk_consistent_lsn` from the last metadata file that was successfully
|
||||
/// uploaded. `Lsn(0)` if nothing was uploaded yet.
|
||||
/// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
|
||||
@@ -189,6 +193,7 @@ impl UploadQueue {
|
||||
dangling_files: HashMap::new(),
|
||||
shutting_down: false,
|
||||
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
|
||||
last_aux_file_policy: Default::default(),
|
||||
};
|
||||
|
||||
*self = UploadQueue::Initialized(state);
|
||||
@@ -208,10 +213,7 @@ impl UploadQueue {
|
||||
|
||||
let mut files = HashMap::with_capacity(index_part.layer_metadata.len());
|
||||
for (layer_name, layer_metadata) in &index_part.layer_metadata {
|
||||
files.insert(
|
||||
layer_name.to_owned(),
|
||||
LayerFileMetadata::from(layer_metadata),
|
||||
);
|
||||
files.insert(layer_name.to_owned(), layer_metadata.clone());
|
||||
}
|
||||
|
||||
info!(
|
||||
@@ -239,6 +241,7 @@ impl UploadQueue {
|
||||
dangling_files: HashMap::new(),
|
||||
shutting_down: false,
|
||||
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
|
||||
last_aux_file_policy: index_part.last_aux_file_policy(),
|
||||
};
|
||||
|
||||
*self = UploadQueue::Initialized(state);
|
||||
@@ -316,9 +319,7 @@ impl std::fmt::Display for UploadOp {
|
||||
write!(
|
||||
f,
|
||||
"UploadLayer({}, size={:?}, gen={:?})",
|
||||
layer,
|
||||
metadata.file_size(),
|
||||
metadata.generation
|
||||
layer, metadata.file_size, metadata.generation
|
||||
)
|
||||
}
|
||||
UploadOp::UploadMetadata(_, lsn) => {
|
||||
|
||||
78
patches/pgvector.patch
Normal file
78
patches/pgvector.patch
Normal file
@@ -0,0 +1,78 @@
|
||||
From 0b0194a57bd0f3598bd57dbedd0df3932330169d Mon Sep 17 00:00:00 2001
|
||||
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
|
||||
Date: Fri, 2 Feb 2024 22:26:45 +0200
|
||||
Subject: [PATCH 1/1] Make v0.6.0 work with Neon
|
||||
|
||||
Now that the WAL-logging happens as a separate step at the end of the
|
||||
build, we need a few neon-specific hints to make it work.
|
||||
---
|
||||
src/hnswbuild.c | 36 ++++++++++++++++++++++++++++++++++++
|
||||
1 file changed, 36 insertions(+)
|
||||
|
||||
diff --git a/src/hnswbuild.c b/src/hnswbuild.c
|
||||
index 680789b..ec54dea 100644
|
||||
--- a/src/hnswbuild.c
|
||||
+++ b/src/hnswbuild.c
|
||||
@@ -840,9 +840,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
|
||||
|
||||
hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false);
|
||||
|
||||
+#ifdef NEON_SMGR
|
||||
+ smgr_start_unlogged_build(RelationGetSmgr(indexRel));
|
||||
+#endif
|
||||
+
|
||||
/* Perform inserts */
|
||||
HnswParallelScanAndInsert(heapRel, indexRel, hnswshared, hnswarea, false);
|
||||
|
||||
+#ifdef NEON_SMGR
|
||||
+ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(indexRel));
|
||||
+#endif
|
||||
+
|
||||
/* Close relations within worker */
|
||||
index_close(indexRel, indexLockmode);
|
||||
table_close(heapRel, heapLockmode);
|
||||
@@ -1089,13 +1097,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
|
||||
SeedRandom(42);
|
||||
#endif
|
||||
|
||||
+#ifdef NEON_SMGR
|
||||
+ smgr_start_unlogged_build(RelationGetSmgr(index));
|
||||
+#endif
|
||||
+
|
||||
InitBuildState(buildstate, heap, index, indexInfo, forkNum);
|
||||
|
||||
BuildGraph(buildstate, forkNum);
|
||||
|
||||
+#ifdef NEON_SMGR
|
||||
+ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
|
||||
+#endif
|
||||
+
|
||||
if (RelationNeedsWAL(index))
|
||||
+ {
|
||||
log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true);
|
||||
|
||||
+#ifdef NEON_SMGR
|
||||
+ {
|
||||
+#if PG_VERSION_NUM >= 160000
|
||||
+ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
|
||||
+#else
|
||||
+ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
|
||||
+#endif
|
||||
+
|
||||
+ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator,
|
||||
+ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
|
||||
+ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM);
|
||||
+ }
|
||||
+#endif
|
||||
+ }
|
||||
+
|
||||
+#ifdef NEON_SMGR
|
||||
+ smgr_end_unlogged_build(RelationGetSmgr(index));
|
||||
+#endif
|
||||
+
|
||||
FreeBuildState(buildstate);
|
||||
}
|
||||
|
||||
--
|
||||
2.39.2
|
||||
|
||||
@@ -49,9 +49,8 @@ char *neon_auth_token;
|
||||
int readahead_buffer_size = 128;
|
||||
int flush_every_n_requests = 8;
|
||||
|
||||
int neon_protocol_version = 1;
|
||||
int neon_protocol_version = 2;
|
||||
|
||||
static int n_reconnect_attempts = 0;
|
||||
static int max_reconnect_attempts = 60;
|
||||
static int stripe_size;
|
||||
|
||||
@@ -95,18 +94,44 @@ static shmem_startup_hook_type prev_shmem_startup_hook;
|
||||
static PagestoreShmemState *pagestore_shared;
|
||||
static uint64 pagestore_local_counter = 0;
|
||||
|
||||
typedef enum PSConnectionState {
|
||||
PS_Disconnected, /* no connection yet */
|
||||
PS_Connecting_Startup, /* connection starting up */
|
||||
PS_Connecting_PageStream, /* negotiating pagestream */
|
||||
PS_Connected, /* connected, pagestream established */
|
||||
} PSConnectionState;
|
||||
|
||||
/* This backend's per-shard connections */
|
||||
typedef struct
|
||||
{
|
||||
PGconn *conn;
|
||||
TimestampTz last_connect_time; /* read-only debug value */
|
||||
TimestampTz last_reconnect_time;
|
||||
uint32 delay_us;
|
||||
int n_reconnect_attempts;
|
||||
|
||||
/*---
|
||||
* WaitEventSet containing:
|
||||
* - WL_SOCKET_READABLE on 'conn'
|
||||
* - WL_LATCH_SET on MyLatch, and
|
||||
* - WL_EXIT_ON_PM_DEATH.
|
||||
* Pageserver connection state, i.e.
|
||||
* disconnected: conn == NULL, wes == NULL;
|
||||
* conn_startup: connection initiated, waiting for connection establishing
|
||||
* conn_ps: PageStream query sent, waiting for confirmation
|
||||
* connected: PageStream established
|
||||
*/
|
||||
WaitEventSet *wes;
|
||||
PSConnectionState state;
|
||||
PGconn *conn;
|
||||
/*---
|
||||
* WaitEventSet containing:
|
||||
* - WL_SOCKET_READABLE on 'conn'
|
||||
* - WL_LATCH_SET on MyLatch, and
|
||||
* - WL_EXIT_ON_PM_DEATH.
|
||||
*/
|
||||
WaitEventSet *wes_read;
|
||||
/*---
|
||||
* WaitEventSet containing:
|
||||
* - WL_SOCKET_WRITABLE on 'conn'
|
||||
* - WL_LATCH_SET on MyLatch, and
|
||||
* - WL_EXIT_ON_PM_DEATH.
|
||||
*/
|
||||
WaitEventSet *wes_write;
|
||||
} PageServer;
|
||||
|
||||
static PageServer page_servers[MAX_SHARDS];
|
||||
@@ -303,119 +328,269 @@ get_shard_number(BufferTag *tag)
|
||||
return hash % n_shards;
|
||||
}
|
||||
|
||||
static inline void
|
||||
CLEANUP_AND_DISCONNECT(PageServer *shard)
|
||||
{
|
||||
if (shard->wes_read)
|
||||
{
|
||||
FreeWaitEventSet(shard->wes_read);
|
||||
shard->wes_read = NULL;
|
||||
}
|
||||
if (shard->wes_write)
|
||||
{
|
||||
FreeWaitEventSet(shard->wes_write);
|
||||
shard->wes_write = NULL;
|
||||
}
|
||||
if (shard->conn)
|
||||
{
|
||||
PQfinish(shard->conn);
|
||||
shard->conn = NULL;
|
||||
}
|
||||
|
||||
shard->state = PS_Disconnected;
|
||||
}
|
||||
|
||||
/*
|
||||
* Connect to a pageserver, or continue to try to connect if we're yet to
|
||||
* complete the connection (e.g. due to receiving an earlier cancellation
|
||||
* during connection start).
|
||||
* Returns true if successfully connected; false if the connection failed.
|
||||
*
|
||||
* Throws errors in unrecoverable situations, or when this backend's query
|
||||
* is canceled.
|
||||
*/
|
||||
static bool
|
||||
pageserver_connect(shardno_t shard_no, int elevel)
|
||||
{
|
||||
char *query;
|
||||
int ret;
|
||||
const char *keywords[3];
|
||||
const char *values[3];
|
||||
int n;
|
||||
PGconn *conn;
|
||||
WaitEventSet *wes;
|
||||
PageServer *shard = &page_servers[shard_no];
|
||||
char connstr[MAX_PAGESERVER_CONNSTRING_SIZE];
|
||||
|
||||
static TimestampTz last_connect_time = 0;
|
||||
static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
|
||||
TimestampTz now;
|
||||
uint64_t us_since_last_connect;
|
||||
bool broke_from_loop = false;
|
||||
|
||||
Assert(page_servers[shard_no].conn == NULL);
|
||||
|
||||
/*
|
||||
* Get the connection string for this shard. If the shard map has been
|
||||
* updated since we last looked, this will also disconnect any existing
|
||||
* pageserver connections as a side effect.
|
||||
* Note that connstr is used both during connection start, and when we
|
||||
* log the successful connection.
|
||||
*/
|
||||
load_shard_map(shard_no, connstr, NULL);
|
||||
|
||||
now = GetCurrentTimestamp();
|
||||
us_since_last_connect = now - last_connect_time;
|
||||
if (us_since_last_connect < MAX_RECONNECT_INTERVAL_USEC)
|
||||
switch (shard->state)
|
||||
{
|
||||
pg_usleep(delay_us);
|
||||
delay_us *= 2;
|
||||
}
|
||||
else
|
||||
case PS_Disconnected:
|
||||
{
|
||||
delay_us = MIN_RECONNECT_INTERVAL_USEC;
|
||||
}
|
||||
const char *keywords[3];
|
||||
const char *values[3];
|
||||
int n_pgsql_params;
|
||||
TimestampTz now;
|
||||
int64 us_since_last_attempt;
|
||||
|
||||
/*
|
||||
* Connect using the connection string we got from the
|
||||
* neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
|
||||
* variable was set, use that as the password.
|
||||
*
|
||||
* The connection options are parsed in the order they're given, so when
|
||||
* we set the password before the connection string, the connection string
|
||||
* can override the password from the env variable. Seems useful, although
|
||||
* we don't currently use that capability anywhere.
|
||||
*/
|
||||
n = 0;
|
||||
if (neon_auth_token)
|
||||
{
|
||||
keywords[n] = "password";
|
||||
values[n] = neon_auth_token;
|
||||
n++;
|
||||
/* Make sure we start with a clean slate */
|
||||
CLEANUP_AND_DISCONNECT(shard);
|
||||
|
||||
neon_shard_log(shard_no, DEBUG5, "Connection state: Disconnected");
|
||||
|
||||
now = GetCurrentTimestamp();
|
||||
us_since_last_attempt = (int64) (now - shard->last_reconnect_time);
|
||||
shard->last_reconnect_time = now;
|
||||
|
||||
/*
|
||||
* If we did other tasks between reconnect attempts, then we won't
|
||||
* need to wait as long as a full delay.
|
||||
*/
|
||||
if (us_since_last_attempt < shard->delay_us)
|
||||
{
|
||||
pg_usleep(shard->delay_us - us_since_last_attempt);
|
||||
}
|
||||
|
||||
/* update the delay metric */
|
||||
shard->delay_us = Min(shard->delay_us * 2, MAX_RECONNECT_INTERVAL_USEC);
|
||||
|
||||
/*
|
||||
* Connect using the connection string we got from the
|
||||
* neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
|
||||
* variable was set, use that as the password.
|
||||
*
|
||||
* The connection options are parsed in the order they're given, so when
|
||||
* we set the password before the connection string, the connection string
|
||||
* can override the password from the env variable. Seems useful, although
|
||||
* we don't currently use that capability anywhere.
|
||||
*/
|
||||
keywords[0] = "dbname";
|
||||
values[0] = connstr;
|
||||
n_pgsql_params = 1;
|
||||
|
||||
if (neon_auth_token)
|
||||
{
|
||||
keywords[1] = "password";
|
||||
values[1] = neon_auth_token;
|
||||
n_pgsql_params++;
|
||||
}
|
||||
|
||||
keywords[n_pgsql_params] = NULL;
|
||||
values[n_pgsql_params] = NULL;
|
||||
|
||||
shard->conn = PQconnectStartParams(keywords, values, 1);
|
||||
if (!shard->conn)
|
||||
{
|
||||
neon_shard_log(shard_no, elevel, "Failed to connect to pageserver: out of memory");
|
||||
return false;
|
||||
}
|
||||
|
||||
shard->wes_read = CreateWaitEventSet(TopMemoryContext, 3);
|
||||
AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET,
|
||||
MyLatch, NULL);
|
||||
AddWaitEventToSet(shard->wes_read, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
|
||||
NULL, NULL);
|
||||
AddWaitEventToSet(shard->wes_read, WL_SOCKET_READABLE, PQsocket(shard->conn), NULL, NULL);
|
||||
|
||||
shard->wes_write = CreateWaitEventSet(TopMemoryContext, 3);
|
||||
AddWaitEventToSet(shard->wes_write, WL_LATCH_SET, PGINVALID_SOCKET,
|
||||
MyLatch, NULL);
|
||||
AddWaitEventToSet(shard->wes_write, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
|
||||
NULL, NULL);
|
||||
AddWaitEventToSet(shard->wes_write, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE,
|
||||
PQsocket(shard->conn),
|
||||
NULL, NULL);
|
||||
|
||||
shard->state = PS_Connecting_Startup;
|
||||
/* fallthrough */
|
||||
}
|
||||
keywords[n] = "dbname";
|
||||
values[n] = connstr;
|
||||
n++;
|
||||
keywords[n] = NULL;
|
||||
values[n] = NULL;
|
||||
n++;
|
||||
conn = PQconnectdbParams(keywords, values, 1);
|
||||
last_connect_time = GetCurrentTimestamp();
|
||||
|
||||
if (PQstatus(conn) == CONNECTION_BAD)
|
||||
case PS_Connecting_Startup:
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(conn));
|
||||
char *pagestream_query;
|
||||
int ps_send_query_ret;
|
||||
bool connected = false;
|
||||
|
||||
PQfinish(conn);
|
||||
neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_Startup");
|
||||
|
||||
ereport(elevel,
|
||||
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
|
||||
errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
|
||||
errdetail_internal("%s", msg)));
|
||||
pfree(msg);
|
||||
return false;
|
||||
}
|
||||
switch (neon_protocol_version)
|
||||
{
|
||||
do
|
||||
{
|
||||
WaitEvent event;
|
||||
int poll_result = PQconnectPoll(shard->conn);
|
||||
|
||||
switch (poll_result)
|
||||
{
|
||||
default: /* unknown/unused states are handled as a failed connection */
|
||||
case PGRES_POLLING_FAILED:
|
||||
{
|
||||
char *pqerr = PQerrorMessage(shard->conn);
|
||||
char *msg = NULL;
|
||||
neon_shard_log(shard_no, DEBUG5, "POLLING_FAILED");
|
||||
|
||||
if (pqerr)
|
||||
msg = pchomp(pqerr);
|
||||
|
||||
CLEANUP_AND_DISCONNECT(shard);
|
||||
|
||||
if (msg)
|
||||
{
|
||||
neon_shard_log(shard_no, elevel,
|
||||
"could not connect to pageserver: %s",
|
||||
msg);
|
||||
pfree(msg);
|
||||
}
|
||||
else
|
||||
neon_shard_log(shard_no, elevel,
|
||||
"could not connect to pageserver");
|
||||
|
||||
return false;
|
||||
}
|
||||
case PGRES_POLLING_READING:
|
||||
/* Sleep until there's something to do */
|
||||
(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1,
|
||||
PG_WAIT_EXTENSION);
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
/* query cancellation, backend shutdown */
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
/* PQconnectPoll() handles the socket polling state updates */
|
||||
|
||||
break;
|
||||
case PGRES_POLLING_WRITING:
|
||||
/* Sleep until there's something to do */
|
||||
(void) WaitEventSetWait(shard->wes_write, -1L, &event, 1,
|
||||
PG_WAIT_EXTENSION);
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
/* query cancellation, backend shutdown */
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
/* PQconnectPoll() handles the socket polling state updates */
|
||||
|
||||
break;
|
||||
case PGRES_POLLING_OK:
|
||||
neon_shard_log(shard_no, DEBUG5, "POLLING_OK");
|
||||
connected = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
while (!connected);
|
||||
|
||||
/* No more polling needed; connection succeeded */
|
||||
shard->last_connect_time = GetCurrentTimestamp();
|
||||
|
||||
switch (neon_protocol_version)
|
||||
{
|
||||
case 2:
|
||||
query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline);
|
||||
pagestream_query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline);
|
||||
break;
|
||||
case 1:
|
||||
query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
|
||||
pagestream_query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
|
||||
break;
|
||||
default:
|
||||
elog(ERROR, "unexpected neon_protocol_version %d", neon_protocol_version);
|
||||
}
|
||||
ret = PQsendQuery(conn, query);
|
||||
pfree(query);
|
||||
if (ret != 1)
|
||||
{
|
||||
PQfinish(conn);
|
||||
neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
wes = CreateWaitEventSet(TopMemoryContext, 3);
|
||||
AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET,
|
||||
MyLatch, NULL);
|
||||
AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
|
||||
NULL, NULL);
|
||||
AddWaitEventToSet(wes, WL_SOCKET_READABLE, PQsocket(conn), NULL, NULL);
|
||||
if (PQstatus(shard->conn) == CONNECTION_BAD)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(shard->conn));
|
||||
|
||||
PG_TRY();
|
||||
CLEANUP_AND_DISCONNECT(shard);
|
||||
|
||||
ereport(elevel,
|
||||
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
|
||||
errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
|
||||
errdetail_internal("%s", msg)));
|
||||
pfree(msg);
|
||||
return false;
|
||||
}
|
||||
|
||||
ps_send_query_ret = PQsendQuery(shard->conn, pagestream_query);
|
||||
pfree(pagestream_query);
|
||||
if (ps_send_query_ret != 1)
|
||||
{
|
||||
CLEANUP_AND_DISCONNECT(shard);
|
||||
|
||||
neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver");
|
||||
return false;
|
||||
}
|
||||
|
||||
shard->state = PS_Connecting_PageStream;
|
||||
/* fallthrough */
|
||||
}
|
||||
case PS_Connecting_PageStream:
|
||||
{
|
||||
while (PQisBusy(conn))
|
||||
neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_PageStream");
|
||||
|
||||
if (PQstatus(shard->conn) == CONNECTION_BAD)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(shard->conn));
|
||||
CLEANUP_AND_DISCONNECT(shard);
|
||||
ereport(elevel,
|
||||
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
|
||||
errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
|
||||
errdetail_internal("%s", msg)));
|
||||
pfree(msg);
|
||||
return false;
|
||||
}
|
||||
|
||||
while (PQisBusy(shard->conn))
|
||||
{
|
||||
WaitEvent event;
|
||||
|
||||
/* Sleep until there's something to do */
|
||||
(void) WaitEventSetWait(wes, -1L, &event, 1, PG_WAIT_EXTENSION);
|
||||
(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, PG_WAIT_EXTENSION);
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
@@ -423,40 +598,37 @@ pageserver_connect(shardno_t shard_no, int elevel)
|
||||
/* Data available in socket? */
|
||||
if (event.events & WL_SOCKET_READABLE)
|
||||
{
|
||||
if (!PQconsumeInput(conn))
|
||||
if (!PQconsumeInput(shard->conn))
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(conn));
|
||||
|
||||
PQfinish(conn);
|
||||
FreeWaitEventSet(wes);
|
||||
char *msg = pchomp(PQerrorMessage(shard->conn));
|
||||
|
||||
CLEANUP_AND_DISCONNECT(shard);
|
||||
neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s",
|
||||
msg);
|
||||
/* Returning from inside PG_TRY is bad, so we break/return later */
|
||||
broke_from_loop = true;
|
||||
break;
|
||||
pfree(msg);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
PG_CATCH();
|
||||
{
|
||||
PQfinish(conn);
|
||||
FreeWaitEventSet(wes);
|
||||
PG_RE_THROW();
|
||||
}
|
||||
PG_END_TRY();
|
||||
|
||||
if (broke_from_loop)
|
||||
{
|
||||
return false;
|
||||
shard->state = PS_Connected;
|
||||
/* fallthrough */
|
||||
}
|
||||
case PS_Connected:
|
||||
/*
|
||||
* We successfully connected. Future connections to this PageServer
|
||||
* will do fast retries again, with exponential backoff.
|
||||
*/
|
||||
shard->delay_us = MIN_RECONNECT_INTERVAL_USEC;
|
||||
|
||||
neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version);
|
||||
page_servers[shard_no].conn = conn;
|
||||
page_servers[shard_no].wes = wes;
|
||||
|
||||
return true;
|
||||
neon_shard_log(shard_no, DEBUG5, "Connection state: Connected");
|
||||
neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version);
|
||||
return true;
|
||||
default:
|
||||
neon_shard_log(shard_no, ERROR, "libpagestore: invalid connection state %d", shard->state);
|
||||
}
|
||||
/* This shouldn't be hit */
|
||||
Assert(false);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -476,7 +648,7 @@ retry:
|
||||
WaitEvent event;
|
||||
|
||||
/* Sleep until there's something to do */
|
||||
(void) WaitEventSetWait(page_servers[shard_no].wes, -1L, &event, 1, PG_WAIT_EXTENSION);
|
||||
(void) WaitEventSetWait(page_servers[shard_no].wes_read, -1L, &event, 1, PG_WAIT_EXTENSION);
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
@@ -502,7 +674,8 @@ retry:
|
||||
|
||||
/*
|
||||
* Reset prefetch and drop connection to the shard.
|
||||
* It also drops connection to all other shards involved in prefetch.
|
||||
* It also drops connection to all other shards involved in prefetch, through
|
||||
* prefetch_on_ps_disconnect().
|
||||
*/
|
||||
static void
|
||||
pageserver_disconnect(shardno_t shard_no)
|
||||
@@ -512,9 +685,6 @@ pageserver_disconnect(shardno_t shard_no)
|
||||
* whole prefetch queue, even for other pageservers. It should not
|
||||
* cause big problems, because connection loss is supposed to be a
|
||||
* rare event.
|
||||
*
|
||||
* Prefetch state should be reset even if page_servers[shard_no].conn == NULL,
|
||||
* because prefetch request may be registered before connection is established.
|
||||
*/
|
||||
prefetch_on_ps_disconnect();
|
||||
|
||||
@@ -527,37 +697,36 @@ pageserver_disconnect(shardno_t shard_no)
|
||||
static void
|
||||
pageserver_disconnect_shard(shardno_t shard_no)
|
||||
{
|
||||
PageServer *shard = &page_servers[shard_no];
|
||||
/*
|
||||
* If anything goes wrong while we were sending a request, it's not clear
|
||||
* what state the connection is in. For example, if we sent the request
|
||||
* but didn't receive a response yet, we might receive the response some
|
||||
* time later after we have already sent a new unrelated request. Close
|
||||
* the connection to avoid getting confused.
|
||||
* Similarly, even when we're in PS_DISCONNECTED, we may have junk to
|
||||
* clean up: It is possible that we encountered an error allocating any
|
||||
* of the wait event sets or the psql connection, or failed when we tried
|
||||
* to attach wait events to the WaitEventSets.
|
||||
*/
|
||||
if (page_servers[shard_no].conn)
|
||||
{
|
||||
neon_shard_log(shard_no, LOG, "dropping connection to page server due to error");
|
||||
PQfinish(page_servers[shard_no].conn);
|
||||
page_servers[shard_no].conn = NULL;
|
||||
}
|
||||
if (page_servers[shard_no].wes != NULL)
|
||||
{
|
||||
FreeWaitEventSet(page_servers[shard_no].wes);
|
||||
page_servers[shard_no].wes = NULL;
|
||||
}
|
||||
CLEANUP_AND_DISCONNECT(shard);
|
||||
|
||||
shard->state = PS_Disconnected;
|
||||
}
|
||||
|
||||
static bool
|
||||
pageserver_send(shardno_t shard_no, NeonRequest *request)
|
||||
{
|
||||
StringInfoData req_buff;
|
||||
PGconn *pageserver_conn = page_servers[shard_no].conn;
|
||||
PageServer *shard = &page_servers[shard_no];
|
||||
PGconn *pageserver_conn;
|
||||
|
||||
/* If the connection was lost for some reason, reconnect */
|
||||
if (pageserver_conn && PQstatus(pageserver_conn) == CONNECTION_BAD)
|
||||
if (shard->state == PS_Connected && PQstatus(shard->conn) == CONNECTION_BAD)
|
||||
{
|
||||
neon_shard_log(shard_no, LOG, "pageserver_send disconnect bad connection");
|
||||
pageserver_disconnect(shard_no);
|
||||
pageserver_conn = NULL;
|
||||
}
|
||||
|
||||
req_buff = nm_pack_request(request);
|
||||
@@ -571,17 +740,19 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
|
||||
* https://github.com/neondatabase/neon/issues/1138 So try to reestablish
|
||||
* connection in case of failure.
|
||||
*/
|
||||
if (!page_servers[shard_no].conn)
|
||||
if (shard->state != PS_Connected)
|
||||
{
|
||||
while (!pageserver_connect(shard_no, n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
|
||||
while (!pageserver_connect(shard_no, shard->n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
|
||||
{
|
||||
HandleMainLoopInterrupts();
|
||||
n_reconnect_attempts += 1;
|
||||
shard->n_reconnect_attempts += 1;
|
||||
}
|
||||
n_reconnect_attempts = 0;
|
||||
shard->n_reconnect_attempts = 0;
|
||||
} else {
|
||||
Assert(shard->conn != NULL);
|
||||
}
|
||||
|
||||
pageserver_conn = page_servers[shard_no].conn;
|
||||
pageserver_conn = shard->conn;
|
||||
|
||||
/*
|
||||
* Send request.
|
||||
@@ -590,13 +761,17 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
|
||||
* should use async mode and check for interrupts while waiting. In
|
||||
* practice, our requests are small enough to always fit in the output and
|
||||
* TCP buffer.
|
||||
*
|
||||
* Note that this also will fail when the connection is in the
|
||||
* PGRES_POLLING_WRITING state. It's kinda dirty to disconnect at this
|
||||
* point, but on the grand scheme of things it's only a small issue.
|
||||
*/
|
||||
if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
|
||||
neon_shard_log(shard_no, LOG, "pageserver_send disconnected: failed to send page request (try to reconnect): %s", msg);
|
||||
pfree(msg);
|
||||
pfree(req_buff.data);
|
||||
return false;
|
||||
@@ -611,6 +786,7 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
|
||||
neon_shard_log(shard_no, PageStoreTrace, "sent request: %s", msg);
|
||||
pfree(msg);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -619,58 +795,68 @@ pageserver_receive(shardno_t shard_no)
|
||||
{
|
||||
StringInfoData resp_buff;
|
||||
NeonResponse *resp;
|
||||
PGconn *pageserver_conn = page_servers[shard_no].conn;
|
||||
PageServer *shard = &page_servers[shard_no];
|
||||
PGconn *pageserver_conn = shard->conn;
|
||||
/* read response */
|
||||
int rc;
|
||||
|
||||
if (!pageserver_conn)
|
||||
return NULL;
|
||||
|
||||
PG_TRY();
|
||||
if (shard->state != PS_Connected)
|
||||
{
|
||||
/* read response */
|
||||
int rc;
|
||||
neon_shard_log(shard_no, LOG,
|
||||
"pageserver_receive: returning NULL for non-connected pageserver connection: 0x%02x",
|
||||
shard->state);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
rc = call_PQgetCopyData(shard_no, &resp_buff.data);
|
||||
if (rc >= 0)
|
||||
Assert(pageserver_conn);
|
||||
|
||||
rc = call_PQgetCopyData(shard_no, &resp_buff.data);
|
||||
if (rc >= 0)
|
||||
{
|
||||
/* call_PQgetCopyData handles rc == 0 */
|
||||
Assert(rc > 0);
|
||||
|
||||
PG_TRY();
|
||||
{
|
||||
resp_buff.len = rc;
|
||||
resp_buff.cursor = 0;
|
||||
resp = nm_unpack_response(&resp_buff);
|
||||
PQfreemem(resp_buff.data);
|
||||
|
||||
if (message_level_is_interesting(PageStoreTrace))
|
||||
{
|
||||
char *msg = nm_to_string((NeonMessage *) resp);
|
||||
|
||||
neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
|
||||
pfree(msg);
|
||||
}
|
||||
}
|
||||
else if (rc == -1)
|
||||
PG_CATCH();
|
||||
{
|
||||
neon_shard_log(shard_no, LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
|
||||
neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due malformatted response");
|
||||
pageserver_disconnect(shard_no);
|
||||
resp = NULL;
|
||||
PG_RE_THROW();
|
||||
}
|
||||
else if (rc == -2)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
PG_END_TRY();
|
||||
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
|
||||
}
|
||||
else
|
||||
if (message_level_is_interesting(PageStoreTrace))
|
||||
{
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
|
||||
char *msg = nm_to_string((NeonMessage *) resp);
|
||||
|
||||
neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
|
||||
pfree(msg);
|
||||
}
|
||||
}
|
||||
PG_CATCH();
|
||||
else if (rc == -1)
|
||||
{
|
||||
neon_shard_log(shard_no, LOG, "pageserver_receive disconnect due to caught exception");
|
||||
neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", pchomp(PQerrorMessage(pageserver_conn)));
|
||||
pageserver_disconnect(shard_no);
|
||||
PG_RE_THROW();
|
||||
resp = NULL;
|
||||
}
|
||||
else if (rc == -2)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: could not read COPY data: %s", msg);
|
||||
}
|
||||
else
|
||||
{
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc);
|
||||
}
|
||||
PG_END_TRY();
|
||||
|
||||
return (NeonResponse *) resp;
|
||||
}
|
||||
@@ -681,7 +867,7 @@ pageserver_flush(shardno_t shard_no)
|
||||
{
|
||||
PGconn *pageserver_conn = page_servers[shard_no].conn;
|
||||
|
||||
if (!pageserver_conn)
|
||||
if (page_servers[shard_no].state != PS_Connected)
|
||||
{
|
||||
neon_shard_log(shard_no, WARNING, "Tried to flush while disconnected");
|
||||
}
|
||||
@@ -697,6 +883,7 @@ pageserver_flush(shardno_t shard_no)
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -860,7 +1047,7 @@ pg_init_libpagestore(void)
|
||||
"Version of compute<->page server protocol",
|
||||
NULL,
|
||||
&neon_protocol_version,
|
||||
1, /* default to old protocol for now */
|
||||
2, /* use protocol version 2 */
|
||||
1, /* min */
|
||||
2, /* max */
|
||||
PGC_SU_BACKEND,
|
||||
@@ -891,5 +1078,7 @@ pg_init_libpagestore(void)
|
||||
dbsize_hook = neon_dbsize;
|
||||
}
|
||||
|
||||
memset(page_servers, 0, sizeof(page_servers));
|
||||
|
||||
lfc_init();
|
||||
}
|
||||
|
||||
@@ -45,6 +45,7 @@
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "access/parallel.h"
|
||||
#include "access/xact.h"
|
||||
#include "access/xlog.h"
|
||||
#include "access/xlogdefs.h"
|
||||
@@ -93,6 +94,10 @@ static char *hexdump_page(char *page);
|
||||
|
||||
const int SmgrTrace = DEBUG5;
|
||||
|
||||
#define NEON_PANIC_CONNECTION_STATE(shard_no, elvl, message, ...) \
|
||||
neon_shard_log(shard_no, elvl, "Broken connection state: " message, \
|
||||
##__VA_ARGS__)
|
||||
|
||||
page_server_api *page_server;
|
||||
|
||||
/* unlogged relation build states */
|
||||
@@ -525,6 +530,8 @@ prefetch_flush_requests(void)
|
||||
*
|
||||
* NOTE: this function may indirectly update MyPState->pfs_hash; which
|
||||
* invalidates any active pointers into the hash table.
|
||||
* NOTE: callers should make sure they can handle query cancellations in this
|
||||
* function's call path.
|
||||
*/
|
||||
static bool
|
||||
prefetch_wait_for(uint64 ring_index)
|
||||
@@ -560,6 +567,8 @@ prefetch_wait_for(uint64 ring_index)
|
||||
*
|
||||
* NOTE: this function may indirectly update MyPState->pfs_hash; which
|
||||
* invalidates any active pointers into the hash table.
|
||||
*
|
||||
* NOTE: this does IO, and can get canceled out-of-line.
|
||||
*/
|
||||
static bool
|
||||
prefetch_read(PrefetchRequest *slot)
|
||||
@@ -571,6 +580,14 @@ prefetch_read(PrefetchRequest *slot)
|
||||
Assert(slot->response == NULL);
|
||||
Assert(slot->my_ring_index == MyPState->ring_receive);
|
||||
|
||||
if (slot->status != PRFS_REQUESTED ||
|
||||
slot->response != NULL ||
|
||||
slot->my_ring_index != MyPState->ring_receive)
|
||||
neon_shard_log(slot->shard_no, ERROR,
|
||||
"Incorrect prefetch read: status=%d response=%llx my=%llu receive=%llu",
|
||||
slot->status, (size_t) (void *) slot->response,
|
||||
slot->my_ring_index, MyPState->ring_receive);
|
||||
|
||||
old = MemoryContextSwitchTo(MyPState->errctx);
|
||||
response = (NeonResponse *) page_server->receive(slot->shard_no);
|
||||
MemoryContextSwitchTo(old);
|
||||
@@ -588,6 +605,11 @@ prefetch_read(PrefetchRequest *slot)
|
||||
}
|
||||
else
|
||||
{
|
||||
neon_shard_log(slot->shard_no, WARNING,
|
||||
"No response from reading prefetch entry %llu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
|
||||
slot->my_ring_index,
|
||||
RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)),
|
||||
slot->buftag.forkNum, slot->buftag.blockNum);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -602,6 +624,7 @@ void
|
||||
prefetch_on_ps_disconnect(void)
|
||||
{
|
||||
MyPState->ring_flush = MyPState->ring_unused;
|
||||
|
||||
while (MyPState->ring_receive < MyPState->ring_unused)
|
||||
{
|
||||
PrefetchRequest *slot;
|
||||
@@ -624,6 +647,7 @@ prefetch_on_ps_disconnect(void)
|
||||
slot->status = PRFS_TAG_REMAINS;
|
||||
MyPState->n_requests_inflight -= 1;
|
||||
MyPState->ring_receive += 1;
|
||||
|
||||
prefetch_set_unused(ring_index);
|
||||
}
|
||||
}
|
||||
@@ -690,6 +714,8 @@ static void
|
||||
prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns)
|
||||
{
|
||||
bool found;
|
||||
uint64 mySlotNo = slot->my_ring_index;
|
||||
|
||||
NeonGetPageRequest request = {
|
||||
.req.tag = T_NeonGetPageRequest,
|
||||
/* lsn and not_modified_since are filled in below */
|
||||
@@ -698,6 +724,8 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
|
||||
.blkno = slot->buftag.blockNum,
|
||||
};
|
||||
|
||||
Assert(mySlotNo == MyPState->ring_unused);
|
||||
|
||||
if (force_request_lsns)
|
||||
slot->request_lsns = *force_request_lsns;
|
||||
else
|
||||
@@ -710,7 +738,11 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
|
||||
Assert(slot->response == NULL);
|
||||
Assert(slot->my_ring_index == MyPState->ring_unused);
|
||||
|
||||
while (!page_server->send(slot->shard_no, (NeonRequest *) &request));
|
||||
while (!page_server->send(slot->shard_no, (NeonRequest *) &request))
|
||||
{
|
||||
Assert(mySlotNo == MyPState->ring_unused);
|
||||
/* loop */
|
||||
}
|
||||
|
||||
/* update prefetch state */
|
||||
MyPState->n_requests_inflight += 1;
|
||||
@@ -721,7 +753,6 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
|
||||
|
||||
/* update slot state */
|
||||
slot->status = PRFS_REQUESTED;
|
||||
|
||||
prfh_insert(MyPState->prf_hash, slot, &found);
|
||||
Assert(!found);
|
||||
}
|
||||
@@ -893,6 +924,10 @@ Retry:
|
||||
return ring_index;
|
||||
}
|
||||
|
||||
/*
|
||||
* Note: this function can get canceled and use a long jump to the next catch
|
||||
* context. Take care.
|
||||
*/
|
||||
static NeonResponse *
|
||||
page_server_request(void const *req)
|
||||
{
|
||||
@@ -924,19 +959,38 @@ page_server_request(void const *req)
|
||||
* Current sharding model assumes that all metadata is present only at shard 0.
|
||||
* We still need to call get_shard_no() to check if shard map is up-to-date.
|
||||
*/
|
||||
if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM)
|
||||
if (((NeonRequest *) req)->tag != T_NeonGetPageRequest ||
|
||||
((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM)
|
||||
{
|
||||
shard_no = 0;
|
||||
}
|
||||
|
||||
do
|
||||
{
|
||||
while (!page_server->send(shard_no, (NeonRequest *) req) || !page_server->flush(shard_no));
|
||||
consume_prefetch_responses();
|
||||
resp = page_server->receive(shard_no);
|
||||
} while (resp == NULL);
|
||||
return resp;
|
||||
PG_TRY();
|
||||
{
|
||||
while (!page_server->send(shard_no, (NeonRequest *) req)
|
||||
|| !page_server->flush(shard_no))
|
||||
{
|
||||
/* do nothing */
|
||||
}
|
||||
consume_prefetch_responses();
|
||||
resp = page_server->receive(shard_no);
|
||||
}
|
||||
PG_CATCH();
|
||||
{
|
||||
/*
|
||||
* Cancellation in this code needs to be handled better at some
|
||||
* point, but this currently seems fine for now.
|
||||
*/
|
||||
page_server->disconnect(shard_no);
|
||||
PG_RE_THROW();
|
||||
}
|
||||
PG_END_TRY();
|
||||
|
||||
} while (resp == NULL);
|
||||
|
||||
return resp;
|
||||
}
|
||||
|
||||
|
||||
@@ -1348,6 +1402,10 @@ PageIsEmptyHeapPage(char *buffer)
|
||||
return memcmp(buffer, empty_page.data, BLCKSZ) == 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* A page is being evicted from the shared buffer cache. Update the
|
||||
* last-written LSN of the page, and WAL-log it if needed.
|
||||
*/
|
||||
static void
|
||||
#if PG_MAJORVERSION_NUM < 16
|
||||
neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
|
||||
@@ -1356,12 +1414,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
|
||||
#endif
|
||||
{
|
||||
XLogRecPtr lsn = PageGetLSN((Page) buffer);
|
||||
|
||||
if (ShutdownRequestPending)
|
||||
return;
|
||||
/* Don't log any pages if we're not allowed to do so. */
|
||||
if (!XLogInsertAllowed())
|
||||
return;
|
||||
bool log_page;
|
||||
|
||||
/*
|
||||
* Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM
|
||||
@@ -1370,9 +1423,21 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
|
||||
* correctness, the non-logged updates are not critical. But we want to
|
||||
* have a reasonably up-to-date VM and FSM in the page server.
|
||||
*/
|
||||
if ((force || forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM) && !RecoveryInProgress())
|
||||
log_page = false;
|
||||
if (force)
|
||||
{
|
||||
Assert(XLogInsertAllowed());
|
||||
log_page = true;
|
||||
}
|
||||
else if (XLogInsertAllowed() &&
|
||||
!ShutdownRequestPending &&
|
||||
(forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM))
|
||||
{
|
||||
log_page = true;
|
||||
}
|
||||
|
||||
if (log_page)
|
||||
{
|
||||
/* FSM is never WAL-logged and we don't care. */
|
||||
XLogRecPtr recptr;
|
||||
|
||||
recptr = log_newpage_copy(&InfoFromSMgrRel(reln), forknum, blocknum,
|
||||
@@ -1385,7 +1450,8 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum, LSN_FORMAT_ARGS(lsn))));
|
||||
}
|
||||
else if (lsn == InvalidXLogRecPtr)
|
||||
|
||||
if (lsn == InvalidXLogRecPtr)
|
||||
{
|
||||
/*
|
||||
* When PostgreSQL extends a relation, it calls smgrextend() with an
|
||||
@@ -1421,19 +1487,31 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum)));
|
||||
}
|
||||
else
|
||||
else if (forknum != FSM_FORKNUM && forknum != VISIBILITYMAP_FORKNUM)
|
||||
{
|
||||
ereport(PANIC,
|
||||
/*
|
||||
* Its a bad sign if there is a page with zero LSN in the buffer
|
||||
* cache in a standby, too. However, PANICing seems like a cure
|
||||
* worse than the disease, as the damage has likely already been
|
||||
* done in the primary. So in a standby, make this an assertion,
|
||||
* and in a release build just LOG the error and soldier on. We
|
||||
* update the last-written LSN of the page with a conservative
|
||||
* value in that case, which is the last replayed LSN.
|
||||
*/
|
||||
ereport(RecoveryInProgress() ? LOG : PANIC,
|
||||
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
|
||||
blocknum,
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum)));
|
||||
Assert(false);
|
||||
|
||||
lsn = GetXLogReplayRecPtr(NULL); /* in standby mode, soldier on */
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
ereport(SmgrTrace,
|
||||
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
|
||||
(errmsg(NEON_TAG "Evicting page %u of relation %u/%u/%u.%u with lsn=%X/%X",
|
||||
blocknum,
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum, LSN_FORMAT_ARGS(lsn))));
|
||||
@@ -1526,8 +1604,92 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
|
||||
|
||||
if (RecoveryInProgress())
|
||||
{
|
||||
/* Request the page at the last replayed LSN. */
|
||||
result.request_lsn = GetXLogReplayRecPtr(NULL);
|
||||
/*---
|
||||
* In broad strokes, a replica always requests the page at the current
|
||||
* replay LSN. But looking closer, what exactly is the replay LSN? Is
|
||||
* it the last replayed record, or the record being replayed? And does
|
||||
* the startup process performing the replay need to do something
|
||||
* differently than backends running queries? Let's take a closer look
|
||||
* at the different scenarios:
|
||||
*
|
||||
* 1. Startup process reads a page, last_written_lsn is old.
|
||||
*
|
||||
* Read the old version of the page. We will apply the WAL record on
|
||||
* it to bring it up-to-date.
|
||||
*
|
||||
* We could read the new version, with the changes from this WAL
|
||||
* record already applied, to offload the work of replaying the record
|
||||
* to the pageserver. The pageserver might not have received the WAL
|
||||
* record yet, though, so a read of the old page version and applying
|
||||
* the record ourselves is likely faster. Also, the redo function
|
||||
* might be surprised if the changes have already applied. That's
|
||||
* normal during crash recovery, but not in hot standby.
|
||||
*
|
||||
* 2. Startup process reads a page, last_written_lsn == record we're
|
||||
* replaying.
|
||||
*
|
||||
* Can this happen? There are a few theoretical cases when it might:
|
||||
*
|
||||
* A) The redo function reads the same page twice. We had already read
|
||||
* and applied the changes once, and now we're reading it for the
|
||||
* second time. That would be a rather silly thing for a redo
|
||||
* function to do, and I'm not aware of any that would do it.
|
||||
*
|
||||
* B) The redo function modifies multiple pages, and it already
|
||||
* applied the changes to one of the pages, released the lock on
|
||||
* it, and is now reading a second page. Furthermore, the first
|
||||
* page was already evicted from the buffer cache, and also from
|
||||
* the last-written LSN cache, so that the per-relation or global
|
||||
* last-written LSN was already updated. All the WAL redo functions
|
||||
* hold the locks on pages that they modify, until all the changes
|
||||
* have been modified (?), which would make that impossible.
|
||||
* However, we skip the locking, if the page isn't currently in the
|
||||
* page cache (see neon_redo_read_buffer_filter below).
|
||||
*
|
||||
* Even if the one of the above cases were possible in theory, they
|
||||
* would also require the pages being modified by the redo function to
|
||||
* be immediately evicted from the page cache.
|
||||
*
|
||||
* So this probably does not happen in practice. But if it does, we
|
||||
* request the new version, including the changes from the record
|
||||
* being replayed. That seems like the correct behavior in any case.
|
||||
*
|
||||
* 3. Backend process reads a page with old last-written LSN
|
||||
*
|
||||
* Nothing special here. Read the old version.
|
||||
*
|
||||
* 4. Backend process reads a page with last_written_lsn == record being replayed
|
||||
*
|
||||
* This can happen, if the redo function has started to run, and saw
|
||||
* that the page isn't present in the page cache (see
|
||||
* neon_redo_read_buffer_filter below). Normally, in a normal
|
||||
* Postgres server, the redo function would hold a lock on the page,
|
||||
* so we would get blocked waiting the redo function to release the
|
||||
* lock. To emulate that, wait for the WAL replay of the record to
|
||||
* finish.
|
||||
*/
|
||||
/* Request the page at the end of the last fully replayed LSN. */
|
||||
XLogRecPtr replay_lsn = GetXLogReplayRecPtr(NULL);
|
||||
|
||||
if (last_written_lsn > replay_lsn)
|
||||
{
|
||||
/* GetCurrentReplayRecPtr was introduced in v15 */
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL));
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Cases 2 and 4. If this is a backend (case 4), the
|
||||
* neon_read_at_lsn() call later will wait for the WAL record to be
|
||||
* fully replayed.
|
||||
*/
|
||||
result.request_lsn = last_written_lsn;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* cases 1 and 3 */
|
||||
result.request_lsn = replay_lsn;
|
||||
}
|
||||
result.not_modified_since = last_written_lsn;
|
||||
result.effective_request_lsn = result.request_lsn;
|
||||
Assert(last_written_lsn <= result.request_lsn);
|
||||
@@ -1796,7 +1958,9 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
|
||||
break;
|
||||
|
||||
default:
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_exists", resp->tag);
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
"Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x",
|
||||
T_NeonExistsResponse, T_NeonErrorResponse, resp->tag);
|
||||
}
|
||||
pfree(resp);
|
||||
return exists;
|
||||
@@ -2248,7 +2412,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
/*
|
||||
* Try to find prefetched page in the list of received pages.
|
||||
*/
|
||||
Retry:
|
||||
Retry:
|
||||
entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag);
|
||||
|
||||
if (entry != NULL)
|
||||
@@ -2334,7 +2498,9 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
((NeonErrorResponse *) resp)->message)));
|
||||
break;
|
||||
default:
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_at_lsn", resp->tag);
|
||||
NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
|
||||
"Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x",
|
||||
T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag);
|
||||
}
|
||||
|
||||
/* buffer was used, clean up for later reuse */
|
||||
@@ -2605,7 +2771,9 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
|
||||
break;
|
||||
|
||||
default:
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_nblocks", resp->tag);
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
"Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x",
|
||||
T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag);
|
||||
}
|
||||
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
|
||||
|
||||
@@ -2658,7 +2826,9 @@ neon_dbsize(Oid dbNode)
|
||||
break;
|
||||
|
||||
default:
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_dbsize", resp->tag);
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
"Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x",
|
||||
T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag);
|
||||
}
|
||||
|
||||
neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
|
||||
@@ -2822,10 +2992,14 @@ neon_start_unlogged_build(SMgrRelation reln)
|
||||
reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED;
|
||||
|
||||
/*
|
||||
* Create the local file. In a parallel build, the leader is expected to
|
||||
* call this first and do it.
|
||||
*
|
||||
* FIXME: should we pass isRedo true to create the tablespace dir if it
|
||||
* doesn't exist? Is it needed?
|
||||
*/
|
||||
mdcreate(reln, MAIN_FORKNUM, false);
|
||||
if (!IsParallelWorker())
|
||||
mdcreate(reln, MAIN_FORKNUM, false);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2849,7 +3023,17 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
|
||||
Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1);
|
||||
Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
|
||||
|
||||
unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
|
||||
/*
|
||||
* In a parallel build, (only) the leader process performs the 2nd
|
||||
* phase.
|
||||
*/
|
||||
if (IsParallelWorker())
|
||||
{
|
||||
unlogged_build_rel = NULL;
|
||||
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
|
||||
}
|
||||
else
|
||||
unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2983,7 +3167,9 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
|
||||
break;
|
||||
|
||||
default:
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_slru_segment", resp->tag);
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
"Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x",
|
||||
T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag);
|
||||
}
|
||||
pfree(resp);
|
||||
|
||||
@@ -3201,7 +3387,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
|
||||
BufferTag tag;
|
||||
uint32 hash;
|
||||
LWLock *partitionLock;
|
||||
Buffer buffer;
|
||||
int buf_id;
|
||||
bool no_redo_needed;
|
||||
|
||||
if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id))
|
||||
@@ -3239,20 +3425,20 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
|
||||
else
|
||||
{
|
||||
/* Try to find the relevant buffer */
|
||||
buffer = BufTableLookup(&tag, hash);
|
||||
buf_id = BufTableLookup(&tag, hash);
|
||||
|
||||
no_redo_needed = buffer < 0;
|
||||
no_redo_needed = buf_id < 0;
|
||||
}
|
||||
/* In both cases st lwlsn past this WAL record */
|
||||
SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);
|
||||
|
||||
/*
|
||||
* we don't have the buffer in memory, update lwLsn past this record, also
|
||||
* evict page from file cache
|
||||
*/
|
||||
if (no_redo_needed)
|
||||
{
|
||||
SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);
|
||||
lfc_evict(rinfo, forknum, blkno);
|
||||
|
||||
}
|
||||
|
||||
LWLockRelease(partitionLock);
|
||||
|
||||
|
||||
@@ -1852,34 +1852,30 @@ static void
|
||||
CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp)
|
||||
{
|
||||
hs->ts = 0;
|
||||
hs->xmin.value = ~0; /* largest unsigned value */
|
||||
hs->catalog_xmin.value = ~0; /* largest unsigned value */
|
||||
hs->xmin = InvalidFullTransactionId;
|
||||
hs->catalog_xmin = InvalidFullTransactionId;
|
||||
|
||||
for (int i = 0; i < wp->n_safekeepers; i++)
|
||||
{
|
||||
if (wp->safekeeper[i].appendResponse.hs.ts != 0)
|
||||
|
||||
if (wp->safekeeper[i].state == SS_ACTIVE)
|
||||
{
|
||||
HotStandbyFeedback *skhs = &wp->safekeeper[i].appendResponse.hs;
|
||||
|
||||
if (FullTransactionIdIsNormal(skhs->xmin)
|
||||
&& FullTransactionIdPrecedes(skhs->xmin, hs->xmin))
|
||||
&& (!FullTransactionIdIsValid(hs->xmin) || FullTransactionIdPrecedes(skhs->xmin, hs->xmin)))
|
||||
{
|
||||
hs->xmin = skhs->xmin;
|
||||
hs->ts = skhs->ts;
|
||||
}
|
||||
if (FullTransactionIdIsNormal(skhs->catalog_xmin)
|
||||
&& FullTransactionIdPrecedes(skhs->catalog_xmin, hs->xmin))
|
||||
&& (!FullTransactionIdIsValid(hs->catalog_xmin) || FullTransactionIdPrecedes(skhs->catalog_xmin, hs->catalog_xmin)))
|
||||
{
|
||||
hs->catalog_xmin = skhs->catalog_xmin;
|
||||
hs->ts = skhs->ts;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (hs->xmin.value == ~0)
|
||||
hs->xmin = InvalidFullTransactionId;
|
||||
if (hs->catalog_xmin.value == ~0)
|
||||
hs->catalog_xmin = InvalidFullTransactionId;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1946,14 +1942,28 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk)
|
||||
}
|
||||
|
||||
CombineHotStanbyFeedbacks(&hsFeedback, wp);
|
||||
if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0)
|
||||
if (memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0)
|
||||
{
|
||||
FullTransactionId xmin = hsFeedback.xmin;
|
||||
FullTransactionId catalog_xmin = hsFeedback.catalog_xmin;
|
||||
FullTransactionId next_xid = ReadNextFullTransactionId();
|
||||
/*
|
||||
* Page server is updating nextXid in checkpoint each 1024 transactions,
|
||||
* so feedback xmin can be actually larger then nextXid and
|
||||
* function TransactionIdInRecentPast return false in this case,
|
||||
* preventing update of slot's xmin.
|
||||
*/
|
||||
if (FullTransactionIdPrecedes(next_xid, xmin))
|
||||
xmin = next_xid;
|
||||
if (FullTransactionIdPrecedes(next_xid, catalog_xmin))
|
||||
catalog_xmin = next_xid;
|
||||
agg_hs_feedback = hsFeedback;
|
||||
elog(DEBUG2, "ProcessStandbyHSFeedback(xmin=%d, catalog_xmin=%d", XidFromFullTransactionId(hsFeedback.xmin), XidFromFullTransactionId(hsFeedback.catalog_xmin));
|
||||
ProcessStandbyHSFeedback(hsFeedback.ts,
|
||||
XidFromFullTransactionId(hsFeedback.xmin),
|
||||
EpochFromFullTransactionId(hsFeedback.xmin),
|
||||
XidFromFullTransactionId(hsFeedback.catalog_xmin),
|
||||
EpochFromFullTransactionId(hsFeedback.catalog_xmin));
|
||||
XidFromFullTransactionId(xmin),
|
||||
EpochFromFullTransactionId(xmin),
|
||||
XidFromFullTransactionId(catalog_xmin),
|
||||
EpochFromFullTransactionId(catalog_xmin));
|
||||
}
|
||||
|
||||
CheckGracefulShutdown(wp);
|
||||
|
||||
21
poetry.lock
generated
21
poetry.lock
generated
@@ -2405,6 +2405,7 @@ files = [
|
||||
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
|
||||
@@ -2529,13 +2530,13 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "requests"
|
||||
version = "2.31.0"
|
||||
version = "2.32.0"
|
||||
description = "Python HTTP for Humans."
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"},
|
||||
{file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"},
|
||||
{file = "requests-2.32.0-py3-none-any.whl", hash = "sha256:f2c3881dddb70d056c5bd7600a4fae312b2a300e39be6a118d30b90bd27262b5"},
|
||||
{file = "requests-2.32.0.tar.gz", hash = "sha256:fa5490319474c82ef1d2c9bc459d3652e3ae4ef4c4ebdd18a21145a47ca4b6b8"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -2959,6 +2960,16 @@ files = [
|
||||
{file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
|
||||
{file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
|
||||
{file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
|
||||
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
|
||||
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
|
||||
{file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
|
||||
@@ -3196,4 +3207,4 @@ cffi = ["cffi (>=1.11)"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "dcde14c58a32bda5f123319a069352c458b3719f3c62977991eebb9803a46a9e"
|
||||
content-hash = "16ebd6a46768be7f67dbdb4ee5903b167d94edc9965f29252f038c67e9e907b0"
|
||||
|
||||
@@ -9,6 +9,7 @@ default = []
|
||||
testing = []
|
||||
|
||||
[dependencies]
|
||||
ahash.workspace = true
|
||||
anyhow.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-trait.workspace = true
|
||||
@@ -24,8 +25,10 @@ camino.workspace = true
|
||||
chrono.workspace = true
|
||||
clap.workspace = true
|
||||
consumption_metrics.workspace = true
|
||||
crossbeam-deque.workspace = true
|
||||
dashmap.workspace = true
|
||||
env_logger.workspace = true
|
||||
framed-websockets.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
hashbrown.workspace = true
|
||||
@@ -35,7 +38,6 @@ hmac.workspace = true
|
||||
hostname.workspace = true
|
||||
http.workspace = true
|
||||
humantime.workspace = true
|
||||
hyper-tungstenite.workspace = true
|
||||
hyper.workspace = true
|
||||
hyper1 = { package = "hyper", version = "1.2", features = ["server"] }
|
||||
hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] }
|
||||
@@ -52,7 +54,6 @@ opentelemetry.workspace = true
|
||||
parking_lot.workspace = true
|
||||
parquet.workspace = true
|
||||
parquet_derive.workspace = true
|
||||
pbkdf2 = { workspace = true, features = ["simple", "std"] }
|
||||
pin-project-lite.workspace = true
|
||||
postgres_backend.workspace = true
|
||||
pq_proto.workspace = true
|
||||
@@ -76,12 +77,12 @@ smol_str.workspace = true
|
||||
smallvec.workspace = true
|
||||
socket2.workspace = true
|
||||
subtle.workspace = true
|
||||
sync_wrapper.workspace = true
|
||||
task-local-extensions.workspace = true
|
||||
thiserror.workspace = true
|
||||
tikv-jemallocator.workspace = true
|
||||
tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] }
|
||||
tokio-postgres.workspace = true
|
||||
tokio-postgres-rustls.workspace = true
|
||||
tokio-rustls.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tokio = { workspace = true, features = ["signal"] }
|
||||
@@ -96,8 +97,6 @@ utils.workspace = true
|
||||
uuid.workspace = true
|
||||
webpki-roots.workspace = true
|
||||
x509-parser.workspace = true
|
||||
native-tls.workspace = true
|
||||
postgres-native-tls.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
redis.workspace = true
|
||||
|
||||
@@ -106,6 +105,8 @@ workspace_hack.workspace = true
|
||||
[dev-dependencies]
|
||||
camino-tempfile.workspace = true
|
||||
fallible-iterator.workspace = true
|
||||
tokio-tungstenite.workspace = true
|
||||
pbkdf2 = { workspace = true, features = ["simple", "std"] }
|
||||
rcgen.workspace = true
|
||||
rstest.workspace = true
|
||||
tokio-postgres-rustls.workspace = true
|
||||
|
||||
@@ -365,7 +365,10 @@ async fn authenticate_with_secret(
|
||||
config: &'static AuthenticationConfig,
|
||||
) -> auth::Result<ComputeCredentials> {
|
||||
if let Some(password) = unauthenticated_password {
|
||||
let auth_outcome = validate_password_and_exchange(&password, secret).await?;
|
||||
let ep = EndpointIdInt::from(&info.endpoint);
|
||||
|
||||
let auth_outcome =
|
||||
validate_password_and_exchange(&config.thread_pool, ep, &password, secret).await?;
|
||||
let keys = match auth_outcome {
|
||||
crate::sasl::Outcome::Success(key) => key,
|
||||
crate::sasl::Outcome::Failure(reason) => {
|
||||
@@ -386,7 +389,7 @@ async fn authenticate_with_secret(
|
||||
// Currently, we use it for websocket connections (latency).
|
||||
if allow_cleartext {
|
||||
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
|
||||
return hacks::authenticate_cleartext(ctx, info, client, secret).await;
|
||||
return hacks::authenticate_cleartext(ctx, info, client, secret, config).await;
|
||||
}
|
||||
|
||||
// Finally, proceed with the main auth flow (SCRAM-based).
|
||||
@@ -554,7 +557,7 @@ mod tests {
|
||||
context::RequestMonitoring,
|
||||
proxy::NeonOptions,
|
||||
rate_limiter::{EndpointRateLimiter, RateBucketInfo},
|
||||
scram::ServerSecret,
|
||||
scram::{threadpool::ThreadPool, ServerSecret},
|
||||
stream::{PqStream, Stream},
|
||||
};
|
||||
|
||||
@@ -596,6 +599,7 @@ mod tests {
|
||||
}
|
||||
|
||||
static CONFIG: Lazy<AuthenticationConfig> = Lazy::new(|| AuthenticationConfig {
|
||||
thread_pool: ThreadPool::new(1),
|
||||
scram_protocol_timeout: std::time::Duration::from_secs(5),
|
||||
rate_limiter_enabled: true,
|
||||
rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET),
|
||||
|
||||
@@ -3,8 +3,10 @@ use super::{
|
||||
};
|
||||
use crate::{
|
||||
auth::{self, AuthFlow},
|
||||
config::AuthenticationConfig,
|
||||
console::AuthSecret,
|
||||
context::RequestMonitoring,
|
||||
intern::EndpointIdInt,
|
||||
sasl,
|
||||
stream::{self, Stream},
|
||||
};
|
||||
@@ -20,6 +22,7 @@ pub async fn authenticate_cleartext(
|
||||
info: ComputeUserInfo,
|
||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
secret: AuthSecret,
|
||||
config: &'static AuthenticationConfig,
|
||||
) -> auth::Result<ComputeCredentials> {
|
||||
warn!("cleartext auth flow override is enabled, proceeding");
|
||||
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
|
||||
@@ -27,8 +30,14 @@ pub async fn authenticate_cleartext(
|
||||
// pause the timer while we communicate with the client
|
||||
let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
|
||||
|
||||
let ep = EndpointIdInt::from(&info.endpoint);
|
||||
|
||||
let auth_flow = AuthFlow::new(client)
|
||||
.begin(auth::CleartextPassword(secret))
|
||||
.begin(auth::CleartextPassword {
|
||||
secret,
|
||||
endpoint: ep,
|
||||
pool: config.thread_pool.clone(),
|
||||
})
|
||||
.await?;
|
||||
drop(paused);
|
||||
// cleartext auth is only allowed to the ws/http protocol.
|
||||
|
||||
@@ -5,12 +5,14 @@ use crate::{
|
||||
config::TlsServerEndPoint,
|
||||
console::AuthSecret,
|
||||
context::RequestMonitoring,
|
||||
sasl, scram,
|
||||
intern::EndpointIdInt,
|
||||
sasl,
|
||||
scram::{self, threadpool::ThreadPool},
|
||||
stream::{PqStream, Stream},
|
||||
};
|
||||
use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS};
|
||||
use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be};
|
||||
use std::io;
|
||||
use std::{io, sync::Arc};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::info;
|
||||
|
||||
@@ -53,7 +55,11 @@ impl AuthMethod for PasswordHack {
|
||||
|
||||
/// Use clear-text password auth called `password` in docs
|
||||
/// <https://www.postgresql.org/docs/current/auth-password.html>
|
||||
pub struct CleartextPassword(pub AuthSecret);
|
||||
pub struct CleartextPassword {
|
||||
pub pool: Arc<ThreadPool>,
|
||||
pub endpoint: EndpointIdInt,
|
||||
pub secret: AuthSecret,
|
||||
}
|
||||
|
||||
impl AuthMethod for CleartextPassword {
|
||||
#[inline(always)]
|
||||
@@ -126,7 +132,13 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
|
||||
.strip_suffix(&[0])
|
||||
.ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?;
|
||||
|
||||
let outcome = validate_password_and_exchange(password, self.state.0).await?;
|
||||
let outcome = validate_password_and_exchange(
|
||||
&self.state.pool,
|
||||
self.state.endpoint,
|
||||
password,
|
||||
self.state.secret,
|
||||
)
|
||||
.await?;
|
||||
|
||||
if let sasl::Outcome::Success(_) = &outcome {
|
||||
self.stream.write_message_noflush(&Be::AuthenticationOk)?;
|
||||
@@ -181,6 +193,8 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
|
||||
}
|
||||
|
||||
pub(crate) async fn validate_password_and_exchange(
|
||||
pool: &ThreadPool,
|
||||
endpoint: EndpointIdInt,
|
||||
password: &[u8],
|
||||
secret: AuthSecret,
|
||||
) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
|
||||
@@ -194,7 +208,7 @@ pub(crate) async fn validate_password_and_exchange(
|
||||
}
|
||||
// perform scram authentication as both client and server to validate the keys
|
||||
AuthSecret::Scram(scram_secret) => {
|
||||
let outcome = crate::scram::exchange(&scram_secret, password).await?;
|
||||
let outcome = crate::scram::exchange(pool, endpoint, &scram_secret, password).await?;
|
||||
|
||||
let client_key = match outcome {
|
||||
sasl::Outcome::Success(client_key) => client_key,
|
||||
|
||||
@@ -27,6 +27,7 @@ use proxy::redis::cancellation_publisher::RedisPublisherClient;
|
||||
use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
|
||||
use proxy::redis::elasticache;
|
||||
use proxy::redis::notifications;
|
||||
use proxy::scram::threadpool::ThreadPool;
|
||||
use proxy::serverless::cancel_set::CancelSet;
|
||||
use proxy::serverless::GlobalConnPoolOptions;
|
||||
use proxy::usage_metrics;
|
||||
@@ -132,6 +133,9 @@ struct ProxyCliArgs {
|
||||
/// timeout for scram authentication protocol
|
||||
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
|
||||
scram_protocol_timeout: tokio::time::Duration,
|
||||
/// size of the threadpool for password hashing
|
||||
#[clap(long, default_value_t = 4)]
|
||||
scram_thread_pool_size: u8,
|
||||
/// Require that all incoming requests have a Proxy Protocol V2 packet **and** have an IP address associated.
|
||||
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||
require_client_ip: bool,
|
||||
@@ -489,6 +493,9 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
/// ProxyConfig is created at proxy startup, and lives forever.
|
||||
fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
let thread_pool = ThreadPool::new(args.scram_thread_pool_size);
|
||||
Metrics::install(thread_pool.metrics.clone());
|
||||
|
||||
let tls_config = match (&args.tls_key, &args.tls_cert) {
|
||||
(Some(key_path), Some(cert_path)) => Some(config::configure_tls(
|
||||
key_path,
|
||||
@@ -624,6 +631,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
|
||||
};
|
||||
let authentication_config = AuthenticationConfig {
|
||||
thread_pool,
|
||||
scram_protocol_timeout: args.scram_protocol_timeout,
|
||||
rate_limiter_enabled: args.auth_rate_limit_enabled,
|
||||
rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()),
|
||||
|
||||
@@ -11,10 +11,12 @@ use crate::{
|
||||
use futures::{FutureExt, TryFutureExt};
|
||||
use itertools::Itertools;
|
||||
use pq_proto::StartupMessageParams;
|
||||
use std::{io, net::SocketAddr, time::Duration};
|
||||
use rustls::{client::danger::ServerCertVerifier, pki_types::InvalidDnsNameError};
|
||||
use std::{io, net::SocketAddr, sync::Arc, time::Duration};
|
||||
use thiserror::Error;
|
||||
use tokio::net::TcpStream;
|
||||
use tokio_postgres::tls::MakeTlsConnect;
|
||||
use tokio_postgres_rustls::MakeRustlsConnect;
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node";
|
||||
@@ -30,7 +32,7 @@ pub enum ConnectionError {
|
||||
CouldNotConnect(#[from] io::Error),
|
||||
|
||||
#[error("{COULD_NOT_CONNECT}: {0}")]
|
||||
TlsError(#[from] native_tls::Error),
|
||||
TlsError(#[from] InvalidDnsNameError),
|
||||
|
||||
#[error("{COULD_NOT_CONNECT}: {0}")]
|
||||
WakeComputeError(#[from] WakeComputeError),
|
||||
@@ -257,7 +259,7 @@ pub struct PostgresConnection {
|
||||
/// Socket connected to a compute node.
|
||||
pub stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream<
|
||||
tokio::net::TcpStream,
|
||||
postgres_native_tls::TlsStream<tokio::net::TcpStream>,
|
||||
tokio_postgres_rustls::RustlsStream<tokio::net::TcpStream>,
|
||||
>,
|
||||
/// PostgreSQL connection parameters.
|
||||
pub params: std::collections::HashMap<String, String>,
|
||||
@@ -282,12 +284,24 @@ impl ConnCfg {
|
||||
let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
|
||||
drop(pause);
|
||||
|
||||
let tls_connector = native_tls::TlsConnector::builder()
|
||||
.danger_accept_invalid_certs(allow_self_signed_compute)
|
||||
.build()
|
||||
.unwrap();
|
||||
let mut mk_tls = postgres_native_tls::MakeTlsConnector::new(tls_connector);
|
||||
let tls = MakeTlsConnect::<tokio::net::TcpStream>::make_tls_connect(&mut mk_tls, host)?;
|
||||
let client_config = if allow_self_signed_compute {
|
||||
let verifier = Arc::new(AcceptEverythingVerifier) as Arc<dyn ServerCertVerifier>;
|
||||
rustls::ClientConfig::builder()
|
||||
.dangerous()
|
||||
.with_custom_certificate_verifier(verifier)
|
||||
} else {
|
||||
let root_store = rustls::RootCertStore {
|
||||
roots: webpki_roots::TLS_SERVER_ROOTS.to_vec(),
|
||||
};
|
||||
rustls::ClientConfig::builder().with_root_certificates(root_store)
|
||||
};
|
||||
let client_config = client_config.with_no_client_auth();
|
||||
|
||||
let mut mk_tls = tokio_postgres_rustls::MakeRustlsConnect::new(client_config);
|
||||
let tls = <MakeRustlsConnect as MakeTlsConnect<tokio::net::TcpStream>>::make_tls_connect(
|
||||
&mut mk_tls,
|
||||
host,
|
||||
)?;
|
||||
|
||||
// connect_raw() will not use TLS if sslmode is "disable"
|
||||
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
|
||||
@@ -340,6 +354,50 @@ fn filtered_options(params: &StartupMessageParams) -> Option<String> {
|
||||
Some(options)
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct AcceptEverythingVerifier;
|
||||
impl ServerCertVerifier for AcceptEverythingVerifier {
|
||||
fn supported_verify_schemes(&self) -> Vec<rustls::SignatureScheme> {
|
||||
use rustls::SignatureScheme::*;
|
||||
// The schemes for which `SignatureScheme::supported_in_tls13` returns true.
|
||||
vec![
|
||||
ECDSA_NISTP521_SHA512,
|
||||
ECDSA_NISTP384_SHA384,
|
||||
ECDSA_NISTP256_SHA256,
|
||||
RSA_PSS_SHA512,
|
||||
RSA_PSS_SHA384,
|
||||
RSA_PSS_SHA256,
|
||||
ED25519,
|
||||
]
|
||||
}
|
||||
fn verify_server_cert(
|
||||
&self,
|
||||
_end_entity: &rustls::pki_types::CertificateDer<'_>,
|
||||
_intermediates: &[rustls::pki_types::CertificateDer<'_>],
|
||||
_server_name: &rustls::pki_types::ServerName<'_>,
|
||||
_ocsp_response: &[u8],
|
||||
_now: rustls::pki_types::UnixTime,
|
||||
) -> Result<rustls::client::danger::ServerCertVerified, rustls::Error> {
|
||||
Ok(rustls::client::danger::ServerCertVerified::assertion())
|
||||
}
|
||||
fn verify_tls12_signature(
|
||||
&self,
|
||||
_message: &[u8],
|
||||
_cert: &rustls::pki_types::CertificateDer<'_>,
|
||||
_dss: &rustls::DigitallySignedStruct,
|
||||
) -> Result<rustls::client::danger::HandshakeSignatureValid, rustls::Error> {
|
||||
Ok(rustls::client::danger::HandshakeSignatureValid::assertion())
|
||||
}
|
||||
fn verify_tls13_signature(
|
||||
&self,
|
||||
_message: &[u8],
|
||||
_cert: &rustls::pki_types::CertificateDer<'_>,
|
||||
_dss: &rustls::DigitallySignedStruct,
|
||||
) -> Result<rustls::client::danger::HandshakeSignatureValid, rustls::Error> {
|
||||
Ok(rustls::client::danger::HandshakeSignatureValid::assertion())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
@@ -2,6 +2,7 @@ use crate::{
|
||||
auth::{self, backend::AuthRateLimiter},
|
||||
console::locks::ApiLocks,
|
||||
rate_limiter::RateBucketInfo,
|
||||
scram::threadpool::ThreadPool,
|
||||
serverless::{cancel_set::CancelSet, GlobalConnPoolOptions},
|
||||
Host,
|
||||
};
|
||||
@@ -61,6 +62,7 @@ pub struct HttpConfig {
|
||||
}
|
||||
|
||||
pub struct AuthenticationConfig {
|
||||
pub thread_pool: Arc<ThreadPool>,
|
||||
pub scram_protocol_timeout: tokio::time::Duration,
|
||||
pub rate_limiter_enabled: bool,
|
||||
pub rate_limiter: AuthRateLimiter,
|
||||
|
||||
@@ -307,7 +307,7 @@ where
|
||||
}
|
||||
|
||||
async fn upload_parquet(
|
||||
w: SerializedFileWriter<Writer<BytesMut>>,
|
||||
mut w: SerializedFileWriter<Writer<BytesMut>>,
|
||||
len: i64,
|
||||
storage: &GenericRemoteStorage,
|
||||
) -> anyhow::Result<Writer<BytesMut>> {
|
||||
@@ -319,11 +319,15 @@ async fn upload_parquet(
|
||||
|
||||
// I don't know how compute intensive this is, although it probably isn't much... better be safe than sorry.
|
||||
// finish method only available on the fork: https://github.com/apache/arrow-rs/issues/5253
|
||||
let (writer, metadata) = tokio::task::spawn_blocking(move || w.finish())
|
||||
let (mut buffer, metadata) =
|
||||
tokio::task::spawn_blocking(move || -> parquet::errors::Result<_> {
|
||||
let metadata = w.finish()?;
|
||||
let buffer = std::mem::take(w.inner_mut().get_mut());
|
||||
Ok((buffer, metadata))
|
||||
})
|
||||
.await
|
||||
.unwrap()?;
|
||||
|
||||
let mut buffer = writer.into_inner();
|
||||
let data = buffer.split().freeze();
|
||||
|
||||
let compression = len as f64 / len_uncompressed as f64;
|
||||
@@ -351,7 +355,7 @@ async fn upload_parquet(
|
||||
"{year:04}/{month:02}/{day:02}/{hour:02}/requests_{id}.parquet"
|
||||
))?;
|
||||
let cancel = CancellationToken::new();
|
||||
backoff::retry(
|
||||
let maybe_err = backoff::retry(
|
||||
|| async {
|
||||
let stream = futures::stream::once(futures::future::ready(Ok(data.clone())));
|
||||
storage
|
||||
@@ -368,7 +372,12 @@ async fn upload_parquet(
|
||||
.await
|
||||
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
|
||||
.and_then(|x| x)
|
||||
.context("request_data_upload")?;
|
||||
.context("request_data_upload")
|
||||
.err();
|
||||
|
||||
if let Some(err) = maybe_err {
|
||||
tracing::warn!(%id, %err, "failed to upload request data");
|
||||
}
|
||||
|
||||
Ok(buffer.writer())
|
||||
}
|
||||
@@ -474,10 +483,11 @@ mod tests {
|
||||
RequestData {
|
||||
session_id: uuid::Builder::from_random_bytes(rng.gen()).into_uuid(),
|
||||
peer_addr: Ipv4Addr::from(rng.gen::<[u8; 4]>()).to_string(),
|
||||
timestamp: chrono::NaiveDateTime::from_timestamp_millis(
|
||||
timestamp: chrono::DateTime::from_timestamp_millis(
|
||||
rng.gen_range(1703862754..1803862754),
|
||||
)
|
||||
.unwrap(),
|
||||
.unwrap()
|
||||
.naive_utc(),
|
||||
application_name: Some("test".to_owned()),
|
||||
username: Some(hex::encode(rng.gen::<[u8; 4]>())),
|
||||
endpoint_id: Some(hex::encode(rng.gen::<[u8; 16]>())),
|
||||
@@ -560,15 +570,15 @@ mod tests {
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[
|
||||
(1315008, 3, 6000),
|
||||
(1315001, 3, 6000),
|
||||
(1315061, 3, 6000),
|
||||
(1315018, 3, 6000),
|
||||
(1315148, 3, 6000),
|
||||
(1314990, 3, 6000),
|
||||
(1314782, 3, 6000),
|
||||
(1315018, 3, 6000),
|
||||
(438575, 1, 2000)
|
||||
(1315314, 3, 6000),
|
||||
(1315307, 3, 6000),
|
||||
(1315367, 3, 6000),
|
||||
(1315324, 3, 6000),
|
||||
(1315454, 3, 6000),
|
||||
(1315296, 3, 6000),
|
||||
(1315088, 3, 6000),
|
||||
(1315324, 3, 6000),
|
||||
(438713, 1, 2000)
|
||||
]
|
||||
);
|
||||
|
||||
@@ -598,11 +608,11 @@ mod tests {
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[
|
||||
(1221738, 5, 10000),
|
||||
(1227888, 5, 10000),
|
||||
(1229682, 5, 10000),
|
||||
(1229044, 5, 10000),
|
||||
(1220322, 5, 10000)
|
||||
(1222212, 5, 10000),
|
||||
(1228362, 5, 10000),
|
||||
(1230156, 5, 10000),
|
||||
(1229518, 5, 10000),
|
||||
(1220796, 5, 10000)
|
||||
]
|
||||
);
|
||||
|
||||
@@ -634,11 +644,11 @@ mod tests {
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[
|
||||
(1207385, 5, 10000),
|
||||
(1207116, 5, 10000),
|
||||
(1207409, 5, 10000),
|
||||
(1207397, 5, 10000),
|
||||
(1207652, 5, 10000)
|
||||
(1207859, 5, 10000),
|
||||
(1207590, 5, 10000),
|
||||
(1207883, 5, 10000),
|
||||
(1207871, 5, 10000),
|
||||
(1208126, 5, 10000)
|
||||
]
|
||||
);
|
||||
|
||||
@@ -663,15 +673,15 @@ mod tests {
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[
|
||||
(1315008, 3, 6000),
|
||||
(1315001, 3, 6000),
|
||||
(1315061, 3, 6000),
|
||||
(1315018, 3, 6000),
|
||||
(1315148, 3, 6000),
|
||||
(1314990, 3, 6000),
|
||||
(1314782, 3, 6000),
|
||||
(1315018, 3, 6000),
|
||||
(438575, 1, 2000)
|
||||
(1315314, 3, 6000),
|
||||
(1315307, 3, 6000),
|
||||
(1315367, 3, 6000),
|
||||
(1315324, 3, 6000),
|
||||
(1315454, 3, 6000),
|
||||
(1315296, 3, 6000),
|
||||
(1315088, 3, 6000),
|
||||
(1315324, 3, 6000),
|
||||
(438713, 1, 2000)
|
||||
]
|
||||
);
|
||||
|
||||
@@ -708,7 +718,7 @@ mod tests {
|
||||
// files are smaller than the size threshold, but they took too long to fill so were flushed early
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[(659240, 2, 3001), (658954, 2, 3000), (658750, 2, 2999)]
|
||||
[(659462, 2, 3001), (659176, 2, 3000), (658972, 2, 2999)]
|
||||
);
|
||||
|
||||
tmpdir.close().unwrap();
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
use std::sync::OnceLock;
|
||||
use std::sync::{Arc, OnceLock};
|
||||
|
||||
use lasso::ThreadedRodeo;
|
||||
use measured::{
|
||||
label::StaticLabelSet,
|
||||
label::{FixedCardinalitySet, LabelName, LabelSet, LabelValue, StaticLabelSet},
|
||||
metric::{histogram::Thresholds, name::MetricName},
|
||||
Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup,
|
||||
MetricGroup,
|
||||
Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec,
|
||||
LabelGroup, MetricGroup,
|
||||
};
|
||||
use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec};
|
||||
|
||||
@@ -14,26 +14,36 @@ use tokio::time::{self, Instant};
|
||||
use crate::console::messages::ColdStartInfo;
|
||||
|
||||
#[derive(MetricGroup)]
|
||||
#[metric(new(thread_pool: Arc<ThreadPoolMetrics>))]
|
||||
pub struct Metrics {
|
||||
#[metric(namespace = "proxy")]
|
||||
#[metric(init = ProxyMetrics::new(thread_pool))]
|
||||
pub proxy: ProxyMetrics,
|
||||
|
||||
#[metric(namespace = "wake_compute_lock")]
|
||||
pub wake_compute_lock: ApiLockMetrics,
|
||||
}
|
||||
|
||||
static SELF: OnceLock<Metrics> = OnceLock::new();
|
||||
impl Metrics {
|
||||
pub fn install(thread_pool: Arc<ThreadPoolMetrics>) {
|
||||
SELF.set(Metrics::new(thread_pool))
|
||||
.ok()
|
||||
.expect("proxy metrics must not be installed more than once");
|
||||
}
|
||||
|
||||
pub fn get() -> &'static Self {
|
||||
static SELF: OnceLock<Metrics> = OnceLock::new();
|
||||
SELF.get_or_init(|| Metrics {
|
||||
proxy: ProxyMetrics::default(),
|
||||
wake_compute_lock: ApiLockMetrics::new(),
|
||||
})
|
||||
#[cfg(test)]
|
||||
return SELF.get_or_init(|| Metrics::new(Arc::new(ThreadPoolMetrics::new(0))));
|
||||
|
||||
#[cfg(not(test))]
|
||||
SELF.get()
|
||||
.expect("proxy metrics must be installed by the main() function")
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(MetricGroup)]
|
||||
#[metric(new())]
|
||||
#[metric(new(thread_pool: Arc<ThreadPoolMetrics>))]
|
||||
pub struct ProxyMetrics {
|
||||
#[metric(flatten)]
|
||||
pub db_connections: CounterPairVec<NumDbConnectionsGauge>,
|
||||
@@ -129,6 +139,10 @@ pub struct ProxyMetrics {
|
||||
|
||||
#[metric(namespace = "connect_compute_lock")]
|
||||
pub connect_compute_lock: ApiLockMetrics,
|
||||
|
||||
#[metric(namespace = "scram_pool")]
|
||||
#[metric(init = thread_pool)]
|
||||
pub scram_pool: Arc<ThreadPoolMetrics>,
|
||||
}
|
||||
|
||||
#[derive(MetricGroup)]
|
||||
@@ -146,12 +160,6 @@ pub struct ApiLockMetrics {
|
||||
pub semaphore_acquire_seconds: Histogram<16>,
|
||||
}
|
||||
|
||||
impl Default for ProxyMetrics {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ApiLockMetrics {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
@@ -553,3 +561,52 @@ pub enum RedisEventsCount {
|
||||
PasswordUpdate,
|
||||
AllowedIpsUpdate,
|
||||
}
|
||||
|
||||
pub struct ThreadPoolWorkers(usize);
|
||||
pub struct ThreadPoolWorkerId(pub usize);
|
||||
|
||||
impl LabelValue for ThreadPoolWorkerId {
|
||||
fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
|
||||
v.write_int(self.0 as i64)
|
||||
}
|
||||
}
|
||||
|
||||
impl LabelGroup for ThreadPoolWorkerId {
|
||||
fn visit_values(&self, v: &mut impl measured::label::LabelGroupVisitor) {
|
||||
v.write_value(LabelName::from_str("worker"), self);
|
||||
}
|
||||
}
|
||||
|
||||
impl LabelSet for ThreadPoolWorkers {
|
||||
type Value<'a> = ThreadPoolWorkerId;
|
||||
|
||||
fn dynamic_cardinality(&self) -> Option<usize> {
|
||||
Some(self.0)
|
||||
}
|
||||
|
||||
fn encode(&self, value: Self::Value<'_>) -> Option<usize> {
|
||||
(value.0 < self.0).then_some(value.0)
|
||||
}
|
||||
|
||||
fn decode(&self, value: usize) -> Self::Value<'_> {
|
||||
ThreadPoolWorkerId(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl FixedCardinalitySet for ThreadPoolWorkers {
|
||||
fn cardinality(&self) -> usize {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(MetricGroup)]
|
||||
#[metric(new(workers: usize))]
|
||||
pub struct ThreadPoolMetrics {
|
||||
pub injector_queue_depth: Gauge,
|
||||
#[metric(init = GaugeVec::with_label_set(ThreadPoolWorkers(workers)))]
|
||||
pub worker_queue_depth: GaugeVec<ThreadPoolWorkers>,
|
||||
#[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))]
|
||||
pub worker_task_turns_total: CounterVec<ThreadPoolWorkers>,
|
||||
#[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))]
|
||||
pub worker_task_skips_total: CounterVec<ThreadPoolWorkers>,
|
||||
}
|
||||
|
||||
@@ -6,11 +6,14 @@
|
||||
//! * <https://github.com/postgres/postgres/blob/94226d4506e66d6e7cbf4b391f1e7393c1962841/src/backend/libpq/auth-scram.c>
|
||||
//! * <https://github.com/postgres/postgres/blob/94226d4506e66d6e7cbf4b391f1e7393c1962841/src/interfaces/libpq/fe-auth-scram.c>
|
||||
|
||||
mod countmin;
|
||||
mod exchange;
|
||||
mod key;
|
||||
mod messages;
|
||||
mod pbkdf2;
|
||||
mod secret;
|
||||
mod signature;
|
||||
pub mod threadpool;
|
||||
|
||||
pub use exchange::{exchange, Exchange};
|
||||
pub use key::ScramKey;
|
||||
@@ -56,9 +59,13 @@ fn sha256<'a>(parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::sasl::{Mechanism, Step};
|
||||
use crate::{
|
||||
intern::EndpointIdInt,
|
||||
sasl::{Mechanism, Step},
|
||||
EndpointId,
|
||||
};
|
||||
|
||||
use super::{Exchange, ServerSecret};
|
||||
use super::{threadpool::ThreadPool, Exchange, ServerSecret};
|
||||
|
||||
#[test]
|
||||
fn snapshot() {
|
||||
@@ -112,8 +119,13 @@ mod tests {
|
||||
}
|
||||
|
||||
async fn run_round_trip_test(server_password: &str, client_password: &str) {
|
||||
let pool = ThreadPool::new(1);
|
||||
|
||||
let ep = EndpointId::from("foo");
|
||||
let ep = EndpointIdInt::from(ep);
|
||||
|
||||
let scram_secret = ServerSecret::build(server_password).await.unwrap();
|
||||
let outcome = super::exchange(&scram_secret, client_password.as_bytes())
|
||||
let outcome = super::exchange(&pool, ep, &scram_secret, client_password.as_bytes())
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
|
||||
173
proxy/src/scram/countmin.rs
Normal file
173
proxy/src/scram/countmin.rs
Normal file
@@ -0,0 +1,173 @@
|
||||
use std::hash::Hash;
|
||||
|
||||
/// estimator of hash jobs per second.
|
||||
/// <https://en.wikipedia.org/wiki/Count%E2%80%93min_sketch>
|
||||
pub struct CountMinSketch {
|
||||
// one for each depth
|
||||
hashers: Vec<ahash::RandomState>,
|
||||
width: usize,
|
||||
depth: usize,
|
||||
// buckets, width*depth
|
||||
buckets: Vec<u32>,
|
||||
}
|
||||
|
||||
impl CountMinSketch {
|
||||
/// Given parameters (ε, δ),
|
||||
/// set width = ceil(e/ε)
|
||||
/// set depth = ceil(ln(1/δ))
|
||||
///
|
||||
/// guarantees:
|
||||
/// actual <= estimate
|
||||
/// estimate <= actual + ε * N with probability 1 - δ
|
||||
/// where N is the cardinality of the stream
|
||||
pub fn with_params(epsilon: f64, delta: f64) -> Self {
|
||||
CountMinSketch::new(
|
||||
(std::f64::consts::E / epsilon).ceil() as usize,
|
||||
(1.0_f64 / delta).ln().ceil() as usize,
|
||||
)
|
||||
}
|
||||
|
||||
fn new(width: usize, depth: usize) -> Self {
|
||||
Self {
|
||||
#[cfg(test)]
|
||||
hashers: (0..depth)
|
||||
.map(|i| {
|
||||
// digits of pi for good randomness
|
||||
ahash::RandomState::with_seeds(
|
||||
314159265358979323,
|
||||
84626433832795028,
|
||||
84197169399375105,
|
||||
82097494459230781 + i as u64,
|
||||
)
|
||||
})
|
||||
.collect(),
|
||||
#[cfg(not(test))]
|
||||
hashers: (0..depth).map(|_| ahash::RandomState::new()).collect(),
|
||||
width,
|
||||
depth,
|
||||
buckets: vec![0; width * depth],
|
||||
}
|
||||
}
|
||||
|
||||
pub fn inc_and_return<T: Hash>(&mut self, t: &T, x: u32) -> u32 {
|
||||
let mut min = u32::MAX;
|
||||
for row in 0..self.depth {
|
||||
let col = (self.hashers[row].hash_one(t) as usize) % self.width;
|
||||
|
||||
let row = &mut self.buckets[row * self.width..][..self.width];
|
||||
row[col] = row[col].saturating_add(x);
|
||||
min = std::cmp::min(min, row[col]);
|
||||
}
|
||||
min
|
||||
}
|
||||
|
||||
pub fn reset(&mut self) {
|
||||
self.buckets.clear();
|
||||
self.buckets.resize(self.width * self.depth, 0);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use rand::{rngs::StdRng, seq::SliceRandom, Rng, SeedableRng};
|
||||
|
||||
use super::CountMinSketch;
|
||||
|
||||
fn eval_precision(n: usize, p: f64, q: f64) -> usize {
|
||||
// fixed value of phi for consistent test
|
||||
let mut rng = StdRng::seed_from_u64(16180339887498948482);
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
let mut N = 0;
|
||||
|
||||
let mut ids = vec![];
|
||||
|
||||
for _ in 0..n {
|
||||
// number of insert operations
|
||||
let n = rng.gen_range(1..100);
|
||||
// number to insert at once
|
||||
let m = rng.gen_range(1..4096);
|
||||
|
||||
let id = uuid::Builder::from_random_bytes(rng.gen()).into_uuid();
|
||||
ids.push((id, n, m));
|
||||
|
||||
// N = sum(actual)
|
||||
N += n * m;
|
||||
}
|
||||
|
||||
// q% of counts will be within p of the actual value
|
||||
let mut sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q);
|
||||
|
||||
dbg!(sketch.buckets.len());
|
||||
|
||||
// insert a bunch of entries in a random order
|
||||
let mut ids2 = ids.clone();
|
||||
while !ids2.is_empty() {
|
||||
ids2.shuffle(&mut rng);
|
||||
|
||||
let mut i = 0;
|
||||
while i < ids2.len() {
|
||||
sketch.inc_and_return(&ids2[i].0, ids2[i].1);
|
||||
ids2[i].2 -= 1;
|
||||
if ids2[i].2 == 0 {
|
||||
ids2.remove(i);
|
||||
} else {
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut within_p = 0;
|
||||
for (id, n, m) in ids {
|
||||
let actual = n * m;
|
||||
let estimate = sketch.inc_and_return(&id, 0);
|
||||
|
||||
// This estimate has the guarantee that actual <= estimate
|
||||
assert!(actual <= estimate);
|
||||
|
||||
// This estimate has the guarantee that estimate <= actual + εN with probability 1 - δ.
|
||||
// ε = p / N, δ = 1 - q;
|
||||
// therefore, estimate <= actual + p with probability q.
|
||||
if estimate as f64 <= actual as f64 + p {
|
||||
within_p += 1;
|
||||
}
|
||||
}
|
||||
within_p
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn precision() {
|
||||
assert_eq!(eval_precision(100, 100.0, 0.99), 100);
|
||||
assert_eq!(eval_precision(1000, 100.0, 0.99), 1000);
|
||||
assert_eq!(eval_precision(100, 4096.0, 0.99), 100);
|
||||
assert_eq!(eval_precision(1000, 4096.0, 0.99), 1000);
|
||||
|
||||
// seems to be more precise than the literature indicates?
|
||||
// probably numbers are too small to truly represent the probabilities.
|
||||
assert_eq!(eval_precision(100, 4096.0, 0.90), 100);
|
||||
assert_eq!(eval_precision(1000, 4096.0, 0.90), 1000);
|
||||
assert_eq!(eval_precision(100, 4096.0, 0.1), 98);
|
||||
assert_eq!(eval_precision(1000, 4096.0, 0.1), 991);
|
||||
}
|
||||
|
||||
// returns memory usage in bytes, and the time complexity per insert.
|
||||
fn eval_cost(p: f64, q: f64) -> (usize, usize) {
|
||||
#[allow(non_snake_case)]
|
||||
// N = sum(actual)
|
||||
// Let's assume 1021 samples, all of 4096
|
||||
let N = 1021 * 4096;
|
||||
let sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q);
|
||||
|
||||
let memory = std::mem::size_of::<u32>() * sketch.buckets.len();
|
||||
let time = sketch.depth;
|
||||
(memory, time)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn memory_usage() {
|
||||
assert_eq!(eval_cost(100.0, 0.99), (2273580, 5));
|
||||
assert_eq!(eval_cost(4096.0, 0.99), (55520, 5));
|
||||
assert_eq!(eval_cost(4096.0, 0.90), (33312, 3));
|
||||
assert_eq!(eval_cost(4096.0, 0.1), (11104, 1));
|
||||
}
|
||||
}
|
||||
@@ -4,15 +4,17 @@ use std::convert::Infallible;
|
||||
|
||||
use hmac::{Hmac, Mac};
|
||||
use sha2::Sha256;
|
||||
use tokio::task::yield_now;
|
||||
|
||||
use super::messages::{
|
||||
ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN,
|
||||
};
|
||||
use super::pbkdf2::Pbkdf2;
|
||||
use super::secret::ServerSecret;
|
||||
use super::signature::SignatureBuilder;
|
||||
use super::threadpool::ThreadPool;
|
||||
use super::ScramKey;
|
||||
use crate::config;
|
||||
use crate::intern::EndpointIdInt;
|
||||
use crate::sasl::{self, ChannelBinding, Error as SaslError};
|
||||
|
||||
/// The only channel binding mode we currently support.
|
||||
@@ -74,37 +76,18 @@ impl<'a> Exchange<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
// copied from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L36-L61>
|
||||
async fn pbkdf2(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] {
|
||||
let hmac = Hmac::<Sha256>::new_from_slice(str).expect("HMAC is able to accept all key sizes");
|
||||
let mut prev = hmac
|
||||
.clone()
|
||||
.chain_update(salt)
|
||||
.chain_update(1u32.to_be_bytes())
|
||||
.finalize()
|
||||
.into_bytes();
|
||||
|
||||
let mut hi = prev;
|
||||
|
||||
for i in 1..iterations {
|
||||
prev = hmac.clone().chain_update(prev).finalize().into_bytes();
|
||||
|
||||
for (hi, prev) in hi.iter_mut().zip(prev) {
|
||||
*hi ^= prev;
|
||||
}
|
||||
// yield every ~250us
|
||||
// hopefully reduces tail latencies
|
||||
if i % 1024 == 0 {
|
||||
yield_now().await
|
||||
}
|
||||
}
|
||||
|
||||
hi.into()
|
||||
}
|
||||
|
||||
// copied from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L236-L248>
|
||||
async fn derive_client_key(password: &[u8], salt: &[u8], iterations: u32) -> ScramKey {
|
||||
let salted_password = pbkdf2(password, salt, iterations).await;
|
||||
async fn derive_client_key(
|
||||
pool: &ThreadPool,
|
||||
endpoint: EndpointIdInt,
|
||||
password: &[u8],
|
||||
salt: &[u8],
|
||||
iterations: u32,
|
||||
) -> ScramKey {
|
||||
let salted_password = pool
|
||||
.spawn_job(endpoint, Pbkdf2::start(password, salt, iterations))
|
||||
.await
|
||||
.expect("job should not be cancelled");
|
||||
|
||||
let make_key = |name| {
|
||||
let key = Hmac::<Sha256>::new_from_slice(&salted_password)
|
||||
@@ -119,11 +102,13 @@ async fn derive_client_key(password: &[u8], salt: &[u8], iterations: u32) -> Scr
|
||||
}
|
||||
|
||||
pub async fn exchange(
|
||||
pool: &ThreadPool,
|
||||
endpoint: EndpointIdInt,
|
||||
secret: &ServerSecret,
|
||||
password: &[u8],
|
||||
) -> sasl::Result<sasl::Outcome<super::ScramKey>> {
|
||||
let salt = base64::decode(&secret.salt_base64)?;
|
||||
let client_key = derive_client_key(password, &salt, secret.iterations).await;
|
||||
let client_key = derive_client_key(pool, endpoint, password, &salt, secret.iterations).await;
|
||||
|
||||
if secret.is_password_invalid(&client_key).into() {
|
||||
Ok(sasl::Outcome::Failure("password doesn't match"))
|
||||
|
||||
89
proxy/src/scram/pbkdf2.rs
Normal file
89
proxy/src/scram/pbkdf2.rs
Normal file
@@ -0,0 +1,89 @@
|
||||
use hmac::{
|
||||
digest::{consts::U32, generic_array::GenericArray},
|
||||
Hmac, Mac,
|
||||
};
|
||||
use sha2::Sha256;
|
||||
|
||||
pub struct Pbkdf2 {
|
||||
hmac: Hmac<Sha256>,
|
||||
prev: GenericArray<u8, U32>,
|
||||
hi: GenericArray<u8, U32>,
|
||||
iterations: u32,
|
||||
}
|
||||
|
||||
// inspired from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L36-L61>
|
||||
impl Pbkdf2 {
|
||||
pub fn start(str: &[u8], salt: &[u8], iterations: u32) -> Self {
|
||||
let hmac =
|
||||
Hmac::<Sha256>::new_from_slice(str).expect("HMAC is able to accept all key sizes");
|
||||
|
||||
let prev = hmac
|
||||
.clone()
|
||||
.chain_update(salt)
|
||||
.chain_update(1u32.to_be_bytes())
|
||||
.finalize()
|
||||
.into_bytes();
|
||||
|
||||
Self {
|
||||
hmac,
|
||||
// one consumed for the hash above
|
||||
iterations: iterations - 1,
|
||||
hi: prev,
|
||||
prev,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn cost(&self) -> u32 {
|
||||
(self.iterations).clamp(0, 4096)
|
||||
}
|
||||
|
||||
pub fn turn(&mut self) -> std::task::Poll<[u8; 32]> {
|
||||
let Self {
|
||||
hmac,
|
||||
prev,
|
||||
hi,
|
||||
iterations,
|
||||
} = self;
|
||||
|
||||
// only do 4096 iterations per turn before sharing the thread for fairness
|
||||
let n = (*iterations).clamp(0, 4096);
|
||||
for _ in 0..n {
|
||||
*prev = hmac.clone().chain_update(*prev).finalize().into_bytes();
|
||||
|
||||
for (hi, prev) in hi.iter_mut().zip(*prev) {
|
||||
*hi ^= prev;
|
||||
}
|
||||
}
|
||||
|
||||
*iterations -= n;
|
||||
if *iterations == 0 {
|
||||
std::task::Poll::Ready((*hi).into())
|
||||
} else {
|
||||
std::task::Poll::Pending
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::Pbkdf2;
|
||||
use pbkdf2::pbkdf2_hmac_array;
|
||||
use sha2::Sha256;
|
||||
|
||||
#[test]
|
||||
fn works() {
|
||||
let salt = b"sodium chloride";
|
||||
let pass = b"Ne0n_!5_50_C007";
|
||||
|
||||
let mut job = Pbkdf2::start(pass, salt, 600000);
|
||||
let hash = loop {
|
||||
let std::task::Poll::Ready(hash) = job.turn() else {
|
||||
continue;
|
||||
};
|
||||
break hash;
|
||||
};
|
||||
|
||||
let expected = pbkdf2_hmac_array::<Sha256, 32>(pass, salt, 600000);
|
||||
assert_eq!(hash, expected)
|
||||
}
|
||||
}
|
||||
321
proxy/src/scram/threadpool.rs
Normal file
321
proxy/src/scram/threadpool.rs
Normal file
@@ -0,0 +1,321 @@
|
||||
//! Custom threadpool implementation for password hashing.
|
||||
//!
|
||||
//! Requirements:
|
||||
//! 1. Fairness per endpoint.
|
||||
//! 2. Yield support for high iteration counts.
|
||||
|
||||
use std::sync::{
|
||||
atomic::{AtomicU64, Ordering},
|
||||
Arc,
|
||||
};
|
||||
|
||||
use crossbeam_deque::{Injector, Stealer, Worker};
|
||||
use itertools::Itertools;
|
||||
use parking_lot::{Condvar, Mutex};
|
||||
use rand::Rng;
|
||||
use rand::{rngs::SmallRng, SeedableRng};
|
||||
use tokio::sync::oneshot;
|
||||
|
||||
use crate::{
|
||||
intern::EndpointIdInt,
|
||||
metrics::{ThreadPoolMetrics, ThreadPoolWorkerId},
|
||||
scram::countmin::CountMinSketch,
|
||||
};
|
||||
|
||||
use super::pbkdf2::Pbkdf2;
|
||||
|
||||
pub struct ThreadPool {
|
||||
queue: Injector<JobSpec>,
|
||||
stealers: Vec<Stealer<JobSpec>>,
|
||||
parkers: Vec<(Condvar, Mutex<ThreadState>)>,
|
||||
/// bitpacked representation.
|
||||
/// lower 8 bits = number of sleeping threads
|
||||
/// next 8 bits = number of idle threads (searching for work)
|
||||
counters: AtomicU64,
|
||||
|
||||
pub metrics: Arc<ThreadPoolMetrics>,
|
||||
}
|
||||
|
||||
#[derive(PartialEq)]
|
||||
enum ThreadState {
|
||||
Parked,
|
||||
Active,
|
||||
}
|
||||
|
||||
impl ThreadPool {
|
||||
pub fn new(n_workers: u8) -> Arc<Self> {
|
||||
let workers = (0..n_workers).map(|_| Worker::new_fifo()).collect_vec();
|
||||
let stealers = workers.iter().map(|w| w.stealer()).collect_vec();
|
||||
|
||||
let parkers = (0..n_workers)
|
||||
.map(|_| (Condvar::new(), Mutex::new(ThreadState::Active)))
|
||||
.collect_vec();
|
||||
|
||||
let pool = Arc::new(Self {
|
||||
queue: Injector::new(),
|
||||
stealers,
|
||||
parkers,
|
||||
// threads start searching for work
|
||||
counters: AtomicU64::new((n_workers as u64) << 8),
|
||||
metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)),
|
||||
});
|
||||
|
||||
for (i, worker) in workers.into_iter().enumerate() {
|
||||
let pool = Arc::clone(&pool);
|
||||
std::thread::spawn(move || thread_rt(pool, worker, i));
|
||||
}
|
||||
|
||||
pool
|
||||
}
|
||||
|
||||
pub fn spawn_job(
|
||||
&self,
|
||||
endpoint: EndpointIdInt,
|
||||
pbkdf2: Pbkdf2,
|
||||
) -> oneshot::Receiver<[u8; 32]> {
|
||||
let (tx, rx) = oneshot::channel();
|
||||
|
||||
let queue_was_empty = self.queue.is_empty();
|
||||
|
||||
self.metrics.injector_queue_depth.inc();
|
||||
self.queue.push(JobSpec {
|
||||
response: tx,
|
||||
pbkdf2,
|
||||
endpoint,
|
||||
});
|
||||
|
||||
// inspired from <https://github.com/rayon-rs/rayon/blob/3e3962cb8f7b50773bcc360b48a7a674a53a2c77/rayon-core/src/sleep/mod.rs#L242>
|
||||
let counts = self.counters.load(Ordering::SeqCst);
|
||||
let num_awake_but_idle = (counts >> 8) & 0xff;
|
||||
let num_sleepers = counts & 0xff;
|
||||
|
||||
// If the queue is non-empty, then we always wake up a worker
|
||||
// -- clearly the existing idle jobs aren't enough. Otherwise,
|
||||
// check to see if we have enough idle workers.
|
||||
if !queue_was_empty || num_awake_but_idle == 0 {
|
||||
let num_to_wake = Ord::min(1, num_sleepers);
|
||||
self.wake_any_threads(num_to_wake);
|
||||
}
|
||||
|
||||
rx
|
||||
}
|
||||
|
||||
#[cold]
|
||||
fn wake_any_threads(&self, mut num_to_wake: u64) {
|
||||
if num_to_wake > 0 {
|
||||
for i in 0..self.parkers.len() {
|
||||
if self.wake_specific_thread(i) {
|
||||
num_to_wake -= 1;
|
||||
if num_to_wake == 0 {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn wake_specific_thread(&self, index: usize) -> bool {
|
||||
let (condvar, lock) = &self.parkers[index];
|
||||
|
||||
let mut state = lock.lock();
|
||||
if *state == ThreadState::Parked {
|
||||
condvar.notify_one();
|
||||
|
||||
// When the thread went to sleep, it will have incremented
|
||||
// this value. When we wake it, its our job to decrement
|
||||
// it. We could have the thread do it, but that would
|
||||
// introduce a delay between when the thread was
|
||||
// *notified* and when this counter was decremented. That
|
||||
// might mislead people with new work into thinking that
|
||||
// there are sleeping threads that they should try to
|
||||
// wake, when in fact there is nothing left for them to
|
||||
// do.
|
||||
self.counters.fetch_sub(1, Ordering::SeqCst);
|
||||
*state = ThreadState::Active;
|
||||
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
fn steal(&self, rng: &mut impl Rng, skip: usize, worker: &Worker<JobSpec>) -> Option<JobSpec> {
|
||||
// announce thread as idle
|
||||
self.counters.fetch_add(256, Ordering::SeqCst);
|
||||
|
||||
// try steal from the global queue
|
||||
loop {
|
||||
match self.queue.steal_batch_and_pop(worker) {
|
||||
crossbeam_deque::Steal::Success(job) => {
|
||||
self.metrics
|
||||
.injector_queue_depth
|
||||
.set(self.queue.len() as i64);
|
||||
// no longer idle
|
||||
self.counters.fetch_sub(256, Ordering::SeqCst);
|
||||
return Some(job);
|
||||
}
|
||||
crossbeam_deque::Steal::Retry => continue,
|
||||
crossbeam_deque::Steal::Empty => break,
|
||||
}
|
||||
}
|
||||
|
||||
// try steal from our neighbours
|
||||
loop {
|
||||
let mut retry = false;
|
||||
let start = rng.gen_range(0..self.stealers.len());
|
||||
let job = (start..self.stealers.len())
|
||||
.chain(0..start)
|
||||
.filter(|i| *i != skip)
|
||||
.find_map(
|
||||
|victim| match self.stealers[victim].steal_batch_and_pop(worker) {
|
||||
crossbeam_deque::Steal::Success(job) => Some(job),
|
||||
crossbeam_deque::Steal::Empty => None,
|
||||
crossbeam_deque::Steal::Retry => {
|
||||
retry = true;
|
||||
None
|
||||
}
|
||||
},
|
||||
);
|
||||
if job.is_some() {
|
||||
// no longer idle
|
||||
self.counters.fetch_sub(256, Ordering::SeqCst);
|
||||
return job;
|
||||
}
|
||||
if !retry {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn thread_rt(pool: Arc<ThreadPool>, worker: Worker<JobSpec>, index: usize) {
|
||||
/// interval when we should steal from the global queue
|
||||
/// so that tail latencies are managed appropriately
|
||||
const STEAL_INTERVAL: usize = 61;
|
||||
|
||||
/// How often to reset the sketch values
|
||||
const SKETCH_RESET_INTERVAL: usize = 1021;
|
||||
|
||||
let mut rng = SmallRng::from_entropy();
|
||||
|
||||
// used to determine whether we should temporarily skip tasks for fairness.
|
||||
// 99% of estimates will overcount by no more than 4096 samples
|
||||
let mut sketch = CountMinSketch::with_params(1.0 / (SKETCH_RESET_INTERVAL as f64), 0.01);
|
||||
|
||||
let (condvar, lock) = &pool.parkers[index];
|
||||
|
||||
'wait: loop {
|
||||
// wait for notification of work
|
||||
{
|
||||
let mut lock = lock.lock();
|
||||
|
||||
// queue is empty
|
||||
pool.metrics
|
||||
.worker_queue_depth
|
||||
.set(ThreadPoolWorkerId(index), 0);
|
||||
|
||||
// subtract 1 from idle count, add 1 to sleeping count.
|
||||
pool.counters.fetch_sub(255, Ordering::SeqCst);
|
||||
|
||||
*lock = ThreadState::Parked;
|
||||
condvar.wait(&mut lock);
|
||||
}
|
||||
|
||||
for i in 0.. {
|
||||
let mut job = match worker
|
||||
.pop()
|
||||
.or_else(|| pool.steal(&mut rng, index, &worker))
|
||||
{
|
||||
Some(job) => job,
|
||||
None => continue 'wait,
|
||||
};
|
||||
|
||||
pool.metrics
|
||||
.worker_queue_depth
|
||||
.set(ThreadPoolWorkerId(index), worker.len() as i64);
|
||||
|
||||
// receiver is closed, cancel the task
|
||||
if !job.response.is_closed() {
|
||||
let rate = sketch.inc_and_return(&job.endpoint, job.pbkdf2.cost());
|
||||
|
||||
const P: f64 = 2000.0;
|
||||
// probability decreases as rate increases.
|
||||
// lower probability, higher chance of being skipped
|
||||
//
|
||||
// estimates (rate in terms of 4096 rounds):
|
||||
// rate = 0 => probability = 100%
|
||||
// rate = 10 => probability = 71.3%
|
||||
// rate = 50 => probability = 62.1%
|
||||
// rate = 500 => probability = 52.3%
|
||||
// rate = 1021 => probability = 49.8%
|
||||
//
|
||||
// My expectation is that the pool queue will only begin backing up at ~1000rps
|
||||
// in which case the SKETCH_RESET_INTERVAL represents 1 second. Thus, the rates above
|
||||
// are in requests per second.
|
||||
let probability = P.ln() / (P + rate as f64).ln();
|
||||
if pool.queue.len() > 32 || rng.gen_bool(probability) {
|
||||
pool.metrics
|
||||
.worker_task_turns_total
|
||||
.inc(ThreadPoolWorkerId(index));
|
||||
|
||||
match job.pbkdf2.turn() {
|
||||
std::task::Poll::Ready(result) => {
|
||||
let _ = job.response.send(result);
|
||||
}
|
||||
std::task::Poll::Pending => worker.push(job),
|
||||
}
|
||||
} else {
|
||||
pool.metrics
|
||||
.worker_task_skips_total
|
||||
.inc(ThreadPoolWorkerId(index));
|
||||
|
||||
// skip for now
|
||||
worker.push(job)
|
||||
}
|
||||
}
|
||||
|
||||
// if we get stuck with a few long lived jobs in the queue
|
||||
// it's better to try and steal from the queue too for fairness
|
||||
if i % STEAL_INTERVAL == 0 {
|
||||
let _ = pool.queue.steal_batch(&worker);
|
||||
}
|
||||
|
||||
if i % SKETCH_RESET_INTERVAL == 0 {
|
||||
sketch.reset();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct JobSpec {
|
||||
response: oneshot::Sender<[u8; 32]>,
|
||||
pbkdf2: Pbkdf2,
|
||||
endpoint: EndpointIdInt,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::EndpointId;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn hash_is_correct() {
|
||||
let pool = ThreadPool::new(1);
|
||||
|
||||
let ep = EndpointId::from("foo");
|
||||
let ep = EndpointIdInt::from(ep);
|
||||
|
||||
let salt = [0x55; 32];
|
||||
let actual = pool
|
||||
.spawn_job(ep, Pbkdf2::start(b"password", &salt, 4096))
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let expected = [
|
||||
10, 114, 73, 188, 140, 222, 196, 156, 214, 184, 79, 157, 119, 242, 16, 31, 53, 242,
|
||||
178, 43, 95, 8, 225, 182, 122, 40, 219, 21, 89, 147, 64, 140,
|
||||
];
|
||||
assert_eq!(actual, expected)
|
||||
}
|
||||
}
|
||||
@@ -102,7 +102,7 @@ pub async fn task_main(
|
||||
let connections = tokio_util::task::task_tracker::TaskTracker::new();
|
||||
connections.close(); // allows `connections.wait to complete`
|
||||
|
||||
let server = Builder::new(hyper_util::rt::TokioExecutor::new());
|
||||
let server = Builder::new(TokioExecutor::new());
|
||||
|
||||
while let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await {
|
||||
let (conn, peer_addr) = res.context("could not accept TCP stream")?;
|
||||
@@ -255,7 +255,6 @@ async fn connection_handler(
|
||||
.in_current_span()
|
||||
.map_ok_or_else(api_error_into_response, |r| r),
|
||||
);
|
||||
|
||||
async move {
|
||||
let res = handler.await;
|
||||
cancel_request.disarm();
|
||||
@@ -301,7 +300,7 @@ async fn request_handler(
|
||||
.map(|s| s.to_string());
|
||||
|
||||
// Check if the request is a websocket upgrade request.
|
||||
if hyper_tungstenite::is_upgrade_request(&request) {
|
||||
if framed_websockets::upgrade::is_upgrade_request(&request) {
|
||||
let ctx = RequestMonitoring::new(
|
||||
session_id,
|
||||
peer_addr,
|
||||
@@ -312,7 +311,7 @@ async fn request_handler(
|
||||
let span = ctx.span.clone();
|
||||
info!(parent: &span, "performing websocket upgrade");
|
||||
|
||||
let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
|
||||
let (response, websocket) = framed_websockets::upgrade::upgrade(&mut request)
|
||||
.map_err(|e| ApiError::BadRequest(e.into()))?;
|
||||
|
||||
ws_connections.spawn(
|
||||
@@ -334,7 +333,7 @@ async fn request_handler(
|
||||
);
|
||||
|
||||
// Return the response so the spawned future can continue.
|
||||
Ok(response)
|
||||
Ok(response.map(|_: http_body_util::Empty<Bytes>| Full::new(Bytes::new())))
|
||||
} else if request.uri().path() == "/sql" && *request.method() == Method::POST {
|
||||
let ctx = RequestMonitoring::new(
|
||||
session_id,
|
||||
|
||||
@@ -15,6 +15,7 @@ use crate::{
|
||||
},
|
||||
context::RequestMonitoring,
|
||||
error::{ErrorKind, ReportableError, UserFacingError},
|
||||
intern::EndpointIdInt,
|
||||
proxy::{connect_compute::ConnectMechanism, retry::ShouldRetry},
|
||||
rate_limiter::EndpointRateLimiter,
|
||||
Host,
|
||||
@@ -66,8 +67,14 @@ impl PoolingBackend {
|
||||
return Err(AuthError::auth_failed(&*user_info.user));
|
||||
}
|
||||
};
|
||||
let auth_outcome =
|
||||
crate::auth::validate_password_and_exchange(&conn_info.password, secret).await?;
|
||||
let ep = EndpointIdInt::from(&conn_info.user_info.endpoint);
|
||||
let auth_outcome = crate::auth::validate_password_and_exchange(
|
||||
&config.thread_pool,
|
||||
ep,
|
||||
&conn_info.password,
|
||||
secret,
|
||||
)
|
||||
.await?;
|
||||
let res = match auth_outcome {
|
||||
crate::sasl::Outcome::Success(key) => {
|
||||
info!("user successfully authenticated");
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user