mirror of
https://github.com/neondatabase/neon.git
synced 2026-02-16 09:00:38 +00:00
Compare commits
1 Commits
skyzh/flus
...
fix-XLogWa
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8dc1c48605 |
@@ -17,7 +17,6 @@
|
||||
!libs/
|
||||
!neon_local/
|
||||
!pageserver/
|
||||
!patches/
|
||||
!pgxn/
|
||||
!proxy/
|
||||
!s3_scrubber/
|
||||
|
||||
@@ -3,13 +3,13 @@ description: 'Create Branch using API'
|
||||
|
||||
inputs:
|
||||
api_key:
|
||||
description: 'Neon API key'
|
||||
desctiption: 'Neon API key'
|
||||
required: true
|
||||
project_id:
|
||||
description: 'ID of the Project to create Branch in'
|
||||
desctiption: 'ID of the Project to create Branch in'
|
||||
required: true
|
||||
api_host:
|
||||
description: 'Neon API host'
|
||||
desctiption: 'Neon API host'
|
||||
default: console-stage.neon.build
|
||||
outputs:
|
||||
dsn:
|
||||
|
||||
@@ -3,16 +3,16 @@ description: 'Delete Branch using API'
|
||||
|
||||
inputs:
|
||||
api_key:
|
||||
description: 'Neon API key'
|
||||
desctiption: 'Neon API key'
|
||||
required: true
|
||||
project_id:
|
||||
description: 'ID of the Project which should be deleted'
|
||||
desctiption: 'ID of the Project which should be deleted'
|
||||
required: true
|
||||
branch_id:
|
||||
description: 'ID of the branch to delete'
|
||||
desctiption: 'ID of the branch to delete'
|
||||
required: true
|
||||
api_host:
|
||||
description: 'Neon API host'
|
||||
desctiption: 'Neon API host'
|
||||
default: console-stage.neon.build
|
||||
|
||||
runs:
|
||||
|
||||
14
.github/actions/neon-project-create/action.yml
vendored
14
.github/actions/neon-project-create/action.yml
vendored
@@ -3,22 +3,22 @@ description: 'Create Neon Project using API'
|
||||
|
||||
inputs:
|
||||
api_key:
|
||||
description: 'Neon API key'
|
||||
desctiption: 'Neon API key'
|
||||
required: true
|
||||
region_id:
|
||||
description: 'Region ID, if not set the project will be created in the default region'
|
||||
desctiption: 'Region ID, if not set the project will be created in the default region'
|
||||
default: aws-us-east-2
|
||||
postgres_version:
|
||||
description: 'Postgres version; default is 15'
|
||||
default: '15'
|
||||
desctiption: 'Postgres version; default is 15'
|
||||
default: 15
|
||||
api_host:
|
||||
description: 'Neon API host'
|
||||
desctiption: 'Neon API host'
|
||||
default: console-stage.neon.build
|
||||
provisioner:
|
||||
description: 'k8s-pod or k8s-neonvm'
|
||||
desctiption: 'k8s-pod or k8s-neonvm'
|
||||
default: 'k8s-pod'
|
||||
compute_units:
|
||||
description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
|
||||
desctiption: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
|
||||
default: '[1, 1]'
|
||||
|
||||
outputs:
|
||||
|
||||
@@ -3,13 +3,13 @@ description: 'Delete Neon Project using API'
|
||||
|
||||
inputs:
|
||||
api_key:
|
||||
description: 'Neon API key'
|
||||
desctiption: 'Neon API key'
|
||||
required: true
|
||||
project_id:
|
||||
description: 'ID of the Project to delete'
|
||||
desctiption: 'ID of the Project to delete'
|
||||
required: true
|
||||
api_host:
|
||||
description: 'Neon API host'
|
||||
desctiption: 'Neon API host'
|
||||
default: console-stage.neon.build
|
||||
|
||||
runs:
|
||||
|
||||
248
.github/workflows/build_and_test.yml
vendored
248
.github/workflows/build_and_test.yml
vendored
@@ -548,7 +548,7 @@ jobs:
|
||||
|
||||
report-benchmarks-failures:
|
||||
needs: [ benchmarks, create-test-report ]
|
||||
if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure'
|
||||
if: github.ref_name == 'main' && needs.benchmarks.result == 'failure'
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
@@ -723,13 +723,9 @@ jobs:
|
||||
uses: ./.github/workflows/trigger-e2e-tests.yml
|
||||
secrets: inherit
|
||||
|
||||
neon-image-arch:
|
||||
neon-image:
|
||||
needs: [ check-permissions, build-build-tools-image, tag ]
|
||||
strategy:
|
||||
matrix:
|
||||
arch: [ x64, arm64 ]
|
||||
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -751,6 +747,12 @@ jobs:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
- uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
@@ -762,52 +764,25 @@ jobs:
|
||||
push: true
|
||||
pull: true
|
||||
file: Dockerfile
|
||||
cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }}
|
||||
cache-to: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }},mode=max
|
||||
cache-from: type=registry,ref=neondatabase/neon:cache
|
||||
cache-to: type=registry,ref=neondatabase/neon:cache,mode=max
|
||||
tags: |
|
||||
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
||||
369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
|
||||
neondatabase/neon:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Remove custom docker config directory
|
||||
if: always()
|
||||
run: |
|
||||
rm -rf .docker-custom
|
||||
|
||||
neon-image:
|
||||
needs: [ neon-image-arch, tag ]
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Create multi-arch image
|
||||
run: |
|
||||
docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-x64 \
|
||||
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-arm64
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
- name: Push multi-arch image to ECR
|
||||
run: |
|
||||
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/neon:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
compute-node-image-arch:
|
||||
compute-node-image:
|
||||
needs: [ check-permissions, build-build-tools-image, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
version: [ v14, v15, v16 ]
|
||||
arch: [ x64, arm64 ]
|
||||
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -854,14 +829,15 @@ jobs:
|
||||
push: true
|
||||
pull: true
|
||||
file: Dockerfile.compute-node
|
||||
cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }}
|
||||
cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
|
||||
cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache
|
||||
cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache,mode=max
|
||||
tags: |
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
||||
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Build compute-tools image
|
||||
# compute-tools are Postgres independent, so build it only once
|
||||
if: matrix.version == 'v16'
|
||||
if: ${{ matrix.version == 'v16' }}
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
target: compute-tools-image
|
||||
@@ -875,57 +851,14 @@ jobs:
|
||||
pull: true
|
||||
file: Dockerfile.compute-node
|
||||
tags: |
|
||||
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
||||
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }}
|
||||
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
- name: Remove custom docker config directory
|
||||
if: always()
|
||||
run: |
|
||||
rm -rf .docker-custom
|
||||
|
||||
compute-node-image:
|
||||
needs: [ compute-node-image-arch, tag ]
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
version: [ v14, v15, v16 ]
|
||||
|
||||
steps:
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Create multi-arch compute-node image
|
||||
run: |
|
||||
docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
|
||||
|
||||
- name: Create multi-arch compute-tools image
|
||||
if: matrix.version == 'v16'
|
||||
run: |
|
||||
docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \
|
||||
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-arm64
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
- name: Push multi-arch compute-node-${{ matrix.version }} image to ECR
|
||||
run: |
|
||||
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
- name: Push multi-arch compute-tools image to ECR
|
||||
if: matrix.version == 'v16'
|
||||
run: |
|
||||
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
vm-compute-node-image:
|
||||
needs: [ check-permissions, tag, compute-node-image ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
@@ -933,8 +866,11 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
version: [ v14, v15, v16 ]
|
||||
defaults:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
env:
|
||||
VM_BUILDER_VERSION: v0.29.3
|
||||
VM_BUILDER_VERSION: v0.28.1
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -947,48 +883,26 @@ jobs:
|
||||
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
|
||||
chmod +x vm-builder
|
||||
|
||||
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
||||
# The default value is ~/.docker
|
||||
- name: Set custom docker config directory
|
||||
run: |
|
||||
mkdir -p .docker-custom
|
||||
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
# Note: we need a separate pull step here because otherwise vm-builder will try to pull, and
|
||||
# it won't have the proper authentication (written at v0.6.0)
|
||||
- name: Pulling compute-node image
|
||||
run: |
|
||||
docker pull neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
|
||||
docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Build vm image
|
||||
run: |
|
||||
./vm-builder \
|
||||
-spec=vm-image-spec.yaml \
|
||||
-src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
|
||||
-dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
|
||||
-src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
|
||||
-dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Pushing vm-compute-node image
|
||||
run: |
|
||||
docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
- name: Remove custom docker config directory
|
||||
if: always()
|
||||
run: |
|
||||
rm -rf .docker-custom
|
||||
docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
test-images:
|
||||
needs: [ check-permissions, tag, neon-image, compute-node-image ]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
arch: [ x64, arm64 ]
|
||||
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -1006,7 +920,7 @@ jobs:
|
||||
- name: Verify image versions
|
||||
shell: bash # ensure no set -e for better error messages
|
||||
run: |
|
||||
pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.tag.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
|
||||
pageserver_version=$(docker run --rm 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
|
||||
|
||||
echo "Pageserver version string: $pageserver_version"
|
||||
|
||||
@@ -1032,48 +946,78 @@ jobs:
|
||||
|
||||
promote-images:
|
||||
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
env:
|
||||
VERSIONS: v14 v15 v16
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container: golang:1.19-bullseye
|
||||
# Don't add if-condition here.
|
||||
# The job should always be run because we have dependant other jobs that shouldn't be skipped
|
||||
|
||||
steps:
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
- name: Copy vm-compute-node images to ECR
|
||||
- name: Install Crane & ECR helper
|
||||
run: |
|
||||
for version in ${VERSIONS}; do
|
||||
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
|
||||
done
|
||||
go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
|
||||
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
|
||||
|
||||
- name: Configure ECR login
|
||||
run: |
|
||||
mkdir /github/home/.docker/
|
||||
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
|
||||
|
||||
- name: Copy vm-compute-node images to Docker Hub
|
||||
run: |
|
||||
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
|
||||
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
|
||||
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16
|
||||
|
||||
- name: Add latest tag to images
|
||||
if: github.ref_name == 'main'
|
||||
if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
|
||||
run: |
|
||||
for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do
|
||||
docker buildx imagetools create -t $repo/neon:latest \
|
||||
$repo/neon:${{ needs.tag.outputs.build-tag }}
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
|
||||
|
||||
docker buildx imagetools create -t $repo/compute-tools:latest \
|
||||
$repo/compute-tools:${{ needs.tag.outputs.build-tag }}
|
||||
- name: Push images to production ECR
|
||||
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
||||
run: |
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:latest
|
||||
|
||||
for version in ${VERSIONS}; do
|
||||
docker buildx imagetools create -t $repo/compute-node-${version}:latest \
|
||||
$repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }}
|
||||
- name: Configure Docker Hub login
|
||||
run: |
|
||||
# ECR Credential Helper & Docker Hub don't work together in config, hence reset
|
||||
echo "" > /github/home/.docker/config.json
|
||||
crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io
|
||||
|
||||
docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \
|
||||
$repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
|
||||
done
|
||||
done
|
||||
- name: Push vm-compute-node to Docker Hub
|
||||
run: |
|
||||
crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
|
||||
crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
|
||||
crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Push latest tags to Docker Hub
|
||||
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
||||
run: |
|
||||
crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
|
||||
|
||||
- name: Cleanup ECR folder
|
||||
run: rm -rf ~/.ecr
|
||||
|
||||
trigger-custom-extensions-build-and-wait:
|
||||
needs: [ check-permissions, tag ]
|
||||
|
||||
49
Cargo.lock
generated
49
Cargo.lock
generated
@@ -1072,9 +1072,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "chrono"
|
||||
version = "0.4.38"
|
||||
version = "0.4.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
|
||||
checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38"
|
||||
dependencies = [
|
||||
"android-tzdata",
|
||||
"iana-time-zone",
|
||||
@@ -1082,7 +1082,7 @@ dependencies = [
|
||||
"num-traits",
|
||||
"serde",
|
||||
"wasm-bindgen",
|
||||
"windows-targets 0.52.4",
|
||||
"windows-targets 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1109,7 +1109,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b"
|
||||
dependencies = [
|
||||
"ciborium-io",
|
||||
"half 1.8.2",
|
||||
"half",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1471,21 +1471,26 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-deque"
|
||||
version = "0.8.5"
|
||||
version = "0.8.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d"
|
||||
checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"crossbeam-epoch",
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-epoch"
|
||||
version = "0.9.18"
|
||||
version = "0.9.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
|
||||
checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"cfg-if",
|
||||
"crossbeam-utils",
|
||||
"memoffset 0.8.0",
|
||||
"scopeguard",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2273,17 +2278,6 @@ version = "1.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"
|
||||
|
||||
[[package]]
|
||||
name = "half"
|
||||
version = "2.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"crunchy",
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hash32"
|
||||
version = "0.3.1"
|
||||
@@ -3908,13 +3902,12 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "parquet"
|
||||
version = "51.0.0"
|
||||
source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
|
||||
version = "49.0.0"
|
||||
source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"half 2.4.1",
|
||||
"hashbrown 0.14.5",
|
||||
"num",
|
||||
"num-bigint",
|
||||
@@ -3923,13 +3916,12 @@ dependencies = [
|
||||
"thrift",
|
||||
"twox-hash",
|
||||
"zstd",
|
||||
"zstd-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parquet_derive"
|
||||
version = "51.0.0"
|
||||
source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
|
||||
version = "49.0.0"
|
||||
source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
|
||||
dependencies = [
|
||||
"parquet",
|
||||
"proc-macro2",
|
||||
@@ -3956,9 +3948,9 @@ checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c"
|
||||
|
||||
[[package]]
|
||||
name = "pbkdf2"
|
||||
version = "0.12.2"
|
||||
version = "0.12.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2"
|
||||
checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31"
|
||||
dependencies = [
|
||||
"digest",
|
||||
"hmac",
|
||||
@@ -4381,7 +4373,6 @@ dependencies = [
|
||||
name = "proxy"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"anyhow",
|
||||
"async-compression",
|
||||
"async-trait",
|
||||
@@ -4398,7 +4389,6 @@ dependencies = [
|
||||
"chrono",
|
||||
"clap",
|
||||
"consumption_metrics",
|
||||
"crossbeam-deque",
|
||||
"dashmap",
|
||||
"env_logger",
|
||||
"fallible-iterator",
|
||||
@@ -7470,7 +7460,6 @@ dependencies = [
|
||||
name = "workspace_hack"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"anyhow",
|
||||
"aws-config",
|
||||
"aws-runtime",
|
||||
|
||||
10
Cargo.toml
10
Cargo.toml
@@ -41,7 +41,6 @@ license = "Apache-2.0"
|
||||
|
||||
## All dependency versions, used in the project
|
||||
[workspace.dependencies]
|
||||
ahash = "0.8"
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
arc-swap = "1.6"
|
||||
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
|
||||
@@ -75,7 +74,6 @@ clap = { version = "4.0", features = ["derive"] }
|
||||
comfy-table = "6.1"
|
||||
const_format = "0.2"
|
||||
crc32c = "0.6"
|
||||
crossbeam-deque = "0.8.5"
|
||||
crossbeam-utils = "0.8.5"
|
||||
dashmap = { version = "5.5.0", features = ["raw-api"] }
|
||||
either = "1.8"
|
||||
@@ -124,8 +122,8 @@ opentelemetry = "0.20.0"
|
||||
opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-semantic-conventions = "0.12.0"
|
||||
parking_lot = "0.12"
|
||||
parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
|
||||
parquet_derive = "51.0.0"
|
||||
parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
|
||||
parquet_derive = "49.0.0"
|
||||
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
|
||||
pin-project-lite = "0.2"
|
||||
procfs = "0.14"
|
||||
@@ -246,8 +244,8 @@ tonic-build = "0.9"
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
|
||||
# bug fixes for UUID
|
||||
parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" }
|
||||
parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" }
|
||||
parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
|
||||
parquet_derive = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
|
||||
|
||||
################# Binary contents sections
|
||||
|
||||
|
||||
@@ -241,17 +241,11 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
|
||||
FROM build-deps AS vector-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
COPY patches/pgvector.patch /pgvector.patch
|
||||
|
||||
# By default, pgvector Makefile uses `-march=native`. We don't want that,
|
||||
# because we build the images on different machines than where we run them.
|
||||
# Pass OPTFLAGS="" to remove it.
|
||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \
|
||||
echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \
|
||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
|
||||
echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
|
||||
mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
|
||||
patch -p1 < /pgvector.patch && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
|
||||
|
||||
#########################################################################################
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
[](https://neon.tech)
|
||||
|
||||
|
||||
[](https://neon.tech)
|
||||
|
||||
# Neon
|
||||
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use tracing::warn;
|
||||
|
||||
@@ -19,24 +17,17 @@ pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> {
|
||||
.arg(size_bytes.to_string())
|
||||
.spawn();
|
||||
|
||||
if matches!(&child_result, Err(e) if e.kind() == std::io::ErrorKind::NotFound) {
|
||||
warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
child_result
|
||||
.context("spawn() failed")
|
||||
.and_then(|mut child| child.wait().context("wait() failed"))
|
||||
.and_then(|status| match status.success() {
|
||||
true => Ok(()),
|
||||
false => {
|
||||
// The command failed. Maybe it was because the resize-swap file doesn't exist?
|
||||
// The --once flag causes it to delete itself on success so we don't disable swap
|
||||
// while postgres is running; maybe this is fine.
|
||||
match Path::new(RESIZE_SWAP_BIN).try_exists() {
|
||||
Err(_) | Ok(true) => Err(anyhow!("process exited with {status}")),
|
||||
// The path doesn't exist; we're actually ok
|
||||
Ok(false) => {
|
||||
warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
|
||||
Ok(())
|
||||
},
|
||||
}
|
||||
}
|
||||
false => Err(anyhow!("process exited with {status}")),
|
||||
})
|
||||
// wrap any prior error with the overall context that we couldn't run the command
|
||||
.with_context(|| {
|
||||
|
||||
@@ -243,13 +243,9 @@ impl StorageController {
|
||||
anyhow::bail!("initdb failed with status {status}");
|
||||
}
|
||||
|
||||
// Write a minimal config file:
|
||||
// - Specify the port, since this is chosen dynamically
|
||||
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
|
||||
// the storage controller we don't want a slow local disk to interfere with that.
|
||||
tokio::fs::write(
|
||||
&pg_data_path.join("postgresql.conf"),
|
||||
format!("port = {}\nfsync=off\n", self.postgres_port),
|
||||
format!("port = {}", self.postgres_port),
|
||||
)
|
||||
.await?;
|
||||
};
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
ARG REPOSITORY=neondatabase
|
||||
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
ARG COMPUTE_IMAGE=compute-node-v14
|
||||
ARG TAG=latest
|
||||
|
||||
|
||||
@@ -8,6 +8,8 @@
|
||||
# Their defaults point at DockerHub `neondatabase/neon:latest` image.`,
|
||||
# to verify custom image builds (e.g pre-published ones).
|
||||
|
||||
# XXX: Current does not work on M1 macs due to x86_64 Docker images compiled only, and no seccomp support in M1 Docker emulation layer.
|
||||
|
||||
set -eux -o pipefail
|
||||
|
||||
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
|
||||
|
||||
@@ -307,7 +307,7 @@ impl KeySpace {
|
||||
}
|
||||
|
||||
/// Merge another keyspace into the current one.
|
||||
/// Note: the keyspaces must not overlap (enforced via assertions). To merge overlapping key ranges, use `KeySpaceRandomAccum`.
|
||||
/// Note: the keyspaces must not ovelap (enforced via assertions)
|
||||
pub fn merge(&mut self, other: &KeySpace) {
|
||||
let all_ranges = self
|
||||
.ranges
|
||||
|
||||
@@ -9,6 +9,7 @@ use std::{
|
||||
collections::HashMap,
|
||||
io::{BufRead, Read},
|
||||
num::{NonZeroU64, NonZeroUsize},
|
||||
str::FromStr,
|
||||
sync::atomic::AtomicUsize,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
@@ -161,22 +162,6 @@ impl std::fmt::Debug for TenantState {
|
||||
}
|
||||
}
|
||||
|
||||
/// A temporary lease to a specific lsn inside a timeline.
|
||||
/// Access to the lsn is guaranteed by the pageserver until the expiration indicated by `valid_until`.
|
||||
#[serde_as]
|
||||
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
|
||||
pub struct LsnLease {
|
||||
#[serde_as(as = "SystemTimeAsRfc3339Millis")]
|
||||
pub valid_until: SystemTime,
|
||||
}
|
||||
|
||||
serde_with::serde_conv!(
|
||||
SystemTimeAsRfc3339Millis,
|
||||
SystemTime,
|
||||
|time: &SystemTime| humantime::format_rfc3339_millis(*time).to_string(),
|
||||
|value: String| -> Result<_, humantime::TimestampError> { humantime::parse_rfc3339(&value) }
|
||||
);
|
||||
|
||||
/// The only [`TenantState`] variants we could be `TenantState::Activating` from.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub enum ActivatingFrom {
|
||||
@@ -305,7 +290,7 @@ pub struct TenantConfig {
|
||||
pub compaction_period: Option<String>,
|
||||
pub compaction_threshold: Option<usize>,
|
||||
// defer parsing compaction_algorithm, like eviction_policy
|
||||
pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
|
||||
pub compaction_algorithm: Option<CompactionAlgorithm>,
|
||||
pub gc_horizon: Option<u64>,
|
||||
pub gc_period: Option<String>,
|
||||
pub image_creation_threshold: Option<usize>,
|
||||
@@ -333,28 +318,14 @@ pub struct TenantConfig {
|
||||
/// Unset -> V1
|
||||
/// -> V2
|
||||
/// -> CrossValidation -> V2
|
||||
#[derive(
|
||||
Eq,
|
||||
PartialEq,
|
||||
Debug,
|
||||
Copy,
|
||||
Clone,
|
||||
strum_macros::EnumString,
|
||||
strum_macros::Display,
|
||||
serde_with::DeserializeFromStr,
|
||||
serde_with::SerializeDisplay,
|
||||
)]
|
||||
#[strum(serialize_all = "kebab-case")]
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum AuxFilePolicy {
|
||||
/// V1 aux file policy: store everything in AUX_FILE_KEY
|
||||
#[strum(ascii_case_insensitive)]
|
||||
V1,
|
||||
/// V2 aux file policy: store in the AUX_FILE keyspace
|
||||
#[strum(ascii_case_insensitive)]
|
||||
V2,
|
||||
/// Cross validation runs both formats on the write path and does validation
|
||||
/// on the read path.
|
||||
#[strum(ascii_case_insensitive)]
|
||||
CrossValidation,
|
||||
}
|
||||
|
||||
@@ -420,6 +391,23 @@ impl AuxFilePolicy {
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for AuxFilePolicy {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
let s = s.to_lowercase();
|
||||
if s == "v1" {
|
||||
Ok(Self::V1)
|
||||
} else if s == "v2" {
|
||||
Ok(Self::V2)
|
||||
} else if s == "crossvalidation" || s == "cross_validation" {
|
||||
Ok(Self::CrossValidation)
|
||||
} else {
|
||||
anyhow::bail!("cannot parse {} to aux file policy", s)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(tag = "kind")]
|
||||
pub enum EvictionPolicy {
|
||||
@@ -438,28 +426,13 @@ impl EvictionPolicy {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(
|
||||
Eq,
|
||||
PartialEq,
|
||||
Debug,
|
||||
Copy,
|
||||
Clone,
|
||||
strum_macros::EnumString,
|
||||
strum_macros::Display,
|
||||
serde_with::DeserializeFromStr,
|
||||
serde_with::SerializeDisplay,
|
||||
)]
|
||||
#[strum(serialize_all = "kebab-case")]
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(tag = "kind")]
|
||||
pub enum CompactionAlgorithm {
|
||||
Legacy,
|
||||
Tiered,
|
||||
}
|
||||
|
||||
#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct CompactionAlgorithmSettings {
|
||||
pub kind: CompactionAlgorithm,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct EvictionPolicyLayerAccessThreshold {
|
||||
#[serde(with = "humantime_serde")]
|
||||
@@ -816,8 +789,6 @@ pub enum HistoricLayerInfo {
|
||||
lsn_end: Lsn,
|
||||
remote: bool,
|
||||
access_stats: LayerAccessStats,
|
||||
|
||||
l0: bool,
|
||||
},
|
||||
Image {
|
||||
layer_file_name: String,
|
||||
@@ -1416,7 +1387,6 @@ impl PagestreamBeMessage {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use serde_json::json;
|
||||
use std::str::FromStr;
|
||||
|
||||
use super::*;
|
||||
|
||||
@@ -1679,14 +1649,4 @@ mod tests {
|
||||
AuxFilePolicy::V2
|
||||
));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_aux_parse() {
|
||||
assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2);
|
||||
assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2);
|
||||
assert_eq!(
|
||||
AuxFilePolicy::from_str("cross-validation").unwrap(),
|
||||
AuxFilePolicy::CrossValidation
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -559,14 +559,6 @@ impl ShardIdentity {
|
||||
}
|
||||
}
|
||||
|
||||
/// Obtains the shard number and count combined into a `ShardIndex`.
|
||||
pub fn shard_index(&self) -> ShardIndex {
|
||||
ShardIndex {
|
||||
shard_count: self.count,
|
||||
shard_number: self.number,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn shard_slug(&self) -> String {
|
||||
if self.count > ShardCount(0) {
|
||||
format!("-{:02x}{:02x}", self.number.0, self.count.0)
|
||||
|
||||
@@ -820,11 +820,10 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
Ok(ProcessMsgResult::Continue)
|
||||
}
|
||||
|
||||
/// - Log as info/error result of handling COPY stream and send back
|
||||
/// ErrorResponse if that makes sense.
|
||||
/// - Shutdown the stream if we got Terminate.
|
||||
/// - Then close the connection because we don't handle exiting from COPY
|
||||
/// stream normally.
|
||||
/// Log as info/error result of handling COPY stream and send back
|
||||
/// ErrorResponse if that makes sense. Shutdown the stream if we got
|
||||
/// Terminate. TODO: transition into waiting for Sync msg if we initiate the
|
||||
/// close.
|
||||
pub async fn handle_copy_stream_end(&mut self, end: CopyStreamHandlerEnd) {
|
||||
use CopyStreamHandlerEnd::*;
|
||||
|
||||
@@ -850,6 +849,10 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
}
|
||||
}
|
||||
|
||||
if let Terminate = &end {
|
||||
self.state = ProtoState::Closed;
|
||||
}
|
||||
|
||||
let err_to_send_and_errcode = match &end {
|
||||
ServerInitiated(_) => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)),
|
||||
Other(_) => Some((format!("{end:#}"), SQLSTATE_INTERNAL_ERROR)),
|
||||
@@ -879,12 +882,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
error!("failed to send ErrorResponse: {}", ee);
|
||||
}
|
||||
}
|
||||
|
||||
// Proper COPY stream finishing to continue using the connection is not
|
||||
// implemented at the server side (we don't need it so far). To prevent
|
||||
// further usages of the connection, close it.
|
||||
self.framed.shutdown().await.ok();
|
||||
self.state = ProtoState::Closed;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -178,13 +178,6 @@ impl PgConnectionConfig {
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for PgConnectionConfig {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
// The password is intentionally hidden and not part of this display string.
|
||||
write!(f, "postgresql://{}:{}", self.host, self.port)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for PgConnectionConfig {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
// We want `password: Some(REDACTED-STRING)`, not `password: Some("REDACTED-STRING")`
|
||||
|
||||
@@ -359,7 +359,7 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
|
||||
// Is there enough space on the page for another logical message and an
|
||||
// XLOG_SWITCH? If not, start over.
|
||||
let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64;
|
||||
if page_remain < base_size + XLOG_SIZE_OF_XLOG_RECORD as u64 {
|
||||
if page_remain < base_size - XLOG_SIZE_OF_XLOG_RECORD as u64 {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -373,29 +373,31 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
|
||||
"SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
|
||||
&[&(repeats as i32)],
|
||||
)?;
|
||||
info!(
|
||||
"current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
|
||||
client.pg_current_wal_insert_lsn()?,
|
||||
XLOG_SIZE_OF_XLOG_RECORD
|
||||
);
|
||||
|
||||
// Emit the XLOG_SWITCH
|
||||
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
|
||||
let xlog_switch_record_end: PgLsn =
|
||||
client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
|
||||
|
||||
if u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
|
||||
!= XLOG_SIZE_OF_XLOG_SHORT_PHD
|
||||
{
|
||||
warn!(
|
||||
"XLOG_SWITCH message ended not on page boundary: {}, offset = {}, repeating",
|
||||
xlog_switch_record_end,
|
||||
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
|
||||
);
|
||||
continue;
|
||||
}
|
||||
return Ok(vec![before_xlog_switch, xlog_switch_record_end]);
|
||||
break;
|
||||
}
|
||||
info!(
|
||||
"current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
|
||||
client.pg_current_wal_insert_lsn()?,
|
||||
XLOG_SIZE_OF_XLOG_RECORD
|
||||
);
|
||||
|
||||
// Emit the XLOG_SWITCH
|
||||
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
|
||||
let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
|
||||
let next_segment = PgLsn::from(0x0200_0000);
|
||||
ensure!(
|
||||
xlog_switch_record_end < next_segment,
|
||||
"XLOG_SWITCH record ended on or after the expected segment boundary: {} > {}",
|
||||
xlog_switch_record_end,
|
||||
next_segment
|
||||
);
|
||||
ensure!(
|
||||
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
|
||||
"XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
|
||||
xlog_switch_record_end,
|
||||
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
|
||||
);
|
||||
Ok(vec![before_xlog_switch, xlog_switch_record_end])
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -50,9 +50,6 @@ pub struct SkTimelineInfo {
|
||||
pub safekeeper_connstr: Option<String>,
|
||||
#[serde(default)]
|
||||
pub http_connstr: Option<String>,
|
||||
// Minimum of all active RO replicas flush LSN
|
||||
#[serde(default = "lsn_invalid")]
|
||||
pub standby_horizon: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
|
||||
@@ -496,9 +496,9 @@ mod tests {
|
||||
// TODO: When updating Postgres versions, this test will cause
|
||||
// problems. Postgres version in message needs updating.
|
||||
//
|
||||
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
|
||||
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160002, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
|
||||
vec![
|
||||
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
|
||||
147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
|
||||
188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
|
||||
|
||||
@@ -257,37 +257,6 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/LsnByTimestampResponse"
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease:
|
||||
parameters:
|
||||
- name: tenant_shard_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
post:
|
||||
description: Obtain lease for the given LSN
|
||||
parameters:
|
||||
- name: lsn
|
||||
in: query
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
description: A LSN to obtain the lease for
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/LsnLease"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -612,80 +581,6 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor:
|
||||
parameters:
|
||||
- name: tenant_shard_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: timeline_id
|
||||
in: path
|
||||
ŕequired: true
|
||||
schema:
|
||||
type: string
|
||||
|
||||
put:
|
||||
description: |
|
||||
Detach a timeline from its ancestor and reparent all ancestors timelines with lower `ancestor_lsn`.
|
||||
Current implementation might not be retryable across failure cases, but will be enhanced in future.
|
||||
Detaching should be expected to be expensive operation. Timeouts should be retried.
|
||||
responses:
|
||||
"200":
|
||||
description: |
|
||||
The timeline has been detached from it's ancestor (now or earlier), and at least the returned timelines have been reparented.
|
||||
If any timelines were deleted after reparenting, they might not be on this list.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/AncestorDetached"
|
||||
|
||||
"400":
|
||||
description: |
|
||||
Number of early checks meaning the timeline cannot be detached now:
|
||||
- the ancestor of timeline has an ancestor: not supported, see RFC
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
"404":
|
||||
description: Tenant or timeline not found.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
|
||||
"409":
|
||||
description: |
|
||||
The timeline can never be detached:
|
||||
- timeline has no ancestor, implying that the timeline has never had an ancestor
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ConflictError"
|
||||
|
||||
"500":
|
||||
description: |
|
||||
Transient error, for example, pageserver shutdown happened while
|
||||
processing the request but we were unable to distinguish that. Must
|
||||
be retried.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
"503":
|
||||
description: |
|
||||
Temporarily unavailable, please retry. Possible reasons:
|
||||
- another timeline detach for the same tenant is underway, please retry later
|
||||
- detected shutdown error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/:
|
||||
get:
|
||||
description: Get tenants list
|
||||
@@ -1085,15 +980,6 @@ components:
|
||||
type: string
|
||||
enum: [past, present, future, nodata]
|
||||
|
||||
LsnLease:
|
||||
type: object
|
||||
required:
|
||||
- valid_until
|
||||
properties:
|
||||
valid_until:
|
||||
type: string
|
||||
format: date-time
|
||||
|
||||
PageserverUtilization:
|
||||
type: object
|
||||
required:
|
||||
@@ -1151,19 +1037,6 @@ components:
|
||||
format: int64
|
||||
description: How many bytes of layer content were in the latest layer heatmap
|
||||
|
||||
AncestorDetached:
|
||||
type: object
|
||||
required:
|
||||
- reparented_timelines
|
||||
properties:
|
||||
reparented_timelines:
|
||||
type: array
|
||||
description: Set of reparented timeline ids
|
||||
properties:
|
||||
type: string
|
||||
format: hex
|
||||
description: TimelineId
|
||||
|
||||
|
||||
Error:
|
||||
type: object
|
||||
|
||||
@@ -16,7 +16,6 @@ use hyper::header;
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::models::IngestAuxFilesRequest;
|
||||
use pageserver_api::models::ListAuxFilesRequest;
|
||||
use pageserver_api::models::LocationConfig;
|
||||
@@ -75,7 +74,6 @@ use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||
use crate::tenant::storage_layer::LayerName;
|
||||
use crate::tenant::timeline::CompactFlags;
|
||||
use crate::tenant::timeline::Timeline;
|
||||
use crate::tenant::GetTimelineError;
|
||||
use crate::tenant::SpawnMode;
|
||||
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
|
||||
use crate::{config::PageServerConf, tenant::mgr};
|
||||
@@ -281,13 +279,6 @@ impl From<GetTenantError> for ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<GetTimelineError> for ApiError {
|
||||
fn from(gte: GetTimelineError) -> Self {
|
||||
// Rationale: tenant is activated only after eligble timelines activate
|
||||
ApiError::NotFound(gte.into())
|
||||
}
|
||||
}
|
||||
|
||||
impl From<GetActiveTenantError> for ApiError {
|
||||
fn from(e: GetActiveTenantError) -> ApiError {
|
||||
match e {
|
||||
@@ -395,7 +386,7 @@ async fn build_timeline_info_common(
|
||||
let guard = timeline.last_received_wal.lock().unwrap();
|
||||
if let Some(info) = guard.as_ref() {
|
||||
(
|
||||
Some(format!("{}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only.
|
||||
Some(format!("{:?}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only.
|
||||
Some(info.last_received_msg_lsn),
|
||||
Some(info.last_received_msg_ts),
|
||||
)
|
||||
@@ -652,7 +643,9 @@ async fn timeline_preserve_initdb_handler(
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
|
||||
let timeline = tenant.get_timeline(timeline_id, false)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, false)
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
|
||||
timeline
|
||||
.preserve_initdb_archive()
|
||||
@@ -694,7 +687,9 @@ async fn timeline_detail_handler(
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
let timeline = tenant.get_timeline(timeline_id, false)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, false)
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
|
||||
let timeline_info = build_timeline_info(
|
||||
&timeline,
|
||||
@@ -1706,32 +1701,6 @@ async fn handle_tenant_break(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
// Obtains an lsn lease on the given timeline.
|
||||
async fn lsn_lease_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let lsn: Lsn = parse_query_param(&request, "lsn")?
|
||||
.ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
let result = timeline
|
||||
.make_lsn_lease(lsn, &ctx)
|
||||
.map_err(|e| ApiError::InternalServerError(e.context("lsn lease http handler")))?;
|
||||
|
||||
json_response(StatusCode::OK, result)
|
||||
}
|
||||
|
||||
// Run GC immediately on given timeline.
|
||||
async fn timeline_gc_handler(
|
||||
mut request: Request<Body>,
|
||||
@@ -1767,8 +1736,6 @@ async fn timeline_compact_handler(
|
||||
if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
|
||||
flags |= CompactFlags::ForceImageLayerCreation;
|
||||
}
|
||||
let wait_until_uploaded =
|
||||
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
|
||||
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
@@ -1777,9 +1744,6 @@ async fn timeline_compact_handler(
|
||||
.compact(&cancel, flags, &ctx)
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
if wait_until_uploaded {
|
||||
timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
|
||||
}
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
.instrument(info_span!("manual_compaction", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
|
||||
@@ -1804,8 +1768,6 @@ async fn timeline_checkpoint_handler(
|
||||
if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
|
||||
flags |= CompactFlags::ForceImageLayerCreation;
|
||||
}
|
||||
let wait_until_uploaded =
|
||||
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
|
||||
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
@@ -1819,10 +1781,6 @@ async fn timeline_checkpoint_handler(
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
|
||||
if wait_until_uploaded {
|
||||
timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
|
||||
}
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
.instrument(info_span!("manual_checkpoint", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
|
||||
@@ -1906,11 +1864,14 @@ async fn timeline_detach_ancestor_handler(
|
||||
let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download);
|
||||
let ctx = &ctx;
|
||||
|
||||
let timeline = tenant.get_timeline(timeline_id, true)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
|
||||
let (_guard, prepared) = timeline
|
||||
.prepare_to_detach_from_ancestor(&tenant, options, ctx)
|
||||
.await?;
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
|
||||
let res = state
|
||||
.tenant_manager
|
||||
@@ -2044,7 +2005,9 @@ async fn active_timeline_of_active_tenant(
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
Ok(tenant.get_timeline(timeline_id, true)?)
|
||||
tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| ApiError::NotFound(e.into()))
|
||||
}
|
||||
|
||||
async fn always_panic_handler(
|
||||
@@ -2308,31 +2271,6 @@ async fn post_tracing_event_handler(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn force_aux_policy_switch_handler(
|
||||
mut r: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&r, None)?;
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&r, "timeline_id")?;
|
||||
let policy: AuxFilePolicy = json_request(&mut r).await?;
|
||||
|
||||
let state = get_state(&r);
|
||||
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
timeline
|
||||
.do_switch_aux_policy(policy)
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn put_io_engine_handler(
|
||||
mut r: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -2410,9 +2348,19 @@ async fn list_aux_files(
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let files = timeline.list_aux_files(body.lsn, &ctx).await?;
|
||||
json_response(StatusCode::OK, files)
|
||||
let process = || async move {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let files = timeline.list_aux_files(body.lsn, &ctx).await?;
|
||||
Ok::<_, anyhow::Error>(files)
|
||||
};
|
||||
|
||||
match process().await {
|
||||
Ok(st) => json_response(StatusCode::OK, st),
|
||||
Err(err) => json_response(
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
ApiError::InternalServerError(err).to_string(),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
async fn ingest_aux_files(
|
||||
@@ -2430,22 +2378,24 @@ async fn ingest_aux_files(
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
|
||||
let mut modification = timeline.begin_modification(
|
||||
Lsn(timeline.get_last_record_lsn().0 + 8), /* advance LSN by 8 */
|
||||
);
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
for (fname, content) in body.aux_files {
|
||||
modification
|
||||
.put_file(&fname, content.as_bytes(), &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
}
|
||||
modification
|
||||
.commit(&ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
let process = || async move {
|
||||
let mut modification = timeline.begin_modification(Lsn(
|
||||
timeline.get_last_record_lsn().0 + 8
|
||||
) /* advance LSN by 8 */);
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
for (fname, content) in body.aux_files {
|
||||
modification
|
||||
.put_file(&fname, content.as_bytes(), &ctx)
|
||||
.await?;
|
||||
}
|
||||
modification.commit(&ctx).await?;
|
||||
Ok::<_, anyhow::Error>(())
|
||||
};
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
match process().await {
|
||||
Ok(st) => json_response(StatusCode::OK, st),
|
||||
Err(err) => Err(ApiError::InternalServerError(err)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Report on the largest tenants on this pageserver, for the storage controller to identify
|
||||
@@ -2751,10 +2701,6 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn",
|
||||
|r| api_handler(r, get_timestamp_of_lsn_handler),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease",
|
||||
|r| api_handler(r, lsn_lease_handler),
|
||||
)
|
||||
.put(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc",
|
||||
|r| api_handler(r, timeline_gc_handler),
|
||||
@@ -2828,10 +2774,6 @@ pub fn make_router(
|
||||
|r| api_handler(r, timeline_collect_keyspace),
|
||||
)
|
||||
.put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
|
||||
.put(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
|
||||
|r| api_handler(r, force_aux_policy_switch_handler),
|
||||
)
|
||||
.get("/v1/utilization", |r| api_handler(r, get_utilization))
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files",
|
||||
|
||||
@@ -525,15 +525,6 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_standby_horizon",
|
||||
"Standby apply LSN for which GC is hold off, by timeline.",
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_resident_physical_size",
|
||||
@@ -1867,6 +1858,7 @@ pub(crate) struct WalIngestMetrics {
|
||||
pub(crate) records_received: IntCounter,
|
||||
pub(crate) records_committed: IntCounter,
|
||||
pub(crate) records_filtered: IntCounter,
|
||||
pub(crate) time_spent_on_ingest: Histogram,
|
||||
}
|
||||
|
||||
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
|
||||
@@ -1890,6 +1882,12 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
|
||||
"Number of WAL records filtered out due to sharding"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
time_spent_on_ingest: register_histogram!(
|
||||
"pageserver_wal_ingest_put_value_seconds",
|
||||
"Actual time spent on ingesting a record",
|
||||
redo_histogram_time_buckets!(),
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
});
|
||||
|
||||
pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
@@ -2100,7 +2098,6 @@ pub(crate) struct TimelineMetrics {
|
||||
pub garbage_collect_histo: StorageTimeMetrics,
|
||||
pub find_gc_cutoffs_histo: StorageTimeMetrics,
|
||||
pub last_record_gauge: IntGauge,
|
||||
pub standby_horizon_gauge: IntGauge,
|
||||
pub resident_physical_size_gauge: UIntGauge,
|
||||
/// copy of LayeredTimeline.current_logical_size
|
||||
pub current_logical_size_gauge: UIntGauge,
|
||||
@@ -2170,9 +2167,6 @@ impl TimelineMetrics {
|
||||
let last_record_gauge = LAST_RECORD_LSN
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
let standby_horizon_gauge = STANDBY_HORIZON
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
@@ -2218,7 +2212,6 @@ impl TimelineMetrics {
|
||||
find_gc_cutoffs_histo,
|
||||
load_layer_map_histo,
|
||||
last_record_gauge,
|
||||
standby_horizon_gauge,
|
||||
resident_physical_size_gauge,
|
||||
current_logical_size_gauge,
|
||||
aux_file_size_gauge,
|
||||
@@ -2253,7 +2246,6 @@ impl TimelineMetrics {
|
||||
let timeline_id = &self.timeline_id;
|
||||
let shard_id = &self.shard_id;
|
||||
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
{
|
||||
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
|
||||
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
|
||||
@@ -19,7 +19,6 @@ use pageserver_api::models::{
|
||||
};
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
use pageserver_api::shard::ShardNumber;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError};
|
||||
use pq_proto::framed::ConnectionError;
|
||||
use pq_proto::FeStartupPacket;
|
||||
@@ -34,7 +33,6 @@ use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
use std::time::SystemTime;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_util::io::StreamReader;
|
||||
@@ -907,39 +905,6 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(shard_id, %lsn))]
|
||||
async fn handle_make_lsn_lease<IO>(
|
||||
&self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
let shard_selector = ShardSelector::Known(tenant_shard_id.to_index());
|
||||
let timeline = self
|
||||
.get_active_tenant_timeline(tenant_shard_id.tenant_id, timeline_id, shard_selector)
|
||||
.await?;
|
||||
let lease = timeline.make_lsn_lease(lsn, ctx)?;
|
||||
let valid_until = lease
|
||||
.valid_until
|
||||
.duration_since(SystemTime::UNIX_EPOCH)
|
||||
.map_err(|e| QueryError::Other(e.into()))?;
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
|
||||
b"valid_until",
|
||||
)]))?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[Some(
|
||||
&valid_until.as_millis().to_be_bytes(),
|
||||
)]))?
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(shard_id))]
|
||||
async fn handle_get_rel_exists_request(
|
||||
&mut self,
|
||||
@@ -1521,8 +1486,9 @@ where
|
||||
|
||||
let ctx = self.connection_ctx.attached_child();
|
||||
debug!("process query {query_string:?}");
|
||||
let parts = query_string.split_whitespace().collect::<Vec<_>>();
|
||||
if let Some(params) = parts.strip_prefix(&["pagestream_v2"]) {
|
||||
if query_string.starts_with("pagestream_v2 ") {
|
||||
let (_, params_raw) = query_string.split_at("pagestream_v2 ".len());
|
||||
let params = params_raw.split(' ').collect::<Vec<_>>();
|
||||
if params.len() != 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for pagestream command"
|
||||
@@ -1547,7 +1513,9 @@ where
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
} else if let Some(params) = parts.strip_prefix(&["pagestream"]) {
|
||||
} else if query_string.starts_with("pagestream ") {
|
||||
let (_, params_raw) = query_string.split_at("pagestream ".len());
|
||||
let params = params_raw.split(' ').collect::<Vec<_>>();
|
||||
if params.len() != 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for pagestream command"
|
||||
@@ -1572,7 +1540,10 @@ where
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
} else if let Some(params) = parts.strip_prefix(&["basebackup"]) {
|
||||
} else if query_string.starts_with("basebackup ") {
|
||||
let (_, params_raw) = query_string.split_at("basebackup ".len());
|
||||
let params = params_raw.split_whitespace().collect::<Vec<_>>();
|
||||
|
||||
if params.len() < 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for basebackup command"
|
||||
@@ -1590,23 +1561,26 @@ where
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
let lsn = if let Some(lsn_str) = params.get(2) {
|
||||
let lsn = if params.len() >= 3 {
|
||||
Some(
|
||||
Lsn::from_str(lsn_str)
|
||||
.with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
|
||||
Lsn::from_str(params[2])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let gzip = match params.get(3) {
|
||||
Some(&"--gzip") => true,
|
||||
None => false,
|
||||
Some(third_param) => {
|
||||
let gzip = if params.len() >= 4 {
|
||||
if params[3] == "--gzip" {
|
||||
true
|
||||
} else {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"Parameter in position 3 unknown {third_param}",
|
||||
)))
|
||||
"Parameter in position 3 unknown {}",
|
||||
params[3],
|
||||
)));
|
||||
}
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
|
||||
@@ -1630,7 +1604,10 @@ where
|
||||
res?;
|
||||
}
|
||||
// return pair of prev_lsn and last_lsn
|
||||
else if let Some(params) = parts.strip_prefix(&["get_last_record_rlsn"]) {
|
||||
else if query_string.starts_with("get_last_record_rlsn ") {
|
||||
let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len());
|
||||
let params = params_raw.split_whitespace().collect::<Vec<_>>();
|
||||
|
||||
if params.len() != 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for get_last_record_rlsn command"
|
||||
@@ -1672,7 +1649,10 @@ where
|
||||
.await?;
|
||||
}
|
||||
// same as basebackup, but result includes relational data as well
|
||||
else if let Some(params) = parts.strip_prefix(&["fullbackup"]) {
|
||||
else if query_string.starts_with("fullbackup ") {
|
||||
let (_, params_raw) = query_string.split_at("fullbackup ".len());
|
||||
let params = params_raw.split_whitespace().collect::<Vec<_>>();
|
||||
|
||||
if params.len() < 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for fullbackup command"
|
||||
@@ -1689,18 +1669,18 @@ where
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
// The caller is responsible for providing correct lsn and prev_lsn.
|
||||
let lsn = if let Some(lsn_str) = params.get(2) {
|
||||
let lsn = if params.len() > 2 {
|
||||
Some(
|
||||
Lsn::from_str(lsn_str)
|
||||
.with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
|
||||
Lsn::from_str(params[2])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let prev_lsn = if let Some(prev_lsn_str) = params.get(3) {
|
||||
let prev_lsn = if params.len() > 3 {
|
||||
Some(
|
||||
Lsn::from_str(prev_lsn_str)
|
||||
.with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?,
|
||||
Lsn::from_str(params[3])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[3]))?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
@@ -1733,7 +1713,8 @@ where
|
||||
// 2. Run:
|
||||
// cat my_backup/base.tar | psql -h $PAGESERVER \
|
||||
// -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION"
|
||||
let params = &parts[2..];
|
||||
let (_, params_raw) = query_string.split_at("import basebackup ".len());
|
||||
let params = params_raw.split_whitespace().collect::<Vec<_>>();
|
||||
if params.len() != 5 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for import basebackup command"
|
||||
@@ -1782,7 +1763,8 @@ where
|
||||
//
|
||||
// Files are scheduled to be persisted to remote storage, and the
|
||||
// caller should poll the http api to check when that is done.
|
||||
let params = &parts[2..];
|
||||
let (_, params_raw) = query_string.split_at("import wal ".len());
|
||||
let params = params_raw.split_whitespace().collect::<Vec<_>>();
|
||||
if params.len() != 4 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for import wal command"
|
||||
@@ -1820,45 +1802,10 @@ where
|
||||
// important because psycopg2 executes "SET datestyle TO 'ISO'"
|
||||
// on connect
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("lease lsn ") {
|
||||
let params = &parts[2..];
|
||||
if params.len() != 3 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number {} for lease lsn command",
|
||||
params.len()
|
||||
)));
|
||||
}
|
||||
|
||||
let tenant_shard_id = TenantShardId::from_str(params[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_shard_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
// The caller is responsible for providing correct lsn.
|
||||
let lsn = Lsn::from_str(params[2])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
|
||||
|
||||
match self
|
||||
.handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
|
||||
.await
|
||||
{
|
||||
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
|
||||
Err(e) => {
|
||||
error!("error obtaining lsn lease for {lsn}: {e:?}");
|
||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(
|
||||
&e.to_string(),
|
||||
Some(e.pg_error_code()),
|
||||
))?
|
||||
}
|
||||
};
|
||||
} else if let Some(params) = parts.strip_prefix(&["show"]) {
|
||||
} else if query_string.starts_with("show ") {
|
||||
// show <tenant_id>
|
||||
let (_, params_raw) = query_string.split_at("show ".len());
|
||||
let params = params_raw.split(' ').collect::<Vec<_>>();
|
||||
if params.len() != 1 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for config command"
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
use super::tenant::{PageReconstructError, Timeline};
|
||||
use crate::context::RequestContext;
|
||||
use crate::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use crate::metrics::WAL_INGEST;
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use crate::{aux_file, repository::*};
|
||||
@@ -39,11 +40,7 @@ use utils::bin_ser::DeserializeError;
|
||||
use utils::vec_map::{VecMap, VecMapOrdering};
|
||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
|
||||
/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
|
||||
pub const MAX_AUX_FILE_DELTAS: usize = 1024;
|
||||
|
||||
/// Max number of aux-file-related delta layers. The compaction will create a new image layer once this threshold is reached.
|
||||
pub const MAX_AUX_FILE_V2_DELTAS: usize = 64;
|
||||
const MAX_AUX_FILE_DELTAS: usize = 1024;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum LsnForTimestamp {
|
||||
@@ -1480,24 +1477,11 @@ impl<'a> DatadirModification<'a> {
|
||||
// Allowed switch path:
|
||||
// * no aux files -> v1/v2/cross-validation
|
||||
// * cross-validation->v2
|
||||
|
||||
let current_policy = if current_policy.is_none() {
|
||||
// This path will only be hit once per tenant: we will decide the final policy in this code block.
|
||||
// The next call to `put_file` will always have `last_aux_file_policy != None`.
|
||||
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
|
||||
let aux_files_key_v1 = self.tline.list_aux_files_v1(lsn, ctx).await?;
|
||||
if aux_files_key_v1.is_empty() {
|
||||
None
|
||||
} else {
|
||||
self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
|
||||
Some(AuxFilePolicy::V1)
|
||||
}
|
||||
} else {
|
||||
current_policy
|
||||
};
|
||||
|
||||
if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) {
|
||||
self.tline.do_switch_aux_policy(switch_policy)?;
|
||||
self.tline.last_aux_file_policy.store(Some(switch_policy));
|
||||
self.tline
|
||||
.remote_client
|
||||
.schedule_index_upload_for_aux_file_policy_update(Some(switch_policy))?;
|
||||
info!(current=?current_policy, next=?switch_policy, "switching aux file policy");
|
||||
switch_policy
|
||||
} else {
|
||||
@@ -1714,6 +1698,8 @@ impl<'a> DatadirModification<'a> {
|
||||
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
let mut writer = self.tline.writer().await;
|
||||
|
||||
let timer = WAL_INGEST.time_spent_on_ingest.start_timer();
|
||||
|
||||
let pending_nblocks = self.pending_nblocks;
|
||||
self.pending_nblocks = 0;
|
||||
|
||||
@@ -1753,6 +1739,8 @@ impl<'a> DatadirModification<'a> {
|
||||
writer.update_directory_entries_count(kind, count as u64);
|
||||
}
|
||||
|
||||
timer.observe_duration();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1788,12 +1776,6 @@ impl<'a> DatadirModification<'a> {
|
||||
self.tline.get(key, lsn, ctx).await
|
||||
}
|
||||
|
||||
/// Only used during unit tests, force putting a key into the modification.
|
||||
#[cfg(test)]
|
||||
pub(crate) fn put_for_test(&mut self, key: Key, val: Value) {
|
||||
self.put(key, val);
|
||||
}
|
||||
|
||||
fn put(&mut self, key: Key, val: Value) {
|
||||
let values = self.pending_updates.entry(key).or_default();
|
||||
// Replace the previous value if it exists at the same lsn
|
||||
|
||||
@@ -3964,20 +3964,18 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::keyspace::KeySpaceAccum;
|
||||
use crate::pgdatadir_mapping::AuxFilesDirectory;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant::harness::*;
|
||||
use crate::tenant::timeline::CompactFlags;
|
||||
use crate::DEFAULT_PG_VERSION;
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use bytes::BytesMut;
|
||||
use hex_literal::hex;
|
||||
use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
|
||||
use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
|
||||
use pageserver_api::models::CompactionAlgorithm;
|
||||
use rand::{thread_rng, Rng};
|
||||
use tests::storage_layer::ValuesReconstructState;
|
||||
use tests::timeline::{GetVectoredError, ShutdownMode};
|
||||
use utils::bin_ser::BeSer;
|
||||
|
||||
static TEST_KEY: Lazy<Key> =
|
||||
Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
|
||||
@@ -4779,12 +4777,7 @@ mod tests {
|
||||
info!("Doing vectored read on {:?}", read);
|
||||
|
||||
let vectored_res = tline
|
||||
.get_vectored_impl(
|
||||
read.clone(),
|
||||
reads_lsn,
|
||||
&mut ValuesReconstructState::new(),
|
||||
&ctx,
|
||||
)
|
||||
.get_vectored_impl(read.clone(), reads_lsn, ValuesReconstructState::new(), &ctx)
|
||||
.await;
|
||||
tline
|
||||
.validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx)
|
||||
@@ -4833,7 +4826,7 @@ mod tests {
|
||||
.get_vectored_impl(
|
||||
aux_keyspace.clone(),
|
||||
read_lsn,
|
||||
&mut ValuesReconstructState::new(),
|
||||
ValuesReconstructState::new(),
|
||||
&ctx,
|
||||
)
|
||||
.await;
|
||||
@@ -4978,7 +4971,7 @@ mod tests {
|
||||
.get_vectored_impl(
|
||||
read.clone(),
|
||||
current_lsn,
|
||||
&mut ValuesReconstructState::new(),
|
||||
ValuesReconstructState::new(),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -5113,7 +5106,7 @@ mod tests {
|
||||
ranges: vec![child_gap_at_key..child_gap_at_key.next()],
|
||||
},
|
||||
query_lsn,
|
||||
&mut ValuesReconstructState::new(),
|
||||
ValuesReconstructState::new(),
|
||||
&ctx,
|
||||
)
|
||||
.await;
|
||||
@@ -5169,9 +5162,7 @@ mod tests {
|
||||
compaction_algorithm: CompactionAlgorithm,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut harness = TenantHarness::create(name)?;
|
||||
harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings {
|
||||
kind: compaction_algorithm,
|
||||
};
|
||||
harness.tenant_conf.compaction_algorithm = compaction_algorithm;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
@@ -5528,9 +5519,7 @@ mod tests {
|
||||
compaction_algorithm: CompactionAlgorithm,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut harness = TenantHarness::create(name)?;
|
||||
harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings {
|
||||
kind: compaction_algorithm,
|
||||
};
|
||||
harness.tenant_conf.compaction_algorithm = compaction_algorithm;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
|
||||
@@ -5558,7 +5547,7 @@ mod tests {
|
||||
.await?;
|
||||
|
||||
const NUM_KEYS: usize = 1000;
|
||||
const STEP: usize = 10000; // random update + scan base_key + idx * STEP
|
||||
const STEP: usize = 100; // random update + scan base_key + idx * STEP
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
@@ -5591,7 +5580,7 @@ mod tests {
|
||||
|
||||
let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32));
|
||||
|
||||
for iter in 0..=10 {
|
||||
for _ in 0..10 {
|
||||
// Read all the blocks
|
||||
for (blknum, last_lsn) in updated.iter().enumerate() {
|
||||
test_key.field6 = (blknum * STEP) as u32;
|
||||
@@ -5606,7 +5595,7 @@ mod tests {
|
||||
.get_vectored_impl(
|
||||
keyspace.clone(),
|
||||
lsn,
|
||||
&mut ValuesReconstructState::default(),
|
||||
ValuesReconstructState::default(),
|
||||
&ctx,
|
||||
)
|
||||
.await?
|
||||
@@ -5642,88 +5631,14 @@ mod tests {
|
||||
updated[blknum] = lsn;
|
||||
}
|
||||
|
||||
// Perform two cycles of flush, compact, and GC
|
||||
for round in 0..2 {
|
||||
tline.freeze_and_flush().await?;
|
||||
tline
|
||||
.compact(
|
||||
&cancel,
|
||||
if iter % 5 == 0 && round == 0 {
|
||||
let mut flags = EnumSet::new();
|
||||
flags.insert(CompactFlags::ForceImageLayerCreation);
|
||||
flags.insert(CompactFlags::ForceRepartition);
|
||||
flags
|
||||
} else {
|
||||
EnumSet::empty()
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
tenant
|
||||
.gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_metadata_compaction_trigger() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_metadata_compaction_trigger")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
|
||||
base_key.field1 = AUX_KEY_PREFIX;
|
||||
let test_key = base_key;
|
||||
let mut lsn = Lsn(0x10);
|
||||
|
||||
for _ in 0..20 {
|
||||
lsn = Lsn(lsn.0 + 0x10);
|
||||
let mut writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(test_img(&format!("{} at {}", 0, lsn))),
|
||||
&ctx,
|
||||
)
|
||||
// Perform a cycle of flush, compact, and GC
|
||||
tline.freeze_and_flush().await?;
|
||||
tline.compact(&cancel, EnumSet::empty(), &ctx).await?;
|
||||
tenant
|
||||
.gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
|
||||
.await?;
|
||||
writer.finish_write(lsn);
|
||||
drop(writer);
|
||||
tline.freeze_and_flush().await?; // force create a delta layer
|
||||
}
|
||||
|
||||
let before_num_l0_delta_files = tline
|
||||
.layers
|
||||
.read()
|
||||
.await
|
||||
.layer_map()
|
||||
.get_level0_deltas()?
|
||||
.len();
|
||||
|
||||
tline.compact(&cancel, EnumSet::empty(), &ctx).await?;
|
||||
|
||||
let after_num_l0_delta_files = tline
|
||||
.layers
|
||||
.read()
|
||||
.await
|
||||
.layer_map()
|
||||
.get_level0_deltas()?
|
||||
.len();
|
||||
|
||||
assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}");
|
||||
|
||||
assert_eq!(
|
||||
tline.get(test_key, lsn, &ctx).await?,
|
||||
test_img(&format!("{} at {}", 0, lsn))
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -6002,498 +5917,4 @@ mod tests {
|
||||
Some(&bytes::Bytes::from_static(b"last"))
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn aux_file_policy_force_switch() {
|
||||
let mut harness = TenantHarness::create("aux_file_policy_force_switch").unwrap();
|
||||
harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V1;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let mut lsn = Lsn(0x08);
|
||||
|
||||
let tline: Arc<Timeline> = tenant
|
||||
.create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
tline.last_aux_file_policy.load(),
|
||||
None,
|
||||
"no aux file is written so it should be unset"
|
||||
);
|
||||
|
||||
{
|
||||
lsn += 8;
|
||||
let mut modification = tline.begin_modification(lsn);
|
||||
modification
|
||||
.put_file("pg_logical/mappings/test1", b"first", &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
modification.commit(&ctx).await.unwrap();
|
||||
}
|
||||
|
||||
tline.do_switch_aux_policy(AuxFilePolicy::V2).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
tline.last_aux_file_policy.load(),
|
||||
Some(AuxFilePolicy::V2),
|
||||
"dirty index_part.json reflected state is yet to be updated"
|
||||
);
|
||||
|
||||
// lose all data from v1
|
||||
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
assert_eq!(files.get("pg_logical/mappings/test1"), None);
|
||||
|
||||
{
|
||||
lsn += 8;
|
||||
let mut modification = tline.begin_modification(lsn);
|
||||
modification
|
||||
.put_file("pg_logical/mappings/test2", b"second", &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
modification.commit(&ctx).await.unwrap();
|
||||
}
|
||||
|
||||
// read data ingested in v2
|
||||
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
assert_eq!(
|
||||
files.get("pg_logical/mappings/test2"),
|
||||
Some(&bytes::Bytes::from_static(b"second"))
|
||||
);
|
||||
// lose all data from v1
|
||||
assert_eq!(files.get("pg_logical/mappings/test1"), None);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn aux_file_policy_auto_detect() {
|
||||
let mut harness = TenantHarness::create("aux_file_policy_auto_detect").unwrap();
|
||||
harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V2; // set to cross-validation mode
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let mut lsn = Lsn(0x08);
|
||||
|
||||
let tline: Arc<Timeline> = tenant
|
||||
.create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
tline.last_aux_file_policy.load(),
|
||||
None,
|
||||
"no aux file is written so it should be unset"
|
||||
);
|
||||
|
||||
{
|
||||
lsn += 8;
|
||||
let mut modification = tline.begin_modification(lsn);
|
||||
let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
|
||||
files: vec![(
|
||||
"test_file".to_string(),
|
||||
Bytes::copy_from_slice(b"test_file"),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect(),
|
||||
})
|
||||
.unwrap();
|
||||
modification.put_for_test(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
|
||||
modification.commit(&ctx).await.unwrap();
|
||||
}
|
||||
|
||||
{
|
||||
lsn += 8;
|
||||
let mut modification = tline.begin_modification(lsn);
|
||||
modification
|
||||
.put_file("pg_logical/mappings/test1", b"first", &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
modification.commit(&ctx).await.unwrap();
|
||||
}
|
||||
|
||||
assert_eq!(
|
||||
tline.last_aux_file_policy.load(),
|
||||
Some(AuxFilePolicy::V1),
|
||||
"keep using v1 because there are aux files writting with v1"
|
||||
);
|
||||
|
||||
// we can still read the auxfile v1
|
||||
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
assert_eq!(
|
||||
files.get("pg_logical/mappings/test1"),
|
||||
Some(&bytes::Bytes::from_static(b"first"))
|
||||
);
|
||||
assert_eq!(
|
||||
files.get("test_file"),
|
||||
Some(&bytes::Bytes::from_static(b"test_file"))
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_metadata_image_creation() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_metadata_image_creation")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
|
||||
const NUM_KEYS: usize = 1000;
|
||||
const STEP: usize = 10000; // random update + scan base_key + idx * STEP
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
|
||||
base_key.field1 = AUX_KEY_PREFIX;
|
||||
let mut test_key = base_key;
|
||||
let mut lsn = Lsn(0x10);
|
||||
|
||||
async fn scan_with_statistics(
|
||||
tline: &Timeline,
|
||||
keyspace: &KeySpace,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<(BTreeMap<Key, Result<Bytes, PageReconstructError>>, usize)> {
|
||||
let mut reconstruct_state = ValuesReconstructState::default();
|
||||
let res = tline
|
||||
.get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
|
||||
.await?;
|
||||
Ok((res, reconstruct_state.get_delta_layers_visited() as usize))
|
||||
}
|
||||
|
||||
#[allow(clippy::needless_range_loop)]
|
||||
for blknum in 0..NUM_KEYS {
|
||||
lsn = Lsn(lsn.0 + 0x10);
|
||||
test_key.field6 = (blknum * STEP) as u32;
|
||||
let mut writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
writer.finish_write(lsn);
|
||||
drop(writer);
|
||||
}
|
||||
|
||||
let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32));
|
||||
|
||||
for iter in 1..=10 {
|
||||
for _ in 0..NUM_KEYS {
|
||||
lsn = Lsn(lsn.0 + 0x10);
|
||||
let blknum = thread_rng().gen_range(0..NUM_KEYS);
|
||||
test_key.field6 = (blknum * STEP) as u32;
|
||||
let mut writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
writer.finish_write(lsn);
|
||||
drop(writer);
|
||||
}
|
||||
|
||||
tline.freeze_and_flush().await?;
|
||||
|
||||
if iter % 5 == 0 {
|
||||
let (_, before_delta_file_accessed) =
|
||||
scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?;
|
||||
tline
|
||||
.compact(
|
||||
&cancel,
|
||||
{
|
||||
let mut flags = EnumSet::new();
|
||||
flags.insert(CompactFlags::ForceImageLayerCreation);
|
||||
flags.insert(CompactFlags::ForceRepartition);
|
||||
flags
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
let (_, after_delta_file_accessed) =
|
||||
scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?;
|
||||
assert!(after_delta_file_accessed < before_delta_file_accessed, "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}");
|
||||
// Given that we already produced an image layer, there should be no delta layer needed for the scan, but still setting a low threshold there for unforeseen circumstances.
|
||||
assert!(
|
||||
after_delta_file_accessed <= 2,
|
||||
"after_delta_file_accessed={after_delta_file_accessed}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
|
||||
let base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap();
|
||||
let base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap();
|
||||
|
||||
let mut lsn = Lsn(0x20);
|
||||
|
||||
{
|
||||
let mut writer = tline.writer().await;
|
||||
writer
|
||||
.put(base_key, lsn, &Value::Image(test_img("data key 1")), &ctx)
|
||||
.await?;
|
||||
writer.finish_write(lsn);
|
||||
drop(writer);
|
||||
|
||||
tline.freeze_and_flush().await?; // this will create a image layer
|
||||
}
|
||||
|
||||
let child = tenant
|
||||
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
lsn.0 += 0x10;
|
||||
|
||||
{
|
||||
let mut writer = child.writer().await;
|
||||
writer
|
||||
.put(
|
||||
base_key_child,
|
||||
lsn,
|
||||
&Value::Image(test_img("data key 2")),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
writer.finish_write(lsn);
|
||||
drop(writer);
|
||||
|
||||
child.freeze_and_flush().await?; // this will create a delta
|
||||
|
||||
{
|
||||
// update the partitioning to include the test key space, otherwise they
|
||||
// will be dropped by image layer creation
|
||||
let mut guard = child.partitioning.lock().await;
|
||||
let ((partitioning, _), partition_lsn) = &mut *guard;
|
||||
partitioning
|
||||
.parts
|
||||
.push(KeySpace::single(base_key..base_key_nonexist)); // exclude the nonexist key
|
||||
*partition_lsn = lsn;
|
||||
}
|
||||
|
||||
child
|
||||
.compact(
|
||||
&cancel,
|
||||
{
|
||||
let mut set = EnumSet::empty();
|
||||
set.insert(CompactFlags::ForceImageLayerCreation);
|
||||
set
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await?; // force create an image layer for the keys, TODO: check if the image layer is created
|
||||
}
|
||||
|
||||
async fn get_vectored_impl_wrapper(
|
||||
tline: &Arc<Timeline>,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Option<Bytes>, GetVectoredError> {
|
||||
let mut reconstruct_state = ValuesReconstructState::new();
|
||||
let mut res = tline
|
||||
.get_vectored_impl(
|
||||
KeySpace::single(key..key.next()),
|
||||
lsn,
|
||||
&mut reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
Ok(res.pop_last().map(|(k, v)| {
|
||||
assert_eq!(k, key);
|
||||
v.unwrap()
|
||||
}))
|
||||
}
|
||||
|
||||
// test vectored get on parent timeline
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?,
|
||||
Some(test_img("data key 1"))
|
||||
);
|
||||
assert!(get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx)
|
||||
.await
|
||||
.unwrap_err()
|
||||
.is_missing_key_error());
|
||||
assert!(
|
||||
get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx)
|
||||
.await
|
||||
.unwrap_err()
|
||||
.is_missing_key_error()
|
||||
);
|
||||
|
||||
// test vectored get on child timeline
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?,
|
||||
Some(test_img("data key 1"))
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?,
|
||||
Some(test_img("data key 2"))
|
||||
);
|
||||
assert!(
|
||||
get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx)
|
||||
.await
|
||||
.unwrap_err()
|
||||
.is_missing_key_error()
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
|
||||
let mut base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap();
|
||||
let mut base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap();
|
||||
base_key.field1 = AUX_KEY_PREFIX;
|
||||
base_key_child.field1 = AUX_KEY_PREFIX;
|
||||
base_key_nonexist.field1 = AUX_KEY_PREFIX;
|
||||
|
||||
let mut lsn = Lsn(0x20);
|
||||
|
||||
{
|
||||
let mut writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
base_key,
|
||||
lsn,
|
||||
&Value::Image(test_img("metadata key 1")),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
writer.finish_write(lsn);
|
||||
drop(writer);
|
||||
|
||||
tline.freeze_and_flush().await?; // this will create an image layer
|
||||
|
||||
tline
|
||||
.compact(
|
||||
&cancel,
|
||||
{
|
||||
let mut set = EnumSet::empty();
|
||||
set.insert(CompactFlags::ForceImageLayerCreation);
|
||||
set.insert(CompactFlags::ForceRepartition);
|
||||
set
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await?; // force create an image layer for metadata keys
|
||||
tenant
|
||||
.gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
|
||||
.await?;
|
||||
}
|
||||
|
||||
let child = tenant
|
||||
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
lsn.0 += 0x10;
|
||||
|
||||
{
|
||||
let mut writer = child.writer().await;
|
||||
writer
|
||||
.put(
|
||||
base_key_child,
|
||||
lsn,
|
||||
&Value::Image(test_img("metadata key 2")),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
writer.finish_write(lsn);
|
||||
drop(writer);
|
||||
|
||||
child.freeze_and_flush().await?;
|
||||
|
||||
child
|
||||
.compact(
|
||||
&cancel,
|
||||
{
|
||||
let mut set = EnumSet::empty();
|
||||
set.insert(CompactFlags::ForceImageLayerCreation);
|
||||
set.insert(CompactFlags::ForceRepartition);
|
||||
set
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await?; // force create an image layer for metadata keys
|
||||
tenant
|
||||
.gc_iteration(Some(child.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
|
||||
.await?;
|
||||
}
|
||||
|
||||
async fn get_vectored_impl_wrapper(
|
||||
tline: &Arc<Timeline>,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Option<Bytes>, GetVectoredError> {
|
||||
let mut reconstruct_state = ValuesReconstructState::new();
|
||||
let mut res = tline
|
||||
.get_vectored_impl(
|
||||
KeySpace::single(key..key.next()),
|
||||
lsn,
|
||||
&mut reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
Ok(res.pop_last().map(|(k, v)| {
|
||||
assert_eq!(k, key);
|
||||
v.unwrap()
|
||||
}))
|
||||
}
|
||||
|
||||
// test vectored get on parent timeline
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?,
|
||||
Some(test_img("metadata key 1"))
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx).await?,
|
||||
None
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx).await?,
|
||||
None
|
||||
);
|
||||
|
||||
// test vectored get on child timeline
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?,
|
||||
None
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?,
|
||||
Some(test_img("metadata key 2"))
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx).await?,
|
||||
None
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -238,13 +238,10 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
io_buf,
|
||||
Err(Error::new(
|
||||
ErrorKind::Other,
|
||||
format!("blob too large ({len} bytes)"),
|
||||
format!("blob too large ({} bytes)", len),
|
||||
)),
|
||||
);
|
||||
}
|
||||
if len > 0x0fff_ffff {
|
||||
tracing::warn!("writing blob above future limit ({len} bytes)");
|
||||
}
|
||||
let mut len_buf = (len as u32).to_be_bytes();
|
||||
len_buf[0] |= 0x80;
|
||||
io_buf.extend_from_slice(&len_buf[..]);
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
use anyhow::bail;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::models::CompactionAlgorithm;
|
||||
use pageserver_api::models::CompactionAlgorithmSettings;
|
||||
use pageserver_api::models::EvictionPolicy;
|
||||
use pageserver_api::models::{self, ThrottleConfig};
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
|
||||
@@ -321,7 +320,7 @@ pub struct TenantConf {
|
||||
pub compaction_period: Duration,
|
||||
// Level0 delta layer threshold for compaction.
|
||||
pub compaction_threshold: usize,
|
||||
pub compaction_algorithm: CompactionAlgorithmSettings,
|
||||
pub compaction_algorithm: CompactionAlgorithm,
|
||||
// Determines how much history is retained, to allow
|
||||
// branching and read replicas at an older point in time.
|
||||
// The unit is #of bytes of WAL.
|
||||
@@ -407,7 +406,7 @@ pub struct TenantConfOpt {
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
|
||||
pub compaction_algorithm: Option<CompactionAlgorithm>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
@@ -498,9 +497,7 @@ impl TenantConfOpt {
|
||||
.unwrap_or(global_conf.compaction_threshold),
|
||||
compaction_algorithm: self
|
||||
.compaction_algorithm
|
||||
.as_ref()
|
||||
.unwrap_or(&global_conf.compaction_algorithm)
|
||||
.clone(),
|
||||
.unwrap_or(global_conf.compaction_algorithm),
|
||||
gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon),
|
||||
gc_period: self.gc_period.unwrap_or(global_conf.gc_period),
|
||||
image_creation_threshold: self
|
||||
@@ -553,9 +550,7 @@ impl Default for TenantConf {
|
||||
compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
|
||||
.expect("cannot parse default compaction period"),
|
||||
compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
|
||||
compaction_algorithm: CompactionAlgorithmSettings {
|
||||
kind: DEFAULT_COMPACTION_ALGORITHM,
|
||||
},
|
||||
compaction_algorithm: DEFAULT_COMPACTION_ALGORITHM,
|
||||
gc_horizon: DEFAULT_GC_HORIZON,
|
||||
gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
|
||||
.expect("cannot parse default gc period"),
|
||||
|
||||
@@ -7,7 +7,7 @@ use itertools::Itertools;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::models::LocationConfigMode;
|
||||
use pageserver_api::shard::{
|
||||
ShardCount, ShardIdentity, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId,
|
||||
ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
|
||||
};
|
||||
use pageserver_api::upcall_api::ReAttachResponseTenant;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
@@ -127,8 +127,6 @@ pub(crate) enum ShardSelector {
|
||||
First,
|
||||
/// Pick the shard that holds this key
|
||||
Page(Key),
|
||||
/// The shard ID is known: pick the given shard
|
||||
Known(ShardIndex),
|
||||
}
|
||||
|
||||
/// A convenience for use with the re_attach ControlPlaneClient function: rather
|
||||
@@ -2069,11 +2067,6 @@ impl TenantManager {
|
||||
return ShardResolveResult::Found(tenant.clone());
|
||||
}
|
||||
}
|
||||
ShardSelector::Known(shard)
|
||||
if tenant.shard_identity.shard_index() == shard =>
|
||||
{
|
||||
return ShardResolveResult::Found(tenant.clone());
|
||||
}
|
||||
_ => continue,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -62,10 +62,14 @@ use super::{
|
||||
CommandRequest, DownloadCommand,
|
||||
};
|
||||
|
||||
/// For each tenant, default period for how long must have passed since the last download_tenant call before
|
||||
/// calling it again. This default is replaced with the value of [`HeatMapTenant::upload_period_ms`] after first
|
||||
/// download, if the uploader populated it.
|
||||
const DEFAULT_DOWNLOAD_INTERVAL: Duration = Duration::from_millis(60000);
|
||||
/// For each tenant, how long must have passed since the last download_tenant call before
|
||||
/// calling it again. This is approximately the time by which local data is allowed
|
||||
/// to fall behind remote data.
|
||||
///
|
||||
/// TODO: this should just be a default, and the actual period should be controlled
|
||||
/// via the heatmap itself
|
||||
/// `<ttps://github.com/neondatabase/neon/issues/6200>`
|
||||
const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000);
|
||||
|
||||
/// Range of concurrency we may use when downloading layers within a timeline. This is independent
|
||||
/// for each tenant we're downloading: the concurrency of _tenants_ is defined separately in
|
||||
@@ -93,7 +97,7 @@ pub(super) async fn downloader_task(
|
||||
|
||||
scheduler
|
||||
.run(command_queue, background_jobs_can_start, cancel)
|
||||
.instrument(info_span!("secondary_download_scheduler"))
|
||||
.instrument(info_span!("secondary_downloads"))
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -148,22 +152,14 @@ pub(super) struct SecondaryDetailTimeline {
|
||||
pub(super) evicted_at: HashMap<LayerName, SystemTime>,
|
||||
}
|
||||
|
||||
// Aspects of a heatmap that we remember after downloading it
|
||||
#[derive(Clone, Debug)]
|
||||
struct DownloadSummary {
|
||||
etag: Etag,
|
||||
#[allow(unused)]
|
||||
mtime: SystemTime,
|
||||
upload_period: Duration,
|
||||
}
|
||||
|
||||
/// This state is written by the secondary downloader, it is opaque
|
||||
/// to TenantManager
|
||||
#[derive(Debug)]
|
||||
pub(super) struct SecondaryDetail {
|
||||
pub(super) config: SecondaryLocationConfig,
|
||||
|
||||
last_download: Option<DownloadSummary>,
|
||||
last_download: Option<Instant>,
|
||||
last_etag: Option<Etag>,
|
||||
next_download: Option<Instant>,
|
||||
pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
|
||||
}
|
||||
@@ -193,6 +189,7 @@ impl SecondaryDetail {
|
||||
Self {
|
||||
config,
|
||||
last_download: None,
|
||||
last_etag: None,
|
||||
next_download: None,
|
||||
timelines: HashMap::new(),
|
||||
}
|
||||
@@ -246,8 +243,9 @@ impl SecondaryDetail {
|
||||
|
||||
struct PendingDownload {
|
||||
secondary_state: Arc<SecondaryTenant>,
|
||||
last_download: Option<DownloadSummary>,
|
||||
last_download: Option<Instant>,
|
||||
target_time: Option<Instant>,
|
||||
period: Option<Duration>,
|
||||
}
|
||||
|
||||
impl scheduler::PendingJob for PendingDownload {
|
||||
@@ -297,17 +295,10 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
|
||||
tracing::debug!("Secondary tenant download completed");
|
||||
|
||||
// Update freshened_at even if there was an error: we don't want errored tenants to implicitly
|
||||
// take priority to run again.
|
||||
let mut detail = secondary_state.detail.lock().unwrap();
|
||||
|
||||
let period = detail
|
||||
.last_download
|
||||
.as_ref()
|
||||
.map(|d| d.upload_period)
|
||||
.unwrap_or(DEFAULT_DOWNLOAD_INTERVAL);
|
||||
|
||||
// We advance next_download irrespective of errors: we don't want error cases to result in
|
||||
// expensive busy-polling.
|
||||
detail.next_download = Some(Instant::now() + period_jitter(period, 5));
|
||||
detail.next_download = Some(Instant::now() + period_jitter(DOWNLOAD_FRESHEN_INTERVAL, 5));
|
||||
}
|
||||
|
||||
async fn schedule(&mut self) -> SchedulingResult<PendingDownload> {
|
||||
@@ -340,11 +331,11 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
if detail.next_download.is_none() {
|
||||
// Initialize randomly in the range from 0 to our interval: this uniformly spreads the start times. Subsequent
|
||||
// rounds will use a smaller jitter to avoid accidentally synchronizing later.
|
||||
detail.next_download = Some(now.checked_add(period_warmup(DEFAULT_DOWNLOAD_INTERVAL)).expect(
|
||||
detail.next_download = Some(now.checked_add(period_warmup(DOWNLOAD_FRESHEN_INTERVAL)).expect(
|
||||
"Using our constant, which is known to be small compared with clock range",
|
||||
));
|
||||
}
|
||||
(detail.last_download.clone(), detail.next_download.unwrap())
|
||||
(detail.last_download, detail.next_download.unwrap())
|
||||
};
|
||||
|
||||
if now > next_download {
|
||||
@@ -352,6 +343,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
secondary_state: secondary_tenant,
|
||||
last_download,
|
||||
target_time: Some(next_download),
|
||||
period: Some(DOWNLOAD_FRESHEN_INTERVAL),
|
||||
})
|
||||
} else {
|
||||
None
|
||||
@@ -377,6 +369,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
|
||||
Ok(PendingDownload {
|
||||
target_time: None,
|
||||
period: None,
|
||||
last_download: None,
|
||||
secondary_state: tenant,
|
||||
})
|
||||
@@ -393,6 +386,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
secondary_state,
|
||||
last_download,
|
||||
target_time,
|
||||
period,
|
||||
} = job;
|
||||
|
||||
let (completion, barrier) = utils::completion::channel();
|
||||
@@ -429,15 +423,20 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
|
||||
// If the job had a target execution time, we may check our final execution
|
||||
// time against that for observability purposes.
|
||||
if let (Some(target_time), Some(last_download)) = (target_time, last_download) {
|
||||
// Elapsed time includes any scheduling lag as well as the execution of the job
|
||||
let elapsed = Instant::now().duration_since(target_time);
|
||||
if let (Some(target_time), Some(period)) = (target_time, period) {
|
||||
// Only track execution lag if this isn't our first download: otherwise, it is expected
|
||||
// that execution will have taken longer than our configured interval, for example
|
||||
// when starting up a pageserver and
|
||||
if last_download.is_some() {
|
||||
// Elapsed time includes any scheduling lag as well as the execution of the job
|
||||
let elapsed = Instant::now().duration_since(target_time);
|
||||
|
||||
warn_when_period_overrun(
|
||||
elapsed,
|
||||
last_download.upload_period,
|
||||
BackgroundLoopKind::SecondaryDownload,
|
||||
);
|
||||
warn_when_period_overrun(
|
||||
elapsed,
|
||||
period,
|
||||
BackgroundLoopKind::SecondaryDownload,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
CompleteDownload {
|
||||
@@ -526,12 +525,12 @@ impl<'a> TenantDownloader<'a> {
|
||||
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
|
||||
|
||||
// We will use the etag from last successful download to make the download conditional on changes
|
||||
let last_download = self
|
||||
let last_etag = self
|
||||
.secondary_state
|
||||
.detail
|
||||
.lock()
|
||||
.unwrap()
|
||||
.last_download
|
||||
.last_etag
|
||||
.clone();
|
||||
|
||||
// Download the tenant's heatmap
|
||||
@@ -540,7 +539,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
etag: heatmap_etag,
|
||||
bytes: heatmap_bytes,
|
||||
} = match tokio::select!(
|
||||
bytes = self.download_heatmap(last_download.as_ref().map(|d| &d.etag)) => {bytes?},
|
||||
bytes = self.download_heatmap(last_etag.as_ref()) => {bytes?},
|
||||
_ = self.secondary_state.cancel.cancelled() => return Ok(())
|
||||
) {
|
||||
HeatMapDownload::Unmodified => {
|
||||
@@ -569,39 +568,6 @@ impl<'a> TenantDownloader<'a> {
|
||||
heatmap.timelines.len()
|
||||
);
|
||||
|
||||
// Get or initialize the local disk state for the timelines we will update
|
||||
let mut timeline_states = HashMap::new();
|
||||
for timeline in &heatmap.timelines {
|
||||
let timeline_state = self
|
||||
.secondary_state
|
||||
.detail
|
||||
.lock()
|
||||
.unwrap()
|
||||
.timelines
|
||||
.get(&timeline.timeline_id)
|
||||
.cloned();
|
||||
|
||||
let timeline_state = match timeline_state {
|
||||
Some(t) => t,
|
||||
None => {
|
||||
// We have no existing state: need to scan local disk for layers first.
|
||||
let timeline_state =
|
||||
init_timeline_state(self.conf, tenant_shard_id, timeline).await;
|
||||
|
||||
// Re-acquire detail lock now that we're done with async load from local FS
|
||||
self.secondary_state
|
||||
.detail
|
||||
.lock()
|
||||
.unwrap()
|
||||
.timelines
|
||||
.insert(timeline.timeline_id, timeline_state.clone());
|
||||
timeline_state
|
||||
}
|
||||
};
|
||||
|
||||
timeline_states.insert(timeline.timeline_id, timeline_state);
|
||||
}
|
||||
|
||||
// Clean up any local layers that aren't in the heatmap. We do this first for all timelines, on the general
|
||||
// principle that deletions should be done before writes wherever possible, and so that we can use this
|
||||
// phase to initialize our SecondaryProgress.
|
||||
@@ -612,10 +578,6 @@ impl<'a> TenantDownloader<'a> {
|
||||
|
||||
// Download the layers in the heatmap
|
||||
for timeline in heatmap.timelines {
|
||||
let timeline_state = timeline_states
|
||||
.remove(&timeline.timeline_id)
|
||||
.expect("Just populated above");
|
||||
|
||||
if self.secondary_state.cancel.is_cancelled() {
|
||||
tracing::debug!(
|
||||
"Cancelled before downloading timeline {}",
|
||||
@@ -625,7 +587,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
}
|
||||
|
||||
let timeline_id = timeline.timeline_id;
|
||||
self.download_timeline(timeline, timeline_state, ctx)
|
||||
self.download_timeline(timeline, ctx)
|
||||
.instrument(tracing::info_span!(
|
||||
"secondary_download_timeline",
|
||||
tenant_id=%tenant_shard_id.tenant_id,
|
||||
@@ -637,30 +599,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
|
||||
// Only update last_etag after a full successful download: this way will not skip
|
||||
// the next download, even if the heatmap's actual etag is unchanged.
|
||||
self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary {
|
||||
etag: heatmap_etag,
|
||||
mtime: heatmap_mtime,
|
||||
upload_period: heatmap
|
||||
.upload_period_ms
|
||||
.map(|ms| Duration::from_millis(ms as u64))
|
||||
.unwrap_or(DEFAULT_DOWNLOAD_INTERVAL),
|
||||
});
|
||||
|
||||
// Robustness: we should have updated progress properly, but in case we didn't, make sure
|
||||
// we don't leave the tenant in a state where we claim to have successfully downloaded
|
||||
// everything, but our progress is incomplete. The invariant here should be that if
|
||||
// we have set `last_download` to this heatmap's etag, then the next time we see that
|
||||
// etag we can safely do no work (i.e. we must be complete).
|
||||
let mut progress = self.secondary_state.progress.lock().unwrap();
|
||||
debug_assert!(progress.layers_downloaded == progress.layers_total);
|
||||
debug_assert!(progress.bytes_downloaded == progress.bytes_total);
|
||||
if progress.layers_downloaded != progress.layers_total
|
||||
|| progress.bytes_downloaded != progress.bytes_total
|
||||
{
|
||||
tracing::warn!("Correcting drift in progress stats ({progress:?})");
|
||||
progress.layers_downloaded = progress.layers_total;
|
||||
progress.bytes_downloaded = progress.bytes_total;
|
||||
}
|
||||
self.secondary_state.detail.lock().unwrap().last_etag = Some(heatmap_etag);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -837,7 +776,6 @@ impl<'a> TenantDownloader<'a> {
|
||||
async fn download_timeline(
|
||||
&self,
|
||||
timeline: HeatMapTimeline,
|
||||
timeline_state: SecondaryDetailTimeline,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), UpdateError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
@@ -846,6 +784,34 @@ impl<'a> TenantDownloader<'a> {
|
||||
// Accumulate updates to the state
|
||||
let mut touched = Vec::new();
|
||||
|
||||
// Clone a view of what layers already exist on disk
|
||||
let timeline_state = self
|
||||
.secondary_state
|
||||
.detail
|
||||
.lock()
|
||||
.unwrap()
|
||||
.timelines
|
||||
.get(&timeline.timeline_id)
|
||||
.cloned();
|
||||
|
||||
let timeline_state = match timeline_state {
|
||||
Some(t) => t,
|
||||
None => {
|
||||
// We have no existing state: need to scan local disk for layers first.
|
||||
let timeline_state =
|
||||
init_timeline_state(self.conf, tenant_shard_id, &timeline).await;
|
||||
|
||||
// Re-acquire detail lock now that we're done with async load from local FS
|
||||
self.secondary_state
|
||||
.detail
|
||||
.lock()
|
||||
.unwrap()
|
||||
.timelines
|
||||
.insert(timeline.timeline_id, timeline_state.clone());
|
||||
timeline_state
|
||||
}
|
||||
};
|
||||
|
||||
tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
|
||||
|
||||
let mut download_futs = Vec::new();
|
||||
@@ -1013,11 +979,6 @@ impl<'a> TenantDownloader<'a> {
|
||||
);
|
||||
|
||||
// Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
|
||||
tracing::info!(
|
||||
"Starting download of layer {}, size {}",
|
||||
layer.name,
|
||||
layer.metadata.file_size
|
||||
);
|
||||
let downloaded_bytes = match download_layer_file(
|
||||
self.conf,
|
||||
self.remote_storage,
|
||||
@@ -1040,14 +1001,6 @@ impl<'a> TenantDownloader<'a> {
|
||||
"Skipped downloading missing layer {}, raced with compaction/gc?",
|
||||
layer.name
|
||||
);
|
||||
|
||||
// If the layer is 404, adjust the progress statistics to reflect that we will not download it.
|
||||
let mut progress = self.secondary_state.progress.lock().unwrap();
|
||||
progress.layers_total = progress.layers_total.saturating_sub(1);
|
||||
progress.bytes_total = progress
|
||||
.bytes_total
|
||||
.saturating_sub(layer.metadata.file_size);
|
||||
|
||||
return Ok(None);
|
||||
}
|
||||
Err(e) => return Err(e.into()),
|
||||
|
||||
@@ -53,7 +53,7 @@ pub(super) async fn heatmap_uploader_task(
|
||||
|
||||
scheduler
|
||||
.run(command_queue, background_jobs_can_start, cancel)
|
||||
.instrument(info_span!("heatmap_upload_scheduler"))
|
||||
.instrument(info_span!("heatmap_uploader"))
|
||||
.await
|
||||
}
|
||||
|
||||
|
||||
@@ -179,13 +179,6 @@ where
|
||||
// Schedule some work, if concurrency limit permits it
|
||||
self.spawn_pending();
|
||||
|
||||
// This message is printed every scheduling iteration as proof of liveness when looking at logs
|
||||
tracing::info!(
|
||||
"Status: {} tasks running, {} pending",
|
||||
self.running.len(),
|
||||
self.pending.len()
|
||||
);
|
||||
|
||||
// Between scheduling iterations, we will:
|
||||
// - Drain any complete tasks and spawn pending tasks
|
||||
// - Handle incoming administrative commands
|
||||
@@ -265,11 +258,7 @@ where
|
||||
|
||||
self.tasks.spawn(fut);
|
||||
|
||||
let replaced = self.running.insert(tenant_shard_id, in_progress);
|
||||
debug_assert!(replaced.is_none());
|
||||
if replaced.is_some() {
|
||||
tracing::warn!(%tenant_shard_id, "Unexpectedly spawned a task when one was already running")
|
||||
}
|
||||
self.running.insert(tenant_shard_id, in_progress);
|
||||
}
|
||||
|
||||
/// For all pending tenants that are elegible for execution, spawn their task.
|
||||
@@ -279,9 +268,7 @@ where
|
||||
while !self.pending.is_empty() && self.running.len() < self.concurrency {
|
||||
// unwrap: loop condition includes !is_empty()
|
||||
let pending = self.pending.pop_front().unwrap();
|
||||
if !self.running.contains_key(pending.get_tenant_shard_id()) {
|
||||
self.do_spawn(pending);
|
||||
}
|
||||
self.do_spawn(pending);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -334,8 +321,7 @@ where
|
||||
|
||||
let tenant_shard_id = job.get_tenant_shard_id();
|
||||
let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
|
||||
tracing::info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
|
||||
"Command already running, waiting for it");
|
||||
tracing::info!("Command already running, waiting for it");
|
||||
barrier
|
||||
} else {
|
||||
let running = self.spawn_now(job);
|
||||
|
||||
@@ -113,20 +113,12 @@ impl From<VectoredValueReconstructState> for ValueReconstructState {
|
||||
}
|
||||
}
|
||||
|
||||
/// Bag of data accumulated during a vectored get..
|
||||
/// Bag of data accumulated during a vectored get
|
||||
pub(crate) struct ValuesReconstructState {
|
||||
/// The keys will be removed after `get_vectored` completes. The caller outside `Timeline`
|
||||
/// should not expect to get anything from this hashmap.
|
||||
pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,
|
||||
/// The keys which are already retrieved
|
||||
|
||||
keys_done: KeySpaceRandomAccum,
|
||||
|
||||
/// The keys covered by the image layers
|
||||
keys_with_image_coverage: Option<Range<Key>>,
|
||||
|
||||
// Statistics that are still accessible as a caller of `get_vectored_impl`.
|
||||
layers_visited: u32,
|
||||
delta_layers_visited: u32,
|
||||
}
|
||||
|
||||
impl ValuesReconstructState {
|
||||
@@ -134,9 +126,7 @@ impl ValuesReconstructState {
|
||||
Self {
|
||||
keys: HashMap::new(),
|
||||
keys_done: KeySpaceRandomAccum::new(),
|
||||
keys_with_image_coverage: None,
|
||||
layers_visited: 0,
|
||||
delta_layers_visited: 0,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -150,17 +140,8 @@ impl ValuesReconstructState {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) {
|
||||
pub(crate) fn on_layer_visited(&mut self) {
|
||||
self.layers_visited += 1;
|
||||
if let ReadableLayer::PersistentLayer(layer) = layer {
|
||||
if layer.layer_desc().is_delta() {
|
||||
self.delta_layers_visited += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_delta_layers_visited(&self) -> u32 {
|
||||
self.delta_layers_visited
|
||||
}
|
||||
|
||||
pub(crate) fn get_layers_visited(&self) -> u32 {
|
||||
@@ -190,16 +171,6 @@ impl ValuesReconstructState {
|
||||
}
|
||||
}
|
||||
|
||||
/// On hitting image layer, we can mark all keys in this range as done, because
|
||||
/// if the image layer does not contain a key, it is deleted/never added.
|
||||
pub(crate) fn on_image_layer_visited(&mut self, key_range: &Range<Key>) {
|
||||
let prev_val = self.keys_with_image_coverage.replace(key_range.clone());
|
||||
assert_eq!(
|
||||
prev_val, None,
|
||||
"should consume the keyspace before the next iteration"
|
||||
);
|
||||
}
|
||||
|
||||
/// Update the state collected for a given key.
|
||||
/// Returns true if this was the last value needed for the key and false otherwise.
|
||||
///
|
||||
@@ -262,12 +233,8 @@ impl ValuesReconstructState {
|
||||
|
||||
/// Returns the key space describing the keys that have
|
||||
/// been marked as completed since the last call to this function.
|
||||
/// Returns individual keys done, and the image layer coverage.
|
||||
pub(crate) fn consume_done_keys(&mut self) -> (KeySpace, Option<Range<Key>>) {
|
||||
(
|
||||
self.keys_done.consume_keyspace(),
|
||||
self.keys_with_image_coverage.take(),
|
||||
)
|
||||
pub(crate) fn consume_done_keys(&mut self) -> KeySpace {
|
||||
self.keys_done.consume_keyspace()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -158,7 +158,6 @@ pub struct ImageLayerInner {
|
||||
index_start_blk: u32,
|
||||
index_root_blk: u32,
|
||||
|
||||
key_range: Range<Key>,
|
||||
lsn: Lsn,
|
||||
|
||||
file: VirtualFile,
|
||||
@@ -420,7 +419,6 @@ impl ImageLayerInner {
|
||||
file,
|
||||
file_id,
|
||||
max_vectored_read_bytes,
|
||||
key_range: actual_summary.key_range,
|
||||
}))
|
||||
}
|
||||
|
||||
@@ -480,8 +478,6 @@ impl ImageLayerInner {
|
||||
self.do_reads_and_update_state(reads, reconstruct_state, ctx)
|
||||
.await;
|
||||
|
||||
reconstruct_state.on_image_layer_visited(&self.key_range);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -650,7 +646,7 @@ impl ImageLayerWriterInner {
|
||||
lsn,
|
||||
},
|
||||
);
|
||||
trace!("creating image layer {}", path);
|
||||
info!("new image layer {path}");
|
||||
let mut file = {
|
||||
VirtualFile::open_with_options(
|
||||
&path,
|
||||
@@ -770,7 +766,7 @@ impl ImageLayerWriterInner {
|
||||
// FIXME: why not carry the virtualfile here, it supports renaming?
|
||||
let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
|
||||
|
||||
info!("created image layer {}", layer.local_path());
|
||||
trace!("created image layer {}", layer.local_path());
|
||||
|
||||
Ok(layer)
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@ use std::time::{Duration, SystemTime};
|
||||
use tracing::Instrument;
|
||||
use utils::id::TimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::sync::{gate, heavier_once_cell};
|
||||
use utils::sync::heavier_once_cell;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
@@ -1264,7 +1264,6 @@ impl LayerInner {
|
||||
lsn_end: lsn_range.end,
|
||||
remote: !resident,
|
||||
access_stats,
|
||||
l0: crate::tenant::layer_map::LayerMap::is_l0(self.layer_desc()),
|
||||
}
|
||||
} else {
|
||||
let lsn = self.desc.image_layer_lsn();
|
||||
@@ -1333,7 +1332,7 @@ impl LayerInner {
|
||||
|
||||
is_good_to_continue(&rx.borrow_and_update())?;
|
||||
|
||||
let Ok(gate) = timeline.gate.enter() else {
|
||||
let Ok(_gate) = timeline.gate.enter() else {
|
||||
return Err(EvictionCancelled::TimelineGone);
|
||||
};
|
||||
|
||||
@@ -1421,7 +1420,7 @@ impl LayerInner {
|
||||
Self::spawn_blocking(move || {
|
||||
let _span = span.entered();
|
||||
|
||||
let res = self.evict_blocking(&timeline, &gate, &permit);
|
||||
let res = self.evict_blocking(&timeline, &permit);
|
||||
|
||||
let waiters = self.inner.initializer_count();
|
||||
|
||||
@@ -1447,7 +1446,6 @@ impl LayerInner {
|
||||
fn evict_blocking(
|
||||
&self,
|
||||
timeline: &Timeline,
|
||||
_gate: &gate::GateGuard,
|
||||
_permit: &heavier_once_cell::InitPermit,
|
||||
) -> Result<(), EvictionCancelled> {
|
||||
// now accesses to `self.inner.get_or_init*` wait on the semaphore or the `_permit`
|
||||
|
||||
@@ -347,33 +347,37 @@ impl<'de> serde::de::Visitor<'de> for LayerNameVisitor {
|
||||
mod test {
|
||||
use super::*;
|
||||
#[test]
|
||||
fn image_layer_parse() {
|
||||
fn image_layer_parse() -> anyhow::Result<()> {
|
||||
let expected = LayerName::Image(ImageLayerName {
|
||||
key_range: Key::from_i128(0)
|
||||
..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
|
||||
lsn: Lsn::from_hex("00000000014FED58").unwrap(),
|
||||
});
|
||||
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").unwrap();
|
||||
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").map_err(|s| anyhow::anyhow!(s))?;
|
||||
assert_eq!(parsed, expected,);
|
||||
|
||||
// Omitting generation suffix is valid
|
||||
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").unwrap();
|
||||
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").map_err(|s| anyhow::anyhow!(s))?;
|
||||
assert_eq!(parsed, expected,);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn delta_layer_parse() {
|
||||
fn delta_layer_parse() -> anyhow::Result<()> {
|
||||
let expected = LayerName::Delta(DeltaLayerName {
|
||||
key_range: Key::from_i128(0)
|
||||
..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
|
||||
lsn_range: Lsn::from_hex("00000000014FED58").unwrap()
|
||||
..Lsn::from_hex("000000000154C481").unwrap(),
|
||||
});
|
||||
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").unwrap();
|
||||
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").map_err(|s| anyhow::anyhow!(s))?;
|
||||
assert_eq!(parsed, expected);
|
||||
|
||||
// Omitting generation suffix is valid
|
||||
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").unwrap();
|
||||
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").map_err(|s| anyhow::anyhow!(s))?;
|
||||
assert_eq!(parsed, expected);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,14 +18,14 @@ use fail::fail_point;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::{
|
||||
key::{
|
||||
AUX_FILES_KEY, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX,
|
||||
NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE,
|
||||
AUX_FILES_KEY, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
|
||||
NON_INHERITED_SPARSE_RANGE,
|
||||
},
|
||||
keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
|
||||
keyspace::{KeySpaceAccum, SparseKeyPartitioning},
|
||||
models::{
|
||||
AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, CompactionAlgorithmSettings,
|
||||
DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
|
||||
InMemoryLayerInfo, LayerMapInfo, LsnLease, TimelineState,
|
||||
AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo,
|
||||
DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo,
|
||||
TimelineState,
|
||||
},
|
||||
reltag::BlockNumber,
|
||||
shard::{ShardIdentity, ShardNumber, TenantShardId},
|
||||
@@ -60,6 +60,7 @@ use std::{
|
||||
ops::ControlFlow,
|
||||
};
|
||||
|
||||
use crate::tenant::timeline::init::LocalLayerFileMetadata;
|
||||
use crate::{
|
||||
aux_file::AuxFileSizeEstimator,
|
||||
tenant::{
|
||||
@@ -88,9 +89,6 @@ use crate::{
|
||||
metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize,
|
||||
};
|
||||
use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};
|
||||
use crate::{
|
||||
pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::timeline::init::LocalLayerFileMetadata,
|
||||
};
|
||||
use crate::{
|
||||
pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind},
|
||||
virtual_file::{MaybeFatalIo, VirtualFile},
|
||||
@@ -269,8 +267,6 @@ pub struct Timeline {
|
||||
// Atomic would be more appropriate here.
|
||||
last_freeze_ts: RwLock<Instant>,
|
||||
|
||||
pub(crate) standby_horizon: AtomicLsn,
|
||||
|
||||
// WAL redo manager. `None` only for broken tenants.
|
||||
walredo_mgr: Option<Arc<super::WalRedoManager>>,
|
||||
|
||||
@@ -350,8 +346,8 @@ pub struct Timeline {
|
||||
// though let's keep them both for better error visibility.
|
||||
pub initdb_lsn: Lsn,
|
||||
|
||||
/// When did we last calculate the partitioning? Make it pub to test cases.
|
||||
pub(super) partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,
|
||||
/// When did we last calculate the partitioning?
|
||||
partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,
|
||||
|
||||
/// Configuration: how often should the partitioning be recalculated.
|
||||
repartition_threshold: u64,
|
||||
@@ -485,11 +481,6 @@ impl GcCutoffs {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct TimelineVisitOutcome {
|
||||
completed_keyspace: KeySpace,
|
||||
image_covered_keyspace: KeySpace,
|
||||
}
|
||||
|
||||
/// An error happened in a get() operation.
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum PageReconstructError {
|
||||
@@ -514,13 +505,6 @@ pub(crate) enum PageReconstructError {
|
||||
MissingKey(MissingKeyError),
|
||||
}
|
||||
|
||||
impl GetVectoredError {
|
||||
#[cfg(test)]
|
||||
pub(crate) fn is_missing_key_error(&self) -> bool {
|
||||
matches!(self, Self::MissingKey(_))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct MissingKeyError {
|
||||
key: Key,
|
||||
@@ -798,11 +782,6 @@ pub(crate) enum ShutdownMode {
|
||||
Hard,
|
||||
}
|
||||
|
||||
struct ImageLayerCreationOutcome {
|
||||
image: Option<ResidentLayer>,
|
||||
next_start_key: Key,
|
||||
}
|
||||
|
||||
/// Public interface functions
|
||||
impl Timeline {
|
||||
/// Get the LSN where this branch was created
|
||||
@@ -904,7 +883,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
let vectored_res = self
|
||||
.get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
|
||||
.get_vectored_impl(keyspace.clone(), lsn, reconstruct_state, ctx)
|
||||
.await;
|
||||
|
||||
if self.conf.validate_vectored_get {
|
||||
@@ -1049,12 +1028,7 @@ impl Timeline {
|
||||
}
|
||||
GetVectoredImpl::Vectored => {
|
||||
let vectored_res = self
|
||||
.get_vectored_impl(
|
||||
keyspace.clone(),
|
||||
lsn,
|
||||
&mut ValuesReconstructState::new(),
|
||||
ctx,
|
||||
)
|
||||
.get_vectored_impl(keyspace.clone(), lsn, ValuesReconstructState::new(), ctx)
|
||||
.await;
|
||||
|
||||
if self.conf.validate_vectored_get {
|
||||
@@ -1142,7 +1116,7 @@ impl Timeline {
|
||||
.get_vectored_impl(
|
||||
keyspace.clone(),
|
||||
lsn,
|
||||
&mut ValuesReconstructState::default(),
|
||||
ValuesReconstructState::default(),
|
||||
ctx,
|
||||
)
|
||||
.await;
|
||||
@@ -1219,7 +1193,7 @@ impl Timeline {
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
lsn: Lsn,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
mut reconstruct_state: ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
|
||||
let get_kind = if keyspace.total_raw_size() == 1 {
|
||||
@@ -1231,7 +1205,7 @@ impl Timeline {
|
||||
let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME
|
||||
.for_get_kind(get_kind)
|
||||
.start_timer();
|
||||
self.get_vectored_reconstruct_data(keyspace, lsn, reconstruct_state, ctx)
|
||||
self.get_vectored_reconstruct_data(keyspace, lsn, &mut reconstruct_state, ctx)
|
||||
.await?;
|
||||
get_data_timer.stop_and_record();
|
||||
|
||||
@@ -1240,8 +1214,7 @@ impl Timeline {
|
||||
.start_timer();
|
||||
let mut results: BTreeMap<Key, Result<Bytes, PageReconstructError>> = BTreeMap::new();
|
||||
let layers_visited = reconstruct_state.get_layers_visited();
|
||||
|
||||
for (key, res) in std::mem::take(&mut reconstruct_state.keys) {
|
||||
for (key, res) in reconstruct_state.keys {
|
||||
match res {
|
||||
Err(err) => {
|
||||
results.insert(key, Err(err));
|
||||
@@ -1532,20 +1505,6 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Obtains a temporary lease blocking garbage collection for the given LSN
|
||||
pub(crate) fn make_lsn_lease(
|
||||
&self,
|
||||
_lsn: Lsn,
|
||||
_ctx: &RequestContext,
|
||||
) -> anyhow::Result<LsnLease> {
|
||||
const LEASE_LENGTH: Duration = Duration::from_secs(5 * 60);
|
||||
let lease = LsnLease {
|
||||
valid_until: SystemTime::now() + LEASE_LENGTH,
|
||||
};
|
||||
// TODO: dummy implementation
|
||||
Ok(lease)
|
||||
}
|
||||
|
||||
/// Flush to disk all data that was written with the put_* functions
|
||||
#[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
|
||||
pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> {
|
||||
@@ -1700,7 +1659,7 @@ impl Timeline {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
match self.get_compaction_algorithm_settings().kind {
|
||||
match self.get_compaction_algorithm() {
|
||||
CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await,
|
||||
CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await,
|
||||
}
|
||||
@@ -2096,14 +2055,12 @@ impl Timeline {
|
||||
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
|
||||
}
|
||||
|
||||
fn get_compaction_algorithm_settings(&self) -> CompactionAlgorithmSettings {
|
||||
fn get_compaction_algorithm(&self) -> CompactionAlgorithm {
|
||||
let tenant_conf = &self.tenant_conf.load();
|
||||
tenant_conf
|
||||
.tenant_conf
|
||||
.compaction_algorithm
|
||||
.as_ref()
|
||||
.unwrap_or(&self.conf.default_tenant_conf.compaction_algorithm)
|
||||
.clone()
|
||||
.unwrap_or(self.conf.default_tenant_conf.compaction_algorithm)
|
||||
}
|
||||
|
||||
fn get_eviction_policy(&self) -> EvictionPolicy {
|
||||
@@ -2297,8 +2254,6 @@ impl Timeline {
|
||||
compaction_lock: tokio::sync::Mutex::default(),
|
||||
gc_lock: tokio::sync::Mutex::default(),
|
||||
|
||||
standby_horizon: AtomicLsn::new(0),
|
||||
|
||||
timeline_get_throttle: resources.timeline_get_throttle,
|
||||
|
||||
aux_files: tokio::sync::Mutex::new(AuxFilesState {
|
||||
@@ -3332,15 +3287,12 @@ impl Timeline {
|
||||
|
||||
let mut cont_lsn = Lsn(request_lsn.0 + 1);
|
||||
|
||||
let missing_keyspace = loop {
|
||||
loop {
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(GetVectoredError::Cancelled);
|
||||
}
|
||||
|
||||
let TimelineVisitOutcome {
|
||||
completed_keyspace: completed,
|
||||
image_covered_keyspace,
|
||||
} = Self::get_vectored_reconstruct_data_timeline(
|
||||
let completed = Self::get_vectored_reconstruct_data_timeline(
|
||||
timeline,
|
||||
keyspace.clone(),
|
||||
cont_lsn,
|
||||
@@ -3359,31 +3311,12 @@ impl Timeline {
|
||||
ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE],
|
||||
});
|
||||
|
||||
// Keyspace is fully retrieved
|
||||
if keyspace.is_empty() {
|
||||
break None;
|
||||
// Keyspace is fully retrieved, no ancestor timeline, or metadata scan (where we do not look
|
||||
// into ancestor timelines). TODO: is there any other metadata which we want to inherit?
|
||||
if keyspace.total_raw_size() == 0 || timeline.ancestor_timeline.is_none() {
|
||||
break;
|
||||
}
|
||||
|
||||
// Not fully retrieved but no ancestor timeline.
|
||||
if timeline.ancestor_timeline.is_none() {
|
||||
break Some(keyspace);
|
||||
}
|
||||
|
||||
// Now we see if there are keys covered by the image layer but does not exist in the
|
||||
// image layer, which means that the key does not exist.
|
||||
|
||||
// The block below will stop the vectored search if any of the keys encountered an image layer
|
||||
// which did not contain a snapshot for said key. Since we have already removed all completed
|
||||
// keys from `keyspace`, we expect there to be no overlap between it and the image covered key
|
||||
// space. If that's not the case, we had at least one key encounter a gap in the image layer
|
||||
// and stop the search as a result of that.
|
||||
let removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
|
||||
if !removed.is_empty() {
|
||||
break Some(removed);
|
||||
}
|
||||
// If we reached this point, `remove_overlapping_with` should not have made any change to the
|
||||
// keyspace.
|
||||
|
||||
// Take the min to avoid reconstructing a page with data newer than request Lsn.
|
||||
cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1));
|
||||
timeline_owned = timeline
|
||||
@@ -3391,14 +3324,14 @@ impl Timeline {
|
||||
.await
|
||||
.map_err(GetVectoredError::GetReadyAncestorError)?;
|
||||
timeline = &*timeline_owned;
|
||||
};
|
||||
}
|
||||
|
||||
if let Some(missing_keyspace) = missing_keyspace {
|
||||
if keyspace.total_raw_size() != 0 {
|
||||
return Err(GetVectoredError::MissingKey(MissingKeyError {
|
||||
key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */
|
||||
key: keyspace.start().unwrap(), /* better if we can store the full keyspace */
|
||||
shard: self
|
||||
.shard_identity
|
||||
.get_shard_number(&missing_keyspace.start().unwrap()),
|
||||
.get_shard_number(&keyspace.start().unwrap()),
|
||||
cont_lsn,
|
||||
request_lsn,
|
||||
ancestor_lsn: Some(timeline.ancestor_lsn),
|
||||
@@ -3423,9 +3356,6 @@ impl Timeline {
|
||||
///
|
||||
/// At each iteration pop the top of the fringe (the layer with the highest Lsn)
|
||||
/// and get all the required reconstruct data from the layer in one go.
|
||||
///
|
||||
/// Returns the completed keyspace and the keyspaces with image coverage. The caller
|
||||
/// decides how to deal with these two keyspaces.
|
||||
async fn get_vectored_reconstruct_data_timeline(
|
||||
timeline: &Timeline,
|
||||
keyspace: KeySpace,
|
||||
@@ -3433,27 +3363,20 @@ impl Timeline {
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<TimelineVisitOutcome, GetVectoredError> {
|
||||
) -> Result<KeySpace, GetVectoredError> {
|
||||
let mut unmapped_keyspace = keyspace.clone();
|
||||
let mut fringe = LayerFringe::new();
|
||||
|
||||
let mut completed_keyspace = KeySpace::default();
|
||||
let mut image_covered_keyspace = KeySpaceRandomAccum::new();
|
||||
|
||||
loop {
|
||||
if cancel.is_cancelled() {
|
||||
return Err(GetVectoredError::Cancelled);
|
||||
}
|
||||
|
||||
let (keys_done_last_step, keys_with_image_coverage) =
|
||||
reconstruct_state.consume_done_keys();
|
||||
let keys_done_last_step = reconstruct_state.consume_done_keys();
|
||||
unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
|
||||
completed_keyspace.merge(&keys_done_last_step);
|
||||
if let Some(keys_with_image_coverage) = keys_with_image_coverage {
|
||||
unmapped_keyspace
|
||||
.remove_overlapping_with(&KeySpace::single(keys_with_image_coverage.clone()));
|
||||
image_covered_keyspace.add_range(keys_with_image_coverage);
|
||||
}
|
||||
|
||||
// Do not descent any further if the last layer we visited
|
||||
// completed all keys in the keyspace it inspected. This is not
|
||||
@@ -3525,16 +3448,13 @@ impl Timeline {
|
||||
unmapped_keyspace = keyspace_to_read;
|
||||
cont_lsn = next_cont_lsn;
|
||||
|
||||
reconstruct_state.on_layer_visited(&layer_to_read);
|
||||
reconstruct_state.on_layer_visited();
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(TimelineVisitOutcome {
|
||||
completed_keyspace,
|
||||
image_covered_keyspace: image_covered_keyspace.consume_keyspace(),
|
||||
})
|
||||
Ok(completed_keyspace)
|
||||
}
|
||||
|
||||
/// # Cancel-safety
|
||||
@@ -3882,7 +3802,7 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
let (layers_to_upload, delta_layers_to_add) = if create_image_layer {
|
||||
let (layers_to_upload, delta_layer_to_add) = if create_image_layer {
|
||||
// Note: The 'ctx' in use here has DownloadBehavior::Error. We should not
|
||||
// require downloading anything during initial import.
|
||||
let ((rel_partition, metadata_partition), _lsn) = self
|
||||
@@ -3899,23 +3819,26 @@ impl Timeline {
|
||||
}
|
||||
|
||||
// For metadata, always create delta layers.
|
||||
let delta_layers = if !metadata_partition.parts.is_empty() {
|
||||
// In the current implementation, the metadata partition will only have one part, and the part will only have
|
||||
// one single key range. This might change in the future.
|
||||
let mut delta_layers_created = Vec::new();
|
||||
for ks in &metadata_partition.parts {
|
||||
for range in &ks.0.ranges {
|
||||
let layer = self
|
||||
.create_delta_layer(&frozen_layer, Some(range.clone()), ctx)
|
||||
.await?;
|
||||
if let Some(layer) = layer {
|
||||
delta_layers_created.push(layer);
|
||||
}
|
||||
}
|
||||
}
|
||||
delta_layers_created
|
||||
let delta_layer = if !metadata_partition.parts.is_empty() {
|
||||
assert_eq!(
|
||||
metadata_partition.parts.len(),
|
||||
1,
|
||||
"currently sparse keyspace should only contain a single aux file keyspace"
|
||||
);
|
||||
let metadata_keyspace = &metadata_partition.parts[0];
|
||||
assert_eq!(
|
||||
metadata_keyspace.0.ranges.len(),
|
||||
1,
|
||||
"aux file keyspace should be a single range"
|
||||
);
|
||||
self.create_delta_layer(
|
||||
&frozen_layer,
|
||||
Some(metadata_keyspace.0.ranges[0].clone()),
|
||||
ctx,
|
||||
)
|
||||
.await?
|
||||
} else {
|
||||
Vec::new()
|
||||
None
|
||||
};
|
||||
|
||||
// For image layers, we add them immediately into the layer map.
|
||||
@@ -3930,8 +3853,12 @@ impl Timeline {
|
||||
.await?,
|
||||
);
|
||||
|
||||
layers_to_upload.extend(delta_layers.iter().cloned());
|
||||
(layers_to_upload, delta_layers)
|
||||
if let Some(delta_layer) = delta_layer {
|
||||
layers_to_upload.push(delta_layer.clone());
|
||||
(layers_to_upload, Some(delta_layer))
|
||||
} else {
|
||||
(layers_to_upload, None)
|
||||
}
|
||||
} else {
|
||||
// Normal case, write out a L0 delta layer file.
|
||||
// `create_delta_layer` will not modify the layer map.
|
||||
@@ -3939,7 +3866,12 @@ impl Timeline {
|
||||
let Some(layer) = self.create_delta_layer(&frozen_layer, None, ctx).await? else {
|
||||
panic!("delta layer cannot be empty if no filter is applied");
|
||||
};
|
||||
(vec![layer.clone()], vec![layer])
|
||||
(
|
||||
// FIXME: even though we have a single image and single delta layer assumption
|
||||
// we push them to vec
|
||||
vec![layer.clone()],
|
||||
Some(layer),
|
||||
)
|
||||
};
|
||||
|
||||
pausable_failpoint!("flush-layer-cancel-after-writing-layer-out-pausable");
|
||||
@@ -3960,7 +3892,7 @@ impl Timeline {
|
||||
return Err(FlushLayerError::Cancelled);
|
||||
}
|
||||
|
||||
guard.finish_flush_l0_layer(&delta_layers_to_add, &frozen_layer, &self.metrics);
|
||||
guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics);
|
||||
|
||||
if self.set_disk_consistent_lsn(disk_consistent_lsn) {
|
||||
// Schedule remote uploads that will reflect our new disk_consistent_lsn
|
||||
@@ -4202,176 +4134,6 @@ impl Timeline {
|
||||
false
|
||||
}
|
||||
|
||||
/// Create image layers for Postgres data. Assumes the caller passes a partition that is not too large,
|
||||
/// so that at most one image layer will be produced from this function.
|
||||
async fn create_image_layer_for_rel_blocks(
|
||||
self: &Arc<Self>,
|
||||
partition: &KeySpace,
|
||||
mut image_layer_writer: ImageLayerWriter,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
img_range: Range<Key>,
|
||||
start: Key,
|
||||
) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
|
||||
let mut wrote_keys = false;
|
||||
|
||||
let mut key_request_accum = KeySpaceAccum::new();
|
||||
for range in &partition.ranges {
|
||||
let mut key = range.start;
|
||||
while key < range.end {
|
||||
// Decide whether to retain this key: usually we do, but sharded tenants may
|
||||
// need to drop keys that don't belong to them. If we retain the key, add it
|
||||
// to `key_request_accum` for later issuing a vectored get
|
||||
if self.shard_identity.is_key_disposable(&key) {
|
||||
debug!(
|
||||
"Dropping key {} during compaction (it belongs on shard {:?})",
|
||||
key,
|
||||
self.shard_identity.get_shard_number(&key)
|
||||
);
|
||||
} else {
|
||||
key_request_accum.add_key(key);
|
||||
}
|
||||
|
||||
let last_key_in_range = key.next() == range.end;
|
||||
key = key.next();
|
||||
|
||||
// Maybe flush `key_rest_accum`
|
||||
if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS
|
||||
|| (last_key_in_range && key_request_accum.raw_size() > 0)
|
||||
{
|
||||
let results = self
|
||||
.get_vectored(key_request_accum.consume_keyspace(), lsn, ctx)
|
||||
.await?;
|
||||
|
||||
for (img_key, img) in results {
|
||||
let img = match img {
|
||||
Ok(img) => img,
|
||||
Err(err) => {
|
||||
// If we fail to reconstruct a VM or FSM page, we can zero the
|
||||
// page without losing any actual user data. That seems better
|
||||
// than failing repeatedly and getting stuck.
|
||||
//
|
||||
// We had a bug at one point, where we truncated the FSM and VM
|
||||
// in the pageserver, but the Postgres didn't know about that
|
||||
// and continued to generate incremental WAL records for pages
|
||||
// that didn't exist in the pageserver. Trying to replay those
|
||||
// WAL records failed to find the previous image of the page.
|
||||
// This special case allows us to recover from that situation.
|
||||
// See https://github.com/neondatabase/neon/issues/2601.
|
||||
//
|
||||
// Unfortunately we cannot do this for the main fork, or for
|
||||
// any metadata keys, keys, as that would lead to actual data
|
||||
// loss.
|
||||
if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key) {
|
||||
warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
|
||||
ZERO_PAGE.clone()
|
||||
} else {
|
||||
return Err(CreateImageLayersError::PageReconstructError(err));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Write all the keys we just read into our new image layer.
|
||||
image_layer_writer.put_image(img_key, img, ctx).await?;
|
||||
wrote_keys = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if wrote_keys {
|
||||
// Normal path: we have written some data into the new image layer for this
|
||||
// partition, so flush it to disk.
|
||||
let image_layer = image_layer_writer.finish(self, ctx).await?;
|
||||
Ok(ImageLayerCreationOutcome {
|
||||
image: Some(image_layer),
|
||||
next_start_key: img_range.end,
|
||||
})
|
||||
} else {
|
||||
// Special case: the image layer may be empty if this is a sharded tenant and the
|
||||
// partition does not cover any keys owned by this shard. In this case, to ensure
|
||||
// we don't leave gaps between image layers, leave `start` where it is, so that the next
|
||||
// layer we write will cover the key range that we just scanned.
|
||||
tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
|
||||
Ok(ImageLayerCreationOutcome {
|
||||
image: None,
|
||||
next_start_key: start,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Create an image layer for metadata keys. This function produces one image layer for all metadata
|
||||
/// keys for now. Because metadata keys cannot exceed basebackup size limit, the image layer for it
|
||||
/// would not be too large to fit in a single image layer.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn create_image_layer_for_metadata_keys(
|
||||
self: &Arc<Self>,
|
||||
partition: &KeySpace,
|
||||
mut image_layer_writer: ImageLayerWriter,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
img_range: Range<Key>,
|
||||
mode: ImageLayerCreationMode,
|
||||
) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
|
||||
assert!(!matches!(mode, ImageLayerCreationMode::Initial));
|
||||
|
||||
// Metadata keys image layer creation.
|
||||
let mut reconstruct_state = ValuesReconstructState::default();
|
||||
let data = self
|
||||
.get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx)
|
||||
.await?;
|
||||
let (data, total_kb_retrieved, total_key_retrieved) = {
|
||||
let mut new_data = BTreeMap::new();
|
||||
let mut total_kb_retrieved = 0;
|
||||
let mut total_key_retrieved = 0;
|
||||
for (k, v) in data {
|
||||
let v = v.map_err(CreateImageLayersError::PageReconstructError)?;
|
||||
total_kb_retrieved += KEY_SIZE + v.len();
|
||||
total_key_retrieved += 1;
|
||||
new_data.insert(k, v);
|
||||
}
|
||||
(new_data, total_kb_retrieved / 1024, total_key_retrieved)
|
||||
};
|
||||
let delta_file_accessed = reconstruct_state.get_delta_layers_visited();
|
||||
|
||||
let trigger_generation = delta_file_accessed as usize >= MAX_AUX_FILE_V2_DELTAS;
|
||||
debug!(
|
||||
"generate image layers for metadata keys: trigger_generation={trigger_generation}, \
|
||||
delta_file_accessed={delta_file_accessed}, total_kb_retrieved={total_kb_retrieved}, \
|
||||
total_key_retrieved={total_key_retrieved}"
|
||||
);
|
||||
if !trigger_generation && mode == ImageLayerCreationMode::Try {
|
||||
return Ok(ImageLayerCreationOutcome {
|
||||
image: None,
|
||||
next_start_key: img_range.end,
|
||||
});
|
||||
}
|
||||
let has_keys = !data.is_empty();
|
||||
for (k, v) in data {
|
||||
// Even if the value is empty (deleted), we do not delete it for now until we can ensure vectored get
|
||||
// considers this situation properly.
|
||||
// if v.is_empty() {
|
||||
// continue;
|
||||
// }
|
||||
|
||||
// No need to handle sharding b/c metadata keys are always on the 0-th shard.
|
||||
|
||||
// TODO: split image layers to avoid too large layer files. Too large image files are not handled
|
||||
// on the normal data path either.
|
||||
image_layer_writer.put_image(k, v, ctx).await?;
|
||||
}
|
||||
Ok(ImageLayerCreationOutcome {
|
||||
image: if has_keys {
|
||||
let image_layer = image_layer_writer.finish(self, ctx).await?;
|
||||
Some(image_layer)
|
||||
} else {
|
||||
tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
|
||||
None
|
||||
},
|
||||
next_start_key: img_range.end,
|
||||
})
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all, fields(%lsn, %mode))]
|
||||
async fn create_image_layers(
|
||||
self: &Arc<Timeline>,
|
||||
@@ -4413,17 +4175,19 @@ impl Timeline {
|
||||
|
||||
for partition in partitioning.parts.iter() {
|
||||
let img_range = start..partition.ranges.last().unwrap().end;
|
||||
let compact_metadata = partition.overlaps(&Key::metadata_key_range());
|
||||
if compact_metadata {
|
||||
for range in &partition.ranges {
|
||||
assert!(
|
||||
range.start.field1 >= METADATA_KEY_BEGIN_PREFIX
|
||||
&& range.end.field1 <= METADATA_KEY_END_PREFIX,
|
||||
"metadata keys must be partitioned separately"
|
||||
);
|
||||
}
|
||||
if mode == ImageLayerCreationMode::Initial {
|
||||
return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
|
||||
|
||||
if partition.overlaps(&Key::metadata_key_range()) {
|
||||
// TODO(chi): The next patch will correctly create image layers for metadata keys, and it would be a
|
||||
// rather big change. Keep this patch small for now.
|
||||
match mode {
|
||||
ImageLayerCreationMode::Force | ImageLayerCreationMode::Try => {
|
||||
// skip image layer creation anyways for metadata keys.
|
||||
start = img_range.end;
|
||||
continue;
|
||||
}
|
||||
ImageLayerCreationMode::Initial => {
|
||||
return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
|
||||
}
|
||||
}
|
||||
} else if let ImageLayerCreationMode::Try = mode {
|
||||
// check_for_image_layers = false -> skip
|
||||
@@ -4434,7 +4198,7 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
let image_layer_writer = ImageLayerWriter::new(
|
||||
let mut image_layer_writer = ImageLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
@@ -4450,39 +4214,87 @@ impl Timeline {
|
||||
)))
|
||||
});
|
||||
|
||||
if !compact_metadata {
|
||||
let ImageLayerCreationOutcome {
|
||||
image,
|
||||
next_start_key,
|
||||
} = self
|
||||
.create_image_layer_for_rel_blocks(
|
||||
partition,
|
||||
image_layer_writer,
|
||||
lsn,
|
||||
ctx,
|
||||
img_range,
|
||||
start,
|
||||
)
|
||||
.await?;
|
||||
let mut wrote_keys = false;
|
||||
|
||||
start = next_start_key;
|
||||
image_layers.extend(image);
|
||||
let mut key_request_accum = KeySpaceAccum::new();
|
||||
for range in &partition.ranges {
|
||||
let mut key = range.start;
|
||||
while key < range.end {
|
||||
// Decide whether to retain this key: usually we do, but sharded tenants may
|
||||
// need to drop keys that don't belong to them. If we retain the key, add it
|
||||
// to `key_request_accum` for later issuing a vectored get
|
||||
if self.shard_identity.is_key_disposable(&key) {
|
||||
debug!(
|
||||
"Dropping key {} during compaction (it belongs on shard {:?})",
|
||||
key,
|
||||
self.shard_identity.get_shard_number(&key)
|
||||
);
|
||||
} else {
|
||||
key_request_accum.add_key(key);
|
||||
}
|
||||
|
||||
let last_key_in_range = key.next() == range.end;
|
||||
key = key.next();
|
||||
|
||||
// Maybe flush `key_rest_accum`
|
||||
if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS
|
||||
|| (last_key_in_range && key_request_accum.raw_size() > 0)
|
||||
{
|
||||
let results = self
|
||||
.get_vectored(key_request_accum.consume_keyspace(), lsn, ctx)
|
||||
.await?;
|
||||
|
||||
for (img_key, img) in results {
|
||||
let img = match img {
|
||||
Ok(img) => img,
|
||||
Err(err) => {
|
||||
// If we fail to reconstruct a VM or FSM page, we can zero the
|
||||
// page without losing any actual user data. That seems better
|
||||
// than failing repeatedly and getting stuck.
|
||||
//
|
||||
// We had a bug at one point, where we truncated the FSM and VM
|
||||
// in the pageserver, but the Postgres didn't know about that
|
||||
// and continued to generate incremental WAL records for pages
|
||||
// that didn't exist in the pageserver. Trying to replay those
|
||||
// WAL records failed to find the previous image of the page.
|
||||
// This special case allows us to recover from that situation.
|
||||
// See https://github.com/neondatabase/neon/issues/2601.
|
||||
//
|
||||
// Unfortunately we cannot do this for the main fork, or for
|
||||
// any metadata keys, keys, as that would lead to actual data
|
||||
// loss.
|
||||
if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key)
|
||||
{
|
||||
warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
|
||||
ZERO_PAGE.clone()
|
||||
} else {
|
||||
return Err(CreateImageLayersError::PageReconstructError(
|
||||
err,
|
||||
));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Write all the keys we just read into our new image layer.
|
||||
image_layer_writer.put_image(img_key, img, ctx).await?;
|
||||
wrote_keys = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if wrote_keys {
|
||||
// Normal path: we have written some data into the new image layer for this
|
||||
// partition, so flush it to disk.
|
||||
start = img_range.end;
|
||||
let image_layer = image_layer_writer.finish(self, ctx).await?;
|
||||
image_layers.push(image_layer);
|
||||
} else {
|
||||
let ImageLayerCreationOutcome {
|
||||
image,
|
||||
next_start_key,
|
||||
} = self
|
||||
.create_image_layer_for_metadata_keys(
|
||||
partition,
|
||||
image_layer_writer,
|
||||
lsn,
|
||||
ctx,
|
||||
img_range,
|
||||
mode,
|
||||
)
|
||||
.await?;
|
||||
start = next_start_key;
|
||||
image_layers.extend(image);
|
||||
// Special case: the image layer may be empty if this is a sharded tenant and the
|
||||
// partition does not cover any keys owned by this shard. In this case, to ensure
|
||||
// we don't leave gaps between image layers, leave `start` where it is, so that the next
|
||||
// layer we write will cover the key range that we just scanned.
|
||||
tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4596,14 +4408,6 @@ impl Timeline {
|
||||
) -> Result<Vec<TimelineId>, anyhow::Error> {
|
||||
detach_ancestor::complete(self, tenant, prepared, ctx).await
|
||||
}
|
||||
|
||||
/// Switch aux file policy and schedule upload to the index part.
|
||||
pub(crate) fn do_switch_aux_policy(&self, policy: AuxFilePolicy) -> anyhow::Result<()> {
|
||||
self.last_aux_file_policy.store(Some(policy));
|
||||
self.remote_client
|
||||
.schedule_index_upload_for_aux_file_policy_update(Some(policy))?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Top-level failure to compact.
|
||||
@@ -4860,32 +4664,7 @@ impl Timeline {
|
||||
(horizon_cutoff, pitr_cutoff, retain_lsns)
|
||||
};
|
||||
|
||||
let mut new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);
|
||||
let standby_horizon = self.standby_horizon.load();
|
||||
// Hold GC for the standby, but as a safety guard do it only within some
|
||||
// reasonable lag.
|
||||
if standby_horizon != Lsn::INVALID {
|
||||
if let Some(standby_lag) = new_gc_cutoff.checked_sub(standby_horizon) {
|
||||
const MAX_ALLOWED_STANDBY_LAG: u64 = 10u64 << 30; // 10 GB
|
||||
if standby_lag.0 < MAX_ALLOWED_STANDBY_LAG {
|
||||
new_gc_cutoff = Lsn::min(standby_horizon, new_gc_cutoff);
|
||||
trace!("holding off GC for standby apply LSN {}", standby_horizon);
|
||||
} else {
|
||||
warn!(
|
||||
"standby is lagging for more than {}MB, not holding gc for it",
|
||||
MAX_ALLOWED_STANDBY_LAG / 1024 / 1024
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Reset standby horizon to ignore it if it is not updated till next GC.
|
||||
// It is an easy way to unset it when standby disappears without adding
|
||||
// more conf options.
|
||||
self.standby_horizon.store(Lsn::INVALID);
|
||||
self.metrics
|
||||
.standby_horizon_gauge
|
||||
.set(Lsn::INVALID.0 as i64);
|
||||
let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);
|
||||
|
||||
let res = self
|
||||
.gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff)
|
||||
|
||||
@@ -9,10 +9,7 @@ use std::ops::{Deref, Range};
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::layer_manager::LayerManager;
|
||||
use super::{
|
||||
CompactFlags, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
|
||||
RecordedDuration, Timeline,
|
||||
};
|
||||
use super::{CompactFlags, DurationRecorder, ImageLayerCreationMode, RecordedDuration, Timeline};
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use enumset::EnumSet;
|
||||
@@ -25,13 +22,14 @@ use tracing::{debug, info, info_span, trace, warn, Instrument};
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
|
||||
use crate::page_cache;
|
||||
use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
|
||||
use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome};
|
||||
use crate::tenant::timeline::{drop_rlock, is_rel_fsm_block_key, is_rel_vm_block_key, Hole};
|
||||
use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter};
|
||||
use crate::tenant::timeline::{Layer, ResidentLayer};
|
||||
use crate::tenant::DeltaLayer;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
|
||||
use crate::{page_cache, ZERO_PAGE};
|
||||
|
||||
use crate::keyspace::KeySpace;
|
||||
use crate::repository::Key;
|
||||
@@ -118,13 +116,9 @@ impl Timeline {
|
||||
|
||||
// 3. Create new image layers for partitions that have been modified
|
||||
// "enough".
|
||||
let mut partitioning = dense_partitioning;
|
||||
partitioning
|
||||
.parts
|
||||
.extend(sparse_partitioning.into_dense().parts);
|
||||
let image_layers = self
|
||||
let dense_layers = self
|
||||
.create_image_layers(
|
||||
&partitioning,
|
||||
&dense_partitioning,
|
||||
lsn,
|
||||
if flags.contains(CompactFlags::ForceImageLayerCreation) {
|
||||
ImageLayerCreationMode::Force
|
||||
@@ -136,8 +130,24 @@ impl Timeline {
|
||||
.await
|
||||
.map_err(anyhow::Error::from)?;
|
||||
|
||||
self.upload_new_image_layers(image_layers)?;
|
||||
partitioning.parts.len()
|
||||
// For now, nothing will be produced...
|
||||
let sparse_layers = self
|
||||
.create_image_layers(
|
||||
&sparse_partitioning.clone().into_dense(),
|
||||
lsn,
|
||||
if flags.contains(CompactFlags::ForceImageLayerCreation) {
|
||||
ImageLayerCreationMode::Force
|
||||
} else {
|
||||
ImageLayerCreationMode::Try
|
||||
},
|
||||
&image_ctx,
|
||||
)
|
||||
.await
|
||||
.map_err(anyhow::Error::from)?;
|
||||
assert!(sparse_layers.is_empty());
|
||||
|
||||
self.upload_new_image_layers(dense_layers)?;
|
||||
dense_partitioning.parts.len()
|
||||
}
|
||||
Err(err) => {
|
||||
// no partitioning? This is normal, if the timeline was just created
|
||||
@@ -489,11 +499,8 @@ impl Timeline {
|
||||
|
||||
for &DeltaEntry { key: next_key, .. } in all_keys.iter() {
|
||||
if let Some(prev_key) = prev {
|
||||
// just first fast filter, do not create hole entries for metadata keys. The last hole in the
|
||||
// compaction is the gap between data key and metadata keys.
|
||||
if next_key.to_i128() - prev_key.to_i128() >= min_hole_range
|
||||
&& !Key::is_metadata_key(&prev_key)
|
||||
{
|
||||
// just first fast filter
|
||||
if next_key.to_i128() - prev_key.to_i128() >= min_hole_range {
|
||||
let key_range = prev_key..next_key;
|
||||
// Measuring hole by just subtraction of i128 representation of key range boundaries
|
||||
// has not so much sense, because largest holes will corresponds field1/field2 changes.
|
||||
@@ -1152,10 +1159,10 @@ impl TimelineAdaptor {
|
||||
lsn: Lsn,
|
||||
key_range: &Range<Key>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), CreateImageLayersError> {
|
||||
) -> Result<(), PageReconstructError> {
|
||||
let timer = self.timeline.metrics.create_images_time_histo.start_timer();
|
||||
|
||||
let image_layer_writer = ImageLayerWriter::new(
|
||||
let mut image_layer_writer = ImageLayerWriter::new(
|
||||
self.timeline.conf,
|
||||
self.timeline.timeline_id,
|
||||
self.timeline.tenant_shard_id,
|
||||
@@ -1166,34 +1173,47 @@ impl TimelineAdaptor {
|
||||
.await?;
|
||||
|
||||
fail_point!("image-layer-writer-fail-before-finish", |_| {
|
||||
Err(CreateImageLayersError::Other(anyhow::anyhow!(
|
||||
Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||
"failpoint image-layer-writer-fail-before-finish"
|
||||
)))
|
||||
});
|
||||
|
||||
let keyspace = KeySpace {
|
||||
ranges: self.get_keyspace(key_range, lsn, ctx).await?,
|
||||
};
|
||||
// TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly
|
||||
let start = Key::MIN;
|
||||
let ImageLayerCreationOutcome {
|
||||
image,
|
||||
next_start_key: _,
|
||||
} = self
|
||||
.timeline
|
||||
.create_image_layer_for_rel_blocks(
|
||||
&keyspace,
|
||||
image_layer_writer,
|
||||
lsn,
|
||||
ctx,
|
||||
key_range.clone(),
|
||||
start,
|
||||
)
|
||||
.await?;
|
||||
|
||||
if let Some(image_layer) = image {
|
||||
self.new_images.push(image_layer);
|
||||
let keyspace_ranges = self.get_keyspace(key_range, lsn, ctx).await?;
|
||||
for range in &keyspace_ranges {
|
||||
let mut key = range.start;
|
||||
while key < range.end {
|
||||
let img = match self.timeline.get(key, lsn, ctx).await {
|
||||
Ok(img) => img,
|
||||
Err(err) => {
|
||||
// If we fail to reconstruct a VM or FSM page, we can zero the
|
||||
// page without losing any actual user data. That seems better
|
||||
// than failing repeatedly and getting stuck.
|
||||
//
|
||||
// We had a bug at one point, where we truncated the FSM and VM
|
||||
// in the pageserver, but the Postgres didn't know about that
|
||||
// and continued to generate incremental WAL records for pages
|
||||
// that didn't exist in the pageserver. Trying to replay those
|
||||
// WAL records failed to find the previous image of the page.
|
||||
// This special case allows us to recover from that situation.
|
||||
// See https://github.com/neondatabase/neon/issues/2601.
|
||||
//
|
||||
// Unfortunately we cannot do this for the main fork, or for
|
||||
// any metadata keys, keys, as that would lead to actual data
|
||||
// loss.
|
||||
if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) {
|
||||
warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}");
|
||||
ZERO_PAGE.clone()
|
||||
} else {
|
||||
return Err(err);
|
||||
}
|
||||
}
|
||||
};
|
||||
image_layer_writer.put_image(key, img, ctx).await?;
|
||||
key = key.next();
|
||||
}
|
||||
}
|
||||
let image_layer = image_layer_writer.finish(&self.timeline, ctx).await?;
|
||||
|
||||
self.new_images.push(image_layer);
|
||||
|
||||
timer.stop_and_record();
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ use crate::{
|
||||
};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::Instrument;
|
||||
use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn};
|
||||
use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn};
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum Error {
|
||||
@@ -41,27 +41,6 @@ pub(crate) enum Error {
|
||||
Unexpected(#[source] anyhow::Error),
|
||||
}
|
||||
|
||||
impl From<Error> for ApiError {
|
||||
fn from(value: Error) -> Self {
|
||||
match value {
|
||||
e @ Error::NoAncestor => ApiError::Conflict(e.to_string()),
|
||||
// TODO: ApiError converts the anyhow using debug formatting ... just stop using ApiError?
|
||||
e @ Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", e)),
|
||||
Error::ShuttingDown => ApiError::ShuttingDown,
|
||||
Error::OtherTimelineDetachOngoing(_) => {
|
||||
ApiError::ResourceUnavailable("other timeline detach is already ongoing".into())
|
||||
}
|
||||
// All of these contain shutdown errors, in fact, it's the most common
|
||||
e @ Error::FlushAncestor(_)
|
||||
| e @ Error::RewrittenDeltaDownloadFailed(_)
|
||||
| e @ Error::CopyDeltaPrefix(_)
|
||||
| e @ Error::UploadRewritten(_)
|
||||
| e @ Error::CopyFailed(_)
|
||||
| e @ Error::Unexpected(_) => ApiError::InternalServerError(e.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct PreparedTimelineDetach {
|
||||
layers: Vec<Layer>,
|
||||
}
|
||||
@@ -96,11 +75,6 @@ pub(super) async fn prepare(
|
||||
.as_ref()
|
||||
.map(|tl| (tl.clone(), detached.ancestor_lsn))
|
||||
else {
|
||||
// TODO: check if we have already been detached; for this we need to read the stored data
|
||||
// on remote client, for that we need a follow-up which makes uploads cheaper and maintains
|
||||
// a projection of the commited data.
|
||||
//
|
||||
// the error is wrong per openapi
|
||||
return Err(NoAncestor);
|
||||
};
|
||||
|
||||
@@ -110,7 +84,7 @@ pub(super) async fn prepare(
|
||||
|
||||
if ancestor.ancestor_timeline.is_some() {
|
||||
// non-technical requirement; we could flatten N ancestors just as easily but we chose
|
||||
// not to, at least initially
|
||||
// not to
|
||||
return Err(TooManyAncestors);
|
||||
}
|
||||
|
||||
|
||||
@@ -166,7 +166,7 @@ impl LayerManager {
|
||||
/// Flush a frozen layer and add the written delta layer to the layer map.
|
||||
pub(crate) fn finish_flush_l0_layer(
|
||||
&mut self,
|
||||
delta_layers: &[ResidentLayer],
|
||||
delta_layer: Option<&ResidentLayer>,
|
||||
frozen_layer_for_check: &Arc<InMemoryLayer>,
|
||||
metrics: &TimelineMetrics,
|
||||
) {
|
||||
@@ -181,12 +181,10 @@ impl LayerManager {
|
||||
// layer to disk at the same time, that would not work.
|
||||
assert_eq!(Arc::as_ptr(&inmem), Arc::as_ptr(frozen_layer_for_check));
|
||||
|
||||
if !delta_layers.is_empty() {
|
||||
if let Some(l) = delta_layer {
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
for l in delta_layers {
|
||||
Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
|
||||
metrics.record_new_file_metrics(l.layer_desc().file_size);
|
||||
}
|
||||
Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
|
||||
metrics.record_new_file_metrics(l.layer_desc().file_size);
|
||||
updates.flush();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -705,7 +705,6 @@ impl ConnectionManagerState {
|
||||
commit_lsn: info.commit_lsn,
|
||||
safekeeper_connstr: info.safekeeper_connstr,
|
||||
availability_zone: info.availability_zone,
|
||||
standby_horizon: info.standby_horizon,
|
||||
}
|
||||
}
|
||||
MessageType::SafekeeperDiscoveryResponse => {
|
||||
@@ -726,21 +725,6 @@ impl ConnectionManagerState {
|
||||
|
||||
WALRECEIVER_BROKER_UPDATES.inc();
|
||||
|
||||
trace!(
|
||||
"safekeeper info update: standby_horizon(cutoff)={}",
|
||||
timeline_update.standby_horizon
|
||||
);
|
||||
if timeline_update.standby_horizon != 0 {
|
||||
// ignore reports from safekeepers not connected to replicas
|
||||
self.timeline
|
||||
.standby_horizon
|
||||
.store(Lsn(timeline_update.standby_horizon));
|
||||
self.timeline
|
||||
.metrics
|
||||
.standby_horizon_gauge
|
||||
.set(timeline_update.standby_horizon as i64);
|
||||
}
|
||||
|
||||
let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
|
||||
let old_entry = self.wal_stream_candidates.insert(
|
||||
new_safekeeper_id,
|
||||
@@ -1110,7 +1094,6 @@ mod tests {
|
||||
commit_lsn,
|
||||
safekeeper_connstr: safekeeper_connstr.to_owned(),
|
||||
availability_zone: None,
|
||||
standby_horizon: 0,
|
||||
},
|
||||
latest_update,
|
||||
}
|
||||
|
||||
@@ -1,78 +0,0 @@
|
||||
From 0b0194a57bd0f3598bd57dbedd0df3932330169d Mon Sep 17 00:00:00 2001
|
||||
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
|
||||
Date: Fri, 2 Feb 2024 22:26:45 +0200
|
||||
Subject: [PATCH 1/1] Make v0.6.0 work with Neon
|
||||
|
||||
Now that the WAL-logging happens as a separate step at the end of the
|
||||
build, we need a few neon-specific hints to make it work.
|
||||
---
|
||||
src/hnswbuild.c | 36 ++++++++++++++++++++++++++++++++++++
|
||||
1 file changed, 36 insertions(+)
|
||||
|
||||
diff --git a/src/hnswbuild.c b/src/hnswbuild.c
|
||||
index 680789b..ec54dea 100644
|
||||
--- a/src/hnswbuild.c
|
||||
+++ b/src/hnswbuild.c
|
||||
@@ -840,9 +840,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
|
||||
|
||||
hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false);
|
||||
|
||||
+#ifdef NEON_SMGR
|
||||
+ smgr_start_unlogged_build(RelationGetSmgr(indexRel));
|
||||
+#endif
|
||||
+
|
||||
/* Perform inserts */
|
||||
HnswParallelScanAndInsert(heapRel, indexRel, hnswshared, hnswarea, false);
|
||||
|
||||
+#ifdef NEON_SMGR
|
||||
+ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(indexRel));
|
||||
+#endif
|
||||
+
|
||||
/* Close relations within worker */
|
||||
index_close(indexRel, indexLockmode);
|
||||
table_close(heapRel, heapLockmode);
|
||||
@@ -1089,13 +1097,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
|
||||
SeedRandom(42);
|
||||
#endif
|
||||
|
||||
+#ifdef NEON_SMGR
|
||||
+ smgr_start_unlogged_build(RelationGetSmgr(index));
|
||||
+#endif
|
||||
+
|
||||
InitBuildState(buildstate, heap, index, indexInfo, forkNum);
|
||||
|
||||
BuildGraph(buildstate, forkNum);
|
||||
|
||||
+#ifdef NEON_SMGR
|
||||
+ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
|
||||
+#endif
|
||||
+
|
||||
if (RelationNeedsWAL(index))
|
||||
+ {
|
||||
log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true);
|
||||
|
||||
+#ifdef NEON_SMGR
|
||||
+ {
|
||||
+#if PG_VERSION_NUM >= 160000
|
||||
+ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
|
||||
+#else
|
||||
+ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
|
||||
+#endif
|
||||
+
|
||||
+ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator,
|
||||
+ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
|
||||
+ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM);
|
||||
+ }
|
||||
+#endif
|
||||
+ }
|
||||
+
|
||||
+#ifdef NEON_SMGR
|
||||
+ smgr_end_unlogged_build(RelationGetSmgr(index));
|
||||
+#endif
|
||||
+
|
||||
FreeBuildState(buildstate);
|
||||
}
|
||||
|
||||
--
|
||||
2.39.2
|
||||
|
||||
@@ -49,7 +49,7 @@ char *neon_auth_token;
|
||||
int readahead_buffer_size = 128;
|
||||
int flush_every_n_requests = 8;
|
||||
|
||||
int neon_protocol_version = 2;
|
||||
int neon_protocol_version = 1;
|
||||
|
||||
static int n_reconnect_attempts = 0;
|
||||
static int max_reconnect_attempts = 60;
|
||||
@@ -860,7 +860,7 @@ pg_init_libpagestore(void)
|
||||
"Version of compute<->page server protocol",
|
||||
NULL,
|
||||
&neon_protocol_version,
|
||||
2, /* use protocol version 2 */
|
||||
1, /* default to old protocol for now */
|
||||
1, /* min */
|
||||
2, /* max */
|
||||
PGC_SU_BACKEND,
|
||||
|
||||
@@ -45,7 +45,6 @@
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "access/parallel.h"
|
||||
#include "access/xact.h"
|
||||
#include "access/xlog.h"
|
||||
#include "access/xlogdefs.h"
|
||||
@@ -1349,10 +1348,6 @@ PageIsEmptyHeapPage(char *buffer)
|
||||
return memcmp(buffer, empty_page.data, BLCKSZ) == 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* A page is being evicted from the shared buffer cache. Update the
|
||||
* last-written LSN of the page, and WAL-log it if needed.
|
||||
*/
|
||||
static void
|
||||
#if PG_MAJORVERSION_NUM < 16
|
||||
neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
|
||||
@@ -1361,7 +1356,12 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
|
||||
#endif
|
||||
{
|
||||
XLogRecPtr lsn = PageGetLSN((Page) buffer);
|
||||
bool log_page;
|
||||
|
||||
if (ShutdownRequestPending)
|
||||
return;
|
||||
/* Don't log any pages if we're not allowed to do so. */
|
||||
if (!XLogInsertAllowed())
|
||||
return;
|
||||
|
||||
/*
|
||||
* Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM
|
||||
@@ -1370,21 +1370,9 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
|
||||
* correctness, the non-logged updates are not critical. But we want to
|
||||
* have a reasonably up-to-date VM and FSM in the page server.
|
||||
*/
|
||||
log_page = false;
|
||||
if (force)
|
||||
{
|
||||
Assert(XLogInsertAllowed());
|
||||
log_page = true;
|
||||
}
|
||||
else if (XLogInsertAllowed() &&
|
||||
!ShutdownRequestPending &&
|
||||
(forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM))
|
||||
{
|
||||
log_page = true;
|
||||
}
|
||||
|
||||
if (log_page)
|
||||
if ((force || forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM) && !RecoveryInProgress())
|
||||
{
|
||||
/* FSM is never WAL-logged and we don't care. */
|
||||
XLogRecPtr recptr;
|
||||
|
||||
recptr = log_newpage_copy(&InfoFromSMgrRel(reln), forknum, blocknum,
|
||||
@@ -1397,8 +1385,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum, LSN_FORMAT_ARGS(lsn))));
|
||||
}
|
||||
|
||||
if (lsn == InvalidXLogRecPtr)
|
||||
else if (lsn == InvalidXLogRecPtr)
|
||||
{
|
||||
/*
|
||||
* When PostgreSQL extends a relation, it calls smgrextend() with an
|
||||
@@ -1434,31 +1421,19 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum)));
|
||||
}
|
||||
else if (forknum != FSM_FORKNUM && forknum != VISIBILITYMAP_FORKNUM)
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Its a bad sign if there is a page with zero LSN in the buffer
|
||||
* cache in a standby, too. However, PANICing seems like a cure
|
||||
* worse than the disease, as the damage has likely already been
|
||||
* done in the primary. So in a standby, make this an assertion,
|
||||
* and in a release build just LOG the error and soldier on. We
|
||||
* update the last-written LSN of the page with a conservative
|
||||
* value in that case, which is the last replayed LSN.
|
||||
*/
|
||||
ereport(RecoveryInProgress() ? LOG : PANIC,
|
||||
ereport(PANIC,
|
||||
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
|
||||
blocknum,
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum)));
|
||||
Assert(false);
|
||||
|
||||
lsn = GetXLogReplayRecPtr(NULL); /* in standby mode, soldier on */
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
ereport(SmgrTrace,
|
||||
(errmsg(NEON_TAG "Evicting page %u of relation %u/%u/%u.%u with lsn=%X/%X",
|
||||
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
|
||||
blocknum,
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum, LSN_FORMAT_ARGS(lsn))));
|
||||
@@ -1551,92 +1526,8 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
|
||||
|
||||
if (RecoveryInProgress())
|
||||
{
|
||||
/*---
|
||||
* In broad strokes, a replica always requests the page at the current
|
||||
* replay LSN. But looking closer, what exactly is the replay LSN? Is
|
||||
* it the last replayed record, or the record being replayed? And does
|
||||
* the startup process performing the replay need to do something
|
||||
* differently than backends running queries? Let's take a closer look
|
||||
* at the different scenarios:
|
||||
*
|
||||
* 1. Startup process reads a page, last_written_lsn is old.
|
||||
*
|
||||
* Read the old version of the page. We will apply the WAL record on
|
||||
* it to bring it up-to-date.
|
||||
*
|
||||
* We could read the new version, with the changes from this WAL
|
||||
* record already applied, to offload the work of replaying the record
|
||||
* to the pageserver. The pageserver might not have received the WAL
|
||||
* record yet, though, so a read of the old page version and applying
|
||||
* the record ourselves is likely faster. Also, the redo function
|
||||
* might be surprised if the changes have already applied. That's
|
||||
* normal during crash recovery, but not in hot standby.
|
||||
*
|
||||
* 2. Startup process reads a page, last_written_lsn == record we're
|
||||
* replaying.
|
||||
*
|
||||
* Can this happen? There are a few theoretical cases when it might:
|
||||
*
|
||||
* A) The redo function reads the same page twice. We had already read
|
||||
* and applied the changes once, and now we're reading it for the
|
||||
* second time. That would be a rather silly thing for a redo
|
||||
* function to do, and I'm not aware of any that would do it.
|
||||
*
|
||||
* B) The redo function modifies multiple pages, and it already
|
||||
* applied the changes to one of the pages, released the lock on
|
||||
* it, and is now reading a second page. Furthermore, the first
|
||||
* page was already evicted from the buffer cache, and also from
|
||||
* the last-written LSN cache, so that the per-relation or global
|
||||
* last-written LSN was already updated. All the WAL redo functions
|
||||
* hold the locks on pages that they modify, until all the changes
|
||||
* have been modified (?), which would make that impossible.
|
||||
* However, we skip the locking, if the page isn't currently in the
|
||||
* page cache (see neon_redo_read_buffer_filter below).
|
||||
*
|
||||
* Even if the one of the above cases were possible in theory, they
|
||||
* would also require the pages being modified by the redo function to
|
||||
* be immediately evicted from the page cache.
|
||||
*
|
||||
* So this probably does not happen in practice. But if it does, we
|
||||
* request the new version, including the changes from the record
|
||||
* being replayed. That seems like the correct behavior in any case.
|
||||
*
|
||||
* 3. Backend process reads a page with old last-written LSN
|
||||
*
|
||||
* Nothing special here. Read the old version.
|
||||
*
|
||||
* 4. Backend process reads a page with last_written_lsn == record being replayed
|
||||
*
|
||||
* This can happen, if the redo function has started to run, and saw
|
||||
* that the page isn't present in the page cache (see
|
||||
* neon_redo_read_buffer_filter below). Normally, in a normal
|
||||
* Postgres server, the redo function would hold a lock on the page,
|
||||
* so we would get blocked waiting the redo function to release the
|
||||
* lock. To emulate that, wait for the WAL replay of the record to
|
||||
* finish.
|
||||
*/
|
||||
/* Request the page at the end of the last fully replayed LSN. */
|
||||
XLogRecPtr replay_lsn = GetXLogReplayRecPtr(NULL);
|
||||
|
||||
if (last_written_lsn > replay_lsn)
|
||||
{
|
||||
/* GetCurrentReplayRecPtr was introduced in v15 */
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL));
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Cases 2 and 4. If this is a backend (case 4), the
|
||||
* neon_read_at_lsn() call later will wait for the WAL record to be
|
||||
* fully replayed.
|
||||
*/
|
||||
result.request_lsn = last_written_lsn;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* cases 1 and 3 */
|
||||
result.request_lsn = replay_lsn;
|
||||
}
|
||||
/* Request the page at the last replayed LSN. */
|
||||
result.request_lsn = GetXLogReplayRecPtr(NULL);
|
||||
result.not_modified_since = last_written_lsn;
|
||||
result.effective_request_lsn = result.request_lsn;
|
||||
Assert(last_written_lsn <= result.request_lsn);
|
||||
@@ -2931,14 +2822,10 @@ neon_start_unlogged_build(SMgrRelation reln)
|
||||
reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED;
|
||||
|
||||
/*
|
||||
* Create the local file. In a parallel build, the leader is expected to
|
||||
* call this first and do it.
|
||||
*
|
||||
* FIXME: should we pass isRedo true to create the tablespace dir if it
|
||||
* doesn't exist? Is it needed?
|
||||
*/
|
||||
if (!IsParallelWorker())
|
||||
mdcreate(reln, MAIN_FORKNUM, false);
|
||||
mdcreate(reln, MAIN_FORKNUM, false);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2962,17 +2849,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
|
||||
Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1);
|
||||
Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
|
||||
|
||||
/*
|
||||
* In a parallel build, (only) the leader process performs the 2nd
|
||||
* phase.
|
||||
*/
|
||||
if (IsParallelWorker())
|
||||
{
|
||||
unlogged_build_rel = NULL;
|
||||
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
|
||||
}
|
||||
else
|
||||
unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
|
||||
unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -3324,7 +3201,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
|
||||
BufferTag tag;
|
||||
uint32 hash;
|
||||
LWLock *partitionLock;
|
||||
int buf_id;
|
||||
Buffer buffer;
|
||||
bool no_redo_needed;
|
||||
|
||||
if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id))
|
||||
@@ -3362,20 +3239,20 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
|
||||
else
|
||||
{
|
||||
/* Try to find the relevant buffer */
|
||||
buf_id = BufTableLookup(&tag, hash);
|
||||
buffer = BufTableLookup(&tag, hash);
|
||||
|
||||
no_redo_needed = buf_id < 0;
|
||||
no_redo_needed = buffer < 0;
|
||||
}
|
||||
/* In both cases st lwlsn past this WAL record */
|
||||
SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);
|
||||
|
||||
/*
|
||||
* we don't have the buffer in memory, update lwLsn past this record, also
|
||||
* evict page from file cache
|
||||
*/
|
||||
if (no_redo_needed)
|
||||
{
|
||||
SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);
|
||||
lfc_evict(rinfo, forknum, blkno);
|
||||
}
|
||||
|
||||
|
||||
LWLockRelease(partitionLock);
|
||||
|
||||
|
||||
@@ -1852,30 +1852,34 @@ static void
|
||||
CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp)
|
||||
{
|
||||
hs->ts = 0;
|
||||
hs->xmin = InvalidFullTransactionId;
|
||||
hs->catalog_xmin = InvalidFullTransactionId;
|
||||
hs->xmin.value = ~0; /* largest unsigned value */
|
||||
hs->catalog_xmin.value = ~0; /* largest unsigned value */
|
||||
|
||||
for (int i = 0; i < wp->n_safekeepers; i++)
|
||||
{
|
||||
|
||||
if (wp->safekeeper[i].state == SS_ACTIVE)
|
||||
if (wp->safekeeper[i].appendResponse.hs.ts != 0)
|
||||
{
|
||||
HotStandbyFeedback *skhs = &wp->safekeeper[i].appendResponse.hs;
|
||||
|
||||
if (FullTransactionIdIsNormal(skhs->xmin)
|
||||
&& (!FullTransactionIdIsValid(hs->xmin) || FullTransactionIdPrecedes(skhs->xmin, hs->xmin)))
|
||||
&& FullTransactionIdPrecedes(skhs->xmin, hs->xmin))
|
||||
{
|
||||
hs->xmin = skhs->xmin;
|
||||
hs->ts = skhs->ts;
|
||||
}
|
||||
if (FullTransactionIdIsNormal(skhs->catalog_xmin)
|
||||
&& (!FullTransactionIdIsValid(hs->catalog_xmin) || FullTransactionIdPrecedes(skhs->catalog_xmin, hs->catalog_xmin)))
|
||||
&& FullTransactionIdPrecedes(skhs->catalog_xmin, hs->xmin))
|
||||
{
|
||||
hs->catalog_xmin = skhs->catalog_xmin;
|
||||
hs->ts = skhs->ts;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (hs->xmin.value == ~0)
|
||||
hs->xmin = InvalidFullTransactionId;
|
||||
if (hs->catalog_xmin.value == ~0)
|
||||
hs->catalog_xmin = InvalidFullTransactionId;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1942,28 +1946,14 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk)
|
||||
}
|
||||
|
||||
CombineHotStanbyFeedbacks(&hsFeedback, wp);
|
||||
if (memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0)
|
||||
if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0)
|
||||
{
|
||||
FullTransactionId xmin = hsFeedback.xmin;
|
||||
FullTransactionId catalog_xmin = hsFeedback.catalog_xmin;
|
||||
FullTransactionId next_xid = ReadNextFullTransactionId();
|
||||
/*
|
||||
* Page server is updating nextXid in checkpoint each 1024 transactions,
|
||||
* so feedback xmin can be actually larger then nextXid and
|
||||
* function TransactionIdInRecentPast return false in this case,
|
||||
* preventing update of slot's xmin.
|
||||
*/
|
||||
if (FullTransactionIdPrecedes(next_xid, xmin))
|
||||
xmin = next_xid;
|
||||
if (FullTransactionIdPrecedes(next_xid, catalog_xmin))
|
||||
catalog_xmin = next_xid;
|
||||
agg_hs_feedback = hsFeedback;
|
||||
elog(DEBUG2, "ProcessStandbyHSFeedback(xmin=%d, catalog_xmin=%d", XidFromFullTransactionId(hsFeedback.xmin), XidFromFullTransactionId(hsFeedback.catalog_xmin));
|
||||
ProcessStandbyHSFeedback(hsFeedback.ts,
|
||||
XidFromFullTransactionId(xmin),
|
||||
EpochFromFullTransactionId(xmin),
|
||||
XidFromFullTransactionId(catalog_xmin),
|
||||
EpochFromFullTransactionId(catalog_xmin));
|
||||
XidFromFullTransactionId(hsFeedback.xmin),
|
||||
EpochFromFullTransactionId(hsFeedback.xmin),
|
||||
XidFromFullTransactionId(hsFeedback.catalog_xmin),
|
||||
EpochFromFullTransactionId(hsFeedback.catalog_xmin));
|
||||
}
|
||||
|
||||
CheckGracefulShutdown(wp);
|
||||
|
||||
21
poetry.lock
generated
21
poetry.lock
generated
@@ -2405,7 +2405,6 @@ files = [
|
||||
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
|
||||
@@ -2530,13 +2529,13 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "requests"
|
||||
version = "2.32.0"
|
||||
version = "2.31.0"
|
||||
description = "Python HTTP for Humans."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "requests-2.32.0-py3-none-any.whl", hash = "sha256:f2c3881dddb70d056c5bd7600a4fae312b2a300e39be6a118d30b90bd27262b5"},
|
||||
{file = "requests-2.32.0.tar.gz", hash = "sha256:fa5490319474c82ef1d2c9bc459d3652e3ae4ef4c4ebdd18a21145a47ca4b6b8"},
|
||||
{file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"},
|
||||
{file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -2960,16 +2959,6 @@ files = [
|
||||
{file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
|
||||
{file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
|
||||
{file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
|
||||
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
|
||||
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
|
||||
{file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
|
||||
@@ -3207,4 +3196,4 @@ cffi = ["cffi (>=1.11)"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "16ebd6a46768be7f67dbdb4ee5903b167d94edc9965f29252f038c67e9e907b0"
|
||||
content-hash = "dcde14c58a32bda5f123319a069352c458b3719f3c62977991eebb9803a46a9e"
|
||||
|
||||
@@ -9,7 +9,6 @@ default = []
|
||||
testing = []
|
||||
|
||||
[dependencies]
|
||||
ahash.workspace = true
|
||||
anyhow.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-trait.workspace = true
|
||||
@@ -25,7 +24,6 @@ camino.workspace = true
|
||||
chrono.workspace = true
|
||||
clap.workspace = true
|
||||
consumption_metrics.workspace = true
|
||||
crossbeam-deque.workspace = true
|
||||
dashmap.workspace = true
|
||||
env_logger.workspace = true
|
||||
framed-websockets.workspace = true
|
||||
@@ -54,6 +52,7 @@ opentelemetry.workspace = true
|
||||
parking_lot.workspace = true
|
||||
parquet.workspace = true
|
||||
parquet_derive.workspace = true
|
||||
pbkdf2 = { workspace = true, features = ["simple", "std"] }
|
||||
pin-project-lite.workspace = true
|
||||
postgres_backend.workspace = true
|
||||
pq_proto.workspace = true
|
||||
@@ -107,7 +106,6 @@ workspace_hack.workspace = true
|
||||
camino-tempfile.workspace = true
|
||||
fallible-iterator.workspace = true
|
||||
tokio-tungstenite.workspace = true
|
||||
pbkdf2 = { workspace = true, features = ["simple", "std"] }
|
||||
rcgen.workspace = true
|
||||
rstest.workspace = true
|
||||
tokio-postgres-rustls.workspace = true
|
||||
|
||||
@@ -365,10 +365,7 @@ async fn authenticate_with_secret(
|
||||
config: &'static AuthenticationConfig,
|
||||
) -> auth::Result<ComputeCredentials> {
|
||||
if let Some(password) = unauthenticated_password {
|
||||
let ep = EndpointIdInt::from(&info.endpoint);
|
||||
|
||||
let auth_outcome =
|
||||
validate_password_and_exchange(&config.thread_pool, ep, &password, secret).await?;
|
||||
let auth_outcome = validate_password_and_exchange(&password, secret).await?;
|
||||
let keys = match auth_outcome {
|
||||
crate::sasl::Outcome::Success(key) => key,
|
||||
crate::sasl::Outcome::Failure(reason) => {
|
||||
@@ -389,7 +386,7 @@ async fn authenticate_with_secret(
|
||||
// Currently, we use it for websocket connections (latency).
|
||||
if allow_cleartext {
|
||||
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
|
||||
return hacks::authenticate_cleartext(ctx, info, client, secret, config).await;
|
||||
return hacks::authenticate_cleartext(ctx, info, client, secret).await;
|
||||
}
|
||||
|
||||
// Finally, proceed with the main auth flow (SCRAM-based).
|
||||
@@ -557,7 +554,7 @@ mod tests {
|
||||
context::RequestMonitoring,
|
||||
proxy::NeonOptions,
|
||||
rate_limiter::{EndpointRateLimiter, RateBucketInfo},
|
||||
scram::{threadpool::ThreadPool, ServerSecret},
|
||||
scram::ServerSecret,
|
||||
stream::{PqStream, Stream},
|
||||
};
|
||||
|
||||
@@ -599,7 +596,6 @@ mod tests {
|
||||
}
|
||||
|
||||
static CONFIG: Lazy<AuthenticationConfig> = Lazy::new(|| AuthenticationConfig {
|
||||
thread_pool: ThreadPool::new(1),
|
||||
scram_protocol_timeout: std::time::Duration::from_secs(5),
|
||||
rate_limiter_enabled: true,
|
||||
rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET),
|
||||
|
||||
@@ -3,10 +3,8 @@ use super::{
|
||||
};
|
||||
use crate::{
|
||||
auth::{self, AuthFlow},
|
||||
config::AuthenticationConfig,
|
||||
console::AuthSecret,
|
||||
context::RequestMonitoring,
|
||||
intern::EndpointIdInt,
|
||||
sasl,
|
||||
stream::{self, Stream},
|
||||
};
|
||||
@@ -22,7 +20,6 @@ pub async fn authenticate_cleartext(
|
||||
info: ComputeUserInfo,
|
||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
secret: AuthSecret,
|
||||
config: &'static AuthenticationConfig,
|
||||
) -> auth::Result<ComputeCredentials> {
|
||||
warn!("cleartext auth flow override is enabled, proceeding");
|
||||
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
|
||||
@@ -30,14 +27,8 @@ pub async fn authenticate_cleartext(
|
||||
// pause the timer while we communicate with the client
|
||||
let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
|
||||
|
||||
let ep = EndpointIdInt::from(&info.endpoint);
|
||||
|
||||
let auth_flow = AuthFlow::new(client)
|
||||
.begin(auth::CleartextPassword {
|
||||
secret,
|
||||
endpoint: ep,
|
||||
pool: config.thread_pool.clone(),
|
||||
})
|
||||
.begin(auth::CleartextPassword(secret))
|
||||
.await?;
|
||||
drop(paused);
|
||||
// cleartext auth is only allowed to the ws/http protocol.
|
||||
|
||||
@@ -5,14 +5,12 @@ use crate::{
|
||||
config::TlsServerEndPoint,
|
||||
console::AuthSecret,
|
||||
context::RequestMonitoring,
|
||||
intern::EndpointIdInt,
|
||||
sasl,
|
||||
scram::{self, threadpool::ThreadPool},
|
||||
sasl, scram,
|
||||
stream::{PqStream, Stream},
|
||||
};
|
||||
use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS};
|
||||
use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be};
|
||||
use std::{io, sync::Arc};
|
||||
use std::io;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::info;
|
||||
|
||||
@@ -55,11 +53,7 @@ impl AuthMethod for PasswordHack {
|
||||
|
||||
/// Use clear-text password auth called `password` in docs
|
||||
/// <https://www.postgresql.org/docs/current/auth-password.html>
|
||||
pub struct CleartextPassword {
|
||||
pub pool: Arc<ThreadPool>,
|
||||
pub endpoint: EndpointIdInt,
|
||||
pub secret: AuthSecret,
|
||||
}
|
||||
pub struct CleartextPassword(pub AuthSecret);
|
||||
|
||||
impl AuthMethod for CleartextPassword {
|
||||
#[inline(always)]
|
||||
@@ -132,13 +126,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
|
||||
.strip_suffix(&[0])
|
||||
.ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?;
|
||||
|
||||
let outcome = validate_password_and_exchange(
|
||||
&self.state.pool,
|
||||
self.state.endpoint,
|
||||
password,
|
||||
self.state.secret,
|
||||
)
|
||||
.await?;
|
||||
let outcome = validate_password_and_exchange(password, self.state.0).await?;
|
||||
|
||||
if let sasl::Outcome::Success(_) = &outcome {
|
||||
self.stream.write_message_noflush(&Be::AuthenticationOk)?;
|
||||
@@ -193,8 +181,6 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
|
||||
}
|
||||
|
||||
pub(crate) async fn validate_password_and_exchange(
|
||||
pool: &ThreadPool,
|
||||
endpoint: EndpointIdInt,
|
||||
password: &[u8],
|
||||
secret: AuthSecret,
|
||||
) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
|
||||
@@ -208,7 +194,7 @@ pub(crate) async fn validate_password_and_exchange(
|
||||
}
|
||||
// perform scram authentication as both client and server to validate the keys
|
||||
AuthSecret::Scram(scram_secret) => {
|
||||
let outcome = crate::scram::exchange(pool, endpoint, &scram_secret, password).await?;
|
||||
let outcome = crate::scram::exchange(&scram_secret, password).await?;
|
||||
|
||||
let client_key = match outcome {
|
||||
sasl::Outcome::Success(client_key) => client_key,
|
||||
|
||||
@@ -27,7 +27,6 @@ use proxy::redis::cancellation_publisher::RedisPublisherClient;
|
||||
use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
|
||||
use proxy::redis::elasticache;
|
||||
use proxy::redis::notifications;
|
||||
use proxy::scram::threadpool::ThreadPool;
|
||||
use proxy::serverless::cancel_set::CancelSet;
|
||||
use proxy::serverless::GlobalConnPoolOptions;
|
||||
use proxy::usage_metrics;
|
||||
@@ -133,9 +132,6 @@ struct ProxyCliArgs {
|
||||
/// timeout for scram authentication protocol
|
||||
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
|
||||
scram_protocol_timeout: tokio::time::Duration,
|
||||
/// size of the threadpool for password hashing
|
||||
#[clap(long, default_value_t = 4)]
|
||||
scram_thread_pool_size: u8,
|
||||
/// Require that all incoming requests have a Proxy Protocol V2 packet **and** have an IP address associated.
|
||||
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||
require_client_ip: bool,
|
||||
@@ -493,9 +489,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
/// ProxyConfig is created at proxy startup, and lives forever.
|
||||
fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
let thread_pool = ThreadPool::new(args.scram_thread_pool_size);
|
||||
Metrics::install(thread_pool.metrics.clone());
|
||||
|
||||
let tls_config = match (&args.tls_key, &args.tls_cert) {
|
||||
(Some(key_path), Some(cert_path)) => Some(config::configure_tls(
|
||||
key_path,
|
||||
@@ -631,7 +624,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
|
||||
};
|
||||
let authentication_config = AuthenticationConfig {
|
||||
thread_pool,
|
||||
scram_protocol_timeout: args.scram_protocol_timeout,
|
||||
rate_limiter_enabled: args.auth_rate_limit_enabled,
|
||||
rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()),
|
||||
|
||||
@@ -2,7 +2,6 @@ use crate::{
|
||||
auth::{self, backend::AuthRateLimiter},
|
||||
console::locks::ApiLocks,
|
||||
rate_limiter::RateBucketInfo,
|
||||
scram::threadpool::ThreadPool,
|
||||
serverless::{cancel_set::CancelSet, GlobalConnPoolOptions},
|
||||
Host,
|
||||
};
|
||||
@@ -62,7 +61,6 @@ pub struct HttpConfig {
|
||||
}
|
||||
|
||||
pub struct AuthenticationConfig {
|
||||
pub thread_pool: Arc<ThreadPool>,
|
||||
pub scram_protocol_timeout: tokio::time::Duration,
|
||||
pub rate_limiter_enabled: bool,
|
||||
pub rate_limiter: AuthRateLimiter,
|
||||
|
||||
@@ -307,7 +307,7 @@ where
|
||||
}
|
||||
|
||||
async fn upload_parquet(
|
||||
mut w: SerializedFileWriter<Writer<BytesMut>>,
|
||||
w: SerializedFileWriter<Writer<BytesMut>>,
|
||||
len: i64,
|
||||
storage: &GenericRemoteStorage,
|
||||
) -> anyhow::Result<Writer<BytesMut>> {
|
||||
@@ -319,15 +319,11 @@ async fn upload_parquet(
|
||||
|
||||
// I don't know how compute intensive this is, although it probably isn't much... better be safe than sorry.
|
||||
// finish method only available on the fork: https://github.com/apache/arrow-rs/issues/5253
|
||||
let (mut buffer, metadata) =
|
||||
tokio::task::spawn_blocking(move || -> parquet::errors::Result<_> {
|
||||
let metadata = w.finish()?;
|
||||
let buffer = std::mem::take(w.inner_mut().get_mut());
|
||||
Ok((buffer, metadata))
|
||||
})
|
||||
let (writer, metadata) = tokio::task::spawn_blocking(move || w.finish())
|
||||
.await
|
||||
.unwrap()?;
|
||||
|
||||
let mut buffer = writer.into_inner();
|
||||
let data = buffer.split().freeze();
|
||||
|
||||
let compression = len as f64 / len_uncompressed as f64;
|
||||
@@ -355,7 +351,7 @@ async fn upload_parquet(
|
||||
"{year:04}/{month:02}/{day:02}/{hour:02}/requests_{id}.parquet"
|
||||
))?;
|
||||
let cancel = CancellationToken::new();
|
||||
let maybe_err = backoff::retry(
|
||||
backoff::retry(
|
||||
|| async {
|
||||
let stream = futures::stream::once(futures::future::ready(Ok(data.clone())));
|
||||
storage
|
||||
@@ -372,12 +368,7 @@ async fn upload_parquet(
|
||||
.await
|
||||
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
|
||||
.and_then(|x| x)
|
||||
.context("request_data_upload")
|
||||
.err();
|
||||
|
||||
if let Some(err) = maybe_err {
|
||||
tracing::warn!(%id, %err, "failed to upload request data");
|
||||
}
|
||||
.context("request_data_upload")?;
|
||||
|
||||
Ok(buffer.writer())
|
||||
}
|
||||
@@ -483,11 +474,10 @@ mod tests {
|
||||
RequestData {
|
||||
session_id: uuid::Builder::from_random_bytes(rng.gen()).into_uuid(),
|
||||
peer_addr: Ipv4Addr::from(rng.gen::<[u8; 4]>()).to_string(),
|
||||
timestamp: chrono::DateTime::from_timestamp_millis(
|
||||
timestamp: chrono::NaiveDateTime::from_timestamp_millis(
|
||||
rng.gen_range(1703862754..1803862754),
|
||||
)
|
||||
.unwrap()
|
||||
.naive_utc(),
|
||||
.unwrap(),
|
||||
application_name: Some("test".to_owned()),
|
||||
username: Some(hex::encode(rng.gen::<[u8; 4]>())),
|
||||
endpoint_id: Some(hex::encode(rng.gen::<[u8; 16]>())),
|
||||
@@ -570,15 +560,15 @@ mod tests {
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[
|
||||
(1315314, 3, 6000),
|
||||
(1315307, 3, 6000),
|
||||
(1315367, 3, 6000),
|
||||
(1315324, 3, 6000),
|
||||
(1315454, 3, 6000),
|
||||
(1315296, 3, 6000),
|
||||
(1315088, 3, 6000),
|
||||
(1315324, 3, 6000),
|
||||
(438713, 1, 2000)
|
||||
(1315008, 3, 6000),
|
||||
(1315001, 3, 6000),
|
||||
(1315061, 3, 6000),
|
||||
(1315018, 3, 6000),
|
||||
(1315148, 3, 6000),
|
||||
(1314990, 3, 6000),
|
||||
(1314782, 3, 6000),
|
||||
(1315018, 3, 6000),
|
||||
(438575, 1, 2000)
|
||||
]
|
||||
);
|
||||
|
||||
@@ -608,11 +598,11 @@ mod tests {
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[
|
||||
(1222212, 5, 10000),
|
||||
(1228362, 5, 10000),
|
||||
(1230156, 5, 10000),
|
||||
(1229518, 5, 10000),
|
||||
(1220796, 5, 10000)
|
||||
(1221738, 5, 10000),
|
||||
(1227888, 5, 10000),
|
||||
(1229682, 5, 10000),
|
||||
(1229044, 5, 10000),
|
||||
(1220322, 5, 10000)
|
||||
]
|
||||
);
|
||||
|
||||
@@ -644,11 +634,11 @@ mod tests {
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[
|
||||
(1207859, 5, 10000),
|
||||
(1207590, 5, 10000),
|
||||
(1207883, 5, 10000),
|
||||
(1207871, 5, 10000),
|
||||
(1208126, 5, 10000)
|
||||
(1207385, 5, 10000),
|
||||
(1207116, 5, 10000),
|
||||
(1207409, 5, 10000),
|
||||
(1207397, 5, 10000),
|
||||
(1207652, 5, 10000)
|
||||
]
|
||||
);
|
||||
|
||||
@@ -673,15 +663,15 @@ mod tests {
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[
|
||||
(1315314, 3, 6000),
|
||||
(1315307, 3, 6000),
|
||||
(1315367, 3, 6000),
|
||||
(1315324, 3, 6000),
|
||||
(1315454, 3, 6000),
|
||||
(1315296, 3, 6000),
|
||||
(1315088, 3, 6000),
|
||||
(1315324, 3, 6000),
|
||||
(438713, 1, 2000)
|
||||
(1315008, 3, 6000),
|
||||
(1315001, 3, 6000),
|
||||
(1315061, 3, 6000),
|
||||
(1315018, 3, 6000),
|
||||
(1315148, 3, 6000),
|
||||
(1314990, 3, 6000),
|
||||
(1314782, 3, 6000),
|
||||
(1315018, 3, 6000),
|
||||
(438575, 1, 2000)
|
||||
]
|
||||
);
|
||||
|
||||
@@ -718,7 +708,7 @@ mod tests {
|
||||
// files are smaller than the size threshold, but they took too long to fill so were flushed early
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[(659462, 2, 3001), (659176, 2, 3000), (658972, 2, 2999)]
|
||||
[(659240, 2, 3001), (658954, 2, 3000), (658750, 2, 2999)]
|
||||
);
|
||||
|
||||
tmpdir.close().unwrap();
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
use std::sync::{Arc, OnceLock};
|
||||
use std::sync::OnceLock;
|
||||
|
||||
use lasso::ThreadedRodeo;
|
||||
use measured::{
|
||||
label::{FixedCardinalitySet, LabelName, LabelSet, LabelValue, StaticLabelSet},
|
||||
label::StaticLabelSet,
|
||||
metric::{histogram::Thresholds, name::MetricName},
|
||||
Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec,
|
||||
LabelGroup, MetricGroup,
|
||||
Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup,
|
||||
MetricGroup,
|
||||
};
|
||||
use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec};
|
||||
|
||||
@@ -14,36 +14,26 @@ use tokio::time::{self, Instant};
|
||||
use crate::console::messages::ColdStartInfo;
|
||||
|
||||
#[derive(MetricGroup)]
|
||||
#[metric(new(thread_pool: Arc<ThreadPoolMetrics>))]
|
||||
pub struct Metrics {
|
||||
#[metric(namespace = "proxy")]
|
||||
#[metric(init = ProxyMetrics::new(thread_pool))]
|
||||
pub proxy: ProxyMetrics,
|
||||
|
||||
#[metric(namespace = "wake_compute_lock")]
|
||||
pub wake_compute_lock: ApiLockMetrics,
|
||||
}
|
||||
|
||||
static SELF: OnceLock<Metrics> = OnceLock::new();
|
||||
impl Metrics {
|
||||
pub fn install(thread_pool: Arc<ThreadPoolMetrics>) {
|
||||
SELF.set(Metrics::new(thread_pool))
|
||||
.ok()
|
||||
.expect("proxy metrics must not be installed more than once");
|
||||
}
|
||||
|
||||
pub fn get() -> &'static Self {
|
||||
#[cfg(test)]
|
||||
return SELF.get_or_init(|| Metrics::new(Arc::new(ThreadPoolMetrics::new(0))));
|
||||
|
||||
#[cfg(not(test))]
|
||||
SELF.get()
|
||||
.expect("proxy metrics must be installed by the main() function")
|
||||
static SELF: OnceLock<Metrics> = OnceLock::new();
|
||||
SELF.get_or_init(|| Metrics {
|
||||
proxy: ProxyMetrics::default(),
|
||||
wake_compute_lock: ApiLockMetrics::new(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(MetricGroup)]
|
||||
#[metric(new(thread_pool: Arc<ThreadPoolMetrics>))]
|
||||
#[metric(new())]
|
||||
pub struct ProxyMetrics {
|
||||
#[metric(flatten)]
|
||||
pub db_connections: CounterPairVec<NumDbConnectionsGauge>,
|
||||
@@ -139,10 +129,6 @@ pub struct ProxyMetrics {
|
||||
|
||||
#[metric(namespace = "connect_compute_lock")]
|
||||
pub connect_compute_lock: ApiLockMetrics,
|
||||
|
||||
#[metric(namespace = "scram_pool")]
|
||||
#[metric(init = thread_pool)]
|
||||
pub scram_pool: Arc<ThreadPoolMetrics>,
|
||||
}
|
||||
|
||||
#[derive(MetricGroup)]
|
||||
@@ -160,6 +146,12 @@ pub struct ApiLockMetrics {
|
||||
pub semaphore_acquire_seconds: Histogram<16>,
|
||||
}
|
||||
|
||||
impl Default for ProxyMetrics {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ApiLockMetrics {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
@@ -561,52 +553,3 @@ pub enum RedisEventsCount {
|
||||
PasswordUpdate,
|
||||
AllowedIpsUpdate,
|
||||
}
|
||||
|
||||
pub struct ThreadPoolWorkers(usize);
|
||||
pub struct ThreadPoolWorkerId(pub usize);
|
||||
|
||||
impl LabelValue for ThreadPoolWorkerId {
|
||||
fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
|
||||
v.write_int(self.0 as i64)
|
||||
}
|
||||
}
|
||||
|
||||
impl LabelGroup for ThreadPoolWorkerId {
|
||||
fn visit_values(&self, v: &mut impl measured::label::LabelGroupVisitor) {
|
||||
v.write_value(LabelName::from_str("worker"), self);
|
||||
}
|
||||
}
|
||||
|
||||
impl LabelSet for ThreadPoolWorkers {
|
||||
type Value<'a> = ThreadPoolWorkerId;
|
||||
|
||||
fn dynamic_cardinality(&self) -> Option<usize> {
|
||||
Some(self.0)
|
||||
}
|
||||
|
||||
fn encode(&self, value: Self::Value<'_>) -> Option<usize> {
|
||||
(value.0 < self.0).then_some(value.0)
|
||||
}
|
||||
|
||||
fn decode(&self, value: usize) -> Self::Value<'_> {
|
||||
ThreadPoolWorkerId(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl FixedCardinalitySet for ThreadPoolWorkers {
|
||||
fn cardinality(&self) -> usize {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(MetricGroup)]
|
||||
#[metric(new(workers: usize))]
|
||||
pub struct ThreadPoolMetrics {
|
||||
pub injector_queue_depth: Gauge,
|
||||
#[metric(init = GaugeVec::with_label_set(ThreadPoolWorkers(workers)))]
|
||||
pub worker_queue_depth: GaugeVec<ThreadPoolWorkers>,
|
||||
#[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))]
|
||||
pub worker_task_turns_total: CounterVec<ThreadPoolWorkers>,
|
||||
#[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))]
|
||||
pub worker_task_skips_total: CounterVec<ThreadPoolWorkers>,
|
||||
}
|
||||
|
||||
@@ -6,14 +6,11 @@
|
||||
//! * <https://github.com/postgres/postgres/blob/94226d4506e66d6e7cbf4b391f1e7393c1962841/src/backend/libpq/auth-scram.c>
|
||||
//! * <https://github.com/postgres/postgres/blob/94226d4506e66d6e7cbf4b391f1e7393c1962841/src/interfaces/libpq/fe-auth-scram.c>
|
||||
|
||||
mod countmin;
|
||||
mod exchange;
|
||||
mod key;
|
||||
mod messages;
|
||||
mod pbkdf2;
|
||||
mod secret;
|
||||
mod signature;
|
||||
pub mod threadpool;
|
||||
|
||||
pub use exchange::{exchange, Exchange};
|
||||
pub use key::ScramKey;
|
||||
@@ -59,13 +56,9 @@ fn sha256<'a>(parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::{
|
||||
intern::EndpointIdInt,
|
||||
sasl::{Mechanism, Step},
|
||||
EndpointId,
|
||||
};
|
||||
use crate::sasl::{Mechanism, Step};
|
||||
|
||||
use super::{threadpool::ThreadPool, Exchange, ServerSecret};
|
||||
use super::{Exchange, ServerSecret};
|
||||
|
||||
#[test]
|
||||
fn snapshot() {
|
||||
@@ -119,13 +112,8 @@ mod tests {
|
||||
}
|
||||
|
||||
async fn run_round_trip_test(server_password: &str, client_password: &str) {
|
||||
let pool = ThreadPool::new(1);
|
||||
|
||||
let ep = EndpointId::from("foo");
|
||||
let ep = EndpointIdInt::from(ep);
|
||||
|
||||
let scram_secret = ServerSecret::build(server_password).await.unwrap();
|
||||
let outcome = super::exchange(&pool, ep, &scram_secret, client_password.as_bytes())
|
||||
let outcome = super::exchange(&scram_secret, client_password.as_bytes())
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
|
||||
@@ -1,173 +0,0 @@
|
||||
use std::hash::Hash;
|
||||
|
||||
/// estimator of hash jobs per second.
|
||||
/// <https://en.wikipedia.org/wiki/Count%E2%80%93min_sketch>
|
||||
pub struct CountMinSketch {
|
||||
// one for each depth
|
||||
hashers: Vec<ahash::RandomState>,
|
||||
width: usize,
|
||||
depth: usize,
|
||||
// buckets, width*depth
|
||||
buckets: Vec<u32>,
|
||||
}
|
||||
|
||||
impl CountMinSketch {
|
||||
/// Given parameters (ε, δ),
|
||||
/// set width = ceil(e/ε)
|
||||
/// set depth = ceil(ln(1/δ))
|
||||
///
|
||||
/// guarantees:
|
||||
/// actual <= estimate
|
||||
/// estimate <= actual + ε * N with probability 1 - δ
|
||||
/// where N is the cardinality of the stream
|
||||
pub fn with_params(epsilon: f64, delta: f64) -> Self {
|
||||
CountMinSketch::new(
|
||||
(std::f64::consts::E / epsilon).ceil() as usize,
|
||||
(1.0_f64 / delta).ln().ceil() as usize,
|
||||
)
|
||||
}
|
||||
|
||||
fn new(width: usize, depth: usize) -> Self {
|
||||
Self {
|
||||
#[cfg(test)]
|
||||
hashers: (0..depth)
|
||||
.map(|i| {
|
||||
// digits of pi for good randomness
|
||||
ahash::RandomState::with_seeds(
|
||||
314159265358979323,
|
||||
84626433832795028,
|
||||
84197169399375105,
|
||||
82097494459230781 + i as u64,
|
||||
)
|
||||
})
|
||||
.collect(),
|
||||
#[cfg(not(test))]
|
||||
hashers: (0..depth).map(|_| ahash::RandomState::new()).collect(),
|
||||
width,
|
||||
depth,
|
||||
buckets: vec![0; width * depth],
|
||||
}
|
||||
}
|
||||
|
||||
pub fn inc_and_return<T: Hash>(&mut self, t: &T, x: u32) -> u32 {
|
||||
let mut min = u32::MAX;
|
||||
for row in 0..self.depth {
|
||||
let col = (self.hashers[row].hash_one(t) as usize) % self.width;
|
||||
|
||||
let row = &mut self.buckets[row * self.width..][..self.width];
|
||||
row[col] = row[col].saturating_add(x);
|
||||
min = std::cmp::min(min, row[col]);
|
||||
}
|
||||
min
|
||||
}
|
||||
|
||||
pub fn reset(&mut self) {
|
||||
self.buckets.clear();
|
||||
self.buckets.resize(self.width * self.depth, 0);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use rand::{rngs::StdRng, seq::SliceRandom, Rng, SeedableRng};
|
||||
|
||||
use super::CountMinSketch;
|
||||
|
||||
fn eval_precision(n: usize, p: f64, q: f64) -> usize {
|
||||
// fixed value of phi for consistent test
|
||||
let mut rng = StdRng::seed_from_u64(16180339887498948482);
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
let mut N = 0;
|
||||
|
||||
let mut ids = vec![];
|
||||
|
||||
for _ in 0..n {
|
||||
// number of insert operations
|
||||
let n = rng.gen_range(1..100);
|
||||
// number to insert at once
|
||||
let m = rng.gen_range(1..4096);
|
||||
|
||||
let id = uuid::Builder::from_random_bytes(rng.gen()).into_uuid();
|
||||
ids.push((id, n, m));
|
||||
|
||||
// N = sum(actual)
|
||||
N += n * m;
|
||||
}
|
||||
|
||||
// q% of counts will be within p of the actual value
|
||||
let mut sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q);
|
||||
|
||||
dbg!(sketch.buckets.len());
|
||||
|
||||
// insert a bunch of entries in a random order
|
||||
let mut ids2 = ids.clone();
|
||||
while !ids2.is_empty() {
|
||||
ids2.shuffle(&mut rng);
|
||||
|
||||
let mut i = 0;
|
||||
while i < ids2.len() {
|
||||
sketch.inc_and_return(&ids2[i].0, ids2[i].1);
|
||||
ids2[i].2 -= 1;
|
||||
if ids2[i].2 == 0 {
|
||||
ids2.remove(i);
|
||||
} else {
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut within_p = 0;
|
||||
for (id, n, m) in ids {
|
||||
let actual = n * m;
|
||||
let estimate = sketch.inc_and_return(&id, 0);
|
||||
|
||||
// This estimate has the guarantee that actual <= estimate
|
||||
assert!(actual <= estimate);
|
||||
|
||||
// This estimate has the guarantee that estimate <= actual + εN with probability 1 - δ.
|
||||
// ε = p / N, δ = 1 - q;
|
||||
// therefore, estimate <= actual + p with probability q.
|
||||
if estimate as f64 <= actual as f64 + p {
|
||||
within_p += 1;
|
||||
}
|
||||
}
|
||||
within_p
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn precision() {
|
||||
assert_eq!(eval_precision(100, 100.0, 0.99), 100);
|
||||
assert_eq!(eval_precision(1000, 100.0, 0.99), 1000);
|
||||
assert_eq!(eval_precision(100, 4096.0, 0.99), 100);
|
||||
assert_eq!(eval_precision(1000, 4096.0, 0.99), 1000);
|
||||
|
||||
// seems to be more precise than the literature indicates?
|
||||
// probably numbers are too small to truly represent the probabilities.
|
||||
assert_eq!(eval_precision(100, 4096.0, 0.90), 100);
|
||||
assert_eq!(eval_precision(1000, 4096.0, 0.90), 1000);
|
||||
assert_eq!(eval_precision(100, 4096.0, 0.1), 98);
|
||||
assert_eq!(eval_precision(1000, 4096.0, 0.1), 991);
|
||||
}
|
||||
|
||||
// returns memory usage in bytes, and the time complexity per insert.
|
||||
fn eval_cost(p: f64, q: f64) -> (usize, usize) {
|
||||
#[allow(non_snake_case)]
|
||||
// N = sum(actual)
|
||||
// Let's assume 1021 samples, all of 4096
|
||||
let N = 1021 * 4096;
|
||||
let sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q);
|
||||
|
||||
let memory = std::mem::size_of::<u32>() * sketch.buckets.len();
|
||||
let time = sketch.depth;
|
||||
(memory, time)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn memory_usage() {
|
||||
assert_eq!(eval_cost(100.0, 0.99), (2273580, 5));
|
||||
assert_eq!(eval_cost(4096.0, 0.99), (55520, 5));
|
||||
assert_eq!(eval_cost(4096.0, 0.90), (33312, 3));
|
||||
assert_eq!(eval_cost(4096.0, 0.1), (11104, 1));
|
||||
}
|
||||
}
|
||||
@@ -4,17 +4,15 @@ use std::convert::Infallible;
|
||||
|
||||
use hmac::{Hmac, Mac};
|
||||
use sha2::Sha256;
|
||||
use tokio::task::yield_now;
|
||||
|
||||
use super::messages::{
|
||||
ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN,
|
||||
};
|
||||
use super::pbkdf2::Pbkdf2;
|
||||
use super::secret::ServerSecret;
|
||||
use super::signature::SignatureBuilder;
|
||||
use super::threadpool::ThreadPool;
|
||||
use super::ScramKey;
|
||||
use crate::config;
|
||||
use crate::intern::EndpointIdInt;
|
||||
use crate::sasl::{self, ChannelBinding, Error as SaslError};
|
||||
|
||||
/// The only channel binding mode we currently support.
|
||||
@@ -76,18 +74,37 @@ impl<'a> Exchange<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
// copied from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L36-L61>
|
||||
async fn pbkdf2(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] {
|
||||
let hmac = Hmac::<Sha256>::new_from_slice(str).expect("HMAC is able to accept all key sizes");
|
||||
let mut prev = hmac
|
||||
.clone()
|
||||
.chain_update(salt)
|
||||
.chain_update(1u32.to_be_bytes())
|
||||
.finalize()
|
||||
.into_bytes();
|
||||
|
||||
let mut hi = prev;
|
||||
|
||||
for i in 1..iterations {
|
||||
prev = hmac.clone().chain_update(prev).finalize().into_bytes();
|
||||
|
||||
for (hi, prev) in hi.iter_mut().zip(prev) {
|
||||
*hi ^= prev;
|
||||
}
|
||||
// yield every ~250us
|
||||
// hopefully reduces tail latencies
|
||||
if i % 1024 == 0 {
|
||||
yield_now().await
|
||||
}
|
||||
}
|
||||
|
||||
hi.into()
|
||||
}
|
||||
|
||||
// copied from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L236-L248>
|
||||
async fn derive_client_key(
|
||||
pool: &ThreadPool,
|
||||
endpoint: EndpointIdInt,
|
||||
password: &[u8],
|
||||
salt: &[u8],
|
||||
iterations: u32,
|
||||
) -> ScramKey {
|
||||
let salted_password = pool
|
||||
.spawn_job(endpoint, Pbkdf2::start(password, salt, iterations))
|
||||
.await
|
||||
.expect("job should not be cancelled");
|
||||
async fn derive_client_key(password: &[u8], salt: &[u8], iterations: u32) -> ScramKey {
|
||||
let salted_password = pbkdf2(password, salt, iterations).await;
|
||||
|
||||
let make_key = |name| {
|
||||
let key = Hmac::<Sha256>::new_from_slice(&salted_password)
|
||||
@@ -102,13 +119,11 @@ async fn derive_client_key(
|
||||
}
|
||||
|
||||
pub async fn exchange(
|
||||
pool: &ThreadPool,
|
||||
endpoint: EndpointIdInt,
|
||||
secret: &ServerSecret,
|
||||
password: &[u8],
|
||||
) -> sasl::Result<sasl::Outcome<super::ScramKey>> {
|
||||
let salt = base64::decode(&secret.salt_base64)?;
|
||||
let client_key = derive_client_key(pool, endpoint, password, &salt, secret.iterations).await;
|
||||
let client_key = derive_client_key(password, &salt, secret.iterations).await;
|
||||
|
||||
if secret.is_password_invalid(&client_key).into() {
|
||||
Ok(sasl::Outcome::Failure("password doesn't match"))
|
||||
|
||||
@@ -1,89 +0,0 @@
|
||||
use hmac::{
|
||||
digest::{consts::U32, generic_array::GenericArray},
|
||||
Hmac, Mac,
|
||||
};
|
||||
use sha2::Sha256;
|
||||
|
||||
pub struct Pbkdf2 {
|
||||
hmac: Hmac<Sha256>,
|
||||
prev: GenericArray<u8, U32>,
|
||||
hi: GenericArray<u8, U32>,
|
||||
iterations: u32,
|
||||
}
|
||||
|
||||
// inspired from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L36-L61>
|
||||
impl Pbkdf2 {
|
||||
pub fn start(str: &[u8], salt: &[u8], iterations: u32) -> Self {
|
||||
let hmac =
|
||||
Hmac::<Sha256>::new_from_slice(str).expect("HMAC is able to accept all key sizes");
|
||||
|
||||
let prev = hmac
|
||||
.clone()
|
||||
.chain_update(salt)
|
||||
.chain_update(1u32.to_be_bytes())
|
||||
.finalize()
|
||||
.into_bytes();
|
||||
|
||||
Self {
|
||||
hmac,
|
||||
// one consumed for the hash above
|
||||
iterations: iterations - 1,
|
||||
hi: prev,
|
||||
prev,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn cost(&self) -> u32 {
|
||||
(self.iterations).clamp(0, 4096)
|
||||
}
|
||||
|
||||
pub fn turn(&mut self) -> std::task::Poll<[u8; 32]> {
|
||||
let Self {
|
||||
hmac,
|
||||
prev,
|
||||
hi,
|
||||
iterations,
|
||||
} = self;
|
||||
|
||||
// only do 4096 iterations per turn before sharing the thread for fairness
|
||||
let n = (*iterations).clamp(0, 4096);
|
||||
for _ in 0..n {
|
||||
*prev = hmac.clone().chain_update(*prev).finalize().into_bytes();
|
||||
|
||||
for (hi, prev) in hi.iter_mut().zip(*prev) {
|
||||
*hi ^= prev;
|
||||
}
|
||||
}
|
||||
|
||||
*iterations -= n;
|
||||
if *iterations == 0 {
|
||||
std::task::Poll::Ready((*hi).into())
|
||||
} else {
|
||||
std::task::Poll::Pending
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::Pbkdf2;
|
||||
use pbkdf2::pbkdf2_hmac_array;
|
||||
use sha2::Sha256;
|
||||
|
||||
#[test]
|
||||
fn works() {
|
||||
let salt = b"sodium chloride";
|
||||
let pass = b"Ne0n_!5_50_C007";
|
||||
|
||||
let mut job = Pbkdf2::start(pass, salt, 600000);
|
||||
let hash = loop {
|
||||
let std::task::Poll::Ready(hash) = job.turn() else {
|
||||
continue;
|
||||
};
|
||||
break hash;
|
||||
};
|
||||
|
||||
let expected = pbkdf2_hmac_array::<Sha256, 32>(pass, salt, 600000);
|
||||
assert_eq!(hash, expected)
|
||||
}
|
||||
}
|
||||
@@ -1,321 +0,0 @@
|
||||
//! Custom threadpool implementation for password hashing.
|
||||
//!
|
||||
//! Requirements:
|
||||
//! 1. Fairness per endpoint.
|
||||
//! 2. Yield support for high iteration counts.
|
||||
|
||||
use std::sync::{
|
||||
atomic::{AtomicU64, Ordering},
|
||||
Arc,
|
||||
};
|
||||
|
||||
use crossbeam_deque::{Injector, Stealer, Worker};
|
||||
use itertools::Itertools;
|
||||
use parking_lot::{Condvar, Mutex};
|
||||
use rand::Rng;
|
||||
use rand::{rngs::SmallRng, SeedableRng};
|
||||
use tokio::sync::oneshot;
|
||||
|
||||
use crate::{
|
||||
intern::EndpointIdInt,
|
||||
metrics::{ThreadPoolMetrics, ThreadPoolWorkerId},
|
||||
scram::countmin::CountMinSketch,
|
||||
};
|
||||
|
||||
use super::pbkdf2::Pbkdf2;
|
||||
|
||||
pub struct ThreadPool {
|
||||
queue: Injector<JobSpec>,
|
||||
stealers: Vec<Stealer<JobSpec>>,
|
||||
parkers: Vec<(Condvar, Mutex<ThreadState>)>,
|
||||
/// bitpacked representation.
|
||||
/// lower 8 bits = number of sleeping threads
|
||||
/// next 8 bits = number of idle threads (searching for work)
|
||||
counters: AtomicU64,
|
||||
|
||||
pub metrics: Arc<ThreadPoolMetrics>,
|
||||
}
|
||||
|
||||
#[derive(PartialEq)]
|
||||
enum ThreadState {
|
||||
Parked,
|
||||
Active,
|
||||
}
|
||||
|
||||
impl ThreadPool {
|
||||
pub fn new(n_workers: u8) -> Arc<Self> {
|
||||
let workers = (0..n_workers).map(|_| Worker::new_fifo()).collect_vec();
|
||||
let stealers = workers.iter().map(|w| w.stealer()).collect_vec();
|
||||
|
||||
let parkers = (0..n_workers)
|
||||
.map(|_| (Condvar::new(), Mutex::new(ThreadState::Active)))
|
||||
.collect_vec();
|
||||
|
||||
let pool = Arc::new(Self {
|
||||
queue: Injector::new(),
|
||||
stealers,
|
||||
parkers,
|
||||
// threads start searching for work
|
||||
counters: AtomicU64::new((n_workers as u64) << 8),
|
||||
metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)),
|
||||
});
|
||||
|
||||
for (i, worker) in workers.into_iter().enumerate() {
|
||||
let pool = Arc::clone(&pool);
|
||||
std::thread::spawn(move || thread_rt(pool, worker, i));
|
||||
}
|
||||
|
||||
pool
|
||||
}
|
||||
|
||||
pub fn spawn_job(
|
||||
&self,
|
||||
endpoint: EndpointIdInt,
|
||||
pbkdf2: Pbkdf2,
|
||||
) -> oneshot::Receiver<[u8; 32]> {
|
||||
let (tx, rx) = oneshot::channel();
|
||||
|
||||
let queue_was_empty = self.queue.is_empty();
|
||||
|
||||
self.metrics.injector_queue_depth.inc();
|
||||
self.queue.push(JobSpec {
|
||||
response: tx,
|
||||
pbkdf2,
|
||||
endpoint,
|
||||
});
|
||||
|
||||
// inspired from <https://github.com/rayon-rs/rayon/blob/3e3962cb8f7b50773bcc360b48a7a674a53a2c77/rayon-core/src/sleep/mod.rs#L242>
|
||||
let counts = self.counters.load(Ordering::SeqCst);
|
||||
let num_awake_but_idle = (counts >> 8) & 0xff;
|
||||
let num_sleepers = counts & 0xff;
|
||||
|
||||
// If the queue is non-empty, then we always wake up a worker
|
||||
// -- clearly the existing idle jobs aren't enough. Otherwise,
|
||||
// check to see if we have enough idle workers.
|
||||
if !queue_was_empty || num_awake_but_idle == 0 {
|
||||
let num_to_wake = Ord::min(1, num_sleepers);
|
||||
self.wake_any_threads(num_to_wake);
|
||||
}
|
||||
|
||||
rx
|
||||
}
|
||||
|
||||
#[cold]
|
||||
fn wake_any_threads(&self, mut num_to_wake: u64) {
|
||||
if num_to_wake > 0 {
|
||||
for i in 0..self.parkers.len() {
|
||||
if self.wake_specific_thread(i) {
|
||||
num_to_wake -= 1;
|
||||
if num_to_wake == 0 {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn wake_specific_thread(&self, index: usize) -> bool {
|
||||
let (condvar, lock) = &self.parkers[index];
|
||||
|
||||
let mut state = lock.lock();
|
||||
if *state == ThreadState::Parked {
|
||||
condvar.notify_one();
|
||||
|
||||
// When the thread went to sleep, it will have incremented
|
||||
// this value. When we wake it, its our job to decrement
|
||||
// it. We could have the thread do it, but that would
|
||||
// introduce a delay between when the thread was
|
||||
// *notified* and when this counter was decremented. That
|
||||
// might mislead people with new work into thinking that
|
||||
// there are sleeping threads that they should try to
|
||||
// wake, when in fact there is nothing left for them to
|
||||
// do.
|
||||
self.counters.fetch_sub(1, Ordering::SeqCst);
|
||||
*state = ThreadState::Active;
|
||||
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
fn steal(&self, rng: &mut impl Rng, skip: usize, worker: &Worker<JobSpec>) -> Option<JobSpec> {
|
||||
// announce thread as idle
|
||||
self.counters.fetch_add(256, Ordering::SeqCst);
|
||||
|
||||
// try steal from the global queue
|
||||
loop {
|
||||
match self.queue.steal_batch_and_pop(worker) {
|
||||
crossbeam_deque::Steal::Success(job) => {
|
||||
self.metrics
|
||||
.injector_queue_depth
|
||||
.set(self.queue.len() as i64);
|
||||
// no longer idle
|
||||
self.counters.fetch_sub(256, Ordering::SeqCst);
|
||||
return Some(job);
|
||||
}
|
||||
crossbeam_deque::Steal::Retry => continue,
|
||||
crossbeam_deque::Steal::Empty => break,
|
||||
}
|
||||
}
|
||||
|
||||
// try steal from our neighbours
|
||||
loop {
|
||||
let mut retry = false;
|
||||
let start = rng.gen_range(0..self.stealers.len());
|
||||
let job = (start..self.stealers.len())
|
||||
.chain(0..start)
|
||||
.filter(|i| *i != skip)
|
||||
.find_map(
|
||||
|victim| match self.stealers[victim].steal_batch_and_pop(worker) {
|
||||
crossbeam_deque::Steal::Success(job) => Some(job),
|
||||
crossbeam_deque::Steal::Empty => None,
|
||||
crossbeam_deque::Steal::Retry => {
|
||||
retry = true;
|
||||
None
|
||||
}
|
||||
},
|
||||
);
|
||||
if job.is_some() {
|
||||
// no longer idle
|
||||
self.counters.fetch_sub(256, Ordering::SeqCst);
|
||||
return job;
|
||||
}
|
||||
if !retry {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn thread_rt(pool: Arc<ThreadPool>, worker: Worker<JobSpec>, index: usize) {
|
||||
/// interval when we should steal from the global queue
|
||||
/// so that tail latencies are managed appropriately
|
||||
const STEAL_INTERVAL: usize = 61;
|
||||
|
||||
/// How often to reset the sketch values
|
||||
const SKETCH_RESET_INTERVAL: usize = 1021;
|
||||
|
||||
let mut rng = SmallRng::from_entropy();
|
||||
|
||||
// used to determine whether we should temporarily skip tasks for fairness.
|
||||
// 99% of estimates will overcount by no more than 4096 samples
|
||||
let mut sketch = CountMinSketch::with_params(1.0 / (SKETCH_RESET_INTERVAL as f64), 0.01);
|
||||
|
||||
let (condvar, lock) = &pool.parkers[index];
|
||||
|
||||
'wait: loop {
|
||||
// wait for notification of work
|
||||
{
|
||||
let mut lock = lock.lock();
|
||||
|
||||
// queue is empty
|
||||
pool.metrics
|
||||
.worker_queue_depth
|
||||
.set(ThreadPoolWorkerId(index), 0);
|
||||
|
||||
// subtract 1 from idle count, add 1 to sleeping count.
|
||||
pool.counters.fetch_sub(255, Ordering::SeqCst);
|
||||
|
||||
*lock = ThreadState::Parked;
|
||||
condvar.wait(&mut lock);
|
||||
}
|
||||
|
||||
for i in 0.. {
|
||||
let mut job = match worker
|
||||
.pop()
|
||||
.or_else(|| pool.steal(&mut rng, index, &worker))
|
||||
{
|
||||
Some(job) => job,
|
||||
None => continue 'wait,
|
||||
};
|
||||
|
||||
pool.metrics
|
||||
.worker_queue_depth
|
||||
.set(ThreadPoolWorkerId(index), worker.len() as i64);
|
||||
|
||||
// receiver is closed, cancel the task
|
||||
if !job.response.is_closed() {
|
||||
let rate = sketch.inc_and_return(&job.endpoint, job.pbkdf2.cost());
|
||||
|
||||
const P: f64 = 2000.0;
|
||||
// probability decreases as rate increases.
|
||||
// lower probability, higher chance of being skipped
|
||||
//
|
||||
// estimates (rate in terms of 4096 rounds):
|
||||
// rate = 0 => probability = 100%
|
||||
// rate = 10 => probability = 71.3%
|
||||
// rate = 50 => probability = 62.1%
|
||||
// rate = 500 => probability = 52.3%
|
||||
// rate = 1021 => probability = 49.8%
|
||||
//
|
||||
// My expectation is that the pool queue will only begin backing up at ~1000rps
|
||||
// in which case the SKETCH_RESET_INTERVAL represents 1 second. Thus, the rates above
|
||||
// are in requests per second.
|
||||
let probability = P.ln() / (P + rate as f64).ln();
|
||||
if pool.queue.len() > 32 || rng.gen_bool(probability) {
|
||||
pool.metrics
|
||||
.worker_task_turns_total
|
||||
.inc(ThreadPoolWorkerId(index));
|
||||
|
||||
match job.pbkdf2.turn() {
|
||||
std::task::Poll::Ready(result) => {
|
||||
let _ = job.response.send(result);
|
||||
}
|
||||
std::task::Poll::Pending => worker.push(job),
|
||||
}
|
||||
} else {
|
||||
pool.metrics
|
||||
.worker_task_skips_total
|
||||
.inc(ThreadPoolWorkerId(index));
|
||||
|
||||
// skip for now
|
||||
worker.push(job)
|
||||
}
|
||||
}
|
||||
|
||||
// if we get stuck with a few long lived jobs in the queue
|
||||
// it's better to try and steal from the queue too for fairness
|
||||
if i % STEAL_INTERVAL == 0 {
|
||||
let _ = pool.queue.steal_batch(&worker);
|
||||
}
|
||||
|
||||
if i % SKETCH_RESET_INTERVAL == 0 {
|
||||
sketch.reset();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct JobSpec {
|
||||
response: oneshot::Sender<[u8; 32]>,
|
||||
pbkdf2: Pbkdf2,
|
||||
endpoint: EndpointIdInt,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::EndpointId;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn hash_is_correct() {
|
||||
let pool = ThreadPool::new(1);
|
||||
|
||||
let ep = EndpointId::from("foo");
|
||||
let ep = EndpointIdInt::from(ep);
|
||||
|
||||
let salt = [0x55; 32];
|
||||
let actual = pool
|
||||
.spawn_job(ep, Pbkdf2::start(b"password", &salt, 4096))
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let expected = [
|
||||
10, 114, 73, 188, 140, 222, 196, 156, 214, 184, 79, 157, 119, 242, 16, 31, 53, 242,
|
||||
178, 43, 95, 8, 225, 182, 122, 40, 219, 21, 89, 147, 64, 140,
|
||||
];
|
||||
assert_eq!(actual, expected)
|
||||
}
|
||||
}
|
||||
@@ -15,7 +15,6 @@ use crate::{
|
||||
},
|
||||
context::RequestMonitoring,
|
||||
error::{ErrorKind, ReportableError, UserFacingError},
|
||||
intern::EndpointIdInt,
|
||||
proxy::{connect_compute::ConnectMechanism, retry::ShouldRetry},
|
||||
rate_limiter::EndpointRateLimiter,
|
||||
Host,
|
||||
@@ -67,14 +66,8 @@ impl PoolingBackend {
|
||||
return Err(AuthError::auth_failed(&*user_info.user));
|
||||
}
|
||||
};
|
||||
let ep = EndpointIdInt::from(&conn_info.user_info.endpoint);
|
||||
let auth_outcome = crate::auth::validate_password_and_exchange(
|
||||
&config.thread_pool,
|
||||
ep,
|
||||
&conn_info.password,
|
||||
secret,
|
||||
)
|
||||
.await?;
|
||||
let auth_outcome =
|
||||
crate::auth::validate_password_and_exchange(&conn_info.password, secret).await?;
|
||||
let res = match auth_outcome {
|
||||
crate::sasl::Outcome::Success(key) => {
|
||||
info!("user successfully authenticated");
|
||||
|
||||
@@ -10,7 +10,7 @@ pytest = "^7.4.4"
|
||||
psycopg2-binary = "^2.9.6"
|
||||
typing-extensions = "^4.6.1"
|
||||
PyJWT = {version = "^2.1.0", extras = ["crypto"]}
|
||||
requests = "^2.32.0"
|
||||
requests = "^2.31.0"
|
||||
pytest-xdist = "^3.3.1"
|
||||
asyncpg = "^0.29.0"
|
||||
aiopg = "^1.4.0"
|
||||
|
||||
@@ -20,6 +20,7 @@ use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use storage_broker::Uri;
|
||||
use tokio::sync::mpsc;
|
||||
|
||||
use tracing::*;
|
||||
use utils::pid_file;
|
||||
@@ -29,13 +30,13 @@ use safekeeper::defaults::{
|
||||
DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
|
||||
DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR,
|
||||
};
|
||||
use safekeeper::remove_wal;
|
||||
use safekeeper::wal_service;
|
||||
use safekeeper::GlobalTimelines;
|
||||
use safekeeper::SafeKeeperConf;
|
||||
use safekeeper::{broker, WAL_SERVICE_RUNTIME};
|
||||
use safekeeper::{control_file, BROKER_RUNTIME};
|
||||
use safekeeper::{http, WAL_REMOVER_RUNTIME};
|
||||
use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME};
|
||||
use safekeeper::{wal_backup, HTTP_RUNTIME};
|
||||
use storage_broker::DEFAULT_ENDPOINT;
|
||||
use utils::auth::{JwtAuth, Scope, SwappableJwtAuth};
|
||||
@@ -376,6 +377,8 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
let timeline_collector = safekeeper::metrics::TimelineCollector::new();
|
||||
metrics::register_internal(Box::new(timeline_collector))?;
|
||||
|
||||
let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);
|
||||
|
||||
wal_backup::init_remote_storage(&conf);
|
||||
|
||||
// Keep handles to main tasks to die if any of them disappears.
|
||||
@@ -388,9 +391,19 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
let current_thread_rt = conf
|
||||
.current_thread_runtime
|
||||
.then(|| Handle::try_current().expect("no runtime in main"));
|
||||
let conf_ = conf.clone();
|
||||
let wal_backup_handle = current_thread_rt
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| WAL_BACKUP_RUNTIME.handle())
|
||||
.spawn(wal_backup::wal_backup_launcher_task_main(
|
||||
conf_,
|
||||
wal_backup_launcher_rx,
|
||||
))
|
||||
.map(|res| ("WAL backup launcher".to_owned(), res));
|
||||
tasks_handles.push(Box::pin(wal_backup_handle));
|
||||
|
||||
// Load all timelines from disk to memory.
|
||||
GlobalTimelines::init(conf.clone()).await?;
|
||||
GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx).await?;
|
||||
|
||||
let conf_ = conf.clone();
|
||||
// Run everything in current thread rt, if asked.
|
||||
|
||||
@@ -46,8 +46,6 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let active_timelines_set = GlobalTimelines::get_global_broker_active_set();
|
||||
|
||||
let mut client =
|
||||
storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
|
||||
let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
|
||||
@@ -59,9 +57,15 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
// sensitive and there is no risk of deadlock as we don't await while
|
||||
// lock is held.
|
||||
let now = Instant::now();
|
||||
let all_tlis = active_timelines_set.get_all();
|
||||
let all_tlis = GlobalTimelines::get_all();
|
||||
let mut n_pushed_tlis = 0;
|
||||
for tli in &all_tlis {
|
||||
// filtering alternative futures::stream::iter(all_tlis)
|
||||
// .filter(|tli| {let tli = tli.clone(); async move { tli.is_active().await}}).collect::<Vec<_>>().await;
|
||||
// doesn't look better, and I'm not sure how to do that without collect.
|
||||
if !tli.is_active().await {
|
||||
continue;
|
||||
}
|
||||
let sk_info = tli.get_safekeeper_info(&conf).await;
|
||||
yield sk_info;
|
||||
BROKER_PUSHED_UPDATES.inc();
|
||||
@@ -86,7 +90,6 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
}
|
||||
|
||||
/// Subscribe and fetch all the interesting data from the broker.
|
||||
#[instrument(name = "broker pull", skip_all)]
|
||||
async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
|
||||
let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?;
|
||||
|
||||
@@ -183,7 +186,6 @@ async fn discover_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<
|
||||
commit_lsn: sk_info.commit_lsn,
|
||||
safekeeper_connstr: sk_info.safekeeper_connstr,
|
||||
availability_zone: sk_info.availability_zone,
|
||||
standby_horizon: 0,
|
||||
};
|
||||
|
||||
// note this is a blocking call
|
||||
@@ -317,7 +319,7 @@ async fn task_stats(stats: Arc<BrokerStats>) {
|
||||
|
||||
let now = BrokerStats::now_millis();
|
||||
if now > last_pulled && now - last_pulled > warn_duration.as_millis() as u64 {
|
||||
let ts = chrono::DateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp");
|
||||
let ts = chrono::NaiveDateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp");
|
||||
info!("no broker updates for some time, last update: {:?}", ts);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -350,7 +350,6 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
|
||||
backup_lsn: sk_info.backup_lsn.0,
|
||||
local_start_lsn: sk_info.local_start_lsn.0,
|
||||
availability_zone: None,
|
||||
standby_horizon: sk_info.standby_horizon.0,
|
||||
};
|
||||
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
|
||||
@@ -31,8 +31,6 @@ pub mod safekeeper;
|
||||
pub mod send_wal;
|
||||
pub mod state;
|
||||
pub mod timeline;
|
||||
pub mod timeline_manager;
|
||||
pub mod timelines_set;
|
||||
pub mod wal_backup;
|
||||
pub mod wal_backup_partial;
|
||||
pub mod wal_service;
|
||||
|
||||
@@ -11,9 +11,8 @@ use futures::Future;
|
||||
use metrics::{
|
||||
core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts},
|
||||
proto::MetricFamily,
|
||||
register_int_counter, register_int_counter_pair, register_int_counter_pair_vec,
|
||||
register_int_counter_vec, Gauge, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec,
|
||||
IntGaugeVec,
|
||||
register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, Gauge,
|
||||
IntCounter, IntCounterPairVec, IntCounterVec, IntGaugeVec,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
@@ -163,29 +162,6 @@ pub static PARTIAL_BACKUP_UPLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
)
|
||||
.expect("Failed to register safekeeper_partial_backup_uploaded_bytes_total counter")
|
||||
});
|
||||
pub static MANAGER_ITERATIONS_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"safekeeper_manager_iterations_total",
|
||||
"Number of iterations of the timeline manager task"
|
||||
)
|
||||
.expect("Failed to register safekeeper_manager_iterations_total counter")
|
||||
});
|
||||
pub static MANAGER_ACTIVE_CHANGES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"safekeeper_manager_active_changes_total",
|
||||
"Number of timeline active status changes in the timeline manager task"
|
||||
)
|
||||
.expect("Failed to register safekeeper_manager_active_changes_total counter")
|
||||
});
|
||||
pub static WAL_BACKUP_TASKS: Lazy<IntCounterPair> = Lazy::new(|| {
|
||||
register_int_counter_pair!(
|
||||
"safekeeper_wal_backup_tasks_started_total",
|
||||
"Number of active WAL backup tasks",
|
||||
"safekeeper_wal_backup_tasks_finished_total",
|
||||
"Number of finished WAL backup tasks",
|
||||
)
|
||||
.expect("Failed to register safekeeper_wal_backup_tasks_finished_total counter")
|
||||
});
|
||||
|
||||
pub const LABEL_UNKNOWN: &str = "unknown";
|
||||
|
||||
@@ -638,7 +614,8 @@ impl Collector for TimelineCollector {
|
||||
self.written_wal_seconds.reset();
|
||||
self.flushed_wal_seconds.reset();
|
||||
|
||||
let timelines_count = GlobalTimelines::get_all().len();
|
||||
let timelines = GlobalTimelines::get_all();
|
||||
let timelines_count = timelines.len();
|
||||
let mut active_timelines_count = 0;
|
||||
|
||||
// Prometheus Collector is sync, and data is stored under async lock. To
|
||||
@@ -769,9 +746,9 @@ impl Collector for TimelineCollector {
|
||||
|
||||
async fn collect_timeline_metrics() -> Vec<FullTimelineInfo> {
|
||||
let mut res = vec![];
|
||||
let active_timelines = GlobalTimelines::get_global_broker_active_set().get_all();
|
||||
let timelines = GlobalTimelines::get_all();
|
||||
|
||||
for tli in active_timelines {
|
||||
for tli in timelines {
|
||||
if let Some(info) = tli.info_for_metrics().await {
|
||||
res.push(info);
|
||||
}
|
||||
|
||||
@@ -45,9 +45,6 @@ const DEFAULT_FEEDBACK_CAPACITY: usize = 8;
|
||||
pub struct WalReceivers {
|
||||
mutex: Mutex<WalReceiversShared>,
|
||||
pageserver_feedback_tx: tokio::sync::broadcast::Sender<PageserverFeedback>,
|
||||
|
||||
num_computes_tx: tokio::sync::watch::Sender<usize>,
|
||||
num_computes_rx: tokio::sync::watch::Receiver<usize>,
|
||||
}
|
||||
|
||||
/// Id under which walreceiver is registered in shmem.
|
||||
@@ -58,21 +55,16 @@ impl WalReceivers {
|
||||
let (pageserver_feedback_tx, _) =
|
||||
tokio::sync::broadcast::channel(DEFAULT_FEEDBACK_CAPACITY);
|
||||
|
||||
let (num_computes_tx, num_computes_rx) = tokio::sync::watch::channel(0usize);
|
||||
|
||||
Arc::new(WalReceivers {
|
||||
mutex: Mutex::new(WalReceiversShared { slots: Vec::new() }),
|
||||
pageserver_feedback_tx,
|
||||
num_computes_tx,
|
||||
num_computes_rx,
|
||||
})
|
||||
}
|
||||
|
||||
/// Register new walreceiver. Returned guard provides access to the slot and
|
||||
/// automatically deregisters in Drop.
|
||||
pub fn register(self: &Arc<WalReceivers>, conn_id: Option<ConnectionId>) -> WalReceiverGuard {
|
||||
let mut shared = self.mutex.lock();
|
||||
let slots = &mut shared.slots;
|
||||
let slots = &mut self.mutex.lock().slots;
|
||||
let walreceiver = WalReceiverState {
|
||||
conn_id,
|
||||
status: WalReceiverStatus::Voting,
|
||||
@@ -86,9 +78,6 @@ impl WalReceivers {
|
||||
slots.push(Some(walreceiver));
|
||||
pos
|
||||
};
|
||||
|
||||
self.update_num(&shared);
|
||||
|
||||
WalReceiverGuard {
|
||||
id: pos,
|
||||
walreceivers: self.clone(),
|
||||
@@ -110,18 +99,7 @@ impl WalReceivers {
|
||||
|
||||
/// Get number of walreceivers (compute connections).
|
||||
pub fn get_num(self: &Arc<WalReceivers>) -> usize {
|
||||
self.mutex.lock().get_num()
|
||||
}
|
||||
|
||||
/// Get channel for number of walreceivers.
|
||||
pub fn get_num_rx(self: &Arc<WalReceivers>) -> tokio::sync::watch::Receiver<usize> {
|
||||
self.num_computes_rx.clone()
|
||||
}
|
||||
|
||||
/// Should get called after every update of slots.
|
||||
fn update_num(self: &Arc<WalReceivers>, shared: &MutexGuard<WalReceiversShared>) {
|
||||
let num = shared.get_num();
|
||||
self.num_computes_tx.send_replace(num);
|
||||
self.mutex.lock().slots.iter().flatten().count()
|
||||
}
|
||||
|
||||
/// Get state of all walreceivers.
|
||||
@@ -145,7 +123,6 @@ impl WalReceivers {
|
||||
fn unregister(self: &Arc<WalReceivers>, id: WalReceiverId) {
|
||||
let mut shared = self.mutex.lock();
|
||||
shared.slots[id] = None;
|
||||
self.update_num(&shared);
|
||||
}
|
||||
|
||||
/// Broadcast pageserver feedback to connected walproposers.
|
||||
@@ -160,13 +137,6 @@ struct WalReceiversShared {
|
||||
slots: Vec<Option<WalReceiverState>>,
|
||||
}
|
||||
|
||||
impl WalReceiversShared {
|
||||
/// Get number of walreceivers (compute connections).
|
||||
fn get_num(&self) -> usize {
|
||||
self.slots.iter().flatten().count()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct WalReceiverState {
|
||||
/// None means it is recovery initiated by us (this safekeeper).
|
||||
@@ -213,19 +183,9 @@ impl SafekeeperPostgresHandler {
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
) -> Result<(), QueryError> {
|
||||
let mut tli: Option<Arc<Timeline>> = None;
|
||||
if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await {
|
||||
if let Err(end) = self.handle_start_wal_push_guts(pgb).await {
|
||||
// Log the result and probably send it to the client, closing the stream.
|
||||
let handle_end_fut = pgb.handle_copy_stream_end(end);
|
||||
// If we managed to create the timeline, augment logging with current LSNs etc.
|
||||
if let Some(tli) = tli {
|
||||
let info = tli.get_safekeeper_info(&self.conf).await;
|
||||
handle_end_fut
|
||||
.instrument(info_span!("", term=%info.term, last_log_term=%info.last_log_term, flush_lsn=%Lsn(info.flush_lsn), commit_lsn=%Lsn(info.commit_lsn)))
|
||||
.await;
|
||||
} else {
|
||||
handle_end_fut.await;
|
||||
}
|
||||
pgb.handle_copy_stream_end(end).await;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -233,7 +193,6 @@ impl SafekeeperPostgresHandler {
|
||||
pub async fn handle_start_wal_push_guts<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
tli: &mut Option<Arc<Timeline>>,
|
||||
) -> Result<(), CopyStreamHandlerEnd> {
|
||||
// Notify the libpq client that it's allowed to send `CopyData` messages
|
||||
pgb.write_message(&BeMessage::CopyBothResponse).await?;
|
||||
@@ -263,17 +222,13 @@ impl SafekeeperPostgresHandler {
|
||||
// Read first message and create timeline if needed.
|
||||
let res = network_reader.read_first_message().await;
|
||||
|
||||
let network_res = if let Ok((timeline, next_msg)) = res {
|
||||
let res = if let Ok((tli, next_msg)) = res {
|
||||
let pageserver_feedback_rx: tokio::sync::broadcast::Receiver<PageserverFeedback> =
|
||||
timeline
|
||||
.get_walreceivers()
|
||||
.pageserver_feedback_tx
|
||||
.subscribe();
|
||||
*tli = Some(timeline.clone());
|
||||
tli.get_walreceivers().pageserver_feedback_tx.subscribe();
|
||||
|
||||
tokio::select! {
|
||||
// todo: add read|write .context to these errors
|
||||
r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline.clone(), next_msg) => r,
|
||||
r = network_reader.run(msg_tx, msg_rx, reply_tx, tli.clone(), next_msg) => r,
|
||||
r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r,
|
||||
}
|
||||
} else {
|
||||
@@ -289,13 +244,13 @@ impl SafekeeperPostgresHandler {
|
||||
match acceptor_handle {
|
||||
None => {
|
||||
// failed even before spawning; read_network should have error
|
||||
Err(network_res.expect_err("no error with WalAcceptor not spawn"))
|
||||
Err(res.expect_err("no error with WalAcceptor not spawn"))
|
||||
}
|
||||
Some(handle) => {
|
||||
let wal_acceptor_res = handle.await;
|
||||
|
||||
// If there was any network error, return it.
|
||||
network_res?;
|
||||
res?;
|
||||
|
||||
// Otherwise, WalAcceptor thread must have errored.
|
||||
match wal_acceptor_res {
|
||||
@@ -486,7 +441,14 @@ impl WalAcceptor {
|
||||
/// The main loop. Returns Ok(()) if either msg_rx or reply_tx got closed;
|
||||
/// it must mean that network thread terminated.
|
||||
async fn run(&mut self) -> anyhow::Result<()> {
|
||||
// Register the connection and defer unregister.
|
||||
// Order of the next two lines is important: we want first to remove our entry and then
|
||||
// update status which depends on registered connections.
|
||||
let _compute_conn_guard = ComputeConnectionGuard {
|
||||
timeline: Arc::clone(&self.tli),
|
||||
};
|
||||
let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id);
|
||||
self.tli.update_status_notify().await?;
|
||||
|
||||
// After this timestamp we will stop processing AppendRequests and send a response
|
||||
// to the walproposer. walproposer sends at least one AppendRequest per second,
|
||||
@@ -552,3 +514,19 @@ impl WalAcceptor {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Calls update_status_notify in drop to update timeline status.
|
||||
struct ComputeConnectionGuard {
|
||||
timeline: Arc<Timeline>,
|
||||
}
|
||||
|
||||
impl Drop for ComputeConnectionGuard {
|
||||
fn drop(&mut self) {
|
||||
let tli = self.timeline.clone();
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = tli.update_status_notify().await {
|
||||
error!("failed to update timeline status: {}", e);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -37,11 +37,17 @@ use crate::{
|
||||
#[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))]
|
||||
pub async fn recovery_main(tli: Arc<Timeline>, conf: SafeKeeperConf) {
|
||||
info!("started");
|
||||
let mut cancellation_rx = match tli.get_cancellation_rx() {
|
||||
Ok(rx) => rx,
|
||||
Err(_) => {
|
||||
info!("timeline canceled during task start");
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
let cancel = tli.cancel.clone();
|
||||
select! {
|
||||
_ = recovery_main_loop(tli, conf) => { unreachable!() }
|
||||
_ = cancel.cancelled() => {
|
||||
_ = cancellation_rx.changed() => {
|
||||
info!("stopped");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,18 +7,29 @@ use tracing::*;
|
||||
|
||||
use crate::{GlobalTimelines, SafeKeeperConf};
|
||||
|
||||
pub async fn task_main(_conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
const ALLOW_INACTIVE_TIMELINES: bool = true;
|
||||
|
||||
pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
let wal_removal_interval = Duration::from_millis(5000);
|
||||
loop {
|
||||
let now = tokio::time::Instant::now();
|
||||
let mut active_timelines = 0;
|
||||
|
||||
let tlis = GlobalTimelines::get_all();
|
||||
for tli in &tlis {
|
||||
let is_active = tli.is_active().await;
|
||||
if is_active {
|
||||
active_timelines += 1;
|
||||
}
|
||||
if !ALLOW_INACTIVE_TIMELINES && !is_active {
|
||||
continue;
|
||||
}
|
||||
let ttid = tli.ttid;
|
||||
async {
|
||||
if let Err(e) = tli.maybe_persist_control_file().await {
|
||||
warn!("failed to persist control file: {e}");
|
||||
}
|
||||
if let Err(e) = tli.remove_old_wal().await {
|
||||
if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled).await {
|
||||
error!("failed to remove WAL: {}", e);
|
||||
}
|
||||
}
|
||||
@@ -31,8 +42,8 @@ pub async fn task_main(_conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
|
||||
if elapsed > wal_removal_interval {
|
||||
info!(
|
||||
"WAL removal is too long, processed {} timelines in {:?}",
|
||||
total_timelines, elapsed
|
||||
"WAL removal is too long, processed {} active timelines ({} total) in {:?}",
|
||||
active_timelines, total_timelines, elapsed
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -827,10 +827,10 @@ where
|
||||
|
||||
/// Persist control file if there is something to save and enough time
|
||||
/// passed after the last save.
|
||||
pub async fn maybe_persist_inmem_control_file(&mut self) -> Result<bool> {
|
||||
pub async fn maybe_persist_inmem_control_file(&mut self) -> Result<()> {
|
||||
const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300);
|
||||
if self.state.pers.last_persist_at().elapsed() < CF_SAVE_INTERVAL {
|
||||
return Ok(false);
|
||||
return Ok(());
|
||||
}
|
||||
let need_persist = self.state.inmem.commit_lsn > self.state.commit_lsn
|
||||
|| self.state.inmem.backup_lsn > self.state.backup_lsn
|
||||
@@ -840,7 +840,7 @@ where
|
||||
self.state.flush().await?;
|
||||
trace!("saved control file: {CF_SAVE_INTERVAL:?} passed");
|
||||
}
|
||||
Ok(need_persist)
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Handle request to append WAL.
|
||||
|
||||
@@ -23,7 +23,7 @@ use utils::failpoint_support;
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::pageserver_feedback::PageserverFeedback;
|
||||
|
||||
use std::cmp::{max, min};
|
||||
use std::cmp::min;
|
||||
use std::net::SocketAddr;
|
||||
use std::str;
|
||||
use std::sync::Arc;
|
||||
@@ -85,17 +85,8 @@ impl StandbyReply {
|
||||
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
|
||||
pub struct StandbyFeedback {
|
||||
pub reply: StandbyReply,
|
||||
pub hs_feedback: HotStandbyFeedback,
|
||||
}
|
||||
|
||||
impl StandbyFeedback {
|
||||
pub fn empty() -> Self {
|
||||
StandbyFeedback {
|
||||
reply: StandbyReply::empty(),
|
||||
hs_feedback: HotStandbyFeedback::empty(),
|
||||
}
|
||||
}
|
||||
reply: StandbyReply,
|
||||
hs_feedback: HotStandbyFeedback,
|
||||
}
|
||||
|
||||
/// WalSenders registry. Timeline holds it (wrapped in Arc).
|
||||
@@ -171,8 +162,8 @@ impl WalSenders {
|
||||
}
|
||||
|
||||
/// Get aggregated hot standby feedback (we send it to compute).
|
||||
pub fn get_hotstandby(self: &Arc<WalSenders>) -> StandbyFeedback {
|
||||
self.mutex.lock().agg_standby_feedback
|
||||
pub fn get_hotstandby(self: &Arc<WalSenders>) -> HotStandbyFeedback {
|
||||
self.mutex.lock().agg_hs_feedback
|
||||
}
|
||||
|
||||
/// Record new pageserver feedback, update aggregated values.
|
||||
@@ -193,10 +184,6 @@ impl WalSenders {
|
||||
fn record_standby_reply(self: &Arc<WalSenders>, id: WalSenderId, reply: &StandbyReply) {
|
||||
let mut shared = self.mutex.lock();
|
||||
let slot = shared.get_slot_mut(id);
|
||||
debug!(
|
||||
"Record standby reply: ts={} apply_lsn={}",
|
||||
reply.reply_ts, reply.apply_lsn
|
||||
);
|
||||
match &mut slot.feedback {
|
||||
ReplicationFeedback::Standby(sf) => sf.reply = *reply,
|
||||
ReplicationFeedback::Pageserver(_) => {
|
||||
@@ -221,7 +208,7 @@ impl WalSenders {
|
||||
})
|
||||
}
|
||||
}
|
||||
shared.update_reply_feedback();
|
||||
shared.update_hs_feedback();
|
||||
}
|
||||
|
||||
/// Get remote_consistent_lsn reported by the pageserver. Returns None if
|
||||
@@ -239,13 +226,13 @@ impl WalSenders {
|
||||
fn unregister(self: &Arc<WalSenders>, id: WalSenderId) {
|
||||
let mut shared = self.mutex.lock();
|
||||
shared.slots[id] = None;
|
||||
shared.update_reply_feedback();
|
||||
shared.update_hs_feedback();
|
||||
}
|
||||
}
|
||||
|
||||
struct WalSendersShared {
|
||||
// aggregated over all walsenders value
|
||||
agg_standby_feedback: StandbyFeedback,
|
||||
agg_hs_feedback: HotStandbyFeedback,
|
||||
// last feedback ever received from any pageserver, empty if none
|
||||
last_ps_feedback: PageserverFeedback,
|
||||
// total counter of pageserver feedbacks received
|
||||
@@ -256,7 +243,7 @@ struct WalSendersShared {
|
||||
impl WalSendersShared {
|
||||
fn new() -> Self {
|
||||
WalSendersShared {
|
||||
agg_standby_feedback: StandbyFeedback::empty(),
|
||||
agg_hs_feedback: HotStandbyFeedback::empty(),
|
||||
last_ps_feedback: PageserverFeedback::empty(),
|
||||
ps_feedback_counter: 0,
|
||||
slots: Vec::new(),
|
||||
@@ -273,11 +260,10 @@ impl WalSendersShared {
|
||||
self.slots[id].as_mut().expect("walsender doesn't exist")
|
||||
}
|
||||
|
||||
/// Update aggregated hot standy and normal reply feedbacks. We just take min of valid xmins
|
||||
/// Update aggregated hot standy feedback. We just take min of valid xmins
|
||||
/// and ts.
|
||||
fn update_reply_feedback(&mut self) {
|
||||
fn update_hs_feedback(&mut self) {
|
||||
let mut agg = HotStandbyFeedback::empty();
|
||||
let mut reply_agg = StandbyReply::empty();
|
||||
for ws_state in self.slots.iter().flatten() {
|
||||
if let ReplicationFeedback::Standby(standby_feedback) = ws_state.feedback {
|
||||
let hs_feedback = standby_feedback.hs_feedback;
|
||||
@@ -290,7 +276,7 @@ impl WalSendersShared {
|
||||
} else {
|
||||
agg.xmin = hs_feedback.xmin;
|
||||
}
|
||||
agg.ts = max(agg.ts, hs_feedback.ts);
|
||||
agg.ts = min(agg.ts, hs_feedback.ts);
|
||||
}
|
||||
if hs_feedback.catalog_xmin != INVALID_FULL_TRANSACTION_ID {
|
||||
if agg.catalog_xmin != INVALID_FULL_TRANSACTION_ID {
|
||||
@@ -298,43 +284,11 @@ impl WalSendersShared {
|
||||
} else {
|
||||
agg.catalog_xmin = hs_feedback.catalog_xmin;
|
||||
}
|
||||
agg.ts = max(agg.ts, hs_feedback.ts);
|
||||
}
|
||||
let reply = standby_feedback.reply;
|
||||
if reply.write_lsn != Lsn::INVALID {
|
||||
if reply_agg.write_lsn != Lsn::INVALID {
|
||||
reply_agg.write_lsn = Lsn::min(reply_agg.write_lsn, reply.write_lsn);
|
||||
} else {
|
||||
reply_agg.write_lsn = reply.write_lsn;
|
||||
}
|
||||
}
|
||||
if reply.flush_lsn != Lsn::INVALID {
|
||||
if reply_agg.flush_lsn != Lsn::INVALID {
|
||||
reply_agg.flush_lsn = Lsn::min(reply_agg.flush_lsn, reply.flush_lsn);
|
||||
} else {
|
||||
reply_agg.flush_lsn = reply.flush_lsn;
|
||||
}
|
||||
}
|
||||
if reply.apply_lsn != Lsn::INVALID {
|
||||
if reply_agg.apply_lsn != Lsn::INVALID {
|
||||
reply_agg.apply_lsn = Lsn::min(reply_agg.apply_lsn, reply.apply_lsn);
|
||||
} else {
|
||||
reply_agg.apply_lsn = reply.apply_lsn;
|
||||
}
|
||||
}
|
||||
if reply.reply_ts != 0 {
|
||||
if reply_agg.reply_ts != 0 {
|
||||
reply_agg.reply_ts = TimestampTz::min(reply_agg.reply_ts, reply.reply_ts);
|
||||
} else {
|
||||
reply_agg.reply_ts = reply.reply_ts;
|
||||
}
|
||||
agg.ts = min(agg.ts, hs_feedback.ts);
|
||||
}
|
||||
}
|
||||
}
|
||||
self.agg_standby_feedback = StandbyFeedback {
|
||||
reply: reply_agg,
|
||||
hs_feedback: agg,
|
||||
};
|
||||
self.agg_hs_feedback = agg;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -386,16 +340,12 @@ impl SafekeeperPostgresHandler {
|
||||
start_pos: Lsn,
|
||||
term: Option<Term>,
|
||||
) -> Result<(), QueryError> {
|
||||
let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?;
|
||||
if let Err(end) = self
|
||||
.handle_start_replication_guts(pgb, start_pos, term, tli.clone())
|
||||
.handle_start_replication_guts(pgb, start_pos, term)
|
||||
.await
|
||||
{
|
||||
let info = tli.get_safekeeper_info(&self.conf).await;
|
||||
// Log the result and probably send it to the client, closing the stream.
|
||||
pgb.handle_copy_stream_end(end)
|
||||
.instrument(info_span!("", term=%info.term, last_log_term=%info.last_log_term, flush_lsn=%Lsn(info.flush_lsn), commit_lsn=%Lsn(info.flush_lsn)))
|
||||
.await;
|
||||
pgb.handle_copy_stream_end(end).await;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -405,9 +355,10 @@ impl SafekeeperPostgresHandler {
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
start_pos: Lsn,
|
||||
term: Option<Term>,
|
||||
tli: Arc<Timeline>,
|
||||
) -> Result<(), CopyStreamHandlerEnd> {
|
||||
let appname = self.appname.clone();
|
||||
let tli =
|
||||
GlobalTimelines::get(self.ttid).map_err(|e| CopyStreamHandlerEnd::Other(e.into()))?;
|
||||
|
||||
// Use a guard object to remove our entry from the timeline when we are done.
|
||||
let ws_guard = Arc::new(tli.get_walsenders().register(
|
||||
@@ -756,15 +707,8 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {
|
||||
match msg.first().cloned() {
|
||||
Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => {
|
||||
// Note: deserializing is on m[1..] because we skip the tag byte.
|
||||
let mut hs_feedback = HotStandbyFeedback::des(&msg[1..])
|
||||
let hs_feedback = HotStandbyFeedback::des(&msg[1..])
|
||||
.context("failed to deserialize HotStandbyFeedback")?;
|
||||
// TODO: xmin/catalog_xmin are serialized by walreceiver.c in this way:
|
||||
// pq_sendint32(&reply_message, xmin);
|
||||
// pq_sendint32(&reply_message, xmin_epoch);
|
||||
// So it is two big endian 32-bit words in low endian order!
|
||||
hs_feedback.xmin = (hs_feedback.xmin >> 32) | (hs_feedback.xmin << 32);
|
||||
hs_feedback.catalog_xmin =
|
||||
(hs_feedback.catalog_xmin >> 32) | (hs_feedback.catalog_xmin << 32);
|
||||
self.ws_guard
|
||||
.walsenders
|
||||
.record_hs_feedback(self.ws_guard.id, &hs_feedback);
|
||||
@@ -846,11 +790,8 @@ mod tests {
|
||||
fn test_hs_feedback_no_valid() {
|
||||
let mut wss = WalSendersShared::new();
|
||||
push_feedback(&mut wss, hs_feedback(1, INVALID_FULL_TRANSACTION_ID));
|
||||
wss.update_reply_feedback();
|
||||
assert_eq!(
|
||||
wss.agg_standby_feedback.hs_feedback.xmin,
|
||||
INVALID_FULL_TRANSACTION_ID
|
||||
);
|
||||
wss.update_hs_feedback();
|
||||
assert_eq!(wss.agg_hs_feedback.xmin, INVALID_FULL_TRANSACTION_ID);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -859,7 +800,7 @@ mod tests {
|
||||
push_feedback(&mut wss, hs_feedback(1, INVALID_FULL_TRANSACTION_ID));
|
||||
push_feedback(&mut wss, hs_feedback(1, 42));
|
||||
push_feedback(&mut wss, hs_feedback(1, 64));
|
||||
wss.update_reply_feedback();
|
||||
assert_eq!(wss.agg_standby_feedback.hs_feedback.xmin, 42);
|
||||
wss.update_hs_feedback();
|
||||
assert_eq!(wss.agg_hs_feedback.xmin, 42);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,15 +6,15 @@ use camino::Utf8PathBuf;
|
||||
use postgres_ffi::XLogSegNo;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::fs;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use std::cmp::max;
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
|
||||
use tokio::{sync::watch, time::Instant};
|
||||
use tokio::sync::{Mutex, MutexGuard};
|
||||
use tokio::{
|
||||
sync::{mpsc::Sender, watch},
|
||||
time::Instant,
|
||||
};
|
||||
use tracing::*;
|
||||
use utils::http::error::ApiError;
|
||||
use utils::{
|
||||
@@ -33,13 +33,12 @@ use crate::safekeeper::{
|
||||
};
|
||||
use crate::send_wal::WalSenders;
|
||||
use crate::state::{TimelineMemState, TimelinePersistentState};
|
||||
use crate::timelines_set::TimelinesSet;
|
||||
use crate::wal_backup::{self};
|
||||
use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
|
||||
|
||||
use crate::metrics::FullTimelineInfo;
|
||||
use crate::wal_storage::Storage as wal_storage_iface;
|
||||
use crate::{debug_dump, timeline_manager, wal_backup_partial, wal_storage};
|
||||
use crate::{debug_dump, wal_backup_partial, wal_storage};
|
||||
use crate::{GlobalTimelines, SafeKeeperConf};
|
||||
|
||||
/// Things safekeeper should know about timeline state on peers.
|
||||
@@ -52,7 +51,8 @@ pub struct PeerInfo {
|
||||
/// LSN of the last record.
|
||||
pub flush_lsn: Lsn,
|
||||
pub commit_lsn: Lsn,
|
||||
/// Since which LSN safekeeper has WAL.
|
||||
/// Since which LSN safekeeper has WAL. TODO: remove this once we fill new
|
||||
/// sk since backup_lsn.
|
||||
pub local_start_lsn: Lsn,
|
||||
/// When info was received. Serde annotations are not very useful but make
|
||||
/// the code compile -- we don't rely on this field externally.
|
||||
@@ -97,79 +97,25 @@ impl PeersInfo {
|
||||
}
|
||||
}
|
||||
|
||||
pub type ReadGuardSharedState<'a> = RwLockReadGuard<'a, SharedState>;
|
||||
|
||||
/// WriteGuardSharedState is a wrapper around `RwLockWriteGuard<SharedState>` that
|
||||
/// automatically updates `watch::Sender` channels with state on drop.
|
||||
pub struct WriteGuardSharedState<'a> {
|
||||
tli: Arc<Timeline>,
|
||||
guard: RwLockWriteGuard<'a, SharedState>,
|
||||
skip_update: bool,
|
||||
}
|
||||
|
||||
impl<'a> WriteGuardSharedState<'a> {
|
||||
fn new(tli: Arc<Timeline>, guard: RwLockWriteGuard<'a, SharedState>) -> Self {
|
||||
WriteGuardSharedState {
|
||||
tli,
|
||||
guard,
|
||||
skip_update: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Deref for WriteGuardSharedState<'a> {
|
||||
type Target = SharedState;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.guard
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> DerefMut for WriteGuardSharedState<'a> {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
&mut self.guard
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Drop for WriteGuardSharedState<'a> {
|
||||
fn drop(&mut self) {
|
||||
let term_flush_lsn = TermLsn::from((self.guard.sk.get_term(), self.guard.sk.flush_lsn()));
|
||||
let commit_lsn = self.guard.sk.state.inmem.commit_lsn;
|
||||
|
||||
let _ = self.tli.term_flush_lsn_watch_tx.send_if_modified(|old| {
|
||||
if *old != term_flush_lsn {
|
||||
*old = term_flush_lsn;
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
});
|
||||
|
||||
let _ = self.tli.commit_lsn_watch_tx.send_if_modified(|old| {
|
||||
if *old != commit_lsn {
|
||||
*old = commit_lsn;
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
});
|
||||
|
||||
if !self.skip_update {
|
||||
// send notification about shared state update
|
||||
self.tli.shared_state_version_tx.send_modify(|old| {
|
||||
*old += 1;
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Shared state associated with database instance
|
||||
pub struct SharedState {
|
||||
/// Safekeeper object
|
||||
pub(crate) sk: SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>,
|
||||
sk: SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>,
|
||||
/// In memory list containing state of peers sent in latest messages from them.
|
||||
pub(crate) peers_info: PeersInfo,
|
||||
pub(crate) last_removed_segno: XLogSegNo,
|
||||
peers_info: PeersInfo,
|
||||
/// True when WAL backup launcher oversees the timeline, making sure WAL is
|
||||
/// offloaded, allows to bother launcher less.
|
||||
wal_backup_active: bool,
|
||||
/// True whenever there is at least some pending activity on timeline: live
|
||||
/// compute connection, pageserver is not caughtup (it must have latest WAL
|
||||
/// for new compute start) or WAL backuping is not finished. Practically it
|
||||
/// means safekeepers broadcast info to peers about the timeline, old WAL is
|
||||
/// trimmed.
|
||||
///
|
||||
/// TODO: it might be better to remove tli completely from GlobalTimelines
|
||||
/// when tli is inactive instead of having this flag.
|
||||
active: bool,
|
||||
last_removed_segno: XLogSegNo,
|
||||
}
|
||||
|
||||
impl SharedState {
|
||||
@@ -206,6 +152,8 @@ impl SharedState {
|
||||
Ok(Self {
|
||||
sk,
|
||||
peers_info: PeersInfo(vec![]),
|
||||
wal_backup_active: false,
|
||||
active: false,
|
||||
last_removed_segno: 0,
|
||||
})
|
||||
}
|
||||
@@ -223,10 +171,75 @@ impl SharedState {
|
||||
Ok(Self {
|
||||
sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?,
|
||||
peers_info: PeersInfo(vec![]),
|
||||
wal_backup_active: false,
|
||||
active: false,
|
||||
last_removed_segno: 0,
|
||||
})
|
||||
}
|
||||
|
||||
fn is_active(&self, num_computes: usize) -> bool {
|
||||
self.is_wal_backup_required(num_computes)
|
||||
// FIXME: add tracking of relevant pageservers and check them here individually,
|
||||
// otherwise migration won't work (we suspend too early).
|
||||
|| self.sk.state.inmem.remote_consistent_lsn < self.sk.state.inmem.commit_lsn
|
||||
}
|
||||
|
||||
/// Mark timeline active/inactive and return whether s3 offloading requires
|
||||
/// start/stop action. If timeline is deactivated, control file is persisted
|
||||
/// as maintenance task does that only for active timelines.
|
||||
async fn update_status(&mut self, num_computes: usize, ttid: TenantTimelineId) -> bool {
|
||||
let is_active = self.is_active(num_computes);
|
||||
if self.active != is_active {
|
||||
info!(
|
||||
"timeline {} active={} now, remote_consistent_lsn={}, commit_lsn={}",
|
||||
ttid,
|
||||
is_active,
|
||||
self.sk.state.inmem.remote_consistent_lsn,
|
||||
self.sk.state.inmem.commit_lsn
|
||||
);
|
||||
if !is_active {
|
||||
if let Err(e) = self.sk.state.flush().await {
|
||||
warn!("control file save in update_status failed: {:?}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
self.active = is_active;
|
||||
self.is_wal_backup_action_pending(num_computes)
|
||||
}
|
||||
|
||||
/// Should we run s3 offloading in current state?
|
||||
fn is_wal_backup_required(&self, num_computes: usize) -> bool {
|
||||
let seg_size = self.get_wal_seg_size();
|
||||
num_computes > 0 ||
|
||||
// Currently only the whole segment is offloaded, so compare segment numbers.
|
||||
(self.sk.state.inmem.commit_lsn.segment_number(seg_size) >
|
||||
self.sk.state.inmem.backup_lsn.segment_number(seg_size))
|
||||
}
|
||||
|
||||
/// Is current state of s3 offloading is not what it ought to be?
|
||||
fn is_wal_backup_action_pending(&self, num_computes: usize) -> bool {
|
||||
let res = self.wal_backup_active != self.is_wal_backup_required(num_computes);
|
||||
if res {
|
||||
let action_pending = if self.is_wal_backup_required(num_computes) {
|
||||
"start"
|
||||
} else {
|
||||
"stop"
|
||||
};
|
||||
trace!(
|
||||
"timeline {} s3 offloading action {} pending: num_computes={}, commit_lsn={}, backup_lsn={}",
|
||||
self.sk.state.timeline_id, action_pending, num_computes, self.sk.state.inmem.commit_lsn, self.sk.state.inmem.backup_lsn
|
||||
);
|
||||
}
|
||||
res
|
||||
}
|
||||
|
||||
/// Returns whether s3 offloading is required and sets current status as
|
||||
/// matching.
|
||||
fn wal_backup_attend(&mut self, num_computes: usize) -> bool {
|
||||
self.wal_backup_active = self.is_wal_backup_required(num_computes);
|
||||
self.wal_backup_active
|
||||
}
|
||||
|
||||
fn get_wal_seg_size(&self) -> usize {
|
||||
self.sk.state.server.wal_seg_size as usize
|
||||
}
|
||||
@@ -235,7 +248,6 @@ impl SharedState {
|
||||
&self,
|
||||
ttid: &TenantTimelineId,
|
||||
conf: &SafeKeeperConf,
|
||||
standby_apply_lsn: Lsn,
|
||||
) -> SafekeeperTimelineInfo {
|
||||
SafekeeperTimelineInfo {
|
||||
safekeeper_id: conf.my_id.0,
|
||||
@@ -258,14 +270,13 @@ impl SharedState {
|
||||
backup_lsn: self.sk.state.inmem.backup_lsn.0,
|
||||
local_start_lsn: self.sk.state.local_start_lsn.0,
|
||||
availability_zone: conf.availability_zone.clone(),
|
||||
standby_horizon: standby_apply_lsn.0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get our latest view of alive peers status on the timeline.
|
||||
/// We pass our own info through the broker as well, so when we don't have connection
|
||||
/// to the broker returned vec is empty.
|
||||
pub(crate) fn get_peers(&self, heartbeat_timeout: Duration) -> Vec<PeerInfo> {
|
||||
fn get_peers(&self, heartbeat_timeout: Duration) -> Vec<PeerInfo> {
|
||||
let now = Instant::now();
|
||||
self.peers_info
|
||||
.0
|
||||
@@ -281,13 +292,18 @@ impl SharedState {
|
||||
/// offloading.
|
||||
/// While it is safe to use inmem values for determining horizon,
|
||||
/// we use persistent to make possible normal states less surprising.
|
||||
fn get_horizon_segno(&self, extra_horizon_lsn: Option<Lsn>) -> XLogSegNo {
|
||||
fn get_horizon_segno(
|
||||
&self,
|
||||
wal_backup_enabled: bool,
|
||||
extra_horizon_lsn: Option<Lsn>,
|
||||
) -> XLogSegNo {
|
||||
let state = &self.sk.state;
|
||||
|
||||
use std::cmp::min;
|
||||
let mut horizon_lsn = min(state.remote_consistent_lsn, state.peer_horizon_lsn);
|
||||
// we don't want to remove WAL that is not yet offloaded to s3
|
||||
horizon_lsn = min(horizon_lsn, state.backup_lsn);
|
||||
if wal_backup_enabled {
|
||||
horizon_lsn = min(horizon_lsn, state.backup_lsn);
|
||||
}
|
||||
if let Some(extra_horizon_lsn) = extra_horizon_lsn {
|
||||
horizon_lsn = min(horizon_lsn, extra_horizon_lsn);
|
||||
}
|
||||
@@ -328,6 +344,11 @@ impl From<TimelineError> for ApiError {
|
||||
pub struct Timeline {
|
||||
pub ttid: TenantTimelineId,
|
||||
|
||||
/// Sending here asks for wal backup launcher attention (start/stop
|
||||
/// offloading). Sending ttid instead of concrete command allows to do
|
||||
/// sending without timeline lock.
|
||||
pub wal_backup_launcher_tx: Sender<TenantTimelineId>,
|
||||
|
||||
/// Used to broadcast commit_lsn updates to all background jobs.
|
||||
commit_lsn_watch_tx: watch::Sender<Lsn>,
|
||||
commit_lsn_watch_rx: watch::Receiver<Lsn>,
|
||||
@@ -339,19 +360,19 @@ pub struct Timeline {
|
||||
term_flush_lsn_watch_tx: watch::Sender<TermLsn>,
|
||||
term_flush_lsn_watch_rx: watch::Receiver<TermLsn>,
|
||||
|
||||
/// Broadcasts shared state updates.
|
||||
shared_state_version_tx: watch::Sender<usize>,
|
||||
shared_state_version_rx: watch::Receiver<usize>,
|
||||
|
||||
/// Safekeeper and other state, that should remain consistent and
|
||||
/// synchronized with the disk. This is tokio mutex as we write WAL to disk
|
||||
/// while holding it, ensuring that consensus checks are in order.
|
||||
mutex: RwLock<SharedState>,
|
||||
mutex: Mutex<SharedState>,
|
||||
walsenders: Arc<WalSenders>,
|
||||
walreceivers: Arc<WalReceivers>,
|
||||
|
||||
/// Delete/cancel will trigger this, background tasks should drop out as soon as it fires
|
||||
pub(crate) cancel: CancellationToken,
|
||||
/// Cancellation channel. Delete/cancel will send `true` here as a cancellation signal.
|
||||
cancellation_tx: watch::Sender<bool>,
|
||||
|
||||
/// Timeline should not be used after cancellation. Background tasks should
|
||||
/// monitor this channel and stop eventually after receiving `true` from this channel.
|
||||
cancellation_rx: watch::Receiver<bool>,
|
||||
|
||||
/// Directory where timeline state is stored.
|
||||
pub timeline_dir: Utf8PathBuf,
|
||||
@@ -361,15 +382,15 @@ pub struct Timeline {
|
||||
/// with different speed.
|
||||
// TODO: add `Arc<SafeKeeperConf>` here instead of adding each field separately.
|
||||
walsenders_keep_horizon: bool,
|
||||
|
||||
// timeline_manager controlled state
|
||||
pub(crate) broker_active: AtomicBool,
|
||||
pub(crate) wal_backup_active: AtomicBool,
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
/// Load existing timeline from disk.
|
||||
pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result<Timeline> {
|
||||
pub fn load_timeline(
|
||||
conf: &SafeKeeperConf,
|
||||
ttid: TenantTimelineId,
|
||||
wal_backup_launcher_tx: Sender<TenantTimelineId>,
|
||||
) -> Result<Timeline> {
|
||||
let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered();
|
||||
|
||||
let shared_state = SharedState::restore(conf, &ttid)?;
|
||||
@@ -379,25 +400,23 @@ impl Timeline {
|
||||
shared_state.sk.get_term(),
|
||||
shared_state.sk.flush_lsn(),
|
||||
)));
|
||||
let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0);
|
||||
let (cancellation_tx, cancellation_rx) = watch::channel(false);
|
||||
|
||||
let walreceivers = WalReceivers::new();
|
||||
Ok(Timeline {
|
||||
ttid,
|
||||
wal_backup_launcher_tx,
|
||||
commit_lsn_watch_tx,
|
||||
commit_lsn_watch_rx,
|
||||
term_flush_lsn_watch_tx,
|
||||
term_flush_lsn_watch_rx,
|
||||
shared_state_version_tx,
|
||||
shared_state_version_rx,
|
||||
mutex: RwLock::new(shared_state),
|
||||
mutex: Mutex::new(shared_state),
|
||||
walsenders: WalSenders::new(walreceivers.clone()),
|
||||
walreceivers,
|
||||
cancel: CancellationToken::default(),
|
||||
cancellation_rx,
|
||||
cancellation_tx,
|
||||
timeline_dir: conf.timeline_dir(&ttid),
|
||||
walsenders_keep_horizon: conf.walsenders_keep_horizon,
|
||||
broker_active: AtomicBool::new(false),
|
||||
wal_backup_active: AtomicBool::new(false),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -405,6 +424,7 @@ impl Timeline {
|
||||
pub fn create_empty(
|
||||
conf: &SafeKeeperConf,
|
||||
ttid: TenantTimelineId,
|
||||
wal_backup_launcher_tx: Sender<TenantTimelineId>,
|
||||
server_info: ServerInfo,
|
||||
commit_lsn: Lsn,
|
||||
local_start_lsn: Lsn,
|
||||
@@ -412,28 +432,25 @@ impl Timeline {
|
||||
let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(Lsn::INVALID);
|
||||
let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) =
|
||||
watch::channel(TermLsn::from((INVALID_TERM, Lsn::INVALID)));
|
||||
let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0);
|
||||
|
||||
let (cancellation_tx, cancellation_rx) = watch::channel(false);
|
||||
let state =
|
||||
TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn);
|
||||
|
||||
let walreceivers = WalReceivers::new();
|
||||
Ok(Timeline {
|
||||
ttid,
|
||||
wal_backup_launcher_tx,
|
||||
commit_lsn_watch_tx,
|
||||
commit_lsn_watch_rx,
|
||||
term_flush_lsn_watch_tx,
|
||||
term_flush_lsn_watch_rx,
|
||||
shared_state_version_tx,
|
||||
shared_state_version_rx,
|
||||
mutex: RwLock::new(SharedState::create_new(conf, &ttid, state)?),
|
||||
mutex: Mutex::new(SharedState::create_new(conf, &ttid, state)?),
|
||||
walsenders: WalSenders::new(walreceivers.clone()),
|
||||
walreceivers,
|
||||
cancel: CancellationToken::default(),
|
||||
cancellation_rx,
|
||||
cancellation_tx,
|
||||
timeline_dir: conf.timeline_dir(&ttid),
|
||||
walsenders_keep_horizon: conf.walsenders_keep_horizon,
|
||||
broker_active: AtomicBool::new(false),
|
||||
wal_backup_active: AtomicBool::new(false),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -444,9 +461,8 @@ impl Timeline {
|
||||
/// and state on disk should remain unchanged.
|
||||
pub async fn init_new(
|
||||
self: &Arc<Timeline>,
|
||||
shared_state: &mut WriteGuardSharedState<'_>,
|
||||
shared_state: &mut MutexGuard<'_, SharedState>,
|
||||
conf: &SafeKeeperConf,
|
||||
broker_active_set: Arc<TimelinesSet>,
|
||||
) -> Result<()> {
|
||||
match fs::metadata(&self.timeline_dir).await {
|
||||
Ok(_) => {
|
||||
@@ -477,29 +493,16 @@ impl Timeline {
|
||||
|
||||
return Err(e);
|
||||
}
|
||||
self.bootstrap(conf, broker_active_set);
|
||||
self.bootstrap(conf);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Bootstrap new or existing timeline starting background tasks.
|
||||
pub fn bootstrap(
|
||||
self: &Arc<Timeline>,
|
||||
conf: &SafeKeeperConf,
|
||||
broker_active_set: Arc<TimelinesSet>,
|
||||
) {
|
||||
// Start manager task which will monitor timeline state and update
|
||||
// background tasks.
|
||||
tokio::spawn(timeline_manager::main_task(
|
||||
self.clone(),
|
||||
conf.clone(),
|
||||
broker_active_set,
|
||||
));
|
||||
|
||||
/// Bootstrap new or existing timeline starting background stasks.
|
||||
pub fn bootstrap(self: &Arc<Timeline>, conf: &SafeKeeperConf) {
|
||||
// Start recovery task which always runs on the timeline.
|
||||
if conf.peer_recovery_enabled {
|
||||
tokio::spawn(recovery_main(self.clone(), conf.clone()));
|
||||
}
|
||||
// TODO: migrate to timeline_manager
|
||||
if conf.is_wal_backup_enabled() && conf.partial_backup_enabled {
|
||||
tokio::spawn(wal_backup_partial::main_task(self.clone(), conf.clone()));
|
||||
}
|
||||
@@ -512,9 +515,10 @@ impl Timeline {
|
||||
/// deletion API endpoint is retriable.
|
||||
pub async fn delete(
|
||||
&self,
|
||||
shared_state: &mut WriteGuardSharedState<'_>,
|
||||
shared_state: &mut MutexGuard<'_, SharedState>,
|
||||
only_local: bool,
|
||||
) -> Result<bool> {
|
||||
) -> Result<(bool, bool)> {
|
||||
let was_active = shared_state.active;
|
||||
self.cancel(shared_state);
|
||||
|
||||
// TODO: It's better to wait for s3 offloader termination before
|
||||
@@ -528,14 +532,20 @@ impl Timeline {
|
||||
wal_backup::delete_timeline(&self.ttid).await?;
|
||||
}
|
||||
let dir_existed = delete_dir(&self.timeline_dir).await?;
|
||||
Ok(dir_existed)
|
||||
Ok((dir_existed, was_active))
|
||||
}
|
||||
|
||||
/// Cancel timeline to prevent further usage. Background tasks will stop
|
||||
/// eventually after receiving cancellation signal.
|
||||
fn cancel(&self, shared_state: &mut WriteGuardSharedState<'_>) {
|
||||
///
|
||||
/// Note that we can't notify backup launcher here while holding
|
||||
/// shared_state lock, as this is a potential deadlock: caller is
|
||||
/// responsible for that. Generally we should probably make WAL backup tasks
|
||||
/// to shut down on their own, checking once in a while whether it is the
|
||||
/// time.
|
||||
fn cancel(&self, shared_state: &mut MutexGuard<'_, SharedState>) {
|
||||
info!("timeline {} is cancelled", self.ttid);
|
||||
self.cancel.cancel();
|
||||
let _ = self.cancellation_tx.send(true);
|
||||
// Close associated FDs. Nobody will be able to touch timeline data once
|
||||
// it is cancelled, so WAL storage won't be opened again.
|
||||
shared_state.sk.wal_store.close();
|
||||
@@ -543,16 +553,44 @@ impl Timeline {
|
||||
|
||||
/// Returns if timeline is cancelled.
|
||||
pub fn is_cancelled(&self) -> bool {
|
||||
self.cancel.is_cancelled()
|
||||
*self.cancellation_rx.borrow()
|
||||
}
|
||||
|
||||
/// Returns watch channel which gets value when timeline is cancelled. It is
|
||||
/// guaranteed to have not cancelled value observed (errors otherwise).
|
||||
pub fn get_cancellation_rx(&self) -> Result<watch::Receiver<bool>> {
|
||||
let rx = self.cancellation_rx.clone();
|
||||
if *rx.borrow() {
|
||||
bail!(TimelineError::Cancelled(self.ttid));
|
||||
}
|
||||
Ok(rx)
|
||||
}
|
||||
|
||||
/// Take a writing mutual exclusive lock on timeline shared_state.
|
||||
pub async fn write_shared_state<'a>(self: &'a Arc<Self>) -> WriteGuardSharedState<'a> {
|
||||
WriteGuardSharedState::new(self.clone(), self.mutex.write().await)
|
||||
pub async fn write_shared_state(&self) -> MutexGuard<SharedState> {
|
||||
self.mutex.lock().await
|
||||
}
|
||||
|
||||
pub async fn read_shared_state(&self) -> ReadGuardSharedState {
|
||||
self.mutex.read().await
|
||||
async fn update_status(&self, shared_state: &mut SharedState) -> bool {
|
||||
shared_state
|
||||
.update_status(self.walreceivers.get_num(), self.ttid)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Update timeline status and kick wal backup launcher to stop/start offloading if needed.
|
||||
pub async fn update_status_notify(&self) -> Result<()> {
|
||||
if self.is_cancelled() {
|
||||
bail!(TimelineError::Cancelled(self.ttid));
|
||||
}
|
||||
let is_wal_backup_action_pending: bool = {
|
||||
let mut shared_state = self.write_shared_state().await;
|
||||
self.update_status(&mut shared_state).await
|
||||
};
|
||||
if is_wal_backup_action_pending {
|
||||
// Can fail only if channel to a static thread got closed, which is not normal at all.
|
||||
self.wal_backup_launcher_tx.send(self.ttid).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns true if walsender should stop sending WAL to pageserver. We
|
||||
@@ -564,7 +602,7 @@ impl Timeline {
|
||||
if self.is_cancelled() {
|
||||
return true;
|
||||
}
|
||||
let shared_state = self.read_shared_state().await;
|
||||
let shared_state = self.write_shared_state().await;
|
||||
if self.walreceivers.get_num() == 0 {
|
||||
return shared_state.sk.state.inmem.commit_lsn == Lsn(0) || // no data at all yet
|
||||
reported_remote_consistent_lsn >= shared_state.sk.state.inmem.commit_lsn;
|
||||
@@ -572,9 +610,9 @@ impl Timeline {
|
||||
false
|
||||
}
|
||||
|
||||
/// Ensure that current term is t, erroring otherwise, and lock the state.
|
||||
pub async fn acquire_term(&self, t: Term) -> Result<ReadGuardSharedState> {
|
||||
let ss = self.read_shared_state().await;
|
||||
/// Ensure taht current term is t, erroring otherwise, and lock the state.
|
||||
pub async fn acquire_term(&self, t: Term) -> Result<MutexGuard<SharedState>> {
|
||||
let ss = self.write_shared_state().await;
|
||||
if ss.sk.state.acceptor_state.term != t {
|
||||
bail!(
|
||||
"failed to acquire term {}, current term {}",
|
||||
@@ -585,6 +623,18 @@ impl Timeline {
|
||||
Ok(ss)
|
||||
}
|
||||
|
||||
/// Returns whether s3 offloading is required and sets current status as
|
||||
/// matching it.
|
||||
pub async fn wal_backup_attend(&self) -> bool {
|
||||
if self.is_cancelled() {
|
||||
return false;
|
||||
}
|
||||
|
||||
self.write_shared_state()
|
||||
.await
|
||||
.wal_backup_attend(self.walreceivers.get_num())
|
||||
}
|
||||
|
||||
/// Returns commit_lsn watch channel.
|
||||
pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver<Lsn> {
|
||||
self.commit_lsn_watch_rx.clone()
|
||||
@@ -595,14 +645,9 @@ impl Timeline {
|
||||
self.term_flush_lsn_watch_rx.clone()
|
||||
}
|
||||
|
||||
/// Returns watch channel for SharedState update version.
|
||||
pub fn get_state_version_rx(&self) -> watch::Receiver<usize> {
|
||||
self.shared_state_version_rx.clone()
|
||||
}
|
||||
|
||||
/// Pass arrived message to the safekeeper.
|
||||
pub async fn process_msg(
|
||||
self: &Arc<Self>,
|
||||
&self,
|
||||
msg: &ProposerAcceptorMessage,
|
||||
) -> Result<Option<AcceptorProposerMessage>> {
|
||||
if self.is_cancelled() {
|
||||
@@ -610,36 +655,53 @@ impl Timeline {
|
||||
}
|
||||
|
||||
let mut rmsg: Option<AcceptorProposerMessage>;
|
||||
let commit_lsn: Lsn;
|
||||
let term_flush_lsn: TermLsn;
|
||||
{
|
||||
let mut shared_state = self.write_shared_state().await;
|
||||
rmsg = shared_state.sk.process_msg(msg).await?;
|
||||
|
||||
// if this is AppendResponse, fill in proper hot standby feedback.
|
||||
if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg {
|
||||
resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback;
|
||||
resp.hs_feedback = self.walsenders.get_hotstandby();
|
||||
}
|
||||
|
||||
commit_lsn = shared_state.sk.state.inmem.commit_lsn;
|
||||
term_flush_lsn =
|
||||
TermLsn::from((shared_state.sk.get_term(), shared_state.sk.flush_lsn()));
|
||||
}
|
||||
self.term_flush_lsn_watch_tx.send(term_flush_lsn)?;
|
||||
self.commit_lsn_watch_tx.send(commit_lsn)?;
|
||||
Ok(rmsg)
|
||||
}
|
||||
|
||||
/// Returns wal_seg_size.
|
||||
pub async fn get_wal_seg_size(&self) -> usize {
|
||||
self.read_shared_state().await.get_wal_seg_size()
|
||||
self.write_shared_state().await.get_wal_seg_size()
|
||||
}
|
||||
|
||||
/// Returns true only if the timeline is loaded and active.
|
||||
pub async fn is_active(&self) -> bool {
|
||||
if self.is_cancelled() {
|
||||
return false;
|
||||
}
|
||||
|
||||
self.write_shared_state().await.active
|
||||
}
|
||||
|
||||
/// Returns state of the timeline.
|
||||
pub async fn get_state(&self) -> (TimelineMemState, TimelinePersistentState) {
|
||||
let state = self.read_shared_state().await;
|
||||
let state = self.write_shared_state().await;
|
||||
(state.sk.state.inmem.clone(), state.sk.state.clone())
|
||||
}
|
||||
|
||||
/// Returns latest backup_lsn.
|
||||
pub async fn get_wal_backup_lsn(&self) -> Lsn {
|
||||
self.read_shared_state().await.sk.state.inmem.backup_lsn
|
||||
self.write_shared_state().await.sk.state.inmem.backup_lsn
|
||||
}
|
||||
|
||||
/// Sets backup_lsn to the given value.
|
||||
pub async fn set_wal_backup_lsn(self: &Arc<Self>, backup_lsn: Lsn) -> Result<()> {
|
||||
pub async fn set_wal_backup_lsn(&self, backup_lsn: Lsn) -> Result<()> {
|
||||
if self.is_cancelled() {
|
||||
bail!(TimelineError::Cancelled(self.ttid));
|
||||
}
|
||||
@@ -653,34 +715,39 @@ impl Timeline {
|
||||
|
||||
/// Get safekeeper info for broadcasting to broker and other peers.
|
||||
pub async fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo {
|
||||
let standby_apply_lsn = self.walsenders.get_hotstandby().reply.apply_lsn;
|
||||
let shared_state = self.read_shared_state().await;
|
||||
shared_state.get_safekeeper_info(&self.ttid, conf, standby_apply_lsn)
|
||||
let shared_state = self.write_shared_state().await;
|
||||
shared_state.get_safekeeper_info(&self.ttid, conf)
|
||||
}
|
||||
|
||||
/// Update timeline state with peer safekeeper data.
|
||||
pub async fn record_safekeeper_info(
|
||||
self: &Arc<Self>,
|
||||
sk_info: SafekeeperTimelineInfo,
|
||||
) -> Result<()> {
|
||||
pub async fn record_safekeeper_info(&self, sk_info: SafekeeperTimelineInfo) -> Result<()> {
|
||||
let is_wal_backup_action_pending: bool;
|
||||
let commit_lsn: Lsn;
|
||||
{
|
||||
let mut shared_state = self.write_shared_state().await;
|
||||
shared_state.sk.record_safekeeper_info(&sk_info).await?;
|
||||
let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now());
|
||||
shared_state.peers_info.upsert(&peer_info);
|
||||
is_wal_backup_action_pending = self.update_status(&mut shared_state).await;
|
||||
commit_lsn = shared_state.sk.state.inmem.commit_lsn;
|
||||
}
|
||||
self.commit_lsn_watch_tx.send(commit_lsn)?;
|
||||
// Wake up wal backup launcher, if it is time to stop the offloading.
|
||||
if is_wal_backup_action_pending {
|
||||
self.wal_backup_launcher_tx.send(self.ttid).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Update in memory remote consistent lsn.
|
||||
pub async fn update_remote_consistent_lsn(self: &Arc<Self>, candidate: Lsn) {
|
||||
pub async fn update_remote_consistent_lsn(&self, candidate: Lsn) {
|
||||
let mut shared_state = self.write_shared_state().await;
|
||||
shared_state.sk.state.inmem.remote_consistent_lsn =
|
||||
max(shared_state.sk.state.inmem.remote_consistent_lsn, candidate);
|
||||
}
|
||||
|
||||
pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec<PeerInfo> {
|
||||
let shared_state = self.read_shared_state().await;
|
||||
let shared_state = self.write_shared_state().await;
|
||||
shared_state.get_peers(conf.heartbeat_timeout)
|
||||
}
|
||||
|
||||
@@ -702,7 +769,7 @@ impl Timeline {
|
||||
/// depending on assembled quorum (e.g. classic picture 8 from Raft paper).
|
||||
/// Thus we don't try to predict it here.
|
||||
pub async fn recovery_needed(&self, heartbeat_timeout: Duration) -> RecoveryNeededInfo {
|
||||
let ss = self.read_shared_state().await;
|
||||
let ss = self.write_shared_state().await;
|
||||
let term = ss.sk.state.acceptor_state.term;
|
||||
let last_log_term = ss.sk.get_epoch();
|
||||
let flush_lsn = ss.sk.flush_lsn();
|
||||
@@ -773,12 +840,12 @@ impl Timeline {
|
||||
|
||||
/// Returns flush_lsn.
|
||||
pub async fn get_flush_lsn(&self) -> Lsn {
|
||||
self.read_shared_state().await.sk.wal_store.flush_lsn()
|
||||
self.write_shared_state().await.sk.wal_store.flush_lsn()
|
||||
}
|
||||
|
||||
/// Delete WAL segments from disk that are no longer needed. This is determined
|
||||
/// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn.
|
||||
pub async fn remove_old_wal(self: &Arc<Self>) -> Result<()> {
|
||||
pub async fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> {
|
||||
if self.is_cancelled() {
|
||||
bail!(TimelineError::Cancelled(self.ttid));
|
||||
}
|
||||
@@ -794,8 +861,9 @@ impl Timeline {
|
||||
|
||||
let horizon_segno: XLogSegNo;
|
||||
let remover = {
|
||||
let shared_state = self.read_shared_state().await;
|
||||
horizon_segno = shared_state.get_horizon_segno(replication_horizon_lsn);
|
||||
let shared_state = self.write_shared_state().await;
|
||||
horizon_segno =
|
||||
shared_state.get_horizon_segno(wal_backup_enabled, replication_horizon_lsn);
|
||||
if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno {
|
||||
return Ok(()); // nothing to do
|
||||
}
|
||||
@@ -809,11 +877,7 @@ impl Timeline {
|
||||
|
||||
// update last_removed_segno
|
||||
let mut shared_state = self.write_shared_state().await;
|
||||
if shared_state.last_removed_segno != horizon_segno {
|
||||
shared_state.last_removed_segno = horizon_segno;
|
||||
} else {
|
||||
shared_state.skip_update = true;
|
||||
}
|
||||
shared_state.last_removed_segno = horizon_segno;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -821,40 +885,46 @@ impl Timeline {
|
||||
/// passed after the last save. This helps to keep remote_consistent_lsn up
|
||||
/// to date so that storage nodes restart doesn't cause many pageserver ->
|
||||
/// safekeeper reconnections.
|
||||
pub async fn maybe_persist_control_file(self: &Arc<Self>) -> Result<()> {
|
||||
let mut guard = self.write_shared_state().await;
|
||||
let changed = guard.sk.maybe_persist_inmem_control_file().await?;
|
||||
guard.skip_update = !changed;
|
||||
Ok(())
|
||||
pub async fn maybe_persist_control_file(&self) -> Result<()> {
|
||||
self.write_shared_state()
|
||||
.await
|
||||
.sk
|
||||
.maybe_persist_inmem_control_file()
|
||||
.await
|
||||
}
|
||||
|
||||
/// Gather timeline data for metrics.
|
||||
/// Gather timeline data for metrics. If the timeline is not active, returns
|
||||
/// None, we do not collect these.
|
||||
pub async fn info_for_metrics(&self) -> Option<FullTimelineInfo> {
|
||||
if self.is_cancelled() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let (ps_feedback_count, last_ps_feedback) = self.walsenders.get_ps_feedback_stats();
|
||||
let state = self.read_shared_state().await;
|
||||
Some(FullTimelineInfo {
|
||||
ttid: self.ttid,
|
||||
ps_feedback_count,
|
||||
last_ps_feedback,
|
||||
wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
|
||||
timeline_is_active: self.broker_active.load(Ordering::Relaxed),
|
||||
num_computes: self.walreceivers.get_num() as u32,
|
||||
last_removed_segno: state.last_removed_segno,
|
||||
epoch_start_lsn: state.sk.epoch_start_lsn,
|
||||
mem_state: state.sk.state.inmem.clone(),
|
||||
persisted_state: state.sk.state.clone(),
|
||||
flush_lsn: state.sk.wal_store.flush_lsn(),
|
||||
wal_storage: state.sk.wal_store.get_metrics(),
|
||||
})
|
||||
let state = self.write_shared_state().await;
|
||||
if state.active {
|
||||
Some(FullTimelineInfo {
|
||||
ttid: self.ttid,
|
||||
ps_feedback_count,
|
||||
last_ps_feedback,
|
||||
wal_backup_active: state.wal_backup_active,
|
||||
timeline_is_active: state.active,
|
||||
num_computes: self.walreceivers.get_num() as u32,
|
||||
last_removed_segno: state.last_removed_segno,
|
||||
epoch_start_lsn: state.sk.epoch_start_lsn,
|
||||
mem_state: state.sk.state.inmem.clone(),
|
||||
persisted_state: state.sk.state.clone(),
|
||||
flush_lsn: state.sk.wal_store.flush_lsn(),
|
||||
wal_storage: state.sk.wal_store.get_metrics(),
|
||||
})
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns in-memory timeline state to build a full debug dump.
|
||||
pub async fn memory_dump(&self) -> debug_dump::Memory {
|
||||
let state = self.read_shared_state().await;
|
||||
let state = self.write_shared_state().await;
|
||||
|
||||
let (write_lsn, write_record_lsn, flush_lsn, file_open) =
|
||||
state.sk.wal_store.internal_state();
|
||||
@@ -863,8 +933,8 @@ impl Timeline {
|
||||
is_cancelled: self.is_cancelled(),
|
||||
peers_info_len: state.peers_info.0.len(),
|
||||
walsenders: self.walsenders.get_all(),
|
||||
wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
|
||||
active: self.broker_active.load(Ordering::Relaxed),
|
||||
wal_backup_active: state.wal_backup_active,
|
||||
active: state.active,
|
||||
num_computes: self.walreceivers.get_num() as u32,
|
||||
last_removed_segno: state.last_removed_segno,
|
||||
epoch_start_lsn: state.sk.epoch_start_lsn,
|
||||
@@ -878,7 +948,7 @@ impl Timeline {
|
||||
|
||||
/// Apply a function to the control file state and persist it.
|
||||
pub async fn map_control_file<T>(
|
||||
self: &Arc<Self>,
|
||||
&self,
|
||||
f: impl FnOnce(&mut TimelinePersistentState) -> Result<T>,
|
||||
) -> Result<T> {
|
||||
let mut state = self.write_shared_state().await;
|
||||
|
||||
@@ -1,145 +0,0 @@
|
||||
//! The timeline manager task is responsible for managing the timeline's background tasks.
|
||||
//! It is spawned alongside each timeline and exits when the timeline is deleted.
|
||||
//! It watches for changes in the timeline state and decides when to spawn or kill background tasks.
|
||||
//! It also can manage some reactive state, like should the timeline be active for broker pushes or not.
|
||||
|
||||
use std::{sync::Arc, time::Duration};
|
||||
|
||||
use tracing::{info, instrument, warn};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL},
|
||||
timeline::{PeerInfo, ReadGuardSharedState, Timeline},
|
||||
timelines_set::TimelinesSet,
|
||||
wal_backup::{self, WalBackupTaskHandle},
|
||||
SafeKeeperConf,
|
||||
};
|
||||
|
||||
pub struct StateSnapshot {
|
||||
pub commit_lsn: Lsn,
|
||||
pub backup_lsn: Lsn,
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
pub peers: Vec<PeerInfo>,
|
||||
}
|
||||
|
||||
impl StateSnapshot {
|
||||
/// Create a new snapshot of the timeline state.
|
||||
fn new(read_guard: ReadGuardSharedState, heartbeat_timeout: Duration) -> Self {
|
||||
Self {
|
||||
commit_lsn: read_guard.sk.state.inmem.commit_lsn,
|
||||
backup_lsn: read_guard.sk.state.inmem.backup_lsn,
|
||||
remote_consistent_lsn: read_guard.sk.state.inmem.remote_consistent_lsn,
|
||||
peers: read_guard.get_peers(heartbeat_timeout),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Control how often the manager task should wake up to check updates.
|
||||
/// There is no need to check for updates more often than this.
|
||||
const REFRESH_INTERVAL: Duration = Duration::from_millis(300);
|
||||
|
||||
/// This task gets spawned alongside each timeline and is responsible for managing the timeline's
|
||||
/// background tasks.
|
||||
#[instrument(name = "manager", skip_all, fields(ttid = %tli.ttid))]
|
||||
pub async fn main_task(
|
||||
tli: Arc<Timeline>,
|
||||
conf: SafeKeeperConf,
|
||||
broker_active_set: Arc<TimelinesSet>,
|
||||
) {
|
||||
scopeguard::defer! {
|
||||
if tli.is_cancelled() {
|
||||
info!("manager task finished");
|
||||
} else {
|
||||
warn!("manager task finished prematurely");
|
||||
}
|
||||
};
|
||||
|
||||
// sets whether timeline is active for broker pushes or not
|
||||
let mut tli_broker_active = broker_active_set.guard(tli.clone());
|
||||
|
||||
let ttid = tli.ttid;
|
||||
let wal_seg_size = tli.get_wal_seg_size().await;
|
||||
let heartbeat_timeout = conf.heartbeat_timeout;
|
||||
|
||||
let mut state_version_rx = tli.get_state_version_rx();
|
||||
|
||||
let walreceivers = tli.get_walreceivers();
|
||||
let mut num_computes_rx = walreceivers.get_num_rx();
|
||||
|
||||
// list of background tasks
|
||||
let mut backup_task: Option<WalBackupTaskHandle> = None;
|
||||
|
||||
let last_state = 'outer: loop {
|
||||
MANAGER_ITERATIONS_TOTAL.inc();
|
||||
|
||||
let state_snapshot = StateSnapshot::new(tli.read_shared_state().await, heartbeat_timeout);
|
||||
let num_computes = *num_computes_rx.borrow();
|
||||
|
||||
let is_wal_backup_required =
|
||||
wal_backup::is_wal_backup_required(wal_seg_size, num_computes, &state_snapshot);
|
||||
|
||||
if conf.is_wal_backup_enabled() {
|
||||
wal_backup::update_task(
|
||||
&conf,
|
||||
ttid,
|
||||
is_wal_backup_required,
|
||||
&state_snapshot,
|
||||
&mut backup_task,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
let is_active = is_wal_backup_required
|
||||
|| num_computes > 0
|
||||
|| state_snapshot.remote_consistent_lsn < state_snapshot.commit_lsn;
|
||||
|
||||
// update the broker timeline set
|
||||
if tli_broker_active.set(is_active) {
|
||||
// write log if state has changed
|
||||
info!(
|
||||
"timeline active={} now, remote_consistent_lsn={}, commit_lsn={}",
|
||||
is_active, state_snapshot.remote_consistent_lsn, state_snapshot.commit_lsn,
|
||||
);
|
||||
|
||||
MANAGER_ACTIVE_CHANGES.inc();
|
||||
|
||||
if !is_active {
|
||||
// TODO: maybe use tokio::spawn?
|
||||
if let Err(e) = tli.maybe_persist_control_file().await {
|
||||
warn!("control file save in update_status failed: {:?}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// update the state in Arc<Timeline>
|
||||
tli.wal_backup_active
|
||||
.store(backup_task.is_some(), std::sync::atomic::Ordering::Relaxed);
|
||||
tli.broker_active
|
||||
.store(is_active, std::sync::atomic::Ordering::Relaxed);
|
||||
|
||||
// wait until something changes. tx channels are stored under Arc, so they will not be
|
||||
// dropped until the manager task is finished.
|
||||
tokio::select! {
|
||||
_ = tli.cancel.cancelled() => {
|
||||
// timeline was deleted
|
||||
break 'outer state_snapshot;
|
||||
}
|
||||
_ = async {
|
||||
// don't wake up on every state change, but at most every REFRESH_INTERVAL
|
||||
tokio::time::sleep(REFRESH_INTERVAL).await;
|
||||
let _ = state_version_rx.changed().await;
|
||||
} => {
|
||||
// state was updated
|
||||
}
|
||||
_ = num_computes_rx.changed() => {
|
||||
// number of connected computes was updated
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// shutdown background tasks
|
||||
if conf.is_wal_backup_enabled() {
|
||||
wal_backup::update_task(&conf, ttid, false, &last_state, &mut backup_task).await;
|
||||
}
|
||||
}
|
||||
@@ -4,7 +4,6 @@
|
||||
|
||||
use crate::safekeeper::ServerInfo;
|
||||
use crate::timeline::{Timeline, TimelineError};
|
||||
use crate::timelines_set::TimelinesSet;
|
||||
use crate::SafeKeeperConf;
|
||||
use anyhow::{bail, Context, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
@@ -12,16 +11,16 @@ use once_cell::sync::Lazy;
|
||||
use serde::Serialize;
|
||||
use std::collections::HashMap;
|
||||
use std::str::FromStr;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use tokio::sync::mpsc::Sender;
|
||||
use tracing::*;
|
||||
use utils::id::{TenantId, TenantTimelineId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
struct GlobalTimelinesState {
|
||||
timelines: HashMap<TenantTimelineId, Arc<Timeline>>,
|
||||
wal_backup_launcher_tx: Option<Sender<TenantTimelineId>>,
|
||||
conf: Option<SafeKeeperConf>,
|
||||
broker_active_set: Arc<TimelinesSet>,
|
||||
load_lock: Arc<tokio::sync::Mutex<TimelineLoadLock>>,
|
||||
}
|
||||
|
||||
@@ -37,8 +36,11 @@ impl GlobalTimelinesState {
|
||||
}
|
||||
|
||||
/// Get dependencies for a timeline constructor.
|
||||
fn get_dependencies(&self) -> (SafeKeeperConf, Arc<TimelinesSet>) {
|
||||
(self.get_conf().clone(), self.broker_active_set.clone())
|
||||
fn get_dependencies(&self) -> (SafeKeeperConf, Sender<TenantTimelineId>) {
|
||||
(
|
||||
self.get_conf().clone(),
|
||||
self.wal_backup_launcher_tx.as_ref().unwrap().clone(),
|
||||
)
|
||||
}
|
||||
|
||||
/// Insert timeline into the map. Returns error if timeline with the same id already exists.
|
||||
@@ -63,8 +65,8 @@ impl GlobalTimelinesState {
|
||||
static TIMELINES_STATE: Lazy<Mutex<GlobalTimelinesState>> = Lazy::new(|| {
|
||||
Mutex::new(GlobalTimelinesState {
|
||||
timelines: HashMap::new(),
|
||||
wal_backup_launcher_tx: None,
|
||||
conf: None,
|
||||
broker_active_set: Arc::new(TimelinesSet::default()),
|
||||
load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)),
|
||||
})
|
||||
});
|
||||
@@ -74,11 +76,16 @@ pub struct GlobalTimelines;
|
||||
|
||||
impl GlobalTimelines {
|
||||
/// Inject dependencies needed for the timeline constructors and load all timelines to memory.
|
||||
pub async fn init(conf: SafeKeeperConf) -> Result<()> {
|
||||
pub async fn init(
|
||||
conf: SafeKeeperConf,
|
||||
wal_backup_launcher_tx: Sender<TenantTimelineId>,
|
||||
) -> Result<()> {
|
||||
// clippy isn't smart enough to understand that drop(state) releases the
|
||||
// lock, so use explicit block
|
||||
let tenants_dir = {
|
||||
let mut state = TIMELINES_STATE.lock().unwrap();
|
||||
assert!(state.wal_backup_launcher_tx.is_none());
|
||||
state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx);
|
||||
state.conf = Some(conf);
|
||||
|
||||
// Iterate through all directories and load tenants for all directories
|
||||
@@ -122,9 +129,12 @@ impl GlobalTimelines {
|
||||
/// this function is called during init when nothing else is running, so
|
||||
/// this is fine.
|
||||
async fn load_tenant_timelines(tenant_id: TenantId) -> Result<()> {
|
||||
let (conf, broker_active_set) = {
|
||||
let (conf, wal_backup_launcher_tx) = {
|
||||
let state = TIMELINES_STATE.lock().unwrap();
|
||||
state.get_dependencies()
|
||||
(
|
||||
state.get_conf().clone(),
|
||||
state.wal_backup_launcher_tx.as_ref().unwrap().clone(),
|
||||
)
|
||||
};
|
||||
|
||||
let timelines_dir = conf.tenant_dir(&tenant_id);
|
||||
@@ -137,7 +147,7 @@ impl GlobalTimelines {
|
||||
TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or(""))
|
||||
{
|
||||
let ttid = TenantTimelineId::new(tenant_id, timeline_id);
|
||||
match Timeline::load_timeline(&conf, ttid) {
|
||||
match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx.clone()) {
|
||||
Ok(timeline) => {
|
||||
let tli = Arc::new(timeline);
|
||||
TIMELINES_STATE
|
||||
@@ -145,7 +155,8 @@ impl GlobalTimelines {
|
||||
.unwrap()
|
||||
.timelines
|
||||
.insert(ttid, tli.clone());
|
||||
tli.bootstrap(&conf, broker_active_set.clone());
|
||||
tli.bootstrap(&conf);
|
||||
tli.update_status_notify().await.unwrap();
|
||||
}
|
||||
// If we can't load a timeline, it's most likely because of a corrupted
|
||||
// directory. We will log an error and won't allow to delete/recreate
|
||||
@@ -178,9 +189,9 @@ impl GlobalTimelines {
|
||||
_guard: &tokio::sync::MutexGuard<'a, TimelineLoadLock>,
|
||||
ttid: TenantTimelineId,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
let (conf, broker_active_set) = TIMELINES_STATE.lock().unwrap().get_dependencies();
|
||||
let (conf, wal_backup_launcher_tx) = TIMELINES_STATE.lock().unwrap().get_dependencies();
|
||||
|
||||
match Timeline::load_timeline(&conf, ttid) {
|
||||
match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx) {
|
||||
Ok(timeline) => {
|
||||
let tli = Arc::new(timeline);
|
||||
|
||||
@@ -191,7 +202,7 @@ impl GlobalTimelines {
|
||||
.timelines
|
||||
.insert(ttid, tli.clone());
|
||||
|
||||
tli.bootstrap(&conf, broker_active_set);
|
||||
tli.bootstrap(&conf);
|
||||
|
||||
Ok(tli)
|
||||
}
|
||||
@@ -210,10 +221,6 @@ impl GlobalTimelines {
|
||||
TIMELINES_STATE.lock().unwrap().get_conf().clone()
|
||||
}
|
||||
|
||||
pub fn get_global_broker_active_set() -> Arc<TimelinesSet> {
|
||||
TIMELINES_STATE.lock().unwrap().broker_active_set.clone()
|
||||
}
|
||||
|
||||
/// Create a new timeline with the given id. If the timeline already exists, returns
|
||||
/// an existing timeline.
|
||||
pub async fn create(
|
||||
@@ -222,7 +229,7 @@ impl GlobalTimelines {
|
||||
commit_lsn: Lsn,
|
||||
local_start_lsn: Lsn,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
let (conf, broker_active_set) = {
|
||||
let (conf, wal_backup_launcher_tx) = {
|
||||
let state = TIMELINES_STATE.lock().unwrap();
|
||||
if let Ok(timeline) = state.get(&ttid) {
|
||||
// Timeline already exists, return it.
|
||||
@@ -236,6 +243,7 @@ impl GlobalTimelines {
|
||||
let timeline = Arc::new(Timeline::create_empty(
|
||||
&conf,
|
||||
ttid,
|
||||
wal_backup_launcher_tx,
|
||||
server_info,
|
||||
commit_lsn,
|
||||
local_start_lsn,
|
||||
@@ -256,10 +264,7 @@ impl GlobalTimelines {
|
||||
// Write the new timeline to the disk and start background workers.
|
||||
// Bootstrap is transactional, so if it fails, the timeline will be deleted,
|
||||
// and the state on disk should remain unchanged.
|
||||
if let Err(e) = timeline
|
||||
.init_new(&mut shared_state, &conf, broker_active_set)
|
||||
.await
|
||||
{
|
||||
if let Err(e) = timeline.init_new(&mut shared_state, &conf).await {
|
||||
// Note: the most likely reason for init failure is that the timeline
|
||||
// directory already exists on disk. This happens when timeline is corrupted
|
||||
// and wasn't loaded from disk on startup because of that. We want to preserve
|
||||
@@ -276,6 +281,8 @@ impl GlobalTimelines {
|
||||
// We are done with bootstrap, release the lock, return the timeline.
|
||||
// {} block forces release before .await
|
||||
}
|
||||
timeline.update_status_notify().await?;
|
||||
timeline.wal_backup_launcher_tx.send(timeline.ttid).await?;
|
||||
Ok(timeline)
|
||||
}
|
||||
|
||||
@@ -328,13 +335,12 @@ impl GlobalTimelines {
|
||||
let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid);
|
||||
match tli_res {
|
||||
Ok(timeline) => {
|
||||
let was_active = timeline.broker_active.load(Ordering::Relaxed);
|
||||
|
||||
// Take a lock and finish the deletion holding this mutex.
|
||||
let mut shared_state = timeline.write_shared_state().await;
|
||||
|
||||
info!("deleting timeline {}, only_local={}", ttid, only_local);
|
||||
let dir_existed = timeline.delete(&mut shared_state, only_local).await?;
|
||||
let (dir_existed, was_active) =
|
||||
timeline.delete(&mut shared_state, only_local).await?;
|
||||
|
||||
// Remove timeline from the map.
|
||||
// FIXME: re-enable it once we fix the issue with recreation of deleted timelines
|
||||
@@ -343,7 +349,7 @@ impl GlobalTimelines {
|
||||
|
||||
Ok(TimelineDeleteForceResult {
|
||||
dir_existed,
|
||||
was_active, // TODO: we probably should remove this field
|
||||
was_active,
|
||||
})
|
||||
}
|
||||
Err(_) => {
|
||||
|
||||
@@ -1,90 +0,0 @@
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
use crate::timeline::Timeline;
|
||||
|
||||
/// Set of timelines, supports operations:
|
||||
/// - add timeline
|
||||
/// - remove timeline
|
||||
/// - clone the set
|
||||
///
|
||||
/// Usually used for keeping subset of timelines. For example active timelines that require broker push.
|
||||
pub struct TimelinesSet {
|
||||
timelines: std::sync::Mutex<HashMap<TenantTimelineId, Arc<Timeline>>>,
|
||||
}
|
||||
|
||||
impl Default for TimelinesSet {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
timelines: std::sync::Mutex::new(HashMap::new()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TimelinesSet {
|
||||
pub fn insert(&self, tli: Arc<Timeline>) {
|
||||
self.timelines.lock().unwrap().insert(tli.ttid, tli);
|
||||
}
|
||||
|
||||
pub fn delete(&self, ttid: &TenantTimelineId) {
|
||||
self.timelines.lock().unwrap().remove(ttid);
|
||||
}
|
||||
|
||||
/// If present is true, adds timeline to the set, otherwise removes it.
|
||||
pub fn set_present(&self, tli: Arc<Timeline>, present: bool) {
|
||||
if present {
|
||||
self.insert(tli);
|
||||
} else {
|
||||
self.delete(&tli.ttid);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_present(&self, ttid: &TenantTimelineId) -> bool {
|
||||
self.timelines.lock().unwrap().contains_key(ttid)
|
||||
}
|
||||
|
||||
/// Returns all timelines in the set.
|
||||
pub fn get_all(&self) -> Vec<Arc<Timeline>> {
|
||||
self.timelines.lock().unwrap().values().cloned().collect()
|
||||
}
|
||||
|
||||
/// Returns a timeline guard for easy presence control.
|
||||
pub fn guard(self: &Arc<Self>, tli: Arc<Timeline>) -> TimelineSetGuard {
|
||||
let is_present = self.is_present(&tli.ttid);
|
||||
TimelineSetGuard {
|
||||
timelines_set: self.clone(),
|
||||
tli,
|
||||
is_present,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Guard is used to add or remove timeline from the set.
|
||||
/// If the timeline present in set, it will be removed from it on drop.
|
||||
/// Note: do not use more than one guard for the same timeline, it caches the presence state.
|
||||
/// It is designed to be used in the manager task only.
|
||||
pub struct TimelineSetGuard {
|
||||
timelines_set: Arc<TimelinesSet>,
|
||||
tli: Arc<Timeline>,
|
||||
is_present: bool,
|
||||
}
|
||||
|
||||
impl TimelineSetGuard {
|
||||
/// Returns true if the state was changed.
|
||||
pub fn set(&mut self, present: bool) -> bool {
|
||||
if present == self.is_present {
|
||||
return false;
|
||||
}
|
||||
self.is_present = present;
|
||||
self.timelines_set.set_present(self.tli.clone(), present);
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for TimelineSetGuard {
|
||||
fn drop(&mut self) {
|
||||
// remove timeline from the map on drop
|
||||
self.timelines_set.delete(&self.tli.ttid);
|
||||
}
|
||||
}
|
||||
@@ -9,7 +9,7 @@ use utils::backoff;
|
||||
use utils::id::NodeId;
|
||||
|
||||
use std::cmp::min;
|
||||
use std::collections::HashSet;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::num::NonZeroU32;
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
@@ -29,10 +29,9 @@ use tracing::*;
|
||||
|
||||
use utils::{id::TenantTimelineId, lsn::Lsn};
|
||||
|
||||
use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS};
|
||||
use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS};
|
||||
use crate::timeline::{PeerInfo, Timeline};
|
||||
use crate::timeline_manager::StateSnapshot;
|
||||
use crate::{GlobalTimelines, SafeKeeperConf, WAL_BACKUP_RUNTIME};
|
||||
use crate::{GlobalTimelines, SafeKeeperConf};
|
||||
|
||||
use once_cell::sync::OnceCell;
|
||||
|
||||
@@ -42,84 +41,35 @@ const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000;
|
||||
/// Default buffer size when interfacing with [`tokio::fs::File`].
|
||||
const BUFFER_SIZE: usize = 32 * 1024;
|
||||
|
||||
pub struct WalBackupTaskHandle {
|
||||
/// Check whether wal backup is required for timeline. If yes, mark that launcher is
|
||||
/// aware of current status and return the timeline.
|
||||
async fn is_wal_backup_required(ttid: TenantTimelineId) -> Option<Arc<Timeline>> {
|
||||
match GlobalTimelines::get(ttid).ok() {
|
||||
Some(tli) => {
|
||||
tli.wal_backup_attend().await;
|
||||
Some(tli)
|
||||
}
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
|
||||
struct WalBackupTaskHandle {
|
||||
shutdown_tx: Sender<()>,
|
||||
handle: JoinHandle<()>,
|
||||
}
|
||||
|
||||
/// Do we have anything to upload to S3, i.e. should safekeepers run backup activity?
|
||||
pub fn is_wal_backup_required(
|
||||
wal_seg_size: usize,
|
||||
num_computes: usize,
|
||||
state: &StateSnapshot,
|
||||
) -> bool {
|
||||
num_computes > 0 ||
|
||||
// Currently only the whole segment is offloaded, so compare segment numbers.
|
||||
(state.commit_lsn.segment_number(wal_seg_size) > state.backup_lsn.segment_number(wal_seg_size))
|
||||
struct WalBackupTimelineEntry {
|
||||
timeline: Arc<Timeline>,
|
||||
handle: Option<WalBackupTaskHandle>,
|
||||
}
|
||||
|
||||
/// Based on peer information determine which safekeeper should offload; if it
|
||||
/// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task
|
||||
/// is running, kill it.
|
||||
pub async fn update_task(
|
||||
conf: &SafeKeeperConf,
|
||||
ttid: TenantTimelineId,
|
||||
need_backup: bool,
|
||||
state: &StateSnapshot,
|
||||
entry: &mut Option<WalBackupTaskHandle>,
|
||||
) {
|
||||
let (offloader, election_dbg_str) =
|
||||
determine_offloader(&state.peers, state.backup_lsn, ttid, conf);
|
||||
let elected_me = Some(conf.my_id) == offloader;
|
||||
|
||||
let should_task_run = need_backup && elected_me;
|
||||
|
||||
// start or stop the task
|
||||
if should_task_run != (entry.is_some()) {
|
||||
if should_task_run {
|
||||
info!("elected for backup: {}", election_dbg_str);
|
||||
|
||||
let (shutdown_tx, shutdown_rx) = mpsc::channel(1);
|
||||
let timeline_dir = conf.timeline_dir(&ttid);
|
||||
|
||||
let async_task = backup_task_main(
|
||||
ttid,
|
||||
timeline_dir,
|
||||
conf.workdir.clone(),
|
||||
conf.backup_parallel_jobs,
|
||||
shutdown_rx,
|
||||
);
|
||||
|
||||
let handle = if conf.current_thread_runtime {
|
||||
tokio::spawn(async_task)
|
||||
} else {
|
||||
WAL_BACKUP_RUNTIME.spawn(async_task)
|
||||
};
|
||||
|
||||
*entry = Some(WalBackupTaskHandle {
|
||||
shutdown_tx,
|
||||
handle,
|
||||
});
|
||||
} else {
|
||||
if !need_backup {
|
||||
// don't need backup at all
|
||||
info!("stepping down from backup, need_backup={}", need_backup);
|
||||
} else {
|
||||
// someone else has been elected
|
||||
info!("stepping down from backup: {}", election_dbg_str);
|
||||
}
|
||||
shut_down_task(entry).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn shut_down_task(entry: &mut Option<WalBackupTaskHandle>) {
|
||||
if let Some(wb_handle) = entry.take() {
|
||||
async fn shut_down_task(ttid: TenantTimelineId, entry: &mut WalBackupTimelineEntry) {
|
||||
if let Some(wb_handle) = entry.handle.take() {
|
||||
// Tell the task to shutdown. Error means task exited earlier, that's ok.
|
||||
let _ = wb_handle.shutdown_tx.send(()).await;
|
||||
// Await the task itself. TODO: restart panicked tasks earlier.
|
||||
if let Err(e) = wb_handle.handle.await {
|
||||
warn!("WAL backup task panicked: {}", e);
|
||||
warn!("WAL backup task for {} panicked: {}", ttid, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -176,6 +126,49 @@ fn determine_offloader(
|
||||
}
|
||||
}
|
||||
|
||||
/// Based on peer information determine which safekeeper should offload; if it
|
||||
/// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task
|
||||
/// is running, kill it.
|
||||
async fn update_task(
|
||||
conf: &SafeKeeperConf,
|
||||
ttid: TenantTimelineId,
|
||||
entry: &mut WalBackupTimelineEntry,
|
||||
) {
|
||||
let alive_peers = entry.timeline.get_peers(conf).await;
|
||||
let wal_backup_lsn = entry.timeline.get_wal_backup_lsn().await;
|
||||
let (offloader, election_dbg_str) =
|
||||
determine_offloader(&alive_peers, wal_backup_lsn, ttid, conf);
|
||||
let elected_me = Some(conf.my_id) == offloader;
|
||||
|
||||
if elected_me != (entry.handle.is_some()) {
|
||||
if elected_me {
|
||||
info!("elected for backup: {}", election_dbg_str);
|
||||
|
||||
let (shutdown_tx, shutdown_rx) = mpsc::channel(1);
|
||||
let timeline_dir = conf.timeline_dir(&ttid);
|
||||
|
||||
let handle = tokio::spawn(
|
||||
backup_task_main(
|
||||
ttid,
|
||||
timeline_dir,
|
||||
conf.workdir.clone(),
|
||||
conf.backup_parallel_jobs,
|
||||
shutdown_rx,
|
||||
)
|
||||
.in_current_span(),
|
||||
);
|
||||
|
||||
entry.handle = Some(WalBackupTaskHandle {
|
||||
shutdown_tx,
|
||||
handle,
|
||||
});
|
||||
} else {
|
||||
info!("stepping down from backup: {}", election_dbg_str);
|
||||
shut_down_task(ttid, entry).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::new();
|
||||
|
||||
// Storage must be configured and initialized when this is called.
|
||||
@@ -197,6 +190,67 @@ pub fn init_remote_storage(conf: &SafeKeeperConf) {
|
||||
});
|
||||
}
|
||||
|
||||
const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000;
|
||||
|
||||
/// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup
|
||||
/// tasks. Having this in separate task simplifies locking, allows to reap
|
||||
/// panics and separate elections from offloading itself.
|
||||
pub async fn wal_backup_launcher_task_main(
|
||||
conf: SafeKeeperConf,
|
||||
mut wal_backup_launcher_rx: Receiver<TenantTimelineId>,
|
||||
) -> anyhow::Result<()> {
|
||||
info!(
|
||||
"WAL backup launcher started, remote config {:?}",
|
||||
conf.remote_storage
|
||||
);
|
||||
|
||||
// Presence in this map means launcher is aware s3 offloading is needed for
|
||||
// the timeline, but task is started only if it makes sense for to offload
|
||||
// from this safekeeper.
|
||||
let mut tasks: HashMap<TenantTimelineId, WalBackupTimelineEntry> = HashMap::new();
|
||||
|
||||
let mut ticker = tokio::time::interval(Duration::from_millis(CHECK_TASKS_INTERVAL_MSEC));
|
||||
loop {
|
||||
tokio::select! {
|
||||
ttid = wal_backup_launcher_rx.recv() => {
|
||||
// channel is never expected to get closed
|
||||
let ttid = ttid.unwrap();
|
||||
if !conf.is_wal_backup_enabled() {
|
||||
continue; /* just drain the channel and do nothing */
|
||||
}
|
||||
async {
|
||||
let timeline = is_wal_backup_required(ttid).await;
|
||||
// do we need to do anything at all?
|
||||
if timeline.is_some() != tasks.contains_key(&ttid) {
|
||||
if let Some(timeline) = timeline {
|
||||
// need to start the task
|
||||
let entry = tasks.entry(ttid).or_insert(WalBackupTimelineEntry {
|
||||
timeline,
|
||||
handle: None,
|
||||
});
|
||||
update_task(&conf, ttid, entry).await;
|
||||
} else {
|
||||
// need to stop the task
|
||||
info!("stopping WAL backup task");
|
||||
let mut entry = tasks.remove(&ttid).unwrap();
|
||||
shut_down_task(ttid, &mut entry).await;
|
||||
}
|
||||
}
|
||||
}.instrument(info_span!("WAL backup", ttid = %ttid)).await;
|
||||
}
|
||||
// For each timeline needing offloading, check if this safekeeper
|
||||
// should do the job and start/stop the task accordingly.
|
||||
_ = ticker.tick() => {
|
||||
for (ttid, entry) in tasks.iter_mut() {
|
||||
update_task(&conf, *ttid, entry)
|
||||
.instrument(info_span!("WAL backup", ttid = %ttid))
|
||||
.await;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct WalBackupTask {
|
||||
timeline: Arc<Timeline>,
|
||||
timeline_dir: Utf8PathBuf,
|
||||
@@ -207,7 +261,6 @@ struct WalBackupTask {
|
||||
}
|
||||
|
||||
/// Offload single timeline.
|
||||
#[instrument(name = "WAL backup", skip_all, fields(ttid = %ttid))]
|
||||
async fn backup_task_main(
|
||||
ttid: TenantTimelineId,
|
||||
timeline_dir: Utf8PathBuf,
|
||||
@@ -215,8 +268,6 @@ async fn backup_task_main(
|
||||
parallel_jobs: usize,
|
||||
mut shutdown_rx: Receiver<()>,
|
||||
) {
|
||||
let _guard = WAL_BACKUP_TASKS.guard();
|
||||
|
||||
info!("started");
|
||||
let res = GlobalTimelines::get(ttid);
|
||||
if let Err(e) = res {
|
||||
|
||||
@@ -277,6 +277,14 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
|
||||
debug!("started");
|
||||
let await_duration = conf.partial_backup_timeout;
|
||||
|
||||
let mut cancellation_rx = match tli.get_cancellation_rx() {
|
||||
Ok(rx) => rx,
|
||||
Err(_) => {
|
||||
info!("timeline canceled during task start");
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
// sleep for random time to avoid thundering herd
|
||||
{
|
||||
let randf64 = rand::thread_rng().gen_range(0.0..1.0);
|
||||
@@ -319,7 +327,7 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
|
||||
&& flush_lsn_rx.borrow().term == seg.term
|
||||
{
|
||||
tokio::select! {
|
||||
_ = backup.tli.cancel.cancelled() => {
|
||||
_ = cancellation_rx.changed() => {
|
||||
info!("timeline canceled");
|
||||
return;
|
||||
}
|
||||
@@ -332,7 +340,7 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
|
||||
// if we don't have any data and zero LSNs, wait for something
|
||||
while flush_lsn_rx.borrow().lsn == Lsn(0) {
|
||||
tokio::select! {
|
||||
_ = backup.tli.cancel.cancelled() => {
|
||||
_ = cancellation_rx.changed() => {
|
||||
info!("timeline canceled");
|
||||
return;
|
||||
}
|
||||
@@ -349,7 +357,7 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
|
||||
// waiting until timeout expires OR segno changes
|
||||
'inner: loop {
|
||||
tokio::select! {
|
||||
_ = backup.tli.cancel.cancelled() => {
|
||||
_ = cancellation_rx.changed() => {
|
||||
info!("timeline canceled");
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -147,7 +147,6 @@ async fn publish(client: Option<BrokerClientChannel>, n_keys: u64) {
|
||||
http_connstr: "zenith-1-sk-1.local:7677".to_owned(),
|
||||
local_start_lsn: 0,
|
||||
availability_zone: None,
|
||||
standby_horizon: 0,
|
||||
};
|
||||
counter += 1;
|
||||
yield info;
|
||||
|
||||
@@ -42,7 +42,6 @@ message SafekeeperTimelineInfo {
|
||||
uint64 remote_consistent_lsn = 7;
|
||||
uint64 peer_horizon_lsn = 8;
|
||||
uint64 local_start_lsn = 9;
|
||||
uint64 standby_horizon = 14;
|
||||
// A connection string to use for WAL receiving.
|
||||
string safekeeper_connstr = 10;
|
||||
// HTTP endpoint connection string
|
||||
@@ -106,6 +105,4 @@ message SafekeeperDiscoveryResponse {
|
||||
string safekeeper_connstr = 4;
|
||||
// Availability zone of a safekeeper.
|
||||
optional string availability_zone = 5;
|
||||
// Replica apply LSN
|
||||
uint64 standby_horizon = 6;
|
||||
}
|
||||
|
||||
@@ -736,7 +736,6 @@ mod tests {
|
||||
http_connstr: "neon-1-sk-1.local:7677".to_owned(),
|
||||
local_start_lsn: 0,
|
||||
availability_zone: None,
|
||||
standby_horizon: 0,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -142,7 +142,6 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
|
||||
"pageserver_resident_physical_size",
|
||||
"pageserver_io_operations_bytes_total",
|
||||
"pageserver_last_record_lsn",
|
||||
"pageserver_standby_horizon",
|
||||
"pageserver_smgr_query_seconds_bucket",
|
||||
"pageserver_smgr_query_seconds_count",
|
||||
"pageserver_smgr_query_seconds_sum",
|
||||
|
||||
@@ -1625,7 +1625,7 @@ class NeonCli(AbstractNeonCli):
|
||||
args.extend(["-c", "switch_aux_file_policy:v1"])
|
||||
|
||||
if aux_file_v2 is AuxFileStore.CrossValidation:
|
||||
args.extend(["-c", "switch_aux_file_policy:cross-validation"])
|
||||
args.extend(["-c", "switch_aux_file_policy:cross_validation"])
|
||||
|
||||
if set_default:
|
||||
args.append("--set-default")
|
||||
@@ -2721,12 +2721,7 @@ class PgBin:
|
||||
env.update(env_add)
|
||||
return env
|
||||
|
||||
def run(
|
||||
self,
|
||||
command: List[str],
|
||||
env: Optional[Env] = None,
|
||||
cwd: Optional[Union[str, Path]] = None,
|
||||
):
|
||||
def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None):
|
||||
"""
|
||||
Run one of the postgres binaries.
|
||||
|
||||
@@ -2788,28 +2783,6 @@ class PgBin:
|
||||
log.info(f"last checkpoint at {checkpoint_lsn}")
|
||||
return Lsn(checkpoint_lsn)
|
||||
|
||||
def take_fullbackup(
|
||||
self,
|
||||
pageserver: NeonPageserver,
|
||||
tenant: TenantId,
|
||||
timeline: TimelineId,
|
||||
lsn: Lsn,
|
||||
output: Path,
|
||||
):
|
||||
"""
|
||||
Request fullbackup from pageserver, store it at 'output'.
|
||||
"""
|
||||
cmd = [
|
||||
"psql",
|
||||
"--no-psqlrc",
|
||||
pageserver.connstr(),
|
||||
"-c",
|
||||
f"fullbackup {tenant} {timeline} {lsn}",
|
||||
"-o",
|
||||
str(output),
|
||||
]
|
||||
self.run_capture(cmd)
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion) -> PgBin:
|
||||
@@ -4172,12 +4145,7 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]:
|
||||
|
||||
|
||||
# pg is the existing and running compute node, that we want to compare with a basebackup
|
||||
def check_restored_datadir_content(
|
||||
test_output_dir: Path,
|
||||
env: NeonEnv,
|
||||
endpoint: Endpoint,
|
||||
ignored_files: Optional[list[str]] = None,
|
||||
):
|
||||
def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint):
|
||||
pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
|
||||
|
||||
# Get the timeline ID. We need it for the 'basebackup' command
|
||||
@@ -4230,10 +4198,6 @@ def check_restored_datadir_content(
|
||||
if not f.startswith("pg_xact") and not f.startswith("pg_multixact")
|
||||
]
|
||||
|
||||
if ignored_files:
|
||||
pgdata_files = [f for f in pgdata_files if f not in ignored_files]
|
||||
restored_files = [f for f in restored_files if f not in ignored_files]
|
||||
|
||||
# check that file sets are equal
|
||||
assert pgdata_files == restored_files
|
||||
|
||||
@@ -4324,17 +4288,6 @@ def wait_replica_caughtup(primary: Endpoint, secondary: Endpoint):
|
||||
time.sleep(1)
|
||||
|
||||
|
||||
def log_replica_lag(primary: Endpoint, secondary: Endpoint):
|
||||
last_replay_lsn = Lsn(
|
||||
secondary.safe_psql_scalar("SELECT pg_last_wal_replay_lsn()", log_query=False)
|
||||
)
|
||||
primary_lsn = Lsn(
|
||||
primary.safe_psql_scalar("SELECT pg_current_wal_flush_lsn()", log_query=False)
|
||||
)
|
||||
lag = primary_lsn - last_replay_lsn
|
||||
log.info(f"primary_lsn={primary_lsn}, replay_lsn={last_replay_lsn}, lag={lag}")
|
||||
|
||||
|
||||
def wait_for_last_flush_lsn(
|
||||
env: NeonEnv,
|
||||
endpoint: Endpoint,
|
||||
|
||||
@@ -70,7 +70,6 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
|
||||
# this is expected given our collaborative shutdown approach for the UploadQueue
|
||||
".*Compaction failed.*, retrying in .*: Other\\(queue is in state Stopped.*",
|
||||
".*Compaction failed.*, retrying in .*: ShuttingDown",
|
||||
".*Compaction failed.*, retrying in .*: Other\\(timeline shutting down.*",
|
||||
# Pageserver timeline deletion should be polled until it gets 404, so ignore it globally
|
||||
".*Error processing HTTP request: NotFound: Timeline .* was not found",
|
||||
".*took more than expected to complete.*",
|
||||
@@ -92,10 +91,6 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
|
||||
".*WARN deletion backend: calling control plane generation validation API failed.*error sending request.*",
|
||||
# Can happen when the test shuts down the storage controller while it is calling the utilization API
|
||||
".*WARN.*path=/v1/utilization .*request was dropped before completing",
|
||||
# Can happen during shutdown
|
||||
".*scheduling deletion on drop failed: queue is in state Stopped.*",
|
||||
# Can happen during shutdown
|
||||
".*ignoring failure to find gc cutoffs: timeline shutting down.*",
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -56,30 +56,20 @@ class InMemoryLayerInfo:
|
||||
class HistoricLayerInfo:
|
||||
kind: str
|
||||
layer_file_name: str
|
||||
layer_file_size: int
|
||||
layer_file_size: Optional[int]
|
||||
lsn_start: str
|
||||
lsn_end: Optional[str]
|
||||
remote: bool
|
||||
# None for image layers, true if pageserver thinks this is an L0 delta layer
|
||||
l0: Optional[bool]
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo:
|
||||
# instead of parsing the key range lets keep the definition of "L0" in pageserver
|
||||
l0_ness = d.get("l0")
|
||||
assert l0_ness is None or isinstance(l0_ness, bool)
|
||||
|
||||
size = d["layer_file_size"]
|
||||
assert isinstance(size, int)
|
||||
|
||||
return HistoricLayerInfo(
|
||||
kind=d["kind"],
|
||||
layer_file_name=d["layer_file_name"],
|
||||
layer_file_size=size,
|
||||
layer_file_size=d.get("layer_file_size"),
|
||||
lsn_start=d["lsn_start"],
|
||||
lsn_end=d.get("lsn_end"),
|
||||
remote=d["remote"],
|
||||
l0=l0_ness,
|
||||
)
|
||||
|
||||
|
||||
@@ -593,7 +583,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
timeline_id: TimelineId,
|
||||
force_repartition=False,
|
||||
force_image_layer_creation=False,
|
||||
wait_until_uploaded=False,
|
||||
):
|
||||
self.is_testing_enabled_or_skip()
|
||||
query = {}
|
||||
@@ -601,8 +590,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
query["force_repartition"] = "true"
|
||||
if force_image_layer_creation:
|
||||
query["force_image_layer_creation"] = "true"
|
||||
if wait_until_uploaded:
|
||||
query["wait_until_uploaded"] = "true"
|
||||
|
||||
log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}")
|
||||
res = self.put(
|
||||
@@ -669,7 +656,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
timeline_id: TimelineId,
|
||||
force_repartition=False,
|
||||
force_image_layer_creation=False,
|
||||
wait_until_uploaded=False,
|
||||
):
|
||||
self.is_testing_enabled_or_skip()
|
||||
query = {}
|
||||
@@ -677,8 +663,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
query["force_repartition"] = "true"
|
||||
if force_image_layer_creation:
|
||||
query["force_image_layer_creation"] = "true"
|
||||
if wait_until_uploaded:
|
||||
query["wait_until_uploaded"] = "true"
|
||||
|
||||
log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}")
|
||||
res = self.put(
|
||||
|
||||
@@ -4,13 +4,10 @@ import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import tarfile
|
||||
import threading
|
||||
import time
|
||||
from hashlib import sha256
|
||||
from pathlib import Path
|
||||
from typing import (
|
||||
IO,
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Callable,
|
||||
@@ -18,10 +15,8 @@ from typing import (
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
Set,
|
||||
Tuple,
|
||||
TypeVar,
|
||||
Union,
|
||||
)
|
||||
from urllib.parse import urlencode
|
||||
|
||||
@@ -495,68 +490,12 @@ def assert_no_errors(log_file, service, allowed_errors):
|
||||
|
||||
@enum.unique
|
||||
class AuxFileStore(str, enum.Enum):
|
||||
V1 = "v1"
|
||||
V2 = "v2"
|
||||
CrossValidation = "cross-validation"
|
||||
V1 = "V1"
|
||||
V2 = "V2"
|
||||
CrossValidation = "CrossValidation"
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"'aux-{self.value}'"
|
||||
|
||||
def __str__(self) -> str:
|
||||
return f"'aux-{self.value}'"
|
||||
|
||||
|
||||
def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: Set[str]):
|
||||
"""
|
||||
This is essentially:
|
||||
|
||||
lines=$(comm -3 \
|
||||
<(mkdir left && cd left && tar xf "$left" && find . -type f -print0 | xargs sha256sum | sort -k2) \
|
||||
<(mkdir right && cd right && tar xf "$right" && find . -type f -print0 | xargs sha256sum | sort -k2) \
|
||||
| wc -l)
|
||||
[ "$lines" = "0" ]
|
||||
|
||||
But in a more mac friendly fashion.
|
||||
"""
|
||||
started_at = time.time()
|
||||
|
||||
def hash_extracted(reader: Union[IO[bytes], None]) -> bytes:
|
||||
assert reader is not None
|
||||
digest = sha256(usedforsecurity=False)
|
||||
while True:
|
||||
buf = reader.read(64 * 1024)
|
||||
if not buf:
|
||||
break
|
||||
digest.update(buf)
|
||||
return digest.digest()
|
||||
|
||||
def build_hash_list(p: Path) -> List[Tuple[str, bytes]]:
|
||||
with tarfile.open(p) as f:
|
||||
matching_files = (info for info in f if info.isreg() and info.name not in skip_files)
|
||||
ret = list(
|
||||
map(lambda info: (info.name, hash_extracted(f.extractfile(info))), matching_files)
|
||||
)
|
||||
ret.sort(key=lambda t: t[0])
|
||||
return ret
|
||||
|
||||
left_list, right_list = map(build_hash_list, [left, right])
|
||||
|
||||
assert len(left_list) == len(
|
||||
right_list
|
||||
), f"unexpected number of files on tar files, {len(left_list)} != {len(right_list)}"
|
||||
|
||||
mismatching = set()
|
||||
|
||||
for left_tuple, right_tuple in zip(left_list, right_list):
|
||||
left_path, left_hash = left_tuple
|
||||
right_path, right_hash = right_tuple
|
||||
assert (
|
||||
left_path == right_path
|
||||
), f"file count matched, expected these to be same paths: {left_path}, {right_path}"
|
||||
if left_hash != right_hash:
|
||||
mismatching.add(left_path)
|
||||
|
||||
assert len(mismatching) == 0, f"files with hash mismatch: {mismatching}"
|
||||
|
||||
elapsed = time.time() - started_at
|
||||
log.info(f"assert_pageserver_backups_equal completed in {elapsed}s")
|
||||
|
||||
@@ -17,13 +17,9 @@ def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
|
||||
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
# eviction might be the first one after an attach to access the layers
|
||||
".*unexpectedly on-demand downloading remote layer .* for task kind Eviction",
|
||||
# detach can happen before we get to validate the generation number
|
||||
".*deletion backend: Dropped remote consistent LSN updates for tenant.*",
|
||||
]
|
||||
# eviction might be the first one after an attach to access the layers
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*unexpectedly on-demand downloading remote layer .* for task kind Eviction"
|
||||
)
|
||||
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
|
||||
return env
|
||||
@@ -166,7 +162,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
|
||||
"checkpoint_distance": 10000,
|
||||
"checkpoint_timeout": "13m",
|
||||
"compaction_algorithm": {
|
||||
"kind": "tiered",
|
||||
"kind": "Tiered",
|
||||
},
|
||||
"eviction_policy": {
|
||||
"kind": "LayerAccessThreshold",
|
||||
@@ -194,7 +190,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
|
||||
"trace_read_requests": True,
|
||||
"walreceiver_connect_timeout": "13m",
|
||||
"image_layer_creation_check_threshold": 1,
|
||||
"switch_aux_file_policy": "cross-validation",
|
||||
"switch_aux_file_policy": "CrossValidation",
|
||||
}
|
||||
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
AuxFileStore,
|
||||
NeonEnvBuilder,
|
||||
logical_replication_sync,
|
||||
)
|
||||
@@ -15,7 +14,7 @@ def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg):
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
tenant_config = client.tenant_config(tenant_id).effective_config
|
||||
tenant_config["switch_aux_file_policy"] = AuxFileStore.V2
|
||||
tenant_config["switch_aux_file_policy"] = "V2"
|
||||
client.set_tenant_config(tenant_id, tenant_config)
|
||||
# aux file v2 is enabled on the write path, so for now, it should be unset (or null)
|
||||
assert (
|
||||
@@ -50,10 +49,7 @@ def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg):
|
||||
|
||||
with env.pageserver.http_client() as client:
|
||||
# aux file v2 flag should be enabled at this point
|
||||
assert (
|
||||
client.timeline_detail(tenant_id, timeline_id)["last_aux_file_policy"]
|
||||
== AuxFileStore.V2
|
||||
)
|
||||
assert client.timeline_detail(tenant_id, timeline_id)["last_aux_file_policy"] == "V2"
|
||||
with env.pageserver.http_client() as client:
|
||||
tenant_config = client.tenant_config(tenant_id).effective_config
|
||||
tenant_config["switch_aux_file_policy"] = "V1"
|
||||
@@ -63,7 +59,7 @@ def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg):
|
||||
client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[
|
||||
"last_aux_file_policy"
|
||||
]
|
||||
== AuxFileStore.V2
|
||||
== "V2"
|
||||
)
|
||||
env.pageserver.restart()
|
||||
with env.pageserver.http_client() as client:
|
||||
@@ -72,5 +68,5 @@ def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg):
|
||||
client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[
|
||||
"last_aux_file_policy"
|
||||
]
|
||||
== AuxFileStore.V2
|
||||
== "V2"
|
||||
)
|
||||
|
||||
@@ -165,6 +165,7 @@ def test_sharding_compaction(
|
||||
image_layer_sizes[layer.layer_file_name] = layer.layer_file_size
|
||||
|
||||
# Pageserver should assert rather than emit an empty layer file, but double check here
|
||||
assert layer.layer_file_size is not None
|
||||
assert layer.layer_file_size > 0
|
||||
|
||||
shard_has_image_layers.append(len(image_layer_sizes) > 1)
|
||||
@@ -177,7 +178,7 @@ def test_sharding_compaction(
|
||||
#
|
||||
# We only do this check with tiny stripes, because large stripes may not give all shards enough
|
||||
# data to have statistically significant image layers
|
||||
avg_size = sum(v for v in image_layer_sizes.values()) / len(image_layer_sizes)
|
||||
avg_size = sum(v for v in image_layer_sizes.values()) / len(image_layer_sizes) # type: ignore
|
||||
log.info(f"Shard {shard_id} average image layer size: {avg_size}")
|
||||
assert avg_size > compaction_target_size / 2
|
||||
|
||||
@@ -194,8 +195,8 @@ def test_sharding_compaction(
|
||||
|
||||
|
||||
class CompactionAlgorithm(str, enum.Enum):
|
||||
LEGACY = "legacy"
|
||||
TIERED = "tiered"
|
||||
LEGACY = "Legacy"
|
||||
TIERED = "Tiered"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from fixtures.common_types import Lsn
|
||||
from fixtures.common_types import Lsn, TimelineId
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnvBuilder,
|
||||
@@ -19,16 +19,17 @@ def test_fullbackup(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
pg_bin: PgBin,
|
||||
port_distributor: PortDistributor,
|
||||
pg_distrib_dir: Path,
|
||||
test_output_dir: Path,
|
||||
):
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# endpoint needs to be alive until the fullbackup so that we have
|
||||
# prev_record_lsn for the vanilla_pg to start in read-write mode
|
||||
# for some reason this does not happen if endpoint is shutdown.
|
||||
endpoint_main = env.endpoints.create_start("main")
|
||||
env.neon_cli.create_branch("test_fullbackup")
|
||||
endpoint_main = env.endpoints.create_start("test_fullbackup")
|
||||
|
||||
with endpoint_main.cursor() as cur:
|
||||
timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id"))
|
||||
|
||||
# data loading may take a while, so increase statement timeout
|
||||
cur.execute("SET statement_timeout='300s'")
|
||||
cur.execute(
|
||||
@@ -40,13 +41,17 @@ def test_fullbackup(
|
||||
lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()"))
|
||||
log.info(f"start_backup_lsn = {lsn}")
|
||||
|
||||
# Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq.
|
||||
# PgBin sets it automatically, but here we need to pipe psql output to the tar command.
|
||||
psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")}
|
||||
|
||||
# Get and unpack fullbackup from pageserver
|
||||
restored_dir_path = env.repo_dir / "restored_datadir"
|
||||
os.mkdir(restored_dir_path, 0o750)
|
||||
query = f"fullbackup {env.initial_tenant} {timeline} {lsn}"
|
||||
tar_output_file = test_output_dir / "fullbackup.tar"
|
||||
pg_bin.take_fullbackup(
|
||||
env.pageserver, env.initial_tenant, env.initial_timeline, lsn, tar_output_file
|
||||
)
|
||||
cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query, "-o", str(tar_output_file)]
|
||||
pg_bin.run_capture(cmd, env=psql_env)
|
||||
subprocess_capture(
|
||||
env.repo_dir, ["tar", "-xf", str(tar_output_file), "-C", str(restored_dir_path)]
|
||||
)
|
||||
@@ -56,7 +61,7 @@ def test_fullbackup(
|
||||
# use resetwal to overwrite it
|
||||
pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal")
|
||||
cmd = [pg_resetwal_path, "-D", str(restored_dir_path)]
|
||||
pg_bin.run_capture(cmd)
|
||||
pg_bin.run_capture(cmd, env=psql_env)
|
||||
|
||||
# Restore from the backup and find the data we inserted
|
||||
port = port_distributor.get_port()
|
||||
|
||||
@@ -1,21 +1,9 @@
|
||||
import asyncio
|
||||
import os
|
||||
import re
|
||||
import threading
|
||||
import time
|
||||
from functools import partial
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnv,
|
||||
NeonEnvBuilder,
|
||||
PgBin,
|
||||
log_replica_lag,
|
||||
tenant_get_shards,
|
||||
wait_replica_caughtup,
|
||||
)
|
||||
from fixtures.utils import wait_until
|
||||
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, tenant_get_shards, wait_replica_caughtup
|
||||
|
||||
|
||||
# Check for corrupted WAL messages which might otherwise go unnoticed if
|
||||
@@ -116,28 +104,19 @@ def test_2_replicas_start(neon_simple_env: NeonEnv):
|
||||
wait_replica_caughtup(primary, secondary2)
|
||||
|
||||
|
||||
# Test two different scenarios related to gc of data needed by hot standby.
|
||||
# We had an issue that a standby server made GetPage requests with an
|
||||
# old LSN, based on the last-written LSN cache, to avoid waits in the
|
||||
# pageserver. However, requesting a page with a very old LSN, such
|
||||
# that the GC horizon has already advanced past it, results in an
|
||||
# error from the pageserver:
|
||||
# "Bad request: tried to request a page version that was garbage collected"
|
||||
#
|
||||
# When pause_apply is False, standby is mostly caught up with the primary.
|
||||
# However, in compute <-> pageserver protocol version 1 only one LSN had been
|
||||
# sent to the pageserver in page request, and to avoid waits in the pageserver
|
||||
# it was last-written LSN cache value. If page hasn't been updated for a long
|
||||
# time that resulted in an error from the pageserver: "Bad request: tried to
|
||||
# request a page version that was garbage collected". For primary this wasn't a
|
||||
# problem because pageserver always bumped LSN to the newest one; for standy
|
||||
# that would be incorrect since we might get page fresher then apply LSN. Hence,
|
||||
# in protocol version v2 two LSNs were introduced: main request_lsn (apply LSN
|
||||
# in case of standby) and not_modified_since which could be used as an
|
||||
# optimization to avoid waiting.
|
||||
# To avoid that, the compute<-> pageserver protocol was updated so
|
||||
# that that the standby now sends two LSNs, the old last-written LSN
|
||||
# and the current replay LSN.
|
||||
#
|
||||
# https://github.com/neondatabase/neon/issues/6211
|
||||
#
|
||||
# When pause_apply is True we model standby lagging behind primary (e.g. due to
|
||||
# high max_standby_streaming_delay). To prevent pageserver from removing data
|
||||
# still needed by the standby apply LSN is propagated in standby -> safekeepers
|
||||
# -> broker -> pageserver flow so that pageserver could hold off gc for it.
|
||||
@pytest.mark.parametrize("pause_apply", [False, True])
|
||||
def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
|
||||
def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder):
|
||||
tenant_conf = {
|
||||
# set PITR interval to be small, so we can do GC
|
||||
"pitr_interval": "0 s",
|
||||
@@ -181,9 +160,6 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
|
||||
# so we still remember the LSNs of the pages.
|
||||
s_cur.execute("SELECT clear_buffer_cache()")
|
||||
|
||||
if pause_apply:
|
||||
s_cur.execute("SELECT pg_wal_replay_pause()")
|
||||
|
||||
# Do other stuff on the primary, to advance the WAL
|
||||
p_cur.execute("CREATE TABLE test2 AS SELECT generate_series(1, 1000000) AS g")
|
||||
|
||||
@@ -200,155 +176,6 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
|
||||
# generates use old not_modified_since LSNs, older than
|
||||
# the GC cutoff, but new request LSNs. (In protocol
|
||||
# version 1 there was only one LSN, and this failed.)
|
||||
log_replica_lag(primary, secondary)
|
||||
s_cur.execute("SELECT COUNT(*) FROM test")
|
||||
log_replica_lag(primary, secondary)
|
||||
res = s_cur.fetchone()
|
||||
assert res[0] == 10000
|
||||
|
||||
|
||||
def run_pgbench(connstr: str, pg_bin: PgBin):
|
||||
log.info(f"Start a pgbench workload on pg {connstr}")
|
||||
# s10 is about 150MB of data. In debug mode init takes about 15s on SSD.
|
||||
pg_bin.run_capture(["pgbench", "-i", "-s10", connstr])
|
||||
log.info("pgbench init done")
|
||||
pg_bin.run_capture(["pgbench", "-T60", connstr])
|
||||
|
||||
|
||||
# assert that pgbench_accounts and its index are created.
|
||||
def pgbench_accounts_initialized(ep):
|
||||
ep.safe_psql_scalar("select 'pgbench_accounts_pkey'::regclass")
|
||||
|
||||
|
||||
# Test that hot_standby_feedback works in neon (it is forwarded through
|
||||
# safekeepers). That is, ensure queries on standby don't fail during load on
|
||||
# primary under the following conditions:
|
||||
# - pgbench bombards primary with updates.
|
||||
# - On the secondary we run long select of the updated table.
|
||||
# - Set small max_standby_streaming_delay: hs feedback should prevent conflicts
|
||||
# so apply doesn't need to wait.
|
||||
# - Do agressive vacuum on primary which still shouldn't create conflicts.
|
||||
# Actually this appears to be redundant due to microvacuum existence.
|
||||
#
|
||||
# Without hs feedback enabled we'd see 'User query might have needed to see row
|
||||
# versions that must be removed.' errors.
|
||||
def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
|
||||
env = neon_env_builder.init_start()
|
||||
agressive_vacuum_conf = [
|
||||
"log_autovacuum_min_duration = 0",
|
||||
"autovacuum_naptime = 10s",
|
||||
"autovacuum_vacuum_threshold = 25",
|
||||
"autovacuum_vacuum_scale_factor = 0.1",
|
||||
"autovacuum_vacuum_cost_delay = -1",
|
||||
]
|
||||
with env.endpoints.create_start(
|
||||
branch_name="main", endpoint_id="primary", config_lines=agressive_vacuum_conf
|
||||
) as primary:
|
||||
# It would be great to have more strict max_standby_streaming_delay=0s here, but then sometimes it fails with
|
||||
# 'User was holding shared buffer pin for too long.'.
|
||||
with env.endpoints.new_replica_start(
|
||||
origin=primary,
|
||||
endpoint_id="secondary",
|
||||
config_lines=[
|
||||
"max_standby_streaming_delay=2s",
|
||||
"neon.protocol_version=2",
|
||||
"hot_standby_feedback=true",
|
||||
],
|
||||
) as secondary:
|
||||
log.info(
|
||||
f"primary connstr is {primary.connstr()}, secondary connstr {secondary.connstr()}"
|
||||
)
|
||||
t = threading.Thread(target=run_pgbench, args=(primary.connstr(), pg_bin))
|
||||
t.start()
|
||||
# Wait until pgbench_accounts is created + filled on replica *and*
|
||||
# index is created. Otherwise index creation would conflict with
|
||||
# read queries and hs feedback won't save us.
|
||||
wait_until(60, 1.0, partial(pgbench_accounts_initialized, secondary))
|
||||
|
||||
# Test should fail if hs feedback is disabled anyway, but cross
|
||||
# check that walproposer sets some xmin.
|
||||
def xmin_is_not_null():
|
||||
slot_xmin = primary.safe_psql_scalar(
|
||||
"select xmin from pg_replication_slots where slot_name = 'wal_proposer_slot'",
|
||||
log_query=False,
|
||||
)
|
||||
log.info(f"xmin is {slot_xmin}")
|
||||
assert int(slot_xmin) > 0
|
||||
|
||||
wait_until(10, 1.0, xmin_is_not_null)
|
||||
for _ in range(1, 5):
|
||||
# in debug mode takes about 5-7s
|
||||
balance = secondary.safe_psql_scalar("select sum(abalance) from pgbench_accounts")
|
||||
log.info(f"balance={balance}")
|
||||
log_replica_lag(primary, secondary)
|
||||
t.join()
|
||||
|
||||
# check xmin is reset when standby is gone
|
||||
def xmin_is_null():
|
||||
slot_xmin = primary.safe_psql_scalar(
|
||||
"select xmin from pg_replication_slots where slot_name = 'wal_proposer_slot'",
|
||||
log_query=False,
|
||||
)
|
||||
log.info(f"xmin is {slot_xmin}")
|
||||
assert slot_xmin is None
|
||||
|
||||
wait_until(10, 1.0, xmin_is_null)
|
||||
|
||||
|
||||
# Test race condition between WAL replay and backends performing queries
|
||||
# https://github.com/neondatabase/neon/issues/7791
|
||||
def test_replica_query_race(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
|
||||
primary_ep = env.endpoints.create_start(
|
||||
branch_name="main",
|
||||
endpoint_id="primary",
|
||||
)
|
||||
|
||||
with primary_ep.connect() as p_con:
|
||||
with p_con.cursor() as p_cur:
|
||||
p_cur.execute("CREATE EXTENSION neon_test_utils")
|
||||
p_cur.execute("CREATE TABLE test AS SELECT 0 AS counter")
|
||||
|
||||
standby_ep = env.endpoints.new_replica_start(origin=primary_ep, endpoint_id="standby")
|
||||
time.sleep(1)
|
||||
|
||||
# In primary, run a lot of UPDATEs on a single page
|
||||
finished = False
|
||||
writecounter = 1
|
||||
|
||||
async def primary_workload():
|
||||
nonlocal writecounter, finished
|
||||
conn = await primary_ep.connect_async()
|
||||
while writecounter < 10000:
|
||||
writecounter += 1
|
||||
await conn.execute(f"UPDATE test SET counter = {writecounter}")
|
||||
finished = True
|
||||
|
||||
# In standby, at the same time, run queries on it. And repeatedly drop caches
|
||||
async def standby_workload():
|
||||
nonlocal writecounter, finished
|
||||
conn = await standby_ep.connect_async()
|
||||
reads = 0
|
||||
while not finished:
|
||||
readcounter = await conn.fetchval("SELECT counter FROM test")
|
||||
|
||||
# Check that the replica is keeping up with the primary. In local
|
||||
# testing, the lag between primary and standby is much smaller, in
|
||||
# the ballpark of 2-3 counter values. But be generous in case there's
|
||||
# some hiccup.
|
||||
# assert(writecounter - readcounter < 1000)
|
||||
assert readcounter <= writecounter
|
||||
if reads % 100 == 0:
|
||||
log.info(f"read {reads}: counter {readcounter}, last update {writecounter}")
|
||||
reads += 1
|
||||
|
||||
await conn.execute("SELECT clear_buffer_cache()")
|
||||
|
||||
async def both():
|
||||
await asyncio.gather(
|
||||
primary_workload(),
|
||||
standby_workload(),
|
||||
)
|
||||
|
||||
asyncio.run(both())
|
||||
|
||||
@@ -21,7 +21,7 @@ from fixtures.pageserver.utils import (
|
||||
wait_for_upload,
|
||||
)
|
||||
from fixtures.remote_storage import RemoteStorageKind
|
||||
from fixtures.utils import assert_pageserver_backups_equal, subprocess_capture
|
||||
from fixtures.utils import subprocess_capture
|
||||
|
||||
|
||||
def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_builder):
|
||||
@@ -163,7 +163,7 @@ def test_import_from_pageserver_small(
|
||||
|
||||
num_rows = 3000
|
||||
lsn = _generate_data(num_rows, endpoint)
|
||||
_import(num_rows, lsn, env, pg_bin, timeline, test_output_dir)
|
||||
_import(num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir, test_output_dir)
|
||||
|
||||
|
||||
@pytest.mark.timeout(1800)
|
||||
@@ -193,7 +193,9 @@ def test_import_from_pageserver_multisegment(
|
||||
log.info(f"timeline logical size = {logical_size / (1024 ** 2)}MB")
|
||||
assert logical_size > 1024**3 # = 1GB
|
||||
|
||||
tar_output_file = _import(num_rows, lsn, env, pg_bin, timeline, test_output_dir)
|
||||
tar_output_file = _import(
|
||||
num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir, test_output_dir
|
||||
)
|
||||
|
||||
# Check if the backup data contains multiple segment files
|
||||
cnt_seg_files = 0
|
||||
@@ -233,6 +235,7 @@ def _import(
|
||||
env: NeonEnv,
|
||||
pg_bin: PgBin,
|
||||
timeline: TimelineId,
|
||||
pg_distrib_dir: Path,
|
||||
test_output_dir: Path,
|
||||
) -> Path:
|
||||
"""Test importing backup data to the pageserver.
|
||||
@@ -245,9 +248,15 @@ def _import(
|
||||
path to the backup archive file"""
|
||||
log.info(f"start_backup_lsn = {lsn}")
|
||||
|
||||
# Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq.
|
||||
# PgBin sets it automatically, but here we need to pipe psql output to the tar command.
|
||||
psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")}
|
||||
|
||||
# Get a fullbackup from pageserver
|
||||
query = f"fullbackup { env.initial_tenant} {timeline} {lsn}"
|
||||
tar_output_file = test_output_dir / "fullbackup.tar"
|
||||
pg_bin.take_fullbackup(env.pageserver, env.initial_tenant, timeline, lsn, tar_output_file)
|
||||
cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query, "-o", str(tar_output_file)]
|
||||
pg_bin.run_capture(cmd, env=psql_env)
|
||||
|
||||
# Stop the first pageserver instance, erase all its data
|
||||
env.endpoints.stop_all()
|
||||
@@ -292,15 +301,26 @@ def _import(
|
||||
wait_for_upload(client, tenant, timeline, lsn)
|
||||
|
||||
# Check it worked
|
||||
endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant, lsn=lsn)
|
||||
endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant)
|
||||
assert endpoint.safe_psql("select count(*) from tbl") == [(expected_num_rows,)]
|
||||
|
||||
# Take another fullbackup
|
||||
query = f"fullbackup { tenant} {timeline} {lsn}"
|
||||
new_tar_output_file = test_output_dir / "fullbackup-new.tar"
|
||||
pg_bin.take_fullbackup(env.pageserver, tenant, timeline, lsn, new_tar_output_file)
|
||||
cmd = [
|
||||
"psql",
|
||||
"--no-psqlrc",
|
||||
env.pageserver.connstr(),
|
||||
"-c",
|
||||
query,
|
||||
"-o",
|
||||
str(new_tar_output_file),
|
||||
]
|
||||
pg_bin.run_capture(cmd, env=psql_env)
|
||||
|
||||
# Check it's the same as the first fullbackup
|
||||
assert_pageserver_backups_equal(tar_output_file, new_tar_output_file, set())
|
||||
# TODO pageserver should be checking checksum
|
||||
assert os.path.getsize(tar_output_file) == os.path.getsize(new_tar_output_file)
|
||||
|
||||
# Check that gc works
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
@@ -272,14 +272,14 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
|
||||
resident_physical_size_metric == 0
|
||||
), "ensure that resident_physical_size metric is zero"
|
||||
assert resident_physical_size_metric == sum(
|
||||
layer.layer_file_size for layer in info.historic_layers if not layer.remote
|
||||
layer.layer_file_size or 0 for layer in info.historic_layers if not layer.remote
|
||||
), "ensure that resident_physical_size metric corresponds to layer map dump"
|
||||
|
||||
remote_physical_size_metric = ps_http.get_timeline_metric(
|
||||
tenant_id, timeline_id, "pageserver_remote_physical_size"
|
||||
)
|
||||
assert remote_physical_size_metric == sum(
|
||||
layer.layer_file_size for layer in info.historic_layers if layer.remote
|
||||
layer.layer_file_size or 0 for layer in info.historic_layers if layer.remote
|
||||
), "ensure that remote_physical_size metric corresponds to layer map dump"
|
||||
|
||||
log.info("before runnning GC, ensure that remote_physical size is zero")
|
||||
|
||||
@@ -5,7 +5,7 @@ from pathlib import Path
|
||||
|
||||
from fixtures.common_types import Lsn, TenantId, TimelineId
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_wal_insert_lsn
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_wal_insert_lsn
|
||||
from fixtures.pageserver.utils import (
|
||||
wait_for_last_record_lsn,
|
||||
)
|
||||
@@ -71,17 +71,22 @@ def test_next_xid(neon_env_builder: NeonEnvBuilder):
|
||||
def test_import_at_2bil(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
test_output_dir: Path,
|
||||
pg_bin: PgBin,
|
||||
pg_distrib_dir: Path,
|
||||
pg_bin,
|
||||
vanilla_pg,
|
||||
):
|
||||
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
|
||||
env = neon_env_builder.init_start()
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
# Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq.
|
||||
# PgBin sets it automatically, but here we need to pipe psql output to the tar command.
|
||||
psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")}
|
||||
|
||||
# Reset the vanilla Postgres instance to somewhat before 2 billion transactions.
|
||||
pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal")
|
||||
cmd = [pg_resetwal_path, "--next-transaction-id=2129920000", "-D", str(vanilla_pg.pgdatadir)]
|
||||
pg_bin.run_capture(cmd)
|
||||
pg_bin.run_capture(cmd, env=psql_env)
|
||||
|
||||
vanilla_pg.start()
|
||||
vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser")
|
||||
|
||||
@@ -540,6 +540,7 @@ def test_compaction_downloads_on_demand_without_image_creation(neon_env_builder:
|
||||
|
||||
for layer in layers.historic_layers:
|
||||
log.info(f"pre-compact: {layer}")
|
||||
assert layer.layer_file_size is not None, "we must know layer file sizes"
|
||||
layer_sizes += layer.layer_file_size
|
||||
pageserver_http.evict_layer(tenant_id, timeline_id, layer.layer_file_name)
|
||||
|
||||
|
||||
@@ -287,7 +287,7 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder):
|
||||
total_historic_bytes += sum(
|
||||
layer.layer_file_size
|
||||
for layer in layer_map.historic_layers
|
||||
if Lsn(layer.lsn_start) > initdb_lsn
|
||||
if layer.layer_file_size is not None and Lsn(layer.lsn_start) > initdb_lsn
|
||||
)
|
||||
total_ephemeral_layers += len(layer_map.in_memory_layers)
|
||||
|
||||
|
||||
@@ -575,10 +575,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
|
||||
tenant_timelines = {}
|
||||
|
||||
# This mirrors a constant in `downloader.rs`
|
||||
default_download_period_secs = 60
|
||||
|
||||
# The upload period, which will also be the download once the secondary has seen its first heatmap
|
||||
upload_period_secs = 30
|
||||
freshen_interval_secs = 60
|
||||
|
||||
for _i in range(0, tenant_count):
|
||||
tenant_id = TenantId.generate()
|
||||
@@ -590,32 +587,17 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
|
||||
placement_policy='{"Attached":1}',
|
||||
# Run with a low heatmap period so that we can avoid having to do synthetic API calls
|
||||
# to trigger the upload promptly.
|
||||
conf={"heatmap_period": f"{upload_period_secs}s"},
|
||||
conf={"heatmap_period": "1s"},
|
||||
)
|
||||
env.neon_cli.create_timeline("main2", tenant_id, timeline_b)
|
||||
|
||||
tenant_timelines[tenant_id] = [timeline_a, timeline_b]
|
||||
|
||||
def await_log(pageserver, deadline, expression):
|
||||
"""
|
||||
Wrapper around assert_log_contains that waits with a deadline rather than timeout
|
||||
"""
|
||||
now = time.time()
|
||||
if now > deadline:
|
||||
raise RuntimeError(f"Timed out waiting for {expression}")
|
||||
else:
|
||||
timeout = int(deadline - now) + 1
|
||||
try:
|
||||
wait_until(timeout, 1, lambda: pageserver.assert_log_contains(expression)) # type: ignore
|
||||
except:
|
||||
log.error(f"Timed out waiting for '{expression}'")
|
||||
raise
|
||||
|
||||
t_start = time.time()
|
||||
|
||||
# Wait long enough that the background downloads should happen; we expect all the inital layers
|
||||
# of all the initial timelines to show up on the secondary location of each tenant.
|
||||
initial_download_deadline = time.time() + default_download_period_secs * 3
|
||||
time.sleep(freshen_interval_secs * 1.5)
|
||||
|
||||
for tenant_id, timelines in tenant_timelines.items():
|
||||
attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
|
||||
@@ -623,32 +605,16 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
|
||||
# We only have two: the other one must be secondary
|
||||
ps_secondary = next(p for p in env.pageservers if p != ps_attached)
|
||||
|
||||
now = time.time()
|
||||
if now > initial_download_deadline:
|
||||
raise RuntimeError("Timed out waiting for initial secondary download")
|
||||
else:
|
||||
for timeline_id in timelines:
|
||||
log.info(
|
||||
f"Waiting for downloads of timeline {timeline_id} on secondary pageserver {ps_secondary.id}"
|
||||
)
|
||||
await_log(
|
||||
ps_secondary,
|
||||
initial_download_deadline,
|
||||
f".*{timeline_id}.*Wrote timeline_detail.*",
|
||||
)
|
||||
|
||||
for timeline_id in timelines:
|
||||
log.info(
|
||||
f"Checking for secondary timeline downloads {timeline_id} on node {ps_secondary.id}"
|
||||
)
|
||||
log.info(f"Checking for secondary timeline {timeline_id} on node {ps_secondary.id}")
|
||||
# One or more layers should be present for all timelines
|
||||
assert ps_secondary.list_layers(tenant_id, timeline_id)
|
||||
|
||||
# Delete the second timeline: this should be reflected later on the secondary
|
||||
env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1])
|
||||
|
||||
# Wait long enough for the secondary locations to see the deletion: 2x period plus a grace factor
|
||||
deletion_deadline = time.time() + upload_period_secs * 3
|
||||
# Wait long enough for the secondary locations to see the deletion
|
||||
time.sleep(freshen_interval_secs * 1.5)
|
||||
|
||||
for tenant_id, timelines in tenant_timelines.items():
|
||||
attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
|
||||
@@ -656,24 +622,11 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
|
||||
# We only have two: the other one must be secondary
|
||||
ps_secondary = next(p for p in env.pageservers if p != ps_attached)
|
||||
|
||||
expect_del_timeline = timelines[1]
|
||||
log.info(
|
||||
f"Waiting for deletion of timeline {expect_del_timeline} on secondary pageserver {ps_secondary.id}"
|
||||
)
|
||||
await_log(
|
||||
ps_secondary,
|
||||
deletion_deadline,
|
||||
f".*Timeline no longer in heatmap.*{expect_del_timeline}.*",
|
||||
)
|
||||
|
||||
# This one was not deleted
|
||||
assert ps_secondary.list_layers(tenant_id, timelines[0])
|
||||
|
||||
# This one was deleted
|
||||
log.info(
|
||||
f"Checking for secondary timeline deletion {tenant_id}/{timeline_id} on node {ps_secondary.id}"
|
||||
)
|
||||
assert not ps_secondary.list_layers(tenant_id, expect_del_timeline)
|
||||
assert not ps_secondary.list_layers(tenant_id, timelines[1])
|
||||
|
||||
t_end = time.time()
|
||||
|
||||
@@ -687,7 +640,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
download_rate = (total_heatmap_downloads / tenant_count) / (t_end - t_start)
|
||||
|
||||
expect_download_rate = 1.0 / upload_period_secs
|
||||
expect_download_rate = 1.0 / freshen_interval_secs
|
||||
log.info(f"Download rate: {download_rate * 60}/min vs expected {expect_download_rate * 60}/min")
|
||||
|
||||
assert download_rate < expect_download_rate * 2
|
||||
|
||||
@@ -1,25 +1,16 @@
|
||||
#
|
||||
# This file runs pg_regress-based tests.
|
||||
#
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, cast
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnvBuilder,
|
||||
check_restored_datadir_content,
|
||||
)
|
||||
from fixtures.pg_version import PgVersion
|
||||
from fixtures.remote_storage import s3_storage
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import Optional
|
||||
|
||||
from fixtures.neon_fixtures import PgBin
|
||||
from pytest import CaptureFixture
|
||||
|
||||
|
||||
# Run the main PostgreSQL regression tests, in src/test/regress.
|
||||
#
|
||||
@@ -28,14 +19,12 @@ def test_pg_regress(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
test_output_dir: Path,
|
||||
build_type: str,
|
||||
pg_bin: PgBin,
|
||||
capsys: CaptureFixture[str],
|
||||
pg_bin,
|
||||
capsys,
|
||||
base_dir: Path,
|
||||
pg_distrib_dir: Path,
|
||||
shard_count: Optional[int],
|
||||
):
|
||||
DBNAME = "regression"
|
||||
|
||||
"""
|
||||
:param shard_count: if None, create an unsharded tenant. Otherwise create a tenant with this
|
||||
many shards.
|
||||
@@ -53,7 +42,7 @@ def test_pg_regress(
|
||||
|
||||
# Connect to postgres and create a database called "regression".
|
||||
endpoint = env.endpoints.create_start("main")
|
||||
endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")
|
||||
endpoint.safe_psql("CREATE DATABASE regression")
|
||||
|
||||
# Create some local directories for pg_regress to run in.
|
||||
runpath = test_output_dir / "regress"
|
||||
@@ -88,67 +77,7 @@ def test_pg_regress(
|
||||
with capsys.disabled():
|
||||
pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)
|
||||
|
||||
ignored_files: Optional[list[str]] = None
|
||||
|
||||
# Neon handles unlogged relations in a special manner. During a
|
||||
# basebackup, we ship the init fork as the main fork. This presents a
|
||||
# problem in that the endpoint's data directory and the basebackup will
|
||||
# have differences and will fail the eventual file comparison.
|
||||
#
|
||||
# Unlogged tables were introduced in version 9.1. ALTER TABLE grew
|
||||
# support for setting the persistence of a table in 9.5. The reason that
|
||||
# this doesn't affect versions < 15 (but probably would between 9.1 and
|
||||
# 9.5) is that all the regression tests that deal with unlogged tables
|
||||
# up until that point dropped the unlogged tables or set them to logged
|
||||
# at some point during the test.
|
||||
#
|
||||
# In version 15, Postgres grew support for unlogged sequences, and with
|
||||
# that came a few more regression tests. These tests did not all drop
|
||||
# the unlogged tables/sequences prior to finishing.
|
||||
#
|
||||
# But unlogged sequences came with a bug in that, sequences didn't
|
||||
# inherit the persistence of their "parent" tables if they had one. This
|
||||
# was fixed and backported to 15, thus exacerbating our problem a bit.
|
||||
#
|
||||
# So what we can do is just ignore file differences between the data
|
||||
# directory and basebackup for unlogged relations.
|
||||
results = cast(
|
||||
"list[tuple[str, str]]",
|
||||
endpoint.safe_psql(
|
||||
"""
|
||||
SELECT
|
||||
relkind,
|
||||
pg_relation_filepath(
|
||||
pg_filenode_relation(reltablespace, relfilenode)
|
||||
) AS unlogged_relation_paths
|
||||
FROM pg_class
|
||||
WHERE relpersistence = 'u'
|
||||
""",
|
||||
dbname=DBNAME,
|
||||
),
|
||||
)
|
||||
|
||||
unlogged_relation_files: list[str] = []
|
||||
for r in results:
|
||||
unlogged_relation_files.append(r[1])
|
||||
# This is related to the following Postgres commit:
|
||||
#
|
||||
# commit ccadf73163ca88bdaa74b8223d4dde05d17f550b
|
||||
# Author: Heikki Linnakangas <heikki.linnakangas@iki.fi>
|
||||
# Date: 2023-08-23 09:21:31 -0500
|
||||
#
|
||||
# Use the buffer cache when initializing an unlogged index.
|
||||
#
|
||||
# This patch was backpatched to 16. Without it, the LSN in the
|
||||
# page header would be 0/0 in the data directory, which wouldn't
|
||||
# match the LSN generated during the basebackup, thus creating
|
||||
# a difference.
|
||||
if env.pg_version <= PgVersion.V15 and r[0] == "i":
|
||||
unlogged_relation_files.append(f"{r[1]}_init")
|
||||
|
||||
ignored_files = unlogged_relation_files
|
||||
|
||||
check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files)
|
||||
check_restored_datadir_content(test_output_dir, env, endpoint)
|
||||
|
||||
|
||||
# Run the PostgreSQL "isolation" tests, in src/test/isolation.
|
||||
@@ -157,8 +86,8 @@ def test_pg_regress(
|
||||
def test_isolation(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
test_output_dir: Path,
|
||||
pg_bin: PgBin,
|
||||
capsys: CaptureFixture[str],
|
||||
pg_bin,
|
||||
capsys,
|
||||
base_dir: Path,
|
||||
pg_distrib_dir: Path,
|
||||
shard_count: Optional[int],
|
||||
@@ -213,8 +142,8 @@ def test_isolation(
|
||||
def test_sql_regress(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
test_output_dir: Path,
|
||||
pg_bin: PgBin,
|
||||
capsys: CaptureFixture[str],
|
||||
pg_bin,
|
||||
capsys,
|
||||
base_dir: Path,
|
||||
pg_distrib_dir: Path,
|
||||
shard_count: Optional[int],
|
||||
|
||||
@@ -632,6 +632,7 @@ def test_sharding_ingest_layer_sizes(
|
||||
historic_layers = sorted(layer_map.historic_layers, key=lambda layer: layer.lsn_start)
|
||||
|
||||
for layer in historic_layers:
|
||||
assert layer.layer_file_size is not None
|
||||
if layer.layer_file_size < expect_layer_size // 2:
|
||||
classification = "Small"
|
||||
small_layer_count += 1
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user