Compare commits


1 Commit

Author SHA1 Message Date
Heikki Linnakangas
8dc1c48605 Fix race condition in XLogWaitForReplayOf()
ConditionVariablePrepareToSleep has this comment on it:

> * Caution: "before entering the loop" means you *must* test the exit
> * condition between calling ConditionVariablePrepareToSleep and calling
> * ConditionVariableSleep.  If that is inconvenient, omit calling
> * ConditionVariablePrepareToSleep.

We were not obeying that: we did not test the exit condition correctly
between the ConditionVariablePrepareToSleep and ConditionVariableSleep
calls, because the test that we had in between them only checked the
local 'replayRecPtr' variable, without updating it from shared memory.

That wasn't too serious, because the loop includes a 10-second
timeout and would retry and succeed if the original update was
missed. Still, better to fix it.

To fix, just remove the ConditionVariablePrepareToSleep() call. As the
comment says, that's also correct, and even more efficient if we
assume that sleeping is rare.
2024-05-19 21:07:29 +03:00
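For context on the protocol the commit message quotes: between ConditionVariablePrepareToSleep and ConditionVariableSleep, the exit condition must be re-read from shared state on every iteration; testing a stale local copy is exactly the bug described. A minimal sketch of the correct wait loop, written against Rust's std::sync::Condvar rather than PostgreSQL's C API (all names and values here are illustrative):

use std::sync::{Arc, Condvar, Mutex};
use std::thread;

fn main() {
    // Stand-in for the shared replay pointer the commit talks about.
    let state = Arc::new((Mutex::new(0u64), Condvar::new()));
    let target_lsn = 42u64;

    let waiter = {
        let state = Arc::clone(&state);
        thread::spawn(move || {
            let (lock, cvar) = &*state;
            let mut replayed = lock.lock().unwrap();
            // Re-test the exit condition against shared state each time we
            // wake, never against a local snapshot taken before registering
            // as a waiter.
            while *replayed < target_lsn {
                replayed = cvar.wait(replayed).unwrap();
            }
        })
    };

    let (lock, cvar) = &*state;
    *lock.lock().unwrap() = target_lsn;
    cvar.notify_all();
    waiter.join().unwrap();
}

Rust's Condvar::wait re-acquires the lock before returning, so the loop condition is always evaluated against current shared state; PostgreSQL's API leaves that responsibility with the caller, which is why the fix drops the prepare step and lets ConditionVariableSleep handle registration itself.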
109 changed files with 1543 additions and 4525 deletions

View File

@@ -17,7 +17,6 @@
!libs/
!neon_local/
!pageserver/
!patches/
!pgxn/
!proxy/
!s3_scrubber/

View File

@@ -3,13 +3,13 @@ description: 'Create Branch using API'
inputs:
api_key:
description: 'Neon API key'
desctiption: 'Neon API key'
required: true
project_id:
description: 'ID of the Project to create Branch in'
desctiption: 'ID of the Project to create Branch in'
required: true
api_host:
description: 'Neon API host'
desctiption: 'Neon API host'
default: console-stage.neon.build
outputs:
dsn:

View File

@@ -3,16 +3,16 @@ description: 'Delete Branch using API'
inputs:
api_key:
description: 'Neon API key'
desctiption: 'Neon API key'
required: true
project_id:
description: 'ID of the Project which should be deleted'
desctiption: 'ID of the Project which should be deleted'
required: true
branch_id:
description: 'ID of the branch to delete'
desctiption: 'ID of the branch to delete'
required: true
api_host:
description: 'Neon API host'
desctiption: 'Neon API host'
default: console-stage.neon.build
runs:

View File

@@ -3,22 +3,22 @@ description: 'Create Neon Project using API'
inputs:
api_key:
description: 'Neon API key'
desctiption: 'Neon API key'
required: true
region_id:
description: 'Region ID, if not set the project will be created in the default region'
desctiption: 'Region ID, if not set the project will be created in the default region'
default: aws-us-east-2
postgres_version:
description: 'Postgres version; default is 15'
default: '15'
desctiption: 'Postgres version; default is 15'
default: 15
api_host:
description: 'Neon API host'
desctiption: 'Neon API host'
default: console-stage.neon.build
provisioner:
description: 'k8s-pod or k8s-neonvm'
desctiption: 'k8s-pod or k8s-neonvm'
default: 'k8s-pod'
compute_units:
description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
desctiption: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
default: '[1, 1]'
outputs:

View File

@@ -3,13 +3,13 @@ description: 'Delete Neon Project using API'
inputs:
api_key:
description: 'Neon API key'
desctiption: 'Neon API key'
required: true
project_id:
description: 'ID of the Project to delete'
desctiption: 'ID of the Project to delete'
required: true
api_host:
description: 'Neon API host'
desctiption: 'Neon API host'
default: console-stage.neon.build
runs:

View File

@@ -548,7 +548,7 @@ jobs:
report-benchmarks-failures:
needs: [ benchmarks, create-test-report ]
if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure'
if: github.ref_name == 'main' && needs.benchmarks.result == 'failure'
runs-on: ubuntu-latest
steps:
@@ -723,13 +723,9 @@ jobs:
uses: ./.github/workflows/trigger-e2e-tests.yml
secrets: inherit
neon-image-arch:
neon-image:
needs: [ check-permissions, build-build-tools-image, tag ]
strategy:
matrix:
arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
runs-on: [ self-hosted, gen3, large ]
steps:
- name: Checkout
@@ -751,6 +747,12 @@ jobs:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- uses: docker/login-action@v3
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- uses: docker/build-push-action@v5
with:
context: .
@@ -762,52 +764,25 @@ jobs:
push: true
pull: true
file: Dockerfile
cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }}
cache-to: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }},mode=max
cache-from: type=registry,ref=neondatabase/neon:cache
cache-to: type=registry,ref=neondatabase/neon:cache,mode=max
tags: |
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
neondatabase/neon:${{needs.tag.outputs.build-tag}}
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom
neon-image:
needs: [ neon-image-arch, tag ]
runs-on: ubuntu-latest
steps:
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Create multi-arch image
run: |
docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-x64 \
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-arm64
- uses: docker/login-action@v3
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- name: Push multi-arch image to ECR
run: |
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{ needs.tag.outputs.build-tag }} \
neondatabase/neon:${{ needs.tag.outputs.build-tag }}
compute-node-image-arch:
compute-node-image:
needs: [ check-permissions, build-build-tools-image, tag ]
runs-on: [ self-hosted, gen3, large ]
strategy:
fail-fast: false
matrix:
version: [ v14, v15, v16 ]
arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
steps:
- name: Checkout
@@ -854,14 +829,15 @@ jobs:
push: true
pull: true
file: Dockerfile.compute-node
cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }}
cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache
cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache,mode=max
tags: |
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
- name: Build compute-tools image
# compute-tools are Postgres independent, so build it only once
if: matrix.version == 'v16'
if: ${{ matrix.version == 'v16' }}
uses: docker/build-push-action@v5
with:
target: compute-tools-image
@@ -875,57 +851,14 @@ jobs:
pull: true
file: Dockerfile.compute-node
tags: |
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }}
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom
compute-node-image:
needs: [ compute-node-image-arch, tag ]
runs-on: ubuntu-latest
strategy:
matrix:
version: [ v14, v15, v16 ]
steps:
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Create multi-arch compute-node image
run: |
docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
- name: Create multi-arch compute-tools image
if: matrix.version == 'v16'
run: |
docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-arm64
- uses: docker/login-action@v3
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- name: Push multi-arch compute-node-${{ matrix.version }} image to ECR
run: |
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
- name: Push multi-arch compute-tools image to ECR
if: matrix.version == 'v16'
run: |
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
vm-compute-node-image:
needs: [ check-permissions, tag, compute-node-image ]
runs-on: [ self-hosted, gen3, large ]
@@ -933,8 +866,11 @@ jobs:
fail-fast: false
matrix:
version: [ v14, v15, v16 ]
defaults:
run:
shell: sh -eu {0}
env:
VM_BUILDER_VERSION: v0.29.3
VM_BUILDER_VERSION: v0.28.1
steps:
- name: Checkout
@@ -947,48 +883,26 @@ jobs:
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
chmod +x vm-builder
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
# Note: we need a separate pull step here because otherwise vm-builder will try to pull, and
# it won't have the proper authentication (written at v0.6.0)
- name: Pulling compute-node image
run: |
docker pull neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
- name: Build vm image
run: |
./vm-builder \
-spec=vm-image-spec.yaml \
-src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
-dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
-src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
-dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
- name: Pushing vm-compute-node image
run: |
docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom
docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
test-images:
needs: [ check-permissions, tag, neon-image, compute-node-image ]
strategy:
fail-fast: false
matrix:
arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
runs-on: [ self-hosted, gen3, small ]
steps:
- name: Checkout
@@ -1006,7 +920,7 @@ jobs:
- name: Verify image versions
shell: bash # ensure no set -e for better error messages
run: |
pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.tag.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
pageserver_version=$(docker run --rm 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
echo "Pageserver version string: $pageserver_version"
@@ -1032,48 +946,78 @@ jobs:
promote-images:
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
runs-on: ubuntu-latest
env:
VERSIONS: v14 v15 v16
runs-on: [ self-hosted, gen3, small ]
container: golang:1.19-bullseye
# Don't add if-condition here.
# The job should always be run because we have dependant other jobs that shouldn't be skipped
steps:
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- uses: docker/login-action@v3
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- name: Copy vm-compute-node images to ECR
- name: Install Crane & ECR helper
run: |
for version in ${VERSIONS}; do
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} \
neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
done
go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
- name: Configure ECR login
run: |
mkdir /github/home/.docker/
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
- name: Copy vm-compute-node images to Docker Hub
run: |
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16
- name: Add latest tag to images
if: github.ref_name == 'main'
if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
run: |
for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do
docker buildx imagetools create -t $repo/neon:latest \
$repo/neon:${{ needs.tag.outputs.build-tag }}
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
docker buildx imagetools create -t $repo/compute-tools:latest \
$repo/compute-tools:${{ needs.tag.outputs.build-tag }}
- name: Push images to production ECR
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
run: |
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:latest
for version in ${VERSIONS}; do
docker buildx imagetools create -t $repo/compute-node-${version}:latest \
$repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }}
- name: Configure Docker Hub login
run: |
# ECR Credential Helper & Docker Hub don't work together in config, hence reset
echo "" > /github/home/.docker/config.json
crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io
docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \
$repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
done
done
- name: Push vm-compute-node to Docker Hub
run: |
crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}}
- name: Push latest tags to Docker Hub
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
run: |
crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
trigger-custom-extensions-build-and-wait:
needs: [ check-permissions, tag ]

Cargo.lock generated
View File

@@ -1072,9 +1072,9 @@ dependencies = [
[[package]]
name = "chrono"
version = "0.4.38"
version = "0.4.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38"
dependencies = [
"android-tzdata",
"iana-time-zone",
@@ -1082,7 +1082,7 @@ dependencies = [
"num-traits",
"serde",
"wasm-bindgen",
"windows-targets 0.52.4",
"windows-targets 0.48.0",
]
[[package]]
@@ -1109,7 +1109,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b"
dependencies = [
"ciborium-io",
"half 1.8.2",
"half",
]
[[package]]
@@ -1471,21 +1471,26 @@ dependencies = [
[[package]]
name = "crossbeam-deque"
version = "0.8.5"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d"
checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
dependencies = [
"cfg-if",
"crossbeam-epoch",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.18"
version = "0.9.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695"
dependencies = [
"autocfg",
"cfg-if",
"crossbeam-utils",
"memoffset 0.8.0",
"scopeguard",
]
[[package]]
@@ -2273,17 +2278,6 @@ version = "1.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"
[[package]]
name = "half"
version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888"
dependencies = [
"cfg-if",
"crunchy",
"num-traits",
]
[[package]]
name = "hash32"
version = "0.3.1"
@@ -3908,13 +3902,12 @@ dependencies = [
[[package]]
name = "parquet"
version = "51.0.0"
source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
version = "49.0.0"
source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
dependencies = [
"ahash",
"bytes",
"chrono",
"half 2.4.1",
"hashbrown 0.14.5",
"num",
"num-bigint",
@@ -3923,13 +3916,12 @@ dependencies = [
"thrift",
"twox-hash",
"zstd",
"zstd-sys",
]
[[package]]
name = "parquet_derive"
version = "51.0.0"
source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
version = "49.0.0"
source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
dependencies = [
"parquet",
"proc-macro2",
@@ -3956,9 +3948,9 @@ checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c"
[[package]]
name = "pbkdf2"
version = "0.12.2"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2"
checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31"
dependencies = [
"digest",
"hmac",
@@ -4381,7 +4373,6 @@ dependencies = [
name = "proxy"
version = "0.1.0"
dependencies = [
"ahash",
"anyhow",
"async-compression",
"async-trait",
@@ -4398,7 +4389,6 @@ dependencies = [
"chrono",
"clap",
"consumption_metrics",
"crossbeam-deque",
"dashmap",
"env_logger",
"fallible-iterator",
@@ -7470,7 +7460,6 @@ dependencies = [
name = "workspace_hack"
version = "0.1.0"
dependencies = [
"ahash",
"anyhow",
"aws-config",
"aws-runtime",

View File

@@ -41,7 +41,6 @@ license = "Apache-2.0"
## All dependency versions, used in the project
[workspace.dependencies]
ahash = "0.8"
anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
@@ -75,7 +74,6 @@ clap = { version = "4.0", features = ["derive"] }
comfy-table = "6.1"
const_format = "0.2"
crc32c = "0.6"
crossbeam-deque = "0.8.5"
crossbeam-utils = "0.8.5"
dashmap = { version = "5.5.0", features = ["raw-api"] }
either = "1.8"
@@ -124,8 +122,8 @@ opentelemetry = "0.20.0"
opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.12.0"
parking_lot = "0.12"
parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
parquet_derive = "51.0.0"
parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
parquet_derive = "49.0.0"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pin-project-lite = "0.2"
procfs = "0.14"
@@ -246,8 +244,8 @@ tonic-build = "0.9"
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
# bug fixes for UUID
parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" }
parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" }
parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
parquet_derive = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
################# Binary contents sections

View File

@@ -241,17 +241,11 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
FROM build-deps AS vector-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY patches/pgvector.patch /pgvector.patch
# By default, pgvector Makefile uses `-march=native`. We don't want that,
# because we build the images on different machines than where we run them.
# Pass OPTFLAGS="" to remove it.
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \
echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
patch -p1 < /pgvector.patch && \
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
#########################################################################################

View File

@@ -1,6 +1,4 @@
[![Neon](https://github.com/neondatabase/neon/assets/11527560/f15a17f0-836e-40c5-b35d-030606a6b660)](https://neon.tech)
[![Neon](https://user-images.githubusercontent.com/13738772/236813940-dcfdcb5b-69d3-449b-a686-013febe834d4.png)](https://neon.tech)
# Neon

View File

@@ -1,5 +1,3 @@
use std::path::Path;
use anyhow::{anyhow, Context};
use tracing::warn;
@@ -19,24 +17,17 @@ pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> {
.arg(size_bytes.to_string())
.spawn();
if matches!(&child_result, Err(e) if e.kind() == std::io::ErrorKind::NotFound) {
warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
return Ok(());
}
child_result
.context("spawn() failed")
.and_then(|mut child| child.wait().context("wait() failed"))
.and_then(|status| match status.success() {
true => Ok(()),
false => {
// The command failed. Maybe it was because the resize-swap file doesn't exist?
// The --once flag causes it to delete itself on success so we don't disable swap
// while postgres is running; maybe this is fine.
match Path::new(RESIZE_SWAP_BIN).try_exists() {
Err(_) | Ok(true) => Err(anyhow!("process exited with {status}")),
// The path doesn't exist; we're actually ok
Ok(false) => {
warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
Ok(())
},
}
}
false => Err(anyhow!("process exited with {status}")),
})
// wrap any prior error with the overall context that we couldn't run the command
.with_context(|| {

View File

@@ -243,13 +243,9 @@ impl StorageController {
anyhow::bail!("initdb failed with status {status}");
}
// Write a minimal config file:
// - Specify the port, since this is chosen dynamically
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
// the storage controller we don't want a slow local disk to interfere with that.
tokio::fs::write(
&pg_data_path.join("postgresql.conf"),
format!("port = {}\nfsync=off\n", self.postgres_port),
format!("port = {}", self.postgres_port),
)
.await?;
};

View File

@@ -1,4 +1,4 @@
ARG REPOSITORY=neondatabase
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
ARG COMPUTE_IMAGE=compute-node-v14
ARG TAG=latest

View File

@@ -8,6 +8,8 @@
# Their defaults point at DockerHub `neondatabase/neon:latest` image.`,
# to verify custom image builds (e.g pre-published ones).
# XXX: Current does not work on M1 macs due to x86_64 Docker images compiled only, and no seccomp support in M1 Docker emulation layer.
set -eux -o pipefail
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"

View File

@@ -307,7 +307,7 @@ impl KeySpace {
}
/// Merge another keyspace into the current one.
/// Note: the keyspaces must not overlap (enforced via assertions). To merge overlapping key ranges, use `KeySpaceRandomAccum`.
/// Note: the keyspaces must not ovelap (enforced via assertions)
pub fn merge(&mut self, other: &KeySpace) {
let all_ranges = self
.ranges

View File

@@ -9,6 +9,7 @@ use std::{
collections::HashMap,
io::{BufRead, Read},
num::{NonZeroU64, NonZeroUsize},
str::FromStr,
sync::atomic::AtomicUsize,
time::{Duration, SystemTime},
};
@@ -161,22 +162,6 @@ impl std::fmt::Debug for TenantState {
}
}
/// A temporary lease to a specific lsn inside a timeline.
/// Access to the lsn is guaranteed by the pageserver until the expiration indicated by `valid_until`.
#[serde_as]
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct LsnLease {
#[serde_as(as = "SystemTimeAsRfc3339Millis")]
pub valid_until: SystemTime,
}
serde_with::serde_conv!(
SystemTimeAsRfc3339Millis,
SystemTime,
|time: &SystemTime| humantime::format_rfc3339_millis(*time).to_string(),
|value: String| -> Result<_, humantime::TimestampError> { humantime::parse_rfc3339(&value) }
);
/// The only [`TenantState`] variants we could be `TenantState::Activating` from.
#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum ActivatingFrom {
@@ -305,7 +290,7 @@ pub struct TenantConfig {
pub compaction_period: Option<String>,
pub compaction_threshold: Option<usize>,
// defer parsing compaction_algorithm, like eviction_policy
pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
pub compaction_algorithm: Option<CompactionAlgorithm>,
pub gc_horizon: Option<u64>,
pub gc_period: Option<String>,
pub image_creation_threshold: Option<usize>,
@@ -333,28 +318,14 @@ pub struct TenantConfig {
/// Unset -> V1
/// -> V2
/// -> CrossValidation -> V2
#[derive(
Eq,
PartialEq,
Debug,
Copy,
Clone,
strum_macros::EnumString,
strum_macros::Display,
serde_with::DeserializeFromStr,
serde_with::SerializeDisplay,
)]
#[strum(serialize_all = "kebab-case")]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum AuxFilePolicy {
/// V1 aux file policy: store everything in AUX_FILE_KEY
#[strum(ascii_case_insensitive)]
V1,
/// V2 aux file policy: store in the AUX_FILE keyspace
#[strum(ascii_case_insensitive)]
V2,
/// Cross validation runs both formats on the write path and does validation
/// on the read path.
#[strum(ascii_case_insensitive)]
CrossValidation,
}
@@ -420,6 +391,23 @@ impl AuxFilePolicy {
}
}
impl FromStr for AuxFilePolicy {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let s = s.to_lowercase();
if s == "v1" {
Ok(Self::V1)
} else if s == "v2" {
Ok(Self::V2)
} else if s == "crossvalidation" || s == "cross_validation" {
Ok(Self::CrossValidation)
} else {
anyhow::bail!("cannot parse {} to aux file policy", s)
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum EvictionPolicy {
@@ -438,28 +426,13 @@ impl EvictionPolicy {
}
}
#[derive(
Eq,
PartialEq,
Debug,
Copy,
Clone,
strum_macros::EnumString,
strum_macros::Display,
serde_with::DeserializeFromStr,
serde_with::SerializeDisplay,
)]
#[strum(serialize_all = "kebab-case")]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum CompactionAlgorithm {
Legacy,
Tiered,
}
#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
pub struct CompactionAlgorithmSettings {
pub kind: CompactionAlgorithm,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct EvictionPolicyLayerAccessThreshold {
#[serde(with = "humantime_serde")]
@@ -816,8 +789,6 @@ pub enum HistoricLayerInfo {
lsn_end: Lsn,
remote: bool,
access_stats: LayerAccessStats,
l0: bool,
},
Image {
layer_file_name: String,
@@ -1416,7 +1387,6 @@ impl PagestreamBeMessage {
#[cfg(test)]
mod tests {
use serde_json::json;
use std::str::FromStr;
use super::*;
@@ -1679,14 +1649,4 @@ mod tests {
AuxFilePolicy::V2
));
}
#[test]
fn test_aux_parse() {
assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2);
assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2);
assert_eq!(
AuxFilePolicy::from_str("cross-validation").unwrap(),
AuxFilePolicy::CrossValidation
);
}
}
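Aside on the hunk above: the strum derive on one side and the hand-written FromStr on the other implement similar case-insensitive parsing, though the accepted separators differ (kebab-case "cross-validation" for the derive; "crossvalidation"/"cross_validation" for the manual impl). A self-contained sketch of the derive-based form, assuming the strum and strum_macros crates; the variant set is taken from the diff, and the case-insensitivity attribute is applied at enum level here for brevity:

use std::str::FromStr;
use strum_macros::{Display, EnumString};

#[derive(Debug, PartialEq, Eq, Clone, Copy, EnumString, Display)]
#[strum(serialize_all = "kebab-case", ascii_case_insensitive)]
enum AuxFilePolicy {
    V1,
    V2,
    CrossValidation,
}

fn main() {
    // kebab-case serialization plus case-insensitive parsing covers
    // "V2", "v2" and "cross-validation", matching the tests in the diff.
    assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2);
    assert_eq!(
        AuxFilePolicy::from_str("Cross-Validation").unwrap(),
        AuxFilePolicy::CrossValidation
    );
    assert_eq!(AuxFilePolicy::CrossValidation.to_string(), "cross-validation");
}

The derive also keeps serialization and parsing in sync automatically, whereas the manual impl has to be updated by hand whenever a variant is added.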

View File

@@ -559,14 +559,6 @@ impl ShardIdentity {
}
}
/// Obtains the shard number and count combined into a `ShardIndex`.
pub fn shard_index(&self) -> ShardIndex {
ShardIndex {
shard_count: self.count,
shard_number: self.number,
}
}
pub fn shard_slug(&self) -> String {
if self.count > ShardCount(0) {
format!("-{:02x}{:02x}", self.number.0, self.count.0)

View File

@@ -820,11 +820,10 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
Ok(ProcessMsgResult::Continue)
}
/// - Log as info/error result of handling COPY stream and send back
/// ErrorResponse if that makes sense.
/// - Shutdown the stream if we got Terminate.
/// - Then close the connection because we don't handle exiting from COPY
/// stream normally.
/// Log as info/error result of handling COPY stream and send back
/// ErrorResponse if that makes sense. Shutdown the stream if we got
/// Terminate. TODO: transition into waiting for Sync msg if we initiate the
/// close.
pub async fn handle_copy_stream_end(&mut self, end: CopyStreamHandlerEnd) {
use CopyStreamHandlerEnd::*;
@@ -850,6 +849,10 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
}
}
if let Terminate = &end {
self.state = ProtoState::Closed;
}
let err_to_send_and_errcode = match &end {
ServerInitiated(_) => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)),
Other(_) => Some((format!("{end:#}"), SQLSTATE_INTERNAL_ERROR)),
@@ -879,12 +882,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
error!("failed to send ErrorResponse: {}", ee);
}
}
// Proper COPY stream finishing to continue using the connection is not
// implemented at the server side (we don't need it so far). To prevent
// further usages of the connection, close it.
self.framed.shutdown().await.ok();
self.state = ProtoState::Closed;
}
}

View File

@@ -178,13 +178,6 @@ impl PgConnectionConfig {
}
}
impl fmt::Display for PgConnectionConfig {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
// The password is intentionally hidden and not part of this display string.
write!(f, "postgresql://{}:{}", self.host, self.port)
}
}
impl fmt::Debug for PgConnectionConfig {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
// We want `password: Some(REDACTED-STRING)`, not `password: Some("REDACTED-STRING")`

View File

@@ -359,7 +359,7 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
// Is there enough space on the page for another logical message and an
// XLOG_SWITCH? If not, start over.
let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64;
if page_remain < base_size + XLOG_SIZE_OF_XLOG_RECORD as u64 {
if page_remain < base_size - XLOG_SIZE_OF_XLOG_RECORD as u64 {
continue;
}
@@ -373,29 +373,31 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
"SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
&[&(repeats as i32)],
)?;
info!(
"current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
client.pg_current_wal_insert_lsn()?,
XLOG_SIZE_OF_XLOG_RECORD
);
// Emit the XLOG_SWITCH
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
let xlog_switch_record_end: PgLsn =
client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
if u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
!= XLOG_SIZE_OF_XLOG_SHORT_PHD
{
warn!(
"XLOG_SWITCH message ended not on page boundary: {}, offset = {}, repeating",
xlog_switch_record_end,
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
);
continue;
}
return Ok(vec![before_xlog_switch, xlog_switch_record_end]);
break;
}
info!(
"current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
client.pg_current_wal_insert_lsn()?,
XLOG_SIZE_OF_XLOG_RECORD
);
// Emit the XLOG_SWITCH
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
let next_segment = PgLsn::from(0x0200_0000);
ensure!(
xlog_switch_record_end < next_segment,
"XLOG_SWITCH record ended on or after the expected segment boundary: {} > {}",
xlog_switch_record_end,
next_segment
);
ensure!(
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
"XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
xlog_switch_record_end,
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
);
Ok(vec![before_xlog_switch, xlog_switch_record_end])
}
}
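The one-character difference in the space check near the top of this file's hunk (base_size + XLOG_SIZE_OF_XLOG_RECORD versus base_size - ...) decides whether the logical message plus the XLOG_SWITCH record header still fit on the current WAL page. A toy sketch of that arithmetic, with assumed constants (the real values come from the project's postgres FFI bindings):

// XLOG_BLCKSZ is the WAL page size; 8192 is the usual value, assumed here.
const XLOG_BLCKSZ: u64 = 8192;

/// Bytes left on the WAL page that `lsn` points into.
fn page_remain(lsn: u64) -> u64 {
    XLOG_BLCKSZ - lsn % XLOG_BLCKSZ
}

fn main() {
    let base_size = 100u64; // size of the logical message being emitted
    let xlog_record_header = 24u64; // stand-in for XLOG_SIZE_OF_XLOG_RECORD
    let lsn = 3 * XLOG_BLCKSZ + 8100;
    // If the message and the switch record header together don't fit,
    // the crafter starts over on a fresh page.
    let fits = page_remain(lsn) >= base_size + xlog_record_header;
    println!("remain = {}, fits = {}", page_remain(lsn), fits);
}

Subtracting instead of adding under-reserves by two header sizes, which can let the crafted XLOG_SWITCH end somewhere other than the page boundary the test is trying to hit exactly.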

View File

@@ -50,9 +50,6 @@ pub struct SkTimelineInfo {
pub safekeeper_connstr: Option<String>,
#[serde(default)]
pub http_connstr: Option<String>,
// Minimum of all active RO replicas flush LSN
#[serde(default = "lsn_invalid")]
pub standby_horizon: Lsn,
}
#[derive(Debug, Clone, Deserialize, Serialize)]

View File

@@ -496,9 +496,9 @@ mod tests {
// TODO: When updating Postgres versions, this test will cause
// problems. Postgres version in message needs updating.
//
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160002, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
vec![
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,

View File

@@ -257,37 +257,6 @@ paths:
schema:
$ref: "#/components/schemas/LsnByTimestampResponse"
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease:
parameters:
- name: tenant_shard_id
in: path
required: true
schema:
type: string
- name: timeline_id
in: path
required: true
schema:
type: string
format: hex
post:
description: Obtain lease for the given LSN
parameters:
- name: lsn
in: query
required: true
schema:
type: string
format: hex
description: A LSN to obtain the lease for
responses:
"200":
description: OK
content:
application/json:
schema:
$ref: "#/components/schemas/LsnLease"
/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
parameters:
- name: tenant_id
@@ -612,80 +581,6 @@ paths:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor:
parameters:
- name: tenant_shard_id
in: path
required: true
schema:
type: string
- name: timeline_id
in: path
required: true
schema:
type: string
put:
description: |
Detach a timeline from its ancestor and reparent all ancestors timelines with lower `ancestor_lsn`.
Current implementation might not be retryable across failure cases, but will be enhanced in future.
Detaching should be expected to be expensive operation. Timeouts should be retried.
responses:
"200":
description: |
The timeline has been detached from it's ancestor (now or earlier), and at least the returned timelines have been reparented.
If any timelines were deleted after reparenting, they might not be on this list.
content:
application/json:
schema:
$ref: "#/components/schemas/AncestorDetached"
"400":
description: |
Number of early checks meaning the timeline cannot be detached now:
- the ancestor of timeline has an ancestor: not supported, see RFC
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"404":
description: Tenant or timeline not found.
content:
application/json:
schema:
$ref: "#/components/schemas/NotFoundError"
"409":
description: |
The timeline can never be detached:
- timeline has no ancestor, implying that the timeline has never had an ancestor
content:
application/json:
schema:
$ref: "#/components/schemas/ConflictError"
"500":
description: |
Transient error, for example, pageserver shutdown happened while
processing the request but we were unable to distinguish that. Must
be retried.
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: |
Temporarily unavailable, please retry. Possible reasons:
- another timeline detach for the same tenant is underway, please retry later
- detected shutdown error
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/:
get:
description: Get tenants list
@@ -1085,15 +980,6 @@ components:
type: string
enum: [past, present, future, nodata]
LsnLease:
type: object
required:
- valid_until
properties:
valid_until:
type: string
format: date-time
PageserverUtilization:
type: object
required:
@@ -1151,19 +1037,6 @@ components:
format: int64
description: How many bytes of layer content were in the latest layer heatmap
AncestorDetached:
type: object
required:
- reparented_timelines
properties:
reparented_timelines:
type: array
description: Set of reparented timeline ids
properties:
type: string
format: hex
description: TimelineId
Error:
type: object

View File

@@ -16,7 +16,6 @@ use hyper::header;
use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use metrics::launch_timestamp::LaunchTimestamp;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::models::IngestAuxFilesRequest;
use pageserver_api::models::ListAuxFilesRequest;
use pageserver_api::models::LocationConfig;
@@ -75,7 +74,6 @@ use crate::tenant::storage_layer::LayerAccessStatsReset;
use crate::tenant::storage_layer::LayerName;
use crate::tenant::timeline::CompactFlags;
use crate::tenant::timeline::Timeline;
use crate::tenant::GetTimelineError;
use crate::tenant::SpawnMode;
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
use crate::{config::PageServerConf, tenant::mgr};
@@ -281,13 +279,6 @@ impl From<GetTenantError> for ApiError {
}
}
impl From<GetTimelineError> for ApiError {
fn from(gte: GetTimelineError) -> Self {
// Rationale: tenant is activated only after eligble timelines activate
ApiError::NotFound(gte.into())
}
}
impl From<GetActiveTenantError> for ApiError {
fn from(e: GetActiveTenantError) -> ApiError {
match e {
@@ -395,7 +386,7 @@ async fn build_timeline_info_common(
let guard = timeline.last_received_wal.lock().unwrap();
if let Some(info) = guard.as_ref() {
(
Some(format!("{}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only.
Some(format!("{:?}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only.
Some(info.last_received_msg_lsn),
Some(info.last_received_msg_ts),
)
@@ -652,7 +643,9 @@ async fn timeline_preserve_initdb_handler(
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
let timeline = tenant.get_timeline(timeline_id, false)?;
let timeline = tenant
.get_timeline(timeline_id, false)
.map_err(|e| ApiError::NotFound(e.into()))?;
timeline
.preserve_initdb_archive()
@@ -694,7 +687,9 @@ async fn timeline_detail_handler(
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let timeline = tenant.get_timeline(timeline_id, false)?;
let timeline = tenant
.get_timeline(timeline_id, false)
.map_err(|e| ApiError::NotFound(e.into()))?;
let timeline_info = build_timeline_info(
&timeline,
@@ -1706,32 +1701,6 @@ async fn handle_tenant_break(
json_response(StatusCode::OK, ())
}
// Obtains an lsn lease on the given timeline.
async fn lsn_lease_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let lsn: Lsn = parse_query_param(&request, "lsn")?
.ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let state = get_state(&request);
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let result = timeline
.make_lsn_lease(lsn, &ctx)
.map_err(|e| ApiError::InternalServerError(e.context("lsn lease http handler")))?;
json_response(StatusCode::OK, result)
}
// Run GC immediately on given timeline.
async fn timeline_gc_handler(
mut request: Request<Body>,
@@ -1767,8 +1736,6 @@ async fn timeline_compact_handler(
if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
flags |= CompactFlags::ForceImageLayerCreation;
}
let wait_until_uploaded =
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
@@ -1777,9 +1744,6 @@ async fn timeline_compact_handler(
.compact(&cancel, flags, &ctx)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
if wait_until_uploaded {
timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
}
json_response(StatusCode::OK, ())
}
.instrument(info_span!("manual_compaction", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
@@ -1804,8 +1768,6 @@ async fn timeline_checkpoint_handler(
if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
flags |= CompactFlags::ForceImageLayerCreation;
}
let wait_until_uploaded =
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
@@ -1819,10 +1781,6 @@ async fn timeline_checkpoint_handler(
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
if wait_until_uploaded {
timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
}
json_response(StatusCode::OK, ())
}
.instrument(info_span!("manual_checkpoint", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
@@ -1906,11 +1864,14 @@ async fn timeline_detach_ancestor_handler(
let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download);
let ctx = &ctx;
let timeline = tenant.get_timeline(timeline_id, true)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(|e| ApiError::NotFound(e.into()))?;
let (_guard, prepared) = timeline
.prepare_to_detach_from_ancestor(&tenant, options, ctx)
.await?;
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
let res = state
.tenant_manager
@@ -2044,7 +2005,9 @@ async fn active_timeline_of_active_tenant(
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
Ok(tenant.get_timeline(timeline_id, true)?)
tenant
.get_timeline(timeline_id, true)
.map_err(|e| ApiError::NotFound(e.into()))
}
async fn always_panic_handler(
@@ -2308,31 +2271,6 @@ async fn post_tracing_event_handler(
json_response(StatusCode::OK, ())
}
async fn force_aux_policy_switch_handler(
mut r: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
check_permission(&r, None)?;
let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&r, "timeline_id")?;
let policy: AuxFilePolicy = json_request(&mut r).await?;
let state = get_state(&r);
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
timeline
.do_switch_aux_policy(policy)
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
async fn put_io_engine_handler(
mut r: Request<Body>,
_cancel: CancellationToken,
@@ -2410,9 +2348,19 @@ async fn list_aux_files(
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let files = timeline.list_aux_files(body.lsn, &ctx).await?;
json_response(StatusCode::OK, files)
let process = || async move {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let files = timeline.list_aux_files(body.lsn, &ctx).await?;
Ok::<_, anyhow::Error>(files)
};
match process().await {
Ok(st) => json_response(StatusCode::OK, st),
Err(err) => json_response(
StatusCode::INTERNAL_SERVER_ERROR,
ApiError::InternalServerError(err).to_string(),
),
}
}
async fn ingest_aux_files(
@@ -2430,22 +2378,24 @@ async fn ingest_aux_files(
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let mut modification = timeline.begin_modification(
Lsn(timeline.get_last_record_lsn().0 + 8), /* advance LSN by 8 */
);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
for (fname, content) in body.aux_files {
modification
.put_file(&fname, content.as_bytes(), &ctx)
.await
.map_err(ApiError::InternalServerError)?;
}
modification
.commit(&ctx)
.await
.map_err(ApiError::InternalServerError)?;
let process = || async move {
let mut modification = timeline.begin_modification(Lsn(
timeline.get_last_record_lsn().0 + 8
) /* advance LSN by 8 */);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
for (fname, content) in body.aux_files {
modification
.put_file(&fname, content.as_bytes(), &ctx)
.await?;
}
modification.commit(&ctx).await?;
Ok::<_, anyhow::Error>(())
};
json_response(StatusCode::OK, ())
match process().await {
Ok(st) => json_response(StatusCode::OK, st),
Err(err) => Err(ApiError::InternalServerError(err)),
}
}
/// Report on the largest tenants on this pageserver, for the storage controller to identify
@@ -2751,10 +2701,6 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn",
|r| api_handler(r, get_timestamp_of_lsn_handler),
)
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease",
|r| api_handler(r, lsn_lease_handler),
)
.put(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc",
|r| api_handler(r, timeline_gc_handler),
@@ -2828,10 +2774,6 @@ pub fn make_router(
|r| api_handler(r, timeline_collect_keyspace),
)
.put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
.put(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
|r| api_handler(r, force_aux_policy_switch_handler),
)
.get("/v1/utilization", |r| api_handler(r, get_utilization))
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files",

View File

@@ -525,15 +525,6 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_standby_horizon",
"Standby apply LSN for which GC is hold off, by timeline.",
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_resident_physical_size",
@@ -1867,6 +1858,7 @@ pub(crate) struct WalIngestMetrics {
pub(crate) records_received: IntCounter,
pub(crate) records_committed: IntCounter,
pub(crate) records_filtered: IntCounter,
pub(crate) time_spent_on_ingest: Histogram,
}
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
@@ -1890,6 +1882,12 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
"Number of WAL records filtered out due to sharding"
)
.expect("failed to define a metric"),
time_spent_on_ingest: register_histogram!(
"pageserver_wal_ingest_put_value_seconds",
"Actual time spent on ingesting a record",
redo_histogram_time_buckets!(),
)
.expect("failed to define a metric"),
});
pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
@@ -2100,7 +2098,6 @@ pub(crate) struct TimelineMetrics {
pub garbage_collect_histo: StorageTimeMetrics,
pub find_gc_cutoffs_histo: StorageTimeMetrics,
pub last_record_gauge: IntGauge,
pub standby_horizon_gauge: IntGauge,
pub resident_physical_size_gauge: UIntGauge,
/// copy of LayeredTimeline.current_logical_size
pub current_logical_size_gauge: UIntGauge,
@@ -2170,9 +2167,6 @@ impl TimelineMetrics {
let last_record_gauge = LAST_RECORD_LSN
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let standby_horizon_gauge = STANDBY_HORIZON
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
@@ -2218,7 +2212,6 @@ impl TimelineMetrics {
find_gc_cutoffs_histo,
load_layer_map_histo,
last_record_gauge,
standby_horizon_gauge,
resident_physical_size_gauge,
current_logical_size_gauge,
aux_file_size_gauge,
@@ -2253,7 +2246,6 @@ impl TimelineMetrics {
let timeline_id = &self.timeline_id;
let shard_id = &self.shard_id;
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
{
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);

View File

@@ -19,7 +19,6 @@ use pageserver_api::models::{
};
use pageserver_api::shard::ShardIndex;
use pageserver_api::shard::ShardNumber;
use pageserver_api::shard::TenantShardId;
use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError};
use pq_proto::framed::ConnectionError;
use pq_proto::FeStartupPacket;
@@ -34,7 +33,6 @@ use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use std::time::Instant;
use std::time::SystemTime;
use tokio::io::AsyncWriteExt;
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_util::io::StreamReader;
@@ -907,39 +905,6 @@ impl PageServerHandler {
}
}
#[instrument(skip_all, fields(shard_id, %lsn))]
async fn handle_make_lsn_lease<IO>(
&self,
pgb: &mut PostgresBackend<IO>,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<(), QueryError>
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{
let shard_selector = ShardSelector::Known(tenant_shard_id.to_index());
let timeline = self
.get_active_tenant_timeline(tenant_shard_id.tenant_id, timeline_id, shard_selector)
.await?;
let lease = timeline.make_lsn_lease(lsn, ctx)?;
let valid_until = lease
.valid_until
.duration_since(SystemTime::UNIX_EPOCH)
.map_err(|e| QueryError::Other(e.into()))?;
pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
b"valid_until",
)]))?
.write_message_noflush(&BeMessage::DataRow(&[Some(
&valid_until.as_millis().to_be_bytes(),
)]))?
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
Ok(())
}
#[instrument(skip_all, fields(shard_id))]
async fn handle_get_rel_exists_request(
&mut self,
@@ -1521,8 +1486,9 @@ where
let ctx = self.connection_ctx.attached_child();
debug!("process query {query_string:?}");
let parts = query_string.split_whitespace().collect::<Vec<_>>();
if let Some(params) = parts.strip_prefix(&["pagestream_v2"]) {
if query_string.starts_with("pagestream_v2 ") {
let (_, params_raw) = query_string.split_at("pagestream_v2 ".len());
let params = params_raw.split(' ').collect::<Vec<_>>();
if params.len() != 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for pagestream command"
@@ -1547,7 +1513,9 @@ where
ctx,
)
.await?;
} else if let Some(params) = parts.strip_prefix(&["pagestream"]) {
} else if query_string.starts_with("pagestream ") {
let (_, params_raw) = query_string.split_at("pagestream ".len());
let params = params_raw.split(' ').collect::<Vec<_>>();
if params.len() != 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for pagestream command"
@@ -1572,7 +1540,10 @@ where
ctx,
)
.await?;
} else if let Some(params) = parts.strip_prefix(&["basebackup"]) {
} else if query_string.starts_with("basebackup ") {
let (_, params_raw) = query_string.split_at("basebackup ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
if params.len() < 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for basebackup command"
@@ -1590,23 +1561,26 @@ where
self.check_permission(Some(tenant_id))?;
let lsn = if let Some(lsn_str) = params.get(2) {
let lsn = if params.len() >= 3 {
Some(
Lsn::from_str(lsn_str)
.with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
)
} else {
None
};
let gzip = match params.get(3) {
Some(&"--gzip") => true,
None => false,
Some(third_param) => {
let gzip = if params.len() >= 4 {
if params[3] == "--gzip" {
true
} else {
return Err(QueryError::Other(anyhow::anyhow!(
"Parameter in position 3 unknown {third_param}",
)))
"Parameter in position 3 unknown {}",
params[3],
)));
}
} else {
false
};
let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
@@ -1630,7 +1604,10 @@ where
res?;
}
// return pair of prev_lsn and last_lsn
else if let Some(params) = parts.strip_prefix(&["get_last_record_rlsn"]) {
else if query_string.starts_with("get_last_record_rlsn ") {
let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
if params.len() != 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for get_last_record_rlsn command"
@@ -1672,7 +1649,10 @@ where
.await?;
}
// same as basebackup, but result includes relational data as well
else if let Some(params) = parts.strip_prefix(&["fullbackup"]) {
else if query_string.starts_with("fullbackup ") {
let (_, params_raw) = query_string.split_at("fullbackup ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
if params.len() < 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for fullbackup command"
@@ -1689,18 +1669,18 @@ where
.record("timeline_id", field::display(timeline_id));
// The caller is responsible for providing correct lsn and prev_lsn.
let lsn = if let Some(lsn_str) = params.get(2) {
let lsn = if params.len() > 2 {
Some(
Lsn::from_str(lsn_str)
.with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
)
} else {
None
};
let prev_lsn = if let Some(prev_lsn_str) = params.get(3) {
let prev_lsn = if params.len() > 3 {
Some(
Lsn::from_str(prev_lsn_str)
.with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?,
Lsn::from_str(params[3])
.with_context(|| format!("Failed to parse Lsn from {}", params[3]))?,
)
} else {
None
@@ -1733,7 +1713,8 @@ where
// 2. Run:
// cat my_backup/base.tar | psql -h $PAGESERVER \
// -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION"
let params = &parts[2..];
let (_, params_raw) = query_string.split_at("import basebackup ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
if params.len() != 5 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for import basebackup command"
@@ -1782,7 +1763,8 @@ where
//
// Files are scheduled to be persisted to remote storage, and the
// caller should poll the http api to check when that is done.
let params = &parts[2..];
let (_, params_raw) = query_string.split_at("import wal ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
if params.len() != 4 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for import wal command"
@@ -1820,45 +1802,10 @@ where
// important because psycopg2 executes "SET datestyle TO 'ISO'"
// on connect
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else if query_string.starts_with("lease lsn ") {
let params = &parts[2..];
if params.len() != 3 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number {} for lease lsn command",
params.len()
)));
}
let tenant_shard_id = TenantShardId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
tracing::Span::current()
.record("tenant_id", field::display(tenant_shard_id))
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_shard_id.tenant_id))?;
// The caller is responsible for providing correct lsn.
let lsn = Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
match self
.handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
.await
{
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
Err(e) => {
error!("error obtaining lsn lease for {lsn}: {e:?}");
pgb.write_message_noflush(&BeMessage::ErrorResponse(
&e.to_string(),
Some(e.pg_error_code()),
))?
}
};
} else if let Some(params) = parts.strip_prefix(&["show"]) {
} else if query_string.starts_with("show ") {
// show <tenant_id>
let (_, params_raw) = query_string.split_at("show ".len());
let params = params_raw.split(' ').collect::<Vec<_>>();
if params.len() != 1 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for config command"

View File

@@ -9,6 +9,7 @@
use super::tenant::{PageReconstructError, Timeline};
use crate::context::RequestContext;
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::metrics::WAL_INGEST;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
use crate::walrecord::NeonWalRecord;
use crate::{aux_file, repository::*};
@@ -39,11 +40,7 @@ use utils::bin_ser::DeserializeError;
use utils::vec_map::{VecMap, VecMapOrdering};
use utils::{bin_ser::BeSer, lsn::Lsn};
/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
pub const MAX_AUX_FILE_DELTAS: usize = 1024;
/// Max number of aux-file-related delta layers. The compaction will create a new image layer once this threshold is reached.
pub const MAX_AUX_FILE_V2_DELTAS: usize = 64;
const MAX_AUX_FILE_DELTAS: usize = 1024;
#[derive(Debug)]
pub enum LsnForTimestamp {
@@ -1480,24 +1477,11 @@ impl<'a> DatadirModification<'a> {
// Allowed switch path:
// * no aux files -> v1/v2/cross-validation
// * cross-validation->v2
let current_policy = if current_policy.is_none() {
// This path will only be hit once per tenant: we will decide the final policy in this code block.
// The next call to `put_file` will always have `last_aux_file_policy != None`.
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
let aux_files_key_v1 = self.tline.list_aux_files_v1(lsn, ctx).await?;
if aux_files_key_v1.is_empty() {
None
} else {
self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
Some(AuxFilePolicy::V1)
}
} else {
current_policy
};
if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) {
self.tline.do_switch_aux_policy(switch_policy)?;
self.tline.last_aux_file_policy.store(Some(switch_policy));
self.tline
.remote_client
.schedule_index_upload_for_aux_file_policy_update(Some(switch_policy))?;
info!(current=?current_policy, next=?switch_policy, "switching aux file policy");
switch_policy
} else {
@@ -1714,6 +1698,8 @@ impl<'a> DatadirModification<'a> {
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
let mut writer = self.tline.writer().await;
let timer = WAL_INGEST.time_spent_on_ingest.start_timer();
let pending_nblocks = self.pending_nblocks;
self.pending_nblocks = 0;
@@ -1753,6 +1739,8 @@ impl<'a> DatadirModification<'a> {
writer.update_directory_entries_count(kind, count as u64);
}
timer.observe_duration();
Ok(())
}
@@ -1788,12 +1776,6 @@ impl<'a> DatadirModification<'a> {
self.tline.get(key, lsn, ctx).await
}
/// Only used during unit tests, force putting a key into the modification.
#[cfg(test)]
pub(crate) fn put_for_test(&mut self, key: Key, val: Value) {
self.put(key, val);
}
fn put(&mut self, key: Key, val: Value) {
let values = self.pending_updates.entry(key).or_default();
// Replace the previous value if it exists at the same lsn
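
The allowed-switch comment in this hunk ("no aux files -> v1/v2/cross-validation", "cross-validation -> v2") reduces to a small predicate. A sketch of what a check like `AuxFilePolicy::is_valid_migration_path` plausibly encodes, using the variants visible in this diff; whether re-asserting the current policy counts as a valid path is my assumption:

#[derive(Clone, Copy, PartialEq, Debug)]
enum AuxFilePolicy {
    V1,
    V2,
    CrossValidation,
}

// Sketch: the migration paths named in the comment above.
fn is_valid_migration_path(current: Option<AuxFilePolicy>, switch_to: AuxFilePolicy) -> bool {
    match (current, switch_to) {
        // No aux files written yet: any policy may be chosen.
        (None, _) => true,
        // Cross-validation may be promoted to v2.
        (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2) => true,
        // Assumption: re-asserting the current policy is a no-op, not an error.
        (Some(cur), next) => cur == next,
    }
}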

View File

@@ -3964,20 +3964,18 @@ mod tests {
use super::*;
use crate::keyspace::KeySpaceAccum;
use crate::pgdatadir_mapping::AuxFilesDirectory;
use crate::repository::{Key, Value};
use crate::tenant::harness::*;
use crate::tenant::timeline::CompactFlags;
use crate::DEFAULT_PG_VERSION;
use bytes::{Bytes, BytesMut};
use bytes::BytesMut;
use hex_literal::hex;
use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
use pageserver_api::models::CompactionAlgorithm;
use rand::{thread_rng, Rng};
use tests::storage_layer::ValuesReconstructState;
use tests::timeline::{GetVectoredError, ShutdownMode};
use utils::bin_ser::BeSer;
static TEST_KEY: Lazy<Key> =
Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
@@ -4779,12 +4777,7 @@ mod tests {
info!("Doing vectored read on {:?}", read);
let vectored_res = tline
.get_vectored_impl(
read.clone(),
reads_lsn,
&mut ValuesReconstructState::new(),
&ctx,
)
.get_vectored_impl(read.clone(), reads_lsn, ValuesReconstructState::new(), &ctx)
.await;
tline
.validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx)
@@ -4833,7 +4826,7 @@ mod tests {
.get_vectored_impl(
aux_keyspace.clone(),
read_lsn,
&mut ValuesReconstructState::new(),
ValuesReconstructState::new(),
&ctx,
)
.await;
@@ -4978,7 +4971,7 @@ mod tests {
.get_vectored_impl(
read.clone(),
current_lsn,
&mut ValuesReconstructState::new(),
ValuesReconstructState::new(),
&ctx,
)
.await?;
@@ -5113,7 +5106,7 @@ mod tests {
ranges: vec![child_gap_at_key..child_gap_at_key.next()],
},
query_lsn,
&mut ValuesReconstructState::new(),
ValuesReconstructState::new(),
&ctx,
)
.await;
@@ -5169,9 +5162,7 @@ mod tests {
compaction_algorithm: CompactionAlgorithm,
) -> anyhow::Result<()> {
let mut harness = TenantHarness::create(name)?;
harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings {
kind: compaction_algorithm,
};
harness.tenant_conf.compaction_algorithm = compaction_algorithm;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
@@ -5528,9 +5519,7 @@ mod tests {
compaction_algorithm: CompactionAlgorithm,
) -> anyhow::Result<()> {
let mut harness = TenantHarness::create(name)?;
harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings {
kind: compaction_algorithm,
};
harness.tenant_conf.compaction_algorithm = compaction_algorithm;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
@@ -5558,7 +5547,7 @@ mod tests {
.await?;
const NUM_KEYS: usize = 1000;
const STEP: usize = 10000; // random update + scan base_key + idx * STEP
const STEP: usize = 100; // random update + scan base_key + idx * STEP
let cancel = CancellationToken::new();
@@ -5591,7 +5580,7 @@ mod tests {
let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32));
for iter in 0..=10 {
for _ in 0..10 {
// Read all the blocks
for (blknum, last_lsn) in updated.iter().enumerate() {
test_key.field6 = (blknum * STEP) as u32;
@@ -5606,7 +5595,7 @@ mod tests {
.get_vectored_impl(
keyspace.clone(),
lsn,
&mut ValuesReconstructState::default(),
ValuesReconstructState::default(),
&ctx,
)
.await?
@@ -5642,88 +5631,14 @@ mod tests {
updated[blknum] = lsn;
}
// Perform two cycles of flush, compact, and GC
for round in 0..2 {
tline.freeze_and_flush().await?;
tline
.compact(
&cancel,
if iter % 5 == 0 && round == 0 {
let mut flags = EnumSet::new();
flags.insert(CompactFlags::ForceImageLayerCreation);
flags.insert(CompactFlags::ForceRepartition);
flags
} else {
EnumSet::empty()
},
&ctx,
)
.await?;
tenant
.gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
.await?;
}
}
Ok(())
}
#[tokio::test]
async fn test_metadata_compaction_trigger() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_metadata_compaction_trigger")?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
let cancel = CancellationToken::new();
let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
base_key.field1 = AUX_KEY_PREFIX;
let test_key = base_key;
let mut lsn = Lsn(0x10);
for _ in 0..20 {
lsn = Lsn(lsn.0 + 0x10);
let mut writer = tline.writer().await;
writer
.put(
test_key,
lsn,
&Value::Image(test_img(&format!("{} at {}", 0, lsn))),
&ctx,
)
// Perform a cycle of flush, compact, and GC
tline.freeze_and_flush().await?;
tline.compact(&cancel, EnumSet::empty(), &ctx).await?;
tenant
.gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
.await?;
writer.finish_write(lsn);
drop(writer);
tline.freeze_and_flush().await?; // force create a delta layer
}
let before_num_l0_delta_files = tline
.layers
.read()
.await
.layer_map()
.get_level0_deltas()?
.len();
tline.compact(&cancel, EnumSet::empty(), &ctx).await?;
let after_num_l0_delta_files = tline
.layers
.read()
.await
.layer_map()
.get_level0_deltas()?
.len();
assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}");
assert_eq!(
tline.get(test_key, lsn, &ctx).await?,
test_img(&format!("{} at {}", 0, lsn))
);
Ok(())
}
@@ -6002,498 +5917,4 @@ mod tests {
Some(&bytes::Bytes::from_static(b"last"))
);
}
#[tokio::test]
async fn aux_file_policy_force_switch() {
let mut harness = TenantHarness::create("aux_file_policy_force_switch").unwrap();
harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V1;
let (tenant, ctx) = harness.load().await;
let mut lsn = Lsn(0x08);
let tline: Arc<Timeline> = tenant
.create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
.await
.unwrap();
assert_eq!(
tline.last_aux_file_policy.load(),
None,
"no aux file is written so it should be unset"
);
{
lsn += 8;
let mut modification = tline.begin_modification(lsn);
modification
.put_file("pg_logical/mappings/test1", b"first", &ctx)
.await
.unwrap();
modification.commit(&ctx).await.unwrap();
}
tline.do_switch_aux_policy(AuxFilePolicy::V2).unwrap();
assert_eq!(
tline.last_aux_file_policy.load(),
Some(AuxFilePolicy::V2),
"dirty index_part.json reflected state is yet to be updated"
);
// lose all data from v1
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
assert_eq!(files.get("pg_logical/mappings/test1"), None);
{
lsn += 8;
let mut modification = tline.begin_modification(lsn);
modification
.put_file("pg_logical/mappings/test2", b"second", &ctx)
.await
.unwrap();
modification.commit(&ctx).await.unwrap();
}
// read data ingested in v2
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
assert_eq!(
files.get("pg_logical/mappings/test2"),
Some(&bytes::Bytes::from_static(b"second"))
);
// lose all data from v1
assert_eq!(files.get("pg_logical/mappings/test1"), None);
}
#[tokio::test]
async fn aux_file_policy_auto_detect() {
let mut harness = TenantHarness::create("aux_file_policy_auto_detect").unwrap();
harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V2; // set to cross-validation mode
let (tenant, ctx) = harness.load().await;
let mut lsn = Lsn(0x08);
let tline: Arc<Timeline> = tenant
.create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
.await
.unwrap();
assert_eq!(
tline.last_aux_file_policy.load(),
None,
"no aux file is written so it should be unset"
);
{
lsn += 8;
let mut modification = tline.begin_modification(lsn);
let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
files: vec![(
"test_file".to_string(),
Bytes::copy_from_slice(b"test_file"),
)]
.into_iter()
.collect(),
})
.unwrap();
modification.put_for_test(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
modification.commit(&ctx).await.unwrap();
}
{
lsn += 8;
let mut modification = tline.begin_modification(lsn);
modification
.put_file("pg_logical/mappings/test1", b"first", &ctx)
.await
.unwrap();
modification.commit(&ctx).await.unwrap();
}
assert_eq!(
tline.last_aux_file_policy.load(),
Some(AuxFilePolicy::V1),
"keep using v1 because there are aux files writting with v1"
);
// we can still read the auxfile v1
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
assert_eq!(
files.get("pg_logical/mappings/test1"),
Some(&bytes::Bytes::from_static(b"first"))
);
assert_eq!(
files.get("test_file"),
Some(&bytes::Bytes::from_static(b"test_file"))
);
}
#[tokio::test]
async fn test_metadata_image_creation() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_metadata_image_creation")?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
const NUM_KEYS: usize = 1000;
const STEP: usize = 10000; // random update + scan base_key + idx * STEP
let cancel = CancellationToken::new();
let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
base_key.field1 = AUX_KEY_PREFIX;
let mut test_key = base_key;
let mut lsn = Lsn(0x10);
async fn scan_with_statistics(
tline: &Timeline,
keyspace: &KeySpace,
lsn: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<(BTreeMap<Key, Result<Bytes, PageReconstructError>>, usize)> {
let mut reconstruct_state = ValuesReconstructState::default();
let res = tline
.get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
.await?;
Ok((res, reconstruct_state.get_delta_layers_visited() as usize))
}
#[allow(clippy::needless_range_loop)]
for blknum in 0..NUM_KEYS {
lsn = Lsn(lsn.0 + 0x10);
test_key.field6 = (blknum * STEP) as u32;
let mut writer = tline.writer().await;
writer
.put(
test_key,
lsn,
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
&ctx,
)
.await?;
writer.finish_write(lsn);
drop(writer);
}
let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32));
for iter in 1..=10 {
for _ in 0..NUM_KEYS {
lsn = Lsn(lsn.0 + 0x10);
let blknum = thread_rng().gen_range(0..NUM_KEYS);
test_key.field6 = (blknum * STEP) as u32;
let mut writer = tline.writer().await;
writer
.put(
test_key,
lsn,
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
&ctx,
)
.await?;
writer.finish_write(lsn);
drop(writer);
}
tline.freeze_and_flush().await?;
if iter % 5 == 0 {
let (_, before_delta_file_accessed) =
scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?;
tline
.compact(
&cancel,
{
let mut flags = EnumSet::new();
flags.insert(CompactFlags::ForceImageLayerCreation);
flags.insert(CompactFlags::ForceRepartition);
flags
},
&ctx,
)
.await?;
let (_, after_delta_file_accessed) =
scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?;
assert!(after_delta_file_accessed < before_delta_file_accessed, "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}");
// Given that we already produced an image layer, there should be no delta layer needed for the scan, but we still set a low threshold for unforeseen circumstances.
assert!(
after_delta_file_accessed <= 2,
"after_delta_file_accessed={after_delta_file_accessed}"
);
}
}
Ok(())
}
#[tokio::test]
async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
let cancel = CancellationToken::new();
let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
let base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap();
let base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap();
let mut lsn = Lsn(0x20);
{
let mut writer = tline.writer().await;
writer
.put(base_key, lsn, &Value::Image(test_img("data key 1")), &ctx)
.await?;
writer.finish_write(lsn);
drop(writer);
tline.freeze_and_flush().await?; // this will create a image layer
}
let child = tenant
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
.await
.unwrap();
lsn.0 += 0x10;
{
let mut writer = child.writer().await;
writer
.put(
base_key_child,
lsn,
&Value::Image(test_img("data key 2")),
&ctx,
)
.await?;
writer.finish_write(lsn);
drop(writer);
child.freeze_and_flush().await?; // this will create a delta
{
// update the partitioning to include the test key space, otherwise these keys
// will be dropped by image layer creation
let mut guard = child.partitioning.lock().await;
let ((partitioning, _), partition_lsn) = &mut *guard;
partitioning
.parts
.push(KeySpace::single(base_key..base_key_nonexist)); // exclude the nonexist key
*partition_lsn = lsn;
}
child
.compact(
&cancel,
{
let mut set = EnumSet::empty();
set.insert(CompactFlags::ForceImageLayerCreation);
set
},
&ctx,
)
.await?; // force create an image layer for the keys, TODO: check if the image layer is created
}
async fn get_vectored_impl_wrapper(
tline: &Arc<Timeline>,
key: Key,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<Option<Bytes>, GetVectoredError> {
let mut reconstruct_state = ValuesReconstructState::new();
let mut res = tline
.get_vectored_impl(
KeySpace::single(key..key.next()),
lsn,
&mut reconstruct_state,
ctx,
)
.await?;
Ok(res.pop_last().map(|(k, v)| {
assert_eq!(k, key);
v.unwrap()
}))
}
// test vectored get on parent timeline
assert_eq!(
get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?,
Some(test_img("data key 1"))
);
assert!(get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx)
.await
.unwrap_err()
.is_missing_key_error());
assert!(
get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx)
.await
.unwrap_err()
.is_missing_key_error()
);
// test vectored get on child timeline
assert_eq!(
get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?,
Some(test_img("data key 1"))
);
assert_eq!(
get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?,
Some(test_img("data key 2"))
);
assert!(
get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx)
.await
.unwrap_err()
.is_missing_key_error()
);
Ok(())
}
#[tokio::test]
async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
let cancel = CancellationToken::new();
let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
let mut base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap();
let mut base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap();
base_key.field1 = AUX_KEY_PREFIX;
base_key_child.field1 = AUX_KEY_PREFIX;
base_key_nonexist.field1 = AUX_KEY_PREFIX;
let mut lsn = Lsn(0x20);
{
let mut writer = tline.writer().await;
writer
.put(
base_key,
lsn,
&Value::Image(test_img("metadata key 1")),
&ctx,
)
.await?;
writer.finish_write(lsn);
drop(writer);
tline.freeze_and_flush().await?; // this will create an image layer
tline
.compact(
&cancel,
{
let mut set = EnumSet::empty();
set.insert(CompactFlags::ForceImageLayerCreation);
set.insert(CompactFlags::ForceRepartition);
set
},
&ctx,
)
.await?; // force create an image layer for metadata keys
tenant
.gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
.await?;
}
let child = tenant
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
.await
.unwrap();
lsn.0 += 0x10;
{
let mut writer = child.writer().await;
writer
.put(
base_key_child,
lsn,
&Value::Image(test_img("metadata key 2")),
&ctx,
)
.await?;
writer.finish_write(lsn);
drop(writer);
child.freeze_and_flush().await?;
child
.compact(
&cancel,
{
let mut set = EnumSet::empty();
set.insert(CompactFlags::ForceImageLayerCreation);
set.insert(CompactFlags::ForceRepartition);
set
},
&ctx,
)
.await?; // force create an image layer for metadata keys
tenant
.gc_iteration(Some(child.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
.await?;
}
async fn get_vectored_impl_wrapper(
tline: &Arc<Timeline>,
key: Key,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<Option<Bytes>, GetVectoredError> {
let mut reconstruct_state = ValuesReconstructState::new();
let mut res = tline
.get_vectored_impl(
KeySpace::single(key..key.next()),
lsn,
&mut reconstruct_state,
ctx,
)
.await?;
Ok(res.pop_last().map(|(k, v)| {
assert_eq!(k, key);
v.unwrap()
}))
}
// test vectored get on parent timeline
assert_eq!(
get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?,
Some(test_img("metadata key 1"))
);
assert_eq!(
get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx).await?,
None
);
assert_eq!(
get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx).await?,
None
);
// test vectored get on child timeline
assert_eq!(
get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?,
None
);
assert_eq!(
get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?,
Some(test_img("metadata key 2"))
);
assert_eq!(
get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx).await?,
None
);
Ok(())
}
}

View File

@@ -238,13 +238,10 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
io_buf,
Err(Error::new(
ErrorKind::Other,
format!("blob too large ({len} bytes)"),
format!("blob too large ({} bytes)", len),
)),
);
}
if len > 0x0fff_ffff {
tracing::warn!("writing blob above future limit ({len} bytes)");
}
let mut len_buf = (len as u32).to_be_bytes();
len_buf[0] |= 0x80;
io_buf.extend_from_slice(&len_buf[..]);
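
For context on the bytes written above: a long blob gets a 4-byte big-endian length header with the top bit set, which is why lengths above 0x7fff_ffff are rejected and why 0x0fff_ffff is flagged as a future limit; short blobs (below 0x80) presumably fit a 1-byte header with the top bit clear. A sketch of that framing under those assumptions:

// Sketch: length header for the blob format hinted at above.
fn encode_blob_len(len: usize) -> Result<Vec<u8>, String> {
    if len < 0x80 {
        // Short form: one byte, high bit clear.
        Ok(vec![len as u8])
    } else if len <= 0x7fff_ffff {
        // Long form: 4 bytes big-endian with the high bit set.
        let mut len_buf = (len as u32).to_be_bytes();
        len_buf[0] |= 0x80;
        Ok(len_buf.to_vec())
    } else {
        Err(format!("blob too large ({len} bytes)"))
    }
}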

View File

@@ -11,7 +11,6 @@
use anyhow::bail;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::models::CompactionAlgorithm;
use pageserver_api::models::CompactionAlgorithmSettings;
use pageserver_api::models::EvictionPolicy;
use pageserver_api::models::{self, ThrottleConfig};
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
@@ -321,7 +320,7 @@ pub struct TenantConf {
pub compaction_period: Duration,
// Level0 delta layer threshold for compaction.
pub compaction_threshold: usize,
pub compaction_algorithm: CompactionAlgorithmSettings,
pub compaction_algorithm: CompactionAlgorithm,
// Determines how much history is retained, to allow
// branching and read replicas at an older point in time.
// The unit is #of bytes of WAL.
@@ -407,7 +406,7 @@ pub struct TenantConfOpt {
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
pub compaction_algorithm: Option<CompactionAlgorithm>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
@@ -498,9 +497,7 @@ impl TenantConfOpt {
.unwrap_or(global_conf.compaction_threshold),
compaction_algorithm: self
.compaction_algorithm
.as_ref()
.unwrap_or(&global_conf.compaction_algorithm)
.clone(),
.unwrap_or(global_conf.compaction_algorithm),
gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon),
gc_period: self.gc_period.unwrap_or(global_conf.gc_period),
image_creation_threshold: self
@@ -553,9 +550,7 @@ impl Default for TenantConf {
compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
.expect("cannot parse default compaction period"),
compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
compaction_algorithm: CompactionAlgorithmSettings {
kind: DEFAULT_COMPACTION_ALGORITHM,
},
compaction_algorithm: DEFAULT_COMPACTION_ALGORITHM,
gc_horizon: DEFAULT_GC_HORIZON,
gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
.expect("cannot parse default gc period"),

View File

@@ -7,7 +7,7 @@ use itertools::Itertools;
use pageserver_api::key::Key;
use pageserver_api::models::LocationConfigMode;
use pageserver_api::shard::{
ShardCount, ShardIdentity, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId,
ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
};
use pageserver_api::upcall_api::ReAttachResponseTenant;
use rand::{distributions::Alphanumeric, Rng};
@@ -127,8 +127,6 @@ pub(crate) enum ShardSelector {
First,
/// Pick the shard that holds this key
Page(Key),
/// The shard ID is known: pick the given shard
Known(ShardIndex),
}
/// A convenience for use with the re_attach ControlPlaneClient function: rather
@@ -2069,11 +2067,6 @@ impl TenantManager {
return ShardResolveResult::Found(tenant.clone());
}
}
ShardSelector::Known(shard)
if tenant.shard_identity.shard_index() == shard =>
{
return ShardResolveResult::Found(tenant.clone());
}
_ => continue,
}
}

View File

@@ -62,10 +62,14 @@ use super::{
CommandRequest, DownloadCommand,
};
/// For each tenant, the default period that must have passed since the last download_tenant call before
/// calling it again. This default is replaced with the value of [`HeatMapTenant::upload_period_ms`] after the first
/// download, if the uploader populated it.
const DEFAULT_DOWNLOAD_INTERVAL: Duration = Duration::from_millis(60000);
/// For each tenant, how long must have passed since the last download_tenant call before
/// calling it again. This is approximately the time by which local data is allowed
/// to fall behind remote data.
///
/// TODO: this should just be a default, and the actual period should be controlled
/// via the heatmap itself
/// `<ttps://github.com/neondatabase/neon/issues/6200>`
const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000);
/// Range of concurrency we may use when downloading layers within a timeline. This is independent
/// for each tenant we're downloading: the concurrency of _tenants_ is defined separately in
@@ -93,7 +97,7 @@ pub(super) async fn downloader_task(
scheduler
.run(command_queue, background_jobs_can_start, cancel)
.instrument(info_span!("secondary_download_scheduler"))
.instrument(info_span!("secondary_downloads"))
.await
}
@@ -148,22 +152,14 @@ pub(super) struct SecondaryDetailTimeline {
pub(super) evicted_at: HashMap<LayerName, SystemTime>,
}
// Aspects of a heatmap that we remember after downloading it
#[derive(Clone, Debug)]
struct DownloadSummary {
etag: Etag,
#[allow(unused)]
mtime: SystemTime,
upload_period: Duration,
}
/// This state is written by the secondary downloader, it is opaque
/// to TenantManager
#[derive(Debug)]
pub(super) struct SecondaryDetail {
pub(super) config: SecondaryLocationConfig,
last_download: Option<DownloadSummary>,
last_download: Option<Instant>,
last_etag: Option<Etag>,
next_download: Option<Instant>,
pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
}
@@ -193,6 +189,7 @@ impl SecondaryDetail {
Self {
config,
last_download: None,
last_etag: None,
next_download: None,
timelines: HashMap::new(),
}
@@ -246,8 +243,9 @@ impl SecondaryDetail {
struct PendingDownload {
secondary_state: Arc<SecondaryTenant>,
last_download: Option<DownloadSummary>,
last_download: Option<Instant>,
target_time: Option<Instant>,
period: Option<Duration>,
}
impl scheduler::PendingJob for PendingDownload {
@@ -297,17 +295,10 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
tracing::debug!("Secondary tenant download completed");
// Update freshened_at even if there was an error: we don't want errored tenants to implicitly
// take priority to run again.
let mut detail = secondary_state.detail.lock().unwrap();
let period = detail
.last_download
.as_ref()
.map(|d| d.upload_period)
.unwrap_or(DEFAULT_DOWNLOAD_INTERVAL);
// We advance next_download irrespective of errors: we don't want error cases to result in
// expensive busy-polling.
detail.next_download = Some(Instant::now() + period_jitter(period, 5));
detail.next_download = Some(Instant::now() + period_jitter(DOWNLOAD_FRESHEN_INTERVAL, 5));
}
async fn schedule(&mut self) -> SchedulingResult<PendingDownload> {
@@ -340,11 +331,11 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
if detail.next_download.is_none() {
// Initialize randomly in the range from 0 to our interval: this uniformly spreads the start times. Subsequent
// rounds will use a smaller jitter to avoid accidentally synchronizing later.
detail.next_download = Some(now.checked_add(period_warmup(DEFAULT_DOWNLOAD_INTERVAL)).expect(
detail.next_download = Some(now.checked_add(period_warmup(DOWNLOAD_FRESHEN_INTERVAL)).expect(
"Using our constant, which is known to be small compared with clock range",
));
}
(detail.last_download.clone(), detail.next_download.unwrap())
(detail.last_download, detail.next_download.unwrap())
};
if now > next_download {
@@ -352,6 +343,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
secondary_state: secondary_tenant,
last_download,
target_time: Some(next_download),
period: Some(DOWNLOAD_FRESHEN_INTERVAL),
})
} else {
None
@@ -377,6 +369,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
Ok(PendingDownload {
target_time: None,
period: None,
last_download: None,
secondary_state: tenant,
})
@@ -393,6 +386,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
secondary_state,
last_download,
target_time,
period,
} = job;
let (completion, barrier) = utils::completion::channel();
@@ -429,15 +423,20 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
// If the job had a target execution time, we may check our final execution
// time against that for observability purposes.
if let (Some(target_time), Some(last_download)) = (target_time, last_download) {
// Elapsed time includes any scheduling lag as well as the execution of the job
let elapsed = Instant::now().duration_since(target_time);
if let (Some(target_time), Some(period)) = (target_time, period) {
// Only track execution lag if this isn't our first download: otherwise, it is expected
// that execution will have taken longer than our configured interval, for example
// when starting up a pageserver and doing the first download for a tenant.
if last_download.is_some() {
// Elapsed time includes any scheduling lag as well as the execution of the job
let elapsed = Instant::now().duration_since(target_time);
warn_when_period_overrun(
elapsed,
last_download.upload_period,
BackgroundLoopKind::SecondaryDownload,
);
warn_when_period_overrun(
elapsed,
period,
BackgroundLoopKind::SecondaryDownload,
);
}
}
CompleteDownload {
@@ -526,12 +525,12 @@ impl<'a> TenantDownloader<'a> {
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
// We will use the etag from last successful download to make the download conditional on changes
let last_download = self
let last_etag = self
.secondary_state
.detail
.lock()
.unwrap()
.last_download
.last_etag
.clone();
// Download the tenant's heatmap
@@ -540,7 +539,7 @@ impl<'a> TenantDownloader<'a> {
etag: heatmap_etag,
bytes: heatmap_bytes,
} = match tokio::select!(
bytes = self.download_heatmap(last_download.as_ref().map(|d| &d.etag)) => {bytes?},
bytes = self.download_heatmap(last_etag.as_ref()) => {bytes?},
_ = self.secondary_state.cancel.cancelled() => return Ok(())
) {
HeatMapDownload::Unmodified => {
@@ -569,39 +568,6 @@ impl<'a> TenantDownloader<'a> {
heatmap.timelines.len()
);
// Get or initialize the local disk state for the timelines we will update
let mut timeline_states = HashMap::new();
for timeline in &heatmap.timelines {
let timeline_state = self
.secondary_state
.detail
.lock()
.unwrap()
.timelines
.get(&timeline.timeline_id)
.cloned();
let timeline_state = match timeline_state {
Some(t) => t,
None => {
// We have no existing state: need to scan local disk for layers first.
let timeline_state =
init_timeline_state(self.conf, tenant_shard_id, timeline).await;
// Re-acquire detail lock now that we're done with async load from local FS
self.secondary_state
.detail
.lock()
.unwrap()
.timelines
.insert(timeline.timeline_id, timeline_state.clone());
timeline_state
}
};
timeline_states.insert(timeline.timeline_id, timeline_state);
}
// Clean up any local layers that aren't in the heatmap. We do this first for all timelines, on the general
// principle that deletions should be done before writes wherever possible, and so that we can use this
// phase to initialize our SecondaryProgress.
@@ -612,10 +578,6 @@ impl<'a> TenantDownloader<'a> {
// Download the layers in the heatmap
for timeline in heatmap.timelines {
let timeline_state = timeline_states
.remove(&timeline.timeline_id)
.expect("Just populated above");
if self.secondary_state.cancel.is_cancelled() {
tracing::debug!(
"Cancelled before downloading timeline {}",
@@ -625,7 +587,7 @@ impl<'a> TenantDownloader<'a> {
}
let timeline_id = timeline.timeline_id;
self.download_timeline(timeline, timeline_state, ctx)
self.download_timeline(timeline, ctx)
.instrument(tracing::info_span!(
"secondary_download_timeline",
tenant_id=%tenant_shard_id.tenant_id,
@@ -637,30 +599,7 @@ impl<'a> TenantDownloader<'a> {
// Only update last_etag after a full successful download: this way will not skip
// the next download, even if the heatmap's actual etag is unchanged.
self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary {
etag: heatmap_etag,
mtime: heatmap_mtime,
upload_period: heatmap
.upload_period_ms
.map(|ms| Duration::from_millis(ms as u64))
.unwrap_or(DEFAULT_DOWNLOAD_INTERVAL),
});
// Robustness: we should have updated progress properly, but in case we didn't, make sure
// we don't leave the tenant in a state where we claim to have successfully downloaded
// everything, but our progress is incomplete. The invariant here should be that if
// we have set `last_download` to this heatmap's etag, then the next time we see that
// etag we can safely do no work (i.e. we must be complete).
let mut progress = self.secondary_state.progress.lock().unwrap();
debug_assert!(progress.layers_downloaded == progress.layers_total);
debug_assert!(progress.bytes_downloaded == progress.bytes_total);
if progress.layers_downloaded != progress.layers_total
|| progress.bytes_downloaded != progress.bytes_total
{
tracing::warn!("Correcting drift in progress stats ({progress:?})");
progress.layers_downloaded = progress.layers_total;
progress.bytes_downloaded = progress.bytes_total;
}
self.secondary_state.detail.lock().unwrap().last_etag = Some(heatmap_etag);
Ok(())
}
@@ -837,7 +776,6 @@ impl<'a> TenantDownloader<'a> {
async fn download_timeline(
&self,
timeline: HeatMapTimeline,
timeline_state: SecondaryDetailTimeline,
ctx: &RequestContext,
) -> Result<(), UpdateError> {
debug_assert_current_span_has_tenant_and_timeline_id();
@@ -846,6 +784,34 @@ impl<'a> TenantDownloader<'a> {
// Accumulate updates to the state
let mut touched = Vec::new();
// Clone a view of what layers already exist on disk
let timeline_state = self
.secondary_state
.detail
.lock()
.unwrap()
.timelines
.get(&timeline.timeline_id)
.cloned();
let timeline_state = match timeline_state {
Some(t) => t,
None => {
// We have no existing state: need to scan local disk for layers first.
let timeline_state =
init_timeline_state(self.conf, tenant_shard_id, &timeline).await;
// Re-acquire detail lock now that we're done with async load from local FS
self.secondary_state
.detail
.lock()
.unwrap()
.timelines
.insert(timeline.timeline_id, timeline_state.clone());
timeline_state
}
};
tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
let mut download_futs = Vec::new();
@@ -1013,11 +979,6 @@ impl<'a> TenantDownloader<'a> {
);
// Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
tracing::info!(
"Starting download of layer {}, size {}",
layer.name,
layer.metadata.file_size
);
let downloaded_bytes = match download_layer_file(
self.conf,
self.remote_storage,
@@ -1040,14 +1001,6 @@ impl<'a> TenantDownloader<'a> {
"Skipped downloading missing layer {}, raced with compaction/gc?",
layer.name
);
// If the layer is 404, adjust the progress statistics to reflect that we will not download it.
let mut progress = self.secondary_state.progress.lock().unwrap();
progress.layers_total = progress.layers_total.saturating_sub(1);
progress.bytes_total = progress
.bytes_total
.saturating_sub(layer.metadata.file_size);
return Ok(None);
}
Err(e) => return Err(e.into()),
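
The scheduling logic above relies on two timing helpers: an initial delay drawn uniformly from the whole interval so tenant downloads spread out after startup, and a jittered steady-state period so they do not re-synchronize later. A sketch of plausible implementations using the rand crate; the exact semantics of the real `period_warmup` and `period_jitter(_, 5)` are assumptions:

use rand::Rng;
use std::time::Duration;

// Sketch: first run lands uniformly within [0, period).
fn period_warmup(period: Duration) -> Duration {
    period.mul_f64(rand::thread_rng().gen_range(0.0..1.0))
}

// Sketch: steady-state period +/- period/divisor, so repeat runs
// drift apart instead of locking in step.
fn period_jitter(period: Duration, divisor: u32) -> Duration {
    let spread = period / divisor; // e.g. 12s of slack on a 60s period
    period - spread + spread.mul_f64(rand::thread_rng().gen_range(0.0..2.0))
}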

View File

@@ -53,7 +53,7 @@ pub(super) async fn heatmap_uploader_task(
scheduler
.run(command_queue, background_jobs_can_start, cancel)
.instrument(info_span!("heatmap_upload_scheduler"))
.instrument(info_span!("heatmap_uploader"))
.await
}

View File

@@ -179,13 +179,6 @@ where
// Schedule some work, if concurrency limit permits it
self.spawn_pending();
// This message is printed every scheduling iteration as proof of liveness when looking at logs
tracing::info!(
"Status: {} tasks running, {} pending",
self.running.len(),
self.pending.len()
);
// Between scheduling iterations, we will:
// - Drain any complete tasks and spawn pending tasks
// - Handle incoming administrative commands
@@ -265,11 +258,7 @@ where
self.tasks.spawn(fut);
let replaced = self.running.insert(tenant_shard_id, in_progress);
debug_assert!(replaced.is_none());
if replaced.is_some() {
tracing::warn!(%tenant_shard_id, "Unexpectedly spawned a task when one was already running")
}
self.running.insert(tenant_shard_id, in_progress);
}
/// For all pending tenants that are eligible for execution, spawn their task.
@@ -279,9 +268,7 @@ where
while !self.pending.is_empty() && self.running.len() < self.concurrency {
// unwrap: loop condition includes !is_empty()
let pending = self.pending.pop_front().unwrap();
if !self.running.contains_key(pending.get_tenant_shard_id()) {
self.do_spawn(pending);
}
self.do_spawn(pending);
}
}
@@ -334,8 +321,7 @@ where
let tenant_shard_id = job.get_tenant_shard_id();
let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
tracing::info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Command already running, waiting for it");
tracing::info!("Command already running, waiting for it");
barrier
} else {
let running = self.spawn_now(job);
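
The scheduler hunks revolve around two invariants: never run more than `concurrency` jobs at once, and never run the same tenant twice concurrently (the duplicate guard is exactly what one side of this diff adds or drops). A reduced sketch over bare IDs instead of real jobs:

use std::collections::{HashSet, VecDeque};

// Sketch: bounded spawn loop with a duplicate-tenant guard.
struct Scheduler {
    pending: VecDeque<u64>,
    running: HashSet<u64>,
    concurrency: usize,
}

impl Scheduler {
    fn spawn_pending(&mut self) {
        while !self.pending.is_empty() && self.running.len() < self.concurrency {
            // unwrap: loop condition includes !is_empty()
            let id = self.pending.pop_front().unwrap();
            if !self.running.contains(&id) {
                // The real code spawns a task here and records its barrier.
                self.running.insert(id);
            }
        }
    }
}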

View File

@@ -113,20 +113,12 @@ impl From<VectoredValueReconstructState> for ValueReconstructState {
}
}
/// Bag of data accumulated during a vectored get..
/// Bag of data accumulated during a vectored get
pub(crate) struct ValuesReconstructState {
/// The keys will be removed after `get_vectored` completes. The caller outside `Timeline`
/// should not expect to get anything from this hashmap.
pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,
/// The keys which are already retrieved
keys_done: KeySpaceRandomAccum,
/// The keys covered by the image layers
keys_with_image_coverage: Option<Range<Key>>,
// Statistics that remain accessible to callers of `get_vectored_impl`.
layers_visited: u32,
delta_layers_visited: u32,
}
impl ValuesReconstructState {
@@ -134,9 +126,7 @@ impl ValuesReconstructState {
Self {
keys: HashMap::new(),
keys_done: KeySpaceRandomAccum::new(),
keys_with_image_coverage: None,
layers_visited: 0,
delta_layers_visited: 0,
}
}
@@ -150,17 +140,8 @@ impl ValuesReconstructState {
}
}
pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) {
pub(crate) fn on_layer_visited(&mut self) {
self.layers_visited += 1;
if let ReadableLayer::PersistentLayer(layer) = layer {
if layer.layer_desc().is_delta() {
self.delta_layers_visited += 1;
}
}
}
pub(crate) fn get_delta_layers_visited(&self) -> u32 {
self.delta_layers_visited
}
pub(crate) fn get_layers_visited(&self) -> u32 {
@@ -190,16 +171,6 @@ impl ValuesReconstructState {
}
}
/// On hitting an image layer, we can mark all keys in its range as done, because
/// if the image layer does not contain a key, it is deleted/never added.
pub(crate) fn on_image_layer_visited(&mut self, key_range: &Range<Key>) {
let prev_val = self.keys_with_image_coverage.replace(key_range.clone());
assert_eq!(
prev_val, None,
"should consume the keyspace before the next iteration"
);
}
/// Update the state collected for a given key.
/// Returns true if this was the last value needed for the key and false otherwise.
///
@@ -262,12 +233,8 @@ impl ValuesReconstructState {
/// Returns the key space describing the keys that have
/// been marked as completed since the last call to this function.
/// Returns individual keys done, and the image layer coverage.
pub(crate) fn consume_done_keys(&mut self) -> (KeySpace, Option<Range<Key>>) {
(
self.keys_done.consume_keyspace(),
self.keys_with_image_coverage.take(),
)
pub(crate) fn consume_done_keys(&mut self) -> KeySpace {
self.keys_done.consume_keyspace()
}
}
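
The richer side of this diff tracks image-layer coverage because an image layer is a complete snapshot of its key range: any key inside the range that the layer lacks is known not to exist, so the whole range can be marked done in one step. A toy version of that bookkeeping, with u64 stand-ins for real keys:

use std::ops::Range;

// Sketch: range-level "done" tracking fed by image layer visits.
#[derive(Default)]
struct DoneTracker {
    keys_with_image_coverage: Option<Range<u64>>,
}

impl DoneTracker {
    fn on_image_layer_visited(&mut self, key_range: &Range<u64>) {
        // At most one image layer per read iteration; the caller must
        // consume the previous range before visiting the next layer.
        let prev = self.keys_with_image_coverage.replace(key_range.clone());
        assert!(prev.is_none(), "should consume the keyspace before the next iteration");
    }

    // Called once per iteration: hands the covered range back to the
    // read loop, which removes it from the not-yet-mapped keyspace.
    fn consume_image_coverage(&mut self) -> Option<Range<u64>> {
        self.keys_with_image_coverage.take()
    }
}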

View File

@@ -158,7 +158,6 @@ pub struct ImageLayerInner {
index_start_blk: u32,
index_root_blk: u32,
key_range: Range<Key>,
lsn: Lsn,
file: VirtualFile,
@@ -420,7 +419,6 @@ impl ImageLayerInner {
file,
file_id,
max_vectored_read_bytes,
key_range: actual_summary.key_range,
}))
}
@@ -480,8 +478,6 @@ impl ImageLayerInner {
self.do_reads_and_update_state(reads, reconstruct_state, ctx)
.await;
reconstruct_state.on_image_layer_visited(&self.key_range);
Ok(())
}
@@ -650,7 +646,7 @@ impl ImageLayerWriterInner {
lsn,
},
);
trace!("creating image layer {}", path);
info!("new image layer {path}");
let mut file = {
VirtualFile::open_with_options(
&path,
@@ -770,7 +766,7 @@ impl ImageLayerWriterInner {
// FIXME: why not carry the virtualfile here, it supports renaming?
let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
info!("created image layer {}", layer.local_path());
trace!("created image layer {}", layer.local_path());
Ok(layer)
}

View File

@@ -12,7 +12,7 @@ use std::time::{Duration, SystemTime};
use tracing::Instrument;
use utils::id::TimelineId;
use utils::lsn::Lsn;
use utils::sync::{gate, heavier_once_cell};
use utils::sync::heavier_once_cell;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
@@ -1264,7 +1264,6 @@ impl LayerInner {
lsn_end: lsn_range.end,
remote: !resident,
access_stats,
l0: crate::tenant::layer_map::LayerMap::is_l0(self.layer_desc()),
}
} else {
let lsn = self.desc.image_layer_lsn();
@@ -1333,7 +1332,7 @@ impl LayerInner {
is_good_to_continue(&rx.borrow_and_update())?;
let Ok(gate) = timeline.gate.enter() else {
let Ok(_gate) = timeline.gate.enter() else {
return Err(EvictionCancelled::TimelineGone);
};
@@ -1421,7 +1420,7 @@ impl LayerInner {
Self::spawn_blocking(move || {
let _span = span.entered();
let res = self.evict_blocking(&timeline, &gate, &permit);
let res = self.evict_blocking(&timeline, &permit);
let waiters = self.inner.initializer_count();
@@ -1447,7 +1446,6 @@ impl LayerInner {
fn evict_blocking(
&self,
timeline: &Timeline,
_gate: &gate::GateGuard,
_permit: &heavier_once_cell::InitPermit,
) -> Result<(), EvictionCancelled> {
// now accesses to `self.inner.get_or_init*` wait on the semaphore or the `_permit`

View File

@@ -347,33 +347,37 @@ impl<'de> serde::de::Visitor<'de> for LayerNameVisitor {
mod test {
use super::*;
#[test]
fn image_layer_parse() {
fn image_layer_parse() -> anyhow::Result<()> {
let expected = LayerName::Image(ImageLayerName {
key_range: Key::from_i128(0)
..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
lsn: Lsn::from_hex("00000000014FED58").unwrap(),
});
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").unwrap();
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").map_err(|s| anyhow::anyhow!(s))?;
assert_eq!(parsed, expected,);
// Omitting generation suffix is valid
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").unwrap();
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").map_err(|s| anyhow::anyhow!(s))?;
assert_eq!(parsed, expected,);
Ok(())
}
#[test]
fn delta_layer_parse() {
fn delta_layer_parse() -> anyhow::Result<()> {
let expected = LayerName::Delta(DeltaLayerName {
key_range: Key::from_i128(0)
..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
lsn_range: Lsn::from_hex("00000000014FED58").unwrap()
..Lsn::from_hex("000000000154C481").unwrap(),
});
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").unwrap();
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").map_err(|s| anyhow::anyhow!(s))?;
assert_eq!(parsed, expected);
// Omitting generation suffix is valid
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").unwrap();
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").map_err(|s| anyhow::anyhow!(s))?;
assert_eq!(parsed, expected);
Ok(())
}
}
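
The fixtures above pin down the on-disk naming scheme: `{key_start}-{key_end}__{lsn}` for image layers, `{key_start}-{key_end}__{lsn_start}-{lsn_end}` for deltas, plus an optional `-v1-{generation}` suffix that parsing tolerates. A string-level sketch that classifies a name by that shape (real parsing also hex-decodes the keys and LSNs):

// Sketch: classify a layer file name by its shape alone.
enum LayerKind {
    Image,
    Delta,
}

fn classify(name: &str) -> Option<LayerKind> {
    let (keys, rest) = name.split_once("__")?;
    if !keys.contains('-') {
        return None; // expected {key_start}-{key_end}
    }
    // Drop the optional "-v1-{generation}" suffix before counting parts.
    let lsn_part = match rest.split_once("-v1-") {
        Some((head, _generation)) => head,
        None => rest,
    };
    match lsn_part.split('-').count() {
        1 => Some(LayerKind::Image), // single LSN
        2 => Some(LayerKind::Delta), // LSN range
        _ => None,
    }
}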

View File

@@ -18,14 +18,14 @@ use fail::fail_point;
use once_cell::sync::Lazy;
use pageserver_api::{
key::{
AUX_FILES_KEY, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX,
NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE,
AUX_FILES_KEY, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
NON_INHERITED_SPARSE_RANGE,
},
keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
keyspace::{KeySpaceAccum, SparseKeyPartitioning},
models::{
AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, CompactionAlgorithmSettings,
DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
InMemoryLayerInfo, LayerMapInfo, LsnLease, TimelineState,
AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo,
DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo,
TimelineState,
},
reltag::BlockNumber,
shard::{ShardIdentity, ShardNumber, TenantShardId},
@@ -60,6 +60,7 @@ use std::{
ops::ControlFlow,
};
use crate::tenant::timeline::init::LocalLayerFileMetadata;
use crate::{
aux_file::AuxFileSizeEstimator,
tenant::{
@@ -88,9 +89,6 @@ use crate::{
metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize,
};
use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};
use crate::{
pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::timeline::init::LocalLayerFileMetadata,
};
use crate::{
pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind},
virtual_file::{MaybeFatalIo, VirtualFile},
@@ -269,8 +267,6 @@ pub struct Timeline {
// Atomic would be more appropriate here.
last_freeze_ts: RwLock<Instant>,
pub(crate) standby_horizon: AtomicLsn,
// WAL redo manager. `None` only for broken tenants.
walredo_mgr: Option<Arc<super::WalRedoManager>>,
@@ -350,8 +346,8 @@ pub struct Timeline {
// though let's keep them both for better error visibility.
pub initdb_lsn: Lsn,
/// When did we last calculate the partitioning? Made pub for test cases.
pub(super) partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,
/// When did we last calculate the partitioning?
partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,
/// Configuration: how often should the partitioning be recalculated.
repartition_threshold: u64,
@@ -485,11 +481,6 @@ impl GcCutoffs {
}
}
pub(crate) struct TimelineVisitOutcome {
completed_keyspace: KeySpace,
image_covered_keyspace: KeySpace,
}
/// An error happened in a get() operation.
#[derive(thiserror::Error, Debug)]
pub(crate) enum PageReconstructError {
@@ -514,13 +505,6 @@ pub(crate) enum PageReconstructError {
MissingKey(MissingKeyError),
}
impl GetVectoredError {
#[cfg(test)]
pub(crate) fn is_missing_key_error(&self) -> bool {
matches!(self, Self::MissingKey(_))
}
}
#[derive(Debug)]
pub struct MissingKeyError {
key: Key,
@@ -798,11 +782,6 @@ pub(crate) enum ShutdownMode {
Hard,
}
struct ImageLayerCreationOutcome {
image: Option<ResidentLayer>,
next_start_key: Key,
}
/// Public interface functions
impl Timeline {
/// Get the LSN where this branch was created
@@ -904,7 +883,7 @@ impl Timeline {
}
let vectored_res = self
.get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
.get_vectored_impl(keyspace.clone(), lsn, reconstruct_state, ctx)
.await;
if self.conf.validate_vectored_get {
@@ -1049,12 +1028,7 @@ impl Timeline {
}
GetVectoredImpl::Vectored => {
let vectored_res = self
.get_vectored_impl(
keyspace.clone(),
lsn,
&mut ValuesReconstructState::new(),
ctx,
)
.get_vectored_impl(keyspace.clone(), lsn, ValuesReconstructState::new(), ctx)
.await;
if self.conf.validate_vectored_get {
@@ -1142,7 +1116,7 @@ impl Timeline {
.get_vectored_impl(
keyspace.clone(),
lsn,
&mut ValuesReconstructState::default(),
ValuesReconstructState::default(),
ctx,
)
.await;
@@ -1219,7 +1193,7 @@ impl Timeline {
&self,
keyspace: KeySpace,
lsn: Lsn,
reconstruct_state: &mut ValuesReconstructState,
mut reconstruct_state: ValuesReconstructState,
ctx: &RequestContext,
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
let get_kind = if keyspace.total_raw_size() == 1 {
@@ -1231,7 +1205,7 @@ impl Timeline {
let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME
.for_get_kind(get_kind)
.start_timer();
self.get_vectored_reconstruct_data(keyspace, lsn, reconstruct_state, ctx)
self.get_vectored_reconstruct_data(keyspace, lsn, &mut reconstruct_state, ctx)
.await?;
get_data_timer.stop_and_record();
@@ -1240,8 +1214,7 @@ impl Timeline {
.start_timer();
let mut results: BTreeMap<Key, Result<Bytes, PageReconstructError>> = BTreeMap::new();
let layers_visited = reconstruct_state.get_layers_visited();
for (key, res) in std::mem::take(&mut reconstruct_state.keys) {
for (key, res) in reconstruct_state.keys {
match res {
Err(err) => {
results.insert(key, Err(err));
@@ -1532,20 +1505,6 @@ impl Timeline {
Ok(())
}
/// Obtains a temporary lease blocking garbage collection for the given LSN
pub(crate) fn make_lsn_lease(
&self,
_lsn: Lsn,
_ctx: &RequestContext,
) -> anyhow::Result<LsnLease> {
const LEASE_LENGTH: Duration = Duration::from_secs(5 * 60);
let lease = LsnLease {
valid_until: SystemTime::now() + LEASE_LENGTH,
};
// TODO: dummy implementation
Ok(lease)
}
/// Flush to disk all data that was written with the put_* functions
#[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> {
@@ -1700,7 +1659,7 @@ impl Timeline {
return Ok(());
}
match self.get_compaction_algorithm_settings().kind {
match self.get_compaction_algorithm() {
CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await,
CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await,
}
@@ -2096,14 +2055,12 @@ impl Timeline {
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
}
fn get_compaction_algorithm_settings(&self) -> CompactionAlgorithmSettings {
fn get_compaction_algorithm(&self) -> CompactionAlgorithm {
let tenant_conf = &self.tenant_conf.load();
tenant_conf
.tenant_conf
.compaction_algorithm
.as_ref()
.unwrap_or(&self.conf.default_tenant_conf.compaction_algorithm)
.clone()
.unwrap_or(self.conf.default_tenant_conf.compaction_algorithm)
}
fn get_eviction_policy(&self) -> EvictionPolicy {
@@ -2297,8 +2254,6 @@ impl Timeline {
compaction_lock: tokio::sync::Mutex::default(),
gc_lock: tokio::sync::Mutex::default(),
standby_horizon: AtomicLsn::new(0),
timeline_get_throttle: resources.timeline_get_throttle,
aux_files: tokio::sync::Mutex::new(AuxFilesState {
@@ -3332,15 +3287,12 @@ impl Timeline {
let mut cont_lsn = Lsn(request_lsn.0 + 1);
let missing_keyspace = loop {
loop {
if self.cancel.is_cancelled() {
return Err(GetVectoredError::Cancelled);
}
let TimelineVisitOutcome {
completed_keyspace: completed,
image_covered_keyspace,
} = Self::get_vectored_reconstruct_data_timeline(
let completed = Self::get_vectored_reconstruct_data_timeline(
timeline,
keyspace.clone(),
cont_lsn,
@@ -3359,31 +3311,12 @@ impl Timeline {
ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE],
});
// Keyspace is fully retrieved
if keyspace.is_empty() {
break None;
// Keyspace is fully retrieved, no ancestor timeline, or metadata scan (where we do not look
// into ancestor timelines). TODO: is there any other metadata which we want to inherit?
if keyspace.total_raw_size() == 0 || timeline.ancestor_timeline.is_none() {
break;
}
// Not fully retrieved but no ancestor timeline.
if timeline.ancestor_timeline.is_none() {
break Some(keyspace);
}
// Now we see if there are keys covered by the image layer but does not exist in the
// image layer, which means that the key does not exist.
// The block below will stop the vectored search if any of the keys encountered an image layer
// which did not contain a snapshot for said key. Since we have already removed all completed
// keys from `keyspace`, we expect there to be no overlap between it and the image covered key
// space. If that's not the case, we had at least one key encounter a gap in the image layer
// and stop the search as a result of that.
let removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
if !removed.is_empty() {
break Some(removed);
}
// If we reached this point, `remove_overlapping_with` should not have made any change to the
// keyspace.
// Take the min to avoid reconstructing a page with data newer than request Lsn.
cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1));
timeline_owned = timeline
@@ -3391,14 +3324,14 @@ impl Timeline {
.await
.map_err(GetVectoredError::GetReadyAncestorError)?;
timeline = &*timeline_owned;
};
}
if let Some(missing_keyspace) = missing_keyspace {
if keyspace.total_raw_size() != 0 {
return Err(GetVectoredError::MissingKey(MissingKeyError {
key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */
key: keyspace.start().unwrap(), /* better if we can store the full keyspace */
shard: self
.shard_identity
.get_shard_number(&missing_keyspace.start().unwrap()),
.get_shard_number(&keyspace.start().unwrap()),
cont_lsn,
request_lsn,
ancestor_lsn: Some(timeline.ancestor_lsn),
@@ -3423,9 +3356,6 @@ impl Timeline {
///
/// At each iteration pop the top of the fringe (the layer with the highest Lsn)
/// and get all the required reconstruct data from the layer in one go.
///
/// Returns the completed keyspace and the keyspaces with image coverage. The caller
/// decides how to deal with these two keyspaces.
async fn get_vectored_reconstruct_data_timeline(
timeline: &Timeline,
keyspace: KeySpace,
@@ -3433,27 +3363,20 @@ impl Timeline {
reconstruct_state: &mut ValuesReconstructState,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<TimelineVisitOutcome, GetVectoredError> {
) -> Result<KeySpace, GetVectoredError> {
let mut unmapped_keyspace = keyspace.clone();
let mut fringe = LayerFringe::new();
let mut completed_keyspace = KeySpace::default();
let mut image_covered_keyspace = KeySpaceRandomAccum::new();
loop {
if cancel.is_cancelled() {
return Err(GetVectoredError::Cancelled);
}
let (keys_done_last_step, keys_with_image_coverage) =
reconstruct_state.consume_done_keys();
let keys_done_last_step = reconstruct_state.consume_done_keys();
unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
completed_keyspace.merge(&keys_done_last_step);
if let Some(keys_with_image_coverage) = keys_with_image_coverage {
unmapped_keyspace
.remove_overlapping_with(&KeySpace::single(keys_with_image_coverage.clone()));
image_covered_keyspace.add_range(keys_with_image_coverage);
}
// Do not descent any further if the last layer we visited
// completed all keys in the keyspace it inspected. This is not
@@ -3525,16 +3448,13 @@ impl Timeline {
unmapped_keyspace = keyspace_to_read;
cont_lsn = next_cont_lsn;
reconstruct_state.on_layer_visited(&layer_to_read);
reconstruct_state.on_layer_visited();
} else {
break;
}
}
Ok(TimelineVisitOutcome {
completed_keyspace,
image_covered_keyspace: image_covered_keyspace.consume_keyspace(),
})
Ok(completed_keyspace)
}
/// # Cancel-safety
@@ -3882,7 +3802,7 @@ impl Timeline {
}
}
let (layers_to_upload, delta_layers_to_add) = if create_image_layer {
let (layers_to_upload, delta_layer_to_add) = if create_image_layer {
// Note: The 'ctx' in use here has DownloadBehavior::Error. We should not
// require downloading anything during initial import.
let ((rel_partition, metadata_partition), _lsn) = self
@@ -3899,23 +3819,26 @@ impl Timeline {
}
// For metadata, always create delta layers.
let delta_layers = if !metadata_partition.parts.is_empty() {
// In the current implementation, the metadata partition will only have one part, and the part will only have
// one single key range. This might change in the future.
let mut delta_layers_created = Vec::new();
for ks in &metadata_partition.parts {
for range in &ks.0.ranges {
let layer = self
.create_delta_layer(&frozen_layer, Some(range.clone()), ctx)
.await?;
if let Some(layer) = layer {
delta_layers_created.push(layer);
}
}
}
delta_layers_created
let delta_layer = if !metadata_partition.parts.is_empty() {
assert_eq!(
metadata_partition.parts.len(),
1,
"currently sparse keyspace should only contain a single aux file keyspace"
);
let metadata_keyspace = &metadata_partition.parts[0];
assert_eq!(
metadata_keyspace.0.ranges.len(),
1,
"aux file keyspace should be a single range"
);
self.create_delta_layer(
&frozen_layer,
Some(metadata_keyspace.0.ranges[0].clone()),
ctx,
)
.await?
} else {
Vec::new()
None
};
// For image layers, we add them immediately into the layer map.
@@ -3930,8 +3853,12 @@ impl Timeline {
.await?,
);
layers_to_upload.extend(delta_layers.iter().cloned());
(layers_to_upload, delta_layers)
if let Some(delta_layer) = delta_layer {
layers_to_upload.push(delta_layer.clone());
(layers_to_upload, Some(delta_layer))
} else {
(layers_to_upload, None)
}
} else {
// Normal case, write out a L0 delta layer file.
// `create_delta_layer` will not modify the layer map.
@@ -3939,7 +3866,12 @@ impl Timeline {
let Some(layer) = self.create_delta_layer(&frozen_layer, None, ctx).await? else {
panic!("delta layer cannot be empty if no filter is applied");
};
(vec![layer.clone()], vec![layer])
(
// FIXME: even though we have a single image and single delta layer assumption
// we push them to vec
vec![layer.clone()],
Some(layer),
)
};
pausable_failpoint!("flush-layer-cancel-after-writing-layer-out-pausable");
@@ -3960,7 +3892,7 @@ impl Timeline {
return Err(FlushLayerError::Cancelled);
}
guard.finish_flush_l0_layer(&delta_layers_to_add, &frozen_layer, &self.metrics);
guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics);
if self.set_disk_consistent_lsn(disk_consistent_lsn) {
// Schedule remote uploads that will reflect our new disk_consistent_lsn
@@ -4202,176 +4134,6 @@ impl Timeline {
false
}
/// Create image layers for Postgres data. Assumes the caller passes a partition that is not too large,
/// so that at most one image layer will be produced from this function.
async fn create_image_layer_for_rel_blocks(
self: &Arc<Self>,
partition: &KeySpace,
mut image_layer_writer: ImageLayerWriter,
lsn: Lsn,
ctx: &RequestContext,
img_range: Range<Key>,
start: Key,
) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
let mut wrote_keys = false;
let mut key_request_accum = KeySpaceAccum::new();
for range in &partition.ranges {
let mut key = range.start;
while key < range.end {
// Decide whether to retain this key: usually we do, but sharded tenants may
// need to drop keys that don't belong to them. If we retain the key, add it
// to `key_request_accum` for later issuing a vectored get
if self.shard_identity.is_key_disposable(&key) {
debug!(
"Dropping key {} during compaction (it belongs on shard {:?})",
key,
self.shard_identity.get_shard_number(&key)
);
} else {
key_request_accum.add_key(key);
}
let last_key_in_range = key.next() == range.end;
key = key.next();
// Maybe flush `key_rest_accum`
if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS
|| (last_key_in_range && key_request_accum.raw_size() > 0)
{
let results = self
.get_vectored(key_request_accum.consume_keyspace(), lsn, ctx)
.await?;
for (img_key, img) in results {
let img = match img {
Ok(img) => img,
Err(err) => {
// If we fail to reconstruct a VM or FSM page, we can zero the
// page without losing any actual user data. That seems better
// than failing repeatedly and getting stuck.
//
// We had a bug at one point, where we truncated the FSM and VM
// in the pageserver, but the Postgres didn't know about that
// and continued to generate incremental WAL records for pages
// that didn't exist in the pageserver. Trying to replay those
// WAL records failed to find the previous image of the page.
// This special case allows us to recover from that situation.
// See https://github.com/neondatabase/neon/issues/2601.
//
// Unfortunately we cannot do this for the main fork, or for
// any metadata keys, keys, as that would lead to actual data
// loss.
if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key) {
warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
ZERO_PAGE.clone()
} else {
return Err(CreateImageLayersError::PageReconstructError(err));
}
}
};
// Write all the keys we just read into our new image layer.
image_layer_writer.put_image(img_key, img, ctx).await?;
wrote_keys = true;
}
}
}
}
if wrote_keys {
// Normal path: we have written some data into the new image layer for this
// partition, so flush it to disk.
let image_layer = image_layer_writer.finish(self, ctx).await?;
Ok(ImageLayerCreationOutcome {
image: Some(image_layer),
next_start_key: img_range.end,
})
} else {
// Special case: the image layer may be empty if this is a sharded tenant and the
// partition does not cover any keys owned by this shard. In this case, to ensure
// we don't leave gaps between image layers, leave `start` where it is, so that the next
// layer we write will cover the key range that we just scanned.
tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
Ok(ImageLayerCreationOutcome {
image: None,
next_start_key: start,
})
}
}
/// Create an image layer for metadata keys. This function produces one image layer for all metadata
/// keys for now. Because metadata keys cannot exceed basebackup size limit, the image layer for it
/// would not be too large to fit in a single image layer.
#[allow(clippy::too_many_arguments)]
async fn create_image_layer_for_metadata_keys(
self: &Arc<Self>,
partition: &KeySpace,
mut image_layer_writer: ImageLayerWriter,
lsn: Lsn,
ctx: &RequestContext,
img_range: Range<Key>,
mode: ImageLayerCreationMode,
) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
assert!(!matches!(mode, ImageLayerCreationMode::Initial));
// Metadata keys image layer creation.
let mut reconstruct_state = ValuesReconstructState::default();
let data = self
.get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx)
.await?;
let (data, total_kb_retrieved, total_key_retrieved) = {
let mut new_data = BTreeMap::new();
let mut total_kb_retrieved = 0;
let mut total_key_retrieved = 0;
for (k, v) in data {
let v = v.map_err(CreateImageLayersError::PageReconstructError)?;
total_kb_retrieved += KEY_SIZE + v.len();
total_key_retrieved += 1;
new_data.insert(k, v);
}
(new_data, total_kb_retrieved / 1024, total_key_retrieved)
};
let delta_file_accessed = reconstruct_state.get_delta_layers_visited();
let trigger_generation = delta_file_accessed as usize >= MAX_AUX_FILE_V2_DELTAS;
debug!(
"generate image layers for metadata keys: trigger_generation={trigger_generation}, \
delta_file_accessed={delta_file_accessed}, total_kb_retrieved={total_kb_retrieved}, \
total_key_retrieved={total_key_retrieved}"
);
if !trigger_generation && mode == ImageLayerCreationMode::Try {
return Ok(ImageLayerCreationOutcome {
image: None,
next_start_key: img_range.end,
});
}
let has_keys = !data.is_empty();
for (k, v) in data {
// Even if the value is empty (deleted), we do not delete it for now until we can ensure vectored get
// considers this situation properly.
// if v.is_empty() {
// continue;
// }
// No need to handle sharding b/c metadata keys are always on the 0-th shard.
// TODO: split image layers to avoid too large layer files. Too large image files are not handled
// on the normal data path either.
image_layer_writer.put_image(k, v, ctx).await?;
}
Ok(ImageLayerCreationOutcome {
image: if has_keys {
let image_layer = image_layer_writer.finish(self, ctx).await?;
Some(image_layer)
} else {
tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
None
},
next_start_key: img_range.end,
})
}
#[tracing::instrument(skip_all, fields(%lsn, %mode))]
async fn create_image_layers(
self: &Arc<Timeline>,
@@ -4413,17 +4175,19 @@ impl Timeline {
for partition in partitioning.parts.iter() {
let img_range = start..partition.ranges.last().unwrap().end;
let compact_metadata = partition.overlaps(&Key::metadata_key_range());
if compact_metadata {
for range in &partition.ranges {
assert!(
range.start.field1 >= METADATA_KEY_BEGIN_PREFIX
&& range.end.field1 <= METADATA_KEY_END_PREFIX,
"metadata keys must be partitioned separately"
);
}
if mode == ImageLayerCreationMode::Initial {
return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
if partition.overlaps(&Key::metadata_key_range()) {
// TODO(chi): The next patch will correctly create image layers for metadata keys, and it would be a
// rather big change. Keep this patch small for now.
match mode {
ImageLayerCreationMode::Force | ImageLayerCreationMode::Try => {
// skip image layer creation anyways for metadata keys.
start = img_range.end;
continue;
}
ImageLayerCreationMode::Initial => {
return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
}
}
} else if let ImageLayerCreationMode::Try = mode {
// check_for_image_layers = false -> skip
@@ -4434,7 +4198,7 @@ impl Timeline {
}
}
let image_layer_writer = ImageLayerWriter::new(
let mut image_layer_writer = ImageLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
@@ -4450,39 +4214,87 @@ impl Timeline {
)))
});
if !compact_metadata {
let ImageLayerCreationOutcome {
image,
next_start_key,
} = self
.create_image_layer_for_rel_blocks(
partition,
image_layer_writer,
lsn,
ctx,
img_range,
start,
)
.await?;
let mut wrote_keys = false;
start = next_start_key;
image_layers.extend(image);
let mut key_request_accum = KeySpaceAccum::new();
for range in &partition.ranges {
let mut key = range.start;
while key < range.end {
// Decide whether to retain this key: usually we do, but sharded tenants may
// need to drop keys that don't belong to them. If we retain the key, add it
// to `key_request_accum` for later issuing a vectored get
if self.shard_identity.is_key_disposable(&key) {
debug!(
"Dropping key {} during compaction (it belongs on shard {:?})",
key,
self.shard_identity.get_shard_number(&key)
);
} else {
key_request_accum.add_key(key);
}
let last_key_in_range = key.next() == range.end;
key = key.next();
// Maybe flush `key_rest_accum`
if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS
|| (last_key_in_range && key_request_accum.raw_size() > 0)
{
let results = self
.get_vectored(key_request_accum.consume_keyspace(), lsn, ctx)
.await?;
for (img_key, img) in results {
let img = match img {
Ok(img) => img,
Err(err) => {
// If we fail to reconstruct a VM or FSM page, we can zero the
// page without losing any actual user data. That seems better
// than failing repeatedly and getting stuck.
//
// We had a bug at one point, where we truncated the FSM and VM
// in the pageserver, but the Postgres didn't know about that
// and continued to generate incremental WAL records for pages
// that didn't exist in the pageserver. Trying to replay those
// WAL records failed to find the previous image of the page.
// This special case allows us to recover from that situation.
// See https://github.com/neondatabase/neon/issues/2601.
//
// Unfortunately we cannot do this for the main fork, or for
// any metadata keys, keys, as that would lead to actual data
// loss.
if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key)
{
warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
ZERO_PAGE.clone()
} else {
return Err(CreateImageLayersError::PageReconstructError(
err,
));
}
}
};
// Write all the keys we just read into our new image layer.
image_layer_writer.put_image(img_key, img, ctx).await?;
wrote_keys = true;
}
}
}
}
if wrote_keys {
// Normal path: we have written some data into the new image layer for this
// partition, so flush it to disk.
start = img_range.end;
let image_layer = image_layer_writer.finish(self, ctx).await?;
image_layers.push(image_layer);
} else {
let ImageLayerCreationOutcome {
image,
next_start_key,
} = self
.create_image_layer_for_metadata_keys(
partition,
image_layer_writer,
lsn,
ctx,
img_range,
mode,
)
.await?;
start = next_start_key;
image_layers.extend(image);
// Special case: the image layer may be empty if this is a sharded tenant and the
// partition does not cover any keys owned by this shard. In this case, to ensure
// we don't leave gaps between image layers, leave `start` where it is, so that the next
// layer we write will cover the key range that we just scanned.
tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
}
}
@@ -4596,14 +4408,6 @@ impl Timeline {
) -> Result<Vec<TimelineId>, anyhow::Error> {
detach_ancestor::complete(self, tenant, prepared, ctx).await
}
/// Switch aux file policy and schedule upload to the index part.
pub(crate) fn do_switch_aux_policy(&self, policy: AuxFilePolicy) -> anyhow::Result<()> {
self.last_aux_file_policy.store(Some(policy));
self.remote_client
.schedule_index_upload_for_aux_file_policy_update(Some(policy))?;
Ok(())
}
}
/// Top-level failure to compact.
@@ -4860,32 +4664,7 @@ impl Timeline {
(horizon_cutoff, pitr_cutoff, retain_lsns)
};
let mut new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);
let standby_horizon = self.standby_horizon.load();
// Hold GC for the standby, but as a safety guard do it only within some
// reasonable lag.
if standby_horizon != Lsn::INVALID {
if let Some(standby_lag) = new_gc_cutoff.checked_sub(standby_horizon) {
const MAX_ALLOWED_STANDBY_LAG: u64 = 10u64 << 30; // 10 GB
if standby_lag.0 < MAX_ALLOWED_STANDBY_LAG {
new_gc_cutoff = Lsn::min(standby_horizon, new_gc_cutoff);
trace!("holding off GC for standby apply LSN {}", standby_horizon);
} else {
warn!(
"standby is lagging for more than {}MB, not holding gc for it",
MAX_ALLOWED_STANDBY_LAG / 1024 / 1024
)
}
}
}
// Reset standby horizon to ignore it if it is not updated till next GC.
// It is an easy way to unset it when standby disappears without adding
// more conf options.
self.standby_horizon.store(Lsn::INVALID);
self.metrics
.standby_horizon_gauge
.set(Lsn::INVALID.0 as i64);
let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);
let res = self
.gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff)

View File

@@ -9,10 +9,7 @@ use std::ops::{Deref, Range};
use std::sync::Arc;
use super::layer_manager::LayerManager;
use super::{
CompactFlags, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
RecordedDuration, Timeline,
};
use super::{CompactFlags, DurationRecorder, ImageLayerCreationMode, RecordedDuration, Timeline};
use anyhow::{anyhow, Context};
use enumset::EnumSet;
@@ -25,13 +22,14 @@ use tracing::{debug, info, info_span, trace, warn, Instrument};
use utils::id::TimelineId;
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
use crate::page_cache;
use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome};
use crate::tenant::timeline::{drop_rlock, is_rel_fsm_block_key, is_rel_vm_block_key, Hole};
use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter};
use crate::tenant::timeline::{Layer, ResidentLayer};
use crate::tenant::DeltaLayer;
use crate::tenant::PageReconstructError;
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
use crate::{page_cache, ZERO_PAGE};
use crate::keyspace::KeySpace;
use crate::repository::Key;
@@ -118,13 +116,9 @@ impl Timeline {
// 3. Create new image layers for partitions that have been modified
// "enough".
let mut partitioning = dense_partitioning;
partitioning
.parts
.extend(sparse_partitioning.into_dense().parts);
let image_layers = self
let dense_layers = self
.create_image_layers(
&partitioning,
&dense_partitioning,
lsn,
if flags.contains(CompactFlags::ForceImageLayerCreation) {
ImageLayerCreationMode::Force
@@ -136,8 +130,24 @@ impl Timeline {
.await
.map_err(anyhow::Error::from)?;
self.upload_new_image_layers(image_layers)?;
partitioning.parts.len()
// For now, nothing will be produced...
let sparse_layers = self
.create_image_layers(
&sparse_partitioning.clone().into_dense(),
lsn,
if flags.contains(CompactFlags::ForceImageLayerCreation) {
ImageLayerCreationMode::Force
} else {
ImageLayerCreationMode::Try
},
&image_ctx,
)
.await
.map_err(anyhow::Error::from)?;
assert!(sparse_layers.is_empty());
self.upload_new_image_layers(dense_layers)?;
dense_partitioning.parts.len()
}
Err(err) => {
// no partitioning? This is normal, if the timeline was just created
@@ -489,11 +499,8 @@ impl Timeline {
for &DeltaEntry { key: next_key, .. } in all_keys.iter() {
if let Some(prev_key) = prev {
// just first fast filter, do not create hole entries for metadata keys. The last hole in the
// compaction is the gap between data key and metadata keys.
if next_key.to_i128() - prev_key.to_i128() >= min_hole_range
&& !Key::is_metadata_key(&prev_key)
{
// just first fast filter
if next_key.to_i128() - prev_key.to_i128() >= min_hole_range {
let key_range = prev_key..next_key;
// Measuring hole by just subtraction of i128 representation of key range boundaries
// has not so much sense, because largest holes will corresponds field1/field2 changes.
@@ -1152,10 +1159,10 @@ impl TimelineAdaptor {
lsn: Lsn,
key_range: &Range<Key>,
ctx: &RequestContext,
) -> Result<(), CreateImageLayersError> {
) -> Result<(), PageReconstructError> {
let timer = self.timeline.metrics.create_images_time_histo.start_timer();
let image_layer_writer = ImageLayerWriter::new(
let mut image_layer_writer = ImageLayerWriter::new(
self.timeline.conf,
self.timeline.timeline_id,
self.timeline.tenant_shard_id,
@@ -1166,34 +1173,47 @@ impl TimelineAdaptor {
.await?;
fail_point!("image-layer-writer-fail-before-finish", |_| {
Err(CreateImageLayersError::Other(anyhow::anyhow!(
Err(PageReconstructError::Other(anyhow::anyhow!(
"failpoint image-layer-writer-fail-before-finish"
)))
});
let keyspace = KeySpace {
ranges: self.get_keyspace(key_range, lsn, ctx).await?,
};
// TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly
let start = Key::MIN;
let ImageLayerCreationOutcome {
image,
next_start_key: _,
} = self
.timeline
.create_image_layer_for_rel_blocks(
&keyspace,
image_layer_writer,
lsn,
ctx,
key_range.clone(),
start,
)
.await?;
if let Some(image_layer) = image {
self.new_images.push(image_layer);
let keyspace_ranges = self.get_keyspace(key_range, lsn, ctx).await?;
for range in &keyspace_ranges {
let mut key = range.start;
while key < range.end {
let img = match self.timeline.get(key, lsn, ctx).await {
Ok(img) => img,
Err(err) => {
// If we fail to reconstruct a VM or FSM page, we can zero the
// page without losing any actual user data. That seems better
// than failing repeatedly and getting stuck.
//
// We had a bug at one point, where we truncated the FSM and VM
// in the pageserver, but the Postgres didn't know about that
// and continued to generate incremental WAL records for pages
// that didn't exist in the pageserver. Trying to replay those
// WAL records failed to find the previous image of the page.
// This special case allows us to recover from that situation.
// See https://github.com/neondatabase/neon/issues/2601.
//
// Unfortunately we cannot do this for the main fork, or for
// any metadata keys, keys, as that would lead to actual data
// loss.
if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) {
warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}");
ZERO_PAGE.clone()
} else {
return Err(err);
}
}
};
image_layer_writer.put_image(key, img, ctx).await?;
key = key.next();
}
}
let image_layer = image_layer_writer.finish(&self.timeline, ctx).await?;
self.new_images.push(image_layer);
timer.stop_and_record();

View File

@@ -12,7 +12,7 @@ use crate::{
};
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn};
use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn};
#[derive(Debug, thiserror::Error)]
pub(crate) enum Error {
@@ -41,27 +41,6 @@ pub(crate) enum Error {
Unexpected(#[source] anyhow::Error),
}
impl From<Error> for ApiError {
fn from(value: Error) -> Self {
match value {
e @ Error::NoAncestor => ApiError::Conflict(e.to_string()),
// TODO: ApiError converts the anyhow using debug formatting ... just stop using ApiError?
e @ Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", e)),
Error::ShuttingDown => ApiError::ShuttingDown,
Error::OtherTimelineDetachOngoing(_) => {
ApiError::ResourceUnavailable("other timeline detach is already ongoing".into())
}
// All of these contain shutdown errors, in fact, it's the most common
e @ Error::FlushAncestor(_)
| e @ Error::RewrittenDeltaDownloadFailed(_)
| e @ Error::CopyDeltaPrefix(_)
| e @ Error::UploadRewritten(_)
| e @ Error::CopyFailed(_)
| e @ Error::Unexpected(_) => ApiError::InternalServerError(e.into()),
}
}
}
pub(crate) struct PreparedTimelineDetach {
layers: Vec<Layer>,
}
@@ -96,11 +75,6 @@ pub(super) async fn prepare(
.as_ref()
.map(|tl| (tl.clone(), detached.ancestor_lsn))
else {
// TODO: check if we have already been detached; for this we need to read the stored data
// on remote client, for that we need a follow-up which makes uploads cheaper and maintains
// a projection of the commited data.
//
// the error is wrong per openapi
return Err(NoAncestor);
};
@@ -110,7 +84,7 @@ pub(super) async fn prepare(
if ancestor.ancestor_timeline.is_some() {
// non-technical requirement; we could flatten N ancestors just as easily but we chose
// not to, at least initially
// not to
return Err(TooManyAncestors);
}

View File

@@ -166,7 +166,7 @@ impl LayerManager {
/// Flush a frozen layer and add the written delta layer to the layer map.
pub(crate) fn finish_flush_l0_layer(
&mut self,
delta_layers: &[ResidentLayer],
delta_layer: Option<&ResidentLayer>,
frozen_layer_for_check: &Arc<InMemoryLayer>,
metrics: &TimelineMetrics,
) {
@@ -181,12 +181,10 @@ impl LayerManager {
// layer to disk at the same time, that would not work.
assert_eq!(Arc::as_ptr(&inmem), Arc::as_ptr(frozen_layer_for_check));
if !delta_layers.is_empty() {
if let Some(l) = delta_layer {
let mut updates = self.layer_map.batch_update();
for l in delta_layers {
Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
metrics.record_new_file_metrics(l.layer_desc().file_size);
}
Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
metrics.record_new_file_metrics(l.layer_desc().file_size);
updates.flush();
}
}

View File

@@ -705,7 +705,6 @@ impl ConnectionManagerState {
commit_lsn: info.commit_lsn,
safekeeper_connstr: info.safekeeper_connstr,
availability_zone: info.availability_zone,
standby_horizon: info.standby_horizon,
}
}
MessageType::SafekeeperDiscoveryResponse => {
@@ -726,21 +725,6 @@ impl ConnectionManagerState {
WALRECEIVER_BROKER_UPDATES.inc();
trace!(
"safekeeper info update: standby_horizon(cutoff)={}",
timeline_update.standby_horizon
);
if timeline_update.standby_horizon != 0 {
// ignore reports from safekeepers not connected to replicas
self.timeline
.standby_horizon
.store(Lsn(timeline_update.standby_horizon));
self.timeline
.metrics
.standby_horizon_gauge
.set(timeline_update.standby_horizon as i64);
}
let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
let old_entry = self.wal_stream_candidates.insert(
new_safekeeper_id,
@@ -1110,7 +1094,6 @@ mod tests {
commit_lsn,
safekeeper_connstr: safekeeper_connstr.to_owned(),
availability_zone: None,
standby_horizon: 0,
},
latest_update,
}

View File

@@ -1,78 +0,0 @@
From 0b0194a57bd0f3598bd57dbedd0df3932330169d Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Fri, 2 Feb 2024 22:26:45 +0200
Subject: [PATCH 1/1] Make v0.6.0 work with Neon
Now that the WAL-logging happens as a separate step at the end of the
build, we need a few neon-specific hints to make it work.
---
src/hnswbuild.c | 36 ++++++++++++++++++++++++++++++++++++
1 file changed, 36 insertions(+)
diff --git a/src/hnswbuild.c b/src/hnswbuild.c
index 680789b..ec54dea 100644
--- a/src/hnswbuild.c
+++ b/src/hnswbuild.c
@@ -840,9 +840,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false);
+#ifdef NEON_SMGR
+ smgr_start_unlogged_build(RelationGetSmgr(indexRel));
+#endif
+
/* Perform inserts */
HnswParallelScanAndInsert(heapRel, indexRel, hnswshared, hnswarea, false);
+#ifdef NEON_SMGR
+ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(indexRel));
+#endif
+
/* Close relations within worker */
index_close(indexRel, indexLockmode);
table_close(heapRel, heapLockmode);
@@ -1089,13 +1097,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
SeedRandom(42);
#endif
+#ifdef NEON_SMGR
+ smgr_start_unlogged_build(RelationGetSmgr(index));
+#endif
+
InitBuildState(buildstate, heap, index, indexInfo, forkNum);
BuildGraph(buildstate, forkNum);
+#ifdef NEON_SMGR
+ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
+#endif
+
if (RelationNeedsWAL(index))
+ {
log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true);
+#ifdef NEON_SMGR
+ {
+#if PG_VERSION_NUM >= 160000
+ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
+#else
+ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
+#endif
+
+ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator,
+ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
+ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM);
+ }
+#endif
+ }
+
+#ifdef NEON_SMGR
+ smgr_end_unlogged_build(RelationGetSmgr(index));
+#endif
+
FreeBuildState(buildstate);
}
--
2.39.2

View File

@@ -49,7 +49,7 @@ char *neon_auth_token;
int readahead_buffer_size = 128;
int flush_every_n_requests = 8;
int neon_protocol_version = 2;
int neon_protocol_version = 1;
static int n_reconnect_attempts = 0;
static int max_reconnect_attempts = 60;
@@ -860,7 +860,7 @@ pg_init_libpagestore(void)
"Version of compute<->page server protocol",
NULL,
&neon_protocol_version,
2, /* use protocol version 2 */
1, /* default to old protocol for now */
1, /* min */
2, /* max */
PGC_SU_BACKEND,

View File

@@ -45,7 +45,6 @@
*/
#include "postgres.h"
#include "access/parallel.h"
#include "access/xact.h"
#include "access/xlog.h"
#include "access/xlogdefs.h"
@@ -1349,10 +1348,6 @@ PageIsEmptyHeapPage(char *buffer)
return memcmp(buffer, empty_page.data, BLCKSZ) == 0;
}
/*
* A page is being evicted from the shared buffer cache. Update the
* last-written LSN of the page, and WAL-log it if needed.
*/
static void
#if PG_MAJORVERSION_NUM < 16
neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
@@ -1361,7 +1356,12 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
#endif
{
XLogRecPtr lsn = PageGetLSN((Page) buffer);
bool log_page;
if (ShutdownRequestPending)
return;
/* Don't log any pages if we're not allowed to do so. */
if (!XLogInsertAllowed())
return;
/*
* Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM
@@ -1370,21 +1370,9 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
* correctness, the non-logged updates are not critical. But we want to
* have a reasonably up-to-date VM and FSM in the page server.
*/
log_page = false;
if (force)
{
Assert(XLogInsertAllowed());
log_page = true;
}
else if (XLogInsertAllowed() &&
!ShutdownRequestPending &&
(forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM))
{
log_page = true;
}
if (log_page)
if ((force || forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM) && !RecoveryInProgress())
{
/* FSM is never WAL-logged and we don't care. */
XLogRecPtr recptr;
recptr = log_newpage_copy(&InfoFromSMgrRel(reln), forknum, blocknum,
@@ -1397,8 +1385,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, LSN_FORMAT_ARGS(lsn))));
}
if (lsn == InvalidXLogRecPtr)
else if (lsn == InvalidXLogRecPtr)
{
/*
* When PostgreSQL extends a relation, it calls smgrextend() with an
@@ -1434,31 +1421,19 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum)));
}
else if (forknum != FSM_FORKNUM && forknum != VISIBILITYMAP_FORKNUM)
else
{
/*
* Its a bad sign if there is a page with zero LSN in the buffer
* cache in a standby, too. However, PANICing seems like a cure
* worse than the disease, as the damage has likely already been
* done in the primary. So in a standby, make this an assertion,
* and in a release build just LOG the error and soldier on. We
* update the last-written LSN of the page with a conservative
* value in that case, which is the last replayed LSN.
*/
ereport(RecoveryInProgress() ? LOG : PANIC,
ereport(PANIC,
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
blocknum,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum)));
Assert(false);
lsn = GetXLogReplayRecPtr(NULL); /* in standby mode, soldier on */
}
}
else
{
ereport(SmgrTrace,
(errmsg(NEON_TAG "Evicting page %u of relation %u/%u/%u.%u with lsn=%X/%X",
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
blocknum,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, LSN_FORMAT_ARGS(lsn))));
@@ -1551,92 +1526,8 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
if (RecoveryInProgress())
{
/*---
* In broad strokes, a replica always requests the page at the current
* replay LSN. But looking closer, what exactly is the replay LSN? Is
* it the last replayed record, or the record being replayed? And does
* the startup process performing the replay need to do something
* differently than backends running queries? Let's take a closer look
* at the different scenarios:
*
* 1. Startup process reads a page, last_written_lsn is old.
*
* Read the old version of the page. We will apply the WAL record on
* it to bring it up-to-date.
*
* We could read the new version, with the changes from this WAL
* record already applied, to offload the work of replaying the record
* to the pageserver. The pageserver might not have received the WAL
* record yet, though, so a read of the old page version and applying
* the record ourselves is likely faster. Also, the redo function
* might be surprised if the changes have already applied. That's
* normal during crash recovery, but not in hot standby.
*
* 2. Startup process reads a page, last_written_lsn == record we're
* replaying.
*
* Can this happen? There are a few theoretical cases when it might:
*
* A) The redo function reads the same page twice. We had already read
* and applied the changes once, and now we're reading it for the
* second time. That would be a rather silly thing for a redo
* function to do, and I'm not aware of any that would do it.
*
* B) The redo function modifies multiple pages, and it already
* applied the changes to one of the pages, released the lock on
* it, and is now reading a second page. Furthermore, the first
* page was already evicted from the buffer cache, and also from
* the last-written LSN cache, so that the per-relation or global
* last-written LSN was already updated. All the WAL redo functions
* hold the locks on pages that they modify, until all the changes
* have been modified (?), which would make that impossible.
* However, we skip the locking, if the page isn't currently in the
* page cache (see neon_redo_read_buffer_filter below).
*
* Even if the one of the above cases were possible in theory, they
* would also require the pages being modified by the redo function to
* be immediately evicted from the page cache.
*
* So this probably does not happen in practice. But if it does, we
* request the new version, including the changes from the record
* being replayed. That seems like the correct behavior in any case.
*
* 3. Backend process reads a page with old last-written LSN
*
* Nothing special here. Read the old version.
*
* 4. Backend process reads a page with last_written_lsn == record being replayed
*
* This can happen, if the redo function has started to run, and saw
* that the page isn't present in the page cache (see
* neon_redo_read_buffer_filter below). Normally, in a normal
* Postgres server, the redo function would hold a lock on the page,
* so we would get blocked waiting the redo function to release the
* lock. To emulate that, wait for the WAL replay of the record to
* finish.
*/
/* Request the page at the end of the last fully replayed LSN. */
XLogRecPtr replay_lsn = GetXLogReplayRecPtr(NULL);
if (last_written_lsn > replay_lsn)
{
/* GetCurrentReplayRecPtr was introduced in v15 */
#if PG_VERSION_NUM >= 150000
Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL));
#endif
/*
* Cases 2 and 4. If this is a backend (case 4), the
* neon_read_at_lsn() call later will wait for the WAL record to be
* fully replayed.
*/
result.request_lsn = last_written_lsn;
}
else
{
/* cases 1 and 3 */
result.request_lsn = replay_lsn;
}
/* Request the page at the last replayed LSN. */
result.request_lsn = GetXLogReplayRecPtr(NULL);
result.not_modified_since = last_written_lsn;
result.effective_request_lsn = result.request_lsn;
Assert(last_written_lsn <= result.request_lsn);
@@ -2931,14 +2822,10 @@ neon_start_unlogged_build(SMgrRelation reln)
reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED;
/*
* Create the local file. In a parallel build, the leader is expected to
* call this first and do it.
*
* FIXME: should we pass isRedo true to create the tablespace dir if it
* doesn't exist? Is it needed?
*/
if (!IsParallelWorker())
mdcreate(reln, MAIN_FORKNUM, false);
mdcreate(reln, MAIN_FORKNUM, false);
}
/*
@@ -2962,17 +2849,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1);
Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
/*
* In a parallel build, (only) the leader process performs the 2nd
* phase.
*/
if (IsParallelWorker())
{
unlogged_build_rel = NULL;
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
}
else
unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
}
/*
@@ -3324,7 +3201,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
BufferTag tag;
uint32 hash;
LWLock *partitionLock;
int buf_id;
Buffer buffer;
bool no_redo_needed;
if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id))
@@ -3362,20 +3239,20 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
else
{
/* Try to find the relevant buffer */
buf_id = BufTableLookup(&tag, hash);
buffer = BufTableLookup(&tag, hash);
no_redo_needed = buf_id < 0;
no_redo_needed = buffer < 0;
}
/* In both cases st lwlsn past this WAL record */
SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);
/*
* we don't have the buffer in memory, update lwLsn past this record, also
* evict page from file cache
*/
if (no_redo_needed)
{
SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);
lfc_evict(rinfo, forknum, blkno);
}
LWLockRelease(partitionLock);

View File

@@ -1852,30 +1852,34 @@ static void
CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp)
{
hs->ts = 0;
hs->xmin = InvalidFullTransactionId;
hs->catalog_xmin = InvalidFullTransactionId;
hs->xmin.value = ~0; /* largest unsigned value */
hs->catalog_xmin.value = ~0; /* largest unsigned value */
for (int i = 0; i < wp->n_safekeepers; i++)
{
if (wp->safekeeper[i].state == SS_ACTIVE)
if (wp->safekeeper[i].appendResponse.hs.ts != 0)
{
HotStandbyFeedback *skhs = &wp->safekeeper[i].appendResponse.hs;
if (FullTransactionIdIsNormal(skhs->xmin)
&& (!FullTransactionIdIsValid(hs->xmin) || FullTransactionIdPrecedes(skhs->xmin, hs->xmin)))
&& FullTransactionIdPrecedes(skhs->xmin, hs->xmin))
{
hs->xmin = skhs->xmin;
hs->ts = skhs->ts;
}
if (FullTransactionIdIsNormal(skhs->catalog_xmin)
&& (!FullTransactionIdIsValid(hs->catalog_xmin) || FullTransactionIdPrecedes(skhs->catalog_xmin, hs->catalog_xmin)))
&& FullTransactionIdPrecedes(skhs->catalog_xmin, hs->xmin))
{
hs->catalog_xmin = skhs->catalog_xmin;
hs->ts = skhs->ts;
}
}
}
if (hs->xmin.value == ~0)
hs->xmin = InvalidFullTransactionId;
if (hs->catalog_xmin.value == ~0)
hs->catalog_xmin = InvalidFullTransactionId;
}
/*
@@ -1942,28 +1946,14 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk)
}
CombineHotStanbyFeedbacks(&hsFeedback, wp);
if (memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0)
if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0)
{
FullTransactionId xmin = hsFeedback.xmin;
FullTransactionId catalog_xmin = hsFeedback.catalog_xmin;
FullTransactionId next_xid = ReadNextFullTransactionId();
/*
* Page server is updating nextXid in checkpoint each 1024 transactions,
* so feedback xmin can be actually larger then nextXid and
* function TransactionIdInRecentPast return false in this case,
* preventing update of slot's xmin.
*/
if (FullTransactionIdPrecedes(next_xid, xmin))
xmin = next_xid;
if (FullTransactionIdPrecedes(next_xid, catalog_xmin))
catalog_xmin = next_xid;
agg_hs_feedback = hsFeedback;
elog(DEBUG2, "ProcessStandbyHSFeedback(xmin=%d, catalog_xmin=%d", XidFromFullTransactionId(hsFeedback.xmin), XidFromFullTransactionId(hsFeedback.catalog_xmin));
ProcessStandbyHSFeedback(hsFeedback.ts,
XidFromFullTransactionId(xmin),
EpochFromFullTransactionId(xmin),
XidFromFullTransactionId(catalog_xmin),
EpochFromFullTransactionId(catalog_xmin));
XidFromFullTransactionId(hsFeedback.xmin),
EpochFromFullTransactionId(hsFeedback.xmin),
XidFromFullTransactionId(hsFeedback.catalog_xmin),
EpochFromFullTransactionId(hsFeedback.catalog_xmin));
}
CheckGracefulShutdown(wp);

21
poetry.lock generated
View File

@@ -2405,7 +2405,6 @@ files = [
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -2530,13 +2529,13 @@ files = [
[[package]]
name = "requests"
version = "2.32.0"
version = "2.31.0"
description = "Python HTTP for Humans."
optional = false
python-versions = ">=3.8"
python-versions = ">=3.7"
files = [
{file = "requests-2.32.0-py3-none-any.whl", hash = "sha256:f2c3881dddb70d056c5bd7600a4fae312b2a300e39be6a118d30b90bd27262b5"},
{file = "requests-2.32.0.tar.gz", hash = "sha256:fa5490319474c82ef1d2c9bc459d3652e3ae4ef4c4ebdd18a21145a47ca4b6b8"},
{file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"},
{file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"},
]
[package.dependencies]
@@ -2960,16 +2959,6 @@ files = [
{file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
{file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
{file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
{file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
{file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
{file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
{file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -3207,4 +3196,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "16ebd6a46768be7f67dbdb4ee5903b167d94edc9965f29252f038c67e9e907b0"
content-hash = "dcde14c58a32bda5f123319a069352c458b3719f3c62977991eebb9803a46a9e"

View File

@@ -9,7 +9,6 @@ default = []
testing = []
[dependencies]
ahash.workspace = true
anyhow.workspace = true
async-compression.workspace = true
async-trait.workspace = true
@@ -25,7 +24,6 @@ camino.workspace = true
chrono.workspace = true
clap.workspace = true
consumption_metrics.workspace = true
crossbeam-deque.workspace = true
dashmap.workspace = true
env_logger.workspace = true
framed-websockets.workspace = true
@@ -54,6 +52,7 @@ opentelemetry.workspace = true
parking_lot.workspace = true
parquet.workspace = true
parquet_derive.workspace = true
pbkdf2 = { workspace = true, features = ["simple", "std"] }
pin-project-lite.workspace = true
postgres_backend.workspace = true
pq_proto.workspace = true
@@ -107,7 +106,6 @@ workspace_hack.workspace = true
camino-tempfile.workspace = true
fallible-iterator.workspace = true
tokio-tungstenite.workspace = true
pbkdf2 = { workspace = true, features = ["simple", "std"] }
rcgen.workspace = true
rstest.workspace = true
tokio-postgres-rustls.workspace = true

View File

@@ -365,10 +365,7 @@ async fn authenticate_with_secret(
config: &'static AuthenticationConfig,
) -> auth::Result<ComputeCredentials> {
if let Some(password) = unauthenticated_password {
let ep = EndpointIdInt::from(&info.endpoint);
let auth_outcome =
validate_password_and_exchange(&config.thread_pool, ep, &password, secret).await?;
let auth_outcome = validate_password_and_exchange(&password, secret).await?;
let keys = match auth_outcome {
crate::sasl::Outcome::Success(key) => key,
crate::sasl::Outcome::Failure(reason) => {
@@ -389,7 +386,7 @@ async fn authenticate_with_secret(
// Currently, we use it for websocket connections (latency).
if allow_cleartext {
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
return hacks::authenticate_cleartext(ctx, info, client, secret, config).await;
return hacks::authenticate_cleartext(ctx, info, client, secret).await;
}
// Finally, proceed with the main auth flow (SCRAM-based).
@@ -557,7 +554,7 @@ mod tests {
context::RequestMonitoring,
proxy::NeonOptions,
rate_limiter::{EndpointRateLimiter, RateBucketInfo},
scram::{threadpool::ThreadPool, ServerSecret},
scram::ServerSecret,
stream::{PqStream, Stream},
};
@@ -599,7 +596,6 @@ mod tests {
}
static CONFIG: Lazy<AuthenticationConfig> = Lazy::new(|| AuthenticationConfig {
thread_pool: ThreadPool::new(1),
scram_protocol_timeout: std::time::Duration::from_secs(5),
rate_limiter_enabled: true,
rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET),

View File

@@ -3,10 +3,8 @@ use super::{
};
use crate::{
auth::{self, AuthFlow},
config::AuthenticationConfig,
console::AuthSecret,
context::RequestMonitoring,
intern::EndpointIdInt,
sasl,
stream::{self, Stream},
};
@@ -22,7 +20,6 @@ pub async fn authenticate_cleartext(
info: ComputeUserInfo,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
secret: AuthSecret,
config: &'static AuthenticationConfig,
) -> auth::Result<ComputeCredentials> {
warn!("cleartext auth flow override is enabled, proceeding");
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
@@ -30,14 +27,8 @@ pub async fn authenticate_cleartext(
// pause the timer while we communicate with the client
let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
let ep = EndpointIdInt::from(&info.endpoint);
let auth_flow = AuthFlow::new(client)
.begin(auth::CleartextPassword {
secret,
endpoint: ep,
pool: config.thread_pool.clone(),
})
.begin(auth::CleartextPassword(secret))
.await?;
drop(paused);
// cleartext auth is only allowed to the ws/http protocol.

View File

@@ -5,14 +5,12 @@ use crate::{
config::TlsServerEndPoint,
console::AuthSecret,
context::RequestMonitoring,
intern::EndpointIdInt,
sasl,
scram::{self, threadpool::ThreadPool},
sasl, scram,
stream::{PqStream, Stream},
};
use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS};
use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be};
use std::{io, sync::Arc};
use std::io;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::info;
@@ -55,11 +53,7 @@ impl AuthMethod for PasswordHack {
/// Use clear-text password auth called `password` in docs
/// <https://www.postgresql.org/docs/current/auth-password.html>
pub struct CleartextPassword {
pub pool: Arc<ThreadPool>,
pub endpoint: EndpointIdInt,
pub secret: AuthSecret,
}
pub struct CleartextPassword(pub AuthSecret);
impl AuthMethod for CleartextPassword {
#[inline(always)]
@@ -132,13 +126,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
.strip_suffix(&[0])
.ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?;
let outcome = validate_password_and_exchange(
&self.state.pool,
self.state.endpoint,
password,
self.state.secret,
)
.await?;
let outcome = validate_password_and_exchange(password, self.state.0).await?;
if let sasl::Outcome::Success(_) = &outcome {
self.stream.write_message_noflush(&Be::AuthenticationOk)?;
@@ -193,8 +181,6 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
}
pub(crate) async fn validate_password_and_exchange(
pool: &ThreadPool,
endpoint: EndpointIdInt,
password: &[u8],
secret: AuthSecret,
) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
@@ -208,7 +194,7 @@ pub(crate) async fn validate_password_and_exchange(
}
// perform scram authentication as both client and server to validate the keys
AuthSecret::Scram(scram_secret) => {
let outcome = crate::scram::exchange(pool, endpoint, &scram_secret, password).await?;
let outcome = crate::scram::exchange(&scram_secret, password).await?;
let client_key = match outcome {
sasl::Outcome::Success(client_key) => client_key,

View File

@@ -27,7 +27,6 @@ use proxy::redis::cancellation_publisher::RedisPublisherClient;
use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
use proxy::redis::elasticache;
use proxy::redis::notifications;
use proxy::scram::threadpool::ThreadPool;
use proxy::serverless::cancel_set::CancelSet;
use proxy::serverless::GlobalConnPoolOptions;
use proxy::usage_metrics;
@@ -133,9 +132,6 @@ struct ProxyCliArgs {
/// timeout for scram authentication protocol
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
scram_protocol_timeout: tokio::time::Duration,
/// size of the threadpool for password hashing
#[clap(long, default_value_t = 4)]
scram_thread_pool_size: u8,
/// Require that all incoming requests have a Proxy Protocol V2 packet **and** have an IP address associated.
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
require_client_ip: bool,
@@ -493,9 +489,6 @@ async fn main() -> anyhow::Result<()> {
/// ProxyConfig is created at proxy startup, and lives forever.
fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
let thread_pool = ThreadPool::new(args.scram_thread_pool_size);
Metrics::install(thread_pool.metrics.clone());
let tls_config = match (&args.tls_key, &args.tls_cert) {
(Some(key_path), Some(cert_path)) => Some(config::configure_tls(
key_path,
@@ -631,7 +624,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
};
let authentication_config = AuthenticationConfig {
thread_pool,
scram_protocol_timeout: args.scram_protocol_timeout,
rate_limiter_enabled: args.auth_rate_limit_enabled,
rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()),

View File

@@ -2,7 +2,6 @@ use crate::{
auth::{self, backend::AuthRateLimiter},
console::locks::ApiLocks,
rate_limiter::RateBucketInfo,
scram::threadpool::ThreadPool,
serverless::{cancel_set::CancelSet, GlobalConnPoolOptions},
Host,
};
@@ -62,7 +61,6 @@ pub struct HttpConfig {
}
pub struct AuthenticationConfig {
pub thread_pool: Arc<ThreadPool>,
pub scram_protocol_timeout: tokio::time::Duration,
pub rate_limiter_enabled: bool,
pub rate_limiter: AuthRateLimiter,

View File

@@ -307,7 +307,7 @@ where
}
async fn upload_parquet(
mut w: SerializedFileWriter<Writer<BytesMut>>,
w: SerializedFileWriter<Writer<BytesMut>>,
len: i64,
storage: &GenericRemoteStorage,
) -> anyhow::Result<Writer<BytesMut>> {
@@ -319,15 +319,11 @@ async fn upload_parquet(
// I don't know how compute intensive this is, although it probably isn't much... better be safe than sorry.
// finish method only available on the fork: https://github.com/apache/arrow-rs/issues/5253
let (mut buffer, metadata) =
tokio::task::spawn_blocking(move || -> parquet::errors::Result<_> {
let metadata = w.finish()?;
let buffer = std::mem::take(w.inner_mut().get_mut());
Ok((buffer, metadata))
})
let (writer, metadata) = tokio::task::spawn_blocking(move || w.finish())
.await
.unwrap()?;
let mut buffer = writer.into_inner();
let data = buffer.split().freeze();
let compression = len as f64 / len_uncompressed as f64;
@@ -355,7 +351,7 @@ async fn upload_parquet(
"{year:04}/{month:02}/{day:02}/{hour:02}/requests_{id}.parquet"
))?;
let cancel = CancellationToken::new();
let maybe_err = backoff::retry(
backoff::retry(
|| async {
let stream = futures::stream::once(futures::future::ready(Ok(data.clone())));
storage
@@ -372,12 +368,7 @@ async fn upload_parquet(
.await
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
.and_then(|x| x)
.context("request_data_upload")
.err();
if let Some(err) = maybe_err {
tracing::warn!(%id, %err, "failed to upload request data");
}
.context("request_data_upload")?;
Ok(buffer.writer())
}
@@ -483,11 +474,10 @@ mod tests {
RequestData {
session_id: uuid::Builder::from_random_bytes(rng.gen()).into_uuid(),
peer_addr: Ipv4Addr::from(rng.gen::<[u8; 4]>()).to_string(),
timestamp: chrono::DateTime::from_timestamp_millis(
timestamp: chrono::NaiveDateTime::from_timestamp_millis(
rng.gen_range(1703862754..1803862754),
)
.unwrap()
.naive_utc(),
.unwrap(),
application_name: Some("test".to_owned()),
username: Some(hex::encode(rng.gen::<[u8; 4]>())),
endpoint_id: Some(hex::encode(rng.gen::<[u8; 16]>())),
@@ -570,15 +560,15 @@ mod tests {
assert_eq!(
file_stats,
[
(1315314, 3, 6000),
(1315307, 3, 6000),
(1315367, 3, 6000),
(1315324, 3, 6000),
(1315454, 3, 6000),
(1315296, 3, 6000),
(1315088, 3, 6000),
(1315324, 3, 6000),
(438713, 1, 2000)
(1315008, 3, 6000),
(1315001, 3, 6000),
(1315061, 3, 6000),
(1315018, 3, 6000),
(1315148, 3, 6000),
(1314990, 3, 6000),
(1314782, 3, 6000),
(1315018, 3, 6000),
(438575, 1, 2000)
]
);
@@ -608,11 +598,11 @@ mod tests {
assert_eq!(
file_stats,
[
(1222212, 5, 10000),
(1228362, 5, 10000),
(1230156, 5, 10000),
(1229518, 5, 10000),
(1220796, 5, 10000)
(1221738, 5, 10000),
(1227888, 5, 10000),
(1229682, 5, 10000),
(1229044, 5, 10000),
(1220322, 5, 10000)
]
);
@@ -644,11 +634,11 @@ mod tests {
assert_eq!(
file_stats,
[
(1207859, 5, 10000),
(1207590, 5, 10000),
(1207883, 5, 10000),
(1207871, 5, 10000),
(1208126, 5, 10000)
(1207385, 5, 10000),
(1207116, 5, 10000),
(1207409, 5, 10000),
(1207397, 5, 10000),
(1207652, 5, 10000)
]
);
@@ -673,15 +663,15 @@ mod tests {
assert_eq!(
file_stats,
[
(1315314, 3, 6000),
(1315307, 3, 6000),
(1315367, 3, 6000),
(1315324, 3, 6000),
(1315454, 3, 6000),
(1315296, 3, 6000),
(1315088, 3, 6000),
(1315324, 3, 6000),
(438713, 1, 2000)
(1315008, 3, 6000),
(1315001, 3, 6000),
(1315061, 3, 6000),
(1315018, 3, 6000),
(1315148, 3, 6000),
(1314990, 3, 6000),
(1314782, 3, 6000),
(1315018, 3, 6000),
(438575, 1, 2000)
]
);
@@ -718,7 +708,7 @@ mod tests {
// files are smaller than the size threshold, but they took too long to fill so were flushed early
assert_eq!(
file_stats,
[(659462, 2, 3001), (659176, 2, 3000), (658972, 2, 2999)]
[(659240, 2, 3001), (658954, 2, 3000), (658750, 2, 2999)]
);
tmpdir.close().unwrap();

View File

@@ -1,11 +1,11 @@
use std::sync::{Arc, OnceLock};
use std::sync::OnceLock;
use lasso::ThreadedRodeo;
use measured::{
label::{FixedCardinalitySet, LabelName, LabelSet, LabelValue, StaticLabelSet},
label::StaticLabelSet,
metric::{histogram::Thresholds, name::MetricName},
Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec,
LabelGroup, MetricGroup,
Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup,
MetricGroup,
};
use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec};
@@ -14,36 +14,26 @@ use tokio::time::{self, Instant};
use crate::console::messages::ColdStartInfo;
#[derive(MetricGroup)]
#[metric(new(thread_pool: Arc<ThreadPoolMetrics>))]
pub struct Metrics {
#[metric(namespace = "proxy")]
#[metric(init = ProxyMetrics::new(thread_pool))]
pub proxy: ProxyMetrics,
#[metric(namespace = "wake_compute_lock")]
pub wake_compute_lock: ApiLockMetrics,
}
static SELF: OnceLock<Metrics> = OnceLock::new();
impl Metrics {
pub fn install(thread_pool: Arc<ThreadPoolMetrics>) {
SELF.set(Metrics::new(thread_pool))
.ok()
.expect("proxy metrics must not be installed more than once");
}
pub fn get() -> &'static Self {
#[cfg(test)]
return SELF.get_or_init(|| Metrics::new(Arc::new(ThreadPoolMetrics::new(0))));
#[cfg(not(test))]
SELF.get()
.expect("proxy metrics must be installed by the main() function")
static SELF: OnceLock<Metrics> = OnceLock::new();
SELF.get_or_init(|| Metrics {
proxy: ProxyMetrics::default(),
wake_compute_lock: ApiLockMetrics::new(),
})
}
}
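
One side of the hunk above is the usual OnceLock singleton split: install the value exactly once at startup and read it everywhere else (the other side lazily initializes via get_or_init). A generic sketch of the same pattern, with a plain String config standing in for the metrics struct:

use std::sync::OnceLock;

static CONFIG: OnceLock<String> = OnceLock::new();

fn install(cfg: String) {
    // set() fails if called twice; surface that as a startup bug
    CONFIG
        .set(cfg)
        .expect("config must not be installed more than once");
}

fn get() -> &'static str {
    CONFIG.get().expect("config must be installed by main()")
}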
#[derive(MetricGroup)]
#[metric(new(thread_pool: Arc<ThreadPoolMetrics>))]
#[metric(new())]
pub struct ProxyMetrics {
#[metric(flatten)]
pub db_connections: CounterPairVec<NumDbConnectionsGauge>,
@@ -139,10 +129,6 @@ pub struct ProxyMetrics {
#[metric(namespace = "connect_compute_lock")]
pub connect_compute_lock: ApiLockMetrics,
#[metric(namespace = "scram_pool")]
#[metric(init = thread_pool)]
pub scram_pool: Arc<ThreadPoolMetrics>,
}
#[derive(MetricGroup)]
@@ -160,6 +146,12 @@ pub struct ApiLockMetrics {
pub semaphore_acquire_seconds: Histogram<16>,
}
impl Default for ProxyMetrics {
fn default() -> Self {
Self::new()
}
}
impl Default for ApiLockMetrics {
fn default() -> Self {
Self::new()
@@ -561,52 +553,3 @@ pub enum RedisEventsCount {
PasswordUpdate,
AllowedIpsUpdate,
}
pub struct ThreadPoolWorkers(usize);
pub struct ThreadPoolWorkerId(pub usize);
impl LabelValue for ThreadPoolWorkerId {
fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
v.write_int(self.0 as i64)
}
}
impl LabelGroup for ThreadPoolWorkerId {
fn visit_values(&self, v: &mut impl measured::label::LabelGroupVisitor) {
v.write_value(LabelName::from_str("worker"), self);
}
}
impl LabelSet for ThreadPoolWorkers {
type Value<'a> = ThreadPoolWorkerId;
fn dynamic_cardinality(&self) -> Option<usize> {
Some(self.0)
}
fn encode(&self, value: Self::Value<'_>) -> Option<usize> {
(value.0 < self.0).then_some(value.0)
}
fn decode(&self, value: usize) -> Self::Value<'_> {
ThreadPoolWorkerId(value)
}
}
impl FixedCardinalitySet for ThreadPoolWorkers {
fn cardinality(&self) -> usize {
self.0
}
}
#[derive(MetricGroup)]
#[metric(new(workers: usize))]
pub struct ThreadPoolMetrics {
pub injector_queue_depth: Gauge,
#[metric(init = GaugeVec::with_label_set(ThreadPoolWorkers(workers)))]
pub worker_queue_depth: GaugeVec<ThreadPoolWorkers>,
#[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))]
pub worker_task_turns_total: CounterVec<ThreadPoolWorkers>,
#[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))]
pub worker_task_skips_total: CounterVec<ThreadPoolWorkers>,
}

View File

@@ -6,14 +6,11 @@
//! * <https://github.com/postgres/postgres/blob/94226d4506e66d6e7cbf4b391f1e7393c1962841/src/backend/libpq/auth-scram.c>
//! * <https://github.com/postgres/postgres/blob/94226d4506e66d6e7cbf4b391f1e7393c1962841/src/interfaces/libpq/fe-auth-scram.c>
mod countmin;
mod exchange;
mod key;
mod messages;
mod pbkdf2;
mod secret;
mod signature;
pub mod threadpool;
pub use exchange::{exchange, Exchange};
pub use key::ScramKey;
@@ -59,13 +56,9 @@ fn sha256<'a>(parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {
#[cfg(test)]
mod tests {
use crate::{
intern::EndpointIdInt,
sasl::{Mechanism, Step},
EndpointId,
};
use crate::sasl::{Mechanism, Step};
use super::{threadpool::ThreadPool, Exchange, ServerSecret};
use super::{Exchange, ServerSecret};
#[test]
fn snapshot() {
@@ -119,13 +112,8 @@ mod tests {
}
async fn run_round_trip_test(server_password: &str, client_password: &str) {
let pool = ThreadPool::new(1);
let ep = EndpointId::from("foo");
let ep = EndpointIdInt::from(ep);
let scram_secret = ServerSecret::build(server_password).await.unwrap();
let outcome = super::exchange(&pool, ep, &scram_secret, client_password.as_bytes())
let outcome = super::exchange(&scram_secret, client_password.as_bytes())
.await
.unwrap();

View File

@@ -1,173 +0,0 @@
use std::hash::Hash;
/// estimator of hash jobs per second.
/// <https://en.wikipedia.org/wiki/Count%E2%80%93min_sketch>
pub struct CountMinSketch {
// one for each depth
hashers: Vec<ahash::RandomState>,
width: usize,
depth: usize,
// buckets, width*depth
buckets: Vec<u32>,
}
impl CountMinSketch {
/// Given parameters (ε, δ),
/// set width = ceil(e/ε)
/// set depth = ceil(ln(1/δ))
///
/// guarantees:
/// actual <= estimate
/// estimate <= actual + ε * N with probability 1 - δ
/// where N is the cardinality of the stream
pub fn with_params(epsilon: f64, delta: f64) -> Self {
CountMinSketch::new(
(std::f64::consts::E / epsilon).ceil() as usize,
(1.0_f64 / delta).ln().ceil() as usize,
)
}
fn new(width: usize, depth: usize) -> Self {
Self {
#[cfg(test)]
hashers: (0..depth)
.map(|i| {
// digits of pi for good randomness
ahash::RandomState::with_seeds(
314159265358979323,
84626433832795028,
84197169399375105,
82097494459230781 + i as u64,
)
})
.collect(),
#[cfg(not(test))]
hashers: (0..depth).map(|_| ahash::RandomState::new()).collect(),
width,
depth,
buckets: vec![0; width * depth],
}
}
pub fn inc_and_return<T: Hash>(&mut self, t: &T, x: u32) -> u32 {
let mut min = u32::MAX;
for row in 0..self.depth {
let col = (self.hashers[row].hash_one(t) as usize) % self.width;
let row = &mut self.buckets[row * self.width..][..self.width];
row[col] = row[col].saturating_add(x);
min = std::cmp::min(min, row[col]);
}
min
}
pub fn reset(&mut self) {
self.buckets.clear();
self.buckets.resize(self.width * self.depth, 0);
}
}
#[cfg(test)]
mod tests {
use rand::{rngs::StdRng, seq::SliceRandom, Rng, SeedableRng};
use super::CountMinSketch;
fn eval_precision(n: usize, p: f64, q: f64) -> usize {
// fixed value of phi for consistent test
let mut rng = StdRng::seed_from_u64(16180339887498948482);
#[allow(non_snake_case)]
let mut N = 0;
let mut ids = vec![];
for _ in 0..n {
// number of insert operations
let n = rng.gen_range(1..100);
// number to insert at once
let m = rng.gen_range(1..4096);
let id = uuid::Builder::from_random_bytes(rng.gen()).into_uuid();
ids.push((id, n, m));
// N = sum(actual)
N += n * m;
}
// q% of counts will be within p of the actual value
let mut sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q);
dbg!(sketch.buckets.len());
// insert a bunch of entries in a random order
let mut ids2 = ids.clone();
while !ids2.is_empty() {
ids2.shuffle(&mut rng);
let mut i = 0;
while i < ids2.len() {
sketch.inc_and_return(&ids2[i].0, ids2[i].1);
ids2[i].2 -= 1;
if ids2[i].2 == 0 {
ids2.remove(i);
} else {
i += 1;
}
}
}
let mut within_p = 0;
for (id, n, m) in ids {
let actual = n * m;
let estimate = sketch.inc_and_return(&id, 0);
// This estimate has the guarantee that actual <= estimate
assert!(actual <= estimate);
// This estimate has the guarantee that estimate <= actual + εN with probability 1 - δ.
// ε = p / N, δ = 1 - q;
// therefore, estimate <= actual + p with probability q.
if estimate as f64 <= actual as f64 + p {
within_p += 1;
}
}
within_p
}
#[test]
fn precision() {
assert_eq!(eval_precision(100, 100.0, 0.99), 100);
assert_eq!(eval_precision(1000, 100.0, 0.99), 1000);
assert_eq!(eval_precision(100, 4096.0, 0.99), 100);
assert_eq!(eval_precision(1000, 4096.0, 0.99), 1000);
// seems to be more precise than the literature indicates?
// probably numbers are too small to truly represent the probabilities.
assert_eq!(eval_precision(100, 4096.0, 0.90), 100);
assert_eq!(eval_precision(1000, 4096.0, 0.90), 1000);
assert_eq!(eval_precision(100, 4096.0, 0.1), 98);
assert_eq!(eval_precision(1000, 4096.0, 0.1), 991);
}
// returns memory usage in bytes, and the time complexity per insert.
fn eval_cost(p: f64, q: f64) -> (usize, usize) {
#[allow(non_snake_case)]
// N = sum(actual)
// Let's assume 1021 samples, all of 4096
let N = 1021 * 4096;
let sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q);
let memory = std::mem::size_of::<u32>() * sketch.buckets.len();
let time = sketch.depth;
(memory, time)
}
#[test]
fn memory_usage() {
assert_eq!(eval_cost(100.0, 0.99), (2273580, 5));
assert_eq!(eval_cost(4096.0, 0.99), (55520, 5));
assert_eq!(eval_cost(4096.0, 0.90), (33312, 3));
assert_eq!(eval_cost(4096.0, 0.1), (11104, 1));
}
}
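
A worked instance of the sizing formulas in with_params above; the numbers agree with the memory_usage test (for example, ε = 1/1021 and δ = 0.01 give width 2776 and depth 5, i.e. 2776 * 5 * 4 = 55520 bytes):

fn sketch_dims(epsilon: f64, delta: f64) -> (usize, usize) {
    // width = ceil(e / ε), depth = ceil(ln(1 / δ))
    let width = (std::f64::consts::E / epsilon).ceil() as usize;
    let depth = (1.0_f64 / delta).ln().ceil() as usize;
    (width, depth)
}

fn main() {
    // ε = 1/1021, δ = 0.01: width = ceil(e * 1021) = 2776, depth = ceil(ln 100) = 5
    assert_eq!(sketch_dims(1.0 / 1021.0, 0.01), (2776, 5));
}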

View File

@@ -4,17 +4,15 @@ use std::convert::Infallible;
use hmac::{Hmac, Mac};
use sha2::Sha256;
use tokio::task::yield_now;
use super::messages::{
ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN,
};
use super::pbkdf2::Pbkdf2;
use super::secret::ServerSecret;
use super::signature::SignatureBuilder;
use super::threadpool::ThreadPool;
use super::ScramKey;
use crate::config;
use crate::intern::EndpointIdInt;
use crate::sasl::{self, ChannelBinding, Error as SaslError};
/// The only channel binding mode we currently support.
@@ -76,18 +74,37 @@ impl<'a> Exchange<'a> {
}
}
// copied from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L36-L61>
async fn pbkdf2(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] {
let hmac = Hmac::<Sha256>::new_from_slice(str).expect("HMAC is able to accept all key sizes");
let mut prev = hmac
.clone()
.chain_update(salt)
.chain_update(1u32.to_be_bytes())
.finalize()
.into_bytes();
let mut hi = prev;
for i in 1..iterations {
prev = hmac.clone().chain_update(prev).finalize().into_bytes();
for (hi, prev) in hi.iter_mut().zip(prev) {
*hi ^= prev;
}
// yield every ~250us
// hopefully reduces tail latencies
if i % 1024 == 0 {
yield_now().await
}
}
hi.into()
}
// copied from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L236-L248>
async fn derive_client_key(
pool: &ThreadPool,
endpoint: EndpointIdInt,
password: &[u8],
salt: &[u8],
iterations: u32,
) -> ScramKey {
let salted_password = pool
.spawn_job(endpoint, Pbkdf2::start(password, salt, iterations))
.await
.expect("job should not be cancelled");
async fn derive_client_key(password: &[u8], salt: &[u8], iterations: u32) -> ScramKey {
let salted_password = pbkdf2(password, salt, iterations).await;
let make_key = |name| {
let key = Hmac::<Sha256>::new_from_slice(&salted_password)
@@ -102,13 +119,11 @@ async fn derive_client_key(
}
pub async fn exchange(
pool: &ThreadPool,
endpoint: EndpointIdInt,
secret: &ServerSecret,
password: &[u8],
) -> sasl::Result<sasl::Outcome<super::ScramKey>> {
let salt = base64::decode(&secret.salt_base64)?;
let client_key = derive_client_key(pool, endpoint, password, &salt, secret.iterations).await;
let client_key = derive_client_key(password, &salt, secret.iterations).await;
if secret.is_password_invalid(&client_key).into() {
Ok(sasl::Outcome::Failure("password doesn't match"))
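
The inline pbkdf2 above yields every 1024 iterations so one long key derivation cannot monopolize an executor thread. The same cooperative pattern in isolation, with a toy accumulator loop in place of the real KDF:

use tokio::task::yield_now;

async fn long_cpu_loop(n: u64) -> u64 {
    let mut acc = 0u64;
    for i in 0..n {
        acc = acc.wrapping_add(i);
        // yield periodically so other tasks on this thread get to run
        if i % 1024 == 0 {
            yield_now().await;
        }
    }
    acc
}

#[tokio::main]
async fn main() {
    assert_eq!(long_cpu_loop(10_000).await, (0..10_000u64).sum::<u64>());
}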

View File

@@ -1,89 +0,0 @@
use hmac::{
digest::{consts::U32, generic_array::GenericArray},
Hmac, Mac,
};
use sha2::Sha256;
pub struct Pbkdf2 {
hmac: Hmac<Sha256>,
prev: GenericArray<u8, U32>,
hi: GenericArray<u8, U32>,
iterations: u32,
}
// inspired from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L36-L61>
impl Pbkdf2 {
pub fn start(str: &[u8], salt: &[u8], iterations: u32) -> Self {
let hmac =
Hmac::<Sha256>::new_from_slice(str).expect("HMAC is able to accept all key sizes");
let prev = hmac
.clone()
.chain_update(salt)
.chain_update(1u32.to_be_bytes())
.finalize()
.into_bytes();
Self {
hmac,
// one consumed for the hash above
iterations: iterations - 1,
hi: prev,
prev,
}
}
pub fn cost(&self) -> u32 {
(self.iterations).clamp(0, 4096)
}
pub fn turn(&mut self) -> std::task::Poll<[u8; 32]> {
let Self {
hmac,
prev,
hi,
iterations,
} = self;
// only do 4096 iterations per turn before sharing the thread for fairness
let n = (*iterations).clamp(0, 4096);
for _ in 0..n {
*prev = hmac.clone().chain_update(*prev).finalize().into_bytes();
for (hi, prev) in hi.iter_mut().zip(*prev) {
*hi ^= prev;
}
}
*iterations -= n;
if *iterations == 0 {
std::task::Poll::Ready((*hi).into())
} else {
std::task::Poll::Pending
}
}
}
#[cfg(test)]
mod tests {
use super::Pbkdf2;
use pbkdf2::pbkdf2_hmac_array;
use sha2::Sha256;
#[test]
fn works() {
let salt = b"sodium chloride";
let pass = b"Ne0n_!5_50_C007";
let mut job = Pbkdf2::start(pass, salt, 600000);
let hash = loop {
let std::task::Poll::Ready(hash) = job.turn() else {
continue;
};
break hash;
};
let expected = pbkdf2_hmac_array::<Sha256, 32>(pass, salt, 600000);
assert_eq!(hash, expected)
}
}
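
Some hedged arithmetic for the 4096-iterations-per-turn chunking above: start() consumes one iteration, so a 600000-iteration job needs ceil(599999 / 4096) = 147 turns, and the pool can interleave jobs at that granularity:

fn turns_needed(iterations: u32) -> u32 {
    // one iteration is consumed in start(), as noted in the constructor
    (iterations - 1).div_ceil(4096)
}

fn main() {
    assert_eq!(turns_needed(600_000), 147);
}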

View File

@@ -1,321 +0,0 @@
//! Custom threadpool implementation for password hashing.
//!
//! Requirements:
//! 1. Fairness per endpoint.
//! 2. Yield support for high iteration counts.
use std::sync::{
atomic::{AtomicU64, Ordering},
Arc,
};
use crossbeam_deque::{Injector, Stealer, Worker};
use itertools::Itertools;
use parking_lot::{Condvar, Mutex};
use rand::Rng;
use rand::{rngs::SmallRng, SeedableRng};
use tokio::sync::oneshot;
use crate::{
intern::EndpointIdInt,
metrics::{ThreadPoolMetrics, ThreadPoolWorkerId},
scram::countmin::CountMinSketch,
};
use super::pbkdf2::Pbkdf2;
pub struct ThreadPool {
queue: Injector<JobSpec>,
stealers: Vec<Stealer<JobSpec>>,
parkers: Vec<(Condvar, Mutex<ThreadState>)>,
/// bitpacked representation.
/// lower 8 bits = number of sleeping threads
/// next 8 bits = number of idle threads (searching for work)
counters: AtomicU64,
pub metrics: Arc<ThreadPoolMetrics>,
}
#[derive(PartialEq)]
enum ThreadState {
Parked,
Active,
}
impl ThreadPool {
pub fn new(n_workers: u8) -> Arc<Self> {
let workers = (0..n_workers).map(|_| Worker::new_fifo()).collect_vec();
let stealers = workers.iter().map(|w| w.stealer()).collect_vec();
let parkers = (0..n_workers)
.map(|_| (Condvar::new(), Mutex::new(ThreadState::Active)))
.collect_vec();
let pool = Arc::new(Self {
queue: Injector::new(),
stealers,
parkers,
// threads start searching for work
counters: AtomicU64::new((n_workers as u64) << 8),
metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)),
});
for (i, worker) in workers.into_iter().enumerate() {
let pool = Arc::clone(&pool);
std::thread::spawn(move || thread_rt(pool, worker, i));
}
pool
}
pub fn spawn_job(
&self,
endpoint: EndpointIdInt,
pbkdf2: Pbkdf2,
) -> oneshot::Receiver<[u8; 32]> {
let (tx, rx) = oneshot::channel();
let queue_was_empty = self.queue.is_empty();
self.metrics.injector_queue_depth.inc();
self.queue.push(JobSpec {
response: tx,
pbkdf2,
endpoint,
});
// inspired from <https://github.com/rayon-rs/rayon/blob/3e3962cb8f7b50773bcc360b48a7a674a53a2c77/rayon-core/src/sleep/mod.rs#L242>
let counts = self.counters.load(Ordering::SeqCst);
let num_awake_but_idle = (counts >> 8) & 0xff;
let num_sleepers = counts & 0xff;
// If the queue is non-empty, then we always wake up a worker
// -- clearly the existing idle jobs aren't enough. Otherwise,
// check to see if we have enough idle workers.
if !queue_was_empty || num_awake_but_idle == 0 {
let num_to_wake = Ord::min(1, num_sleepers);
self.wake_any_threads(num_to_wake);
}
rx
}
#[cold]
fn wake_any_threads(&self, mut num_to_wake: u64) {
if num_to_wake > 0 {
for i in 0..self.parkers.len() {
if self.wake_specific_thread(i) {
num_to_wake -= 1;
if num_to_wake == 0 {
return;
}
}
}
}
}
fn wake_specific_thread(&self, index: usize) -> bool {
let (condvar, lock) = &self.parkers[index];
let mut state = lock.lock();
if *state == ThreadState::Parked {
condvar.notify_one();
// When the thread went to sleep, it will have incremented
// this value. When we wake it, it's our job to decrement
// it. We could have the thread do it, but that would
// introduce a delay between when the thread was
// *notified* and when this counter was decremented. That
// might mislead people with new work into thinking that
// there are sleeping threads that they should try to
// wake, when in fact there is nothing left for them to
// do.
self.counters.fetch_sub(1, Ordering::SeqCst);
*state = ThreadState::Active;
true
} else {
false
}
}
fn steal(&self, rng: &mut impl Rng, skip: usize, worker: &Worker<JobSpec>) -> Option<JobSpec> {
// announce thread as idle
self.counters.fetch_add(256, Ordering::SeqCst);
// try steal from the global queue
loop {
match self.queue.steal_batch_and_pop(worker) {
crossbeam_deque::Steal::Success(job) => {
self.metrics
.injector_queue_depth
.set(self.queue.len() as i64);
// no longer idle
self.counters.fetch_sub(256, Ordering::SeqCst);
return Some(job);
}
crossbeam_deque::Steal::Retry => continue,
crossbeam_deque::Steal::Empty => break,
}
}
// try steal from our neighbours
loop {
let mut retry = false;
let start = rng.gen_range(0..self.stealers.len());
let job = (start..self.stealers.len())
.chain(0..start)
.filter(|i| *i != skip)
.find_map(
|victim| match self.stealers[victim].steal_batch_and_pop(worker) {
crossbeam_deque::Steal::Success(job) => Some(job),
crossbeam_deque::Steal::Empty => None,
crossbeam_deque::Steal::Retry => {
retry = true;
None
}
},
);
if job.is_some() {
// no longer idle
self.counters.fetch_sub(256, Ordering::SeqCst);
return job;
}
if !retry {
return None;
}
}
}
}
fn thread_rt(pool: Arc<ThreadPool>, worker: Worker<JobSpec>, index: usize) {
/// interval when we should steal from the global queue
/// so that tail latencies are managed appropriately
const STEAL_INTERVAL: usize = 61;
/// How often to reset the sketch values
const SKETCH_RESET_INTERVAL: usize = 1021;
let mut rng = SmallRng::from_entropy();
// used to determine whether we should temporarily skip tasks for fairness.
// 99% of estimates will overcount by no more than 4096 samples
let mut sketch = CountMinSketch::with_params(1.0 / (SKETCH_RESET_INTERVAL as f64), 0.01);
let (condvar, lock) = &pool.parkers[index];
'wait: loop {
// wait for notification of work
{
let mut lock = lock.lock();
// queue is empty
pool.metrics
.worker_queue_depth
.set(ThreadPoolWorkerId(index), 0);
// subtract 1 from idle count, add 1 to sleeping count.
pool.counters.fetch_sub(255, Ordering::SeqCst);
*lock = ThreadState::Parked;
condvar.wait(&mut lock);
}
for i in 0.. {
let mut job = match worker
.pop()
.or_else(|| pool.steal(&mut rng, index, &worker))
{
Some(job) => job,
None => continue 'wait,
};
pool.metrics
.worker_queue_depth
.set(ThreadPoolWorkerId(index), worker.len() as i64);
// receiver is closed, cancel the task
if !job.response.is_closed() {
let rate = sketch.inc_and_return(&job.endpoint, job.pbkdf2.cost());
const P: f64 = 2000.0;
// probability decreases as rate increases.
// lower probability, higher chance of being skipped
//
// estimates (rate in terms of 4096 rounds):
// rate = 0 => probability = 100%
// rate = 10 => probability = 71.3%
// rate = 50 => probability = 62.1%
// rate = 500 => probability = 52.3%
// rate = 1021 => probability = 49.8%
//
// My expectation is that the pool queue will only begin backing up at ~1000rps
// in which case the SKETCH_RESET_INTERVAL represents 1 second. Thus, the rates above
// are in requests per second.
let probability = P.ln() / (P + rate as f64).ln();
if pool.queue.len() > 32 || rng.gen_bool(probability) {
pool.metrics
.worker_task_turns_total
.inc(ThreadPoolWorkerId(index));
match job.pbkdf2.turn() {
std::task::Poll::Ready(result) => {
let _ = job.response.send(result);
}
std::task::Poll::Pending => worker.push(job),
}
} else {
pool.metrics
.worker_task_skips_total
.inc(ThreadPoolWorkerId(index));
// skip for now
worker.push(job)
}
}
// if we get stuck with a few long lived jobs in the queue
// it's better to try and steal from the queue too for fairness
if i % STEAL_INTERVAL == 0 {
let _ = pool.queue.steal_batch(&worker);
}
if i % SKETCH_RESET_INTERVAL == 0 {
sketch.reset();
}
}
}
}
struct JobSpec {
response: oneshot::Sender<[u8; 32]>,
pbkdf2: Pbkdf2,
endpoint: EndpointIdInt,
}
#[cfg(test)]
mod tests {
use crate::EndpointId;
use super::*;
#[tokio::test]
async fn hash_is_correct() {
let pool = ThreadPool::new(1);
let ep = EndpointId::from("foo");
let ep = EndpointIdInt::from(ep);
let salt = [0x55; 32];
let actual = pool
.spawn_job(ep, Pbkdf2::start(b"password", &salt, 4096))
.await
.unwrap();
let expected = [
10, 114, 73, 188, 140, 222, 196, 156, 214, 184, 79, 157, 119, 242, 16, 31, 53, 242,
178, 43, 95, 8, 225, 182, 122, 40, 219, 21, 89, 147, 64, 140,
];
assert_eq!(actual, expected)
}
}
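
The probabilities quoted in the thread_rt comment check out if its "rate" is read in units of 4096-round turns, i.e. the raw sketch count is rate * 4096 (my reading, not stated explicitly in the code):

fn turn_probability(jobs: u32) -> f64 {
    const P: f64 = 2000.0;
    // the sketch counts pbkdf2 rounds, so each full job contributes 4096
    let rate = (jobs as f64) * 4096.0;
    P.ln() / (P + rate).ln()
}

fn main() {
    assert!((turn_probability(0) - 1.000).abs() < 5e-3); // 100%
    assert!((turn_probability(10) - 0.713).abs() < 5e-3); // 71.3%
    assert!((turn_probability(50) - 0.621).abs() < 5e-3); // 62.1%
    assert!((turn_probability(500) - 0.523).abs() < 5e-3); // 52.3%
}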

View File

@@ -15,7 +15,6 @@ use crate::{
},
context::RequestMonitoring,
error::{ErrorKind, ReportableError, UserFacingError},
intern::EndpointIdInt,
proxy::{connect_compute::ConnectMechanism, retry::ShouldRetry},
rate_limiter::EndpointRateLimiter,
Host,
@@ -67,14 +66,8 @@ impl PoolingBackend {
return Err(AuthError::auth_failed(&*user_info.user));
}
};
let ep = EndpointIdInt::from(&conn_info.user_info.endpoint);
let auth_outcome = crate::auth::validate_password_and_exchange(
&config.thread_pool,
ep,
&conn_info.password,
secret,
)
.await?;
let auth_outcome =
crate::auth::validate_password_and_exchange(&conn_info.password, secret).await?;
let res = match auth_outcome {
crate::sasl::Outcome::Success(key) => {
info!("user successfully authenticated");

View File

@@ -10,7 +10,7 @@ pytest = "^7.4.4"
psycopg2-binary = "^2.9.6"
typing-extensions = "^4.6.1"
PyJWT = {version = "^2.1.0", extras = ["crypto"]}
requests = "^2.32.0"
requests = "^2.31.0"
pytest-xdist = "^3.3.1"
asyncpg = "^0.29.0"
aiopg = "^1.4.0"

View File

@@ -20,6 +20,7 @@ use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use storage_broker::Uri;
use tokio::sync::mpsc;
use tracing::*;
use utils::pid_file;
@@ -29,13 +30,13 @@ use safekeeper::defaults::{
DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR,
};
use safekeeper::remove_wal;
use safekeeper::wal_service;
use safekeeper::GlobalTimelines;
use safekeeper::SafeKeeperConf;
use safekeeper::{broker, WAL_SERVICE_RUNTIME};
use safekeeper::{control_file, BROKER_RUNTIME};
use safekeeper::{http, WAL_REMOVER_RUNTIME};
use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME};
use safekeeper::{wal_backup, HTTP_RUNTIME};
use storage_broker::DEFAULT_ENDPOINT;
use utils::auth::{JwtAuth, Scope, SwappableJwtAuth};
@@ -376,6 +377,8 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
let timeline_collector = safekeeper::metrics::TimelineCollector::new();
metrics::register_internal(Box::new(timeline_collector))?;
let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);
wal_backup::init_remote_storage(&conf);
// Keep handles to main tasks to die if any of them disappears.
@@ -388,9 +391,19 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
let current_thread_rt = conf
.current_thread_runtime
.then(|| Handle::try_current().expect("no runtime in main"));
let conf_ = conf.clone();
let wal_backup_handle = current_thread_rt
.as_ref()
.unwrap_or_else(|| WAL_BACKUP_RUNTIME.handle())
.spawn(wal_backup::wal_backup_launcher_task_main(
conf_,
wal_backup_launcher_rx,
))
.map(|res| ("WAL backup launcher".to_owned(), res));
tasks_handles.push(Box::pin(wal_backup_handle));
// Load all timelines from disk to memory.
GlobalTimelines::init(conf.clone()).await?;
GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx).await?;
let conf_ = conf.clone();
// Run everything in current thread rt, if asked.

View File

@@ -46,8 +46,6 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
return Ok(());
}
let active_timelines_set = GlobalTimelines::get_global_broker_active_set();
let mut client =
storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
@@ -59,9 +57,15 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
// sensitive and there is no risk of deadlock as we don't await while
// lock is held.
let now = Instant::now();
let all_tlis = active_timelines_set.get_all();
let all_tlis = GlobalTimelines::get_all();
let mut n_pushed_tlis = 0;
for tli in &all_tlis {
// A filtering alternative, futures::stream::iter(all_tlis)
//   .filter(|tli| { let tli = tli.clone(); async move { tli.is_active().await } }).collect::<Vec<_>>().await,
// doesn't look better, and it's unclear how to do it without collect.
if !tli.is_active().await {
continue;
}
let sk_info = tli.get_safekeeper_info(&conf).await;
yield sk_info;
BROKER_PUSHED_UPDATES.inc();
@@ -86,7 +90,6 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
}
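
The comment in push_loop above weighs a stream-combinator filter against a plain loop; with an async predicate, the loop with continue stays much simpler. A toy sketch of the pattern under discussion:

async fn is_active(id: u32) -> bool {
    id % 2 == 0 // stand-in for the real async predicate
}

async fn filter_active(ids: Vec<u32>) -> Vec<u32> {
    let mut out = Vec::new();
    for id in ids {
        // awaiting here is trivial; stream combinators would need cloning
        // and boxing to express the same thing
        if !is_active(id).await {
            continue;
        }
        out.push(id);
    }
    out
}

#[tokio::main]
async fn main() {
    assert_eq!(filter_active(vec![1, 2, 3, 4]).await, vec![2, 4]);
}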
/// Subscribe and fetch all the interesting data from the broker.
#[instrument(name = "broker pull", skip_all)]
async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?;
@@ -183,7 +186,6 @@ async fn discover_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<
commit_lsn: sk_info.commit_lsn,
safekeeper_connstr: sk_info.safekeeper_connstr,
availability_zone: sk_info.availability_zone,
standby_horizon: 0,
};
// note this is a blocking call
@@ -317,7 +319,7 @@ async fn task_stats(stats: Arc<BrokerStats>) {
let now = BrokerStats::now_millis();
if now > last_pulled && now - last_pulled > warn_duration.as_millis() as u64 {
let ts = chrono::DateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp");
let ts = chrono::NaiveDateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp");
info!("no broker updates for some time, last update: {:?}", ts);
}
}

View File

@@ -350,7 +350,6 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
backup_lsn: sk_info.backup_lsn.0,
local_start_lsn: sk_info.local_start_lsn.0,
availability_zone: None,
standby_horizon: sk_info.standby_horizon.0,
};
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;

View File

@@ -31,8 +31,6 @@ pub mod safekeeper;
pub mod send_wal;
pub mod state;
pub mod timeline;
pub mod timeline_manager;
pub mod timelines_set;
pub mod wal_backup;
pub mod wal_backup_partial;
pub mod wal_service;

View File

@@ -11,9 +11,8 @@ use futures::Future;
use metrics::{
core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts},
proto::MetricFamily,
register_int_counter, register_int_counter_pair, register_int_counter_pair_vec,
register_int_counter_vec, Gauge, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec,
IntGaugeVec,
register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, Gauge,
IntCounter, IntCounterPairVec, IntCounterVec, IntGaugeVec,
};
use once_cell::sync::Lazy;
@@ -163,29 +162,6 @@ pub static PARTIAL_BACKUP_UPLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
)
.expect("Failed to register safekeeper_partial_backup_uploaded_bytes_total counter")
});
pub static MANAGER_ITERATIONS_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"safekeeper_manager_iterations_total",
"Number of iterations of the timeline manager task"
)
.expect("Failed to register safekeeper_manager_iterations_total counter")
});
pub static MANAGER_ACTIVE_CHANGES: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"safekeeper_manager_active_changes_total",
"Number of timeline active status changes in the timeline manager task"
)
.expect("Failed to register safekeeper_manager_active_changes_total counter")
});
pub static WAL_BACKUP_TASKS: Lazy<IntCounterPair> = Lazy::new(|| {
register_int_counter_pair!(
"safekeeper_wal_backup_tasks_started_total",
"Number of active WAL backup tasks",
"safekeeper_wal_backup_tasks_finished_total",
"Number of finished WAL backup tasks",
)
.expect("Failed to register safekeeper_wal_backup_tasks_finished_total counter")
});
pub const LABEL_UNKNOWN: &str = "unknown";
@@ -638,7 +614,8 @@ impl Collector for TimelineCollector {
self.written_wal_seconds.reset();
self.flushed_wal_seconds.reset();
let timelines_count = GlobalTimelines::get_all().len();
let timelines = GlobalTimelines::get_all();
let timelines_count = timelines.len();
let mut active_timelines_count = 0;
// Prometheus Collector is sync, and data is stored under async lock. To
@@ -769,9 +746,9 @@ impl Collector for TimelineCollector {
async fn collect_timeline_metrics() -> Vec<FullTimelineInfo> {
let mut res = vec![];
let active_timelines = GlobalTimelines::get_global_broker_active_set().get_all();
let timelines = GlobalTimelines::get_all();
for tli in active_timelines {
for tli in timelines {
if let Some(info) = tli.info_for_metrics().await {
res.push(info);
}

View File

@@ -45,9 +45,6 @@ const DEFAULT_FEEDBACK_CAPACITY: usize = 8;
pub struct WalReceivers {
mutex: Mutex<WalReceiversShared>,
pageserver_feedback_tx: tokio::sync::broadcast::Sender<PageserverFeedback>,
num_computes_tx: tokio::sync::watch::Sender<usize>,
num_computes_rx: tokio::sync::watch::Receiver<usize>,
}
/// Id under which walreceiver is registered in shmem.
@@ -58,21 +55,16 @@ impl WalReceivers {
let (pageserver_feedback_tx, _) =
tokio::sync::broadcast::channel(DEFAULT_FEEDBACK_CAPACITY);
let (num_computes_tx, num_computes_rx) = tokio::sync::watch::channel(0usize);
Arc::new(WalReceivers {
mutex: Mutex::new(WalReceiversShared { slots: Vec::new() }),
pageserver_feedback_tx,
num_computes_tx,
num_computes_rx,
})
}
/// Register new walreceiver. Returned guard provides access to the slot and
/// automatically deregisters in Drop.
pub fn register(self: &Arc<WalReceivers>, conn_id: Option<ConnectionId>) -> WalReceiverGuard {
let mut shared = self.mutex.lock();
let slots = &mut shared.slots;
let slots = &mut self.mutex.lock().slots;
let walreceiver = WalReceiverState {
conn_id,
status: WalReceiverStatus::Voting,
@@ -86,9 +78,6 @@ impl WalReceivers {
slots.push(Some(walreceiver));
pos
};
self.update_num(&shared);
WalReceiverGuard {
id: pos,
walreceivers: self.clone(),
@@ -110,18 +99,7 @@ impl WalReceivers {
/// Get number of walreceivers (compute connections).
pub fn get_num(self: &Arc<WalReceivers>) -> usize {
self.mutex.lock().get_num()
}
/// Get channel for number of walreceivers.
pub fn get_num_rx(self: &Arc<WalReceivers>) -> tokio::sync::watch::Receiver<usize> {
self.num_computes_rx.clone()
}
/// Should get called after every update of slots.
fn update_num(self: &Arc<WalReceivers>, shared: &MutexGuard<WalReceiversShared>) {
let num = shared.get_num();
self.num_computes_tx.send_replace(num);
self.mutex.lock().slots.iter().flatten().count()
}
/// Get state of all walreceivers.
@@ -145,7 +123,6 @@ impl WalReceivers {
fn unregister(self: &Arc<WalReceivers>, id: WalReceiverId) {
let mut shared = self.mutex.lock();
shared.slots[id] = None;
self.update_num(&shared);
}
/// Broadcast pageserver feedback to connected walproposers.
@@ -160,13 +137,6 @@ struct WalReceiversShared {
slots: Vec<Option<WalReceiverState>>,
}
impl WalReceiversShared {
/// Get number of walreceivers (compute connections).
fn get_num(&self) -> usize {
self.slots.iter().flatten().count()
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalReceiverState {
/// None means it is recovery initiated by us (this safekeeper).
@@ -213,19 +183,9 @@ impl SafekeeperPostgresHandler {
&mut self,
pgb: &mut PostgresBackend<IO>,
) -> Result<(), QueryError> {
let mut tli: Option<Arc<Timeline>> = None;
if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await {
if let Err(end) = self.handle_start_wal_push_guts(pgb).await {
// Log the result and probably send it to the client, closing the stream.
let handle_end_fut = pgb.handle_copy_stream_end(end);
// If we managed to create the timeline, augment logging with current LSNs etc.
if let Some(tli) = tli {
let info = tli.get_safekeeper_info(&self.conf).await;
handle_end_fut
.instrument(info_span!("", term=%info.term, last_log_term=%info.last_log_term, flush_lsn=%Lsn(info.flush_lsn), commit_lsn=%Lsn(info.commit_lsn)))
.await;
} else {
handle_end_fut.await;
}
pgb.handle_copy_stream_end(end).await;
}
Ok(())
}
@@ -233,7 +193,6 @@ impl SafekeeperPostgresHandler {
pub async fn handle_start_wal_push_guts<IO: AsyncRead + AsyncWrite + Unpin>(
&mut self,
pgb: &mut PostgresBackend<IO>,
tli: &mut Option<Arc<Timeline>>,
) -> Result<(), CopyStreamHandlerEnd> {
// Notify the libpq client that it's allowed to send `CopyData` messages
pgb.write_message(&BeMessage::CopyBothResponse).await?;
@@ -263,17 +222,13 @@ impl SafekeeperPostgresHandler {
// Read first message and create timeline if needed.
let res = network_reader.read_first_message().await;
let network_res = if let Ok((timeline, next_msg)) = res {
let res = if let Ok((tli, next_msg)) = res {
let pageserver_feedback_rx: tokio::sync::broadcast::Receiver<PageserverFeedback> =
timeline
.get_walreceivers()
.pageserver_feedback_tx
.subscribe();
*tli = Some(timeline.clone());
tli.get_walreceivers().pageserver_feedback_tx.subscribe();
tokio::select! {
// todo: add read|write .context to these errors
r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline.clone(), next_msg) => r,
r = network_reader.run(msg_tx, msg_rx, reply_tx, tli.clone(), next_msg) => r,
r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r,
}
} else {
@@ -289,13 +244,13 @@ impl SafekeeperPostgresHandler {
match acceptor_handle {
None => {
// failed even before spawning; read_network should have error
Err(network_res.expect_err("no error with WalAcceptor not spawn"))
Err(res.expect_err("no error with WalAcceptor not spawn"))
}
Some(handle) => {
let wal_acceptor_res = handle.await;
// If there was any network error, return it.
network_res?;
res?;
// Otherwise, WalAcceptor thread must have errored.
match wal_acceptor_res {
@@ -486,7 +441,14 @@ impl WalAcceptor {
/// The main loop. Returns Ok(()) if either msg_rx or reply_tx got closed;
/// it must mean that network thread terminated.
async fn run(&mut self) -> anyhow::Result<()> {
// Register the connection and defer unregister.
// The drop order of the next two lines matters: on exit we want to first remove our
// entry and then update the status, which depends on the registered connections.
let _compute_conn_guard = ComputeConnectionGuard {
timeline: Arc::clone(&self.tli),
};
let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id);
self.tli.update_status_notify().await?;
// After this timestamp we will stop processing AppendRequests and send a response
// to the walproposer. walproposer sends at least one AppendRequest per second,
@@ -552,3 +514,19 @@ impl WalAcceptor {
}
}
}
/// Calls update_status_notify in drop to update timeline status.
struct ComputeConnectionGuard {
timeline: Arc<Timeline>,
}
impl Drop for ComputeConnectionGuard {
fn drop(&mut self) {
let tli = self.timeline.clone();
tokio::spawn(async move {
if let Err(e) = tli.update_status_notify().await {
error!("failed to update timeline status: {}", e);
}
});
}
}
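
ComputeConnectionGuard above works around the fact that Drop cannot be async: the cleanup future is detached onto the runtime. A minimal sketch of the same pattern, with do_cleanup standing in for update_status_notify:

async fn do_cleanup() -> Result<(), String> {
    Ok(())
}

struct AsyncCleanupGuard;

impl Drop for AsyncCleanupGuard {
    fn drop(&mut self) {
        // Drop is sync, so detach the async work; this requires a live runtime,
        // and the task may be cancelled if the runtime shuts down first
        tokio::spawn(async {
            if let Err(e) = do_cleanup().await {
                eprintln!("cleanup failed: {e}");
            }
        });
    }
}

#[tokio::main]
async fn main() {
    let guard = AsyncCleanupGuard;
    drop(guard); // cleanup task is spawned here
    tokio::task::yield_now().await; // give it a chance to run
}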

View File

@@ -37,11 +37,17 @@ use crate::{
#[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))]
pub async fn recovery_main(tli: Arc<Timeline>, conf: SafeKeeperConf) {
info!("started");
let mut cancellation_rx = match tli.get_cancellation_rx() {
Ok(rx) => rx,
Err(_) => {
info!("timeline canceled during task start");
return;
}
};
let cancel = tli.cancel.clone();
select! {
_ = recovery_main_loop(tli, conf) => { unreachable!() }
_ = cancel.cancelled() => {
_ = cancellation_rx.changed() => {
info!("stopped");
}
}

View File

@@ -7,18 +7,29 @@ use tracing::*;
use crate::{GlobalTimelines, SafeKeeperConf};
pub async fn task_main(_conf: SafeKeeperConf) -> anyhow::Result<()> {
const ALLOW_INACTIVE_TIMELINES: bool = true;
pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
let wal_removal_interval = Duration::from_millis(5000);
loop {
let now = tokio::time::Instant::now();
let mut active_timelines = 0;
let tlis = GlobalTimelines::get_all();
for tli in &tlis {
let is_active = tli.is_active().await;
if is_active {
active_timelines += 1;
}
if !ALLOW_INACTIVE_TIMELINES && !is_active {
continue;
}
let ttid = tli.ttid;
async {
if let Err(e) = tli.maybe_persist_control_file().await {
warn!("failed to persist control file: {e}");
}
if let Err(e) = tli.remove_old_wal().await {
if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled).await {
error!("failed to remove WAL: {}", e);
}
}
@@ -31,8 +42,8 @@ pub async fn task_main(_conf: SafeKeeperConf) -> anyhow::Result<()> {
if elapsed > wal_removal_interval {
info!(
"WAL removal is too long, processed {} timelines in {:?}",
total_timelines, elapsed
"WAL removal is too long, processed {} active timelines ({} total) in {:?}",
active_timelines, total_timelines, elapsed
);
}

View File

@@ -827,10 +827,10 @@ where
/// Persist control file if there is something to save and enough time
/// passed after the last save.
pub async fn maybe_persist_inmem_control_file(&mut self) -> Result<bool> {
pub async fn maybe_persist_inmem_control_file(&mut self) -> Result<()> {
const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300);
if self.state.pers.last_persist_at().elapsed() < CF_SAVE_INTERVAL {
return Ok(false);
return Ok(());
}
let need_persist = self.state.inmem.commit_lsn > self.state.commit_lsn
|| self.state.inmem.backup_lsn > self.state.backup_lsn
@@ -840,7 +840,7 @@ where
self.state.flush().await?;
trace!("saved control file: {CF_SAVE_INTERVAL:?} passed");
}
Ok(need_persist)
Ok(())
}
/// Handle request to append WAL.
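
The CF_SAVE_INTERVAL hunk above is a standard persist throttle: skip if the control file was saved recently, otherwise save only when something changed. A self-contained sketch, assuming the 300-second interval shown:

use std::time::{Duration, Instant};

struct Throttle {
    last_persist_at: Instant,
}

impl Throttle {
    const SAVE_INTERVAL: Duration = Duration::from_secs(300);

    // returns true when a save should happen now
    fn should_persist(&mut self, dirty: bool) -> bool {
        if self.last_persist_at.elapsed() < Self::SAVE_INTERVAL || !dirty {
            return false;
        }
        self.last_persist_at = Instant::now();
        true
    }
}

fn main() {
    let mut t = Throttle { last_persist_at: Instant::now() };
    assert!(!t.should_persist(true)); // saved just now: throttled
}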

View File

@@ -23,7 +23,7 @@ use utils::failpoint_support;
use utils::id::TenantTimelineId;
use utils::pageserver_feedback::PageserverFeedback;
use std::cmp::{max, min};
use std::cmp::min;
use std::net::SocketAddr;
use std::str;
use std::sync::Arc;
@@ -85,17 +85,8 @@ impl StandbyReply {
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct StandbyFeedback {
pub reply: StandbyReply,
pub hs_feedback: HotStandbyFeedback,
}
impl StandbyFeedback {
pub fn empty() -> Self {
StandbyFeedback {
reply: StandbyReply::empty(),
hs_feedback: HotStandbyFeedback::empty(),
}
}
reply: StandbyReply,
hs_feedback: HotStandbyFeedback,
}
/// WalSenders registry. Timeline holds it (wrapped in Arc).
@@ -171,8 +162,8 @@ impl WalSenders {
}
/// Get aggregated hot standby feedback (we send it to compute).
pub fn get_hotstandby(self: &Arc<WalSenders>) -> StandbyFeedback {
self.mutex.lock().agg_standby_feedback
pub fn get_hotstandby(self: &Arc<WalSenders>) -> HotStandbyFeedback {
self.mutex.lock().agg_hs_feedback
}
/// Record new pageserver feedback, update aggregated values.
@@ -193,10 +184,6 @@ impl WalSenders {
fn record_standby_reply(self: &Arc<WalSenders>, id: WalSenderId, reply: &StandbyReply) {
let mut shared = self.mutex.lock();
let slot = shared.get_slot_mut(id);
debug!(
"Record standby reply: ts={} apply_lsn={}",
reply.reply_ts, reply.apply_lsn
);
match &mut slot.feedback {
ReplicationFeedback::Standby(sf) => sf.reply = *reply,
ReplicationFeedback::Pageserver(_) => {
@@ -221,7 +208,7 @@ impl WalSenders {
})
}
}
shared.update_reply_feedback();
shared.update_hs_feedback();
}
/// Get remote_consistent_lsn reported by the pageserver. Returns None if
@@ -239,13 +226,13 @@ impl WalSenders {
fn unregister(self: &Arc<WalSenders>, id: WalSenderId) {
let mut shared = self.mutex.lock();
shared.slots[id] = None;
shared.update_reply_feedback();
shared.update_hs_feedback();
}
}
struct WalSendersShared {
// aggregated over all walsenders value
agg_standby_feedback: StandbyFeedback,
agg_hs_feedback: HotStandbyFeedback,
// last feedback ever received from any pageserver, empty if none
last_ps_feedback: PageserverFeedback,
// total counter of pageserver feedbacks received
@@ -256,7 +243,7 @@ struct WalSendersShared {
impl WalSendersShared {
fn new() -> Self {
WalSendersShared {
agg_standby_feedback: StandbyFeedback::empty(),
agg_hs_feedback: HotStandbyFeedback::empty(),
last_ps_feedback: PageserverFeedback::empty(),
ps_feedback_counter: 0,
slots: Vec::new(),
@@ -273,11 +260,10 @@ impl WalSendersShared {
self.slots[id].as_mut().expect("walsender doesn't exist")
}
/// Update aggregated hot standby and normal reply feedbacks. We just take min of valid xmins
/// Update aggregated hot standby feedback. We just take min of valid xmins
/// and ts.
fn update_reply_feedback(&mut self) {
fn update_hs_feedback(&mut self) {
let mut agg = HotStandbyFeedback::empty();
let mut reply_agg = StandbyReply::empty();
for ws_state in self.slots.iter().flatten() {
if let ReplicationFeedback::Standby(standby_feedback) = ws_state.feedback {
let hs_feedback = standby_feedback.hs_feedback;
@@ -290,7 +276,7 @@ impl WalSendersShared {
} else {
agg.xmin = hs_feedback.xmin;
}
agg.ts = max(agg.ts, hs_feedback.ts);
agg.ts = min(agg.ts, hs_feedback.ts);
}
if hs_feedback.catalog_xmin != INVALID_FULL_TRANSACTION_ID {
if agg.catalog_xmin != INVALID_FULL_TRANSACTION_ID {
@@ -298,43 +284,11 @@ impl WalSendersShared {
} else {
agg.catalog_xmin = hs_feedback.catalog_xmin;
}
agg.ts = max(agg.ts, hs_feedback.ts);
}
let reply = standby_feedback.reply;
if reply.write_lsn != Lsn::INVALID {
if reply_agg.write_lsn != Lsn::INVALID {
reply_agg.write_lsn = Lsn::min(reply_agg.write_lsn, reply.write_lsn);
} else {
reply_agg.write_lsn = reply.write_lsn;
}
}
if reply.flush_lsn != Lsn::INVALID {
if reply_agg.flush_lsn != Lsn::INVALID {
reply_agg.flush_lsn = Lsn::min(reply_agg.flush_lsn, reply.flush_lsn);
} else {
reply_agg.flush_lsn = reply.flush_lsn;
}
}
if reply.apply_lsn != Lsn::INVALID {
if reply_agg.apply_lsn != Lsn::INVALID {
reply_agg.apply_lsn = Lsn::min(reply_agg.apply_lsn, reply.apply_lsn);
} else {
reply_agg.apply_lsn = reply.apply_lsn;
}
}
if reply.reply_ts != 0 {
if reply_agg.reply_ts != 0 {
reply_agg.reply_ts = TimestampTz::min(reply_agg.reply_ts, reply.reply_ts);
} else {
reply_agg.reply_ts = reply.reply_ts;
}
agg.ts = min(agg.ts, hs_feedback.ts);
}
}
}
self.agg_standby_feedback = StandbyFeedback {
reply: reply_agg,
hs_feedback: agg,
};
self.agg_hs_feedback = agg;
}
}
@@ -386,16 +340,12 @@ impl SafekeeperPostgresHandler {
start_pos: Lsn,
term: Option<Term>,
) -> Result<(), QueryError> {
let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?;
if let Err(end) = self
.handle_start_replication_guts(pgb, start_pos, term, tli.clone())
.handle_start_replication_guts(pgb, start_pos, term)
.await
{
let info = tli.get_safekeeper_info(&self.conf).await;
// Log the result and probably send it to the client, closing the stream.
pgb.handle_copy_stream_end(end)
.instrument(info_span!("", term=%info.term, last_log_term=%info.last_log_term, flush_lsn=%Lsn(info.flush_lsn), commit_lsn=%Lsn(info.flush_lsn)))
.await;
pgb.handle_copy_stream_end(end).await;
}
Ok(())
}
@@ -405,9 +355,10 @@ impl SafekeeperPostgresHandler {
pgb: &mut PostgresBackend<IO>,
start_pos: Lsn,
term: Option<Term>,
tli: Arc<Timeline>,
) -> Result<(), CopyStreamHandlerEnd> {
let appname = self.appname.clone();
let tli =
GlobalTimelines::get(self.ttid).map_err(|e| CopyStreamHandlerEnd::Other(e.into()))?;
// Use a guard object to remove our entry from the timeline when we are done.
let ws_guard = Arc::new(tli.get_walsenders().register(
@@ -756,15 +707,8 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {
match msg.first().cloned() {
Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => {
// Note: deserializing is on m[1..] because we skip the tag byte.
let mut hs_feedback = HotStandbyFeedback::des(&msg[1..])
let hs_feedback = HotStandbyFeedback::des(&msg[1..])
.context("failed to deserialize HotStandbyFeedback")?;
// TODO: xmin/catalog_xmin are serialized by walreceiver.c in this way:
// pq_sendint32(&reply_message, xmin);
// pq_sendint32(&reply_message, xmin_epoch);
// So it is two big-endian 32-bit words in little-endian order!
hs_feedback.xmin = (hs_feedback.xmin >> 32) | (hs_feedback.xmin << 32);
hs_feedback.catalog_xmin =
(hs_feedback.catalog_xmin >> 32) | (hs_feedback.catalog_xmin << 32);
self.ws_guard
.walsenders
.record_hs_feedback(self.ws_guard.id, &hs_feedback);
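
The word swap above can be sanity-checked in isolation: walreceiver.c sends xmin as two big-endian 32-bit words (value, then epoch), so after deserializing the eight bytes as a single u64 the halves must be exchanged (my paraphrase of the TODO):

fn swap_words(x: u64) -> u64 {
    (x >> 32) | (x << 32)
}

fn main() {
    // xmin = 0x2a and epoch = 0x01 arrive as (xmin, epoch); we want (epoch, xmin)
    assert_eq!(swap_words(0x0000_002a_0000_0001), 0x0000_0001_0000_002a);
}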
@@ -846,11 +790,8 @@ mod tests {
fn test_hs_feedback_no_valid() {
let mut wss = WalSendersShared::new();
push_feedback(&mut wss, hs_feedback(1, INVALID_FULL_TRANSACTION_ID));
wss.update_reply_feedback();
assert_eq!(
wss.agg_standby_feedback.hs_feedback.xmin,
INVALID_FULL_TRANSACTION_ID
);
wss.update_hs_feedback();
assert_eq!(wss.agg_hs_feedback.xmin, INVALID_FULL_TRANSACTION_ID);
}
#[test]
@@ -859,7 +800,7 @@ mod tests {
push_feedback(&mut wss, hs_feedback(1, INVALID_FULL_TRANSACTION_ID));
push_feedback(&mut wss, hs_feedback(1, 42));
push_feedback(&mut wss, hs_feedback(1, 64));
wss.update_reply_feedback();
assert_eq!(wss.agg_standby_feedback.hs_feedback.xmin, 42);
wss.update_hs_feedback();
assert_eq!(wss.agg_hs_feedback.xmin, 42);
}
}

View File

@@ -6,15 +6,15 @@ use camino::Utf8PathBuf;
use postgres_ffi::XLogSegNo;
use serde::{Deserialize, Serialize};
use tokio::fs;
use tokio_util::sync::CancellationToken;
use std::cmp::max;
use std::ops::{Deref, DerefMut};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
use tokio::{sync::watch, time::Instant};
use tokio::sync::{Mutex, MutexGuard};
use tokio::{
sync::{mpsc::Sender, watch},
time::Instant,
};
use tracing::*;
use utils::http::error::ApiError;
use utils::{
@@ -33,13 +33,12 @@ use crate::safekeeper::{
};
use crate::send_wal::WalSenders;
use crate::state::{TimelineMemState, TimelinePersistentState};
use crate::timelines_set::TimelinesSet;
use crate::wal_backup::{self};
use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
use crate::metrics::FullTimelineInfo;
use crate::wal_storage::Storage as wal_storage_iface;
use crate::{debug_dump, timeline_manager, wal_backup_partial, wal_storage};
use crate::{debug_dump, wal_backup_partial, wal_storage};
use crate::{GlobalTimelines, SafeKeeperConf};
/// Things safekeeper should know about timeline state on peers.
@@ -52,7 +51,8 @@ pub struct PeerInfo {
/// LSN of the last record.
pub flush_lsn: Lsn,
pub commit_lsn: Lsn,
/// Since which LSN safekeeper has WAL.
/// Since which LSN safekeeper has WAL. TODO: remove this once new
/// safekeepers are initialized starting from backup_lsn.
pub local_start_lsn: Lsn,
/// When info was received. Serde annotations are not very useful but make
/// the code compile -- we don't rely on this field externally.
@@ -97,79 +97,25 @@ impl PeersInfo {
}
}
pub type ReadGuardSharedState<'a> = RwLockReadGuard<'a, SharedState>;
/// WriteGuardSharedState is a wrapper around `RwLockWriteGuard<SharedState>` that
/// automatically updates `watch::Sender` channels with state on drop.
pub struct WriteGuardSharedState<'a> {
tli: Arc<Timeline>,
guard: RwLockWriteGuard<'a, SharedState>,
skip_update: bool,
}
impl<'a> WriteGuardSharedState<'a> {
fn new(tli: Arc<Timeline>, guard: RwLockWriteGuard<'a, SharedState>) -> Self {
WriteGuardSharedState {
tli,
guard,
skip_update: false,
}
}
}
impl<'a> Deref for WriteGuardSharedState<'a> {
type Target = SharedState;
fn deref(&self) -> &Self::Target {
&self.guard
}
}
impl<'a> DerefMut for WriteGuardSharedState<'a> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.guard
}
}
impl<'a> Drop for WriteGuardSharedState<'a> {
fn drop(&mut self) {
let term_flush_lsn = TermLsn::from((self.guard.sk.get_term(), self.guard.sk.flush_lsn()));
let commit_lsn = self.guard.sk.state.inmem.commit_lsn;
let _ = self.tli.term_flush_lsn_watch_tx.send_if_modified(|old| {
if *old != term_flush_lsn {
*old = term_flush_lsn;
true
} else {
false
}
});
let _ = self.tli.commit_lsn_watch_tx.send_if_modified(|old| {
if *old != commit_lsn {
*old = commit_lsn;
true
} else {
false
}
});
if !self.skip_update {
// send notification about shared state update
self.tli.shared_state_version_tx.send_modify(|old| {
*old += 1;
});
}
}
}
/// Shared state associated with database instance
pub struct SharedState {
/// Safekeeper object
pub(crate) sk: SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>,
sk: SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>,
/// In memory list containing state of peers sent in latest messages from them.
pub(crate) peers_info: PeersInfo,
pub(crate) last_removed_segno: XLogSegNo,
peers_info: PeersInfo,
/// True when WAL backup launcher oversees the timeline, making sure WAL is
/// offloaded, allows to bother launcher less.
wal_backup_active: bool,
/// True whenever there is at least some pending activity on the timeline: a live
/// compute connection, the pageserver is not caught up (it must have the latest WAL
/// for a new compute start), or WAL backup is not finished. Practically, it means
/// safekeepers broadcast info about the timeline to peers and old WAL is trimmed.
///
/// TODO: it might be better to remove tli completely from GlobalTimelines
/// when tli is inactive instead of having this flag.
active: bool,
last_removed_segno: XLogSegNo,
}
impl SharedState {
@@ -206,6 +152,8 @@ impl SharedState {
Ok(Self {
sk,
peers_info: PeersInfo(vec![]),
wal_backup_active: false,
active: false,
last_removed_segno: 0,
})
}
@@ -223,10 +171,75 @@ impl SharedState {
Ok(Self {
sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?,
peers_info: PeersInfo(vec![]),
wal_backup_active: false,
active: false,
last_removed_segno: 0,
})
}
fn is_active(&self, num_computes: usize) -> bool {
self.is_wal_backup_required(num_computes)
// FIXME: add tracking of relevant pageservers and check them here individually,
// otherwise migration won't work (we suspend too early).
|| self.sk.state.inmem.remote_consistent_lsn < self.sk.state.inmem.commit_lsn
}
/// Mark timeline active/inactive and return whether s3 offloading requires
/// start/stop action. If timeline is deactivated, control file is persisted
/// as maintenance task does that only for active timelines.
async fn update_status(&mut self, num_computes: usize, ttid: TenantTimelineId) -> bool {
let is_active = self.is_active(num_computes);
if self.active != is_active {
info!(
"timeline {} active={} now, remote_consistent_lsn={}, commit_lsn={}",
ttid,
is_active,
self.sk.state.inmem.remote_consistent_lsn,
self.sk.state.inmem.commit_lsn
);
if !is_active {
if let Err(e) = self.sk.state.flush().await {
warn!("control file save in update_status failed: {:?}", e);
}
}
}
self.active = is_active;
self.is_wal_backup_action_pending(num_computes)
}
/// Should we run s3 offloading in current state?
fn is_wal_backup_required(&self, num_computes: usize) -> bool {
let seg_size = self.get_wal_seg_size();
num_computes > 0 ||
// Currently only the whole segment is offloaded, so compare segment numbers.
(self.sk.state.inmem.commit_lsn.segment_number(seg_size) >
self.sk.state.inmem.backup_lsn.segment_number(seg_size))
}
/// Is the current state of s3 offloading not what it ought to be?
fn is_wal_backup_action_pending(&self, num_computes: usize) -> bool {
let res = self.wal_backup_active != self.is_wal_backup_required(num_computes);
if res {
let action_pending = if self.is_wal_backup_required(num_computes) {
"start"
} else {
"stop"
};
trace!(
"timeline {} s3 offloading action {} pending: num_computes={}, commit_lsn={}, backup_lsn={}",
self.sk.state.timeline_id, action_pending, num_computes, self.sk.state.inmem.commit_lsn, self.sk.state.inmem.backup_lsn
);
}
res
}
/// Returns whether s3 offloading is required and sets current status as
/// matching.
fn wal_backup_attend(&mut self, num_computes: usize) -> bool {
self.wal_backup_active = self.is_wal_backup_required(num_computes);
self.wal_backup_active
}
fn get_wal_seg_size(&self) -> usize {
self.sk.state.server.wal_seg_size as usize
}
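
is_wal_backup_required above compares whole segment numbers because offloading happens per segment. A sketch with plain u64 byte offsets standing in for the Lsn type (an assumption for illustration):

fn segment_number(lsn: u64, seg_size: u64) -> u64 {
    lsn / seg_size
}

fn backup_required(commit_lsn: u64, backup_lsn: u64, seg_size: u64, num_computes: usize) -> bool {
    // any live compute keeps backup on; otherwise back up only once commit
    // has crossed into a segment that backup has not reached yet
    num_computes > 0
        || segment_number(commit_lsn, seg_size) > segment_number(backup_lsn, seg_size)
}

fn main() {
    let seg: u64 = 16 * 1024 * 1024; // 16 MiB WAL segments
    assert!(!backup_required(seg - 1, 0, seg, 0)); // same segment: nothing to do
    assert!(backup_required(seg + 1, 0, seg, 0)); // commit crossed a boundary
}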
@@ -235,7 +248,6 @@ impl SharedState {
&self,
ttid: &TenantTimelineId,
conf: &SafeKeeperConf,
standby_apply_lsn: Lsn,
) -> SafekeeperTimelineInfo {
SafekeeperTimelineInfo {
safekeeper_id: conf.my_id.0,
@@ -258,14 +270,13 @@ impl SharedState {
backup_lsn: self.sk.state.inmem.backup_lsn.0,
local_start_lsn: self.sk.state.local_start_lsn.0,
availability_zone: conf.availability_zone.clone(),
standby_horizon: standby_apply_lsn.0,
}
}
/// Get our latest view of alive peers status on the timeline.
/// We pass our own info through the broker as well, so when we don't have connection
/// to the broker returned vec is empty.
pub(crate) fn get_peers(&self, heartbeat_timeout: Duration) -> Vec<PeerInfo> {
fn get_peers(&self, heartbeat_timeout: Duration) -> Vec<PeerInfo> {
let now = Instant::now();
self.peers_info
.0
@@ -281,13 +292,18 @@ impl SharedState {
/// offloading.
/// While it is safe to use inmem values for determining horizon,
/// we use persistent to make possible normal states less surprising.
fn get_horizon_segno(&self, extra_horizon_lsn: Option<Lsn>) -> XLogSegNo {
fn get_horizon_segno(
&self,
wal_backup_enabled: bool,
extra_horizon_lsn: Option<Lsn>,
) -> XLogSegNo {
let state = &self.sk.state;
use std::cmp::min;
let mut horizon_lsn = min(state.remote_consistent_lsn, state.peer_horizon_lsn);
// we don't want to remove WAL that is not yet offloaded to s3
horizon_lsn = min(horizon_lsn, state.backup_lsn);
if wal_backup_enabled {
horizon_lsn = min(horizon_lsn, state.backup_lsn);
}
if let Some(extra_horizon_lsn) = extra_horizon_lsn {
horizon_lsn = min(horizon_lsn, extra_horizon_lsn);
}
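
get_horizon_segno above takes the minimum over every consumer position, and WAL strictly below that LSN becomes removable; one side of the hunk additionally gates the backup_lsn term behind wal_backup_enabled. In sketch form (u64 offsets standing in for Lsn, an assumption):

fn horizon(remote_consistent: u64, peer_horizon: u64, backup: u64, extra: Option<u64>) -> u64 {
    let mut h = remote_consistent.min(peer_horizon).min(backup);
    if let Some(e) = extra {
        h = h.min(e);
    }
    h
}

fn main() {
    assert_eq!(horizon(100, 80, 90, None), 80);
    assert_eq!(horizon(100, 80, 90, Some(50)), 50);
}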
@@ -328,6 +344,11 @@ impl From<TimelineError> for ApiError {
pub struct Timeline {
pub ttid: TenantTimelineId,
/// Sending here asks for wal backup launcher attention (start/stop
/// offloading). Sending ttid instead of concrete command allows to do
/// sending without timeline lock.
pub wal_backup_launcher_tx: Sender<TenantTimelineId>,
/// Used to broadcast commit_lsn updates to all background jobs.
commit_lsn_watch_tx: watch::Sender<Lsn>,
commit_lsn_watch_rx: watch::Receiver<Lsn>,
@@ -339,19 +360,19 @@ pub struct Timeline {
term_flush_lsn_watch_tx: watch::Sender<TermLsn>,
term_flush_lsn_watch_rx: watch::Receiver<TermLsn>,
/// Broadcasts shared state updates.
shared_state_version_tx: watch::Sender<usize>,
shared_state_version_rx: watch::Receiver<usize>,
/// Safekeeper and other state, that should remain consistent and
/// synchronized with the disk. This is tokio mutex as we write WAL to disk
/// while holding it, ensuring that consensus checks are in order.
mutex: RwLock<SharedState>,
mutex: Mutex<SharedState>,
walsenders: Arc<WalSenders>,
walreceivers: Arc<WalReceivers>,
/// Delete/cancel will trigger this, background tasks should drop out as soon as it fires
pub(crate) cancel: CancellationToken,
/// Cancellation channel. Delete/cancel will send `true` here as a cancellation signal.
cancellation_tx: watch::Sender<bool>,
/// Timeline should not be used after cancellation. Background tasks should
/// monitor this channel and stop eventually after receiving `true`.
cancellation_rx: watch::Receiver<bool>,
/// Directory where timeline state is stored.
pub timeline_dir: Utf8PathBuf,
@@ -361,15 +382,15 @@ pub struct Timeline {
/// with different speed.
// TODO: add `Arc<SafeKeeperConf>` here instead of adding each field separately.
walsenders_keep_horizon: bool,
// timeline_manager controlled state
pub(crate) broker_active: AtomicBool,
pub(crate) wal_backup_active: AtomicBool,
}
impl Timeline {
/// Load existing timeline from disk.
pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result<Timeline> {
pub fn load_timeline(
conf: &SafeKeeperConf,
ttid: TenantTimelineId,
wal_backup_launcher_tx: Sender<TenantTimelineId>,
) -> Result<Timeline> {
let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered();
let shared_state = SharedState::restore(conf, &ttid)?;
@@ -379,25 +400,23 @@ impl Timeline {
shared_state.sk.get_term(),
shared_state.sk.flush_lsn(),
)));
let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0);
let (cancellation_tx, cancellation_rx) = watch::channel(false);
let walreceivers = WalReceivers::new();
Ok(Timeline {
ttid,
wal_backup_launcher_tx,
commit_lsn_watch_tx,
commit_lsn_watch_rx,
term_flush_lsn_watch_tx,
term_flush_lsn_watch_rx,
shared_state_version_tx,
shared_state_version_rx,
mutex: RwLock::new(shared_state),
mutex: Mutex::new(shared_state),
walsenders: WalSenders::new(walreceivers.clone()),
walreceivers,
cancel: CancellationToken::default(),
cancellation_rx,
cancellation_tx,
timeline_dir: conf.timeline_dir(&ttid),
walsenders_keep_horizon: conf.walsenders_keep_horizon,
broker_active: AtomicBool::new(false),
wal_backup_active: AtomicBool::new(false),
})
}
@@ -405,6 +424,7 @@ impl Timeline {
pub fn create_empty(
conf: &SafeKeeperConf,
ttid: TenantTimelineId,
wal_backup_launcher_tx: Sender<TenantTimelineId>,
server_info: ServerInfo,
commit_lsn: Lsn,
local_start_lsn: Lsn,
@@ -412,28 +432,25 @@ impl Timeline {
let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(Lsn::INVALID);
let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) =
watch::channel(TermLsn::from((INVALID_TERM, Lsn::INVALID)));
let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0);
let (cancellation_tx, cancellation_rx) = watch::channel(false);
let state =
TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn);
let walreceivers = WalReceivers::new();
Ok(Timeline {
ttid,
wal_backup_launcher_tx,
commit_lsn_watch_tx,
commit_lsn_watch_rx,
term_flush_lsn_watch_tx,
term_flush_lsn_watch_rx,
shared_state_version_tx,
shared_state_version_rx,
mutex: RwLock::new(SharedState::create_new(conf, &ttid, state)?),
mutex: Mutex::new(SharedState::create_new(conf, &ttid, state)?),
walsenders: WalSenders::new(walreceivers.clone()),
walreceivers,
cancel: CancellationToken::default(),
cancellation_rx,
cancellation_tx,
timeline_dir: conf.timeline_dir(&ttid),
walsenders_keep_horizon: conf.walsenders_keep_horizon,
broker_active: AtomicBool::new(false),
wal_backup_active: AtomicBool::new(false),
})
}
@@ -444,9 +461,8 @@ impl Timeline {
/// and state on disk should remain unchanged.
pub async fn init_new(
self: &Arc<Timeline>,
shared_state: &mut WriteGuardSharedState<'_>,
shared_state: &mut MutexGuard<'_, SharedState>,
conf: &SafeKeeperConf,
broker_active_set: Arc<TimelinesSet>,
) -> Result<()> {
match fs::metadata(&self.timeline_dir).await {
Ok(_) => {
@@ -477,29 +493,16 @@ impl Timeline {
return Err(e);
}
self.bootstrap(conf, broker_active_set);
self.bootstrap(conf);
Ok(())
}
/// Bootstrap new or existing timeline starting background tasks.
pub fn bootstrap(
self: &Arc<Timeline>,
conf: &SafeKeeperConf,
broker_active_set: Arc<TimelinesSet>,
) {
// Start manager task which will monitor timeline state and update
// background tasks.
tokio::spawn(timeline_manager::main_task(
self.clone(),
conf.clone(),
broker_active_set,
));
/// Bootstrap new or existing timeline starting background tasks.
pub fn bootstrap(self: &Arc<Timeline>, conf: &SafeKeeperConf) {
// Start recovery task which always runs on the timeline.
if conf.peer_recovery_enabled {
tokio::spawn(recovery_main(self.clone(), conf.clone()));
}
// TODO: migrate to timeline_manager
if conf.is_wal_backup_enabled() && conf.partial_backup_enabled {
tokio::spawn(wal_backup_partial::main_task(self.clone(), conf.clone()));
}
@@ -512,9 +515,10 @@ impl Timeline {
/// deletion API endpoint is retriable.
pub async fn delete(
&self,
shared_state: &mut WriteGuardSharedState<'_>,
shared_state: &mut MutexGuard<'_, SharedState>,
only_local: bool,
) -> Result<bool> {
) -> Result<(bool, bool)> {
let was_active = shared_state.active;
self.cancel(shared_state);
// TODO: It's better to wait for s3 offloader termination before
@@ -528,14 +532,20 @@ impl Timeline {
wal_backup::delete_timeline(&self.ttid).await?;
}
let dir_existed = delete_dir(&self.timeline_dir).await?;
Ok(dir_existed)
Ok((dir_existed, was_active))
}
/// Cancel timeline to prevent further usage. Background tasks will stop
/// eventually after receiving cancellation signal.
fn cancel(&self, shared_state: &mut WriteGuardSharedState<'_>) {
///
/// Note that we can't notify the backup launcher here while holding the
/// shared_state lock, as this is a potential deadlock: the caller is
/// responsible for that. Generally we should probably make WAL backup tasks
/// shut down on their own, checking once in a while whether it is time.
fn cancel(&self, shared_state: &mut MutexGuard<'_, SharedState>) {
info!("timeline {} is cancelled", self.ttid);
self.cancel.cancel();
let _ = self.cancellation_tx.send(true);
// Close associated FDs. Nobody will be able to touch timeline data once
// it is cancelled, so WAL storage won't be opened again.
shared_state.sk.wal_store.close();
@@ -543,16 +553,44 @@ impl Timeline {
/// Returns if timeline is cancelled.
pub fn is_cancelled(&self) -> bool {
self.cancel.is_cancelled()
*self.cancellation_rx.borrow()
}
/// Returns a watch channel which gets a value when the timeline is cancelled.
/// The current value is guaranteed to be not-cancelled when this returns
/// (errors otherwise).
pub fn get_cancellation_rx(&self) -> Result<watch::Receiver<bool>> {
let rx = self.cancellation_rx.clone();
if *rx.borrow() {
bail!(TimelineError::Cancelled(self.ttid));
}
Ok(rx)
}
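As a usage sketch (a hypothetical consumer, not code from this diff), a background task would clone the receiver once up front and then select on it; this assumes the standard tokio::sync::watch API:
// Hypothetical background task skeleton built on the cancellation channel.
async fn background_task(tli: std::sync::Arc<Timeline>) -> anyhow::Result<()> {
    // Errors out immediately if the timeline is already cancelled.
    let mut cancellation_rx = tli.get_cancellation_rx()?;
    loop {
        tokio::select! {
            changed = cancellation_rx.changed() => {
                // Sender dropped or `true` observed: time to shut down.
                if changed.is_err() || *cancellation_rx.borrow() {
                    return Ok(());
                }
            }
            _ = do_one_unit_of_work() => {}
        }
    }
}
async fn do_one_unit_of_work() { /* placeholder for real work */ }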
/// Take a write (mutually exclusive) lock on the timeline shared_state.
pub async fn write_shared_state<'a>(self: &'a Arc<Self>) -> WriteGuardSharedState<'a> {
WriteGuardSharedState::new(self.clone(), self.mutex.write().await)
pub async fn write_shared_state(&self) -> MutexGuard<SharedState> {
self.mutex.lock().await
}
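WriteGuardSharedState itself is not shown in this hunk. Judging only from the shared_state_version_tx channel and the skip_update flag that appear elsewhere in this diff, a plausible sketch (an assumption, not the actual implementation) is a guard that bumps the version watch on drop unless the write turned out to be a no-op:
// Hypothetical sketch of WriteGuardSharedState, inferred from this diff.
// Deref/DerefMut to SharedState omitted for brevity.
pub struct WriteGuardSharedState<'a> {
    tli: Arc<Timeline>,
    guard: tokio::sync::RwLockWriteGuard<'a, SharedState>,
}
impl<'a> WriteGuardSharedState<'a> {
    fn new(tli: Arc<Timeline>, guard: tokio::sync::RwLockWriteGuard<'a, SharedState>) -> Self {
        Self { tli, guard }
    }
}
impl<'a> Drop for WriteGuardSharedState<'a> {
    fn drop(&mut self) {
        if !self.guard.skip_update {
            // Wake up watchers (e.g. the manager task) on every real change.
            self.tli
                .shared_state_version_tx
                .send_modify(|version| *version += 1);
        }
    }
}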
pub async fn read_shared_state(&self) -> ReadGuardSharedState {
self.mutex.read().await
async fn update_status(&self, shared_state: &mut SharedState) -> bool {
shared_state
.update_status(self.walreceivers.get_num(), self.ttid)
.await
}
/// Update timeline status and kick wal backup launcher to stop/start offloading if needed.
pub async fn update_status_notify(&self) -> Result<()> {
if self.is_cancelled() {
bail!(TimelineError::Cancelled(self.ttid));
}
let is_wal_backup_action_pending: bool = {
let mut shared_state = self.write_shared_state().await;
self.update_status(&mut shared_state).await
};
if is_wal_backup_action_pending {
// Can fail only if channel to a static thread got closed, which is not normal at all.
self.wal_backup_launcher_tx.send(self.ttid).await?;
}
Ok(())
}
/// Returns true if walsender should stop sending WAL to pageserver. We
@@ -564,7 +602,7 @@ impl Timeline {
if self.is_cancelled() {
return true;
}
let shared_state = self.read_shared_state().await;
let shared_state = self.write_shared_state().await;
if self.walreceivers.get_num() == 0 {
return shared_state.sk.state.inmem.commit_lsn == Lsn(0) || // no data at all yet
reported_remote_consistent_lsn >= shared_state.sk.state.inmem.commit_lsn;
@@ -572,9 +610,9 @@ impl Timeline {
false
}
/// Ensure that current term is t, erroring otherwise, and lock the state.
pub async fn acquire_term(&self, t: Term) -> Result<ReadGuardSharedState> {
let ss = self.read_shared_state().await;
/// Ensure that current term is t, erroring otherwise, and lock the state.
pub async fn acquire_term(&self, t: Term) -> Result<MutexGuard<SharedState>> {
let ss = self.write_shared_state().await;
if ss.sk.state.acceptor_state.term != t {
bail!(
"failed to acquire term {}, current term {}",
@@ -585,6 +623,18 @@ impl Timeline {
Ok(ss)
}
/// Returns whether s3 offloading is required and sets current status as
/// matching it.
pub async fn wal_backup_attend(&self) -> bool {
if self.is_cancelled() {
return false;
}
self.write_shared_state()
.await
.wal_backup_attend(self.walreceivers.get_num())
}
/// Returns commit_lsn watch channel.
pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver<Lsn> {
self.commit_lsn_watch_rx.clone()
@@ -595,14 +645,9 @@ impl Timeline {
self.term_flush_lsn_watch_rx.clone()
}
/// Returns watch channel for SharedState update version.
pub fn get_state_version_rx(&self) -> watch::Receiver<usize> {
self.shared_state_version_rx.clone()
}
/// Pass arrived message to the safekeeper.
pub async fn process_msg(
self: &Arc<Self>,
&self,
msg: &ProposerAcceptorMessage,
) -> Result<Option<AcceptorProposerMessage>> {
if self.is_cancelled() {
@@ -610,36 +655,53 @@ impl Timeline {
}
let mut rmsg: Option<AcceptorProposerMessage>;
let commit_lsn: Lsn;
let term_flush_lsn: TermLsn;
{
let mut shared_state = self.write_shared_state().await;
rmsg = shared_state.sk.process_msg(msg).await?;
// if this is AppendResponse, fill in proper hot standby feedback.
if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg {
resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback;
resp.hs_feedback = self.walsenders.get_hotstandby();
}
commit_lsn = shared_state.sk.state.inmem.commit_lsn;
term_flush_lsn =
TermLsn::from((shared_state.sk.get_term(), shared_state.sk.flush_lsn()));
}
self.term_flush_lsn_watch_tx.send(term_flush_lsn)?;
self.commit_lsn_watch_tx.send(commit_lsn)?;
Ok(rmsg)
}
/// Returns wal_seg_size.
pub async fn get_wal_seg_size(&self) -> usize {
self.read_shared_state().await.get_wal_seg_size()
self.write_shared_state().await.get_wal_seg_size()
}
/// Returns true only if the timeline is loaded and active.
pub async fn is_active(&self) -> bool {
if self.is_cancelled() {
return false;
}
self.write_shared_state().await.active
}
/// Returns state of the timeline.
pub async fn get_state(&self) -> (TimelineMemState, TimelinePersistentState) {
let state = self.read_shared_state().await;
let state = self.write_shared_state().await;
(state.sk.state.inmem.clone(), state.sk.state.clone())
}
/// Returns latest backup_lsn.
pub async fn get_wal_backup_lsn(&self) -> Lsn {
self.read_shared_state().await.sk.state.inmem.backup_lsn
self.write_shared_state().await.sk.state.inmem.backup_lsn
}
/// Sets backup_lsn to the given value.
pub async fn set_wal_backup_lsn(self: &Arc<Self>, backup_lsn: Lsn) -> Result<()> {
pub async fn set_wal_backup_lsn(&self, backup_lsn: Lsn) -> Result<()> {
if self.is_cancelled() {
bail!(TimelineError::Cancelled(self.ttid));
}
@@ -653,34 +715,39 @@ impl Timeline {
/// Get safekeeper info for broadcasting to broker and other peers.
pub async fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo {
let standby_apply_lsn = self.walsenders.get_hotstandby().reply.apply_lsn;
let shared_state = self.read_shared_state().await;
shared_state.get_safekeeper_info(&self.ttid, conf, standby_apply_lsn)
let shared_state = self.write_shared_state().await;
shared_state.get_safekeeper_info(&self.ttid, conf)
}
/// Update timeline state with peer safekeeper data.
pub async fn record_safekeeper_info(
self: &Arc<Self>,
sk_info: SafekeeperTimelineInfo,
) -> Result<()> {
pub async fn record_safekeeper_info(&self, sk_info: SafekeeperTimelineInfo) -> Result<()> {
let is_wal_backup_action_pending: bool;
let commit_lsn: Lsn;
{
let mut shared_state = self.write_shared_state().await;
shared_state.sk.record_safekeeper_info(&sk_info).await?;
let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now());
shared_state.peers_info.upsert(&peer_info);
is_wal_backup_action_pending = self.update_status(&mut shared_state).await;
commit_lsn = shared_state.sk.state.inmem.commit_lsn;
}
self.commit_lsn_watch_tx.send(commit_lsn)?;
// Wake up wal backup launcher, if it is time to stop the offloading.
if is_wal_backup_action_pending {
self.wal_backup_launcher_tx.send(self.ttid).await?;
}
Ok(())
}
/// Update in memory remote consistent lsn.
pub async fn update_remote_consistent_lsn(self: &Arc<Self>, candidate: Lsn) {
pub async fn update_remote_consistent_lsn(&self, candidate: Lsn) {
let mut shared_state = self.write_shared_state().await;
shared_state.sk.state.inmem.remote_consistent_lsn =
max(shared_state.sk.state.inmem.remote_consistent_lsn, candidate);
}
pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec<PeerInfo> {
let shared_state = self.read_shared_state().await;
let shared_state = self.write_shared_state().await;
shared_state.get_peers(conf.heartbeat_timeout)
}
@@ -702,7 +769,7 @@ impl Timeline {
/// depending on assembled quorum (e.g. classic picture 8 from Raft paper).
/// Thus we don't try to predict it here.
pub async fn recovery_needed(&self, heartbeat_timeout: Duration) -> RecoveryNeededInfo {
let ss = self.read_shared_state().await;
let ss = self.write_shared_state().await;
let term = ss.sk.state.acceptor_state.term;
let last_log_term = ss.sk.get_epoch();
let flush_lsn = ss.sk.flush_lsn();
@@ -773,12 +840,12 @@ impl Timeline {
/// Returns flush_lsn.
pub async fn get_flush_lsn(&self) -> Lsn {
self.read_shared_state().await.sk.wal_store.flush_lsn()
self.write_shared_state().await.sk.wal_store.flush_lsn()
}
/// Delete WAL segments from disk that are no longer needed. This is determined
/// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn.
pub async fn remove_old_wal(self: &Arc<Self>) -> Result<()> {
pub async fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> {
if self.is_cancelled() {
bail!(TimelineError::Cancelled(self.ttid));
}
@@ -794,8 +861,9 @@ impl Timeline {
let horizon_segno: XLogSegNo;
let remover = {
let shared_state = self.read_shared_state().await;
horizon_segno = shared_state.get_horizon_segno(replication_horizon_lsn);
let shared_state = self.write_shared_state().await;
horizon_segno =
shared_state.get_horizon_segno(wal_backup_enabled, replication_horizon_lsn);
if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno {
return Ok(()); // nothing to do
}
@@ -809,11 +877,7 @@ impl Timeline {
// update last_removed_segno
let mut shared_state = self.write_shared_state().await;
if shared_state.last_removed_segno != horizon_segno {
shared_state.last_removed_segno = horizon_segno;
} else {
shared_state.skip_update = true;
}
shared_state.last_removed_segno = horizon_segno;
Ok(())
}
@@ -821,40 +885,46 @@ impl Timeline {
/// passed after the last save. This helps to keep remote_consistent_lsn up
/// to date so that a storage node restart doesn't cause many pageserver ->
/// safekeeper reconnections.
pub async fn maybe_persist_control_file(self: &Arc<Self>) -> Result<()> {
let mut guard = self.write_shared_state().await;
let changed = guard.sk.maybe_persist_inmem_control_file().await?;
guard.skip_update = !changed;
Ok(())
pub async fn maybe_persist_control_file(&self) -> Result<()> {
self.write_shared_state()
.await
.sk
.maybe_persist_inmem_control_file()
.await
}
/// Gather timeline data for metrics.
/// Gather timeline data for metrics. If the timeline is not active, returns
/// None; we do not collect metrics for inactive timelines.
pub async fn info_for_metrics(&self) -> Option<FullTimelineInfo> {
if self.is_cancelled() {
return None;
}
let (ps_feedback_count, last_ps_feedback) = self.walsenders.get_ps_feedback_stats();
let state = self.read_shared_state().await;
Some(FullTimelineInfo {
ttid: self.ttid,
ps_feedback_count,
last_ps_feedback,
wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
timeline_is_active: self.broker_active.load(Ordering::Relaxed),
num_computes: self.walreceivers.get_num() as u32,
last_removed_segno: state.last_removed_segno,
epoch_start_lsn: state.sk.epoch_start_lsn,
mem_state: state.sk.state.inmem.clone(),
persisted_state: state.sk.state.clone(),
flush_lsn: state.sk.wal_store.flush_lsn(),
wal_storage: state.sk.wal_store.get_metrics(),
})
let state = self.write_shared_state().await;
if state.active {
Some(FullTimelineInfo {
ttid: self.ttid,
ps_feedback_count,
last_ps_feedback,
wal_backup_active: state.wal_backup_active,
timeline_is_active: state.active,
num_computes: self.walreceivers.get_num() as u32,
last_removed_segno: state.last_removed_segno,
epoch_start_lsn: state.sk.epoch_start_lsn,
mem_state: state.sk.state.inmem.clone(),
persisted_state: state.sk.state.clone(),
flush_lsn: state.sk.wal_store.flush_lsn(),
wal_storage: state.sk.wal_store.get_metrics(),
})
} else {
None
}
}
/// Returns in-memory timeline state to build a full debug dump.
pub async fn memory_dump(&self) -> debug_dump::Memory {
let state = self.read_shared_state().await;
let state = self.write_shared_state().await;
let (write_lsn, write_record_lsn, flush_lsn, file_open) =
state.sk.wal_store.internal_state();
@@ -863,8 +933,8 @@ impl Timeline {
is_cancelled: self.is_cancelled(),
peers_info_len: state.peers_info.0.len(),
walsenders: self.walsenders.get_all(),
wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
active: self.broker_active.load(Ordering::Relaxed),
wal_backup_active: state.wal_backup_active,
active: state.active,
num_computes: self.walreceivers.get_num() as u32,
last_removed_segno: state.last_removed_segno,
epoch_start_lsn: state.sk.epoch_start_lsn,
@@ -878,7 +948,7 @@ impl Timeline {
/// Apply a function to the control file state and persist it.
pub async fn map_control_file<T>(
self: &Arc<Self>,
&self,
f: impl FnOnce(&mut TimelinePersistentState) -> Result<T>,
) -> Result<T> {
let mut state = self.write_shared_state().await;


@@ -1,145 +0,0 @@
//! The timeline manager task is responsible for managing the timeline's background tasks.
//! It is spawned alongside each timeline and exits when the timeline is deleted.
//! It watches for changes in the timeline state and decides when to spawn or kill background tasks.
//! It can also manage some reactive state, like whether the timeline should be active for broker pushes.
use std::{sync::Arc, time::Duration};
use tracing::{info, instrument, warn};
use utils::lsn::Lsn;
use crate::{
metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL},
timeline::{PeerInfo, ReadGuardSharedState, Timeline},
timelines_set::TimelinesSet,
wal_backup::{self, WalBackupTaskHandle},
SafeKeeperConf,
};
pub struct StateSnapshot {
pub commit_lsn: Lsn,
pub backup_lsn: Lsn,
pub remote_consistent_lsn: Lsn,
pub peers: Vec<PeerInfo>,
}
impl StateSnapshot {
/// Create a new snapshot of the timeline state.
fn new(read_guard: ReadGuardSharedState, heartbeat_timeout: Duration) -> Self {
Self {
commit_lsn: read_guard.sk.state.inmem.commit_lsn,
backup_lsn: read_guard.sk.state.inmem.backup_lsn,
remote_consistent_lsn: read_guard.sk.state.inmem.remote_consistent_lsn,
peers: read_guard.get_peers(heartbeat_timeout),
}
}
}
/// Control how often the manager task should wake up to check updates.
/// There is no need to check for updates more often than this.
const REFRESH_INTERVAL: Duration = Duration::from_millis(300);
/// This task gets spawned alongside each timeline and is responsible for managing the timeline's
/// background tasks.
#[instrument(name = "manager", skip_all, fields(ttid = %tli.ttid))]
pub async fn main_task(
tli: Arc<Timeline>,
conf: SafeKeeperConf,
broker_active_set: Arc<TimelinesSet>,
) {
scopeguard::defer! {
if tli.is_cancelled() {
info!("manager task finished");
} else {
warn!("manager task finished prematurely");
}
};
// sets whether timeline is active for broker pushes or not
let mut tli_broker_active = broker_active_set.guard(tli.clone());
let ttid = tli.ttid;
let wal_seg_size = tli.get_wal_seg_size().await;
let heartbeat_timeout = conf.heartbeat_timeout;
let mut state_version_rx = tli.get_state_version_rx();
let walreceivers = tli.get_walreceivers();
let mut num_computes_rx = walreceivers.get_num_rx();
// list of background tasks
let mut backup_task: Option<WalBackupTaskHandle> = None;
let last_state = 'outer: loop {
MANAGER_ITERATIONS_TOTAL.inc();
let state_snapshot = StateSnapshot::new(tli.read_shared_state().await, heartbeat_timeout);
let num_computes = *num_computes_rx.borrow();
let is_wal_backup_required =
wal_backup::is_wal_backup_required(wal_seg_size, num_computes, &state_snapshot);
if conf.is_wal_backup_enabled() {
wal_backup::update_task(
&conf,
ttid,
is_wal_backup_required,
&state_snapshot,
&mut backup_task,
)
.await;
}
let is_active = is_wal_backup_required
|| num_computes > 0
|| state_snapshot.remote_consistent_lsn < state_snapshot.commit_lsn;
// update the broker timeline set
if tli_broker_active.set(is_active) {
// write log if state has changed
info!(
"timeline active={} now, remote_consistent_lsn={}, commit_lsn={}",
is_active, state_snapshot.remote_consistent_lsn, state_snapshot.commit_lsn,
);
MANAGER_ACTIVE_CHANGES.inc();
if !is_active {
// TODO: maybe use tokio::spawn?
if let Err(e) = tli.maybe_persist_control_file().await {
warn!("control file save in update_status failed: {:?}", e);
}
}
}
// update the state in Arc<Timeline>
tli.wal_backup_active
.store(backup_task.is_some(), std::sync::atomic::Ordering::Relaxed);
tli.broker_active
.store(is_active, std::sync::atomic::Ordering::Relaxed);
// wait until something changes. tx channels are stored under Arc, so they will not be
// dropped until the manager task is finished.
tokio::select! {
_ = tli.cancel.cancelled() => {
// timeline was deleted
break 'outer state_snapshot;
}
_ = async {
// don't wake up on every state change, but at most every REFRESH_INTERVAL
tokio::time::sleep(REFRESH_INTERVAL).await;
let _ = state_version_rx.changed().await;
} => {
// state was updated
}
_ = num_computes_rx.changed() => {
// number of connected computes was updated
}
}
};
// shutdown background tasks
if conf.is_wal_backup_enabled() {
wal_backup::update_task(&conf, ttid, false, &last_state, &mut backup_task).await;
}
}


@@ -4,7 +4,6 @@
use crate::safekeeper::ServerInfo;
use crate::timeline::{Timeline, TimelineError};
use crate::timelines_set::TimelinesSet;
use crate::SafeKeeperConf;
use anyhow::{bail, Context, Result};
use camino::Utf8PathBuf;
@@ -12,16 +11,16 @@ use once_cell::sync::Lazy;
use serde::Serialize;
use std::collections::HashMap;
use std::str::FromStr;
use std::sync::atomic::Ordering;
use std::sync::{Arc, Mutex};
use tokio::sync::mpsc::Sender;
use tracing::*;
use utils::id::{TenantId, TenantTimelineId, TimelineId};
use utils::lsn::Lsn;
struct GlobalTimelinesState {
timelines: HashMap<TenantTimelineId, Arc<Timeline>>,
wal_backup_launcher_tx: Option<Sender<TenantTimelineId>>,
conf: Option<SafeKeeperConf>,
broker_active_set: Arc<TimelinesSet>,
load_lock: Arc<tokio::sync::Mutex<TimelineLoadLock>>,
}
@@ -37,8 +36,11 @@ impl GlobalTimelinesState {
}
/// Get dependencies for a timeline constructor.
fn get_dependencies(&self) -> (SafeKeeperConf, Arc<TimelinesSet>) {
(self.get_conf().clone(), self.broker_active_set.clone())
fn get_dependencies(&self) -> (SafeKeeperConf, Sender<TenantTimelineId>) {
(
self.get_conf().clone(),
self.wal_backup_launcher_tx.as_ref().unwrap().clone(),
)
}
/// Insert timeline into the map. Returns error if timeline with the same id already exists.
@@ -63,8 +65,8 @@ impl GlobalTimelinesState {
static TIMELINES_STATE: Lazy<Mutex<GlobalTimelinesState>> = Lazy::new(|| {
Mutex::new(GlobalTimelinesState {
timelines: HashMap::new(),
wal_backup_launcher_tx: None,
conf: None,
broker_active_set: Arc::new(TimelinesSet::default()),
load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)),
})
});
@@ -74,11 +76,16 @@ pub struct GlobalTimelines;
impl GlobalTimelines {
/// Inject dependencies needed for the timeline constructors and load all timelines to memory.
pub async fn init(conf: SafeKeeperConf) -> Result<()> {
pub async fn init(
conf: SafeKeeperConf,
wal_backup_launcher_tx: Sender<TenantTimelineId>,
) -> Result<()> {
// clippy isn't smart enough to understand that drop(state) releases the
// lock, so use explicit block
let tenants_dir = {
let mut state = TIMELINES_STATE.lock().unwrap();
assert!(state.wal_backup_launcher_tx.is_none());
state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx);
state.conf = Some(conf);
// Iterate through all directories and load tenants for all directories
@@ -122,9 +129,12 @@ impl GlobalTimelines {
/// this function is called during init when nothing else is running, so
/// this is fine.
async fn load_tenant_timelines(tenant_id: TenantId) -> Result<()> {
let (conf, broker_active_set) = {
let (conf, wal_backup_launcher_tx) = {
let state = TIMELINES_STATE.lock().unwrap();
state.get_dependencies()
(
state.get_conf().clone(),
state.wal_backup_launcher_tx.as_ref().unwrap().clone(),
)
};
let timelines_dir = conf.tenant_dir(&tenant_id);
@@ -137,7 +147,7 @@ impl GlobalTimelines {
TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or(""))
{
let ttid = TenantTimelineId::new(tenant_id, timeline_id);
match Timeline::load_timeline(&conf, ttid) {
match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx.clone()) {
Ok(timeline) => {
let tli = Arc::new(timeline);
TIMELINES_STATE
@@ -145,7 +155,8 @@ impl GlobalTimelines {
.unwrap()
.timelines
.insert(ttid, tli.clone());
tli.bootstrap(&conf, broker_active_set.clone());
tli.bootstrap(&conf);
tli.update_status_notify().await.unwrap();
}
// If we can't load a timeline, it's most likely because of a corrupted
// directory. We will log an error and won't allow to delete/recreate
@@ -178,9 +189,9 @@ impl GlobalTimelines {
_guard: &tokio::sync::MutexGuard<'a, TimelineLoadLock>,
ttid: TenantTimelineId,
) -> Result<Arc<Timeline>> {
let (conf, broker_active_set) = TIMELINES_STATE.lock().unwrap().get_dependencies();
let (conf, wal_backup_launcher_tx) = TIMELINES_STATE.lock().unwrap().get_dependencies();
match Timeline::load_timeline(&conf, ttid) {
match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx) {
Ok(timeline) => {
let tli = Arc::new(timeline);
@@ -191,7 +202,7 @@ impl GlobalTimelines {
.timelines
.insert(ttid, tli.clone());
tli.bootstrap(&conf, broker_active_set);
tli.bootstrap(&conf);
Ok(tli)
}
@@ -210,10 +221,6 @@ impl GlobalTimelines {
TIMELINES_STATE.lock().unwrap().get_conf().clone()
}
pub fn get_global_broker_active_set() -> Arc<TimelinesSet> {
TIMELINES_STATE.lock().unwrap().broker_active_set.clone()
}
/// Create a new timeline with the given id. If the timeline already exists, returns
/// an existing timeline.
pub async fn create(
@@ -222,7 +229,7 @@ impl GlobalTimelines {
commit_lsn: Lsn,
local_start_lsn: Lsn,
) -> Result<Arc<Timeline>> {
let (conf, broker_active_set) = {
let (conf, wal_backup_launcher_tx) = {
let state = TIMELINES_STATE.lock().unwrap();
if let Ok(timeline) = state.get(&ttid) {
// Timeline already exists, return it.
@@ -236,6 +243,7 @@ impl GlobalTimelines {
let timeline = Arc::new(Timeline::create_empty(
&conf,
ttid,
wal_backup_launcher_tx,
server_info,
commit_lsn,
local_start_lsn,
@@ -256,10 +264,7 @@ impl GlobalTimelines {
// Write the new timeline to the disk and start background workers.
// Bootstrap is transactional, so if it fails, the timeline will be deleted,
// and the state on disk should remain unchanged.
if let Err(e) = timeline
.init_new(&mut shared_state, &conf, broker_active_set)
.await
{
if let Err(e) = timeline.init_new(&mut shared_state, &conf).await {
// Note: the most likely reason for init failure is that the timeline
// directory already exists on disk. This happens when timeline is corrupted
// and wasn't loaded from disk on startup because of that. We want to preserve
@@ -276,6 +281,8 @@ impl GlobalTimelines {
// We are done with bootstrap, release the lock, return the timeline.
// {} block forces release before .await
}
timeline.update_status_notify().await?;
timeline.wal_backup_launcher_tx.send(timeline.ttid).await?;
Ok(timeline)
}
@@ -328,13 +335,12 @@ impl GlobalTimelines {
let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid);
match tli_res {
Ok(timeline) => {
let was_active = timeline.broker_active.load(Ordering::Relaxed);
// Take a lock and finish the deletion holding this mutex.
let mut shared_state = timeline.write_shared_state().await;
info!("deleting timeline {}, only_local={}", ttid, only_local);
let dir_existed = timeline.delete(&mut shared_state, only_local).await?;
let (dir_existed, was_active) =
timeline.delete(&mut shared_state, only_local).await?;
// Remove timeline from the map.
// FIXME: re-enable it once we fix the issue with recreation of deleted timelines
@@ -343,7 +349,7 @@ impl GlobalTimelines {
Ok(TimelineDeleteForceResult {
dir_existed,
was_active, // TODO: we probably should remove this field
was_active,
})
}
Err(_) => {


@@ -1,90 +0,0 @@
use std::{collections::HashMap, sync::Arc};
use utils::id::TenantTimelineId;
use crate::timeline::Timeline;
/// Set of timelines, supports operations:
/// - add timeline
/// - remove timeline
/// - clone the set
///
/// Usually used for keeping a subset of timelines, for example the active timelines that require broker push.
pub struct TimelinesSet {
timelines: std::sync::Mutex<HashMap<TenantTimelineId, Arc<Timeline>>>,
}
impl Default for TimelinesSet {
fn default() -> Self {
Self {
timelines: std::sync::Mutex::new(HashMap::new()),
}
}
}
impl TimelinesSet {
pub fn insert(&self, tli: Arc<Timeline>) {
self.timelines.lock().unwrap().insert(tli.ttid, tli);
}
pub fn delete(&self, ttid: &TenantTimelineId) {
self.timelines.lock().unwrap().remove(ttid);
}
/// If present is true, adds timeline to the set, otherwise removes it.
pub fn set_present(&self, tli: Arc<Timeline>, present: bool) {
if present {
self.insert(tli);
} else {
self.delete(&tli.ttid);
}
}
pub fn is_present(&self, ttid: &TenantTimelineId) -> bool {
self.timelines.lock().unwrap().contains_key(ttid)
}
/// Returns all timelines in the set.
pub fn get_all(&self) -> Vec<Arc<Timeline>> {
self.timelines.lock().unwrap().values().cloned().collect()
}
/// Returns a timeline guard for easy presence control.
pub fn guard(self: &Arc<Self>, tli: Arc<Timeline>) -> TimelineSetGuard {
let is_present = self.is_present(&tli.ttid);
TimelineSetGuard {
timelines_set: self.clone(),
tli,
is_present,
}
}
}
/// Guard is used to add or remove timeline from the set.
/// If the timeline is present in the set, it will be removed from it on drop.
/// Note: do not use more than one guard for the same timeline, as each guard caches the presence state.
/// It is designed to be used in the manager task only.
pub struct TimelineSetGuard {
timelines_set: Arc<TimelinesSet>,
tli: Arc<Timeline>,
is_present: bool,
}
impl TimelineSetGuard {
/// Returns true if the state was changed.
pub fn set(&mut self, present: bool) -> bool {
if present == self.is_present {
return false;
}
self.is_present = present;
self.timelines_set.set_present(self.tli.clone(), present);
true
}
}
impl Drop for TimelineSetGuard {
fn drop(&mut self) {
// remove timeline from the map on drop
self.timelines_set.delete(&self.tli.ttid);
}
}


@@ -9,7 +9,7 @@ use utils::backoff;
use utils::id::NodeId;
use std::cmp::min;
use std::collections::HashSet;
use std::collections::{HashMap, HashSet};
use std::num::NonZeroU32;
use std::pin::Pin;
use std::sync::Arc;
@@ -29,10 +29,9 @@ use tracing::*;
use utils::{id::TenantTimelineId, lsn::Lsn};
use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS};
use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS};
use crate::timeline::{PeerInfo, Timeline};
use crate::timeline_manager::StateSnapshot;
use crate::{GlobalTimelines, SafeKeeperConf, WAL_BACKUP_RUNTIME};
use crate::{GlobalTimelines, SafeKeeperConf};
use once_cell::sync::OnceCell;
@@ -42,84 +41,35 @@ const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000;
/// Default buffer size when interfacing with [`tokio::fs::File`].
const BUFFER_SIZE: usize = 32 * 1024;
pub struct WalBackupTaskHandle {
/// Check whether wal backup is required for the timeline. If yes, mark that
/// the launcher is aware of the current status and return the timeline.
async fn is_wal_backup_required(ttid: TenantTimelineId) -> Option<Arc<Timeline>> {
match GlobalTimelines::get(ttid).ok() {
Some(tli) => {
tli.wal_backup_attend().await;
Some(tli)
}
None => None,
}
}
struct WalBackupTaskHandle {
shutdown_tx: Sender<()>,
handle: JoinHandle<()>,
}
/// Do we have anything to upload to S3, i.e. should safekeepers run backup activity?
pub fn is_wal_backup_required(
wal_seg_size: usize,
num_computes: usize,
state: &StateSnapshot,
) -> bool {
num_computes > 0 ||
// Currently only the whole segment is offloaded, so compare segment numbers.
(state.commit_lsn.segment_number(wal_seg_size) > state.backup_lsn.segment_number(wal_seg_size))
struct WalBackupTimelineEntry {
timeline: Arc<Timeline>,
handle: Option<WalBackupTaskHandle>,
}
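A quick worked example of the segment comparison (illustrative numbers only; segment_number() is the byte offset divided by the segment size):
// With 16 MiB segments, backup becomes required as soon as commit_lsn
// enters a segment that backup_lsn has not fully covered yet.
fn backup_required_demo() {
    let wal_seg_size: u64 = 16 * 1024 * 1024;
    let commit_lsn: u64 = 33 * 1024 * 1024; // inside segno 2
    let backup_lsn: u64 = 16 * 1024 * 1024; // first segment fully offloaded
    let num_computes = 0;
    let required = num_computes > 0
        || (commit_lsn / wal_seg_size) > (backup_lsn / wal_seg_size);
    assert!(required); // segno 2 > segno 1, so offloading has work to do
}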
/// Based on peer information determine which safekeeper should offload; if it
/// is me, run the (per-timeline) task unless it is already running. OTOH, if
/// it is not me and the task is running, kill it.
pub async fn update_task(
conf: &SafeKeeperConf,
ttid: TenantTimelineId,
need_backup: bool,
state: &StateSnapshot,
entry: &mut Option<WalBackupTaskHandle>,
) {
let (offloader, election_dbg_str) =
determine_offloader(&state.peers, state.backup_lsn, ttid, conf);
let elected_me = Some(conf.my_id) == offloader;
let should_task_run = need_backup && elected_me;
// start or stop the task
if should_task_run != (entry.is_some()) {
if should_task_run {
info!("elected for backup: {}", election_dbg_str);
let (shutdown_tx, shutdown_rx) = mpsc::channel(1);
let timeline_dir = conf.timeline_dir(&ttid);
let async_task = backup_task_main(
ttid,
timeline_dir,
conf.workdir.clone(),
conf.backup_parallel_jobs,
shutdown_rx,
);
let handle = if conf.current_thread_runtime {
tokio::spawn(async_task)
} else {
WAL_BACKUP_RUNTIME.spawn(async_task)
};
*entry = Some(WalBackupTaskHandle {
shutdown_tx,
handle,
});
} else {
if !need_backup {
// don't need backup at all
info!("stepping down from backup, need_backup={}", need_backup);
} else {
// someone else has been elected
info!("stepping down from backup: {}", election_dbg_str);
}
shut_down_task(entry).await;
}
}
}
async fn shut_down_task(entry: &mut Option<WalBackupTaskHandle>) {
if let Some(wb_handle) = entry.take() {
async fn shut_down_task(ttid: TenantTimelineId, entry: &mut WalBackupTimelineEntry) {
if let Some(wb_handle) = entry.handle.take() {
// Tell the task to shutdown. Error means task exited earlier, that's ok.
let _ = wb_handle.shutdown_tx.send(()).await;
// Await the task itself. TODO: restart panicked tasks earlier.
if let Err(e) = wb_handle.handle.await {
warn!("WAL backup task panicked: {}", e);
warn!("WAL backup task for {} panicked: {}", ttid, e);
}
}
}
@@ -176,6 +126,49 @@ fn determine_offloader(
}
}
/// Based on peer information determine which safekeeper should offload; if it
/// is me, run the (per-timeline) task unless it is already running. OTOH, if
/// it is not me and the task is running, kill it.
async fn update_task(
conf: &SafeKeeperConf,
ttid: TenantTimelineId,
entry: &mut WalBackupTimelineEntry,
) {
let alive_peers = entry.timeline.get_peers(conf).await;
let wal_backup_lsn = entry.timeline.get_wal_backup_lsn().await;
let (offloader, election_dbg_str) =
determine_offloader(&alive_peers, wal_backup_lsn, ttid, conf);
let elected_me = Some(conf.my_id) == offloader;
if elected_me != (entry.handle.is_some()) {
if elected_me {
info!("elected for backup: {}", election_dbg_str);
let (shutdown_tx, shutdown_rx) = mpsc::channel(1);
let timeline_dir = conf.timeline_dir(&ttid);
let handle = tokio::spawn(
backup_task_main(
ttid,
timeline_dir,
conf.workdir.clone(),
conf.backup_parallel_jobs,
shutdown_rx,
)
.in_current_span(),
);
entry.handle = Some(WalBackupTaskHandle {
shutdown_tx,
handle,
});
} else {
info!("stepping down from backup: {}", election_dbg_str);
shut_down_task(ttid, entry).await;
}
}
}
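determine_offloader's body is elided from this hunk. Purely to illustrate the shape of such an election (the real criteria may differ, and the PeerInfo fields used here are an assumption), a deterministic choice could look like:
// Hypothetical election sketch: every safekeeper independently computes the
// same winner from broker-provided peer info, so no coordination is needed.
fn determine_offloader_demo(
    alive_peers: &[PeerInfo],
    wal_backup_lsn: Lsn,
) -> (Option<NodeId>, String) {
    // Consider only peers that are caught up past the backup horizon.
    let candidates: Vec<&PeerInfo> = alive_peers
        .iter()
        .filter(|p| p.commit_lsn >= wal_backup_lsn)
        .collect();
    // A deterministic tie-break (e.g. lowest node id) keeps the election stable.
    let winner = candidates.iter().map(|p| p.sk_id).min();
    let dbg = format!("{} candidates, winner {:?}", candidates.len(), winner);
    (winner, dbg)
}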
static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::new();
// Storage must be configured and initialized when this is called.
@@ -197,6 +190,67 @@ pub fn init_remote_storage(conf: &SafeKeeperConf) {
});
}
const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000;
/// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup
/// tasks. Having this in a separate task simplifies locking, allows reaping
/// panics, and separates elections from offloading itself.
pub async fn wal_backup_launcher_task_main(
conf: SafeKeeperConf,
mut wal_backup_launcher_rx: Receiver<TenantTimelineId>,
) -> anyhow::Result<()> {
info!(
"WAL backup launcher started, remote config {:?}",
conf.remote_storage
);
// Presence in this map means the launcher is aware s3 offloading is needed
// for the timeline, but the task is started only if it makes sense to
// offload from this safekeeper.
let mut tasks: HashMap<TenantTimelineId, WalBackupTimelineEntry> = HashMap::new();
let mut ticker = tokio::time::interval(Duration::from_millis(CHECK_TASKS_INTERVAL_MSEC));
loop {
tokio::select! {
ttid = wal_backup_launcher_rx.recv() => {
// channel is never expected to get closed
let ttid = ttid.unwrap();
if !conf.is_wal_backup_enabled() {
continue; /* just drain the channel and do nothing */
}
async {
let timeline = is_wal_backup_required(ttid).await;
// do we need to do anything at all?
if timeline.is_some() != tasks.contains_key(&ttid) {
if let Some(timeline) = timeline {
// need to start the task
let entry = tasks.entry(ttid).or_insert(WalBackupTimelineEntry {
timeline,
handle: None,
});
update_task(&conf, ttid, entry).await;
} else {
// need to stop the task
info!("stopping WAL backup task");
let mut entry = tasks.remove(&ttid).unwrap();
shut_down_task(ttid, &mut entry).await;
}
}
}.instrument(info_span!("WAL backup", ttid = %ttid)).await;
}
// For each timeline needing offloading, check if this safekeeper
// should do the job and start/stop the task accordingly.
_ = ticker.tick() => {
for (ttid, entry) in tasks.iter_mut() {
update_task(&conf, *ttid, entry)
.instrument(info_span!("WAL backup", ttid = %ttid))
.await;
}
}
}
}
}
struct WalBackupTask {
timeline: Arc<Timeline>,
timeline_dir: Utf8PathBuf,
@@ -207,7 +261,6 @@ struct WalBackupTask {
}
/// Offload single timeline.
#[instrument(name = "WAL backup", skip_all, fields(ttid = %ttid))]
async fn backup_task_main(
ttid: TenantTimelineId,
timeline_dir: Utf8PathBuf,
@@ -215,8 +268,6 @@ async fn backup_task_main(
parallel_jobs: usize,
mut shutdown_rx: Receiver<()>,
) {
let _guard = WAL_BACKUP_TASKS.guard();
info!("started");
let res = GlobalTimelines::get(ttid);
if let Err(e) = res {


@@ -277,6 +277,14 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
debug!("started");
let await_duration = conf.partial_backup_timeout;
let mut cancellation_rx = match tli.get_cancellation_rx() {
Ok(rx) => rx,
Err(_) => {
info!("timeline canceled during task start");
return;
}
};
// sleep for random time to avoid thundering herd
{
let randf64 = rand::thread_rng().gen_range(0.0..1.0);
@@ -319,7 +327,7 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
&& flush_lsn_rx.borrow().term == seg.term
{
tokio::select! {
_ = backup.tli.cancel.cancelled() => {
_ = cancellation_rx.changed() => {
info!("timeline canceled");
return;
}
@@ -332,7 +340,7 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
// if we don't have any data and zero LSNs, wait for something
while flush_lsn_rx.borrow().lsn == Lsn(0) {
tokio::select! {
_ = backup.tli.cancel.cancelled() => {
_ = cancellation_rx.changed() => {
info!("timeline canceled");
return;
}
@@ -349,7 +357,7 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
// waiting until timeout expires OR segno changes
'inner: loop {
tokio::select! {
_ = backup.tli.cancel.cancelled() => {
_ = cancellation_rx.changed() => {
info!("timeline canceled");
return;
}


@@ -147,7 +147,6 @@ async fn publish(client: Option<BrokerClientChannel>, n_keys: u64) {
http_connstr: "zenith-1-sk-1.local:7677".to_owned(),
local_start_lsn: 0,
availability_zone: None,
standby_horizon: 0,
};
counter += 1;
yield info;


@@ -42,7 +42,6 @@ message SafekeeperTimelineInfo {
uint64 remote_consistent_lsn = 7;
uint64 peer_horizon_lsn = 8;
uint64 local_start_lsn = 9;
uint64 standby_horizon = 14;
// A connection string to use for WAL receiving.
string safekeeper_connstr = 10;
// HTTP endpoint connection string
@@ -106,6 +105,4 @@ message SafekeeperDiscoveryResponse {
string safekeeper_connstr = 4;
// Availability zone of a safekeeper.
optional string availability_zone = 5;
// Replica apply LSN
uint64 standby_horizon = 6;
}


@@ -736,7 +736,6 @@ mod tests {
http_connstr: "neon-1-sk-1.local:7677".to_owned(),
local_start_lsn: 0,
availability_zone: None,
standby_horizon: 0,
})
}


@@ -142,7 +142,6 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
"pageserver_resident_physical_size",
"pageserver_io_operations_bytes_total",
"pageserver_last_record_lsn",
"pageserver_standby_horizon",
"pageserver_smgr_query_seconds_bucket",
"pageserver_smgr_query_seconds_count",
"pageserver_smgr_query_seconds_sum",


@@ -1625,7 +1625,7 @@ class NeonCli(AbstractNeonCli):
args.extend(["-c", "switch_aux_file_policy:v1"])
if aux_file_v2 is AuxFileStore.CrossValidation:
args.extend(["-c", "switch_aux_file_policy:cross-validation"])
args.extend(["-c", "switch_aux_file_policy:cross_validation"])
if set_default:
args.append("--set-default")
@@ -2721,12 +2721,7 @@ class PgBin:
env.update(env_add)
return env
def run(
self,
command: List[str],
env: Optional[Env] = None,
cwd: Optional[Union[str, Path]] = None,
):
def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None):
"""
Run one of the postgres binaries.
@@ -2788,28 +2783,6 @@ class PgBin:
log.info(f"last checkpoint at {checkpoint_lsn}")
return Lsn(checkpoint_lsn)
def take_fullbackup(
self,
pageserver: NeonPageserver,
tenant: TenantId,
timeline: TimelineId,
lsn: Lsn,
output: Path,
):
"""
Request fullbackup from pageserver, store it at 'output'.
"""
cmd = [
"psql",
"--no-psqlrc",
pageserver.connstr(),
"-c",
f"fullbackup {tenant} {timeline} {lsn}",
"-o",
str(output),
]
self.run_capture(cmd)
@pytest.fixture(scope="function")
def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion) -> PgBin:
@@ -4172,12 +4145,7 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]:
# pg is the existing and running compute node, that we want to compare with a basebackup
def check_restored_datadir_content(
test_output_dir: Path,
env: NeonEnv,
endpoint: Endpoint,
ignored_files: Optional[list[str]] = None,
):
def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint):
pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
# Get the timeline ID. We need it for the 'basebackup' command
@@ -4230,10 +4198,6 @@ def check_restored_datadir_content(
if not f.startswith("pg_xact") and not f.startswith("pg_multixact")
]
if ignored_files:
pgdata_files = [f for f in pgdata_files if f not in ignored_files]
restored_files = [f for f in restored_files if f not in ignored_files]
# check that file sets are equal
assert pgdata_files == restored_files
@@ -4324,17 +4288,6 @@ def wait_replica_caughtup(primary: Endpoint, secondary: Endpoint):
time.sleep(1)
def log_replica_lag(primary: Endpoint, secondary: Endpoint):
last_replay_lsn = Lsn(
secondary.safe_psql_scalar("SELECT pg_last_wal_replay_lsn()", log_query=False)
)
primary_lsn = Lsn(
primary.safe_psql_scalar("SELECT pg_current_wal_flush_lsn()", log_query=False)
)
lag = primary_lsn - last_replay_lsn
log.info(f"primary_lsn={primary_lsn}, replay_lsn={last_replay_lsn}, lag={lag}")
def wait_for_last_flush_lsn(
env: NeonEnv,
endpoint: Endpoint,


@@ -70,7 +70,6 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
# this is expected given our collaborative shutdown approach for the UploadQueue
".*Compaction failed.*, retrying in .*: Other\\(queue is in state Stopped.*",
".*Compaction failed.*, retrying in .*: ShuttingDown",
".*Compaction failed.*, retrying in .*: Other\\(timeline shutting down.*",
# Pageserver timeline deletion should be polled until it gets 404, so ignore it globally
".*Error processing HTTP request: NotFound: Timeline .* was not found",
".*took more than expected to complete.*",
@@ -92,10 +91,6 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
".*WARN deletion backend: calling control plane generation validation API failed.*error sending request.*",
# Can happen when the test shuts down the storage controller while it is calling the utilization API
".*WARN.*path=/v1/utilization .*request was dropped before completing",
# Can happen during shutdown
".*scheduling deletion on drop failed: queue is in state Stopped.*",
# Can happen during shutdown
".*ignoring failure to find gc cutoffs: timeline shutting down.*",
)


@@ -56,30 +56,20 @@ class InMemoryLayerInfo:
class HistoricLayerInfo:
kind: str
layer_file_name: str
layer_file_size: int
layer_file_size: Optional[int]
lsn_start: str
lsn_end: Optional[str]
remote: bool
# None for image layers, true if pageserver thinks this is an L0 delta layer
l0: Optional[bool]
@classmethod
def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo:
# instead of parsing the key range lets keep the definition of "L0" in pageserver
l0_ness = d.get("l0")
assert l0_ness is None or isinstance(l0_ness, bool)
size = d["layer_file_size"]
assert isinstance(size, int)
return HistoricLayerInfo(
kind=d["kind"],
layer_file_name=d["layer_file_name"],
layer_file_size=size,
layer_file_size=d.get("layer_file_size"),
lsn_start=d["lsn_start"],
lsn_end=d.get("lsn_end"),
remote=d["remote"],
l0=l0_ness,
)
@@ -593,7 +583,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
timeline_id: TimelineId,
force_repartition=False,
force_image_layer_creation=False,
wait_until_uploaded=False,
):
self.is_testing_enabled_or_skip()
query = {}
@@ -601,8 +590,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
query["force_repartition"] = "true"
if force_image_layer_creation:
query["force_image_layer_creation"] = "true"
if wait_until_uploaded:
query["wait_until_uploaded"] = "true"
log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}")
res = self.put(
@@ -669,7 +656,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
timeline_id: TimelineId,
force_repartition=False,
force_image_layer_creation=False,
wait_until_uploaded=False,
):
self.is_testing_enabled_or_skip()
query = {}
@@ -677,8 +663,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
query["force_repartition"] = "true"
if force_image_layer_creation:
query["force_image_layer_creation"] = "true"
if wait_until_uploaded:
query["wait_until_uploaded"] = "true"
log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}")
res = self.put(


@@ -4,13 +4,10 @@ import json
import os
import re
import subprocess
import tarfile
import threading
import time
from hashlib import sha256
from pathlib import Path
from typing import (
IO,
TYPE_CHECKING,
Any,
Callable,
@@ -18,10 +15,8 @@ from typing import (
Iterable,
List,
Optional,
Set,
Tuple,
TypeVar,
Union,
)
from urllib.parse import urlencode
@@ -495,68 +490,12 @@ def assert_no_errors(log_file, service, allowed_errors):
@enum.unique
class AuxFileStore(str, enum.Enum):
V1 = "v1"
V2 = "v2"
CrossValidation = "cross-validation"
V1 = "V1"
V2 = "V2"
CrossValidation = "CrossValidation"
def __repr__(self) -> str:
return f"'aux-{self.value}'"
def __str__(self) -> str:
return f"'aux-{self.value}'"
def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: Set[str]):
"""
This is essentially:
lines=$(comm -3 \
<(mkdir left && cd left && tar xf "$left" && find . -type f -print0 | xargs sha256sum | sort -k2) \
<(mkdir right && cd right && tar xf "$right" && find . -type f -print0 | xargs sha256sum | sort -k2) \
| wc -l)
[ "$lines" = "0" ]
But in a more mac friendly fashion.
"""
started_at = time.time()
def hash_extracted(reader: Union[IO[bytes], None]) -> bytes:
assert reader is not None
digest = sha256(usedforsecurity=False)
while True:
buf = reader.read(64 * 1024)
if not buf:
break
digest.update(buf)
return digest.digest()
def build_hash_list(p: Path) -> List[Tuple[str, bytes]]:
with tarfile.open(p) as f:
matching_files = (info for info in f if info.isreg() and info.name not in skip_files)
ret = list(
map(lambda info: (info.name, hash_extracted(f.extractfile(info))), matching_files)
)
ret.sort(key=lambda t: t[0])
return ret
left_list, right_list = map(build_hash_list, [left, right])
assert len(left_list) == len(
right_list
), f"unexpected number of files on tar files, {len(left_list)} != {len(right_list)}"
mismatching = set()
for left_tuple, right_tuple in zip(left_list, right_list):
left_path, left_hash = left_tuple
right_path, right_hash = right_tuple
assert (
left_path == right_path
), f"file count matched, expected these to be same paths: {left_path}, {right_path}"
if left_hash != right_hash:
mismatching.add(left_path)
assert len(mismatching) == 0, f"files with hash mismatch: {mismatching}"
elapsed = time.time() - started_at
log.info(f"assert_pageserver_backups_equal completed in {elapsed}s")


@@ -17,13 +17,9 @@ def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.extend(
[
# eviction might be the first one after an attach to access the layers
".*unexpectedly on-demand downloading remote layer .* for task kind Eviction",
# detach can happen before we get to validate the generation number
".*deletion backend: Dropped remote consistent LSN updates for tenant.*",
]
# eviction might be the first one after an attach to access the layers
env.pageserver.allowed_errors.append(
".*unexpectedly on-demand downloading remote layer .* for task kind Eviction"
)
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
return env
@@ -166,7 +162,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
"checkpoint_distance": 10000,
"checkpoint_timeout": "13m",
"compaction_algorithm": {
"kind": "tiered",
"kind": "Tiered",
},
"eviction_policy": {
"kind": "LayerAccessThreshold",
@@ -194,7 +190,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
"trace_read_requests": True,
"walreceiver_connect_timeout": "13m",
"image_layer_creation_check_threshold": 1,
"switch_aux_file_policy": "cross-validation",
"switch_aux_file_policy": "CrossValidation",
}
ps_http = env.pageserver.http_client()


@@ -1,6 +1,5 @@
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
AuxFileStore,
NeonEnvBuilder,
logical_replication_sync,
)
@@ -15,7 +14,7 @@ def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg):
timeline_id = env.initial_timeline
tenant_config = client.tenant_config(tenant_id).effective_config
tenant_config["switch_aux_file_policy"] = AuxFileStore.V2
tenant_config["switch_aux_file_policy"] = "V2"
client.set_tenant_config(tenant_id, tenant_config)
# aux file v2 is enabled on the write path, so for now, it should be unset (or null)
assert (
@@ -50,10 +49,7 @@ def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg):
with env.pageserver.http_client() as client:
# aux file v2 flag should be enabled at this point
assert (
client.timeline_detail(tenant_id, timeline_id)["last_aux_file_policy"]
== AuxFileStore.V2
)
assert client.timeline_detail(tenant_id, timeline_id)["last_aux_file_policy"] == "V2"
with env.pageserver.http_client() as client:
tenant_config = client.tenant_config(tenant_id).effective_config
tenant_config["switch_aux_file_policy"] = "V1"
@@ -63,7 +59,7 @@ def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg):
client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[
"last_aux_file_policy"
]
== AuxFileStore.V2
== "V2"
)
env.pageserver.restart()
with env.pageserver.http_client() as client:
@@ -72,5 +68,5 @@ def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg):
client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[
"last_aux_file_policy"
]
== AuxFileStore.V2
== "V2"
)


@@ -165,6 +165,7 @@ def test_sharding_compaction(
image_layer_sizes[layer.layer_file_name] = layer.layer_file_size
# Pageserver should assert rather than emit an empty layer file, but double check here
assert layer.layer_file_size is not None
assert layer.layer_file_size > 0
shard_has_image_layers.append(len(image_layer_sizes) > 1)
@@ -177,7 +178,7 @@ def test_sharding_compaction(
#
# We only do this check with tiny stripes, because large stripes may not give all shards enough
# data to have statistically significant image layers
avg_size = sum(v for v in image_layer_sizes.values()) / len(image_layer_sizes)
avg_size = sum(v for v in image_layer_sizes.values()) / len(image_layer_sizes) # type: ignore
log.info(f"Shard {shard_id} average image layer size: {avg_size}")
assert avg_size > compaction_target_size / 2
@@ -194,8 +195,8 @@ def test_sharding_compaction(
class CompactionAlgorithm(str, enum.Enum):
LEGACY = "legacy"
TIERED = "tiered"
LEGACY = "Legacy"
TIERED = "Tiered"
@pytest.mark.parametrize(


@@ -1,7 +1,7 @@
import os
from pathlib import Path
from fixtures.common_types import Lsn
from fixtures.common_types import Lsn, TimelineId
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
@@ -19,16 +19,17 @@ def test_fullbackup(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
port_distributor: PortDistributor,
pg_distrib_dir: Path,
test_output_dir: Path,
):
env = neon_env_builder.init_start()
# endpoint needs to be alive until the fullbackup so that we have
# prev_record_lsn for the vanilla_pg to start in read-write mode;
# for some reason this does not happen if the endpoint is shut down.
endpoint_main = env.endpoints.create_start("main")
env.neon_cli.create_branch("test_fullbackup")
endpoint_main = env.endpoints.create_start("test_fullbackup")
with endpoint_main.cursor() as cur:
timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id"))
# data loading may take a while, so increase statement timeout
cur.execute("SET statement_timeout='300s'")
cur.execute(
@@ -40,13 +41,17 @@ def test_fullbackup(
lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()"))
log.info(f"start_backup_lsn = {lsn}")
# Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq.
# PgBin sets it automatically, but here we need to pipe psql output to the tar command.
psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")}
# Get and unpack fullbackup from pageserver
restored_dir_path = env.repo_dir / "restored_datadir"
os.mkdir(restored_dir_path, 0o750)
query = f"fullbackup {env.initial_tenant} {timeline} {lsn}"
tar_output_file = test_output_dir / "fullbackup.tar"
pg_bin.take_fullbackup(
env.pageserver, env.initial_tenant, env.initial_timeline, lsn, tar_output_file
)
cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query, "-o", str(tar_output_file)]
pg_bin.run_capture(cmd, env=psql_env)
subprocess_capture(
env.repo_dir, ["tar", "-xf", str(tar_output_file), "-C", str(restored_dir_path)]
)
@@ -56,7 +61,7 @@ def test_fullbackup(
# use resetwal to overwrite it
pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal")
cmd = [pg_resetwal_path, "-D", str(restored_dir_path)]
pg_bin.run_capture(cmd)
pg_bin.run_capture(cmd, env=psql_env)
# Restore from the backup and find the data we inserted
port = port_distributor.get_port()


@@ -1,21 +1,9 @@
import asyncio
import os
import re
import threading
import time
from functools import partial
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
PgBin,
log_replica_lag,
tenant_get_shards,
wait_replica_caughtup,
)
from fixtures.utils import wait_until
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, tenant_get_shards, wait_replica_caughtup
# Check for corrupted WAL messages which might otherwise go unnoticed if
@@ -116,28 +104,19 @@ def test_2_replicas_start(neon_simple_env: NeonEnv):
wait_replica_caughtup(primary, secondary2)
# Test two different scenarios related to gc of data needed by hot standby.
# We had an issue that a standby server made GetPage requests with an
# old LSN, based on the last-written LSN cache, to avoid waits in the
# pageserver. However, requesting a page with a very old LSN, such
# that the GC horizon has already advanced past it, results in an
# error from the pageserver:
# "Bad request: tried to request a page version that was garbage collected"
#
# When pause_apply is False, standby is mostly caught up with the primary.
# However, in compute <-> pageserver protocol version 1 only one LSN had been
# sent to the pageserver in page request, and to avoid waits in the pageserver
# it was last-written LSN cache value. If page hasn't been updated for a long
# time that resulted in an error from the pageserver: "Bad request: tried to
# request a page version that was garbage collected". For primary this wasn't a
# problem because pageserver always bumped LSN to the newest one; for standy
# that would be incorrect since we might get page fresher then apply LSN. Hence,
# in protocol version v2 two LSNs were introduced: main request_lsn (apply LSN
# in case of standby) and not_modified_since which could be used as an
# optimization to avoid waiting.
# To avoid that, the compute<-> pageserver protocol was updated so
# that that the standby now sends two LSNs, the old last-written LSN
# and the current replay LSN.
#
# https://github.com/neondatabase/neon/issues/6211
#
# When pause_apply is True we model standby lagging behind primary (e.g. due to
# high max_standby_streaming_delay). To prevent pageserver from removing data
# still needed by the standby apply LSN is propagated in standby -> safekeepers
# -> broker -> pageserver flow so that pageserver could hold off gc for it.
@pytest.mark.parametrize("pause_apply", [False, True])
def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder):
tenant_conf = {
# set PITR interval to be small, so we can do GC
"pitr_interval": "0 s",
@@ -181,9 +160,6 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
# so we still remember the LSNs of the pages.
s_cur.execute("SELECT clear_buffer_cache()")
if pause_apply:
s_cur.execute("SELECT pg_wal_replay_pause()")
# Do other stuff on the primary, to advance the WAL
p_cur.execute("CREATE TABLE test2 AS SELECT generate_series(1, 1000000) AS g")
@@ -200,155 +176,6 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
# generates use old not_modified_since LSNs, older than
# the GC cutoff, but new request LSNs. (In protocol
# version 1 there was only one LSN, and this failed.)
log_replica_lag(primary, secondary)
s_cur.execute("SELECT COUNT(*) FROM test")
log_replica_lag(primary, secondary)
res = s_cur.fetchone()
assert res[0] == 10000
def run_pgbench(connstr: str, pg_bin: PgBin):
log.info(f"Start a pgbench workload on pg {connstr}")
# s10 is about 150MB of data. In debug mode init takes about 15s on SSD.
pg_bin.run_capture(["pgbench", "-i", "-s10", connstr])
log.info("pgbench init done")
pg_bin.run_capture(["pgbench", "-T60", connstr])
# assert that pgbench_accounts and its index are created.
def pgbench_accounts_initialized(ep):
ep.safe_psql_scalar("select 'pgbench_accounts_pkey'::regclass")
# Test that hot_standby_feedback works in neon (it is forwarded through
# safekeepers). That is, ensure queries on standby don't fail during load on
# primary under the following conditions:
# - pgbench bombards primary with updates.
# - On the secondary we run long select of the updated table.
# - Set small max_standby_streaming_delay: hs feedback should prevent conflicts
# so apply doesn't need to wait.
# - Do agressive vacuum on primary which still shouldn't create conflicts.
# Actually this appears to be redundant due to microvacuum existence.
#
# Without hs feedback enabled we'd see 'User query might have needed to see row
# versions that must be removed.' errors.
def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
env = neon_env_builder.init_start()
agressive_vacuum_conf = [
"log_autovacuum_min_duration = 0",
"autovacuum_naptime = 10s",
"autovacuum_vacuum_threshold = 25",
"autovacuum_vacuum_scale_factor = 0.1",
"autovacuum_vacuum_cost_delay = -1",
]
with env.endpoints.create_start(
branch_name="main", endpoint_id="primary", config_lines=agressive_vacuum_conf
) as primary:
# It would be great to have more strict max_standby_streaming_delay=0s here, but then sometimes it fails with
# 'User was holding shared buffer pin for too long.'.
with env.endpoints.new_replica_start(
origin=primary,
endpoint_id="secondary",
config_lines=[
"max_standby_streaming_delay=2s",
"neon.protocol_version=2",
"hot_standby_feedback=true",
],
) as secondary:
log.info(
f"primary connstr is {primary.connstr()}, secondary connstr {secondary.connstr()}"
)
t = threading.Thread(target=run_pgbench, args=(primary.connstr(), pg_bin))
t.start()
# Wait until pgbench_accounts is created + filled on replica *and*
# index is created. Otherwise index creation would conflict with
# read queries and hs feedback won't save us.
wait_until(60, 1.0, partial(pgbench_accounts_initialized, secondary))
# Test should fail if hs feedback is disabled anyway, but cross
# check that walproposer sets some xmin.
def xmin_is_not_null():
slot_xmin = primary.safe_psql_scalar(
"select xmin from pg_replication_slots where slot_name = 'wal_proposer_slot'",
log_query=False,
)
log.info(f"xmin is {slot_xmin}")
assert int(slot_xmin) > 0
wait_until(10, 1.0, xmin_is_not_null)
for _ in range(1, 5):
# in debug mode takes about 5-7s
balance = secondary.safe_psql_scalar("select sum(abalance) from pgbench_accounts")
log.info(f"balance={balance}")
log_replica_lag(primary, secondary)
t.join()
# check xmin is reset when standby is gone
def xmin_is_null():
slot_xmin = primary.safe_psql_scalar(
"select xmin from pg_replication_slots where slot_name = 'wal_proposer_slot'",
log_query=False,
)
log.info(f"xmin is {slot_xmin}")
assert slot_xmin is None
wait_until(10, 1.0, xmin_is_null)
# Test race condition between WAL replay and backends performing queries
# https://github.com/neondatabase/neon/issues/7791
def test_replica_query_race(neon_simple_env: NeonEnv):
env = neon_simple_env
primary_ep = env.endpoints.create_start(
branch_name="main",
endpoint_id="primary",
)
with primary_ep.connect() as p_con:
with p_con.cursor() as p_cur:
p_cur.execute("CREATE EXTENSION neon_test_utils")
p_cur.execute("CREATE TABLE test AS SELECT 0 AS counter")
standby_ep = env.endpoints.new_replica_start(origin=primary_ep, endpoint_id="standby")
time.sleep(1)
# In primary, run a lot of UPDATEs on a single page
finished = False
writecounter = 1
async def primary_workload():
nonlocal writecounter, finished
conn = await primary_ep.connect_async()
while writecounter < 10000:
writecounter += 1
await conn.execute(f"UPDATE test SET counter = {writecounter}")
finished = True
# In standby, at the same time, run queries on it. And repeatedly drop caches
async def standby_workload():
nonlocal writecounter, finished
conn = await standby_ep.connect_async()
reads = 0
while not finished:
readcounter = await conn.fetchval("SELECT counter FROM test")
# Check that the replica is keeping up with the primary. In local
# testing, the lag between primary and standby is much smaller, in
# the ballpark of 2-3 counter values. But be generous in case there's
# some hiccup.
# assert(writecounter - readcounter < 1000)
assert readcounter <= writecounter
if reads % 100 == 0:
log.info(f"read {reads}: counter {readcounter}, last update {writecounter}")
reads += 1
await conn.execute("SELECT clear_buffer_cache()")
async def both():
await asyncio.gather(
primary_workload(),
standby_workload(),
)
asyncio.run(both())

View File

@@ -21,7 +21,7 @@ from fixtures.pageserver.utils import (
wait_for_upload,
)
from fixtures.remote_storage import RemoteStorageKind
from fixtures.utils import assert_pageserver_backups_equal, subprocess_capture
from fixtures.utils import subprocess_capture
def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_builder):
@@ -163,7 +163,7 @@ def test_import_from_pageserver_small(
num_rows = 3000
lsn = _generate_data(num_rows, endpoint)
_import(num_rows, lsn, env, pg_bin, timeline, test_output_dir)
_import(num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir, test_output_dir)
@pytest.mark.timeout(1800)
@@ -193,7 +193,9 @@ def test_import_from_pageserver_multisegment(
log.info(f"timeline logical size = {logical_size / (1024 ** 2)}MB")
assert logical_size > 1024**3 # = 1GB
tar_output_file = _import(num_rows, lsn, env, pg_bin, timeline, test_output_dir)
tar_output_file = _import(
num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir, test_output_dir
)
# Check if the backup data contains multiple segment files
cnt_seg_files = 0
@@ -233,6 +235,7 @@ def _import(
env: NeonEnv,
pg_bin: PgBin,
timeline: TimelineId,
pg_distrib_dir: Path,
test_output_dir: Path,
) -> Path:
"""Test importing backup data to the pageserver.
@@ -245,9 +248,15 @@ def _import(
path to the backup archive file"""
log.info(f"start_backup_lsn = {lsn}")
# Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq.
# PgBin sets it automatically, but here we need to pipe psql output to the tar command.
psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")}
# Get a fullbackup from pageserver
query = f"fullbackup { env.initial_tenant} {timeline} {lsn}"
tar_output_file = test_output_dir / "fullbackup.tar"
pg_bin.take_fullbackup(env.pageserver, env.initial_tenant, timeline, lsn, tar_output_file)
cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query, "-o", str(tar_output_file)]
pg_bin.run_capture(cmd, env=psql_env)
# Stop the first pageserver instance, erase all its data
env.endpoints.stop_all()
@@ -292,15 +301,26 @@ def _import(
wait_for_upload(client, tenant, timeline, lsn)
# Check it worked
endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant, lsn=lsn)
endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant)
assert endpoint.safe_psql("select count(*) from tbl") == [(expected_num_rows,)]
# Take another fullbackup
query = f"fullbackup { tenant} {timeline} {lsn}"
new_tar_output_file = test_output_dir / "fullbackup-new.tar"
pg_bin.take_fullbackup(env.pageserver, tenant, timeline, lsn, new_tar_output_file)
cmd = [
"psql",
"--no-psqlrc",
env.pageserver.connstr(),
"-c",
query,
"-o",
str(new_tar_output_file),
]
pg_bin.run_capture(cmd, env=psql_env)
# Check it's the same as the first fullbackup
assert_pageserver_backups_equal(tar_output_file, new_tar_output_file, set())
# TODO pageserver should be checking checksum
assert os.path.getsize(tar_output_file) == os.path.getsize(new_tar_output_file)
# Check that gc works
pageserver_http = env.pageserver.http_client()

View File

@@ -272,14 +272,14 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
resident_physical_size_metric == 0
), "ensure that resident_physical_size metric is zero"
assert resident_physical_size_metric == sum(
layer.layer_file_size for layer in info.historic_layers if not layer.remote
layer.layer_file_size or 0 for layer in info.historic_layers if not layer.remote
), "ensure that resident_physical_size metric corresponds to layer map dump"
remote_physical_size_metric = ps_http.get_timeline_metric(
tenant_id, timeline_id, "pageserver_remote_physical_size"
)
assert remote_physical_size_metric == sum(
layer.layer_file_size for layer in info.historic_layers if layer.remote
layer.layer_file_size or 0 for layer in info.historic_layers if layer.remote
), "ensure that remote_physical_size metric corresponds to layer map dump"
log.info("before runnning GC, ensure that remote_physical size is zero")

View File

@@ -5,7 +5,7 @@ from pathlib import Path
from fixtures.common_types import Lsn, TenantId, TimelineId
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_wal_insert_lsn
from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_wal_insert_lsn
from fixtures.pageserver.utils import (
wait_for_last_record_lsn,
)
@@ -71,17 +71,22 @@ def test_next_xid(neon_env_builder: NeonEnvBuilder):
def test_import_at_2bil(
neon_env_builder: NeonEnvBuilder,
test_output_dir: Path,
pg_bin: PgBin,
pg_distrib_dir: Path,
pg_bin,
vanilla_pg,
):
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start()
ps_http = env.pageserver.http_client()
# Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq.
# PgBin sets it automatically, but here we need to pipe psql output to the tar command.
psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")}
# Reset the vanilla Postgres instance to somewhat before 2 billion transactions.
pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal")
cmd = [pg_resetwal_path, "--next-transaction-id=2129920000", "-D", str(vanilla_pg.pgdatadir)]
pg_bin.run_capture(cmd)
pg_bin.run_capture(cmd, env=psql_env)
vanilla_pg.start()
vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser")

View File

@@ -540,6 +540,7 @@ def test_compaction_downloads_on_demand_without_image_creation(neon_env_builder:
for layer in layers.historic_layers:
log.info(f"pre-compact: {layer}")
assert layer.layer_file_size is not None, "we must know layer file sizes"
layer_sizes += layer.layer_file_size
pageserver_http.evict_layer(tenant_id, timeline_id, layer.layer_file_name)

View File

@@ -287,7 +287,7 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder):
total_historic_bytes += sum(
layer.layer_file_size
for layer in layer_map.historic_layers
if Lsn(layer.lsn_start) > initdb_lsn
if layer.layer_file_size is not None and Lsn(layer.lsn_start) > initdb_lsn
)
total_ephemeral_layers += len(layer_map.in_memory_layers)

View File

@@ -575,10 +575,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
tenant_timelines = {}
# This mirrors a constant in `downloader.rs`
default_download_period_secs = 60
# The upload period, which will also be the download once the secondary has seen its first heatmap
upload_period_secs = 30
freshen_interval_secs = 60
for _i in range(0, tenant_count):
tenant_id = TenantId.generate()
@@ -590,32 +587,17 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
placement_policy='{"Attached":1}',
# Run with a low heatmap period so that we can avoid having to do synthetic API calls
# to trigger the upload promptly.
conf={"heatmap_period": f"{upload_period_secs}s"},
conf={"heatmap_period": "1s"},
)
env.neon_cli.create_timeline("main2", tenant_id, timeline_b)
tenant_timelines[tenant_id] = [timeline_a, timeline_b]
def await_log(pageserver, deadline, expression):
"""
Wrapper around assert_log_contains that waits with a deadline rather than timeout
"""
now = time.time()
if now > deadline:
raise RuntimeError(f"Timed out waiting for {expression}")
else:
timeout = int(deadline - now) + 1
try:
wait_until(timeout, 1, lambda: pageserver.assert_log_contains(expression)) # type: ignore
except:
log.error(f"Timed out waiting for '{expression}'")
raise
t_start = time.time()
# Wait long enough that the background downloads should happen; we expect all the inital layers
# of all the initial timelines to show up on the secondary location of each tenant.
initial_download_deadline = time.time() + default_download_period_secs * 3
time.sleep(freshen_interval_secs * 1.5)
for tenant_id, timelines in tenant_timelines.items():
attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
@@ -623,32 +605,16 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
# We only have two: the other one must be secondary
ps_secondary = next(p for p in env.pageservers if p != ps_attached)
now = time.time()
if now > initial_download_deadline:
raise RuntimeError("Timed out waiting for initial secondary download")
else:
for timeline_id in timelines:
log.info(
f"Waiting for downloads of timeline {timeline_id} on secondary pageserver {ps_secondary.id}"
)
await_log(
ps_secondary,
initial_download_deadline,
f".*{timeline_id}.*Wrote timeline_detail.*",
)
for timeline_id in timelines:
log.info(
f"Checking for secondary timeline downloads {timeline_id} on node {ps_secondary.id}"
)
log.info(f"Checking for secondary timeline {timeline_id} on node {ps_secondary.id}")
# One or more layers should be present for all timelines
assert ps_secondary.list_layers(tenant_id, timeline_id)
# Delete the second timeline: this should be reflected later on the secondary
env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1])
# Wait long enough for the secondary locations to see the deletion: 2x period plus a grace factor
deletion_deadline = time.time() + upload_period_secs * 3
# Wait long enough for the secondary locations to see the deletion
time.sleep(freshen_interval_secs * 1.5)
for tenant_id, timelines in tenant_timelines.items():
attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
@@ -656,24 +622,11 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
# We only have two: the other one must be secondary
ps_secondary = next(p for p in env.pageservers if p != ps_attached)
expect_del_timeline = timelines[1]
log.info(
f"Waiting for deletion of timeline {expect_del_timeline} on secondary pageserver {ps_secondary.id}"
)
await_log(
ps_secondary,
deletion_deadline,
f".*Timeline no longer in heatmap.*{expect_del_timeline}.*",
)
# This one was not deleted
assert ps_secondary.list_layers(tenant_id, timelines[0])
# This one was deleted
log.info(
f"Checking for secondary timeline deletion {tenant_id}/{timeline_id} on node {ps_secondary.id}"
)
assert not ps_secondary.list_layers(tenant_id, expect_del_timeline)
assert not ps_secondary.list_layers(tenant_id, timelines[1])
t_end = time.time()
@@ -687,7 +640,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
download_rate = (total_heatmap_downloads / tenant_count) / (t_end - t_start)
expect_download_rate = 1.0 / upload_period_secs
expect_download_rate = 1.0 / freshen_interval_secs
log.info(f"Download rate: {download_rate * 60}/min vs expected {expect_download_rate * 60}/min")
assert download_rate < expect_download_rate * 2

View File

@@ -1,25 +1,16 @@
#
# This file runs pg_regress-based tests.
#
from __future__ import annotations
from pathlib import Path
from typing import TYPE_CHECKING, cast
from typing import Optional
import pytest
from fixtures.neon_fixtures import (
NeonEnvBuilder,
check_restored_datadir_content,
)
from fixtures.pg_version import PgVersion
from fixtures.remote_storage import s3_storage
if TYPE_CHECKING:
from typing import Optional
from fixtures.neon_fixtures import PgBin
from pytest import CaptureFixture
# Run the main PostgreSQL regression tests, in src/test/regress.
#
@@ -28,14 +19,12 @@ def test_pg_regress(
neon_env_builder: NeonEnvBuilder,
test_output_dir: Path,
build_type: str,
pg_bin: PgBin,
capsys: CaptureFixture[str],
pg_bin,
capsys,
base_dir: Path,
pg_distrib_dir: Path,
shard_count: Optional[int],
):
DBNAME = "regression"
"""
:param shard_count: if None, create an unsharded tenant. Otherwise create a tenant with this
many shards.
@@ -53,7 +42,7 @@ def test_pg_regress(
# Connect to postgres and create a database called "regression".
endpoint = env.endpoints.create_start("main")
endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")
endpoint.safe_psql("CREATE DATABASE regression")
# Create some local directories for pg_regress to run in.
runpath = test_output_dir / "regress"
@@ -88,67 +77,7 @@ def test_pg_regress(
with capsys.disabled():
pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)
ignored_files: Optional[list[str]] = None
# Neon handles unlogged relations in a special manner. During a
# basebackup, we ship the init fork as the main fork. This presents a
# problem in that the endpoint's data directory and the basebackup will
# have differences and will fail the eventual file comparison.
#
# Unlogged tables were introduced in version 9.1. ALTER TABLE grew
# support for setting the persistence of a table in 9.5. The reason that
# this doesn't affect versions < 15 (but probably would between 9.1 and
# 9.5) is that all the regression tests that deal with unlogged tables
# up until that point dropped the unlogged tables or set them to logged
# at some point during the test.
#
# In version 15, Postgres grew support for unlogged sequences, and with
# that came a few more regression tests. These tests did not all drop
# the unlogged tables/sequences prior to finishing.
#
# But unlogged sequences came with a bug in that, sequences didn't
# inherit the persistence of their "parent" tables if they had one. This
# was fixed and backported to 15, thus exacerbating our problem a bit.
#
# So what we can do is just ignore file differences between the data
# directory and basebackup for unlogged relations.
results = cast(
"list[tuple[str, str]]",
endpoint.safe_psql(
"""
SELECT
relkind,
pg_relation_filepath(
pg_filenode_relation(reltablespace, relfilenode)
) AS unlogged_relation_paths
FROM pg_class
WHERE relpersistence = 'u'
""",
dbname=DBNAME,
),
)
unlogged_relation_files: list[str] = []
for r in results:
unlogged_relation_files.append(r[1])
# This is related to the following Postgres commit:
#
# commit ccadf73163ca88bdaa74b8223d4dde05d17f550b
# Author: Heikki Linnakangas <heikki.linnakangas@iki.fi>
# Date: 2023-08-23 09:21:31 -0500
#
# Use the buffer cache when initializing an unlogged index.
#
# This patch was backpatched to 16. Without it, the LSN in the
# page header would be 0/0 in the data directory, which wouldn't
# match the LSN generated during the basebackup, thus creating
# a difference.
if env.pg_version <= PgVersion.V15 and r[0] == "i":
unlogged_relation_files.append(f"{r[1]}_init")
ignored_files = unlogged_relation_files
check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files)
check_restored_datadir_content(test_output_dir, env, endpoint)
# Run the PostgreSQL "isolation" tests, in src/test/isolation.
@@ -157,8 +86,8 @@ def test_pg_regress(
def test_isolation(
neon_env_builder: NeonEnvBuilder,
test_output_dir: Path,
pg_bin: PgBin,
capsys: CaptureFixture[str],
pg_bin,
capsys,
base_dir: Path,
pg_distrib_dir: Path,
shard_count: Optional[int],
@@ -213,8 +142,8 @@ def test_isolation(
def test_sql_regress(
neon_env_builder: NeonEnvBuilder,
test_output_dir: Path,
pg_bin: PgBin,
capsys: CaptureFixture[str],
pg_bin,
capsys,
base_dir: Path,
pg_distrib_dir: Path,
shard_count: Optional[int],

View File

@@ -632,6 +632,7 @@ def test_sharding_ingest_layer_sizes(
historic_layers = sorted(layer_map.historic_layers, key=lambda layer: layer.lsn_start)
for layer in historic_layers:
assert layer.layer_file_size is not None
if layer.layer_file_size < expect_layer_size // 2:
classification = "Small"
small_layer_count += 1

Some files were not shown because too many files have changed in this diff Show More