Compare commits

..

6 Commits

Author SHA1 Message Date
John Spray
c4b0ecec33 pageserver: reduce usage of TENANTS 2024-05-15 20:56:17 +01:00
John Spray
a5565cc641 pageserver: pull get_active_tenant_with_timeout into TenantManager 2024-05-15 20:49:16 +01:00
John Spray
1804abd23d pageserver: pass a TenantManager into page service 2024-05-15 20:45:31 +01:00
John Spray
0d4247d2d8 fix a weird variable name 2024-05-15 18:14:10 +01:00
John Spray
b968ae0aa8 make a comment clearer 2024-05-15 18:13:37 +01:00
John Spray
f901cd459b pageserver: refine tenant_id->shard lookup 2024-05-14 22:20:57 +01:00
190 changed files with 4038 additions and 9235 deletions

View File

@@ -17,7 +17,6 @@
!libs/
!neon_local/
!pageserver/
!patches/
!pgxn/
!proxy/
!s3_scrubber/

View File

@@ -5,7 +5,6 @@ self-hosted-runner:
- large
- large-arm64
- small
- small-arm64
- us-east-2
config-variables:
- REMOTE_STORAGE_AZURE_CONTAINER

View File

@@ -3,13 +3,13 @@ description: 'Create Branch using API'
inputs:
api_key:
description: 'Neon API key'
desctiption: 'Neon API key'
required: true
project_id:
description: 'ID of the Project to create Branch in'
desctiption: 'ID of the Project to create Branch in'
required: true
api_host:
description: 'Neon API host'
desctiption: 'Neon API host'
default: console-stage.neon.build
outputs:
dsn:

View File

@@ -3,16 +3,16 @@ description: 'Delete Branch using API'
inputs:
api_key:
description: 'Neon API key'
desctiption: 'Neon API key'
required: true
project_id:
description: 'ID of the Project which should be deleted'
desctiption: 'ID of the Project which should be deleted'
required: true
branch_id:
description: 'ID of the branch to delete'
desctiption: 'ID of the branch to delete'
required: true
api_host:
description: 'Neon API host'
desctiption: 'Neon API host'
default: console-stage.neon.build
runs:

View File

@@ -3,22 +3,22 @@ description: 'Create Neon Project using API'
inputs:
api_key:
description: 'Neon API key'
desctiption: 'Neon API key'
required: true
region_id:
description: 'Region ID, if not set the project will be created in the default region'
desctiption: 'Region ID, if not set the project will be created in the default region'
default: aws-us-east-2
postgres_version:
description: 'Postgres version; default is 15'
default: '15'
desctiption: 'Postgres version; default is 15'
default: 15
api_host:
description: 'Neon API host'
desctiption: 'Neon API host'
default: console-stage.neon.build
provisioner:
description: 'k8s-pod or k8s-neonvm'
desctiption: 'k8s-pod or k8s-neonvm'
default: 'k8s-pod'
compute_units:
description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
desctiption: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
default: '[1, 1]'
outputs:

View File

@@ -3,13 +3,13 @@ description: 'Delete Neon Project using API'
inputs:
api_key:
description: 'Neon API key'
desctiption: 'Neon API key'
required: true
project_id:
description: 'ID of the Project to delete'
desctiption: 'ID of the Project to delete'
required: true
api_host:
description: 'Neon API host'
desctiption: 'Neon API host'
default: console-stage.neon.build
runs:

View File

@@ -548,7 +548,7 @@ jobs:
report-benchmarks-failures:
needs: [ benchmarks, create-test-report ]
if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure'
if: github.ref_name == 'main' && needs.benchmarks.result == 'failure'
runs-on: ubuntu-latest
steps:
@@ -723,13 +723,9 @@ jobs:
uses: ./.github/workflows/trigger-e2e-tests.yml
secrets: inherit
neon-image-arch:
neon-image:
needs: [ check-permissions, build-build-tools-image, tag ]
strategy:
matrix:
arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
runs-on: [ self-hosted, gen3, large ]
steps:
- name: Checkout
@@ -751,6 +747,12 @@ jobs:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- uses: docker/login-action@v3
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- uses: docker/build-push-action@v5
with:
context: .
@@ -762,52 +764,25 @@ jobs:
push: true
pull: true
file: Dockerfile
cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }}
cache-to: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }},mode=max
cache-from: type=registry,ref=neondatabase/neon:cache
cache-to: type=registry,ref=neondatabase/neon:cache,mode=max
tags: |
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
neondatabase/neon:${{needs.tag.outputs.build-tag}}
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom
neon-image:
needs: [ neon-image-arch, tag ]
runs-on: ubuntu-latest
steps:
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Create multi-arch image
run: |
docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-x64 \
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-arm64
- uses: docker/login-action@v3
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- name: Push multi-arch image to ECR
run: |
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{ needs.tag.outputs.build-tag }} \
neondatabase/neon:${{ needs.tag.outputs.build-tag }}
compute-node-image-arch:
compute-node-image:
needs: [ check-permissions, build-build-tools-image, tag ]
runs-on: [ self-hosted, gen3, large ]
strategy:
fail-fast: false
matrix:
version: [ v14, v15, v16 ]
arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
steps:
- name: Checkout
@@ -854,14 +829,15 @@ jobs:
push: true
pull: true
file: Dockerfile.compute-node
cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }}
cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache
cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache,mode=max
tags: |
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
- name: Build compute-tools image
# compute-tools are Postgres independent, so build it only once
if: matrix.version == 'v16'
if: ${{ matrix.version == 'v16' }}
uses: docker/build-push-action@v5
with:
target: compute-tools-image
@@ -875,57 +851,14 @@ jobs:
pull: true
file: Dockerfile.compute-node
tags: |
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }}
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom
compute-node-image:
needs: [ compute-node-image-arch, tag ]
runs-on: ubuntu-latest
strategy:
matrix:
version: [ v14, v15, v16 ]
steps:
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Create multi-arch compute-node image
run: |
docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
- name: Create multi-arch compute-tools image
if: matrix.version == 'v16'
run: |
docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-arm64
- uses: docker/login-action@v3
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- name: Push multi-arch compute-node-${{ matrix.version }} image to ECR
run: |
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
- name: Push multi-arch compute-tools image to ECR
if: matrix.version == 'v16'
run: |
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
vm-compute-node-image:
needs: [ check-permissions, tag, compute-node-image ]
runs-on: [ self-hosted, gen3, large ]
@@ -933,8 +866,11 @@ jobs:
fail-fast: false
matrix:
version: [ v14, v15, v16 ]
defaults:
run:
shell: sh -eu {0}
env:
VM_BUILDER_VERSION: v0.29.3
VM_BUILDER_VERSION: v0.28.1
steps:
- name: Checkout
@@ -947,48 +883,26 @@ jobs:
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
chmod +x vm-builder
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
# Note: we need a separate pull step here because otherwise vm-builder will try to pull, and
# it won't have the proper authentication (written at v0.6.0)
- name: Pulling compute-node image
run: |
docker pull neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
- name: Build vm image
run: |
./vm-builder \
-spec=vm-image-spec.yaml \
-src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
-dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
-src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
-dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
- name: Pushing vm-compute-node image
run: |
docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom
docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
test-images:
needs: [ check-permissions, tag, neon-image, compute-node-image ]
strategy:
fail-fast: false
matrix:
arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
runs-on: [ self-hosted, gen3, small ]
steps:
- name: Checkout
@@ -1006,7 +920,7 @@ jobs:
- name: Verify image versions
shell: bash # ensure no set -e for better error messages
run: |
pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.tag.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
pageserver_version=$(docker run --rm 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
echo "Pageserver version string: $pageserver_version"
@@ -1032,48 +946,78 @@ jobs:
promote-images:
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
runs-on: ubuntu-latest
env:
VERSIONS: v14 v15 v16
runs-on: [ self-hosted, gen3, small ]
container: golang:1.19-bullseye
# Don't add if-condition here.
# The job should always be run because we have dependant other jobs that shouldn't be skipped
steps:
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- uses: docker/login-action@v3
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- name: Copy vm-compute-node images to ECR
- name: Install Crane & ECR helper
run: |
for version in ${VERSIONS}; do
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} \
neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
done
go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
- name: Configure ECR login
run: |
mkdir /github/home/.docker/
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
- name: Copy vm-compute-node images to Docker Hub
run: |
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16
- name: Add latest tag to images
if: github.ref_name == 'main'
if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
run: |
for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do
docker buildx imagetools create -t $repo/neon:latest \
$repo/neon:${{ needs.tag.outputs.build-tag }}
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
docker buildx imagetools create -t $repo/compute-tools:latest \
$repo/compute-tools:${{ needs.tag.outputs.build-tag }}
- name: Push images to production ECR
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
run: |
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:latest
for version in ${VERSIONS}; do
docker buildx imagetools create -t $repo/compute-node-${version}:latest \
$repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }}
- name: Configure Docker Hub login
run: |
# ECR Credential Helper & Docker Hub don't work together in config, hence reset
echo "" > /github/home/.docker/config.json
crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io
docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \
$repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
done
done
- name: Push vm-compute-node to Docker Hub
run: |
crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}}
- name: Push latest tags to Docker Hub
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
run: |
crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
trigger-custom-extensions-build-and-wait:
needs: [ check-permissions, tag ]

View File

@@ -136,7 +136,7 @@ jobs:
check-linux-arm-build:
needs: [ check-permissions, build-build-tools-image ]
timeout-minutes: 90
runs-on: [ self-hosted, small-arm64 ]
runs-on: [ self-hosted, large-arm64 ]
env:
# Use release build only, to have less debug info around
@@ -260,7 +260,7 @@ jobs:
check-codestyle-rust-arm:
needs: [ check-permissions, build-build-tools-image ]
timeout-minutes: 90
runs-on: [ self-hosted, small-arm64 ]
runs-on: [ self-hosted, large-arm64 ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}

View File

@@ -53,7 +53,7 @@ jobs:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
cat << EOF > body.md
## Storage & Compute release ${RELEASE_DATE}
## Release ${RELEASE_DATE}
**Please merge this Pull Request using 'Create a merge commit' button**
EOF

200
Cargo.lock generated
View File

@@ -708,7 +708,7 @@ dependencies = [
"sha1",
"sync_wrapper",
"tokio",
"tokio-tungstenite",
"tokio-tungstenite 0.20.0",
"tower",
"tower-layer",
"tower-service",
@@ -979,12 +979,6 @@ version = "3.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"
[[package]]
name = "bytemuck"
version = "1.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78834c15cb5d5efe3452d58b1e8ba890dd62d21907f867f383358198e56ebca5"
[[package]]
name = "byteorder"
version = "1.4.3"
@@ -1072,9 +1066,9 @@ dependencies = [
[[package]]
name = "chrono"
version = "0.4.38"
version = "0.4.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38"
dependencies = [
"android-tzdata",
"iana-time-zone",
@@ -1082,7 +1076,7 @@ dependencies = [
"num-traits",
"serde",
"wasm-bindgen",
"windows-targets 0.52.4",
"windows-targets 0.48.0",
]
[[package]]
@@ -1109,7 +1103,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b"
dependencies = [
"ciborium-io",
"half 1.8.2",
"half",
]
[[package]]
@@ -1239,10 +1233,8 @@ dependencies = [
"serde_json",
"signal-hook",
"tar",
"thiserror",
"tokio",
"tokio-postgres",
"tokio-stream",
"tokio-util",
"toml_edit",
"tracing",
@@ -1471,21 +1463,26 @@ dependencies = [
[[package]]
name = "crossbeam-deque"
version = "0.8.5"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d"
checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
dependencies = [
"cfg-if",
"crossbeam-epoch",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.18"
version = "0.9.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695"
dependencies = [
"autocfg",
"cfg-if",
"crossbeam-utils",
"memoffset 0.8.0",
"scopeguard",
]
[[package]]
@@ -1599,7 +1596,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d"
dependencies = [
"cfg-if",
"hashbrown 0.14.5",
"hashbrown 0.14.0",
"lock_api",
"once_cell",
"parking_lot_core 0.9.8",
@@ -2000,27 +1997,6 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "framed-websockets"
version = "0.1.0"
source = "git+https://github.com/neondatabase/framed-websockets#34eff3d6f8cfccbc5f35e4f65314ff7328621127"
dependencies = [
"base64 0.21.1",
"bytemuck",
"bytes",
"futures-core",
"futures-sink",
"http-body-util",
"hyper 1.2.0",
"hyper-util",
"pin-project",
"rand 0.8.5",
"sha1",
"thiserror",
"tokio",
"tokio-util",
]
[[package]]
name = "fs2"
version = "0.4.3"
@@ -2273,17 +2249,6 @@ version = "1.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"
[[package]]
name = "half"
version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888"
dependencies = [
"cfg-if",
"crunchy",
"num-traits",
]
[[package]]
name = "hash32"
version = "0.3.1"
@@ -2310,9 +2275,9 @@ dependencies = [
[[package]]
name = "hashbrown"
version = "0.14.5"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
dependencies = [
"ahash",
"allocator-api2",
@@ -2320,11 +2285,11 @@ dependencies = [
[[package]]
name = "hashlink"
version = "0.9.1"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af"
checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7"
dependencies = [
"hashbrown 0.14.5",
"hashbrown 0.14.0",
]
[[package]]
@@ -2633,6 +2598,21 @@ dependencies = [
"tokio-native-tls",
]
[[package]]
name = "hyper-tungstenite"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a343d17fe7885302ed7252767dc7bb83609a874b6ff581142241ec4b73957ad"
dependencies = [
"http-body-util",
"hyper 1.2.0",
"hyper-util",
"pin-project-lite",
"tokio",
"tokio-tungstenite 0.21.0",
"tungstenite 0.21.0",
]
[[package]]
name = "hyper-util"
version = "0.1.3"
@@ -2710,7 +2690,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e"
dependencies = [
"equivalent",
"hashbrown 0.14.5",
"hashbrown 0.14.0",
]
[[package]]
@@ -2972,7 +2952,7 @@ version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3262e75e648fce39813cb56ac41f3c3e3f65217ebf3844d818d1f9398cfb0dc"
dependencies = [
"hashbrown 0.14.5",
"hashbrown 0.14.0",
]
[[package]]
@@ -3025,7 +3005,7 @@ checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5"
dependencies = [
"bytes",
"crossbeam-utils",
"hashbrown 0.14.5",
"hashbrown 0.14.0",
"itoa",
"lasso",
"measured-derive",
@@ -3587,7 +3567,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79"
dependencies = [
"dlv-list",
"hashbrown 0.14.5",
"hashbrown 0.14.0",
]
[[package]]
@@ -3908,14 +3888,13 @@ dependencies = [
[[package]]
name = "parquet"
version = "51.0.0"
source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
version = "49.0.0"
source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
dependencies = [
"ahash",
"bytes",
"chrono",
"half 2.4.1",
"hashbrown 0.14.5",
"hashbrown 0.14.0",
"num",
"num-bigint",
"paste",
@@ -3923,13 +3902,12 @@ dependencies = [
"thrift",
"twox-hash",
"zstd",
"zstd-sys",
]
[[package]]
name = "parquet_derive"
version = "51.0.0"
source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
version = "49.0.0"
source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
dependencies = [
"parquet",
"proc-macro2",
@@ -3956,9 +3934,9 @@ checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c"
[[package]]
name = "pbkdf2"
version = "0.12.2"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2"
checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31"
dependencies = [
"digest",
"hmac",
@@ -4105,6 +4083,17 @@ dependencies = [
"tokio-postgres",
]
[[package]]
name = "postgres-native-tls"
version = "0.5.0"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
dependencies = [
"native-tls",
"tokio",
"tokio-native-tls",
"tokio-postgres",
]
[[package]]
name = "postgres-protocol"
version = "0.6.4"
@@ -4370,7 +4359,6 @@ dependencies = [
name = "proxy"
version = "0.1.0"
dependencies = [
"ahash",
"anyhow",
"async-compression",
"async-trait",
@@ -4387,14 +4375,12 @@ dependencies = [
"chrono",
"clap",
"consumption_metrics",
"crossbeam-deque",
"dashmap",
"env_logger",
"fallible-iterator",
"framed-websockets",
"futures",
"git-version",
"hashbrown 0.14.5",
"hashbrown 0.13.2",
"hashlink",
"hex",
"hmac",
@@ -4404,6 +4390,7 @@ dependencies = [
"humantime",
"hyper 0.14.26",
"hyper 1.2.0",
"hyper-tungstenite",
"hyper-util",
"indexmap 2.0.1",
"ipnet",
@@ -4412,6 +4399,7 @@ dependencies = [
"md5",
"measured",
"metrics",
"native-tls",
"once_cell",
"opentelemetry",
"parking_lot 0.12.1",
@@ -4419,6 +4407,7 @@ dependencies = [
"parquet_derive",
"pbkdf2",
"pin-project-lite",
"postgres-native-tls",
"postgres-protocol",
"postgres_backend",
"pq_proto",
@@ -4446,6 +4435,7 @@ dependencies = [
"smol_str",
"socket2 0.5.5",
"subtle",
"sync_wrapper",
"task-local-extensions",
"thiserror",
"tikv-jemalloc-ctl",
@@ -4454,7 +4444,6 @@ dependencies = [
"tokio-postgres",
"tokio-postgres-rustls",
"tokio-rustls 0.25.0",
"tokio-tungstenite",
"tokio-util",
"tower-service",
"tracing",
@@ -4466,7 +4455,7 @@ dependencies = [
"utils",
"uuid",
"walkdir",
"webpki-roots 0.26.1",
"webpki-roots 0.25.2",
"workspace_hack",
"x509-parser",
]
@@ -5219,20 +5208,20 @@ dependencies = [
"hex",
"histogram",
"itertools",
"native-tls",
"pageserver",
"pageserver_api",
"postgres-native-tls",
"postgres_ffi",
"rand 0.8.5",
"remote_storage",
"reqwest 0.12.4",
"rustls 0.22.4",
"serde",
"serde_json",
"serde_with",
"thiserror",
"tokio",
"tokio-postgres",
"tokio-postgres-rustls",
"tokio-rustls 0.25.0",
"tokio-stream",
"tokio-util",
@@ -5240,7 +5229,6 @@ dependencies = [
"tracing-appender",
"tracing-subscriber",
"utils",
"webpki-roots 0.26.1",
"workspace_hack",
]
@@ -5831,15 +5819,6 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "statx-sys"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69c325f46f705b7a66fb87f0ebb999524a7363f30f05d373277b4ef7f409fe87"
dependencies = [
"libc",
]
[[package]]
name = "storage_broker"
version = "0.1.0"
@@ -5973,7 +5952,7 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
[[package]]
name = "svg_fmt"
version = "0.4.2"
source = "git+https://github.com/nical/rust_debug?rev=28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4#28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4"
source = "git+https://github.com/neondatabase/fork--nical--rust_debug?branch=neon#c1820b28664b5df68de7f043fccf2ed5d67b6ae8"
[[package]]
name = "syn"
@@ -6263,7 +6242,7 @@ dependencies = [
[[package]]
name = "tokio-epoll-uring"
version = "0.1.0"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=arpad/statx_sys#ca8446b8edb5e0aef88520f2fc209a13a834fd25"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
dependencies = [
"futures",
"nix 0.26.4",
@@ -6401,7 +6380,19 @@ dependencies = [
"futures-util",
"log",
"tokio",
"tungstenite",
"tungstenite 0.20.1",
]
[[package]]
name = "tokio-tungstenite"
version = "0.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c83b561d025642014097b66e6c1bb422783339e0909e4429cde4749d1990bc38"
dependencies = [
"futures-util",
"log",
"tokio",
"tungstenite 0.21.0",
]
[[package]]
@@ -6415,7 +6406,7 @@ dependencies = [
"futures-io",
"futures-sink",
"futures-util",
"hashbrown 0.14.5",
"hashbrown 0.14.0",
"pin-project-lite",
"tokio",
"tracing",
@@ -6697,6 +6688,25 @@ dependencies = [
"utf-8",
]
[[package]]
name = "tungstenite"
version = "0.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ef1a641ea34f399a848dea702823bbecfb4c486f911735368f1f137cb8257e1"
dependencies = [
"byteorder",
"bytes",
"data-encoding",
"http 1.1.0",
"httparse",
"log",
"rand 0.8.5",
"sha1",
"thiserror",
"url",
"utf-8",
]
[[package]]
name = "twox-hash"
version = "1.6.3"
@@ -6794,12 +6804,11 @@ dependencies = [
[[package]]
name = "uring-common"
version = "0.1.0"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=arpad/statx_sys#ca8446b8edb5e0aef88520f2fc209a13a834fd25"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
dependencies = [
"bytes",
"io-uring",
"libc",
"statx-sys",
]
[[package]]
@@ -7468,7 +7477,6 @@ dependencies = [
name = "workspace_hack"
version = "0.1.0"
dependencies = [
"ahash",
"anyhow",
"aws-config",
"aws-runtime",
@@ -7494,7 +7502,7 @@ dependencies = [
"futures-sink",
"futures-util",
"getrandom 0.2.11",
"hashbrown 0.14.5",
"hashbrown 0.14.0",
"hex",
"hmac",
"hyper 0.14.26",
@@ -7627,9 +7635,9 @@ dependencies = [
[[package]]
name = "zeroize"
version = "1.7.0"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d"
checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9"
dependencies = [
"zeroize_derive",
]

View File

@@ -41,7 +41,6 @@ license = "Apache-2.0"
## All dependency versions, used in the project
[workspace.dependencies]
ahash = "0.8"
anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
@@ -75,7 +74,6 @@ clap = { version = "4.0", features = ["derive"] }
comfy-table = "6.1"
const_format = "0.2"
crc32c = "0.6"
crossbeam-deque = "0.8.5"
crossbeam-utils = "0.8.5"
dashmap = { version = "5.5.0", features = ["raw-api"] }
either = "1.8"
@@ -83,14 +81,13 @@ enum-map = "2.4.2"
enumset = "1.0.12"
fail = "0.5.0"
fallible-iterator = "0.2"
framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" }
fs2 = "0.4.3"
futures = "0.3"
futures-core = "0.3"
futures-util = "0.3"
git-version = "0.3"
hashbrown = "0.14"
hashlink = "0.9.1"
hashbrown = "0.13"
hashlink = "0.8.4"
hdrhistogram = "7.5.2"
hex = "0.4"
hex-literal = "0.4"
@@ -101,7 +98,7 @@ http-types = { version = "2", default-features = false }
humantime = "2.1"
humantime-serde = "1.1.1"
hyper = "0.14"
tokio-tungstenite = "0.20.0"
hyper-tungstenite = "0.13.0"
indexmap = "2"
inotify = "0.10.2"
ipnet = "2.9.0"
@@ -124,8 +121,8 @@ opentelemetry = "0.20.0"
opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.12.0"
parking_lot = "0.12"
parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
parquet_derive = "51.0.0"
parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
parquet_derive = "49.0.0"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pin-project-lite = "0.2"
procfs = "0.14"
@@ -161,8 +158,8 @@ socket2 = "0.5"
strum = "0.24"
strum_macros = "0.24"
"subtle" = "2.5.0"
# Our PR https://github.com/nical/rust_debug/pull/4 has been merged but no new version released yet
svg_fmt = { git = "https://github.com/nical/rust_debug", rev = "28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" }
# https://github.com/nical/rust_debug/pull/4
svg_fmt = { git = "https://github.com/neondatabase/fork--nical--rust_debug", branch = "neon" }
sync_wrapper = "0.1.2"
tar = "0.4"
task-local-extensions = "0.1.4"
@@ -171,7 +168,7 @@ thiserror = "1.0"
tikv-jemallocator = "0.5"
tikv-jemalloc-ctl = "0.5"
tokio = { version = "1.17", features = ["macros"] }
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "arpad/statx_sys" }
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
tokio-io-timeout = "1.2.0"
tokio-postgres-rustls = "0.11.0"
tokio-rustls = "0.25"
@@ -191,7 +188,7 @@ url = "2.2"
urlencoding = "2.1"
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
walkdir = "2.3.2"
webpki-roots = "0.26"
webpki-roots = "0.25"
x509-parser = "0.15"
## TODO replace this with tracing
@@ -200,6 +197,7 @@ log = "0.4"
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
@@ -240,12 +238,13 @@ tonic-build = "0.9"
[patch.crates-io]
# Needed to get `tokio-postgres-rustls` to depend on our fork.
# This is only needed for proxy's tests.
# TODO: we should probably fork `tokio-postgres-rustls` instead.
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
# bug fixes for UUID
parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" }
parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" }
parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
parquet_derive = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
################# Binary contents sections

View File

@@ -241,17 +241,11 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
FROM build-deps AS vector-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY patches/pgvector.patch /pgvector.patch
# By default, pgvector Makefile uses `-march=native`. We don't want that,
# because we build the images on different machines than where we run them.
# Pass OPTFLAGS="" to remove it.
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \
echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
patch -p1 < /pgvector.patch && \
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
#########################################################################################

View File

@@ -1,6 +1,4 @@
[![Neon](https://github.com/neondatabase/neon/assets/11527560/f15a17f0-836e-40c5-b35d-030606a6b660)](https://neon.tech)
[![Neon](https://user-images.githubusercontent.com/13738772/236813940-dcfdcb5b-69d3-449b-a686-013febe834d4.png)](https://neon.tech)
# Neon

View File

@@ -27,12 +27,10 @@ reqwest = { workspace = true, features = ["json"] }
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
tokio-postgres.workspace = true
tokio-util.workspace = true
tokio-stream.workspace = true
tracing.workspace = true
tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
tracing-utils.workspace = true
thiserror.workspace = true
url.workspace = true
compute_api.workspace = true

View File

@@ -1,116 +0,0 @@
use compute_api::{
responses::CatalogObjects,
spec::{Database, Role},
};
use futures::Stream;
use postgres::{Client, NoTls};
use std::{path::Path, process::Stdio, result::Result, sync::Arc};
use tokio::{
io::{AsyncBufReadExt, BufReader},
process::Command,
task,
};
use tokio_stream::{self as stream, StreamExt};
use tokio_util::codec::{BytesCodec, FramedRead};
use tracing::warn;
use crate::{
compute::ComputeNode,
pg_helpers::{get_existing_dbs, get_existing_roles},
};
/// Fetches the roles and databases that currently exist in the compute's Postgres.
///
/// The work uses the synchronous `postgres` client, so it is run on a blocking
/// task via [`task::spawn_blocking`] and awaited from here.
///
/// # Errors
/// Returns an error if connecting or any catalog query fails, or if the blocking
/// task cannot be joined.
pub async fn get_dbs_and_roles(compute: &Arc<ComputeNode>) -> anyhow::Result<CatalogObjects> {
    let connstr = compute.connstr.clone();

    let fetch_catalog = move || -> anyhow::Result<CatalogObjects> {
        let mut client = Client::connect(connstr.as_str(), NoTls)?;

        // The transaction only lives for the role lookup; it is dropped at the end
        // of this block, before `client` is borrowed again for the database lookup.
        let roles: Vec<Role> = {
            let mut xact = client.transaction()?;
            get_existing_roles(&mut xact)?
        };

        let existing_dbs = get_existing_dbs(&mut client)?;
        let databases: Vec<Database> = existing_dbs.values().cloned().collect();

        Ok(CatalogObjects { roles, databases })
    };

    task::spawn_blocking(fetch_catalog).await?
}
/// Errors returned by `get_database_schema`.
#[derive(Debug, thiserror::Error)]
pub enum SchemaDumpError {
/// pg_dump reported on stderr that the requested database does not exist.
#[error("Database does not exist.")]
DatabaseDoesNotExist,
/// Any I/O failure while spawning pg_dump or reading its output.
#[error("Failed to execute pg_dump.")]
IO(#[from] std::io::Error),
}
/// Dumps the schema of database `dbname` using the `pg_dump` utility.
///
/// The output is streamed back to the caller and is meant to be forwarded over HTTP.
///
/// Before returning the stream, this waits for pg_dump to produce its first chunk of
/// output. If there is none, the first line of stderr is inspected: when it reports
/// that the database does not exist, [`SchemaDumpError::DatabaseDoesNotExist`] is
/// returned; otherwise a generic I/O error is returned and the rest of stderr is
/// logged in the background.
///
/// The child process is spawned with tokio's `kill_on_drop`, and its handle is kept
/// inside the returned stream. pg_dump therefore keeps running while the stream is
/// being consumed and is killed when the caller drops the stream.
///
/// # Errors
/// - [`SchemaDumpError::DatabaseDoesNotExist`] if pg_dump reports a missing database.
/// - [`SchemaDumpError::IO`] if pg_dump cannot be spawned, its pipes cannot be
///   captured, or it produces no output.
///
/// # Panics
/// Panics if `compute.pgbin` has no parent directory.
pub async fn get_database_schema(
    compute: &Arc<ComputeNode>,
    dbname: &str,
) -> Result<impl Stream<Item = Result<bytes::Bytes, std::io::Error>>, SchemaDumpError> {
    // pg_dump is looked up in the same directory as the configured postgres binary.
    let pgbin = &compute.pgbin;
    let basepath = Path::new(pgbin).parent().unwrap();
    let pgdump = basepath.join("pg_dump");

    // Reuse the compute's connection string, pointing it at the requested database.
    let mut connstr = compute.connstr.clone();
    connstr.set_path(dbname);

    let mut cmd = Command::new(pgdump)
        .arg("--schema-only")
        .arg(connstr.as_str())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .kill_on_drop(true)
        .spawn()?;

    let stdout = cmd.stdout.take().ok_or_else(|| {
        std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stdout.")
    })?;

    let stderr = cmd.stderr.take().ok_or_else(|| {
        std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stderr.")
    })?;

    let mut stdout_reader = FramedRead::new(stdout, BytesCodec::new());
    let stderr_reader = BufReader::new(stderr);

    let first_chunk = match stdout_reader.next().await {
        Some(Ok(bytes)) if !bytes.is_empty() => bytes,
        Some(Err(e)) => {
            return Err(SchemaDumpError::IO(e));
        }
        // End of stream or an empty chunk: pg_dump produced nothing, so look at stderr.
        _ => {
            let mut lines = stderr_reader.lines();
            if let Some(line) = lines.next_line().await? {
                if line.contains(&format!("FATAL: database \"{}\" does not exist", dbname)) {
                    return Err(SchemaDumpError::DatabaseDoesNotExist);
                }
                warn!("pg_dump stderr: {}", line)
            }
            // Log whatever else pg_dump wrote to stderr without blocking the caller.
            tokio::spawn(async move {
                while let Ok(Some(line)) = lines.next_line().await {
                    warn!("pg_dump stderr: {}", line)
                }
            });

            return Err(SchemaDumpError::IO(std::io::Error::new(
                std::io::ErrorKind::Other,
                "failed to start pg_dump",
            )));
        }
    };
    let initial_stream = stream::once(Ok(first_chunk.freeze()));

    // Consume stderr and log warnings
    tokio::spawn(async move {
        let mut lines = stderr_reader.lines();
        while let Ok(Some(line)) = lines.next_line().await {
            warn!("pg_dump stderr: {}", line)
        }
    });

    // Move the child handle into the stream. If `cmd` were dropped when this function
    // returns, `kill_on_drop` would kill pg_dump right away and the dump would be cut
    // short after whatever was already buffered in the pipe.
    Ok(initial_stream.chain(stdout_reader.map(move |res| {
        let _keep_child_alive = &cmd;
        res.map(|b| b.freeze())
    })))
}

View File

@@ -5,21 +5,17 @@ use std::net::SocketAddr;
use std::sync::Arc;
use std::thread;
use crate::catalog::SchemaDumpError;
use crate::catalog::{get_database_schema, get_dbs_and_roles};
use crate::compute::forward_termination_signal;
use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
use compute_api::requests::ConfigurationRequest;
use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
use anyhow::Result;
use hyper::header::CONTENT_TYPE;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Method, Request, Response, Server, StatusCode};
use tokio::task;
use tracing::{error, info, warn};
use tracing_utils::http::OtelName;
use utils::http::request::must_get_query_param;
fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
ComputeStatusResponse {
@@ -137,34 +133,6 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
}
}
(&Method::GET, "/dbs_and_roles") => {
info!("serving /dbs_and_roles GET request",);
match get_dbs_and_roles(compute).await {
Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
Err(_) => {
render_json_error("can't get dbs and roles", StatusCode::INTERNAL_SERVER_ERROR)
}
}
}
(&Method::GET, "/database_schema") => {
let database = match must_get_query_param(&req, "database") {
Err(e) => return e.into_response(),
Ok(database) => database,
};
info!("serving /database_schema GET request with database: {database}",);
match get_database_schema(compute, &database).await {
Ok(res) => render_plain(Body::wrap_stream(res)),
Err(SchemaDumpError::DatabaseDoesNotExist) => {
render_json_error("database does not exist", StatusCode::NOT_FOUND)
}
Err(e) => {
error!("can't get schema dump: {}", e);
render_json_error("can't get schema dump", StatusCode::INTERNAL_SERVER_ERROR)
}
}
}
// download extension files from remote extension storage on demand
(&Method::POST, route) if route.starts_with("/extension_server/") => {
info!("serving {:?} POST request", route);
@@ -335,25 +303,10 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
};
Response::builder()
.status(status)
.header(CONTENT_TYPE, "application/json")
.body(Body::from(serde_json::to_string(&error).unwrap()))
.unwrap()
}
/// Wraps `body` in a response whose `Content-Type` header is `application/json`.
fn render_json(body: Body) -> Response<Body> {
    let json_builder = Response::builder().header(CONTENT_TYPE, "application/json");
    json_builder.body(body).unwrap()
}
/// Wraps `body` in a response whose `Content-Type` header is `text/plain`.
fn render_plain(body: Body) -> Response<Body> {
    let plain_builder = Response::builder().header(CONTENT_TYPE, "text/plain");
    plain_builder.body(body).unwrap()
}
async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
{
let mut state = compute.state.lock().unwrap();

View File

@@ -68,51 +68,6 @@ paths:
schema:
$ref: "#/components/schemas/Info"
/dbs_and_roles:
get:
tags:
- Info
summary: Get databases and roles in the catalog.
description: ""
operationId: getDbsAndRoles
responses:
200:
description: Compute schema objects
content:
application/json:
schema:
$ref: "#/components/schemas/DbsAndRoles"
/database_schema:
get:
tags:
- Info
summary: Get schema dump
parameters:
- name: database
in: query
description: Database name to dump.
required: true
schema:
type: string
example: "postgres"
description: Get schema dump in SQL format.
operationId: getDatabaseSchema
responses:
200:
description: Schema dump
content:
text/plain:
schema:
type: string
description: Schema dump in SQL format.
404:
description: Non existing database.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
/check_writability:
post:
tags:
@@ -274,73 +229,6 @@ components:
num_cpus:
type: integer
DbsAndRoles:
type: object
description: Databases and Roles
required:
- roles
- databases
properties:
roles:
type: array
items:
$ref: "#/components/schemas/Role"
databases:
type: array
items:
$ref: "#/components/schemas/Database"
Database:
type: object
description: Database
required:
- name
- owner
- restrict_conn
- invalid
properties:
name:
type: string
owner:
type: string
options:
type: array
items:
$ref: "#/components/schemas/GenericOption"
restrict_conn:
type: boolean
invalid:
type: boolean
Role:
type: object
description: Role
required:
- name
properties:
name:
type: string
encrypted_password:
type: string
options:
type: array
items:
$ref: "#/components/schemas/GenericOption"
GenericOption:
type: object
description: Schema Generic option
required:
- name
- vartype
properties:
name:
type: string
value:
type: string
vartype:
type: string
ComputeState:
type: object
required:

View File

@@ -8,7 +8,6 @@ pub mod configurator;
pub mod http;
#[macro_use]
pub mod logger;
pub mod catalog;
pub mod compute;
pub mod extension_server;
pub mod monitor;

View File

@@ -1,5 +1,3 @@
use std::path::Path;
use anyhow::{anyhow, Context};
use tracing::warn;
@@ -19,24 +17,17 @@ pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> {
.arg(size_bytes.to_string())
.spawn();
if matches!(&child_result, Err(e) if e.kind() == std::io::ErrorKind::NotFound) {
warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
return Ok(());
}
child_result
.context("spawn() failed")
.and_then(|mut child| child.wait().context("wait() failed"))
.and_then(|status| match status.success() {
true => Ok(()),
false => {
// The command failed. Maybe it was because the resize-swap file doesn't exist?
// The --once flag causes it to delete itself on success so we don't disable swap
// while postgres is running; maybe this is fine.
match Path::new(RESIZE_SWAP_BIN).try_exists() {
Err(_) | Ok(true) => Err(anyhow!("process exited with {status}")),
// The path doesn't exist; we're actually ok
Ok(false) => {
warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
Ok(())
},
}
}
false => Err(anyhow!("process exited with {status}")),
})
// wrap any prior error with the overall context that we couldn't run the command
.with_context(|| {

View File

@@ -152,9 +152,6 @@ pub struct NeonStorageControllerConf {
/// Heartbeat timeout before marking a node offline
#[serde(with = "humantime_serde")]
pub max_unavailable: Duration,
/// Threshold for auto-splitting a tenant into shards
pub split_threshold: Option<u64>,
}
impl NeonStorageControllerConf {
@@ -167,7 +164,6 @@ impl Default for NeonStorageControllerConf {
fn default() -> Self {
Self {
max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL,
split_threshold: None,
}
}
}

View File

@@ -243,13 +243,9 @@ impl StorageController {
anyhow::bail!("initdb failed with status {status}");
}
// Write a minimal config file:
// - Specify the port, since this is chosen dynamically
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
// the storage controller we don't want a slow local disk to interfere with that.
tokio::fs::write(
&pg_data_path.join("postgresql.conf"),
format!("port = {}\nfsync=off\n", self.postgres_port),
format!("port = {}", self.postgres_port),
)
.await?;
};
@@ -309,10 +305,6 @@ impl StorageController {
));
}
if let Some(split_threshold) = self.config.split_threshold.as_ref() {
args.push(format!("--split-threshold={split_threshold}"))
}
background_process::start_process(
COMMAND,
&self.env.base_data_dir,

View File

@@ -99,10 +99,6 @@ name = "async-executor"
[[bans.deny]]
name = "smol"
[[bans.deny]]
# We want to use rustls instead of native-tls.
name = "postgres-native-tls"
# This section is considered when running `cargo deny check sources`.
# More documentation about the 'sources' section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html

View File

@@ -1,4 +1,4 @@
ARG REPOSITORY=neondatabase
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
ARG COMPUTE_IMAGE=compute-node-v14
ARG TAG=latest

View File

@@ -8,6 +8,8 @@
# Their defaults point at DockerHub `neondatabase/neon:latest` image.`,
# to verify custom image builds (e.g pre-published ones).
# XXX: Currently does not work on M1 Macs: the Docker images are compiled for x86_64 only, and the M1 Docker emulation layer has no seccomp support.
set -eux -o pipefail
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"

View File

@@ -3,7 +3,7 @@
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize, Serializer};
use crate::spec::{ComputeSpec, Database, Role};
use crate::spec::ComputeSpec;
#[derive(Serialize, Debug, Deserialize)]
pub struct GenericAPIError {
@@ -113,12 +113,6 @@ pub struct ComputeMetrics {
pub total_ext_download_size: u64,
}
/// Roles and databases of a compute, serialized together as one response body.
#[derive(Clone, Debug, Default, Serialize)]
pub struct CatalogObjects {
/// Roles found in the catalog.
pub roles: Vec<Role>,
/// Databases found in the catalog.
pub databases: Vec<Database>,
}
/// Response of the `/computes/{compute_id}/spec` control-plane API.
/// This is not actually a compute API response, so consider moving
/// to a different place.

View File

@@ -307,7 +307,7 @@ impl KeySpace {
}
/// Merge another keyspace into the current one.
/// Note: the keyspaces must not overlap (enforced via assertions). To merge overlapping key ranges, use `KeySpaceRandomAccum`.
/// Note: the keyspaces must not overlap (enforced via assertions)
pub fn merge(&mut self, other: &KeySpace) {
let all_ranges = self
.ranges

View File

@@ -9,7 +9,7 @@ use std::{
collections::HashMap,
io::{BufRead, Read},
num::{NonZeroU64, NonZeroUsize},
sync::atomic::AtomicUsize,
str::FromStr,
time::{Duration, SystemTime},
};
@@ -161,22 +161,6 @@ impl std::fmt::Debug for TenantState {
}
}
/// A temporary lease to a specific lsn inside a timeline.
/// Access to the lsn is guaranteed by the pageserver until the expiration indicated by `valid_until`.
#[serde_as]
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct LsnLease {
/// Expiration time of the lease; (de)serialized through `SystemTimeAsRfc3339Millis`.
#[serde_as(as = "SystemTimeAsRfc3339Millis")]
pub valid_until: SystemTime,
}
// Serde adapter for `SystemTime`: values are written with
// `humantime::format_rfc3339_millis` and read back with `humantime::parse_rfc3339`.
serde_with::serde_conv!(
SystemTimeAsRfc3339Millis,
SystemTime,
|time: &SystemTime| humantime::format_rfc3339_millis(*time).to_string(),
|value: String| -> Result<_, humantime::TimestampError> { humantime::parse_rfc3339(&value) }
);
/// The only [`TenantState`] variants we could be `TenantState::Activating` from.
#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum ActivatingFrom {
@@ -305,7 +289,7 @@ pub struct TenantConfig {
pub compaction_period: Option<String>,
pub compaction_threshold: Option<usize>,
// defer parsing compaction_algorithm, like eviction_policy
pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
pub compaction_algorithm: Option<CompactionAlgorithm>,
pub gc_horizon: Option<u64>,
pub gc_period: Option<String>,
pub image_creation_threshold: Option<usize>,
@@ -324,100 +308,28 @@ pub struct TenantConfig {
pub switch_aux_file_policy: Option<AuxFilePolicy>,
}
/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy`
/// tenant config. When the first aux file written, the policy will be persisted in the
/// `index_part.json` file and has a limited migration path.
///
/// Currently, we only allow the following migration path:
///
/// Unset -> V1
/// -> V2
/// -> CrossValidation -> V2
#[derive(
Eq,
PartialEq,
Debug,
Copy,
Clone,
strum_macros::EnumString,
strum_macros::Display,
serde_with::DeserializeFromStr,
serde_with::SerializeDisplay,
)]
#[strum(serialize_all = "kebab-case")]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum AuxFilePolicy {
/// V1 aux file policy: store everything in AUX_FILE_KEY
#[strum(ascii_case_insensitive)]
V1,
/// V2 aux file policy: store in the AUX_FILE keyspace
#[strum(ascii_case_insensitive)]
V2,
/// Cross validation runs both formats on the write path and does validation
/// on the read path.
#[strum(ascii_case_insensitive)]
CrossValidation,
}
impl AuxFilePolicy {
pub fn is_valid_migration_path(from: Option<Self>, to: Self) -> bool {
matches!(
(from, to),
(None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2)
)
}
impl FromStr for AuxFilePolicy {
type Err = anyhow::Error;
/// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
pub fn default_tenant_config() -> Self {
Self::V1
}
}
/// The aux file policy memory flag. Users can store `Option<AuxFilePolicy>` into this atomic flag. 0 == unspecified.
pub struct AtomicAuxFilePolicy(AtomicUsize);
impl AtomicAuxFilePolicy {
pub fn new(policy: Option<AuxFilePolicy>) -> Self {
Self(AtomicUsize::new(
policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
))
}
pub fn load(&self) -> Option<AuxFilePolicy> {
match self.0.load(std::sync::atomic::Ordering::Acquire) {
0 => None,
other => Some(AuxFilePolicy::from_usize(other)),
fn from_str(s: &str) -> Result<Self, Self::Err> {
let s = s.to_lowercase();
if s == "v1" {
Ok(Self::V1)
} else if s == "v2" {
Ok(Self::V2)
} else if s == "crossvalidation" || s == "cross_validation" {
Ok(Self::CrossValidation)
} else {
anyhow::bail!("cannot parse {} to aux file policy", s)
}
}
pub fn store(&self, policy: Option<AuxFilePolicy>) {
self.0.store(
policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
std::sync::atomic::Ordering::Release,
);
}
}
impl AuxFilePolicy {
    /// Encodes the policy as a non-zero integer:
    /// `V1` = 1, `CrossValidation` = 2, `V2` = 3.
    pub fn to_usize(self) -> usize {
        match self {
            Self::V2 => 3,
            Self::CrossValidation => 2,
            Self::V1 => 1,
        }
    }

    /// Decodes an integer produced by [`Self::to_usize`].
    ///
    /// Returns `None` for any value that does not correspond to a policy.
    pub fn try_from_usize(this: usize) -> Option<Self> {
        [Self::V1, Self::CrossValidation, Self::V2]
            .iter()
            .copied()
            .find(|policy| policy.to_usize() == this)
    }

    /// Decodes an integer produced by [`Self::to_usize`].
    ///
    /// # Panics
    /// Panics if `this` does not correspond to a policy.
    pub fn from_usize(this: usize) -> Self {
        let decoded = Self::try_from_usize(this);
        decoded.unwrap()
    }
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -438,28 +350,13 @@ impl EvictionPolicy {
}
}
#[derive(
Eq,
PartialEq,
Debug,
Copy,
Clone,
strum_macros::EnumString,
strum_macros::Display,
serde_with::DeserializeFromStr,
serde_with::SerializeDisplay,
)]
#[strum(serialize_all = "kebab-case")]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum CompactionAlgorithm {
Legacy,
Tiered,
}
#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
pub struct CompactionAlgorithmSettings {
pub kind: CompactionAlgorithm,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct EvictionPolicyLayerAccessThreshold {
#[serde(with = "humantime_serde")]
@@ -707,9 +604,6 @@ pub struct TimelineInfo {
pub state: TimelineState,
pub walreceiver_status: String,
/// The last aux file policy being used on this timeline
pub last_aux_file_policy: Option<AuxFilePolicy>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -816,8 +710,6 @@ pub enum HistoricLayerInfo {
lsn_end: Lsn,
remote: bool,
access_stats: LayerAccessStats,
l0: bool,
},
Image {
layer_file_name: String,
@@ -853,16 +745,6 @@ impl HistoricLayerInfo {
};
*field = value;
}
/// Returns the `layer_file_size` field of this layer, for either variant.
pub fn layer_file_size(&self) -> u64 {
    match self {
        HistoricLayerInfo::Delta {
            layer_file_size, ..
        }
        | HistoricLayerInfo::Image {
            layer_file_size, ..
        } => *layer_file_size,
    }
}
}
#[derive(Debug, Serialize, Deserialize)]
@@ -870,16 +752,6 @@ pub struct DownloadRemoteLayersTaskSpawnRequest {
pub max_concurrent_downloads: NonZeroUsize,
}
/// Request body carrying a set of aux files to ingest.
// NOTE(review): presumably maps aux file path to file content; confirm against the handler.
#[derive(Debug, Serialize, Deserialize)]
pub struct IngestAuxFilesRequest {
pub aux_files: HashMap<String, String>,
}
/// Request body for listing aux files.
// NOTE(review): presumably `lsn` is the LSN at which the listing is taken; confirm against the handler.
#[derive(Debug, Serialize, Deserialize)]
pub struct ListAuxFilesRequest {
pub lsn: Lsn,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct DownloadRemoteLayersTaskInfo {
pub task_id: String,
@@ -904,6 +776,9 @@ pub struct TimelineGcRequest {
/// Status of a WAL redo process as reported through the API.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalRedoManagerProcessStatus {
/// Process id of the WAL redo process.
pub pid: u32,
/// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`.
/// `ProcessKind` is transitory, so it has no enum representation in `pageserver_api`.
pub kind: Cow<'static, str>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -942,55 +817,6 @@ pub struct TenantScanRemoteStorageResponse {
pub shards: Vec<TenantScanRemoteStorageShard>,
}
/// Metric by which tenant shards are ordered in a `TopTenantShardsRequest`.
///
/// The default is [`TenantSorting::ResidentSize`].
#[derive(Serialize, Deserialize, Debug, Clone, Default)]
#[serde(rename_all = "snake_case")]
pub enum TenantSorting {
    /// Order by resident size (serialized as `resident_size`).
    #[default]
    ResidentSize,
    /// Order by maximum logical size (serialized as `max_logical_size`).
    MaxLogicalSize,
}
/// Request for the top tenant shards according to some ordering metric.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct TopTenantShardsRequest {
/// How would you like to sort the tenants?
pub order_by: TenantSorting,
/// How many results?
pub limit: usize,
/// Omit tenants with more than this many shards (e.g. if this is the max number of shards
/// that the caller would ever split to)
pub where_shards_lt: Option<ShardCount>,
/// Omit tenants where the ordering metric is less than this (this is an optimization to
/// let us quickly exclude numerous tiny shards)
pub where_gt: Option<u64>,
}
/// One entry of a `TopTenantShardsResponse`: size figures for a single tenant shard.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
pub struct TopTenantShardItem {
/// The tenant shard these figures belong to.
pub id: TenantShardId,
/// Total size of layers on local disk for all timelines in this tenant
pub resident_size: u64,
/// Total size of layers in remote storage for all timelines in this tenant
pub physical_size: u64,
/// The largest logical size of a timeline within this tenant
pub max_logical_size: u64,
}
/// Response body listing tenant shards; defaults to an empty list.
#[derive(Serialize, Deserialize, Debug, Default)]
pub struct TopTenantShardsResponse {
/// The selected tenant shards.
pub shards: Vec<TopTenantShardItem>,
}
pub mod virtual_file {
#[derive(
Copy,
@@ -1416,7 +1242,6 @@ impl PagestreamBeMessage {
#[cfg(test)]
mod tests {
use serde_json::json;
use std::str::FromStr;
use super::*;
@@ -1624,69 +1449,4 @@ mod tests {
assert_eq!(actual, expected, "example on {line}");
}
}
// Exhaustively checks `AuxFilePolicy::is_valid_migration_path` for every
// (from, to) combination.
#[test]
fn test_aux_file_migration_path() {
// Starting from an unset policy, any target policy is allowed.
assert!(AuxFilePolicy::is_valid_migration_path(
None,
AuxFilePolicy::V1
));
assert!(AuxFilePolicy::is_valid_migration_path(
None,
AuxFilePolicy::V2
));
assert!(AuxFilePolicy::is_valid_migration_path(
None,
AuxFilePolicy::CrossValidation
));
// Self-migration is not a valid migration path, and the caller should handle it by itself.
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V1),
AuxFilePolicy::V1
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V2),
AuxFilePolicy::V2
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::CrossValidation),
AuxFilePolicy::CrossValidation
));
// Migrations not allowed
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::CrossValidation),
AuxFilePolicy::V1
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V1),
AuxFilePolicy::V2
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V2),
AuxFilePolicy::V1
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V2),
AuxFilePolicy::CrossValidation
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V1),
AuxFilePolicy::CrossValidation
));
// Migrations allowed
assert!(AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::CrossValidation),
AuxFilePolicy::V2
));
}
// Checks that `AuxFilePolicy::from_str` accepts "V2" in either case and the
// kebab-case spelling "cross-validation".
#[test]
fn test_aux_parse() {
assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2);
assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2);
assert_eq!(
AuxFilePolicy::from_str("cross-validation").unwrap(),
AuxFilePolicy::CrossValidation
);
}
}

View File

@@ -125,7 +125,7 @@ impl ShardCount {
/// `v` may be zero, or the number of shards in the tenant. `v` is what
/// [`Self::literal`] would return.
pub const fn new(val: u8) -> Self {
pub fn new(val: u8) -> Self {
Self(val)
}
}
@@ -559,14 +559,6 @@ impl ShardIdentity {
}
}
/// Builds the `ShardIndex` that pairs this identity's shard number with its shard count.
pub fn shard_index(&self) -> ShardIndex {
    let (shard_number, shard_count) = (self.number, self.count);
    ShardIndex {
        shard_number,
        shard_count,
    }
}
pub fn shard_slug(&self) -> String {
if self.count > ShardCount(0) {
format!("-{:02x}{:02x}", self.number.0, self.count.0)

View File

@@ -820,11 +820,10 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
Ok(ProcessMsgResult::Continue)
}
/// - Log as info/error result of handling COPY stream and send back
/// ErrorResponse if that makes sense.
/// - Shutdown the stream if we got Terminate.
/// - Then close the connection because we don't handle exiting from COPY
/// stream normally.
/// Log as info/error result of handling COPY stream and send back
/// ErrorResponse if that makes sense. Shutdown the stream if we got
/// Terminate. TODO: transition into waiting for Sync msg if we initiate the
/// close.
pub async fn handle_copy_stream_end(&mut self, end: CopyStreamHandlerEnd) {
use CopyStreamHandlerEnd::*;
@@ -850,6 +849,10 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
}
}
if let Terminate = &end {
self.state = ProtoState::Closed;
}
let err_to_send_and_errcode = match &end {
ServerInitiated(_) => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)),
Other(_) => Some((format!("{end:#}"), SQLSTATE_INTERNAL_ERROR)),
@@ -879,12 +882,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
error!("failed to send ErrorResponse: {}", ee);
}
}
// Proper COPY stream finishing to continue using the connection is not
// implemented at the server side (we don't need it so far). To prevent
// further usages of the connection, close it.
self.framed.shutdown().await.ok();
self.state = ProtoState::Closed;
}
}

View File

@@ -178,13 +178,6 @@ impl PgConnectionConfig {
}
}
impl fmt::Display for PgConnectionConfig {
    /// Formats the config as `postgresql://<host>:<port>`.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Only host and port are written; the password is deliberately left out
        // of this display string.
        let Self { host, port, .. } = self;
        f.write_fmt(format_args!("postgresql://{}:{}", host, port))
    }
}
impl fmt::Debug for PgConnectionConfig {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
// We want `password: Some(REDACTED-STRING)`, not `password: Some("REDACTED-STRING")`

View File

@@ -359,7 +359,7 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
// Is there enough space on the page for another logical message and an
// XLOG_SWITCH? If not, start over.
let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64;
if page_remain < base_size + XLOG_SIZE_OF_XLOG_RECORD as u64 {
if page_remain < base_size - XLOG_SIZE_OF_XLOG_RECORD as u64 {
continue;
}
@@ -373,29 +373,31 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
"SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
&[&(repeats as i32)],
)?;
info!(
"current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
client.pg_current_wal_insert_lsn()?,
XLOG_SIZE_OF_XLOG_RECORD
);
// Emit the XLOG_SWITCH
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
let xlog_switch_record_end: PgLsn =
client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
if u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
!= XLOG_SIZE_OF_XLOG_SHORT_PHD
{
warn!(
"XLOG_SWITCH message ended not on page boundary: {}, offset = {}, repeating",
xlog_switch_record_end,
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
);
continue;
}
return Ok(vec![before_xlog_switch, xlog_switch_record_end]);
break;
}
info!(
"current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
client.pg_current_wal_insert_lsn()?,
XLOG_SIZE_OF_XLOG_RECORD
);
// Emit the XLOG_SWITCH
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
let next_segment = PgLsn::from(0x0200_0000);
ensure!(
xlog_switch_record_end < next_segment,
"XLOG_SWITCH record ended on or after the expected segment boundary: {} > {}",
xlog_switch_record_end,
next_segment
);
ensure!(
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
"XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
xlog_switch_record_end,
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
);
Ok(vec![before_xlog_switch, xlog_switch_record_end])
}
}

View File

@@ -29,6 +29,7 @@ use http_types::{StatusCode, Url};
use tokio_util::sync::CancellationToken;
use tracing::debug;
use crate::RemoteStorageActivity;
use crate::{
error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download,
DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
@@ -525,6 +526,10 @@ impl RemoteStorage for AzureBlobStorage {
// https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
Err(TimeTravelError::Unimplemented)
}
/// Reports how busy this backend is by delegating to its concurrency limiter.
fn activity(&self) -> RemoteStorageActivity {
self.concurrency_limiter.activity()
}
}
pin_project_lite::pin_project! {

View File

@@ -263,6 +263,17 @@ pub trait RemoteStorage: Send + Sync + 'static {
done_if_after: SystemTime,
cancel: &CancellationToken,
) -> Result<(), TimeTravelError>;
/// Query how busy we currently are: may be used by callers which wish to politely
/// back off if there are already a lot of operations underway.
fn activity(&self) -> RemoteStorageActivity;
}
/// Point-in-time snapshot of how busy a remote storage backend is.
///
/// Returned by `activity()`; callers may compare the `*_available` values against the
/// matching `*_total` values to decide whether to back off.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct RemoteStorageActivity {
    /// Number of read units that are currently free.
    pub read_available: usize,
    /// Total number of read units the backend was configured with.
    pub read_total: usize,
    /// Number of write units that are currently free.
    pub write_available: usize,
    /// Total number of write units the backend was configured with.
    pub write_total: usize,
}
/// DownloadStream is sensitive to the timeout and cancellation used with the original
@@ -444,6 +455,15 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
}
}
}
/// Reports how busy the wrapped storage backend currently is.
///
/// Every variant simply forwards to the `activity()` of the storage it wraps.
pub fn activity(&self) -> RemoteStorageActivity {
match self {
Self::LocalFs(s) => s.activity(),
Self::AwsS3(s) => s.activity(),
Self::AzureBlob(s) => s.activity(),
Self::Unreliable(s) => s.activity(),
}
}
}
impl GenericRemoteStorage {
@@ -774,6 +794,9 @@ struct ConcurrencyLimiter {
// This helps to ensure we don't exceed the thresholds.
write: Arc<Semaphore>,
read: Arc<Semaphore>,
write_total: usize,
read_total: usize,
}
impl ConcurrencyLimiter {
@@ -802,10 +825,21 @@ impl ConcurrencyLimiter {
Arc::clone(self.for_kind(kind)).acquire_owned().await
}
/// Snapshot of the limiter: the permits currently free on the read and write
/// semaphores, together with the totals recorded at construction time.
fn activity(&self) -> RemoteStorageActivity {
    let read_available = self.read.available_permits();
    let write_available = self.write.available_permits();
    RemoteStorageActivity {
        read_available,
        read_total: self.read_total,
        write_available,
        write_total: self.write_total,
    }
}
/// Builds a limiter whose read and write semaphores each start with `limit` permits.
///
/// `limit` is also stored as the read/write totals so `activity()` can report them.
fn new(limit: usize) -> ConcurrencyLimiter {
    // Reads and writes get separate semaphores of the same size.
    let fresh_semaphore = || Arc::new(Semaphore::new(limit));
    Self {
        read: fresh_semaphore(),
        write: fresh_semaphore(),
        read_total: limit,
        write_total: limit,
    }
}
}

View File

@@ -23,8 +23,8 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken};
use utils::crashsafe::path_with_suffix_extension;
use crate::{
Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
REMOTE_STORAGE_PREFIX_SEPARATOR,
Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorageActivity,
TimeTravelError, TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR,
};
use super::{RemoteStorage, StorageMetadata};
@@ -605,6 +605,16 @@ impl RemoteStorage for LocalFs {
) -> Result<(), TimeTravelError> {
Err(TimeTravelError::Unimplemented)
}
/// Reports a fixed, fully-idle activity snapshot.
fn activity(&self) -> RemoteStorageActivity {
    // LocalFS has no concurrency limiting, so there is nothing real to measure:
    // pretend a fixed number of units exists and that all of them are free.
    const NOMINAL_UNITS: usize = 16;
    RemoteStorageActivity {
        read_available: NOMINAL_UNITS,
        read_total: NOMINAL_UNITS,
        write_available: NOMINAL_UNITS,
        write_total: NOMINAL_UNITS,
    }
}
}
fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {

View File

@@ -47,8 +47,8 @@ use utils::backoff;
use super::StorageMetadata;
use crate::{
error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError,
Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel,
MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
Listing, ListingMode, RemotePath, RemoteStorage, RemoteStorageActivity, S3Config,
TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
};
pub(super) mod metrics;
@@ -975,6 +975,10 @@ impl RemoteStorage for S3Bucket {
}
Ok(())
}
/// Reports how busy this bucket client is by delegating to its concurrency limiter.
fn activity(&self) -> RemoteStorageActivity {
self.concurrency_limiter.activity()
}
}
/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].

View File

@@ -12,7 +12,7 @@ use tokio_util::sync::CancellationToken;
use crate::{
Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
StorageMetadata, TimeTravelError,
RemoteStorageActivity, StorageMetadata, TimeTravelError,
};
pub struct UnreliableWrapper {
@@ -213,4 +213,8 @@ impl RemoteStorage for UnreliableWrapper {
.time_travel_recover(prefix, timestamp, done_if_after, cancel)
.await
}
/// Forwards to the wrapped storage's `activity()`; the wrapper adds nothing of its own here.
fn activity(&self) -> RemoteStorageActivity {
self.inner.activity()
}
}

View File

@@ -50,9 +50,6 @@ pub struct SkTimelineInfo {
pub safekeeper_connstr: Option<String>,
#[serde(default)]
pub http_connstr: Option<String>,
// Minimum of all active RO replicas flush LSN
#[serde(default = "lsn_invalid")]
pub standby_horizon: Lsn,
}
#[derive(Debug, Clone, Deserialize, Serialize)]

View File

@@ -135,8 +135,7 @@ impl Gate {
let started_at = std::time::Instant::now();
let mut do_close = std::pin::pin!(self.do_close());
// with 1s we rarely saw anything, let's try if we get more gate closing reasons with 100ms
let nag_after = Duration::from_millis(100);
let nag_after = Duration::from_secs(1);
let Err(_timeout) = tokio::time::timeout(nag_after, &mut do_close).await else {
return;

View File

@@ -496,9 +496,9 @@ mod tests {
// TODO: When updating Postgres versions, this test will cause
// problems. Postgres version in message needs updating.
//
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160002, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
vec![
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,

View File

@@ -30,27 +30,47 @@
//! 2024-04-15 on i3en.3xlarge
//!
//! ```text
//! short/1 time: [24.584 µs 24.737 µs 24.922 µs]
//! short/2 time: [33.479 µs 33.660 µs 33.888 µs]
//! short/4 time: [42.713 µs 43.046 µs 43.440 µs]
//! short/8 time: [71.814 µs 72.478 µs 73.240 µs]
//! short/16 time: [132.73 µs 134.45 µs 136.22 µs]
//! short/32 time: [258.31 µs 260.73 µs 263.27 µs]
//! short/64 time: [511.61 µs 514.44 µs 517.51 µs]
//! short/128 time: [992.64 µs 998.23 µs 1.0042 ms]
//! medium/1 time: [110.11 µs 110.50 µs 110.96 µs]
//! medium/2 time: [153.06 µs 153.85 µs 154.99 µs]
//! medium/4 time: [317.51 µs 319.92 µs 322.85 µs]
//! medium/8 time: [638.30 µs 644.68 µs 652.12 µs]
//! medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms]
//! medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms]
//! medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms]
//! medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms]
//! async-short/1 time: [24.584 µs 24.737 µs 24.922 µs]
//! async-short/2 time: [33.479 µs 33.660 µs 33.888 µs]
//! async-short/4 time: [42.713 µs 43.046 µs 43.440 µs]
//! async-short/8 time: [71.814 µs 72.478 µs 73.240 µs]
//! async-short/16 time: [132.73 µs 134.45 µs 136.22 µs]
//! async-short/32 time: [258.31 µs 260.73 µs 263.27 µs]
//! async-short/64 time: [511.61 µs 514.44 µs 517.51 µs]
//! async-short/128 time: [992.64 µs 998.23 µs 1.0042 ms]
//! async-medium/1 time: [110.11 µs 110.50 µs 110.96 µs]
//! async-medium/2 time: [153.06 µs 153.85 µs 154.99 µs]
//! async-medium/4 time: [317.51 µs 319.92 µs 322.85 µs]
//! async-medium/8 time: [638.30 µs 644.68 µs 652.12 µs]
//! async-medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms]
//! async-medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms]
//! async-medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms]
//! async-medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms]
//! sync-short/1 time: [25.503 µs 25.626 µs 25.771 µs]
//! sync-short/2 time: [30.850 µs 31.013 µs 31.208 µs]
//! sync-short/4 time: [45.543 µs 45.856 µs 46.193 µs]
//! sync-short/8 time: [84.114 µs 84.639 µs 85.220 µs]
//! sync-short/16 time: [185.22 µs 186.15 µs 187.13 µs]
//! sync-short/32 time: [377.43 µs 378.87 µs 380.46 µs]
//! sync-short/64 time: [756.49 µs 759.04 µs 761.70 µs]
//! sync-short/128 time: [1.4825 ms 1.4874 ms 1.4923 ms]
//! sync-medium/1 time: [105.66 µs 106.01 µs 106.43 µs]
//! sync-medium/2 time: [153.10 µs 153.84 µs 154.72 µs]
//! sync-medium/4 time: [327.13 µs 329.44 µs 332.27 µs]
//! sync-medium/8 time: [654.26 µs 658.73 µs 663.63 µs]
//! sync-medium/16 time: [1.2682 ms 1.2748 ms 1.2816 ms]
//! sync-medium/32 time: [2.4456 ms 2.4595 ms 2.4731 ms]
//! sync-medium/64 time: [4.6523 ms 4.6890 ms 4.7256 ms]
//! sync-medium/128 time: [8.7215 ms 8.8323 ms 8.9344 ms]
//! ```
use bytes::{Buf, Bytes};
use criterion::{BenchmarkId, Criterion};
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
use pageserver::{
config::PageServerConf,
walrecord::NeonWalRecord,
walredo::{PostgresRedoManager, ProcessKind},
};
use pageserver_api::{key::Key, shard::TenantShardId};
use std::{
sync::Arc,
@@ -60,32 +80,39 @@ use tokio::{sync::Barrier, task::JoinSet};
use utils::{id::TenantId, lsn::Lsn};
fn bench(c: &mut Criterion) {
{
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group("short");
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
let redo_work = Arc::new(Request::short_input());
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
},
);
for process_kind in &[ProcessKind::Async, ProcessKind::Sync] {
{
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group(format!("{process_kind}-short"));
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
let redo_work = Arc::new(Request::short_input());
b.iter_custom(|iters| {
bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
});
},
);
}
}
}
{
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group("medium");
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
let redo_work = Arc::new(Request::medium_input());
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
},
);
{
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group(format!("{process_kind}-medium"));
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
let redo_work = Arc::new(Request::medium_input());
b.iter_custom(|iters| {
bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
});
},
);
}
}
}
}
@@ -93,10 +120,16 @@ criterion::criterion_group!(benches, bench);
criterion::criterion_main!(benches);
// Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
fn bench_impl(
process_kind: ProcessKind,
redo_work: Arc<Request>,
n_redos: u64,
nclients: u64,
) -> Duration {
let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
conf.walredo_process_kind = process_kind;
let conf = Box::leak(Box::new(conf));
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
@@ -125,13 +158,27 @@ fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration
});
}
rt.block_on(async move {
let elapsed = rt.block_on(async move {
let mut total_wallclock_time = Duration::ZERO;
while let Some(res) = tasks.join_next().await {
total_wallclock_time += res.unwrap();
}
total_wallclock_time
})
});
// consistency check to ensure process kind setting worked
if nredos_per_client > 0 {
assert_eq!(
manager
.status()
.process
.map(|p| p.kind)
.expect("the benchmark work causes a walredo process to be spawned"),
std::borrow::Cow::Borrowed(process_kind.into())
);
}
elapsed
}
async fn client(

View File

@@ -1,12 +1,8 @@
use std::collections::HashMap;
use bytes::Bytes;
use pageserver_api::{models::*, shard::TenantShardId};
use reqwest::{IntoUrl, Method, StatusCode};
use utils::{
http::error::HttpErrorBody,
id::{TenantId, TimelineId},
lsn::Lsn,
};
pub mod util;
@@ -490,18 +486,6 @@ impl Client {
.map_err(Error::ReceiveBody)
}
/// POSTs `request` to `{mgmt_api_endpoint}/v1/top_tenants` and decodes the JSON response.
///
/// Errors from sending the request are propagated unchanged via `?`; a failure to
/// read or decode the response body is reported as `Error::ReceiveBody`.
pub async fn top_tenant_shards(
&self,
request: TopTenantShardsRequest,
) -> Result<TopTenantShardsResponse> {
let uri = format!("{}/v1/top_tenants", self.mgmt_api_endpoint);
self.request(Method::POST, uri, request)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
pub async fn layer_map_info(
&self,
tenant_shard_id: TenantShardId,
@@ -565,57 +549,4 @@ impl Client {
}),
}
}
/// Sends a set of aux files to the pageserver for ingestion into a timeline.
///
/// POSTs an `IngestAuxFilesRequest` to
/// `{mgmt_api_endpoint}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/ingest_aux_files`.
///
/// Returns `Ok(true)` on HTTP 200. Any other status is turned into an error:
/// `Error::ApiError` when the body parses as an `HttpErrorBody`, and
/// `Error::ReceiveErrorBody` otherwise.
pub async fn ingest_aux_files(
    &self,
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
    aux_files: HashMap<String, String>,
) -> Result<bool> {
    let uri = format!(
        "{}/v1/tenant/{}/timeline/{}/ingest_aux_files",
        self.mgmt_api_endpoint, tenant_shard_id, timeline_id
    );
    let body = IngestAuxFilesRequest { aux_files };
    let resp = self.request_noerror(Method::POST, &uri, body).await?;

    let status = resp.status();
    if status == StatusCode::OK {
        return Ok(true);
    }

    // Non-200: prefer the structured error message from the server, if there is one.
    let failure = match resp.json::<HttpErrorBody>().await {
        Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
        Err(_) => {
            Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
        }
    };
    Err(failure)
}
pub async fn list_aux_files(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
lsn: Lsn,
) -> Result<HashMap<String, Bytes>> {
let uri = format!(
"{}/v1/tenant/{}/timeline/{}/list_aux_files",
self.mgmt_api_endpoint, tenant_shard_id, timeline_id
);
let resp = self
.request_noerror(Method::POST, &uri, ListAuxFilesRequest { lsn })
.await?;
match resp.status() {
StatusCode::OK => {
let resp: HashMap<String, Bytes> = resp.json().await.map_err(|e| {
Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, format!("{e}"))
})?;
Ok(resp)
}
status => Err(match resp.json::<HttpErrorBody>().await {
Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
Err(_) => {
Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
}
}),
}
}
}

View File

@@ -1,5 +1,4 @@
use clap::{Parser, Subcommand};
use pageserver_compaction::helpers::PAGE_SZ;
use pageserver_compaction::simulator::MockTimeline;
use rand::Rng;
use std::io::Write;
@@ -52,7 +51,7 @@ async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()>
let mut executor = MockTimeline::new();
// Convert the logical size in MB into a key range.
let key_range = 0..((cmd.logical_size * 1024 * 1024) / PAGE_SZ);
let key_range = 0..((cmd.logical_size * 1024 * 1024) / 8192);
//let key_range = u64::MIN..u64::MAX;
println!(
"starting simulation with key range {:016X}-{:016X}",

View File

@@ -25,7 +25,7 @@ use std::collections::{HashSet, VecDeque};
use std::ops::Range;
use crate::helpers::{
accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, PAGE_SZ,
accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with,
};
use crate::interface::*;
use utils::lsn::Lsn;
@@ -379,7 +379,7 @@ where
.get_keyspace(&job.key_range, job.lsn_range.end, ctx)
.await?,
&self.shard_identity,
) * PAGE_SZ;
) * 8192;
let wal_size = job
.input_layers
@@ -441,7 +441,7 @@ where
let mut window = KeyspaceWindow::new(
E::Key::MIN..E::Key::MAX,
keyspace,
self.target_file_size / PAGE_SZ,
self.target_file_size / 8192,
);
while let Some(key_range) = window.choose_next_image(&self.shard_identity) {
new_jobs.push(CompactionJob::<E> {
@@ -663,8 +663,8 @@ where
}
}
/// Sliding window through keyspace and values for image layer
/// This is used by [`LevelCompactionState::cover_with_images`] to decide on good split points
// Sliding window through keyspace and values
// This is used by cover_with_images to decide on good split points
struct KeyspaceWindow<K> {
head: KeyspaceWindowHead<K>,
@@ -804,9 +804,9 @@ struct WindowElement<K> {
accum_size: u64,
}
/// Sliding window through keyspace and values for delta layer tiling
///
/// This is used to decide which delta layer to write next.
// Sliding window through keyspace and values
//
// This is used to decide what layer to write next, from the beginning of the window.
struct Window<K> {
elems: VecDeque<WindowElement<K>>,
@@ -830,13 +830,11 @@ where
fn feed(&mut self, key: K, size: u64) {
let last_size;
if let Some(last) = self.elems.back_mut() {
// We require the keys to be strictly increasing for the window.
// Keys should already have been deduplicated by `accum_key_values`
assert!(
last.last_key < key,
"last_key(={}) >= key(={key})",
last.last_key
);
assert!(last.last_key <= key);
if key == last.last_key {
last.accum_size += size;
return;
}
last_size = last.accum_size;
} else {
last_size = 0;
@@ -924,7 +922,7 @@ where
// If we're willing to stretch it up to 1.25 target size, could we
// gobble up the rest of the work? This avoids creating very small
// "tail" layers at the end of the keyspace
if !has_more && self.remain_size() < target_size * 5 / 4 {
if !has_more && self.remain_size() < target_size * 5 / 3 {
self.commit_upto(self.elems.len());
} else {
let delta_split_at = self.find_size_split(target_size);

View File

@@ -16,8 +16,6 @@ use std::pin::Pin;
use std::task::{ready, Poll};
use utils::lsn::Lsn;
pub const PAGE_SZ: u64 = 8192;
pub fn keyspace_total_size<K>(
keyspace: &CompactionKeySpace<K>,
shard_identity: &ShardIdentity,

View File

@@ -14,7 +14,6 @@ use std::ops::Range;
use std::sync::Arc;
use std::sync::Mutex;
use crate::helpers::PAGE_SZ;
use crate::helpers::{merge_delta_keys, overlaps_with};
use crate::interface;
@@ -380,8 +379,8 @@ impl interface::CompactionLayer<Key> for MockLayer {
}
fn file_size(&self) -> u64 {
match self {
MockLayer::Delta(this) => this.file_size,
MockLayer::Image(this) => this.file_size,
MockLayer::Delta(this) => this.file_size(),
MockLayer::Image(this) => this.file_size(),
}
}
fn short_id(&self) -> String {
@@ -510,7 +509,7 @@ impl interface::CompactionJobExecutor for MockTimeline {
let new_layer = Arc::new(MockImageLayer {
key_range: key_range.clone(),
lsn_range: lsn..lsn,
file_size: accum_size * PAGE_SZ,
file_size: accum_size * 8192,
deleted: Mutex::new(false),
});
info!(

View File

@@ -52,6 +52,7 @@
use anyhow::{Context, Result};
use pageserver::repository::Key;
use pageserver::METADATA_FILE_NAME;
use std::cmp::Ordering;
use std::io::{self, BufRead};
use std::path::PathBuf;
@@ -158,6 +159,10 @@ pub fn main() -> Result<()> {
let line = PathBuf::from_str(&line).unwrap();
let filename = line.file_name().unwrap();
let filename = filename.to_str().unwrap();
if filename == METADATA_FILE_NAME {
// Don't try and parse "metadata" like a key-lsn range
continue;
}
let (key_range, lsn_range) = parse_filename(filename);
files.push(Layer {
filename: filename.to_owned(),

View File

@@ -2,7 +2,7 @@ use std::collections::HashMap;
use anyhow::Context;
use camino::Utf8PathBuf;
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
use pageserver::tenant::storage_layer::LayerName;
use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
use utils::lsn::Lsn;
@@ -19,7 +19,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
#[derive(serde::Serialize)]
struct Output<'a> {
layer_metadata: &'a HashMap<LayerName, LayerFileMetadata>,
layer_metadata: &'a HashMap<LayerName, IndexLayerMetadata>,
disk_consistent_lsn: Lsn,
timeline_metadata: &'a TimelineMetadata,
}

View File

@@ -219,7 +219,6 @@ fn handle_metadata(
let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
println!("Current metadata:\n{meta:?}");
let mut update_meta = false;
// TODO: simplify this part
if let Some(disk_consistent_lsn) = disk_consistent_lsn {
meta = TimelineMetadata::new(
*disk_consistent_lsn,

View File

@@ -1,98 +0,0 @@
use pageserver_api::models::{AuxFilePolicy, TenantConfig, TenantConfigRequest};
use pageserver_api::shard::TenantShardId;
use utils::id::TenantTimelineId;
use utils::lsn::Lsn;
use std::collections::HashMap;
use std::sync::Arc;
/// Ingest aux files into the pageserver.
#[derive(clap::Parser)]
pub(crate) struct Args {
// NOTE: with the clap derive, `///` doc comments on the struct and its fields are
// used as help text, so the explanatory notes below are plain `//` comments to
// leave the CLI output unchanged.
//
// Base URL of the pageserver management API; passed to the mgmt API client.
#[clap(long, default_value = "http://localhost:9898")]
mgmt_api_endpoint: String,
// Page service connection string. Not referenced by this command's `main_impl`.
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
page_service_connstring: String,
// Optional JWT handed to the mgmt API client.
#[clap(long)]
pageserver_jwt: Option<String>,
// Positional targets; when given, `main_impl` requires exactly one entry.
targets: Option<Vec<TenantTimelineId>>,
}
/// Synchronous entry point: builds a multi-threaded Tokio runtime, spawns
/// `main_impl` on it and blocks until that task finishes.
///
/// Panics if the runtime cannot be built or if joining the spawned task fails;
/// otherwise returns whatever `main_impl` returned.
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
let rt = tokio::runtime::Builder::new_multi_thread()
.enable_all()
.build()
.unwrap();
let main_task = rt.spawn(main_impl(args));
// The `unwrap` is on the join result, not on the `anyhow::Result` of `main_impl`.
rt.block_on(main_task).unwrap()
}
async fn main_impl(args: Args) -> anyhow::Result<()> {
let args: &'static Args = Box::leak(Box::new(args));
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
args.mgmt_api_endpoint.clone(),
args.pageserver_jwt.as_deref(),
));
// discover targets
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
&mgmt_api_client,
crate::util::cli::targets::Spec {
limit_to_first_n_targets: None,
targets: {
if let Some(targets) = &args.targets {
if targets.len() != 1 {
anyhow::bail!("must specify exactly one target");
}
Some(targets.clone())
} else {
None
}
},
},
)
.await?;
let timeline = timelines[0];
let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
let timeline_id = timeline.timeline_id;
println!("operating on timeline {}", timeline);
mgmt_api_client
.tenant_config(&TenantConfigRequest {
tenant_id: timeline.tenant_id,
config: TenantConfig {
switch_aux_file_policy: Some(AuxFilePolicy::V2),
..Default::default()
},
})
.await?;
for batch in 0..100 {
let items = (0..100)
.map(|id| {
(
format!("pg_logical/mappings/{:03}.{:03}", batch, id),
format!("{:08}", id),
)
})
.collect::<HashMap<_, _>>();
let file_cnt = items.len();
mgmt_api_client
.ingest_aux_files(tenant_shard_id, timeline_id, items)
.await?;
println!("ingested {file_cnt} files");
}
let files = mgmt_api_client
.list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
.await?;
println!("{} files found", files.len());
anyhow::Ok(())
}

View File

@@ -2,11 +2,9 @@ use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId};
use pageserver_client::mgmt_api;
use rand::seq::SliceRandom;
use tokio_util::sync::CancellationToken;
use tracing::{debug, info};
use utils::id::{TenantTimelineId, TimelineId};
use std::{f64, sync::Arc};
use tokio::{
sync::{mpsc, OwnedSemaphorePermit},
task::JoinSet,
@@ -14,7 +12,10 @@ use tokio::{
use std::{
num::NonZeroUsize,
sync::atomic::{AtomicU64, Ordering},
sync::{
atomic::{AtomicU64, Ordering},
Arc,
},
time::{Duration, Instant},
};
@@ -50,31 +51,19 @@ pub(crate) fn main(args: Args) -> anyhow::Result<()> {
Ok(())
}
/// Final summary of a run, printed by `main_impl` as pretty JSON.
#[derive(serde::Serialize)]
struct Output {
// Total number of completed downloads.
downloads_count: u64,
// Total of the sizes passed to `download_done`.
downloads_bytes: u64,
// Total number of completed evictions.
evictions_count: u64,
// Total number of timeline restarts.
timeline_restarts: u64,
// Wall-clock duration of the run; serialized through `humantime_serde`.
#[serde(with = "humantime_serde")]
runtime: Duration,
}
#[derive(Debug, Default)]
struct LiveStats {
evictions_count: AtomicU64,
downloads_count: AtomicU64,
downloads_bytes: AtomicU64,
evictions: AtomicU64,
downloads: AtomicU64,
timeline_restarts: AtomicU64,
}
impl LiveStats {
fn eviction_done(&self) {
self.evictions_count.fetch_add(1, Ordering::Relaxed);
self.evictions.fetch_add(1, Ordering::Relaxed);
}
fn download_done(&self, size: u64) {
self.downloads_count.fetch_add(1, Ordering::Relaxed);
self.downloads_bytes.fetch_add(size, Ordering::Relaxed);
fn download_done(&self) {
self.downloads.fetch_add(1, Ordering::Relaxed);
}
fn timeline_restart_done(&self) {
self.timeline_restarts.fetch_add(1, Ordering::Relaxed);
@@ -103,49 +92,28 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
)
.await?;
let token = CancellationToken::new();
let mut tasks = JoinSet::new();
let periodic_stats = Arc::new(LiveStats::default());
let total_stats = Arc::new(LiveStats::default());
let start = Instant::now();
let live_stats = Arc::new(LiveStats::default());
tasks.spawn({
let periodic_stats = Arc::clone(&periodic_stats);
let total_stats = Arc::clone(&total_stats);
let cloned_token = token.clone();
let live_stats = Arc::clone(&live_stats);
async move {
let mut last_at = Instant::now();
loop {
if cloned_token.is_cancelled() {
return;
}
tokio::time::sleep_until((last_at + Duration::from_secs(1)).into()).await;
let now = Instant::now();
let delta: Duration = now - last_at;
last_at = now;
let LiveStats {
evictions_count,
downloads_count,
downloads_bytes,
evictions,
downloads,
timeline_restarts,
} = &*periodic_stats;
let evictions_count = evictions_count.swap(0, Ordering::Relaxed);
let downloads_count = downloads_count.swap(0, Ordering::Relaxed);
let downloads_bytes = downloads_bytes.swap(0, Ordering::Relaxed);
} = &*live_stats;
let evictions = evictions.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
let downloads = downloads.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
let timeline_restarts = timeline_restarts.swap(0, Ordering::Relaxed);
total_stats.evictions_count.fetch_add(evictions_count, Ordering::Relaxed);
total_stats.downloads_count.fetch_add(downloads_count, Ordering::Relaxed);
total_stats.downloads_bytes.fetch_add(downloads_bytes, Ordering::Relaxed);
total_stats.timeline_restarts.fetch_add(timeline_restarts, Ordering::Relaxed);
let evictions_per_s = evictions_count as f64 / delta.as_secs_f64();
let downloads_per_s = downloads_count as f64 / delta.as_secs_f64();
let downloads_mibs_per_s = downloads_bytes as f64 / delta.as_secs_f64() / ((1 << 20) as f64);
info!("evictions={evictions_per_s:.2}/s downloads={downloads_per_s:.2}/s download_bytes={downloads_mibs_per_s:.2}MiB/s timeline_restarts={timeline_restarts}");
info!("evictions={evictions:.2}/s downloads={downloads:.2}/s timeline_restarts={timeline_restarts}");
}
}
});
@@ -156,42 +124,14 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
args,
Arc::clone(&mgmt_api_client),
tl,
Arc::clone(&periodic_stats),
token.clone(),
Arc::clone(&live_stats),
));
}
}
if let Some(runtime) = args.runtime {
tokio::spawn(async move {
tokio::time::sleep(runtime.into()).await;
token.cancel();
});
}
while let Some(res) = tasks.join_next().await {
res.unwrap();
}
let end = Instant::now();
let duration: Duration = end - start;
let output = {
let LiveStats {
evictions_count,
downloads_count,
downloads_bytes,
timeline_restarts,
} = &*total_stats;
Output {
downloads_count: downloads_count.load(Ordering::Relaxed),
downloads_bytes: downloads_bytes.load(Ordering::Relaxed),
evictions_count: evictions_count.load(Ordering::Relaxed),
timeline_restarts: timeline_restarts.load(Ordering::Relaxed),
runtime: duration,
}
};
let output = serde_json::to_string_pretty(&output).unwrap();
println!("{output}");
Ok(())
}
@@ -200,7 +140,6 @@ async fn timeline_actor(
mgmt_api_client: Arc<pageserver_client::mgmt_api::Client>,
timeline: TenantTimelineId,
live_stats: Arc<LiveStats>,
token: CancellationToken,
) {
// TODO: support sharding
let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
@@ -210,7 +149,7 @@ async fn timeline_actor(
layers: Vec<mpsc::Sender<OwnedSemaphorePermit>>,
concurrency: Arc<tokio::sync::Semaphore>,
}
while !token.is_cancelled() {
loop {
debug!("restarting timeline");
let layer_map_info = mgmt_api_client
.layer_map_info(tenant_shard_id, timeline.timeline_id)
@@ -246,7 +185,7 @@ async fn timeline_actor(
live_stats.timeline_restart_done();
while !token.is_cancelled() {
loop {
assert!(!timeline.joinset.is_empty());
if let Some(res) = timeline.joinset.try_join_next() {
debug!(?res, "a layer actor exited, should not happen");
@@ -316,7 +255,7 @@ async fn layer_actor(
.layer_ondemand_download(tenant_shard_id, timeline_id, layer.layer_file_name())
.await
.unwrap();
live_stats.download_done(layer.layer_file_size());
live_stats.download_done();
did_it
}
};

View File

@@ -14,7 +14,6 @@ mod util {
/// The pagebench CLI sub-commands, dispatched in [`main`] below.
mod cmd {
pub(super) mod aux_files;
pub(super) mod basebackup;
pub(super) mod getpage_latest_lsn;
pub(super) mod ondemand_download_churn;
@@ -28,7 +27,6 @@ enum Args {
GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
OndemandDownloadChurn(cmd::ondemand_download_churn::Args),
AuxFiles(cmd::aux_files::Args),
}
fn main() {
@@ -48,7 +46,6 @@ fn main() {
cmd::trigger_initial_size_calculation::main(args)
}
Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
Args::AuxFiles(args) => cmd::aux_files::main(args),
}
.unwrap()
}

View File

@@ -5,35 +5,14 @@ use bytes::{Buf, BufMut, Bytes};
use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE};
use tracing::warn;
// BEGIN Copyright (c) 2017 Servo Contributors
/// 128-bit FNV hash that can be evaluated in `const` contexts.
///
/// Starting from a fixed initial state, every input byte is XOR-ed into the running
/// value, which is then multiplied (wrapping on overflow) by a fixed prime. An empty
/// input therefore returns the initial state unchanged.
#[inline]
#[must_use]
pub const fn fnv_hash(bytes: &[u8]) -> u128 {
    const INITIAL_STATE: u128 = 0x6c62272e07bb014262b821756295c58d;
    const PRIME: u128 = 0x0000000001000000000000000000013B;

    let mut state = INITIAL_STATE;
    let mut remaining = bytes;
    // `for` loops are not usable in a `const fn`, so peel one byte off the front of
    // the slice on each iteration instead.
    while let [byte, tail @ ..] = remaining {
        state = (state ^ *byte as u128).wrapping_mul(PRIME);
        remaining = tail;
    }
    state
}
// END Copyright (c) 2017 Servo Contributors
/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, least significant 13B of FNV hash].
/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, first 13B of 128b xxhash].
fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key {
let mut key: [u8; 16] = [0; METADATA_KEY_SIZE];
let hash = fnv_hash(data).to_be_bytes();
let mut key = [0; METADATA_KEY_SIZE];
let hash = twox_hash::xxh3::hash128(data).to_be_bytes();
key[0] = AUX_KEY_PREFIX;
key[1] = dir_level1;
key[2] = dir_level2;
key[3..16].copy_from_slice(&hash[3..16]);
key[3..16].copy_from_slice(&hash[0..13]);
Key::from_metadata_key_fixed_size(&key)
}
@@ -221,19 +200,15 @@ mod tests {
fn test_hash_portable() {
// AUX file encoding requires the hash to be portable across all platforms. This test case checks
// if the algorithm produces the same hash across different environments.
assert_eq!(
265160408618497461376862998434862070044,
super::fnv_hash("test1".as_bytes())
305317690835051308206966631765527126151,
twox_hash::xxh3::hash128("test1".as_bytes())
);
assert_eq!(
295486155126299629456360817749600553988,
super::fnv_hash("test/test2".as_bytes())
);
assert_eq!(
144066263297769815596495629667062367629,
super::fnv_hash("".as_bytes())
85104974691013376326742244813280798847,
twox_hash::xxh3::hash128("test/test2".as_bytes())
);
assert_eq!(0, twox_hash::xxh3::hash128("".as_bytes()));
}
#[test]
@@ -241,28 +216,28 @@ mod tests {
// To correctly retrieve AUX files, the generated keys for the same file must be the same for all versions
// of the page server.
assert_eq!(
"62000001017F8B83D94F7081693471ABF91C",
encode_aux_file_key("pg_logical/mappings/test1").to_string(),
"6200000101E5B20C5F8DD5AA3289D6D9EAFA",
encode_aux_file_key("pg_logical/mappings/test1").to_string()
);
assert_eq!(
"62000001027F8E83D94F7081693471ABFCCD",
encode_aux_file_key("pg_logical/snapshots/test2").to_string(),
"620000010239AAC544893139B26F501B97E6",
encode_aux_file_key("pg_logical/snapshots/test2").to_string()
);
assert_eq!(
"62000001032E07BB014262B821756295C58D",
encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string(),
"620000010300000000000000000000000000",
encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string()
);
assert_eq!(
"62000001FF4F38E1C74754E7D03C1A660178",
encode_aux_file_key("pg_logical/unsupported").to_string(),
"62000001FF8635AF2134B7266EC5B4189FD6",
encode_aux_file_key("pg_logical/unsupported").to_string()
);
assert_eq!(
"62000002017F8D83D94F7081693471ABFB92",
"6200000201772D0E5D71DE14DA86142A1619",
encode_aux_file_key("pg_replslot/test3").to_string()
);
assert_eq!(
"620000FFFF2B6ECC8AEF93F643DC44F15E03",
encode_aux_file_key("other_file_not_supported").to_string(),
"620000FFFF1866EBEB53B807B26A2416F317",
encode_aux_file_key("other_file_not_supported").to_string()
);
}

View File

@@ -284,6 +284,7 @@ fn start_pageserver(
))
.unwrap();
pageserver::preinitialize_metrics();
pageserver::metrics::wal_redo::set_process_kind_metric(conf.walredo_process_kind);
// If any failpoints were set from FAILPOINTS environment variable,
// print them to the log for debugging purposes
@@ -382,7 +383,7 @@ fn start_pageserver(
let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
// Set up remote storage client
let remote_storage = create_remote_storage_client(conf)?;
let remote_storage = Some(create_remote_storage_client(conf)?);
// Set up deletion queue
let (deletion_queue, deletion_workers) = DeletionQueue::new(
@@ -515,12 +516,16 @@ fn start_pageserver(
}
});
let secondary_controller = secondary::spawn_tasks(
tenant_manager.clone(),
remote_storage.clone(),
background_jobs_barrier.clone(),
shutdown_pageserver.clone(),
);
let secondary_controller = if let Some(remote_storage) = &remote_storage {
secondary::spawn_tasks(
tenant_manager.clone(),
remote_storage.clone(),
background_jobs_barrier.clone(),
shutdown_pageserver.clone(),
)
} else {
secondary::null_controller()
};
// shared state between the disk-usage backed eviction background task and the http endpoint
// that allows triggering disk-usage based eviction manually. note that the http endpoint
@@ -528,13 +533,15 @@ fn start_pageserver(
// been configured.
let disk_usage_eviction_state: Arc<disk_usage_eviction_task::State> = Arc::default();
launch_disk_usage_global_eviction_task(
conf,
remote_storage.clone(),
disk_usage_eviction_state.clone(),
tenant_manager.clone(),
background_jobs_barrier.clone(),
)?;
if let Some(remote_storage) = &remote_storage {
launch_disk_usage_global_eviction_task(
conf,
remote_storage.clone(),
disk_usage_eviction_state.clone(),
tenant_manager.clone(),
background_jobs_barrier.clone(),
)?;
}
// Start up the service to handle HTTP mgmt API request. We created the
// listener earlier already.
@@ -689,7 +696,14 @@ fn start_pageserver(
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
// The plan is to change that over time.
shutdown_pageserver.take();
pageserver::shutdown_pageserver(&tenant_manager, deletion_queue.clone(), 0).await;
let bg_remote_storage = remote_storage.clone();
let bg_deletion_queue = deletion_queue.clone();
pageserver::shutdown_pageserver(
&tenant_manager,
bg_remote_storage.map(|_| bg_deletion_queue),
0,
)
.await;
unreachable!()
})
}

View File

@@ -99,7 +99,7 @@ pub mod defaults {
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "async";
pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "sync";
///
/// Default built-in configuration file.

View File

@@ -632,7 +632,7 @@ impl DeletionQueue {
///
/// If remote_storage is None, then the returned workers will also be None.
pub fn new<C>(
remote_storage: GenericRemoteStorage,
remote_storage: Option<GenericRemoteStorage>,
control_plane_client: Option<C>,
conf: &'static PageServerConf,
) -> (Self, Option<DeletionQueueWorkers<C>>)
@@ -658,6 +658,23 @@ impl DeletionQueue {
// longer to flush after Tenants have all been torn down.
let cancel = CancellationToken::new();
let remote_storage = match remote_storage {
None => {
return (
Self {
client: DeletionQueueClient {
tx,
executor_tx,
lsn_table: lsn_table.clone(),
},
cancel,
},
None,
)
}
Some(r) => r,
};
(
Self {
client: DeletionQueueClient {
@@ -748,7 +765,7 @@ mod test {
/// Simulate a pageserver restart by destroying and recreating the deletion queue
async fn restart(&mut self) {
let (deletion_queue, workers) = DeletionQueue::new(
self.storage.clone(),
Some(self.storage.clone()),
Some(self.mock_control_plane.clone()),
self.harness.conf,
);
@@ -858,7 +875,7 @@ mod test {
let mock_control_plane = MockControlPlane::new();
let (deletion_queue, worker) = DeletionQueue::new(
storage.clone(),
Some(storage.clone()),
Some(mock_control_plane.clone()),
harness.conf,
);

View File

@@ -534,12 +534,18 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
});
}
EvictionLayer::Secondary(layer) => {
let file_size = layer.metadata.file_size;
let file_size = layer.metadata.file_size();
let tenant_manager = tenant_manager.clone();
js.spawn(async move {
layer
.secondary_tenant
.evict_layer(layer.timeline_id, layer.name)
.evict_layer(
tenant_manager.get_conf(),
layer.timeline_id,
layer.name,
layer.metadata,
)
.await;
Ok(file_size)
});
@@ -641,7 +647,7 @@ impl EvictionLayer {
pub(crate) fn get_file_size(&self) -> u64 {
match self {
Self::Attached(l) => l.layer_desc().file_size,
Self::Secondary(sl) => sl.metadata.file_size,
Self::Secondary(sl) => sl.metadata.file_size(),
}
}
}

View File

@@ -257,37 +257,6 @@ paths:
schema:
$ref: "#/components/schemas/LsnByTimestampResponse"
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease:
parameters:
- name: tenant_shard_id
in: path
required: true
schema:
type: string
- name: timeline_id
in: path
required: true
schema:
type: string
format: hex
post:
description: Obtain lease for the given LSN
parameters:
- name: lsn
in: query
required: true
schema:
type: string
format: hex
description: A LSN to obtain the lease for
responses:
"200":
description: OK
content:
application/json:
schema:
$ref: "#/components/schemas/LsnLease"
/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
parameters:
- name: tenant_id
@@ -612,80 +581,6 @@ paths:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor:
parameters:
- name: tenant_shard_id
in: path
required: true
schema:
type: string
- name: timeline_id
in: path
required: true
schema:
type: string
put:
description: |
Detach a timeline from its ancestor and reparent all of the ancestor's timelines with a lower `ancestor_lsn`.
Current implementation might not be retryable across failure cases, but will be enhanced in future.
Detaching should be expected to be expensive operation. Timeouts should be retried.
responses:
"200":
description: |
The timeline has been detached from its ancestor (now or earlier), and at least the returned timelines have been reparented.
If any timelines were deleted after reparenting, they might not be on this list.
content:
application/json:
schema:
$ref: "#/components/schemas/AncestorDetached"
"400":
description: |
Number of early checks meaning the timeline cannot be detached now:
- the ancestor of timeline has an ancestor: not supported, see RFC
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"404":
description: Tenant or timeline not found.
content:
application/json:
schema:
$ref: "#/components/schemas/NotFoundError"
"409":
description: |
The timeline can never be detached:
- timeline has no ancestor, implying that the timeline has never had an ancestor
content:
application/json:
schema:
$ref: "#/components/schemas/ConflictError"
"500":
description: |
Transient error, for example, pageserver shutdown happened while
processing the request but we were unable to distinguish that. Must
be retried.
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: |
Temporarily unavailable, please retry. Possible reasons:
- another timeline detach for the same tenant is underway, please retry later
- detected shutdown error
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/:
get:
description: Get tenants list
@@ -1085,15 +980,6 @@ components:
type: string
enum: [past, present, future, nodata]
LsnLease:
type: object
required:
- valid_until
properties:
valid_until:
type: string
format: date-time
PageserverUtilization:
type: object
required:
@@ -1151,19 +1037,6 @@ components:
format: int64
description: How many bytes of layer content were in the latest layer heatmap
AncestorDetached:
type: object
required:
- reparented_timelines
properties:
reparented_timelines:
type: array
description: Set of reparented timeline ids
properties:
type: string
format: hex
description: TimelineId
Error:
type: object

View File

@@ -1,8 +1,6 @@
//!
//! Management HTTP API
//!
use std::cmp::Reverse;
use std::collections::BinaryHeap;
use std::collections::HashMap;
use std::str::FromStr;
use std::sync::Arc;
@@ -16,9 +14,6 @@ use hyper::header;
use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use metrics::launch_timestamp::LaunchTimestamp;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::models::IngestAuxFilesRequest;
use pageserver_api::models::ListAuxFilesRequest;
use pageserver_api::models::LocationConfig;
use pageserver_api::models::LocationConfigListResponse;
use pageserver_api::models::ShardParameters;
@@ -29,11 +24,7 @@ use pageserver_api::models::TenantScanRemoteStorageShard;
use pageserver_api::models::TenantShardLocation;
use pageserver_api::models::TenantShardSplitRequest;
use pageserver_api::models::TenantShardSplitResponse;
use pageserver_api::models::TenantSorting;
use pageserver_api::models::TenantState;
use pageserver_api::models::TopTenantShardItem;
use pageserver_api::models::TopTenantShardsRequest;
use pageserver_api::models::TopTenantShardsResponse;
use pageserver_api::models::{
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
TenantLoadRequest, TenantLocationConfigRequest,
@@ -75,7 +66,6 @@ use crate::tenant::storage_layer::LayerAccessStatsReset;
use crate::tenant::storage_layer::LayerName;
use crate::tenant::timeline::CompactFlags;
use crate::tenant::timeline::Timeline;
use crate::tenant::GetTimelineError;
use crate::tenant::SpawnMode;
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
use crate::{config::PageServerConf, tenant::mgr};
@@ -114,7 +104,7 @@ pub struct State {
tenant_manager: Arc<TenantManager>,
auth: Option<Arc<SwappableJwtAuth>>,
allowlist_routes: Vec<Uri>,
remote_storage: GenericRemoteStorage,
remote_storage: Option<GenericRemoteStorage>,
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
deletion_queue_client: DeletionQueueClient,
@@ -128,7 +118,7 @@ impl State {
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
auth: Option<Arc<SwappableJwtAuth>>,
remote_storage: GenericRemoteStorage,
remote_storage: Option<GenericRemoteStorage>,
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
deletion_queue_client: DeletionQueueClient,
@@ -281,13 +271,6 @@ impl From<GetTenantError> for ApiError {
}
}
impl From<GetTimelineError> for ApiError {
fn from(gte: GetTimelineError) -> Self {
// Rationale: tenant is activated only after eligible timelines activate
ApiError::NotFound(gte.into())
}
}
impl From<GetActiveTenantError> for ApiError {
fn from(e: GetActiveTenantError) -> ApiError {
match e {
@@ -395,7 +378,7 @@ async fn build_timeline_info_common(
let guard = timeline.last_received_wal.lock().unwrap();
if let Some(info) = guard.as_ref() {
(
Some(format!("{}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only.
Some(format!("{:?}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only.
Some(info.last_received_msg_lsn),
Some(info.last_received_msg_ts),
)
@@ -450,8 +433,6 @@ async fn build_timeline_info_common(
state,
walreceiver_status,
last_aux_file_policy: timeline.last_aux_file_policy.load(),
};
Ok(info)
}
@@ -652,7 +633,9 @@ async fn timeline_preserve_initdb_handler(
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
let timeline = tenant.get_timeline(timeline_id, false)?;
let timeline = tenant
.get_timeline(timeline_id, false)
.map_err(|e| ApiError::NotFound(e.into()))?;
timeline
.preserve_initdb_archive()
@@ -694,7 +677,9 @@ async fn timeline_detail_handler(
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let timeline = tenant.get_timeline(timeline_id, false)?;
let timeline = tenant
.get_timeline(timeline_id, false)
.map_err(|e| ApiError::NotFound(e.into()))?;
let timeline_info = build_timeline_info(
&timeline,
@@ -828,6 +813,12 @@ async fn tenant_attach_handler(
let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
if state.remote_storage.is_none() {
return Err(ApiError::BadRequest(anyhow!(
"attach_tenant is not possible because pageserver was configured without remote storage"
)));
}
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
let shard_params = ShardParameters::default();
let location_conf = LocationConf::attached_single(tenant_conf, generation, &shard_params);
@@ -1652,6 +1643,12 @@ async fn tenant_time_travel_remote_storage_handler(
)));
}
let Some(storage) = state.remote_storage.as_ref() else {
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"remote storage not configured, cannot run time travel"
)));
};
if timestamp > done_if_after {
return Err(ApiError::BadRequest(anyhow!(
"The done_if_after timestamp comes before the timestamp to recover to"
@@ -1661,7 +1658,7 @@ async fn tenant_time_travel_remote_storage_handler(
tracing::info!("Issuing time travel request internally. timestamp={timestamp_raw}, done_if_after={done_if_after_raw}");
remote_timeline_client::upload::time_travel_recover_tenant(
&state.remote_storage,
storage,
&tenant_shard_id,
timestamp,
done_if_after,
@@ -1706,32 +1703,6 @@ async fn handle_tenant_break(
json_response(StatusCode::OK, ())
}
// Obtains an lsn lease on the given timeline.
async fn lsn_lease_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let lsn: Lsn = parse_query_param(&request, "lsn")?
.ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let state = get_state(&request);
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let result = timeline
.make_lsn_lease(lsn, &ctx)
.map_err(|e| ApiError::InternalServerError(e.context("lsn lease http handler")))?;
json_response(StatusCode::OK, result)
}
// Run GC immediately on given timeline.
async fn timeline_gc_handler(
mut request: Request<Body>,
@@ -1767,8 +1738,6 @@ async fn timeline_compact_handler(
if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
flags |= CompactFlags::ForceImageLayerCreation;
}
let wait_until_uploaded =
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
@@ -1777,9 +1746,6 @@ async fn timeline_compact_handler(
.compact(&cancel, flags, &ctx)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
if wait_until_uploaded {
timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
}
json_response(StatusCode::OK, ())
}
.instrument(info_span!("manual_compaction", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
@@ -1804,8 +1770,6 @@ async fn timeline_checkpoint_handler(
if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
flags |= CompactFlags::ForceImageLayerCreation;
}
let wait_until_uploaded =
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
@@ -1819,10 +1783,6 @@ async fn timeline_checkpoint_handler(
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
if wait_until_uploaded {
timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
}
json_response(StatusCode::OK, ())
}
.instrument(info_span!("manual_checkpoint", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
@@ -1906,11 +1866,14 @@ async fn timeline_detach_ancestor_handler(
let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download);
let ctx = &ctx;
let timeline = tenant.get_timeline(timeline_id, true)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(|e| ApiError::NotFound(e.into()))?;
let (_guard, prepared) = timeline
.prepare_to_detach_from_ancestor(&tenant, options, ctx)
.await?;
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
let res = state
.tenant_manager
@@ -1940,6 +1903,11 @@ async fn deletion_queue_flush(
) -> Result<Response<Body>, ApiError> {
let state = get_state(&r);
if state.remote_storage.is_none() {
// Nothing to do if remote storage is disabled.
return json_response(StatusCode::OK, ());
}
let execute = parse_query_param(&r, "execute")?.unwrap_or(false);
let flush = async {
@@ -2044,7 +2012,9 @@ async fn active_timeline_of_active_tenant(
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
Ok(tenant.get_timeline(timeline_id, true)?)
tenant
.get_timeline(timeline_id, true)
.map_err(|e| ApiError::NotFound(e.into()))
}
async fn always_panic_handler(
@@ -2102,11 +2072,18 @@ async fn disk_usage_eviction_run(
};
let state = get_state(&r);
let Some(storage) = state.remote_storage.as_ref() else {
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"remote storage not configured, cannot run eviction iteration"
)));
};
let eviction_state = state.disk_usage_eviction_state.clone();
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
&eviction_state,
&state.remote_storage,
storage,
usage,
&state.tenant_manager,
config.eviction_order,
@@ -2143,23 +2120,29 @@ async fn tenant_scan_remote_handler(
let state = get_state(&request);
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let Some(remote_storage) = state.remote_storage.as_ref() else {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Remote storage not configured"
)));
};
let mut response = TenantScanRemoteStorageResponse::default();
let (shards, _other_keys) =
list_remote_tenant_shards(&state.remote_storage, tenant_id, cancel.clone())
list_remote_tenant_shards(remote_storage, tenant_id, cancel.clone())
.await
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
for tenant_shard_id in shards {
let (timeline_ids, _other_keys) =
list_remote_timelines(&state.remote_storage, tenant_shard_id, cancel.clone())
list_remote_timelines(remote_storage, tenant_shard_id, cancel.clone())
.await
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
let mut generation = Generation::none();
for timeline_id in timeline_ids {
match download_index_part(
&state.remote_storage,
remote_storage,
&tenant_shard_id,
&timeline_id,
Generation::MAX,
@@ -2308,31 +2291,6 @@ async fn post_tracing_event_handler(
json_response(StatusCode::OK, ())
}
async fn force_aux_policy_switch_handler(
mut r: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
check_permission(&r, None)?;
let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&r, "timeline_id")?;
let policy: AuxFilePolicy = json_request(&mut r).await?;
let state = get_state(&r);
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
timeline
.do_switch_aux_policy(policy)
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
async fn put_io_engine_handler(
mut r: Request<Body>,
_cancel: CancellationToken,
@@ -2395,150 +2353,6 @@ async fn get_utilization(
.map_err(ApiError::InternalServerError)
}
async fn list_aux_files(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let body: ListAuxFilesRequest = json_request(&mut request).await?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let files = timeline.list_aux_files(body.lsn, &ctx).await?;
json_response(StatusCode::OK, files)
}
async fn ingest_aux_files(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let body: IngestAuxFilesRequest = json_request(&mut request).await?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let mut modification = timeline.begin_modification(
Lsn(timeline.get_last_record_lsn().0 + 8), /* advance LSN by 8 */
);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
for (fname, content) in body.aux_files {
modification
.put_file(&fname, content.as_bytes(), &ctx)
.await
.map_err(ApiError::InternalServerError)?;
}
modification
.commit(&ctx)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
/// Report on the largest tenants on this pageserver, for the storage controller to identify
/// candidates for splitting
async fn post_top_tenants(
mut r: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
check_permission(&r, None)?;
let request: TopTenantShardsRequest = json_request(&mut r).await?;
let state = get_state(&r);
fn get_size_metric(sizes: &TopTenantShardItem, order_by: &TenantSorting) -> u64 {
match order_by {
TenantSorting::ResidentSize => sizes.resident_size,
TenantSorting::MaxLogicalSize => sizes.max_logical_size,
}
}
#[derive(Eq, PartialEq)]
struct HeapItem {
metric: u64,
sizes: TopTenantShardItem,
}
impl PartialOrd for HeapItem {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
/// Heap items have reverse ordering on their metric: this enables using BinaryHeap, which
/// supports popping the greatest item but not the smallest.
impl Ord for HeapItem {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
Reverse(self.metric).cmp(&Reverse(other.metric))
}
}
let mut top_n: BinaryHeap<HeapItem> = BinaryHeap::with_capacity(request.limit);
// FIXME: this is a lot of clones to take this tenant list
for (tenant_shard_id, tenant_slot) in state.tenant_manager.list() {
if let Some(shards_lt) = request.where_shards_lt {
// Ignore tenants which already have >= this many shards
if tenant_shard_id.shard_count >= shards_lt {
continue;
}
}
let sizes = match tenant_slot {
TenantSlot::Attached(tenant) => tenant.get_sizes(),
TenantSlot::Secondary(_) | TenantSlot::InProgress(_) => {
continue;
}
};
let metric = get_size_metric(&sizes, &request.order_by);
if let Some(gt) = request.where_gt {
// Ignore tenants whose metric is <= the lower size threshold, to do less sorting work
if metric <= gt {
continue;
}
};
match top_n.peek() {
None => {
// Top N list is empty: candidate becomes first member
top_n.push(HeapItem { metric, sizes });
}
Some(i) if i.metric > metric && top_n.len() < request.limit => {
// Lowest item in list is greater than our candidate, but we aren't at limit yet: push to end
top_n.push(HeapItem { metric, sizes });
}
Some(i) if i.metric > metric => {
// List is at limit and lowest value is greater than our candidate, drop it.
}
Some(_) => top_n.push(HeapItem { metric, sizes }),
}
while top_n.len() > request.limit {
top_n.pop();
}
}
json_response(
StatusCode::OK,
TopTenantShardsResponse {
shards: top_n.into_iter().map(|i| i.sizes).collect(),
},
)
}
/// Common functionality of all the HTTP API handlers.
///
/// - Adds a tracing span to each request (by `request_span`)
@@ -2751,10 +2565,6 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn",
|r| api_handler(r, get_timestamp_of_lsn_handler),
)
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease",
|r| api_handler(r, lsn_lease_handler),
)
.put(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc",
|r| api_handler(r, timeline_gc_handler),
@@ -2828,19 +2638,6 @@ pub fn make_router(
|r| api_handler(r, timeline_collect_keyspace),
)
.put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
.put(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
|r| api_handler(r, force_aux_policy_switch_handler),
)
.get("/v1/utilization", |r| api_handler(r, get_utilization))
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files",
|r| testing_api_handler("ingest_aux_files", r, ingest_aux_files),
)
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/list_aux_files",
|r| testing_api_handler("list_aux_files", r, list_aux_files),
)
.post("/v1/top_tenants", |r| api_handler(r, post_top_tenants))
.any(handler_404))
}

View File

@@ -57,7 +57,7 @@ pub use crate::metrics::preinitialize_metrics;
#[tracing::instrument(skip_all, fields(%exit_code))]
pub async fn shutdown_pageserver(
tenant_manager: &TenantManager,
mut deletion_queue: DeletionQueue,
deletion_queue: Option<DeletionQueue>,
exit_code: i32,
) {
use std::time::Duration;
@@ -89,7 +89,9 @@ pub async fn shutdown_pageserver(
.await;
// Best effort to persist any outstanding deletions, to avoid leaking objects
deletion_queue.shutdown(Duration::from_secs(5)).await;
if let Some(mut deletion_queue) = deletion_queue {
deletion_queue.shutdown(Duration::from_secs(5)).await;
}
// Shut down the HTTP endpoint last, so that you can still check the server's
// status while it's shutting down.
@@ -112,6 +114,10 @@ pub async fn shutdown_pageserver(
std::process::exit(exit_code);
}
/// The name of the metadata file pageserver creates per timeline.
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>/metadata`.
pub const METADATA_FILE_NAME: &str = "metadata";
/// Per-tenant configuration file.
/// Full path: `tenants/<tenant_id>/config`.
pub(crate) const TENANT_CONFIG_NAME: &str = "config";

View File

@@ -525,15 +525,6 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_standby_horizon",
"Standby apply LSN for which GC is hold off, by timeline.",
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_resident_physical_size",
@@ -1867,6 +1858,7 @@ pub(crate) struct WalIngestMetrics {
pub(crate) records_received: IntCounter,
pub(crate) records_committed: IntCounter,
pub(crate) records_filtered: IntCounter,
pub(crate) time_spent_on_ingest: Histogram,
}
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
@@ -1890,6 +1882,12 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
"Number of WAL records filtered out due to sharding"
)
.expect("failed to define a metric"),
time_spent_on_ingest: register_histogram!(
"pageserver_wal_ingest_put_value_seconds",
"Actual time spent on ingesting a record",
redo_histogram_time_buckets!(),
)
.expect("failed to define a metric"),
});
pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
@@ -2001,6 +1999,29 @@ impl Default for WalRedoProcessCounters {
pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
Lazy::new(WalRedoProcessCounters::default);
#[cfg(not(test))]
pub mod wal_redo {
use super::*;
static PROCESS_KIND: Lazy<std::sync::Mutex<UIntGaugeVec>> = Lazy::new(|| {
std::sync::Mutex::new(
register_uint_gauge_vec!(
"pageserver_wal_redo_process_kind",
"The configured process kind for walredo",
&["kind"],
)
.unwrap(),
)
});
pub fn set_process_kind_metric(kind: crate::walredo::ProcessKind) {
// use guard to avoid races around the next two steps
let guard = PROCESS_KIND.lock().unwrap();
guard.reset();
guard.with_label_values(&[&format!("{kind}")]).set(1);
}
}
/// Similar to `prometheus::HistogramTimer` but does not record on drop.
pub(crate) struct StorageTimeMetricsTimer {
metrics: StorageTimeMetrics,
@@ -2100,8 +2121,7 @@ pub(crate) struct TimelineMetrics {
pub garbage_collect_histo: StorageTimeMetrics,
pub find_gc_cutoffs_histo: StorageTimeMetrics,
pub last_record_gauge: IntGauge,
pub standby_horizon_gauge: IntGauge,
pub resident_physical_size_gauge: UIntGauge,
resident_physical_size_gauge: UIntGauge,
/// copy of LayeredTimeline.current_logical_size
pub current_logical_size_gauge: UIntGauge,
pub aux_file_size_gauge: IntGauge,
@@ -2170,9 +2190,6 @@ impl TimelineMetrics {
let last_record_gauge = LAST_RECORD_LSN
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let standby_horizon_gauge = STANDBY_HORIZON
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
@@ -2218,7 +2235,6 @@ impl TimelineMetrics {
find_gc_cutoffs_histo,
load_layer_map_histo,
last_record_gauge,
standby_horizon_gauge,
resident_physical_size_gauge,
current_logical_size_gauge,
aux_file_size_gauge,
@@ -2253,7 +2269,6 @@ impl TimelineMetrics {
let timeline_id = &self.timeline_id;
let shard_id = &self.shard_id;
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
{
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
@@ -2320,7 +2335,6 @@ use pin_project_lite::pin_project;
use std::collections::HashMap;
use std::num::NonZeroUsize;
use std::pin::Pin;
use std::sync::atomic::AtomicU64;
use std::sync::{Arc, Mutex};
use std::task::{Context, Poll};
use std::time::{Duration, Instant};
@@ -2330,35 +2344,35 @@ use crate::task_mgr::TaskKind;
use crate::tenant::mgr::TenantSlot;
/// Maintain a per timeline gauge in addition to the global gauge.
pub(crate) struct PerTimelineRemotePhysicalSizeGauge {
last_set: AtomicU64,
struct PerTimelineRemotePhysicalSizeGauge {
last_set: u64,
gauge: UIntGauge,
}
impl PerTimelineRemotePhysicalSizeGauge {
fn new(per_timeline_gauge: UIntGauge) -> Self {
Self {
last_set: AtomicU64::new(0),
last_set: per_timeline_gauge.get(),
gauge: per_timeline_gauge,
}
}
pub(crate) fn set(&self, sz: u64) {
fn set(&mut self, sz: u64) {
self.gauge.set(sz);
let prev = self.last_set.swap(sz, std::sync::atomic::Ordering::Relaxed);
if sz < prev {
REMOTE_PHYSICAL_SIZE_GLOBAL.sub(prev - sz);
if sz < self.last_set {
REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set - sz);
} else {
REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - prev);
REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - self.last_set);
};
self.last_set = sz;
}
pub(crate) fn get(&self) -> u64 {
fn get(&self) -> u64 {
self.gauge.get()
}
}
impl Drop for PerTimelineRemotePhysicalSizeGauge {
fn drop(&mut self) {
REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set.load(std::sync::atomic::Ordering::Relaxed));
REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set);
}
}
@@ -2366,7 +2380,7 @@ pub(crate) struct RemoteTimelineClientMetrics {
tenant_id: String,
shard_id: String,
timeline_id: String,
pub(crate) remote_physical_size_gauge: PerTimelineRemotePhysicalSizeGauge,
remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
calls: Mutex<HashMap<(&'static str, &'static str), IntCounterPair>>,
bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
@@ -2374,27 +2388,38 @@ pub(crate) struct RemoteTimelineClientMetrics {
impl RemoteTimelineClientMetrics {
pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
let tenant_id_str = tenant_shard_id.tenant_id.to_string();
let shard_id_str = format!("{}", tenant_shard_id.shard_slug());
let timeline_id_str = timeline_id.to_string();
let remote_physical_size_gauge = PerTimelineRemotePhysicalSizeGauge::new(
REMOTE_PHYSICAL_SIZE
.get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str])
.unwrap(),
);
RemoteTimelineClientMetrics {
tenant_id: tenant_id_str,
shard_id: shard_id_str,
timeline_id: timeline_id_str,
tenant_id: tenant_shard_id.tenant_id.to_string(),
shard_id: format!("{}", tenant_shard_id.shard_slug()),
timeline_id: timeline_id.to_string(),
calls: Mutex::new(HashMap::default()),
bytes_started_counter: Mutex::new(HashMap::default()),
bytes_finished_counter: Mutex::new(HashMap::default()),
remote_physical_size_gauge,
remote_physical_size_gauge: Mutex::new(None),
}
}
/// Records `sz` as the remote physical size of this timeline.
///
/// The per-timeline gauge is created on first use from `REMOTE_PHYSICAL_SIZE`, labelled with
/// this object's tenant id, shard id and timeline id, and is then kept in
/// `remote_physical_size_gauge` for later calls.
pub(crate) fn remote_physical_size_set(&self, sz: u64) {
    let mut slot = self.remote_physical_size_gauge.lock().unwrap();
    let per_timeline = slot.get_or_insert_with(|| {
        let labels: [&str; 3] = [&self.tenant_id, &self.shard_id, &self.timeline_id];
        let inner = REMOTE_PHYSICAL_SIZE
            .get_metric_with_label_values(&labels)
            .unwrap();
        PerTimelineRemotePhysicalSizeGauge::new(inner)
    });
    per_timeline.set(sz);
}
/// Returns the value currently held by the per-timeline gauge, or 0 when the gauge has not
/// been created yet.
pub(crate) fn remote_physical_size_get(&self) -> u64 {
    let slot = self.remote_physical_size_gauge.lock().unwrap();
    match slot.as_ref() {
        Some(per_timeline) => per_timeline.get(),
        None => 0,
    }
}
pub fn remote_operation_time(
&self,
file_kind: &RemoteOpFileKind,

View File

@@ -19,7 +19,6 @@ use pageserver_api::models::{
};
use pageserver_api::shard::ShardIndex;
use pageserver_api::shard::ShardNumber;
use pageserver_api::shard::TenantShardId;
use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError};
use pq_proto::framed::ConnectionError;
use pq_proto::FeStartupPacket;
@@ -33,8 +32,6 @@ use std::str;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use std::time::Instant;
use std::time::SystemTime;
use tokio::io::AsyncWriteExt;
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_util::io::StreamReader;
@@ -62,14 +59,11 @@ use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_i
use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::tenant::mgr::GetActiveTenantError;
use crate::tenant::mgr::GetTenantError;
use crate::tenant::mgr::ShardResolveResult;
use crate::tenant::mgr::ShardSelector;
use crate::tenant::mgr::TenantManager;
use crate::tenant::timeline::WaitLsnError;
use crate::tenant::GetTimelineError;
use crate::tenant::PageReconstructError;
use crate::tenant::Tenant;
use crate::tenant::Timeline;
use crate::trace::Tracer;
use pageserver_api::key::rel_block_to_key;
@@ -260,8 +254,6 @@ async fn page_service_conn_main(
socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms)));
let socket = std::pin::pin!(socket);
fail::fail_point!("ps::connection-start::pre-login");
// XXX: pgbackend.run() should take the connection_ctx,
// and create a child per-query context when it invokes process_query.
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
@@ -561,7 +553,13 @@ impl PageServerHandler {
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
let tenant = self
.get_active_tenant_with_timeout(tenant_id, ShardSelector::First, ACTIVE_TENANT_TIMEOUT)
.tenant_manager
.get_active_tenant_with_timeout(
tenant_id,
ShardSelector::First,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)
.await?;
// Make request tracer if needed
@@ -605,7 +603,6 @@ impl PageServerHandler {
};
trace!("query: {copy_data_bytes:?}");
fail::fail_point!("ps::handle-pagerequest-message");
// Trace request if needed
if let Some(t) = tracer.as_mut() {
@@ -620,7 +617,6 @@ impl PageServerHandler {
let (response, span) = match neon_fe_msg {
PagestreamFeMessage::Exists(req) => {
fail::fail_point!("ps::handle-pagerequest-message::exists");
let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn);
(
self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
@@ -630,7 +626,6 @@ impl PageServerHandler {
)
}
PagestreamFeMessage::Nblocks(req) => {
fail::fail_point!("ps::handle-pagerequest-message::nblocks");
let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
(
self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
@@ -640,7 +635,6 @@ impl PageServerHandler {
)
}
PagestreamFeMessage::GetPage(req) => {
fail::fail_point!("ps::handle-pagerequest-message::getpage");
// shard_id is filled in by the handler
let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn);
(
@@ -651,7 +645,6 @@ impl PageServerHandler {
)
}
PagestreamFeMessage::DbSize(req) => {
fail::fail_point!("ps::handle-pagerequest-message::dbsize");
let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
(
self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
@@ -661,7 +654,6 @@ impl PageServerHandler {
)
}
PagestreamFeMessage::GetSlruSegment(req) => {
fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
(
self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx)
@@ -737,7 +729,13 @@ impl PageServerHandler {
// Create empty timeline
info!("creating new timeline");
let tenant = self
.get_active_tenant_with_timeout(tenant_id, ShardSelector::Zero, ACTIVE_TENANT_TIMEOUT)
.tenant_manager
.get_active_tenant_with_timeout(
tenant_id,
ShardSelector::Zero,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)
.await?;
let timeline = tenant
.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
@@ -915,39 +913,6 @@ impl PageServerHandler {
}
}
/// Obtains an LSN lease on the given timeline and writes the result row to `pgb`.
///
/// The reply is a single text column named `valid_until` holding the lease expiry as
/// milliseconds since the Unix epoch, rendered as a decimal string.
///
/// This function deliberately does NOT write `CommandComplete`: the `lease lsn` branch of the
/// query dispatcher sends it once this function returns `Ok(())`. Sending it here as well
/// would produce two completions for one query.
///
/// # Errors
///
/// Fails if the tenant/timeline cannot be resolved to an active timeline, if the lease
/// cannot be created, if the expiry lies before the Unix epoch, or if writing to `pgb` fails.
#[instrument(skip_all, fields(shard_id, %lsn))]
async fn handle_make_lsn_lease<IO>(
    &self,
    pgb: &mut PostgresBackend<IO>,
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
    lsn: Lsn,
    ctx: &RequestContext,
) -> Result<(), QueryError>
where
    IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{
    let shard_selector = ShardSelector::Known(tenant_shard_id.to_index());
    let timeline = self
        .get_active_tenant_timeline(tenant_shard_id.tenant_id, timeline_id, shard_selector)
        .await?;
    let lease = timeline.make_lsn_lease(lsn, ctx)?;
    let valid_until = lease
        .valid_until
        .duration_since(SystemTime::UNIX_EPOCH)
        .map_err(|e| QueryError::Other(e.into()))?;

    // The column is declared as text, so send the decimal representation rather than the
    // raw big-endian bytes of the integer.
    let valid_until_millis = valid_until.as_millis().to_string();

    pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
        b"valid_until",
    )]))?
    .write_message_noflush(&BeMessage::DataRow(&[Some(valid_until_millis.as_bytes())]))?;

    Ok(())
}
#[instrument(skip_all, fields(shard_id))]
async fn handle_get_rel_exists_request(
&mut self,
@@ -1410,68 +1375,19 @@ impl PageServerHandler {
selector: ShardSelector,
) -> Result<Arc<Timeline>, GetActiveTimelineError> {
let tenant = self
.get_active_tenant_with_timeout(tenant_id, selector, ACTIVE_TENANT_TIMEOUT)
.tenant_manager
.get_active_tenant_with_timeout(
tenant_id,
selector,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)
.await
.map_err(GetActiveTimelineError::Tenant)?;
let timeline = tenant.get_timeline(timeline_id, true)?;
set_tracing_field_shard_id(&timeline);
Ok(timeline)
}
/// Resolve `tenant_id` + `shard_selector` to an attached [`Tenant`] and wait until it is
/// in state [`TenantState::Active`].
///
/// While the lookup reports that a slot for this tenant is `InProgress`, we wait on its
/// barrier and look up again. Connection cancellation aborts the wait with
/// [`GetActiveTenantError::Cancelled`].
///
/// `timeout` bounds the whole operation: shard resolution and the wait for activation
/// share a single deadline.
async fn get_active_tenant_with_timeout(
    &self,
    tenant_id: TenantId,
    shard_selector: ShardSelector,
    timeout: Duration,
) -> Result<Arc<Tenant>, GetActiveTenantError> {
    let wait_start = Instant::now();
    let deadline = wait_start + timeout;

    // Normally the first lookup answers immediately; we only go around the loop when the
    // slot we need is InProgress.
    let tenant_shard = loop {
        let barrier = match self
            .tenant_manager
            .resolve_attached_shard(&tenant_id, shard_selector)
        {
            ShardResolveResult::Found(shard) => break shard,
            ShardResolveResult::NotFound => {
                let not_found = GetTenantError::NotFound(tenant_id);
                return Err(GetActiveTenantError::NotFound(not_found));
            }
            // No authoritative answer yet: wait below for the InProgress state to end.
            ShardResolveResult::InProgress(barrier) => barrier,
        };

        tokio::select! {
            _ = self.await_connection_cancelled() => {
                return Err(GetActiveTenantError::Cancelled)
            },
            _ = barrier.wait() => {
                // Barrier released: retry the lookup on the next iteration.
            },
            _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => {
                return Err(GetActiveTenantError::WaitForActiveTimeout {
                    latest_state: None,
                    wait_time: timeout,
                });
            }
        }
    };

    tracing::debug!("Waiting for tenant to enter active state...");
    let remaining = deadline.duration_since(Instant::now());
    tenant_shard.wait_to_become_active(remaining).await?;
    Ok(tenant_shard)
}
}
#[async_trait::async_trait]
@@ -1513,7 +1429,6 @@ where
_pgb: &mut PostgresBackend<IO>,
_sm: &FeStartupPacket,
) -> Result<(), QueryError> {
fail::fail_point!("ps::connection-start::startup-packet");
Ok(())
}
@@ -1528,12 +1443,11 @@ where
Err(QueryError::SimulatedConnectionError)
});
fail::fail_point!("ps::connection-start::process-query");
let ctx = self.connection_ctx.attached_child();
debug!("process query {query_string:?}");
let parts = query_string.split_whitespace().collect::<Vec<_>>();
if let Some(params) = parts.strip_prefix(&["pagestream_v2"]) {
if query_string.starts_with("pagestream_v2 ") {
let (_, params_raw) = query_string.split_at("pagestream_v2 ".len());
let params = params_raw.split(' ').collect::<Vec<_>>();
if params.len() != 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for pagestream command"
@@ -1558,7 +1472,9 @@ where
ctx,
)
.await?;
} else if let Some(params) = parts.strip_prefix(&["pagestream"]) {
} else if query_string.starts_with("pagestream ") {
let (_, params_raw) = query_string.split_at("pagestream ".len());
let params = params_raw.split(' ').collect::<Vec<_>>();
if params.len() != 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for pagestream command"
@@ -1583,7 +1499,10 @@ where
ctx,
)
.await?;
} else if let Some(params) = parts.strip_prefix(&["basebackup"]) {
} else if query_string.starts_with("basebackup ") {
let (_, params_raw) = query_string.split_at("basebackup ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
if params.len() < 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for basebackup command"
@@ -1601,23 +1520,26 @@ where
self.check_permission(Some(tenant_id))?;
let lsn = if let Some(lsn_str) = params.get(2) {
let lsn = if params.len() >= 3 {
Some(
Lsn::from_str(lsn_str)
.with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
)
} else {
None
};
let gzip = match params.get(3) {
Some(&"--gzip") => true,
None => false,
Some(third_param) => {
let gzip = if params.len() >= 4 {
if params[3] == "--gzip" {
true
} else {
return Err(QueryError::Other(anyhow::anyhow!(
"Parameter in position 3 unknown {third_param}",
)))
"Parameter in position 3 unknown {}",
params[3],
)));
}
} else {
false
};
let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
@@ -1641,7 +1563,10 @@ where
res?;
}
// return pair of prev_lsn and last_lsn
else if let Some(params) = parts.strip_prefix(&["get_last_record_rlsn"]) {
else if query_string.starts_with("get_last_record_rlsn ") {
let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
if params.len() != 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for get_last_record_rlsn command"
@@ -1683,7 +1608,10 @@ where
.await?;
}
// same as basebackup, but result includes relational data as well
else if let Some(params) = parts.strip_prefix(&["fullbackup"]) {
else if query_string.starts_with("fullbackup ") {
let (_, params_raw) = query_string.split_at("fullbackup ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
if params.len() < 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for fullbackup command"
@@ -1700,18 +1628,18 @@ where
.record("timeline_id", field::display(timeline_id));
// The caller is responsible for providing correct lsn and prev_lsn.
let lsn = if let Some(lsn_str) = params.get(2) {
let lsn = if params.len() > 2 {
Some(
Lsn::from_str(lsn_str)
.with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
)
} else {
None
};
let prev_lsn = if let Some(prev_lsn_str) = params.get(3) {
let prev_lsn = if params.len() > 3 {
Some(
Lsn::from_str(prev_lsn_str)
.with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?,
Lsn::from_str(params[3])
.with_context(|| format!("Failed to parse Lsn from {}", params[3]))?,
)
} else {
None
@@ -1744,7 +1672,8 @@ where
// 2. Run:
// cat my_backup/base.tar | psql -h $PAGESERVER \
// -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION"
let params = &parts[2..];
let (_, params_raw) = query_string.split_at("import basebackup ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
if params.len() != 5 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for import basebackup command"
@@ -1793,7 +1722,8 @@ where
//
// Files are scheduled to be persisted to remote storage, and the
// caller should poll the http api to check when that is done.
let params = &parts[2..];
let (_, params_raw) = query_string.split_at("import wal ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
if params.len() != 4 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for import wal command"
@@ -1831,45 +1761,10 @@ where
// important because psycopg2 executes "SET datestyle TO 'ISO'"
// on connect
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else if query_string.starts_with("lease lsn ") {
let params = &parts[2..];
if params.len() != 3 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number {} for lease lsn command",
params.len()
)));
}
let tenant_shard_id = TenantShardId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
tracing::Span::current()
.record("tenant_id", field::display(tenant_shard_id))
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_shard_id.tenant_id))?;
// The caller is responsible for providing correct lsn.
let lsn = Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
match self
.handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
.await
{
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
Err(e) => {
error!("error obtaining lsn lease for {lsn}: {e:?}");
pgb.write_message_noflush(&BeMessage::ErrorResponse(
&e.to_string(),
Some(e.pg_error_code()),
))?
}
};
} else if let Some(params) = parts.strip_prefix(&["show"]) {
} else if query_string.starts_with("show ") {
// show <tenant_id>
let (_, params_raw) = query_string.split_at("show ".len());
let params = params_raw.split(' ').collect::<Vec<_>>();
if params.len() != 1 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for config command"
@@ -1883,10 +1778,12 @@ where
self.check_permission(Some(tenant_id))?;
let tenant = self
.tenant_manager
.get_active_tenant_with_timeout(
tenant_id,
ShardSelector::Zero,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)
.await?;
pgb.write_message_noflush(&BeMessage::RowDescription(&[

View File

@@ -9,6 +9,7 @@
use super::tenant::{PageReconstructError, Timeline};
use crate::context::RequestContext;
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::metrics::WAL_INGEST;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
use crate::walrecord::NeonWalRecord;
use crate::{aux_file, repository::*};
@@ -34,16 +35,12 @@ use std::ops::ControlFlow;
use std::ops::Range;
use strum::IntoEnumIterator;
use tokio_util::sync::CancellationToken;
use tracing::{debug, info, trace, warn};
use tracing::{debug, trace, warn};
use utils::bin_ser::DeserializeError;
use utils::vec_map::{VecMap, VecMapOrdering};
use utils::{bin_ser::BeSer, lsn::Lsn};
/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
pub const MAX_AUX_FILE_DELTAS: usize = 1024;
/// Max number of aux-file-related delta layers. The compaction will create a new image layer once this threshold is reached.
pub const MAX_AUX_FILE_V2_DELTAS: usize = 64;
const MAX_AUX_FILE_DELTAS: usize = 1024;
#[derive(Debug)]
pub enum LsnForTimestamp {
@@ -721,11 +718,10 @@ impl Timeline {
lsn: Lsn,
ctx: &RequestContext,
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
let current_policy = self.last_aux_file_policy.load();
match current_policy {
Some(AuxFilePolicy::V1) | None => self.list_aux_files_v1(lsn, ctx).await,
Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await,
Some(AuxFilePolicy::CrossValidation) => {
match self.get_switch_aux_file_policy() {
AuxFilePolicy::V1 => self.list_aux_files_v1(lsn, ctx).await,
AuxFilePolicy::V2 => self.list_aux_files_v2(lsn, ctx).await,
AuxFilePolicy::CrossValidation => {
let v1_result = self.list_aux_files_v1(lsn, ctx).await;
let v2_result = self.list_aux_files_v2(lsn, ctx).await;
match (v1_result, v2_result) {
@@ -1473,40 +1469,7 @@ impl<'a> DatadirModification<'a> {
content: &[u8],
ctx: &RequestContext,
) -> anyhow::Result<()> {
let switch_policy = self.tline.get_switch_aux_file_policy();
let policy = {
let current_policy = self.tline.last_aux_file_policy.load();
// Allowed switch path:
// * no aux files -> v1/v2/cross-validation
// * cross-validation->v2
let current_policy = if current_policy.is_none() {
// This path will only be hit once per tenant: we will decide the final policy in this code block.
// The next call to `put_file` will always have `last_aux_file_policy != None`.
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
let aux_files_key_v1 = self.tline.list_aux_files_v1(lsn, ctx).await?;
if aux_files_key_v1.is_empty() {
None
} else {
self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
Some(AuxFilePolicy::V1)
}
} else {
current_policy
};
if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) {
self.tline.do_switch_aux_policy(switch_policy)?;
info!(current=?current_policy, next=?switch_policy, "switching aux file policy");
switch_policy
} else {
// This branch handles non-valid migration path, and the case that switch_policy == current_policy.
// And actually, because the migration path always allow unspecified -> *, this unwrap_or will never be hit.
current_policy.unwrap_or(AuxFilePolicy::default_tenant_config())
}
};
let policy = self.tline.get_switch_aux_file_policy();
if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy {
let key = aux_file::encode_aux_file_key(path);
// retrieve the key from the engine
@@ -1714,6 +1677,8 @@ impl<'a> DatadirModification<'a> {
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
let mut writer = self.tline.writer().await;
let timer = WAL_INGEST.time_spent_on_ingest.start_timer();
let pending_nblocks = self.pending_nblocks;
self.pending_nblocks = 0;
@@ -1753,6 +1718,8 @@ impl<'a> DatadirModification<'a> {
writer.update_directory_entries_count(kind, count as u64);
}
timer.observe_duration();
Ok(())
}
@@ -1788,12 +1755,6 @@ impl<'a> DatadirModification<'a> {
self.tline.get(key, lsn, ctx).await
}
/// Test-only helper: forwards `key` and `val` unchanged to the private `put`, so unit tests
/// can force an arbitrary key into this modification.
#[cfg(test)]
pub(crate) fn put_for_test(&mut self, key: Key, val: Value) {
self.put(key, val);
}
fn put(&mut self, key: Key, val: Value) {
let values = self.pending_updates.entry(key).or_default();
// Replace the previous value if it exists at the same lsn

File diff suppressed because it is too large Load Diff

View File

@@ -238,13 +238,10 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
io_buf,
Err(Error::new(
ErrorKind::Other,
format!("blob too large ({len} bytes)"),
format!("blob too large ({} bytes)", len),
)),
);
}
if len > 0x0fff_ffff {
tracing::warn!("writing blob above future limit ({len} bytes)");
}
let mut len_buf = (len as u32).to_be_bytes();
len_buf[0] |= 0x80;
io_buf.extend_from_slice(&len_buf[..]);

View File

@@ -11,7 +11,6 @@
use anyhow::bail;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::models::CompactionAlgorithm;
use pageserver_api::models::CompactionAlgorithmSettings;
use pageserver_api::models::EvictionPolicy;
use pageserver_api::models::{self, ThrottleConfig};
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
@@ -321,7 +320,7 @@ pub struct TenantConf {
pub compaction_period: Duration,
// Level0 delta layer threshold for compaction.
pub compaction_threshold: usize,
pub compaction_algorithm: CompactionAlgorithmSettings,
pub compaction_algorithm: CompactionAlgorithm,
// Determines how much history is retained, to allow
// branching and read replicas at an older point in time.
// The unit is #of bytes of WAL.
@@ -374,8 +373,6 @@ pub struct TenantConf {
/// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into
/// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
/// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
/// file is written.
pub switch_aux_file_policy: AuxFilePolicy,
}
@@ -407,7 +404,7 @@ pub struct TenantConfOpt {
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
pub compaction_algorithm: Option<CompactionAlgorithm>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
@@ -498,9 +495,7 @@ impl TenantConfOpt {
.unwrap_or(global_conf.compaction_threshold),
compaction_algorithm: self
.compaction_algorithm
.as_ref()
.unwrap_or(&global_conf.compaction_algorithm)
.clone(),
.unwrap_or(global_conf.compaction_algorithm),
gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon),
gc_period: self.gc_period.unwrap_or(global_conf.gc_period),
image_creation_threshold: self
@@ -553,9 +548,7 @@ impl Default for TenantConf {
compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
.expect("cannot parse default compaction period"),
compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
compaction_algorithm: CompactionAlgorithmSettings {
kind: DEFAULT_COMPACTION_ALGORITHM,
},
compaction_algorithm: DEFAULT_COMPACTION_ALGORITHM,
gc_horizon: DEFAULT_GC_HORIZON,
gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
.expect("cannot parse default gc period"),
@@ -581,7 +574,7 @@ impl Default for TenantConf {
lazy_slru_download: false,
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
switch_aux_file_policy: AuxFilePolicy::default_tenant_config(),
switch_aux_file_policy: AuxFilePolicy::V1,
}
}
}

View File

@@ -181,23 +181,25 @@ async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), Del
async fn remove_tenant_remote_delete_mark(
conf: &PageServerConf,
remote_storage: &GenericRemoteStorage,
remote_storage: Option<&GenericRemoteStorage>,
tenant_shard_id: &TenantShardId,
cancel: &CancellationToken,
) -> Result<(), DeleteTenantError> {
let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
backoff::retry(
|| async { remote_storage.delete(&path, cancel).await },
TimeoutOrCancel::caused_by_cancel,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"remove_tenant_remote_delete_mark",
cancel,
)
.await
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
.and_then(|x| x)
.context("remove_tenant_remote_delete_mark")?;
if let Some(remote_storage) = remote_storage {
let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
backoff::retry(
|| async { remote_storage.delete(&path, cancel).await },
TimeoutOrCancel::caused_by_cancel,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"remove_tenant_remote_delete_mark",
cancel,
)
.await
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
.and_then(|x| x)
.context("remove_tenant_remote_delete_mark")?;
}
Ok(())
}
@@ -295,7 +297,7 @@ impl DeleteTenantFlow {
#[instrument(skip_all)]
pub(crate) async fn run(
conf: &'static PageServerConf,
remote_storage: GenericRemoteStorage,
remote_storage: Option<GenericRemoteStorage>,
tenants: &'static std::sync::RwLock<TenantsMap>,
tenant: Arc<Tenant>,
cancel: &CancellationToken,
@@ -306,7 +308,9 @@ impl DeleteTenantFlow {
let mut guard = Self::prepare(&tenant).await?;
if let Err(e) = Self::run_inner(&mut guard, conf, &remote_storage, &tenant, cancel).await {
if let Err(e) =
Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant, cancel).await
{
tenant.set_broken(format!("{e:#}")).await;
return Err(e);
}
@@ -323,7 +327,7 @@ impl DeleteTenantFlow {
async fn run_inner(
guard: &mut OwnedMutexGuard<Self>,
conf: &'static PageServerConf,
remote_storage: &GenericRemoteStorage,
remote_storage: Option<&GenericRemoteStorage>,
tenant: &Tenant,
cancel: &CancellationToken,
) -> Result<(), DeleteTenantError> {
@@ -335,9 +339,14 @@ impl DeleteTenantFlow {
))?
});
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
.await
.context("remote_mark")?;
// IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress) so won't contend.
// Though sounds scary, different mark name?
// Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
if let Some(remote_storage) = &remote_storage {
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
.await
.context("remote_mark")?
}
fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
Err(anyhow::anyhow!(
@@ -474,7 +483,7 @@ impl DeleteTenantFlow {
fn schedule_background(
guard: OwnedMutexGuard<Self>,
conf: &'static PageServerConf,
remote_storage: GenericRemoteStorage,
remote_storage: Option<GenericRemoteStorage>,
tenants: &'static std::sync::RwLock<TenantsMap>,
tenant: Arc<Tenant>,
) {
@@ -503,7 +512,7 @@ impl DeleteTenantFlow {
async fn background(
mut guard: OwnedMutexGuard<Self>,
conf: &PageServerConf,
remote_storage: GenericRemoteStorage,
remote_storage: Option<GenericRemoteStorage>,
tenants: &'static std::sync::RwLock<TenantsMap>,
tenant: &Arc<Tenant>,
) -> Result<(), DeleteTenantError> {
@@ -542,7 +551,7 @@ impl DeleteTenantFlow {
remove_tenant_remote_delete_mark(
conf,
&remote_storage,
remote_storage.as_ref(),
&tenant.tenant_shard_id,
&task_mgr::shutdown_token(),
)

View File

@@ -7,7 +7,7 @@ use itertools::Itertools;
use pageserver_api::key::Key;
use pageserver_api::models::LocationConfigMode;
use pageserver_api::shard::{
ShardCount, ShardIdentity, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId,
ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
};
use pageserver_api::upcall_api::ReAttachResponseTenant;
use rand::{distributions::Alphanumeric, Rng};
@@ -16,9 +16,10 @@ use std::cmp::Ordering;
use std::collections::{BTreeMap, HashMap};
use std::ops::Deref;
use std::sync::Arc;
use std::time::Duration;
use std::time::{Duration, Instant};
use sysinfo::SystemExt;
use tokio::fs;
use utils::timeout::{timeout_cancellable, TimeoutCancellableError};
use anyhow::Context;
use once_cell::sync::Lazy;
@@ -46,7 +47,7 @@ use crate::tenant::span::debug_assert_current_span_has_tenant_id;
use crate::tenant::storage_layer::inmemory_layer;
use crate::tenant::timeline::ShutdownMode;
use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};
use utils::crashsafe::path_with_suffix_extension;
use utils::fs_ext::PathExt;
@@ -127,8 +128,6 @@ pub(crate) enum ShardSelector {
First,
/// Pick the shard that holds this key
Page(Key),
/// The shard ID is known: pick the given shard
Known(ShardIndex),
}
/// A convenience for use with the re_attach ControlPlaneClient function: rather
@@ -172,7 +171,7 @@ impl TenantStartupMode {
}
/// Result type for looking up a TenantId to a specific shard
pub(crate) enum ShardResolveResult {
enum ShardResolveResult {
NotFound,
Found(Arc<Tenant>),
// Wait for this barrier, then query again
@@ -192,6 +191,71 @@ impl TenantsMap {
}
}
/// Resolve the bare `TenantId` sent by a page service client to one attached shard of
/// that tenant, chosen according to `selector`.
///
/// Only slots in Attached state are candidates. During shard splits the parent shards are
/// expected to be InProgress and are therefore skipped, while the child shards should appear
/// in Attached state and be matched instead.
///
/// Returns `Found` for the first attached slot that satisfies `selector`. If nothing matched
/// but at least one InProgress slot was seen (very early in a split, or any other transient
/// state), returns `InProgress` with that slot's barrier so the caller can wait and query
/// again. Otherwise, including while the map is still `Initializing`, returns `NotFound`.
fn resolve_attached_shard(
    &self,
    tenant_id: &TenantId,
    selector: ShardSelector,
) -> ShardResolveResult {
    let slots = match self {
        TenantsMap::Initializing => return ShardResolveResult::NotFound,
        TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
    };

    // Shard number that owns the requested key; computed lazily from the first attached
    // shard we encounter and reused for the remaining slots instead of re-hashing.
    let mut expected_shard = None;
    // Barrier of the most recently seen InProgress slot, if any.
    let mut pending_barrier = None;

    for (shard_id, slot) in slots.range(TenantShardId::tenant_range(*tenant_id)) {
        // Anything that isn't an attached tenant cannot be returned.
        let tenant = match slot {
            TenantSlot::Attached(attached) => attached,
            TenantSlot::InProgress(barrier) => {
                // A usable shard may still turn up later in the range; remember the barrier
                // so that a miss can be reported as "retry later" rather than NotFound.
                pending_barrier = Some(barrier.clone());
                continue;
            }
            _ => continue,
        };

        let is_match = match selector {
            ShardSelector::First => true,
            ShardSelector::Zero if shard_id.shard_number == ShardNumber(0) => true,
            ShardSelector::Page(key) => {
                let wanted = *expected_shard
                    .get_or_insert_with(|| tenant.shard_identity.get_shard_number(&key));
                tenant.shard_identity.number == wanted
            }
            _ => false,
        };

        if is_match {
            return ShardResolveResult::Found(tenant.clone());
        }
    }

    // No attached slot satisfied the selector: tell the caller to retry if something was
    // in progress, otherwise the requested shard simply isn't here.
    match pending_barrier {
        Some(barrier) => ShardResolveResult::InProgress(barrier),
        None => ShardResolveResult::NotFound,
    }
}
/// Only for use from DeleteTenantFlow. This method directly removes a TenantSlot from the map.
///
/// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded
@@ -356,17 +420,22 @@ async fn init_load_generations(
// deletion list entries may still be valid. We provide that by pushing a recovery operation into
// the queue. Sequential processing of the queue ensures that recovery is done before any new tenant deletions
// are processed, even though we don't block on recovery completing here.
let attached_tenants = generations
.iter()
.flat_map(|(id, start_mode)| {
match start_mode {
TenantStartupMode::Attached((_mode, generation)) => Some(generation),
TenantStartupMode::Secondary => None,
}
.map(|gen| (*id, *gen))
})
.collect();
resources.deletion_queue_client.recover(attached_tenants)?;
//
// Must only do this if remote storage is enabled, otherwise deletion queue
// is not running and channel push will fail.
if resources.remote_storage.is_some() {
let attached_tenants = generations
.iter()
.flat_map(|(id, start_mode)| {
match start_mode {
TenantStartupMode::Attached((_mode, generation)) => Some(generation),
TenantStartupMode::Secondary => None,
}
.map(|gen| (*id, *gen))
})
.collect();
resources.deletion_queue_client.recover(attached_tenants)?;
}
Ok(Some(generations))
}
@@ -420,6 +489,53 @@ fn load_tenant_config(
}
};
// Clean up legacy `metadata` files.
// Doing it here because every single tenant directory is visited here.
// In any later code, there's different treatment of tenant dirs
// ... depending on whether the tenant is in re-attach response or not
// ... depending on whether the tenant is ignored or not
assert_eq!(
&conf.tenant_path(&tenant_shard_id),
&tenant_dir_path,
"later use of conf....path() methods would be dubious"
);
let timelines: Vec<TimelineId> = match conf.timelines_path(&tenant_shard_id).read_dir_utf8() {
Ok(iter) => {
let mut timelines = Vec::new();
for res in iter {
let p = res?;
let Some(timeline_id) = p.file_name().parse::<TimelineId>().ok() else {
// skip any entries that aren't TimelineId, such as
// - *.___temp dirs
// - unfinished initdb uploads (test_non_uploaded_root_timeline_is_deleted_after_restart)
continue;
};
timelines.push(timeline_id);
}
timelines
}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => vec![],
Err(e) => return Err(anyhow::anyhow!(e)),
};
for timeline_id in timelines {
let timeline_path = &conf.timeline_path(&tenant_shard_id, &timeline_id);
let metadata_path = timeline_path.join(METADATA_FILE_NAME);
match std::fs::remove_file(&metadata_path) {
Ok(()) => {
crashsafe::fsync(timeline_path)
.context("fsync timeline dir after removing legacy metadata file")?;
info!("removed legacy metadata file at {metadata_path}");
}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
// something removed the file earlier, or it was never there
// We don't care, this software version doesn't write it again, so, we're good.
}
Err(e) => {
anyhow::bail!("remove legacy metadata file: {e}: {metadata_path}");
}
}
}
let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
if tenant_ignore_mark_file.exists() {
info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
@@ -524,7 +640,6 @@ pub async fn init_tenant_mgr(
TenantSlot::Attached(Tenant::create_broken_tenant(
conf,
tenant_shard_id,
resources.remote_storage.clone(),
format!("{}", e),
)),
);
@@ -717,7 +832,6 @@ fn tenant_spawn(
"Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
);
let remote_storage = resources.remote_storage.clone();
let tenant = match Tenant::spawn(
conf,
tenant_shard_id,
@@ -732,7 +846,7 @@ fn tenant_spawn(
Ok(tenant) => tenant,
Err(e) => {
error!("Failed to spawn tenant {tenant_shard_id}, reason: {e:#}");
Tenant::create_broken_tenant(conf, tenant_shard_id, remote_storage, format!("{e:#}"))
Tenant::create_broken_tenant(conf, tenant_shard_id, format!("{e:#}"))
}
};
@@ -1608,7 +1722,7 @@ impl TenantManager {
for child_shard_id in &child_shards {
let child_shard_id = *child_shard_id;
let child_shard = {
let locked = TENANTS.read().unwrap();
let locked = self.tenants.read().unwrap();
let peek_slot =
tenant_map_peek_slot(&locked, &child_shard_id, TenantSlotPeekMode::Read)?;
peek_slot.and_then(|s| s.get_attached()).cloned()
@@ -1841,13 +1955,7 @@ impl TenantManager {
deletion_queue_client: &DeletionQueueClient,
) -> Result<(), TenantStateError> {
let tmp_path = self
.detach_tenant0(
conf,
&TENANTS,
tenant_shard_id,
detach_ignored,
deletion_queue_client,
)
.detach_tenant0(conf, tenant_shard_id, detach_ignored, deletion_queue_client)
.await?;
spawn_background_purge(tmp_path);
@@ -1857,7 +1965,6 @@ impl TenantManager {
async fn detach_tenant0(
&self,
conf: &'static PageServerConf,
tenants: &std::sync::RwLock<TenantsMap>,
tenant_shard_id: TenantShardId,
detach_ignored: bool,
deletion_queue_client: &DeletionQueueClient,
@@ -1872,7 +1979,7 @@ impl TenantManager {
};
let removal_result = remove_tenant_from_memory(
tenants,
self.tenants,
tenant_shard_id,
tenant_dir_rename_operation(tenant_shard_id),
)
@@ -1908,7 +2015,7 @@ impl TenantManager {
pub(crate) fn list_tenants(
&self,
) -> Result<Vec<(TenantShardId, TenantState, Generation)>, TenantMapListError> {
let tenants = TENANTS.read().unwrap();
let tenants = self.tenants.read().unwrap();
let m = match &*tenants {
TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
@@ -2019,75 +2126,66 @@ impl TenantManager {
Ok(reparented)
}
/// A page service client sends a TenantId, and to look up the correct Tenant we must
/// resolve this to a fully qualified TenantShardId.
///
/// During shard splits: we shall see parent shards in InProgress state and skip them, and
/// instead match on child shards which should appear in Attached state. Very early in a shard
/// split, or in other cases where a shard is InProgress, we will return our own InProgress result
/// to instruct the caller to wait for that to finish before querying again.
pub(crate) fn resolve_attached_shard(
/// Get a [`Tenant`] in its active state. If the tenant_id is currently in [`TenantSlot::InProgress`]
/// state, then wait for up to `timeout`. If the [`Tenant`] is not currently in [`TenantState::Active`],
/// then wait for up to `timeout` (minus however long we waited for the slot).
pub(crate) async fn get_active_tenant_with_timeout(
&self,
tenant_id: &TenantId,
selector: ShardSelector,
) -> ShardResolveResult {
let tenants = self.tenants.read().unwrap();
let mut want_shard = None;
let mut any_in_progress = None;
tenant_id: TenantId,
shard_selector: ShardSelector,
timeout: Duration,
cancel: &CancellationToken,
) -> Result<Arc<Tenant>, GetActiveTenantError> {
let wait_start = Instant::now();
let deadline = wait_start + timeout;
match &*tenants {
TenantsMap::Initializing => ShardResolveResult::NotFound,
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
// Ignore all slots that don't contain an attached tenant
let tenant = match &slot.1 {
TenantSlot::Attached(t) => t,
TenantSlot::InProgress(barrier) => {
// We might still find a usable shard, but in case we don't, remember that
// we saw at least one InProgress slot, so that we can distinguish this case
// from a simple NotFound in our return value.
any_in_progress = Some(barrier.clone());
// Resolve TenantId to TenantShardId. This is usually a quick one-shot thing, the loop is
// for handling the rare case that the slot we're accessing is InProgress.
let tenant_shard = loop {
let resolved = {
let locked = self.tenants.read().unwrap();
locked.resolve_attached_shard(&tenant_id, shard_selector)
};
match resolved {
ShardResolveResult::Found(tenant_shard) => break tenant_shard,
ShardResolveResult::NotFound => {
return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound(
tenant_id,
)));
}
ShardResolveResult::InProgress(barrier) => {
// We can't authoritatively answer right now: wait for InProgress state
// to end, then try again
match timeout_cancellable(
deadline.duration_since(Instant::now()),
cancel,
barrier.wait(),
)
.await
{
Ok(_) => {
// The barrier completed: proceed around the loop to try looking up again
continue;
}
_ => continue,
};
match selector {
ShardSelector::First => return ShardResolveResult::Found(tenant.clone()),
ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
return ShardResolveResult::Found(tenant.clone())
Err(TimeoutCancellableError::Timeout) => {
return Err(GetActiveTenantError::WaitForActiveTimeout {
latest_state: None,
wait_time: timeout,
});
}
ShardSelector::Page(key) => {
// First slot we see for this tenant, calculate the expected shard number
// for the key: we will use this for checking if this and subsequent
// slots contain the key, rather than recalculating the hash each time.
if want_shard.is_none() {
want_shard = Some(tenant.shard_identity.get_shard_number(&key));
}
if Some(tenant.shard_identity.number) == want_shard {
return ShardResolveResult::Found(tenant.clone());
}
Err(TimeoutCancellableError::Cancelled) => {
return Err(GetActiveTenantError::Cancelled);
}
ShardSelector::Known(shard)
if tenant.shard_identity.shard_index() == shard =>
{
return ShardResolveResult::Found(tenant.clone());
}
_ => continue,
}
}
};
};
// Fall through: we didn't find a slot that was in Attached state & matched our selector. If
// we found one or more InProgress slot, indicate to caller that they should retry later. Otherwise
// this requested shard simply isn't found.
if let Some(barrier) = any_in_progress {
ShardResolveResult::InProgress(barrier)
} else {
ShardResolveResult::NotFound
}
}
}
tracing::debug!("Waiting for tenant to enter active state...");
tenant_shard
.wait_to_become_active(deadline.duration_since(Instant::now()))
.await?;
Ok(tenant_shard)
}
}
@@ -2163,7 +2261,7 @@ pub(crate) async fn load_tenant(
tenant_id: TenantId,
generation: Generation,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: GenericRemoteStorage,
remote_storage: Option<GenericRemoteStorage>,
deletion_queue_client: DeletionQueueClient,
ctx: &RequestContext,
) -> Result<(), TenantMapInsertError> {
@@ -2824,7 +2922,7 @@ pub(crate) async fn immediate_gc(
}
let timeline = tenant.get_timeline(timeline_id, false).ok();
let rtc = timeline.as_ref().map(|x| &x.remote_client);
let rtc = timeline.as_ref().and_then(|x| x.remote_client.as_ref());
if let Some(rtc) = rtc {
// layer drops schedule actions on remote timeline client to actually do the

View File

@@ -189,7 +189,6 @@ use camino::Utf8Path;
use chrono::{NaiveDateTime, Utc};
pub(crate) use download::download_initdb_tar_zst;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::shard::{ShardIndex, TenantShardId};
use scopeguard::ScopeGuard;
use tokio_util::sync::CancellationToken;
@@ -318,7 +317,7 @@ pub struct RemoteTimelineClient {
upload_queue: Mutex<UploadQueue>,
pub(crate) metrics: Arc<RemoteTimelineClientMetrics>,
metrics: Arc<RemoteTimelineClientMetrics>,
storage_impl: GenericRemoteStorage,
@@ -462,11 +461,11 @@ impl RemoteTimelineClient {
} else {
0
};
self.metrics.remote_physical_size_gauge.set(size);
self.metrics.remote_physical_size_set(size);
}
pub fn get_remote_physical_size(&self) -> u64 {
self.metrics.remote_physical_size_gauge.get()
self.metrics.remote_physical_size_get()
}
//
@@ -519,7 +518,6 @@ impl RemoteTimelineClient {
&self,
layer_file_name: &LayerName,
layer_metadata: &LayerFileMetadata,
local_path: &Utf8Path,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<u64> {
@@ -538,7 +536,6 @@ impl RemoteTimelineClient {
self.timeline_id,
layer_file_name,
layer_metadata,
local_path,
cancel,
ctx,
)
@@ -612,17 +609,6 @@ impl RemoteTimelineClient {
Ok(())
}
/// Launch an index-file upload operation in the background, with only aux_file_policy flag updated.
pub(crate) fn schedule_index_upload_for_aux_file_policy_update(
self: &Arc<Self>,
last_aux_file_policy: Option<AuxFilePolicy>,
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
upload_queue.last_aux_file_policy = last_aux_file_policy;
self.schedule_index_upload(upload_queue);
Ok(())
}
///
/// Launch an index-file upload operation in the background, if necessary.
///
@@ -1192,7 +1178,7 @@ impl RemoteTimelineClient {
&self.storage_impl,
uploaded.local_path(),
&remote_path,
uploaded.metadata().file_size,
uploaded.metadata().file_size(),
cancel,
)
.await
@@ -1573,7 +1559,7 @@ impl RemoteTimelineClient {
&self.storage_impl,
local_path,
&remote_path,
layer_metadata.file_size,
layer_metadata.file_size(),
&self.cancel,
)
.measure_remote_op(
@@ -1768,7 +1754,7 @@ impl RemoteTimelineClient {
UploadOp::UploadLayer(_, m) => (
RemoteOpFileKind::Layer,
RemoteOpKind::Upload,
RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size),
RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size()),
),
UploadOp::UploadMetadata(_, _) => (
RemoteOpFileKind::Index,
@@ -1863,7 +1849,6 @@ impl RemoteTimelineClient {
dangling_files: HashMap::default(),
shutting_down: false,
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
last_aux_file_policy: initialized.last_aux_file_policy,
};
let upload_queue = std::mem::replace(
@@ -2152,7 +2137,7 @@ mod tests {
tenant_ctx: _tenant_ctx,
} = test_setup;
let client = &timeline.remote_client;
let client = timeline.remote_client.as_ref().unwrap();
// Download back the index.json, and check that the list of files is correct
let initial_index_part = match client
@@ -2343,7 +2328,7 @@ mod tests {
timeline,
..
} = TestSetup::new("metrics").await.unwrap();
let client = &timeline.remote_client;
let client = timeline.remote_client.as_ref().unwrap();
let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
let local_path = local_layer_path(

View File

@@ -21,6 +21,7 @@ use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
use crate::tenant::storage_layer::layer::local_layer_path;
use crate::tenant::storage_layer::LayerName;
use crate::tenant::Generation;
use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
@@ -49,13 +50,19 @@ pub async fn download_layer_file<'a>(
timeline_id: TimelineId,
layer_file_name: &'a LayerName,
layer_metadata: &'a LayerFileMetadata,
local_path: &Utf8Path,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<u64, DownloadError> {
debug_assert_current_span_has_tenant_and_timeline_id();
let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id);
let local_path = local_layer_path(
conf,
&tenant_shard_id,
&timeline_id,
layer_file_name,
&layer_metadata.generation,
);
let remote_path = remote_layer_path(
&tenant_shard_id.tenant_id,
@@ -75,7 +82,7 @@ pub async fn download_layer_file<'a>(
// For more context about durable_rename check this email from postgres mailing list:
// https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
let temp_file_path = path_with_suffix_extension(local_path, TEMP_DOWNLOAD_EXTENSION);
let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
let bytes_amount = download_retry(
|| async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await },
@@ -84,7 +91,7 @@ pub async fn download_layer_file<'a>(
)
.await?;
let expected = layer_metadata.file_size;
let expected = layer_metadata.file_size();
if expected != bytes_amount {
return Err(DownloadError::Other(anyhow!(
"According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file {temp_file_path:?}",

View File

@@ -5,7 +5,6 @@
use std::collections::HashMap;
use chrono::NaiveDateTime;
use pageserver_api::models::AuxFilePolicy;
use serde::{Deserialize, Serialize};
use utils::id::TimelineId;
@@ -17,6 +16,46 @@ use pageserver_api::shard::ShardIndex;
use utils::lsn::Lsn;
/// Metadata gathered for each of the layer files.
///
/// This is the in-memory representation; [`IndexLayerMetadata`] is its serialized form and
/// the two are converted into each other via `From`. None of the fields here are `Option`s:
/// all three values are always present once a `LayerFileMetadata` has been constructed.
///
/// NOTE(review): an earlier revision of this comment stated that fields "have to be
/// `Option`s" to cope with remote [`IndexPart`]s written by different versions; that no
/// longer matches the field types below — confirm where missing values are defaulted.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
//#[cfg_attr(test, derive(Default))]
pub struct LayerFileMetadata {
// Size of the layer file in bytes; private, read through `file_size()`.
file_size: u64,
// Generation recorded for this layer file.
pub(crate) generation: Generation,
// Shard index recorded for this layer file.
pub(crate) shard: ShardIndex,
}
impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
fn from(other: &IndexLayerMetadata) -> Self {
LayerFileMetadata {
file_size: other.file_size,
generation: other.generation,
shard: other.shard,
}
}
}
impl LayerFileMetadata {
pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
LayerFileMetadata {
file_size,
generation,
shard,
}
}
pub fn file_size(&self) -> u64 {
self.file_size
}
}
// TODO seems like another part of the remote storage file format
// compatibility issue, see https://github.com/neondatabase/neon/issues/3072
/// In-memory representation of an `index_part.json` file
///
/// Contains the data about all files in the timeline, present remotely and its metadata.
@@ -37,7 +76,7 @@ pub struct IndexPart {
///
/// Older versions of `IndexPart` will not have this property or have only a part of metadata
/// that latest version stores.
pub layer_metadata: HashMap<LayerName, LayerFileMetadata>,
pub layer_metadata: HashMap<LayerName, IndexLayerMetadata>,
// 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
// It's duplicated for convenience when reading the serialized structure, but is
@@ -49,16 +88,6 @@ pub struct IndexPart {
#[serde(default)]
pub(crate) lineage: Lineage,
/// Describes the kind of aux files stored in the timeline.
///
/// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable.
/// A V1 setting after V2 files have been committed is not accepted.
///
/// None means no aux files have been written to the storage before the point
/// when this flag is introduced.
#[serde(skip_serializing_if = "Option::is_none", default)]
pub(crate) last_aux_file_policy: Option<AuxFilePolicy>,
}
impl IndexPart {
@@ -72,11 +101,10 @@ impl IndexPart {
/// is always generated from the keys of `layer_metadata`)
/// - 4: timeline_layers is fully removed.
/// - 5: lineage was added
/// - 6: last_aux_file_policy is added.
const LATEST_VERSION: usize = 6;
const LATEST_VERSION: usize = 5;
// Versions we may see when reading from a bucket.
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6];
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5];
pub const FILE_NAME: &'static str = "index_part.json";
@@ -85,9 +113,11 @@ impl IndexPart {
disk_consistent_lsn: Lsn,
metadata: TimelineMetadata,
lineage: Lineage,
last_aux_file_policy: Option<AuxFilePolicy>,
) -> Self {
let layer_metadata = layers_and_metadata.clone();
let layer_metadata = layers_and_metadata
.iter()
.map(|(k, v)| (k.to_owned(), IndexLayerMetadata::from(v)))
.collect();
Self {
version: Self::LATEST_VERSION,
@@ -96,7 +126,6 @@ impl IndexPart {
metadata,
deleted_at: None,
lineage,
last_aux_file_policy,
}
}
@@ -126,13 +155,8 @@ impl IndexPart {
example_metadata.disk_consistent_lsn(),
example_metadata,
Default::default(),
Some(AuxFilePolicy::V1),
)
}
pub(crate) fn last_aux_file_policy(&self) -> Option<AuxFilePolicy> {
self.last_aux_file_policy
}
}
impl From<&UploadQueueInitialized> for IndexPart {
@@ -141,22 +165,13 @@ impl From<&UploadQueueInitialized> for IndexPart {
let metadata = uq.latest_metadata.clone();
let lineage = uq.latest_lineage.clone();
Self::new(
&uq.latest_files,
disk_consistent_lsn,
metadata,
lineage,
uq.last_aux_file_policy,
)
Self::new(&uq.latest_files, disk_consistent_lsn, metadata, lineage)
}
}
/// Metadata gathered for each of the layer files.
///
/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
/// might have less or more metadata depending if upgrading or rolling back an upgrade.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub struct LayerFileMetadata {
/// Serialized form of [`LayerFileMetadata`].
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
pub struct IndexLayerMetadata {
pub file_size: u64,
#[serde(default = "Generation::none")]
@@ -168,12 +183,12 @@ pub struct LayerFileMetadata {
pub shard: ShardIndex,
}
impl LayerFileMetadata {
pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
LayerFileMetadata {
file_size,
generation,
shard,
impl From<&LayerFileMetadata> for IndexLayerMetadata {
fn from(other: &LayerFileMetadata) -> Self {
IndexLayerMetadata {
file_size: other.file_size,
generation: other.generation,
shard: other.shard,
}
}
}
@@ -267,12 +282,12 @@ mod tests {
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
version: 1,
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
generation: Generation::none(),
shard: ShardIndex::unsharded()
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
// serde_json should always parse this but this might be a double with jq for
// example.
file_size: 9007199254741001,
@@ -284,7 +299,6 @@ mod tests {
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: None,
lineage: Lineage::default(),
last_aux_file_policy: None,
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -309,12 +323,12 @@ mod tests {
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
version: 1,
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
generation: Generation::none(),
shard: ShardIndex::unsharded()
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
// serde_json should always parse this but this might be a double with jq for
// example.
file_size: 9007199254741001,
@@ -326,7 +340,6 @@ mod tests {
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: None,
lineage: Lineage::default(),
last_aux_file_policy: None,
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -352,12 +365,12 @@ mod tests {
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
version: 2,
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
generation: Generation::none(),
shard: ShardIndex::unsharded()
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
// serde_json should always parse this but this might be a double with jq for
// example.
file_size: 9007199254741001,
@@ -370,7 +383,6 @@ mod tests {
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
lineage: Lineage::default(),
last_aux_file_policy: None,
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -416,7 +428,6 @@ mod tests {
.unwrap(),
deleted_at: None,
lineage: Lineage::default(),
last_aux_file_policy: None,
};
let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap();
@@ -440,12 +451,12 @@ mod tests {
let expected = IndexPart {
version: 4,
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
generation: Generation::none(),
shard: ShardIndex::unsharded()
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
// serde_json should always parse this but this might be a double with jq for
// example.
file_size: 9007199254741001,
@@ -457,7 +468,6 @@ mod tests {
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
lineage: Lineage::default(),
last_aux_file_policy: None,
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -482,12 +492,12 @@ mod tests {
let expected = IndexPart {
version: 5,
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), LayerFileMetadata {
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), IndexLayerMetadata {
file_size: 23289856,
generation: Generation::new(1),
shard: ShardIndex::unsharded(),
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), LayerFileMetadata {
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), IndexLayerMetadata {
file_size: 1015808,
generation: Generation::new(1),
shard: ShardIndex::unsharded(),
@@ -501,57 +511,6 @@ mod tests {
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
},
last_aux_file_policy: None,
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
assert_eq!(part, expected);
}
#[test]
fn v6_indexpart_is_parsed() {
let example = r#"{
"version":6,
"layer_metadata":{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
},
"disk_consistent_lsn":"0/16960E8",
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
"deleted_at": "2023-07-31T09:00:00.123",
"lineage":{
"original_ancestor":["e2bfd8c633d713d279e6fcd2bcc15b6d","0/15A7618","2024-05-07T18:52:36.322426563"],
"reparenting_history":["e1bfd8c633d713d279e6fcd2bcc15b6d"]
},
"last_aux_file_policy": "V2"
}"#;
let expected = IndexPart {
version: 6,
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
file_size: 25600000,
generation: Generation::none(),
shard: ShardIndex::unsharded()
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
// serde_json should always parse this but this might be a double with jq for
// example.
file_size: 9007199254741001,
generation: Generation::none(),
shard: ShardIndex::unsharded()
})
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
lineage: Lineage {
reparenting_history_truncated: false,
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
},
last_aux_file_policy: Some(AuxFilePolicy::V2),
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();

View File

@@ -6,9 +6,11 @@ mod scheduler;
use std::{sync::Arc, time::SystemTime};
use crate::{
config::PageServerConf,
context::RequestContext,
disk_usage_eviction_task::DiskUsageEvictionInfo,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
virtual_file::MaybeFatalIo,
};
use self::{
@@ -19,8 +21,9 @@ use self::{
use super::{
config::{SecondaryLocationConfig, TenantConfOpt},
mgr::TenantManager,
remote_timeline_client::LayerFileMetadata,
span::debug_assert_current_span_has_tenant_id,
storage_layer::LayerName,
storage_layer::{layer::local_layer_path, LayerName},
};
use pageserver_api::{
@@ -175,7 +178,13 @@ impl SecondaryTenant {
/// Cancellation safe, but on cancellation the eviction will go through
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))]
pub(crate) async fn evict_layer(self: &Arc<Self>, timeline_id: TimelineId, name: LayerName) {
pub(crate) async fn evict_layer(
self: &Arc<Self>,
conf: &PageServerConf,
timeline_id: TimelineId,
name: LayerName,
metadata: LayerFileMetadata,
) {
debug_assert_current_span_has_tenant_id();
let guard = match self.gate.enter() {
@@ -188,11 +197,41 @@ impl SecondaryTenant {
let now = SystemTime::now();
let local_path = local_layer_path(
conf,
&self.tenant_shard_id,
&timeline_id,
&name,
&metadata.generation,
);
let this = self.clone();
// spawn it to be cancellation safe
tokio::task::spawn_blocking(move || {
let _guard = guard;
// We tolerate ENOENT, because between planning eviction and executing
// it, the secondary downloader could have seen an updated heatmap that
// resulted in a layer being deleted.
// Other local I/O errors are process-fatal: these should never happen.
let deleted = std::fs::remove_file(local_path);
let not_found = deleted
.as_ref()
.is_err_and(|x| x.kind() == std::io::ErrorKind::NotFound);
let deleted = if not_found {
false
} else {
deleted
.map(|()| true)
.fatal_err("Deleting layer during eviction")
};
if !deleted {
// skip updating accounting and putting perhaps later timestamp
return;
}
// Update the timeline's state. This does not have to be synchronized with
// the download process, because:
@@ -211,15 +250,8 @@ impl SecondaryTenant {
// of the cache.
let mut detail = this.detail.lock().unwrap();
if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
let removed = timeline_detail.on_disk_layers.remove(&name);
// We might race with removal of the same layer during downloads, if it was removed
// from the heatmap. If we see that the OnDiskState is gone, then no need to
// do a physical deletion or store in evicted_at.
if let Some(removed) = removed {
removed.remove_blocking();
timeline_detail.evicted_at.insert(name, now);
}
timeline_detail.on_disk_layers.remove(&name);
timeline_detail.evicted_at.insert(name, now);
}
})
.await

View File

@@ -26,7 +26,7 @@ use crate::{
tasks::{warn_when_period_overrun, BackgroundLoopKind},
},
virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile},
TEMP_FILE_SUFFIX,
METADATA_FILE_NAME, TEMP_FILE_SUFFIX,
};
use super::{
@@ -45,10 +45,10 @@ use crate::tenant::{
use camino::Utf8PathBuf;
use chrono::format::{DelayedFormat, StrftimeItems};
use futures::Future;
use futures::{Future, StreamExt};
use pageserver_api::models::SecondaryProgress;
use pageserver_api::shard::TenantShardId;
use remote_storage::{DownloadError, Etag, GenericRemoteStorage};
use remote_storage::{DownloadError, Etag, GenericRemoteStorage, RemoteStorageActivity};
use tokio_util::sync::CancellationToken;
use tracing::{info_span, instrument, warn, Instrument};
@@ -62,10 +62,20 @@ use super::{
CommandRequest, DownloadCommand,
};
/// For each tenant, default period for how long must have passed since the last download_tenant call before
/// calling it again. This default is replaced with the value of [`HeatMapTenant::upload_period_ms`] after first
/// download, if the uploader populated it.
const DEFAULT_DOWNLOAD_INTERVAL: Duration = Duration::from_millis(60000);
/// For each tenant, how long must have passed since the last download_tenant call before
/// calling it again. This is approximately the time by which local data is allowed
/// to fall behind remote data.
///
/// TODO: this should just be a default, and the actual period should be controlled
/// via the heatmap itself
/// `<https://github.com/neondatabase/neon/issues/6200>`
const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000);
/// Range of concurrency we may use when downloading layers within a timeline. This is independent
/// for each tenant we're downloading: the concurrency of _tenants_ is defined separately in
/// `PageServerConf::secondary_download_concurrency`
const MAX_LAYER_CONCURRENCY: usize = 16;
const MIN_LAYER_CONCURRENCY: usize = 1;
pub(super) async fn downloader_task(
tenant_manager: Arc<TenantManager>,
@@ -75,18 +85,19 @@ pub(super) async fn downloader_task(
cancel: CancellationToken,
root_ctx: RequestContext,
) {
let concurrency = tenant_manager.get_conf().secondary_download_concurrency;
// How many tenants' secondary download operations we will run concurrently
let tenant_concurrency = tenant_manager.get_conf().secondary_download_concurrency;
let generator = SecondaryDownloader {
tenant_manager,
remote_storage,
root_ctx,
};
let mut scheduler = Scheduler::new(generator, concurrency);
let mut scheduler = Scheduler::new(generator, tenant_concurrency);
scheduler
.run(command_queue, background_jobs_can_start, cancel)
.instrument(info_span!("secondary_download_scheduler"))
.instrument(info_span!("secondary_downloads"))
.await
}
@@ -100,7 +111,6 @@ struct SecondaryDownloader {
pub(super) struct OnDiskState {
metadata: LayerFileMetadata,
access_time: SystemTime,
local_path: Utf8PathBuf,
}
impl OnDiskState {
@@ -111,26 +121,12 @@ impl OnDiskState {
_ame: LayerName,
metadata: LayerFileMetadata,
access_time: SystemTime,
local_path: Utf8PathBuf,
) -> Self {
Self {
metadata,
access_time,
local_path,
}
}
// This is infallible, because all errors are either acceptable (ENOENT), or totally
// unexpected (fatal).
pub(super) fn remove_blocking(&self) {
// We tolerate ENOENT, because between planning eviction and executing
// it, the secondary downloader could have seen an updated heatmap that
// resulted in a layer being deleted.
// Other local I/O errors are process-fatal: these should never happen.
std::fs::remove_file(&self.local_path)
.or_else(fs_ext::ignore_not_found)
.fatal_err("Deleting secondary layer")
}
}
#[derive(Debug, Clone, Default)]
@@ -141,22 +137,14 @@ pub(super) struct SecondaryDetailTimeline {
pub(super) evicted_at: HashMap<LayerName, SystemTime>,
}
// Aspects of a heatmap that we remember after downloading it
#[derive(Clone, Debug)]
struct DownloadSummary {
etag: Etag,
#[allow(unused)]
mtime: SystemTime,
upload_period: Duration,
}
/// This state is written by the secondary downloader, it is opaque
/// to TenantManager
#[derive(Debug)]
pub(super) struct SecondaryDetail {
pub(super) config: SecondaryLocationConfig,
last_download: Option<DownloadSummary>,
last_download: Option<Instant>,
last_etag: Option<Etag>,
next_download: Option<Instant>,
pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
}
@@ -186,6 +174,7 @@ impl SecondaryDetail {
Self {
config,
last_download: None,
last_etag: None,
next_download: None,
timelines: HashMap::new(),
}
@@ -239,8 +228,9 @@ impl SecondaryDetail {
struct PendingDownload {
secondary_state: Arc<SecondaryTenant>,
last_download: Option<DownloadSummary>,
last_download: Option<Instant>,
target_time: Option<Instant>,
period: Option<Duration>,
}
impl scheduler::PendingJob for PendingDownload {
@@ -290,17 +280,10 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
tracing::debug!("Secondary tenant download completed");
// Update freshened_at even if there was an error: we don't want errored tenants to implicitly
// take priority to run again.
let mut detail = secondary_state.detail.lock().unwrap();
let period = detail
.last_download
.as_ref()
.map(|d| d.upload_period)
.unwrap_or(DEFAULT_DOWNLOAD_INTERVAL);
// We advance next_download irrespective of errors: we don't want error cases to result in
// expensive busy-polling.
detail.next_download = Some(Instant::now() + period_jitter(period, 5));
detail.next_download = Some(Instant::now() + period_jitter(DOWNLOAD_FRESHEN_INTERVAL, 5));
}
async fn schedule(&mut self) -> SchedulingResult<PendingDownload> {
@@ -333,11 +316,11 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
if detail.next_download.is_none() {
// Initialize randomly in the range from 0 to our interval: this uniformly spreads the start times. Subsequent
// rounds will use a smaller jitter to avoid accidentally synchronizing later.
detail.next_download = Some(now.checked_add(period_warmup(DEFAULT_DOWNLOAD_INTERVAL)).expect(
detail.next_download = Some(now.checked_add(period_warmup(DOWNLOAD_FRESHEN_INTERVAL)).expect(
"Using our constant, which is known to be small compared with clock range",
));
}
(detail.last_download.clone(), detail.next_download.unwrap())
(detail.last_download, detail.next_download.unwrap())
};
if now > next_download {
@@ -345,6 +328,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
secondary_state: secondary_tenant,
last_download,
target_time: Some(next_download),
period: Some(DOWNLOAD_FRESHEN_INTERVAL),
})
} else {
None
@@ -370,6 +354,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
Ok(PendingDownload {
target_time: None,
period: None,
last_download: None,
secondary_state: tenant,
})
@@ -386,6 +371,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
secondary_state,
last_download,
target_time,
period,
} = job;
let (completion, barrier) = utils::completion::channel();
@@ -407,7 +393,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
tracing::warn!("Insufficient space while downloading. Will retry later.");
}
Err(UpdateError::Cancelled) => {
tracing::info!("Shut down while downloading");
tracing::debug!("Shut down while downloading");
},
Err(UpdateError::Deserialize(e)) => {
tracing::error!("Corrupt content while downloading tenant: {e}");
@@ -422,15 +408,20 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
// If the job had a target execution time, we may check our final execution
// time against that for observability purposes.
if let (Some(target_time), Some(last_download)) = (target_time, last_download) {
// Elapsed time includes any scheduling lag as well as the execution of the job
let elapsed = Instant::now().duration_since(target_time);
if let (Some(target_time), Some(period)) = (target_time, period) {
// Only track execution lag if this isn't our first download: otherwise, it is expected
// that execution will have taken longer than our configured interval, for example
// when starting up a pageserver.
if last_download.is_some() {
// Elapsed time includes any scheduling lag as well as the execution of the job
let elapsed = Instant::now().duration_since(target_time);
warn_when_period_overrun(
elapsed,
last_download.upload_period,
BackgroundLoopKind::SecondaryDownload,
);
warn_when_period_overrun(
elapsed,
period,
BackgroundLoopKind::SecondaryDownload,
);
}
}
CompleteDownload {
@@ -519,12 +510,12 @@ impl<'a> TenantDownloader<'a> {
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
// We will use the etag from last successful download to make the download conditional on changes
let last_download = self
let last_etag = self
.secondary_state
.detail
.lock()
.unwrap()
.last_download
.last_etag
.clone();
// Download the tenant's heatmap
@@ -533,7 +524,7 @@ impl<'a> TenantDownloader<'a> {
etag: heatmap_etag,
bytes: heatmap_bytes,
} = match tokio::select!(
bytes = self.download_heatmap(last_download.as_ref().map(|d| &d.etag)) => {bytes?},
bytes = self.download_heatmap(last_etag.as_ref()) => {bytes?},
_ = self.secondary_state.cancel.cancelled() => return Ok(())
) {
HeatMapDownload::Unmodified => {
@@ -562,39 +553,6 @@ impl<'a> TenantDownloader<'a> {
heatmap.timelines.len()
);
// Get or initialize the local disk state for the timelines we will update
let mut timeline_states = HashMap::new();
for timeline in &heatmap.timelines {
let timeline_state = self
.secondary_state
.detail
.lock()
.unwrap()
.timelines
.get(&timeline.timeline_id)
.cloned();
let timeline_state = match timeline_state {
Some(t) => t,
None => {
// We have no existing state: need to scan local disk for layers first.
let timeline_state =
init_timeline_state(self.conf, tenant_shard_id, timeline).await;
// Re-acquire detail lock now that we're done with async load from local FS
self.secondary_state
.detail
.lock()
.unwrap()
.timelines
.insert(timeline.timeline_id, timeline_state.clone());
timeline_state
}
};
timeline_states.insert(timeline.timeline_id, timeline_state);
}
// Clean up any local layers that aren't in the heatmap. We do this first for all timelines, on the general
// principle that deletions should be done before writes wherever possible, and so that we can use this
// phase to initialize our SecondaryProgress.
@@ -605,10 +563,6 @@ impl<'a> TenantDownloader<'a> {
// Download the layers in the heatmap
for timeline in heatmap.timelines {
let timeline_state = timeline_states
.remove(&timeline.timeline_id)
.expect("Just populated above");
if self.secondary_state.cancel.is_cancelled() {
tracing::debug!(
"Cancelled before downloading timeline {}",
@@ -618,7 +572,7 @@ impl<'a> TenantDownloader<'a> {
}
let timeline_id = timeline.timeline_id;
self.download_timeline(timeline, timeline_state, ctx)
self.download_timeline(timeline, ctx)
.instrument(tracing::info_span!(
"secondary_download_timeline",
tenant_id=%tenant_shard_id.tenant_id,
@@ -630,30 +584,7 @@ impl<'a> TenantDownloader<'a> {
// Only update last_etag after a full successful download: this way we will not skip
// the next download, even if the heatmap's actual etag is unchanged.
self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary {
etag: heatmap_etag,
mtime: heatmap_mtime,
upload_period: heatmap
.upload_period_ms
.map(|ms| Duration::from_millis(ms as u64))
.unwrap_or(DEFAULT_DOWNLOAD_INTERVAL),
});
// Robustness: we should have updated progress properly, but in case we didn't, make sure
// we don't leave the tenant in a state where we claim to have successfully downloaded
// everything, but our progress is incomplete. The invariant here should be that if
// we have set `last_download` to this heatmap's etag, then the next time we see that
// etag we can safely do no work (i.e. we must be complete).
let mut progress = self.secondary_state.progress.lock().unwrap();
debug_assert!(progress.layers_downloaded == progress.layers_total);
debug_assert!(progress.bytes_downloaded == progress.bytes_total);
if progress.layers_downloaded != progress.layers_total
|| progress.bytes_downloaded != progress.bytes_total
{
tracing::warn!("Correcting drift in progress stats ({progress:?})");
progress.layers_downloaded = progress.layers_total;
progress.bytes_downloaded = progress.bytes_total;
}
self.secondary_state.detail.lock().unwrap().last_etag = Some(heatmap_etag);
Ok(())
}
@@ -709,7 +640,7 @@ impl<'a> TenantDownloader<'a> {
let mut layer_byte_count: u64 = timeline_state
.on_disk_layers
.values()
.map(|l| l.metadata.file_size)
.map(|l| l.metadata.file_size())
.sum();
// Remove on-disk layers that are no longer present in heatmap
@@ -720,7 +651,7 @@ impl<'a> TenantDownloader<'a> {
.get(layer_file_name)
.unwrap()
.metadata
.file_size;
.file_size();
let local_path = local_layer_path(
self.conf,
@@ -830,7 +761,6 @@ impl<'a> TenantDownloader<'a> {
async fn download_timeline(
&self,
timeline: HeatMapTimeline,
timeline_state: SecondaryDetailTimeline,
ctx: &RequestContext,
) -> Result<(), UpdateError> {
debug_assert_current_span_has_tenant_and_timeline_id();
@@ -839,8 +769,38 @@ impl<'a> TenantDownloader<'a> {
// Accumulate updates to the state
let mut touched = Vec::new();
// Clone a view of what layers already exist on disk
let timeline_state = self
.secondary_state
.detail
.lock()
.unwrap()
.timelines
.get(&timeline.timeline_id)
.cloned();
let timeline_state = match timeline_state {
Some(t) => t,
None => {
// We have no existing state: need to scan local disk for layers first.
let timeline_state =
init_timeline_state(self.conf, tenant_shard_id, &timeline).await;
// Re-acquire detail lock now that we're done with async load from local FS
self.secondary_state
.detail
.lock()
.unwrap()
.timelines
.insert(timeline.timeline_id, timeline_state.clone());
timeline_state
}
};
tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
let mut download_futs = Vec::new();
// Download heatmap layers that are not present on local disk, or update their
// access time if they are already present.
for layer in timeline.layers {
@@ -856,12 +816,20 @@ impl<'a> TenantDownloader<'a> {
if cfg!(debug_assertions) {
// Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
// are already present on disk are really there.
match tokio::fs::metadata(&on_disk.local_path).await {
let local_path = local_layer_path(
self.conf,
tenant_shard_id,
&timeline.timeline_id,
&layer.name,
&layer.metadata.generation,
);
match tokio::fs::metadata(&local_path).await {
Ok(meta) => {
tracing::debug!(
"Layer {} present at {}, size {}",
layer.name,
on_disk.local_path,
local_path,
meta.len(),
);
}
@@ -869,7 +837,7 @@ impl<'a> TenantDownloader<'a> {
tracing::warn!(
"Layer {} not found at {} ({})",
layer.name,
on_disk.local_path,
local_path,
e
);
debug_assert!(false);
@@ -877,7 +845,9 @@ impl<'a> TenantDownloader<'a> {
}
}
if on_disk.metadata != layer.metadata || on_disk.access_time != layer.access_time {
if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
|| on_disk.access_time != layer.access_time
{
// We already have this layer on disk. Update its access time.
tracing::debug!(
"Access time updated for layer {}: {} -> {}",
@@ -913,14 +883,31 @@ impl<'a> TenantDownloader<'a> {
}
}
match self
.download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx)
.await?
{
Some(layer) => touched.push(layer),
None => {
// Not an error but we didn't download it: remote layer is missing. Don't add it to the list of
// things to consider touched.
download_futs.push(self.download_layer(
tenant_shard_id,
&timeline.timeline_id,
layer,
ctx,
));
}
// Break up layer downloads into chunks, so that for each chunk we can re-check how much
// concurrency to use based on activity level of remote storage.
while !download_futs.is_empty() {
let chunk =
download_futs.split_off(download_futs.len().saturating_sub(MAX_LAYER_CONCURRENCY));
let concurrency = Self::layer_concurrency(self.remote_storage.activity());
let mut result_stream = futures::stream::iter(chunk).buffered(concurrency);
let mut result_stream = std::pin::pin!(result_stream);
while let Some(result) = result_stream.next().await {
match result {
Err(e) => return Err(e),
Ok(None) => {
// No error, but we didn't download the layer. Don't mark it touched
}
Ok(Some(layer)) => touched.push(layer),
}
}
}
@@ -939,21 +926,13 @@ impl<'a> TenantDownloader<'a> {
v.get_mut().access_time = t.access_time;
}
Entry::Vacant(e) => {
let local_path = local_layer_path(
self.conf,
tenant_shard_id,
&timeline.timeline_id,
&t.name,
&t.metadata.generation,
);
e.insert(OnDiskState::new(
self.conf,
tenant_shard_id,
&timeline.timeline_id,
t.name,
t.metadata.clone(),
LayerFileMetadata::from(&t.metadata),
t.access_time,
local_path,
));
}
}
@@ -976,28 +955,14 @@ impl<'a> TenantDownloader<'a> {
&self.secondary_state.cancel
);
let local_path = local_layer_path(
self.conf,
tenant_shard_id,
timeline_id,
&layer.name,
&layer.metadata.generation,
);
// Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
tracing::info!(
"Starting download of layer {}, size {}",
layer.name,
layer.metadata.file_size
);
let downloaded_bytes = match download_layer_file(
self.conf,
self.remote_storage,
*tenant_shard_id,
*timeline_id,
&layer.name,
&layer.metadata,
&local_path,
&LayerFileMetadata::from(&layer.metadata),
&self.secondary_state.cancel,
ctx,
)
@@ -1012,14 +977,6 @@ impl<'a> TenantDownloader<'a> {
"Skipped downloading missing layer {}, raced with compaction/gc?",
layer.name
);
// If the layer is 404, adjust the progress statistics to reflect that we will not download it.
let mut progress = self.secondary_state.progress.lock().unwrap();
progress.layers_total = progress.layers_total.saturating_sub(1);
progress.bytes_total = progress
.bytes_total
.saturating_sub(layer.metadata.file_size);
return Ok(None);
}
Err(e) => return Err(e.into()),
@@ -1055,6 +1012,19 @@ impl<'a> TenantDownloader<'a> {
Ok(Some(layer))
}
/// Calculate the currently allowed parallelism of layer download tasks, based on activity level of the remote storage.
///
/// `activity.read_available` / `activity.read_total` describe how many read units of the remote
/// storage are currently free. The result is always within
/// `MIN_LAYER_CONCURRENCY..=MAX_LAYER_CONCURRENCY`:
/// - at most 75% of units available: `MIN_LAYER_CONCURRENCY`
/// - above 75%: a linear mapping of the remaining 25% onto the concurrency range, never going
///   below `MIN_LAYER_CONCURRENCY`
fn layer_concurrency(activity: RemoteStorageActivity) -> usize {
    // Never treat more units as available than exist: this keeps the interpolation within range and
    // means a zero `read_total` falls through to the minimum rather than dividing by zero.
    let available = activity.read_available.min(activity.read_total);

    // When less than 75% of units are available, use minimum concurrency. Else, do a linear mapping
    // of our concurrency range to the units available within the remaining 25%.
    let clamp_at = (activity.read_total * 3) / 4;
    if available > clamp_at {
        let interpolated =
            (MAX_LAYER_CONCURRENCY * (available - clamp_at)) / (activity.read_total - clamp_at);

        // Integer division can truncate to zero just above the threshold (e.g. 76 of 100 units
        // available). A concurrency of zero would never start any download, so floor the result.
        interpolated.max(MIN_LAYER_CONCURRENCY)
    } else {
        MIN_LAYER_CONCURRENCY
    }
}
}
/// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline
@@ -1104,7 +1074,11 @@ async fn init_timeline_state(
.fatal_err(&format!("Read metadata on {}", file_path));
let file_name = file_path.file_name().expect("created it from the dentry");
if crate::is_temporary(&file_path)
if file_name == METADATA_FILE_NAME {
// Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
continue;
} else if crate::is_temporary(&file_path)
|| is_temp_download_file(&file_path)
|| is_ephemeral_file(file_name)
{
@@ -1144,9 +1118,8 @@ async fn init_timeline_state(
tenant_shard_id,
&heatmap.timeline_id,
name,
remote_meta.metadata.clone(),
LayerFileMetadata::from(&remote_meta.metadata),
remote_meta.access_time,
file_path,
),
);
}
@@ -1178,3 +1151,58 @@ async fn init_timeline_state(
detail
}
#[cfg(test)]
mod test {
    use super::*;

    /// Build an activity snapshot with the given read availability out of 16 read units.
    /// The write fields are fixed at 16 of 16, as in every case exercised below.
    fn read_activity(read_available: usize) -> RemoteStorageActivity {
        RemoteStorageActivity {
            read_available,
            read_total: 16,
            write_available: 16,
            write_total: 16,
        }
    }

    #[test]
    fn layer_concurrency() {
        // (read units available out of 16, expected layer download concurrency)
        let cases = [
            // Totally idle
            (16, MAX_LAYER_CONCURRENCY),
            // Totally busy
            (0, MIN_LAYER_CONCURRENCY),
            // Edge of the range at which we interpolate
            (12, MIN_LAYER_CONCURRENCY),
            // Midpoint of the range in which we interpolate
            (14, MAX_LAYER_CONCURRENCY / 2),
        ];

        for (read_available, expected) in cases {
            assert_eq!(
                TenantDownloader::layer_concurrency(read_activity(read_available)),
                expected
            );
        }
    }
}

View File

@@ -1,6 +1,6 @@
use std::time::SystemTime;
use crate::tenant::{remote_timeline_client::index::LayerFileMetadata, storage_layer::LayerName};
use crate::tenant::{remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerName};
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr, TimestampSeconds};
@@ -38,7 +38,7 @@ pub(crate) struct HeatMapTimeline {
#[derive(Serialize, Deserialize)]
pub(crate) struct HeatMapLayer {
pub(super) name: LayerName,
pub(super) metadata: LayerFileMetadata,
pub(super) metadata: IndexLayerMetadata,
#[serde_as(as = "TimestampSeconds<i64>")]
pub(super) access_time: SystemTime,
@@ -49,7 +49,7 @@ pub(crate) struct HeatMapLayer {
impl HeatMapLayer {
pub(crate) fn new(
name: LayerName,
metadata: LayerFileMetadata,
metadata: IndexLayerMetadata,
access_time: SystemTime,
) -> Self {
Self {

View File

@@ -53,7 +53,7 @@ pub(super) async fn heatmap_uploader_task(
scheduler
.run(command_queue, background_jobs_can_start, cancel)
.instrument(info_span!("heatmap_upload_scheduler"))
.instrument(info_span!("heatmap_uploader"))
.await
}

View File

@@ -179,13 +179,6 @@ where
// Schedule some work, if concurrency limit permits it
self.spawn_pending();
// This message is printed every scheduling iteration as proof of liveness when looking at logs
tracing::info!(
"Status: {} tasks running, {} pending",
self.running.len(),
self.pending.len()
);
// Between scheduling iterations, we will:
// - Drain any complete tasks and spawn pending tasks
// - Handle incoming administrative commands
@@ -265,11 +258,7 @@ where
self.tasks.spawn(fut);
let replaced = self.running.insert(tenant_shard_id, in_progress);
debug_assert!(replaced.is_none());
if replaced.is_some() {
tracing::warn!(%tenant_shard_id, "Unexpectedly spawned a task when one was already running")
}
self.running.insert(tenant_shard_id, in_progress);
}
/// For all pending tenants that are eligible for execution, spawn their task.
@@ -279,9 +268,7 @@ where
while !self.pending.is_empty() && self.running.len() < self.concurrency {
// unwrap: loop condition includes !is_empty()
let pending = self.pending.pop_front().unwrap();
if !self.running.contains_key(pending.get_tenant_shard_id()) {
self.do_spawn(pending);
}
self.do_spawn(pending);
}
}
@@ -334,8 +321,7 @@ where
let tenant_shard_id = job.get_tenant_shard_id();
let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
tracing::info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Command already running, waiting for it");
tracing::info!("Command already running, waiting for it");
barrier
} else {
let running = self.spawn_now(job);

View File

@@ -113,20 +113,12 @@ impl From<VectoredValueReconstructState> for ValueReconstructState {
}
}
/// Bag of data accumulated during a vectored get..
/// Bag of data accumulated during a vectored get
pub(crate) struct ValuesReconstructState {
/// The keys will be removed after `get_vectored` completes. The caller outside `Timeline`
/// should not expect to get anything from this hashmap.
pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,
/// The keys which are already retrieved
keys_done: KeySpaceRandomAccum,
/// The keys covered by the image layers
keys_with_image_coverage: Option<Range<Key>>,
// Statistics that are still accessible as a caller of `get_vectored_impl`.
layers_visited: u32,
delta_layers_visited: u32,
}
impl ValuesReconstructState {
@@ -134,9 +126,7 @@ impl ValuesReconstructState {
Self {
keys: HashMap::new(),
keys_done: KeySpaceRandomAccum::new(),
keys_with_image_coverage: None,
layers_visited: 0,
delta_layers_visited: 0,
}
}
@@ -150,17 +140,8 @@ impl ValuesReconstructState {
}
}
pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) {
pub(crate) fn on_layer_visited(&mut self) {
self.layers_visited += 1;
if let ReadableLayer::PersistentLayer(layer) = layer {
if layer.layer_desc().is_delta() {
self.delta_layers_visited += 1;
}
}
}
pub(crate) fn get_delta_layers_visited(&self) -> u32 {
self.delta_layers_visited
}
pub(crate) fn get_layers_visited(&self) -> u32 {
@@ -190,16 +171,6 @@ impl ValuesReconstructState {
}
}
/// On hitting image layer, we can mark all keys in this range as done, because
/// if the image layer does not contain a key, it is deleted/never added.
pub(crate) fn on_image_layer_visited(&mut self, key_range: &Range<Key>) {
let prev_val = self.keys_with_image_coverage.replace(key_range.clone());
assert_eq!(
prev_val, None,
"should consume the keyspace before the next iteration"
);
}
/// Update the state collected for a given key.
/// Returns true if this was the last value needed for the key and false otherwise.
///
@@ -262,12 +233,8 @@ impl ValuesReconstructState {
/// Returns the key space describing the keys that have
/// been marked as completed since the last call to this function.
/// Returns individual keys done, and the image layer coverage.
pub(crate) fn consume_done_keys(&mut self) -> (KeySpace, Option<Range<Key>>) {
(
self.keys_done.consume_keyspace(),
self.keys_with_image_coverage.take(),
)
pub(crate) fn consume_done_keys(&mut self) -> KeySpace {
self.keys_done.consume_keyspace()
}
}

View File

@@ -47,7 +47,7 @@ use hex;
use itertools::Itertools;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::LayerAccessKind;
use pageserver_api::shard::{ShardIdentity, TenantShardId};
use pageserver_api::shard::TenantShardId;
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
use std::fs::File;
@@ -158,7 +158,6 @@ pub struct ImageLayerInner {
index_start_blk: u32,
index_root_blk: u32,
key_range: Range<Key>,
lsn: Lsn,
file: VirtualFile,
@@ -420,7 +419,6 @@ impl ImageLayerInner {
file,
file_id,
max_vectored_read_bytes,
key_range: actual_summary.key_range,
}))
}
@@ -473,27 +471,19 @@ impl ImageLayerInner {
ctx: &RequestContext,
) -> Result<(), GetVectoredError> {
let reads = self
.plan_reads(keyspace, None, ctx)
.plan_reads(keyspace, ctx)
.await
.map_err(GetVectoredError::Other)?;
self.do_reads_and_update_state(reads, reconstruct_state, ctx)
.await;
reconstruct_state.on_image_layer_visited(&self.key_range);
Ok(())
}
/// Traverse the layer's index to build read operations on the overlap of the input keyspace
/// and the keys in this layer.
///
/// If shard_identity is provided, it will be used to filter keys down to those stored on
/// this shard.
async fn plan_reads(
&self,
keyspace: KeySpace,
shard_identity: Option<&ShardIdentity>,
ctx: &RequestContext,
) -> anyhow::Result<Vec<VectoredRead>> {
let mut planner = VectoredReadPlanner::new(
@@ -513,6 +503,7 @@ impl ImageLayerInner {
for range in keyspace.ranges.iter() {
let mut range_end_handled = false;
let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
range.start.write_to_byte_slice(&mut search_key);
@@ -525,22 +516,12 @@ impl ImageLayerInner {
let key = Key::from_slice(&raw_key[..KEY_SIZE]);
assert!(key >= range.start);
let flag = if let Some(shard_identity) = shard_identity {
if shard_identity.is_key_disposable(&key) {
BlobFlag::Ignore
} else {
BlobFlag::None
}
} else {
BlobFlag::None
};
if key >= range.end {
planner.handle_range_end(offset);
range_end_handled = true;
break;
} else {
planner.handle(key, self.lsn, offset, flag);
planner.handle(key, self.lsn, offset, BlobFlag::None);
}
}
@@ -553,50 +534,6 @@ impl ImageLayerInner {
Ok(planner.finish())
}
/// Given a key range, select the parts of that range that should be retained by the ShardIdentity,
/// then execute vectored GET operations, passing the results of all read keys into the writer.
pub(super) async fn filter(
&self,
shard_identity: &ShardIdentity,
writer: &mut ImageLayerWriter,
ctx: &RequestContext,
) -> anyhow::Result<usize> {
// Fragment the range into the regions owned by this ShardIdentity
let plan = self
.plan_reads(
KeySpace {
// If asked for the total key space, plan_reads will give us all the keys in the layer
ranges: vec![Key::MIN..Key::MAX],
},
Some(shard_identity),
ctx,
)
.await?;
let vectored_blob_reader = VectoredBlobReader::new(&self.file);
let mut key_count = 0;
for read in plan.into_iter() {
let buf_size = read.size();
let buf = BytesMut::with_capacity(buf_size);
let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?;
let frozen_buf = blobs_buf.buf.freeze();
for meta in blobs_buf.blobs.iter() {
let img_buf = frozen_buf.slice(meta.start..meta.end);
key_count += 1;
writer
.put_image(meta.meta.key, img_buf, ctx)
.await
.context(format!("Storing key {}", meta.meta.key))?;
}
}
Ok(key_count)
}
async fn do_reads_and_update_state(
&self,
reads: Vec<VectoredRead>,
@@ -709,7 +646,7 @@ impl ImageLayerWriterInner {
lsn,
},
);
trace!("creating image layer {}", path);
info!("new image layer {path}");
let mut file = {
VirtualFile::open_with_options(
&path,
@@ -829,7 +766,7 @@ impl ImageLayerWriterInner {
// FIXME: why not carry the virtualfile here, it supports renaming?
let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
info!("created image layer {}", layer.local_path());
trace!("created image layer {}", layer.local_path());
Ok(layer)
}
@@ -914,136 +851,3 @@ impl Drop for ImageLayerWriter {
}
}
}
#[cfg(test)]
mod test {
use bytes::Bytes;
use pageserver_api::{
key::Key,
shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize},
};
use utils::{id::TimelineId, lsn::Lsn};
use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION};
use super::ImageLayerWriter;
#[tokio::test]
async fn image_layer_rewrite() {
let harness = TenantHarness::create("test_image_layer_rewrite").unwrap();
let (tenant, ctx) = harness.load().await;
// The LSN at which we will create an image layer to filter
let lsn = Lsn(0xdeadbeef0000);
let timeline_id = TimelineId::generate();
let timeline = tenant
.create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx)
.await
.unwrap();
// This key range contains several 0x8000 page stripes, only one of which belongs to shard zero
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
let range = input_start..input_end;
// Build an image layer to filter
let resident = {
let mut writer = ImageLayerWriter::new(
harness.conf,
timeline_id,
harness.tenant_shard_id,
&range,
lsn,
&ctx,
)
.await
.unwrap();
let foo_img = Bytes::from_static(&[1, 2, 3, 4]);
let mut key = range.start;
while key < range.end {
writer.put_image(key, foo_img.clone(), &ctx).await.unwrap();
key = key.next();
}
writer.finish(&timeline, &ctx).await.unwrap()
};
let original_size = resident.metadata().file_size;
// Filter for various shards: this exercises cases like values at start of key range, end of key
// range, middle of key range.
for shard_number in 0..4 {
let mut filtered_writer = ImageLayerWriter::new(
harness.conf,
timeline_id,
harness.tenant_shard_id,
&range,
lsn,
&ctx,
)
.await
.unwrap();
// TenantHarness gave us an unsharded tenant, but we'll use a sharded ShardIdentity
// to exercise filter()
let shard_identity = ShardIdentity::new(
ShardNumber(shard_number),
ShardCount::new(4),
ShardStripeSize(0x8000),
)
.unwrap();
let wrote_keys = resident
.filter(&shard_identity, &mut filtered_writer, &ctx)
.await
.unwrap();
let replacement = if wrote_keys > 0 {
Some(filtered_writer.finish(&timeline, &ctx).await.unwrap())
} else {
None
};
// This exact size and those below will need updating as/when the layer encoding changes, but
// should be deterministic for a given version of the format, as we used no randomness generating the input.
assert_eq!(original_size, 1597440);
match shard_number {
0 => {
// We should have written out just one stripe for our shard identity
assert_eq!(wrote_keys, 0x8000);
let replacement = replacement.unwrap();
// We should have dropped some of the data
assert!(replacement.metadata().file_size < original_size);
assert!(replacement.metadata().file_size > 0);
// Assert that we dropped ~3/4 of the data.
assert_eq!(replacement.metadata().file_size, 417792);
}
1 => {
// Shard 1 has no keys in our input range
assert_eq!(wrote_keys, 0x0);
assert!(replacement.is_none());
}
2 => {
// Shard 2 has one stripes in the input range
assert_eq!(wrote_keys, 0x8000);
let replacement = replacement.unwrap();
assert!(replacement.metadata().file_size < original_size);
assert!(replacement.metadata().file_size > 0);
assert_eq!(replacement.metadata().file_size, 417792);
}
3 => {
// Shard 3 has two stripes in the input range
assert_eq!(wrote_keys, 0x10000);
let replacement = replacement.unwrap();
assert!(replacement.metadata().file_size < original_size);
assert!(replacement.metadata().file_size > 0);
assert_eq!(replacement.metadata().file_size, 811008);
}
_ => unreachable!(),
}
}
}
}

View File

@@ -4,7 +4,7 @@ use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::{
HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
};
use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId};
use pageserver_api::shard::{ShardIndex, TenantShardId};
use std::ops::Range;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::{Arc, Weak};
@@ -12,7 +12,7 @@ use std::time::{Duration, SystemTime};
use tracing::Instrument;
use utils::id::TimelineId;
use utils::lsn::Lsn;
use utils::sync::{gate, heavier_once_cell};
use utils::sync::heavier_once_cell;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
@@ -23,10 +23,10 @@ use crate::tenant::timeline::GetVectoredError;
use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};
use super::delta_layer::{self, DeltaEntry};
use super::image_layer::{self};
use super::image_layer;
use super::{
AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerName, PersistentLayerDesc,
ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
};
use utils::generation::Generation;
@@ -161,7 +161,7 @@ impl Layer {
timeline.tenant_shard_id,
timeline.timeline_id,
file_name,
metadata.file_size,
metadata.file_size(),
);
let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted);
@@ -194,7 +194,7 @@ impl Layer {
timeline.tenant_shard_id,
timeline.timeline_id,
file_name,
metadata.file_size,
metadata.file_size(),
);
let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident);
@@ -227,7 +227,7 @@ impl Layer {
timeline
.metrics
.resident_physical_size_add(metadata.file_size);
.resident_physical_size_add(metadata.file_size());
ResidentLayer { downloaded, owner }
}
@@ -585,6 +585,9 @@ struct LayerInner {
/// [`Timeline::gate`] at the same time.
timeline: Weak<Timeline>,
/// Cached knowledge of [`Timeline::remote_client`] being `Some`.
have_remote_client: bool,
access_stats: LayerAccessStats,
/// This custom OnceCell is backed by std mutex, but only held for short time periods.
@@ -729,23 +732,23 @@ impl Drop for LayerInner {
if removed {
timeline.metrics.resident_physical_size_sub(file_size);
}
let res = timeline
.remote_client
.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
if let Some(remote_client) = timeline.remote_client.as_ref() {
let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
if let Err(e) = res {
// test_timeline_deletion_with_files_stuck_in_upload_queue is good at
// demonstrating this deadlock (without spawn_blocking): stop will drop
// queued items, which will have ResidentLayer's, and those drops would try
// to re-entrantly lock the RemoteTimelineClient inner state.
if !timeline.is_active() {
tracing::info!("scheduling deletion on drop failed: {e:#}");
if let Err(e) = res {
// test_timeline_deletion_with_files_stuck_in_upload_queue is good at
// demonstrating this deadlock (without spawn_blocking): stop will drop
// queued items, which will have ResidentLayer's, and those drops would try
// to re-entrantly lock the RemoteTimelineClient inner state.
if !timeline.is_active() {
tracing::info!("scheduling deletion on drop failed: {e:#}");
} else {
tracing::warn!("scheduling deletion on drop failed: {e:#}");
}
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
} else {
tracing::warn!("scheduling deletion on drop failed: {e:#}");
LAYER_IMPL_METRICS.inc_completed_deletes();
}
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
} else {
LAYER_IMPL_METRICS.inc_completed_deletes();
}
});
}
@@ -783,6 +786,7 @@ impl LayerInner {
path: local_path,
desc,
timeline: Arc::downgrade(timeline),
have_remote_client: timeline.remote_client.is_some(),
access_stats,
wanted_deleted: AtomicBool::new(false),
inner,
@@ -811,6 +815,8 @@ impl LayerInner {
/// in a new attempt to evict OR join the previously started attempt.
#[tracing::instrument(level = tracing::Level::DEBUG, skip_all, ret, err(level = tracing::Level::DEBUG), fields(layer=%self))]
pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
assert!(self.have_remote_client);
let mut rx = self.status.as_ref().unwrap().subscribe();
{
@@ -967,6 +973,10 @@ impl LayerInner {
return Err(DownloadError::NotFile(ft));
}
if timeline.remote_client.as_ref().is_none() {
return Err(DownloadError::NoRemoteStorage);
}
if let Some(ctx) = ctx {
self.check_expected_download(ctx)?;
}
@@ -1103,12 +1113,15 @@ impl LayerInner {
permit: heavier_once_cell::InitPermit,
ctx: &RequestContext,
) -> anyhow::Result<Arc<DownloadedLayer>> {
let result = timeline
let client = timeline
.remote_client
.as_ref()
.expect("checked before download_init_and_wait");
let result = client
.download_layer_file(
&self.desc.layer_name(),
&self.metadata(),
&self.path,
&timeline.cancel,
ctx,
)
@@ -1264,7 +1277,6 @@ impl LayerInner {
lsn_end: lsn_range.end,
remote: !resident,
access_stats,
l0: crate::tenant::layer_map::LayerMap::is_l0(self.layer_desc()),
}
} else {
let lsn = self.desc.image_layer_lsn();
@@ -1281,10 +1293,20 @@ impl LayerInner {
/// `DownloadedLayer` is being dropped, so it calls this method.
fn on_downloaded_layer_drop(self: Arc<LayerInner>, only_version: usize) {
let can_evict = self.have_remote_client;
// we cannot know without inspecting LayerInner::inner if we should evict or not, even
// though here it is very likely
let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, version=%only_version);
if !can_evict {
// it would be nice to assert this case out, but we are in drop
span.in_scope(|| {
tracing::error!("bug in struct Layer: ResidentOrWantedEvicted has been downgraded while we have no remote storage");
});
return;
}
// NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might
// drop while the `self.inner` is being locked, leading to a deadlock.
@@ -1333,7 +1355,7 @@ impl LayerInner {
is_good_to_continue(&rx.borrow_and_update())?;
let Ok(gate) = timeline.gate.enter() else {
let Ok(_gate) = timeline.gate.enter() else {
return Err(EvictionCancelled::TimelineGone);
};
@@ -1421,7 +1443,7 @@ impl LayerInner {
Self::spawn_blocking(move || {
let _span = span.entered();
let res = self.evict_blocking(&timeline, &gate, &permit);
let res = self.evict_blocking(&timeline, &permit);
let waiters = self.inner.initializer_count();
@@ -1447,7 +1469,6 @@ impl LayerInner {
fn evict_blocking(
&self,
timeline: &Timeline,
_gate: &gate::GateGuard,
_permit: &heavier_once_cell::InitPermit,
) -> Result<(), EvictionCancelled> {
// now accesses to `self.inner.get_or_init*` wait on the semaphore or the `_permit`
@@ -1557,6 +1578,8 @@ pub(crate) enum EvictionError {
pub(crate) enum DownloadError {
#[error("timeline has already shutdown")]
TimelineShutdown,
#[error("no remote storage configured")]
NoRemoteStorage,
#[error("context denies downloading")]
ContextAndConfigReallyDeniesDownloads,
#[error("downloading is really required but not allowed by this method")]
@@ -1802,15 +1825,16 @@ impl ResidentLayer {
use LayerKind::*;
let owner = &self.owner.0;
match self.downloaded.get(owner, ctx).await? {
Delta(ref d) => {
// this is valid because the DownloadedLayer::kind is a OnceCell, not a
// Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
// while it's being held.
owner
.access_stats
.record_access(LayerAccessKind::KeyIter, ctx);
// this is valid because the DownloadedLayer::kind is a OnceCell, not a
// Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
// while it's being held.
delta_layer::DeltaLayerInner::load_keys(d, ctx)
.await
.with_context(|| format!("Layer index is corrupted for {self}"))
@@ -1819,23 +1843,6 @@ impl ResidentLayer {
}
}
/// Read all they keys in this layer which match the ShardIdentity, and write them all to
/// the provided writer. Return the number of keys written.
#[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))]
pub(crate) async fn filter<'a>(
&'a self,
shard_identity: &ShardIdentity,
writer: &mut ImageLayerWriter,
ctx: &RequestContext,
) -> anyhow::Result<usize> {
use LayerKind::*;
match self.downloaded.get(&self.owner.0, ctx).await? {
Delta(_) => anyhow::bail!(format!("cannot filter() on a delta layer {self}")),
Image(i) => i.filter(shard_identity, writer, ctx).await,
}
}
/// Returns the amount of keys and values written to the writer.
pub(crate) async fn copy_delta_prefix(
&self,

View File

@@ -145,7 +145,7 @@ async fn smoke_test() {
.await
.expect("the local layer file still exists");
let rtc = &timeline.remote_client;
let rtc = timeline.remote_client.as_ref().unwrap();
{
let layers = &[layer];
@@ -761,7 +761,13 @@ async fn eviction_cancellation_on_drop() {
timeline.freeze_and_flush().await.unwrap();
// wait for the upload to complete so our Arc::strong_count assertion holds
timeline.remote_client.wait_completion().await.unwrap();
timeline
.remote_client
.as_ref()
.unwrap()
.wait_completion()
.await
.unwrap();
let (evicted_layer, not_evicted) = {
let mut layers = {

View File

@@ -347,33 +347,37 @@ impl<'de> serde::de::Visitor<'de> for LayerNameVisitor {
mod test {
use super::*;
#[test]
fn image_layer_parse() {
fn image_layer_parse() -> anyhow::Result<()> {
let expected = LayerName::Image(ImageLayerName {
key_range: Key::from_i128(0)
..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
lsn: Lsn::from_hex("00000000014FED58").unwrap(),
});
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").unwrap();
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").map_err(|s| anyhow::anyhow!(s))?;
assert_eq!(parsed, expected,);
// Omitting generation suffix is valid
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").unwrap();
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").map_err(|s| anyhow::anyhow!(s))?;
assert_eq!(parsed, expected,);
Ok(())
}
#[test]
fn delta_layer_parse() {
fn delta_layer_parse() -> anyhow::Result<()> {
let expected = LayerName::Delta(DeltaLayerName {
key_range: Key::from_i128(0)
..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
lsn_range: Lsn::from_hex("00000000014FED58").unwrap()
..Lsn::from_hex("000000000154C481").unwrap(),
});
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").unwrap();
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").map_err(|s| anyhow::anyhow!(s))?;
assert_eq!(parsed, expected);
// Omitting generation suffix is valid
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").unwrap();
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").map_err(|s| anyhow::anyhow!(s))?;
assert_eq!(parsed, expected);
Ok(())
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -9,10 +9,7 @@ use std::ops::{Deref, Range};
use std::sync::Arc;
use super::layer_manager::LayerManager;
use super::{
CompactFlags, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
RecordedDuration, Timeline,
};
use super::{CompactFlags, DurationRecorder, ImageLayerCreationMode, RecordedDuration, Timeline};
use anyhow::{anyhow, Context};
use enumset::EnumSet;
@@ -25,13 +22,14 @@ use tracing::{debug, info, info_span, trace, warn, Instrument};
use utils::id::TimelineId;
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
use crate::page_cache;
use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome};
use crate::tenant::timeline::{drop_rlock, is_rel_fsm_block_key, is_rel_vm_block_key, Hole};
use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter};
use crate::tenant::timeline::{Layer, ResidentLayer};
use crate::tenant::DeltaLayer;
use crate::tenant::PageReconstructError;
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
use crate::{page_cache, ZERO_PAGE};
use crate::keyspace::KeySpace;
use crate::repository::Key;
@@ -118,13 +116,9 @@ impl Timeline {
// 3. Create new image layers for partitions that have been modified
// "enough".
let mut partitioning = dense_partitioning;
partitioning
.parts
.extend(sparse_partitioning.into_dense().parts);
let image_layers = self
let dense_layers = self
.create_image_layers(
&partitioning,
&dense_partitioning,
lsn,
if flags.contains(CompactFlags::ForceImageLayerCreation) {
ImageLayerCreationMode::Force
@@ -136,8 +130,24 @@ impl Timeline {
.await
.map_err(anyhow::Error::from)?;
self.upload_new_image_layers(image_layers)?;
partitioning.parts.len()
// For now, nothing will be produced...
let sparse_layers = self
.create_image_layers(
&sparse_partitioning.clone().into_dense(),
lsn,
if flags.contains(CompactFlags::ForceImageLayerCreation) {
ImageLayerCreationMode::Force
} else {
ImageLayerCreationMode::Try
},
&image_ctx,
)
.await
.map_err(anyhow::Error::from)?;
assert!(sparse_layers.is_empty());
self.upload_new_image_layers(dense_layers)?;
dense_partitioning.parts.len()
}
Err(err) => {
// no partitioning? This is normal, if the timeline was just created
@@ -176,24 +186,13 @@ impl Timeline {
async fn compact_shard_ancestors(
self: &Arc<Self>,
rewrite_max: usize,
ctx: &RequestContext,
_ctx: &RequestContext,
) -> anyhow::Result<()> {
let mut drop_layers = Vec::new();
let mut layers_to_rewrite: Vec<Layer> = Vec::new();
let layers_to_rewrite: Vec<Layer> = Vec::new();
// We will use the Lsn cutoff of the last GC as a threshold for rewriting layers: if a
// layer is behind this Lsn, it indicates that the layer is being retained beyond the
// pitr_interval, for example because a branchpoint references it.
//
// Holding this read guard also blocks [`Self::gc_timeline`] from entering while we
// are rewriting layers.
let latest_gc_cutoff = self.get_latest_gc_cutoff_lsn();
tracing::info!(
"latest_gc_cutoff: {}, pitr cutoff {}",
*latest_gc_cutoff,
self.gc_info.read().unwrap().cutoffs.pitr
);
// We will use the PITR cutoff as a condition for rewriting layers.
let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.pitr;
let layers = self.layers.read().await;
for layer_desc in layers.layer_map().iter_historic_layers() {
@@ -252,9 +251,9 @@ impl Timeline {
// Don't bother re-writing a layer if it is within the PITR window: it will age-out eventually
// without incurring the I/O cost of a rewrite.
if layer_desc.get_lsn_range().end >= *latest_gc_cutoff {
debug!(%layer, "Skipping rewrite of layer still in GC window ({} >= {})",
layer_desc.get_lsn_range().end, *latest_gc_cutoff);
if layer_desc.get_lsn_range().end >= pitr_cutoff {
debug!(%layer, "Skipping rewrite of layer still in PITR window ({} >= {})",
layer_desc.get_lsn_range().end, pitr_cutoff);
continue;
}
@@ -264,10 +263,13 @@ impl Timeline {
continue;
}
// Only rewrite layers if their generations differ. This guarantees:
// - that local rewrite is safe, as local layer paths will differ between existing layer and rewritten one
// - that the layer is persistent in remote storage, as we only see old-generation'd layer via loading from remote storage
if layer.metadata().generation == self.generation {
// Only rewrite layers if they would have different remote paths: either they belong to this
// shard but an old generation, or they belonged to another shard. This also implicitly
// guarantees that the layer is persistent in remote storage (as only remote persistent
// layers are carried across shard splits, any local-only layer would be in the current generation)
if layer.metadata().generation == self.generation
&& layer.metadata().shard.shard_count == self.shard_identity.count
{
debug!(%layer, "Skipping rewrite, is not from old generation");
continue;
}
@@ -280,77 +282,26 @@ impl Timeline {
}
// Fall through: all our conditions for doing a rewrite passed.
layers_to_rewrite.push(layer);
// TODO: implement rewriting
tracing::debug!(%layer, "Would rewrite layer");
}
// Drop read lock on layer map before we start doing time-consuming I/O
// Drop the layers read lock: we will acquire it for write in [`Self::rewrite_layers`]
drop(layers);
let mut replace_image_layers = Vec::new();
for layer in layers_to_rewrite {
tracing::info!(layer=%layer, "Rewriting layer after shard split...");
let mut image_layer_writer = ImageLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
&layer.layer_desc().key_range,
layer.layer_desc().image_layer_lsn(),
ctx,
)
.await?;
// Safety of layer rewrites:
// - We are writing to a different local file path than we are reading from, so the old Layer
// cannot interfere with the new one.
// - In the page cache, contents for a particular VirtualFile are stored with a file_id that
// is different for two layers with the same name (in `ImageLayerInner::new` we always
// acquire a fresh id from [`crate::page_cache::next_file_id`]. So readers do not risk
// reading the index from one layer file, and then data blocks from the rewritten layer file.
// - Any readers that have a reference to the old layer will keep it alive until they are done
// with it. If they are trying to promote from remote storage, that will fail, but this is the same
// as for compaction generally: compaction is allowed to delete layers that readers might be trying to use.
// - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are:
// - GC, which at worst witnesses us "undelete" a layer that they just deleted.
// - ingestion, which only inserts layers, therefore cannot collide with us.
let resident = layer.download_and_keep_resident().await?;
let keys_written = resident
.filter(&self.shard_identity, &mut image_layer_writer, ctx)
.await?;
if keys_written > 0 {
let new_layer = image_layer_writer.finish(self, ctx).await?;
tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
layer.metadata().file_size,
new_layer.metadata().file_size);
replace_image_layers.push((layer, new_layer));
} else {
// Drop the old layer. Usually for this case we would already have noticed that
// the layer has no data for us with the ShardedRange check above, but
drop_layers.push(layer);
}
}
// At this point, we have replaced local layer files with their rewritten form, but not yet uploaded
// metadata to reflect that. If we restart here, the replaced layer files will look invalid (size mismatch
// to remote index) and be removed. This is inefficient but safe.
fail::fail_point!("compact-shard-ancestors-localonly");
// TODO: collect layers to rewrite
let replace_layers = Vec::new();
// Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage
self.rewrite_layers(replace_image_layers, drop_layers)
.await?;
self.rewrite_layers(replace_layers, drop_layers).await?;
fail::fail_point!("compact-shard-ancestors-enqueued");
// We wait for all uploads to complete before finishing this compaction stage. This is not
// necessary for correctness, but it simplifies testing, and avoids proceeding with another
// Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
// load.
self.remote_client.wait_completion().await?;
fail::fail_point!("compact-shard-ancestors-persistent");
if let Some(remote_client) = self.remote_client.as_ref() {
// We wait for all uploads to complete before finishing this compaction stage. This is not
// necessary for correctness, but it simplifies testing, and avoids proceeding with another
// Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
// load.
remote_client.wait_completion().await?;
}
Ok(())
}
@@ -550,11 +501,8 @@ impl Timeline {
for &DeltaEntry { key: next_key, .. } in all_keys.iter() {
if let Some(prev_key) = prev {
// just first fast filter, do not create hole entries for metadata keys. The last hole in the
// compaction is the gap between data key and metadata keys.
if next_key.to_i128() - prev_key.to_i128() >= min_hole_range
&& !Key::is_metadata_key(&prev_key)
{
// just first fast filter
if next_key.to_i128() - prev_key.to_i128() >= min_hole_range {
let key_range = prev_key..next_key;
// Measuring hole by just subtraction of i128 representation of key range boundaries
// has not so much sense, because largest holes will corresponds field1/field2 changes.
@@ -1213,10 +1161,10 @@ impl TimelineAdaptor {
lsn: Lsn,
key_range: &Range<Key>,
ctx: &RequestContext,
) -> Result<(), CreateImageLayersError> {
) -> Result<(), PageReconstructError> {
let timer = self.timeline.metrics.create_images_time_histo.start_timer();
let image_layer_writer = ImageLayerWriter::new(
let mut image_layer_writer = ImageLayerWriter::new(
self.timeline.conf,
self.timeline.timeline_id,
self.timeline.tenant_shard_id,
@@ -1227,34 +1175,47 @@ impl TimelineAdaptor {
.await?;
fail_point!("image-layer-writer-fail-before-finish", |_| {
Err(CreateImageLayersError::Other(anyhow::anyhow!(
Err(PageReconstructError::Other(anyhow::anyhow!(
"failpoint image-layer-writer-fail-before-finish"
)))
});
let keyspace = KeySpace {
ranges: self.get_keyspace(key_range, lsn, ctx).await?,
};
// TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly
let start = Key::MIN;
let ImageLayerCreationOutcome {
image,
next_start_key: _,
} = self
.timeline
.create_image_layer_for_rel_blocks(
&keyspace,
image_layer_writer,
lsn,
ctx,
key_range.clone(),
start,
)
.await?;
if let Some(image_layer) = image {
self.new_images.push(image_layer);
let keyspace_ranges = self.get_keyspace(key_range, lsn, ctx).await?;
for range in &keyspace_ranges {
let mut key = range.start;
while key < range.end {
let img = match self.timeline.get(key, lsn, ctx).await {
Ok(img) => img,
Err(err) => {
// If we fail to reconstruct a VM or FSM page, we can zero the
// page without losing any actual user data. That seems better
// than failing repeatedly and getting stuck.
//
// We had a bug at one point, where we truncated the FSM and VM
// in the pageserver, but the Postgres didn't know about that
// and continued to generate incremental WAL records for pages
// that didn't exist in the pageserver. Trying to replay those
// WAL records failed to find the previous image of the page.
// This special case allows us to recover from that situation.
// See https://github.com/neondatabase/neon/issues/2601.
//
// Unfortunately we cannot do this for the main fork, or for
// any metadata keys, keys, as that would lead to actual data
// loss.
if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) {
warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}");
ZERO_PAGE.clone()
} else {
return Err(err);
}
}
};
image_layer_writer.put_image(key, img, ctx).await?;
key = key.next();
}
}
let image_layer = image_layer_writer.finish(&self.timeline, ctx).await?;
self.new_images.push(image_layer);
timer.stop_and_record();

View File

@@ -26,21 +26,19 @@ use super::{Timeline, TimelineResources};
/// during attach or pageserver restart.
/// See comment in persist_index_part_with_deleted_flag.
async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
match timeline
.remote_client
.persist_index_part_with_deleted_flag()
.await
{
// If we (now, or already) marked it successfully as deleted, we can proceed
Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
// Bail out otherwise
//
// AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
// two tasks from performing the deletion at the same time. The first task
// that starts deletion should run it to completion.
Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
| Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
if let Some(remote_client) = timeline.remote_client.as_ref() {
match remote_client.persist_index_part_with_deleted_flag().await {
// If we (now, or already) marked it successfully as deleted, we can proceed
Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
// Bail out otherwise
//
// AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
// two tasks from performing the deletion at the same time. The first task
// that starts deletion should run it to completion.
Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
| Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
}
}
}
Ok(())
@@ -119,11 +117,11 @@ pub(super) async fn delete_local_timeline_directory(
/// Removes remote layers and an index file after them.
async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> {
timeline
.remote_client
.delete_all()
.await
.context("delete_all")
if let Some(remote_client) = &timeline.remote_client {
remote_client.delete_all().await.context("delete_all")?
};
Ok(())
}
// This function removes remaining traces of a timeline on disk.
@@ -262,7 +260,7 @@ impl DeleteTimelineFlow {
tenant: Arc<Tenant>,
timeline_id: TimelineId,
local_metadata: &TimelineMetadata,
remote_client: RemoteTimelineClient,
remote_client: Option<RemoteTimelineClient>,
deletion_queue_client: DeletionQueueClient,
) -> anyhow::Result<()> {
// Note: here we even skip populating layer map. Timeline is essentially uninitialized.
@@ -280,8 +278,6 @@ impl DeleteTimelineFlow {
// Important. We dont pass ancestor above because it can be missing.
// Thus we need to skip the validation here.
CreateTimelineCause::Delete,
// Aux file policy is not needed for deletion, assuming deletion does not read aux keyspace
None,
)
.context("create_timeline_struct")?;

View File

@@ -12,7 +12,7 @@ use crate::{
};
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn};
use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn};
#[derive(Debug, thiserror::Error)]
pub(crate) enum Error {
@@ -41,27 +41,6 @@ pub(crate) enum Error {
Unexpected(#[source] anyhow::Error),
}
/// Converts a timeline ancestor-detach [`Error`] into the [`ApiError`] returned by the HTTP
/// layer.
///
/// Mapping, as written below:
/// - `NoAncestor` becomes `ApiError::Conflict` carrying the error's display text.
/// - `TooManyAncestors` becomes `ApiError::BadRequest`.
/// - `ShuttingDown` becomes `ApiError::ShuttingDown`.
/// - `OtherTimelineDetachOngoing` becomes `ApiError::ResourceUnavailable` with a fixed message;
///   the variant's payload is dropped.
/// - Every remaining variant becomes `ApiError::InternalServerError`.
impl From<Error> for ApiError {
fn from(value: Error) -> Self {
match value {
e @ Error::NoAncestor => ApiError::Conflict(e.to_string()),
// TODO: ApiError converts the anyhow using debug formatting ... just stop using ApiError?
// The error is re-wrapped through its Display output ("{}") before being handed to anyhow.
e @ Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", e)),
Error::ShuttingDown => ApiError::ShuttingDown,
Error::OtherTimelineDetachOngoing(_) => {
ApiError::ResourceUnavailable("other timeline detach is already ongoing".into())
}
// All of these contain shutdown errors, in fact, it's the most common
e @ Error::FlushAncestor(_)
| e @ Error::RewrittenDeltaDownloadFailed(_)
| e @ Error::CopyDeltaPrefix(_)
| e @ Error::UploadRewritten(_)
| e @ Error::CopyFailed(_)
| e @ Error::Unexpected(_) => ApiError::InternalServerError(e.into()),
}
}
}
/// Result of `prepare`, consumed by `complete`.
pub(crate) struct PreparedTimelineDetach {
// Layers that `complete` schedules for addition to the detached timeline's remote index.
layers: Vec<Layer>,
}
@@ -77,7 +56,7 @@ impl Default for Options {
fn default() -> Self {
Self {
rewrite_concurrency: std::num::NonZeroUsize::new(2).unwrap(),
copy_concurrency: std::num::NonZeroUsize::new(100).unwrap(),
copy_concurrency: std::num::NonZeroUsize::new(10).unwrap(),
}
}
}
@@ -91,16 +70,15 @@ pub(super) async fn prepare(
) -> Result<(completion::Completion, PreparedTimelineDetach), Error> {
use Error::*;
if detached.remote_client.as_ref().is_none() {
unimplemented!("no new code for running without remote storage");
}
let Some((ancestor, ancestor_lsn)) = detached
.ancestor_timeline
.as_ref()
.map(|tl| (tl.clone(), detached.ancestor_lsn))
else {
// TODO: check if we have already been detached; for this we need to read the stored data
// on remote client, for that we need a follow-up which makes uploads cheaper and maintains
// a projection of the committed data.
//
// the error is wrong per openapi
return Err(NoAncestor);
};
@@ -110,7 +88,7 @@ pub(super) async fn prepare(
if ancestor.ancestor_timeline.is_some() {
// non-technical requirement; we could flatten N ancestors just as easily but we chose
// not to, at least initially
// not to
return Err(TooManyAncestors);
}
@@ -337,6 +315,8 @@ async fn upload_rewritten_layer(
// FIXME: better shuttingdown error
target
.remote_client
.as_ref()
.unwrap()
.upload_layer_file(&copied, cancel)
.await
.map_err(UploadRewritten)?;
@@ -426,6 +406,8 @@ async fn remote_copy(
// FIXME: better shuttingdown error
adoptee
.remote_client
.as_ref()
.unwrap()
.copy_timeline_layer(adopted, &owned, cancel)
.await
.map(move |()| owned)
@@ -439,6 +421,11 @@ pub(super) async fn complete(
prepared: PreparedTimelineDetach,
_ctx: &RequestContext,
) -> Result<Vec<TimelineId>, anyhow::Error> {
let rtc = detached
.remote_client
.as_ref()
.expect("has to have a remote timeline client for timeline ancestor detach");
let PreparedTimelineDetach { layers } = prepared;
let ancestor = detached
@@ -455,13 +442,11 @@ pub(super) async fn complete(
//
// this is not perfect, but it avoids us a retry happening after a compaction or gc on restart
// which could give us a completely wrong layer combination.
detached
.remote_client
.schedule_adding_existing_layers_to_index_detach_and_wait(
&layers,
(ancestor.timeline_id, ancestor_lsn),
)
.await?;
rtc.schedule_adding_existing_layers_to_index_detach_and_wait(
&layers,
(ancestor.timeline_id, ancestor_lsn),
)
.await?;
let mut tasks = tokio::task::JoinSet::new();
@@ -506,6 +491,8 @@ pub(super) async fn complete(
async move {
let res = timeline
.remote_client
.as_ref()
.expect("reparented has to have remote client because detached has one")
.schedule_reparenting_and_wait(&new_parent)
.await;

View File

@@ -23,7 +23,7 @@ use std::{
use pageserver_api::models::{EvictionPolicy, EvictionPolicyLayerAccessThreshold};
use tokio::time::Instant;
use tokio_util::sync::CancellationToken;
use tracing::{debug, info, info_span, instrument, warn, Instrument};
use tracing::{debug, error, info, info_span, instrument, warn, Instrument};
use crate::{
context::{DownloadBehavior, RequestContext},
@@ -211,6 +211,11 @@ impl Timeline {
// So, we just need to deal with this.
if self.remote_client.is_none() {
error!("no remote storage configured, cannot evict layers");
return ControlFlow::Continue(());
}
let mut js = tokio::task::JoinSet::new();
{
let guard = self.layers.read().await;

View File

@@ -7,28 +7,30 @@ use crate::{
index::{IndexPart, LayerFileMetadata},
},
storage_layer::LayerName,
Generation,
},
METADATA_FILE_NAME,
};
use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
use std::{
collections::{hash_map, HashMap},
str::FromStr,
};
use pageserver_api::shard::ShardIndex;
use std::{collections::HashMap, str::FromStr};
use utils::lsn::Lsn;
/// Identified files in the timeline directory.
pub(super) enum Discovered {
/// The only one we care about
Layer(LayerName, LocalLayerFileMetadata),
Layer(LayerName, Utf8PathBuf, u64),
/// Old ephemeral files from previous launches, should be removed
Ephemeral(String),
/// Old temporary timeline files, unsure what these really are, should be removed
Temporary(String),
/// Temporary on-demand download files, should be removed
TemporaryDownload(String),
/// "metadata" file we persist locally and include in `index_part.json`
Metadata,
/// Backup file from previously future layers
IgnoredBackup(Utf8PathBuf),
IgnoredBackup,
/// Unrecognized, warn about these
Unknown(String),
}
@@ -44,15 +46,14 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
let discovered = match LayerName::from_str(&file_name) {
Ok(file_name) => {
let file_size = direntry.metadata()?.len();
Discovered::Layer(
file_name,
LocalLayerFileMetadata::new(direntry.path().to_owned(), file_size),
)
Discovered::Layer(file_name, direntry.path().to_owned(), file_size)
}
Err(_) => {
if file_name.ends_with(".old") {
if file_name == METADATA_FILE_NAME {
Discovered::Metadata
} else if file_name.ends_with(".old") {
// ignore these
Discovered::IgnoredBackup(direntry.path().to_owned())
Discovered::IgnoredBackup
} else if remote_timeline_client::is_temp_download_file(direntry.path()) {
Discovered::TemporaryDownload(file_name)
} else if is_ephemeral_file(&file_name) {
@@ -75,32 +76,37 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
/// this structure extends it with metadata describing the layer's presence in local storage.
#[derive(Clone, Debug)]
pub(super) struct LocalLayerFileMetadata {
pub(super) file_size: u64,
pub(super) metadata: LayerFileMetadata,
pub(super) local_path: Utf8PathBuf,
}
impl LocalLayerFileMetadata {
pub fn new(local_path: Utf8PathBuf, file_size: u64) -> Self {
pub fn new(
local_path: Utf8PathBuf,
file_size: u64,
generation: Generation,
shard: ShardIndex,
) -> Self {
Self {
local_path,
file_size,
metadata: LayerFileMetadata::new(file_size, generation, shard),
}
}
}
/// For a layer that is present in remote metadata, this type describes how to handle
/// it during startup: it is either Resident (and we have some metadata about a local file),
/// or it is Evicted (and we only have remote metadata).
/// Decision on what to do with a layer file after considering its local and remote metadata.
#[derive(Clone, Debug)]
pub(super) enum Decision {
/// The layer is not present locally.
Evicted(LayerFileMetadata),
/// The layer is present locally, and metadata matches: we may hook up this layer to the
/// existing file in local storage.
Resident {
/// The layer is present locally, but local metadata does not match remote; we must
/// delete it and treat it as evicted.
UseRemote {
local: LocalLayerFileMetadata,
remote: LayerFileMetadata,
},
/// The layer is present locally, and metadata matches.
UseLocal(LocalLayerFileMetadata),
}
/// A layer needs to be left out of the layer map.
@@ -116,81 +122,77 @@ pub(super) enum DismissedLayer {
/// In order to make crash safe updates to layer map, we must dismiss layers which are only
/// found locally or not yet included in the remote `index_part.json`.
LocalOnly(LocalLayerFileMetadata),
/// The layer exists in remote storage but the local layer's metadata (e.g. file size)
/// does not match it
BadMetadata(LocalLayerFileMetadata),
}
/// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
pub(super) fn reconcile(
local_layers: Vec<(LayerName, LocalLayerFileMetadata)>,
discovered: Vec<(LayerName, Utf8PathBuf, u64)>,
index_part: Option<&IndexPart>,
disk_consistent_lsn: Lsn,
generation: Generation,
shard: ShardIndex,
) -> Vec<(LayerName, Result<Decision, DismissedLayer>)> {
let Some(index_part) = index_part else {
// If we have no remote metadata, no local layer files are considered valid to load
return local_layers
.into_iter()
.map(|(layer_name, local_metadata)| {
(layer_name, Err(DismissedLayer::LocalOnly(local_metadata)))
})
.collect();
};
use Decision::*;
let mut result = Vec::new();
// name => (local_metadata, remote_metadata)
type Collected =
HashMap<LayerName, (Option<LocalLayerFileMetadata>, Option<LayerFileMetadata>)>;
let mut remote_layers = HashMap::new();
let mut discovered = discovered
.into_iter()
.map(|(layer_name, local_path, file_size)| {
(
layer_name,
// The generation and shard here will be corrected to match IndexPart in the merge below, unless
// it is not in IndexPart, in which case using our current generation makes sense
// because it will be uploaded in this generation.
(
Some(LocalLayerFileMetadata::new(
local_path, file_size, generation, shard,
)),
None,
),
)
})
.collect::<Collected>();
// Construct Decisions for layers that are found locally, if they're in remote metadata. Otherwise
// construct DismissedLayers to get rid of them.
for (layer_name, local_metadata) in local_layers {
let Some(remote_metadata) = index_part.layer_metadata.get(&layer_name) else {
result.push((layer_name, Err(DismissedLayer::LocalOnly(local_metadata))));
continue;
};
if remote_metadata.file_size != local_metadata.file_size {
result.push((layer_name, Err(DismissedLayer::BadMetadata(local_metadata))));
continue;
}
remote_layers.insert(
layer_name,
Decision::Resident {
local: local_metadata,
remote: remote_metadata.clone(),
},
);
}
// Construct Decision for layers that were not found locally
// merge any index_part information, when available
index_part
.layer_metadata
.iter()
.as_ref()
.map(|ip| ip.layer_metadata.iter())
.into_iter()
.flatten()
.map(|(name, metadata)| (name, LayerFileMetadata::from(metadata)))
.for_each(|(name, metadata)| {
if let hash_map::Entry::Vacant(entry) = remote_layers.entry(name.clone()) {
entry.insert(Decision::Evicted(metadata.clone()));
if let Some(existing) = discovered.get_mut(name) {
existing.1 = Some(metadata);
} else {
discovered.insert(name.to_owned(), (None, Some(metadata)));
}
});
// For layers that were found in authoritative remote metadata, apply a final check that they are within
// the disk_consistent_lsn.
result.extend(remote_layers.into_iter().map(|(name, decision)| {
if name.is_in_future(disk_consistent_lsn) {
match decision {
Decision::Evicted(_remote) => (name, Err(DismissedLayer::Future { local: None })),
Decision::Resident {
local,
remote: _remote,
} => (name, Err(DismissedLayer::Future { local: Some(local) })),
}
} else {
(name, Ok(decision))
}
}));
discovered
.into_iter()
.map(|(name, (local, remote))| {
let decision = if name.is_in_future(disk_consistent_lsn) {
Err(DismissedLayer::Future { local })
} else {
match (local, remote) {
(Some(local), Some(remote)) if local.metadata != remote => {
Ok(UseRemote { local, remote })
}
(Some(x), Some(_)) => Ok(UseLocal(x)),
(None, Some(x)) => Ok(Evicted(x)),
(Some(x), None) => Err(DismissedLayer::LocalOnly(x)),
(None, None) => {
unreachable!("there must not be any non-local non-remote files")
}
}
};
result
(name, decision)
})
.collect::<Vec<_>>()
}
pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> {
@@ -199,15 +201,25 @@ pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> {
std::fs::remove_file(path).with_context(|| format!("failed to remove {kind} at {path}"))
}
pub(super) fn cleanup_local_file_for_remote(local: &LocalLayerFileMetadata) -> anyhow::Result<()> {
let local_size = local.file_size;
pub(super) fn cleanup_local_file_for_remote(
local: &LocalLayerFileMetadata,
remote: &LayerFileMetadata,
) -> anyhow::Result<()> {
let local_size = local.metadata.file_size();
let remote_size = remote.file_size();
let path = &local.local_path;
let file_name = path.file_name().expect("must be file path");
tracing::warn!(
"removing local file {file_name:?} because it has unexpected length {local_size};"
);
std::fs::remove_file(path).with_context(|| format!("failed to remove layer at {path}"))
let file_name = path.file_name().expect("must be file path");
tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
if let Err(err) = crate::tenant::timeline::rename_to_backup(path) {
assert!(
path.exists(),
"we would leave the local_layer without a file if this does not hold: {path}",
);
Err(err)
} else {
Ok(())
}
}
pub(super) fn cleanup_future_layer(
@@ -229,8 +241,8 @@ pub(super) fn cleanup_local_only_file(
) -> anyhow::Result<()> {
let kind = name.kind();
tracing::info!(
"found local-only {kind} layer {name} size {}",
local.file_size
"found local-only {kind} layer {name}, metadata {:?}",
local.metadata
);
std::fs::remove_file(&local.local_path)?;
Ok(())

View File

@@ -212,34 +212,13 @@ impl LayerManager {
&mut self,
rewrite_layers: &[(Layer, ResidentLayer)],
drop_layers: &[Layer],
metrics: &TimelineMetrics,
_metrics: &TimelineMetrics,
) {
let mut updates = self.layer_map.batch_update();
for (old_layer, new_layer) in rewrite_layers {
debug_assert_eq!(
old_layer.layer_desc().key_range,
new_layer.layer_desc().key_range
);
debug_assert_eq!(
old_layer.layer_desc().lsn_range,
new_layer.layer_desc().lsn_range
);
// Safety: we may never rewrite the same file in-place. Callers are responsible
// for ensuring that they only rewrite layers after something changes the path,
// such as an increment in the generation number.
assert_ne!(old_layer.local_path(), new_layer.local_path());
// TODO: implement rewrites (currently this code path only used for drops)
assert!(rewrite_layers.is_empty());
Self::delete_historic_layer(old_layer, &mut updates, &mut self.layer_fmgr);
Self::insert_historic_layer(
new_layer.as_ref().clone(),
&mut updates,
&mut self.layer_fmgr,
);
metrics.record_new_file_metrics(new_layer.layer_desc().file_size);
}
for l in drop_layers {
Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr);
}

View File

@@ -705,7 +705,6 @@ impl ConnectionManagerState {
commit_lsn: info.commit_lsn,
safekeeper_connstr: info.safekeeper_connstr,
availability_zone: info.availability_zone,
standby_horizon: info.standby_horizon,
}
}
MessageType::SafekeeperDiscoveryResponse => {
@@ -726,21 +725,6 @@ impl ConnectionManagerState {
WALRECEIVER_BROKER_UPDATES.inc();
trace!(
"safekeeper info update: standby_horizon(cutoff)={}",
timeline_update.standby_horizon
);
if timeline_update.standby_horizon != 0 {
// ignore reports from safekeepers not connected to replicas
self.timeline
.standby_horizon
.store(Lsn(timeline_update.standby_horizon));
self.timeline
.metrics
.standby_horizon_gauge
.set(timeline_update.standby_horizon as i64);
}
let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
let old_entry = self.wal_stream_candidates.insert(
new_safekeeper_id,
@@ -1110,7 +1094,6 @@ mod tests {
commit_lsn,
safekeeper_connstr: safekeeper_connstr.to_owned(),
availability_zone: None,
standby_horizon: 0,
},
latest_update,
}

View File

@@ -8,7 +8,6 @@ use std::collections::{HashMap, VecDeque};
use std::fmt::Debug;
use chrono::NaiveDateTime;
use pageserver_api::models::AuxFilePolicy;
use std::sync::Arc;
use tracing::info;
use utils::lsn::AtomicLsn;
@@ -61,9 +60,6 @@ pub(crate) struct UploadQueueInitialized {
/// Part of the flattened "next" `index_part.json`.
pub(crate) latest_lineage: Lineage,
/// The last aux file policy used on this timeline.
pub(crate) last_aux_file_policy: Option<AuxFilePolicy>,
/// `disk_consistent_lsn` from the last metadata file that was successfully
/// uploaded. `Lsn(0)` if nothing was uploaded yet.
/// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
@@ -193,7 +189,6 @@ impl UploadQueue {
dangling_files: HashMap::new(),
shutting_down: false,
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
last_aux_file_policy: Default::default(),
};
*self = UploadQueue::Initialized(state);
@@ -213,7 +208,10 @@ impl UploadQueue {
let mut files = HashMap::with_capacity(index_part.layer_metadata.len());
for (layer_name, layer_metadata) in &index_part.layer_metadata {
files.insert(layer_name.to_owned(), layer_metadata.clone());
files.insert(
layer_name.to_owned(),
LayerFileMetadata::from(layer_metadata),
);
}
info!(
@@ -241,7 +239,6 @@ impl UploadQueue {
dangling_files: HashMap::new(),
shutting_down: false,
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
last_aux_file_policy: index_part.last_aux_file_policy(),
};
*self = UploadQueue::Initialized(state);
@@ -319,7 +316,9 @@ impl std::fmt::Display for UploadOp {
write!(
f,
"UploadLayer({}, size={:?}, gen={:?})",
layer, metadata.file_size, metadata.generation
layer,
metadata.file_size(),
metadata.generation
)
}
UploadOp::UploadMetadata(_, lsn) => {

View File

@@ -153,7 +153,10 @@ impl PostgresRedoManager {
process: self
.redo_process
.get()
.map(|p| WalRedoManagerProcessStatus { pid: p.id() }),
.map(|p| WalRedoManagerProcessStatus {
pid: p.id(),
kind: std::borrow::Cow::Borrowed(p.kind().into()),
}),
}
}
}

View File

@@ -1,10 +1,7 @@
/// Layer of indirection previously used to support multiple implementations.
/// Subject to removal: <https://github.com/neondatabase/neon/issues/7753>
use std::time::Duration;
use bytes::Bytes;
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
use tracing::warn;
use utils::lsn::Lsn;
use crate::{config::PageServerConf, walrecord::NeonWalRecord};
@@ -15,6 +12,7 @@ mod protocol;
mod process_impl {
pub(super) mod process_async;
pub(super) mod process_std;
}
#[derive(
@@ -36,7 +34,10 @@ pub enum Kind {
Async,
}
pub(crate) struct Process(process_impl::process_async::WalRedoProcess);
pub(crate) enum Process {
Sync(process_impl::process_std::WalRedoProcess),
Async(process_impl::process_async::WalRedoProcess),
}
impl Process {
#[inline(always)]
@@ -45,17 +46,18 @@ impl Process {
tenant_shard_id: TenantShardId,
pg_version: u32,
) -> anyhow::Result<Self> {
if conf.walredo_process_kind != Kind::Async {
warn!(
configured = %conf.walredo_process_kind,
"the walredo_process_kind setting has been turned into a no-op, using async implementation"
);
}
Ok(Self(process_impl::process_async::WalRedoProcess::launch(
conf,
tenant_shard_id,
pg_version,
)?))
Ok(match conf.walredo_process_kind {
Kind::Sync => Self::Sync(process_impl::process_std::WalRedoProcess::launch(
conf,
tenant_shard_id,
pg_version,
)?),
Kind::Async => Self::Async(process_impl::process_async::WalRedoProcess::launch(
conf,
tenant_shard_id,
pg_version,
)?),
})
}
#[inline(always)]
@@ -67,12 +69,29 @@ impl Process {
records: &[(Lsn, NeonWalRecord)],
wal_redo_timeout: Duration,
) -> anyhow::Result<Bytes> {
self.0
.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
.await
match self {
Process::Sync(p) => {
p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
.await
}
Process::Async(p) => {
p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
.await
}
}
}
pub(crate) fn id(&self) -> u32 {
self.0.id()
match self {
Process::Sync(p) => p.id(),
Process::Async(p) => p.id(),
}
}
pub(crate) fn kind(&self) -> Kind {
match self {
Process::Sync(_) => Kind::Sync,
Process::Async(_) => Kind::Async,
}
}
}

View File

@@ -0,0 +1,405 @@
use self::no_leak_child::NoLeakChild;
use crate::{
config::PageServerConf,
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
walrecord::NeonWalRecord,
walredo::process::{no_leak_child, protocol},
};
use anyhow::Context;
use bytes::Bytes;
use nix::poll::{PollFd, PollFlags};
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
use postgres_ffi::BLCKSZ;
use std::os::fd::AsRawFd;
#[cfg(feature = "testing")]
use std::sync::atomic::AtomicUsize;
use std::{
collections::VecDeque,
io::{Read, Write},
process::{ChildStdin, ChildStdout, Command, Stdio},
sync::{Mutex, MutexGuard},
time::Duration,
};
use tracing::{debug, error, instrument, Instrument};
use utils::{lsn::Lsn, nonblock::set_nonblock};
/// Handle to one `postgres --wal-redo` child process, driven over its stdin/stdout pipes.
///
/// Requests are written under the `stdin` mutex and responses are read under the `stdout`
/// mutex, so sending and receiving are serialized independently of each other.
pub struct WalRedoProcess {
#[allow(dead_code)]
conf: &'static PageServerConf,
// Tenant shard this process serves; used for the child's argv and for tracing fields.
tenant_shard_id: TenantShardId,
// Some() on construction, only becomes None on Drop.
child: Option<NoLeakChild>,
// Read side of the child's pipes, plus bookkeeping for responses not yet claimed.
stdout: Mutex<ProcessOutput>,
// Write side of the child's pipes, plus the count of requests sent so far.
stdin: Mutex<ProcessInput>,
/// Counter to separate same sized walredo inputs failing at the same millisecond.
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize,
}
/// Request-sending half of the walredo child, guarded by `WalRedoProcess::stdin`.
struct ProcessInput {
// Pipe connected to the child's standard input.
stdin: ChildStdin,
// Number of requests written so far; the value before the increment is used as the
// sequence number of the request that was just sent.
n_requests: usize,
}
/// Response-receiving half of the walredo child, guarded by `WalRedoProcess::stdout`.
struct ProcessOutput {
// Pipe connected to the child's standard output.
stdout: ChildStdout,
// Page images read from `stdout`, in request order, that have not all been handed out yet.
// Each image is pushed as `Some(..)`.
// NOTE(review): an entry is presumably replaced by `None` once its requester takes it —
// confirm in the part of `apply_wal_records0` that consumes this queue.
pending_responses: VecDeque<Option<Bytes>>,
// NOTE(review): appears to be the request number of the entry at the front of
// `pending_responses` (entries already dropped from the front) — confirm against the
// code that truncates the queue.
n_processed_responses: usize,
}
impl WalRedoProcess {
/// Starts the postgres binary in its special WAL redo mode and wraps it in a `WalRedoProcess`.
///
/// The child is spawned with piped stdin/stdout/stderr and a cleared environment in which
/// only `LD_LIBRARY_PATH` and `DYLD_LIBRARY_PATH` are set, both to the postgres lib dir.
/// stdin and stdout are switched to non-blocking mode, and a background task is spawned
/// that forwards every line of the child's stderr to the log at error level.
///
/// # Errors
///
/// Fails if the postgres bin/lib directories cannot be resolved for `pg_version`, if the
/// process cannot be spawned, if stderr cannot be converted to a tokio handle, or if
/// setting non-blocking mode fails. After a successful spawn, any such failure kills and
/// reaps the child before returning.
///
/// NOTE(review): this calls `tokio::spawn`, so it presumably must run inside a Tokio
/// runtime — confirm against callers.
#[instrument(skip_all,fields(pg_version=pg_version))]
pub(crate) fn launch(
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
pg_version: u32,
) -> anyhow::Result<Self> {
crate::span::debug_assert_current_span_has_tenant_id();
let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
use no_leak_child::NoLeakChildCommandExt;
// Start postgres itself
let child = Command::new(pg_bin_dir_path.join("postgres"))
// the first arg must be --wal-redo so the child process enters into walredo mode
.arg("--wal-redo")
// the child doesn't process this arg, but, having it in the argv helps identify the
// walredo process for a particular tenant when debugging a pageserver
.args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
.stdin(Stdio::piped())
.stderr(Stdio::piped())
.stdout(Stdio::piped())
.env_clear()
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
// NB: The redo process is not trusted after we sent it the first
// walredo work. Before that, it is trusted. Specifically, we trust
// it to
// 1. close all file descriptors except stdin, stdout, stderr because
// pageserver might not be 100% diligent in setting FD_CLOEXEC on all
// the files it opens, and
// 2. to use seccomp to sandbox itself before processing the first
// walredo request.
.spawn_no_leak_child(tenant_shard_id)
.context("spawn process")?;
WAL_REDO_PROCESS_COUNTERS.started.inc();
// From here until the guard is disarmed below, any early return (via `?`) kills and
// reaps the freshly spawned child instead of leaving it running.
let mut child = scopeguard::guard(child, |child| {
error!("killing wal-redo-postgres process due to a problem during launch");
child.kill_and_wait(WalRedoKillCause::Startup);
});
// All three pipes were requested with Stdio::piped() above, so they are expected to be present.
let stdin = child.stdin.take().unwrap();
let stdout = child.stdout.take().unwrap();
let stderr = child.stderr.take().unwrap();
// stderr is read by an async task, so it is handed to tokio; stdin/stdout stay std handles.
let stderr = tokio::process::ChildStderr::from_std(stderr)
.context("convert to tokio::ChildStderr")?;
// Sets the given pipe to non-blocking mode; logs the failure (with the child's pid) and
// yields the Result so the caller can propagate it.
macro_rules! set_nonblock_or_log_err {
($file:ident) => {{
let res = set_nonblock($file.as_raw_fd());
if let Err(e) = &res {
error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
}
res
}};
}
set_nonblock_or_log_err!(stdin)?;
set_nonblock_or_log_err!(stdout)?;
// all fallible operations post-spawn are complete, so get rid of the guard
let child = scopeguard::ScopeGuard::into_inner(child);
// Background task: forward the child's stderr to our log, one line at a time, until EOF
// or a read error.
tokio::spawn(
async move {
// Runs when the task ends for any reason, keeping the started/finished counters paired.
scopeguard::defer! {
debug!("wal-redo-postgres stderr_logger_task finished");
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
}
debug!("wal-redo-postgres stderr_logger_task started");
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
use tokio::io::AsyncBufReadExt;
let mut stderr_lines = tokio::io::BufReader::new(stderr);
let mut buf = Vec::new();
let res = loop {
buf.clear();
// TODO we don't trust the process to cap its stderr length.
// Currently it can do unbounded Vec allocation.
match stderr_lines.read_until(b'\n', &mut buf).await {
Ok(0) => break Ok(()), // eof
Ok(num_bytes) => {
// Lossy conversion: the child's output is not guaranteed to be valid UTF-8.
let output = String::from_utf8_lossy(&buf[..num_bytes]);
error!(%output, "received output");
}
Err(e) => {
break Err(e);
}
}
};
match res {
Ok(()) => (),
Err(e) => {
error!(error=?e, "failed to read from walredo stderr");
}
}
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
);
Ok(Self {
conf,
tenant_shard_id,
child: Some(child),
stdin: Mutex::new(ProcessInput {
stdin,
n_requests: 0,
}),
stdout: Mutex::new(ProcessOutput {
stdout,
pending_responses: VecDeque::new(),
n_processed_responses: 0,
}),
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize::default(),
})
}
/// Returns the id of the walredo child process.
///
/// # Panics
///
/// Panics if the child handle has already been taken, which per the `child` field's
/// contract only happens while this value is being dropped.
pub(crate) fn id(&self) -> u32 {
    let child = self
        .child
        .as_ref()
        .expect("must not call this during Drop");
    child.id()
}
/// Applies the given WAL records (`records`) on top of an optional old page image and
/// returns the resulting page image.
///
/// The whole request is serialized into one buffer: a begin-redo message, the base image
/// if there is one, one apply-record message per record, and a final get-page message.
/// That buffer is then handed to `apply_wal_records0` together with the already-held
/// stdin lock.
///
/// # Errors
///
/// Fails if any record is not a `NeonWalRecord::Postgres`, or if `apply_wal_records0`
/// fails. In the latter case the request buffer is passed to `record_and_log`.
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
pub(crate) async fn apply_wal_records(
    &self,
    rel: RelTag,
    blknum: u32,
    base_img: &Option<Bytes>,
    records: &[(Lsn, NeonWalRecord)],
    wal_redo_timeout: Duration,
) -> anyhow::Result<Bytes> {
    let tag = protocol::BufferTag { rel, blknum };
    // The stdin lock is taken before serialization and held until the request is written.
    let stdin_guard = self.stdin.lock().unwrap();

    // Serialize every message for the WAL redo process up front.
    //
    // With millions of records this could get expensive, but in practice the record
    // count is small enough that simplicity wins.
    //
    // A typical request is a BLCKSZ-sized before-image followed by some WAL records,
    // so start with a buffer that holds that comfortably.
    let mut request: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
    protocol::build_begin_redo_for_block_msg(tag, &mut request);
    if let Some(img) = base_img {
        protocol::build_push_page_msg(tag, img, &mut request);
    }
    for (lsn, record) in records {
        match record {
            NeonWalRecord::Postgres {
                will_init: _,
                rec: postgres_rec,
            } => protocol::build_apply_record_msg(*lsn, postgres_rec, &mut request),
            _ => anyhow::bail!("tried to pass neon wal record to postgres WAL redo"),
        }
    }
    protocol::build_get_page_msg(tag, &mut request);
    WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);

    let outcome = self.apply_wal_records0(&request, stdin_guard, wal_redo_timeout);
    if outcome.is_err() {
        // Not every failure is caused by this particular input, but failures are rare
        // enough in tests that capturing all of them is fine.
        self.record_and_log(&request);
    }
    outcome
}
fn apply_wal_records0(
&self,
writebuf: &[u8],
input: MutexGuard<ProcessInput>,
wal_redo_timeout: Duration,
) -> anyhow::Result<Bytes> {
let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small.
let mut nwrite = 0usize;
while nwrite < writebuf.len() {
let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
let n = loop {
match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
Err(nix::errno::Errno::EINTR) => continue,
res => break res,
}
}?;
if n == 0 {
anyhow::bail!("WAL redo timed out");
}
// If 'stdin' is writeable, do write.
let in_revents = stdin_pollfds[0].revents().unwrap();
if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
nwrite += proc.stdin.write(&writebuf[nwrite..])?;
}
if in_revents.contains(PollFlags::POLLHUP) {
// We still have more data to write, but the process closed the pipe.
anyhow::bail!("WAL redo process closed its stdin unexpectedly");
}
}
let request_no = proc.n_requests;
proc.n_requests += 1;
drop(proc);
// To improve walredo performance we separate sending requests and receiving
// responses. They are protected by different mutexes (input and output).
// If threads T1, T2, T3 send requests D1, D2, D3 to the walredo process,
// there is no guarantee that T1 will be the first to be granted the output mutex.
// To address this, we maintain the number of sent requests, the number of processed
// responses, and a ring buffer of pending responses. After sending its request
// (under the input mutex), a thread remembers its request number. Then it releases
// the input mutex, locks the output mutex, and reads responses into the ring buffer
// until the one for its stored request number has arrived. Then it takes the
// corresponding element from the pending responses ring buffer and truncates all
// empty elements from the front, advancing the processed responses number.
let mut output = self.stdout.lock().unwrap();
let n_processed_responses = output.n_processed_responses;
while n_processed_responses + output.pending_responses.len() <= request_no {
// We expect the WAL redo process to respond with an 8k page image. We read it
// into this buffer.
let mut resultbuf = vec![0; BLCKSZ.into()];
let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
while nresult < BLCKSZ.into() {
let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
// We do two things simultaneously: reading response from stdout
// and forward any logging information that the child writes to its stderr to the page server's log.
let n = loop {
match nix::poll::poll(
&mut stdout_pollfds[..],
wal_redo_timeout.as_millis() as i32,
) {
Err(nix::errno::Errno::EINTR) => continue,
res => break res,
}
}?;
if n == 0 {
anyhow::bail!("WAL redo timed out");
}
// If we have some data in stdout, read it to the result buffer.
let out_revents = stdout_pollfds[0].revents().unwrap();
if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
nresult += output.stdout.read(&mut resultbuf[nresult..])?;
}
if out_revents.contains(PollFlags::POLLHUP) {
anyhow::bail!("WAL redo process closed its stdout unexpectedly");
}
}
output
.pending_responses
.push_back(Some(Bytes::from(resultbuf)));
}
// Replace our request's response with None in `pending_responses`.
// Then make space in the ring buffer by clearing out any seqence of contiguous
// `None`'s from the front of `pending_responses`.
// NB: We can't pop_front() because other requests' responses because another
// requester might have grabbed the output mutex before us:
// T1: grab input mutex
// T1: send request_no 23
// T1: release input mutex
// T2: grab input mutex
// T2: send request_no 24
// T2: release input mutex
// T2: grab output mutex
// T2: n_processed_responses + output.pending_responses.len() <= request_no
// 23 0 24
// T2: enters poll loop that reads stdout
// T2: put response for 23 into pending_responses
// T2: put response for 24 into pending_resposnes
// pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
// T2: takes its response_24
// pending_responses now looks like this: Front Some(response_23) None Back
// T2: does the while loop below
// pending_responses now looks like this: Front Some(response_23) None Back
// T2: releases output mutex
// T1: grabs output mutex
// T1: n_processed_responses + output.pending_responses.len() > request_no
// 23 2 23
// T1: skips poll loop that reads stdout
// T1: takes its response_23
// pending_responses now looks like this: Front None None Back
// T2: does the while loop below
// pending_responses now looks like this: Front Back
// n_processed_responses now has value 25
let res = output.pending_responses[request_no - n_processed_responses]
.take()
.expect("we own this request_no, nobody else is supposed to take it");
while let Some(front) = output.pending_responses.front() {
if front.is_none() {
output.pending_responses.pop_front();
output.n_processed_responses += 1;
} else {
break;
}
}
Ok(res)
}
/// Testing builds only: saves the request bytes that led to a walredo error into a new
/// file under the tenant's directory and reports the outcome at error level.
///
/// The file name combines the current time in milliseconds since the Unix epoch, the
/// length of `writebuf` and a per-process sequence number taken from `dump_sequence`.
/// The file is opened with `create_new`, so an already existing file is never
/// overwritten; a failure to create or write it is only logged.
#[cfg(feature = "testing")]
fn record_and_log(&self, writebuf: &[u8]) {
    use std::sync::atomic::Ordering;

    let since_epoch = std::time::SystemTime::now()
        .duration_since(std::time::SystemTime::UNIX_EPOCH)
        .unwrap();
    let millis = since_epoch.as_millis();
    let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);

    // these files will be collected to an allure report
    let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
    let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);

    let mut options = std::fs::OpenOptions::new();
    options.write(true).create_new(true).read(true);
    let written = match options.open(path) {
        Ok(mut file) => file.write_all(writebuf),
        Err(e) => Err(e),
    };

    // Both outcomes are logged at error level on purpose: trip up allowed_errors
    match written {
        Ok(()) => {
            tracing::error!(filename, "erroring walredo input saved");
        }
        Err(e) => {
            tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
        }
    }
}
/// Without the `testing` feature, failed walredo inputs are not recorded: this is a no-op.
#[cfg(not(feature = "testing"))]
fn record_and_log(&self, _writebuf: &[u8]) {}
}
impl Drop for WalRedoProcess {
    /// Takes the child handle out of `self.child` and kills and waits for it, recording
    /// `WalRedoProcessDrop` as the kill cause.
    ///
    /// Panics if the child handle has already been taken.
    fn drop(&mut self) {
        let cause = WalRedoKillCause::WalRedoProcessDrop;
        self.child
            .take()
            .expect("we only do this once")
            .kill_and_wait(cause);
        // stderr_logger_task cannot be awaited here: waiting for it is async only,
        // and Drop is synchronous.
    }
}

View File

@@ -1,78 +0,0 @@
From 0b0194a57bd0f3598bd57dbedd0df3932330169d Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Fri, 2 Feb 2024 22:26:45 +0200
Subject: [PATCH 1/1] Make v0.6.0 work with Neon
Now that the WAL-logging happens as a separate step at the end of the
build, we need a few neon-specific hints to make it work.
---
src/hnswbuild.c | 36 ++++++++++++++++++++++++++++++++++++
1 file changed, 36 insertions(+)
diff --git a/src/hnswbuild.c b/src/hnswbuild.c
index 680789b..ec54dea 100644
--- a/src/hnswbuild.c
+++ b/src/hnswbuild.c
@@ -840,9 +840,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false);
+#ifdef NEON_SMGR
+ smgr_start_unlogged_build(RelationGetSmgr(indexRel));
+#endif
+
/* Perform inserts */
HnswParallelScanAndInsert(heapRel, indexRel, hnswshared, hnswarea, false);
+#ifdef NEON_SMGR
+ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(indexRel));
+#endif
+
/* Close relations within worker */
index_close(indexRel, indexLockmode);
table_close(heapRel, heapLockmode);
@@ -1089,13 +1097,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
SeedRandom(42);
#endif
+#ifdef NEON_SMGR
+ smgr_start_unlogged_build(RelationGetSmgr(index));
+#endif
+
InitBuildState(buildstate, heap, index, indexInfo, forkNum);
BuildGraph(buildstate, forkNum);
+#ifdef NEON_SMGR
+ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
+#endif
+
if (RelationNeedsWAL(index))
+ {
log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true);
+#ifdef NEON_SMGR
+ {
+#if PG_VERSION_NUM >= 160000
+ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
+#else
+ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
+#endif
+
+ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator,
+ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
+ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM);
+ }
+#endif
+ }
+
+#ifdef NEON_SMGR
+ smgr_end_unlogged_build(RelationGetSmgr(index));
+#endif
+
FreeBuildState(buildstate);
}
--
2.39.2

View File

@@ -49,8 +49,9 @@ char *neon_auth_token;
int readahead_buffer_size = 128;
int flush_every_n_requests = 8;
int neon_protocol_version = 2;
int neon_protocol_version = 1;
static int n_reconnect_attempts = 0;
static int max_reconnect_attempts = 60;
static int stripe_size;
@@ -94,44 +95,18 @@ static shmem_startup_hook_type prev_shmem_startup_hook;
static PagestoreShmemState *pagestore_shared;
static uint64 pagestore_local_counter = 0;
typedef enum PSConnectionState {
PS_Disconnected, /* no connection yet */
PS_Connecting_Startup, /* connection starting up */
PS_Connecting_PageStream, /* negotiating pagestream */
PS_Connected, /* connected, pagestream established */
} PSConnectionState;
/* This backend's per-shard connections */
typedef struct
{
TimestampTz last_connect_time; /* read-only debug value */
TimestampTz last_reconnect_time;
uint32 delay_us;
int n_reconnect_attempts;
PGconn *conn;
/*---
* Pageserver connection state, i.e.
* disconnected: conn == NULL, wes == NULL;
* conn_startup: connection initiated, waiting for connection establishing
* conn_ps: PageStream query sent, waiting for confirmation
* connected: PageStream established
*/
PSConnectionState state;
PGconn *conn;
/*---
* WaitEventSet containing:
* - WL_SOCKET_READABLE on 'conn'
* - WL_LATCH_SET on MyLatch, and
* - WL_EXIT_ON_PM_DEATH.
* - WL_SOCKET_READABLE on 'conn'
* - WL_LATCH_SET on MyLatch, and
* - WL_EXIT_ON_PM_DEATH.
*/
WaitEventSet *wes_read;
/*---
* WaitEventSet containing:
* - WL_SOCKET_WRITABLE on 'conn'
* - WL_LATCH_SET on MyLatch, and
* - WL_EXIT_ON_PM_DEATH.
*/
WaitEventSet *wes_write;
WaitEventSet *wes;
} PageServer;
static PageServer page_servers[MAX_SHARDS];
@@ -328,269 +303,119 @@ get_shard_number(BufferTag *tag)
return hash % n_shards;
}
static inline void
CLEANUP_AND_DISCONNECT(PageServer *shard)
{
if (shard->wes_read)
{
FreeWaitEventSet(shard->wes_read);
shard->wes_read = NULL;
}
if (shard->wes_write)
{
FreeWaitEventSet(shard->wes_write);
shard->wes_write = NULL;
}
if (shard->conn)
{
PQfinish(shard->conn);
shard->conn = NULL;
}
shard->state = PS_Disconnected;
}
/*
* Connect to a pageserver, or continue to try to connect if we're yet to
* complete the connection (e.g. due to receiving an earlier cancellation
* during connection start).
* Returns true if successfully connected; false if the connection failed.
*
* Throws errors in unrecoverable situations, or when this backend's query
* is canceled.
*/
static bool
pageserver_connect(shardno_t shard_no, int elevel)
{
PageServer *shard = &page_servers[shard_no];
char *query;
int ret;
const char *keywords[3];
const char *values[3];
int n;
PGconn *conn;
WaitEventSet *wes;
char connstr[MAX_PAGESERVER_CONNSTRING_SIZE];
static TimestampTz last_connect_time = 0;
static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
TimestampTz now;
uint64_t us_since_last_connect;
bool broke_from_loop = false;
Assert(page_servers[shard_no].conn == NULL);
/*
* Get the connection string for this shard. If the shard map has been
* updated since we last looked, this will also disconnect any existing
* pageserver connections as a side effect.
* Note that connstr is used both during connection start, and when we
* log the successful connection.
*/
load_shard_map(shard_no, connstr, NULL);
switch (shard->state)
now = GetCurrentTimestamp();
us_since_last_connect = now - last_connect_time;
if (us_since_last_connect < MAX_RECONNECT_INTERVAL_USEC)
{
case PS_Disconnected:
{
const char *keywords[3];
const char *values[3];
int n_pgsql_params;
TimestampTz now;
int64 us_since_last_attempt;
/* Make sure we start with a clean slate */
CLEANUP_AND_DISCONNECT(shard);
neon_shard_log(shard_no, DEBUG5, "Connection state: Disconnected");
now = GetCurrentTimestamp();
us_since_last_attempt = (int64) (now - shard->last_reconnect_time);
shard->last_reconnect_time = now;
/*
* If we did other tasks between reconnect attempts, then we won't
* need to wait as long as a full delay.
*/
if (us_since_last_attempt < shard->delay_us)
{
pg_usleep(shard->delay_us - us_since_last_attempt);
}
/* update the delay metric */
shard->delay_us = Min(shard->delay_us * 2, MAX_RECONNECT_INTERVAL_USEC);
/*
* Connect using the connection string we got from the
* neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
* variable was set, use that as the password.
*
* The connection options are parsed in the order they're given, so when
* we set the password before the connection string, the connection string
* can override the password from the env variable. Seems useful, although
* we don't currently use that capability anywhere.
*/
keywords[0] = "dbname";
values[0] = connstr;
n_pgsql_params = 1;
if (neon_auth_token)
{
keywords[1] = "password";
values[1] = neon_auth_token;
n_pgsql_params++;
}
keywords[n_pgsql_params] = NULL;
values[n_pgsql_params] = NULL;
shard->conn = PQconnectStartParams(keywords, values, 1);
if (!shard->conn)
{
neon_shard_log(shard_no, elevel, "Failed to connect to pageserver: out of memory");
return false;
}
shard->wes_read = CreateWaitEventSet(TopMemoryContext, 3);
AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET,
MyLatch, NULL);
AddWaitEventToSet(shard->wes_read, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
NULL, NULL);
AddWaitEventToSet(shard->wes_read, WL_SOCKET_READABLE, PQsocket(shard->conn), NULL, NULL);
shard->wes_write = CreateWaitEventSet(TopMemoryContext, 3);
AddWaitEventToSet(shard->wes_write, WL_LATCH_SET, PGINVALID_SOCKET,
MyLatch, NULL);
AddWaitEventToSet(shard->wes_write, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
NULL, NULL);
AddWaitEventToSet(shard->wes_write, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE,
PQsocket(shard->conn),
NULL, NULL);
shard->state = PS_Connecting_Startup;
/* fallthrough */
pg_usleep(delay_us);
delay_us *= 2;
}
case PS_Connecting_Startup:
else
{
char *pagestream_query;
int ps_send_query_ret;
bool connected = false;
delay_us = MIN_RECONNECT_INTERVAL_USEC;
}
neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_Startup");
/*
* Connect using the connection string we got from the
* neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
* variable was set, use that as the password.
*
* The connection options are parsed in the order they're given, so when
* we set the password before the connection string, the connection string
* can override the password from the env variable. Seems useful, although
* we don't currently use that capability anywhere.
*/
n = 0;
if (neon_auth_token)
{
keywords[n] = "password";
values[n] = neon_auth_token;
n++;
}
keywords[n] = "dbname";
values[n] = connstr;
n++;
keywords[n] = NULL;
values[n] = NULL;
n++;
conn = PQconnectdbParams(keywords, values, 1);
last_connect_time = GetCurrentTimestamp();
do
{
WaitEvent event;
int poll_result = PQconnectPoll(shard->conn);
if (PQstatus(conn) == CONNECTION_BAD)
{
char *msg = pchomp(PQerrorMessage(conn));
switch (poll_result)
{
default: /* unknown/unused states are handled as a failed connection */
case PGRES_POLLING_FAILED:
{
char *pqerr = PQerrorMessage(shard->conn);
char *msg = NULL;
neon_shard_log(shard_no, DEBUG5, "POLLING_FAILED");
PQfinish(conn);
if (pqerr)
msg = pchomp(pqerr);
CLEANUP_AND_DISCONNECT(shard);
if (msg)
{
neon_shard_log(shard_no, elevel,
"could not connect to pageserver: %s",
msg);
pfree(msg);
}
else
neon_shard_log(shard_no, elevel,
"could not connect to pageserver");
return false;
}
case PGRES_POLLING_READING:
/* Sleep until there's something to do */
(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1,
PG_WAIT_EXTENSION);
ResetLatch(MyLatch);
/* query cancellation, backend shutdown */
CHECK_FOR_INTERRUPTS();
/* PQconnectPoll() handles the socket polling state updates */
break;
case PGRES_POLLING_WRITING:
/* Sleep until there's something to do */
(void) WaitEventSetWait(shard->wes_write, -1L, &event, 1,
PG_WAIT_EXTENSION);
ResetLatch(MyLatch);
/* query cancellation, backend shutdown */
CHECK_FOR_INTERRUPTS();
/* PQconnectPoll() handles the socket polling state updates */
break;
case PGRES_POLLING_OK:
neon_shard_log(shard_no, DEBUG5, "POLLING_OK");
connected = true;
break;
}
}
while (!connected);
/* No more polling needed; connection succeeded */
shard->last_connect_time = GetCurrentTimestamp();
switch (neon_protocol_version)
{
ereport(elevel,
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
errdetail_internal("%s", msg)));
pfree(msg);
return false;
}
switch (neon_protocol_version)
{
case 2:
pagestream_query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline);
query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline);
break;
case 1:
pagestream_query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
break;
default:
elog(ERROR, "unexpected neon_protocol_version %d", neon_protocol_version);
}
if (PQstatus(shard->conn) == CONNECTION_BAD)
{
char *msg = pchomp(PQerrorMessage(shard->conn));
CLEANUP_AND_DISCONNECT(shard);
ereport(elevel,
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
errdetail_internal("%s", msg)));
pfree(msg);
return false;
}
ps_send_query_ret = PQsendQuery(shard->conn, pagestream_query);
pfree(pagestream_query);
if (ps_send_query_ret != 1)
{
CLEANUP_AND_DISCONNECT(shard);
neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver");
return false;
}
shard->state = PS_Connecting_PageStream;
/* fallthrough */
}
case PS_Connecting_PageStream:
ret = PQsendQuery(conn, query);
pfree(query);
if (ret != 1)
{
neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_PageStream");
PQfinish(conn);
neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver");
return false;
}
if (PQstatus(shard->conn) == CONNECTION_BAD)
{
char *msg = pchomp(PQerrorMessage(shard->conn));
CLEANUP_AND_DISCONNECT(shard);
ereport(elevel,
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
errdetail_internal("%s", msg)));
pfree(msg);
return false;
}
wes = CreateWaitEventSet(TopMemoryContext, 3);
AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET,
MyLatch, NULL);
AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
NULL, NULL);
AddWaitEventToSet(wes, WL_SOCKET_READABLE, PQsocket(conn), NULL, NULL);
while (PQisBusy(shard->conn))
PG_TRY();
{
while (PQisBusy(conn))
{
WaitEvent event;
/* Sleep until there's something to do */
(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, PG_WAIT_EXTENSION);
(void) WaitEventSetWait(wes, -1L, &event, 1, PG_WAIT_EXTENSION);
ResetLatch(MyLatch);
CHECK_FOR_INTERRUPTS();
@@ -598,37 +423,40 @@ pageserver_connect(shardno_t shard_no, int elevel)
/* Data available in socket? */
if (event.events & WL_SOCKET_READABLE)
{
if (!PQconsumeInput(shard->conn))
if (!PQconsumeInput(conn))
{
char *msg = pchomp(PQerrorMessage(shard->conn));
char *msg = pchomp(PQerrorMessage(conn));
PQfinish(conn);
FreeWaitEventSet(wes);
CLEANUP_AND_DISCONNECT(shard);
neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s",
msg);
pfree(msg);
return false;
/* Returning from inside PG_TRY is bad, so we break/return later */
broke_from_loop = true;
break;
}
}
}
shard->state = PS_Connected;
/* fallthrough */
}
case PS_Connected:
/*
* We successfully connected. Future connections to this PageServer
* will do fast retries again, with exponential backoff.
*/
shard->delay_us = MIN_RECONNECT_INTERVAL_USEC;
neon_shard_log(shard_no, DEBUG5, "Connection state: Connected");
neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version);
return true;
default:
neon_shard_log(shard_no, ERROR, "libpagestore: invalid connection state %d", shard->state);
PG_CATCH();
{
PQfinish(conn);
FreeWaitEventSet(wes);
PG_RE_THROW();
}
/* This shouldn't be hit */
Assert(false);
PG_END_TRY();
if (broke_from_loop)
{
return false;
}
neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version);
page_servers[shard_no].conn = conn;
page_servers[shard_no].wes = wes;
return true;
}
/*
@@ -648,7 +476,7 @@ retry:
WaitEvent event;
/* Sleep until there's something to do */
(void) WaitEventSetWait(page_servers[shard_no].wes_read, -1L, &event, 1, PG_WAIT_EXTENSION);
(void) WaitEventSetWait(page_servers[shard_no].wes, -1L, &event, 1, PG_WAIT_EXTENSION);
ResetLatch(MyLatch);
CHECK_FOR_INTERRUPTS();
@@ -674,8 +502,7 @@ retry:
/*
* Reset prefetch and drop connection to the shard.
* It also drops connection to all other shards involved in prefetch, through
* prefetch_on_ps_disconnect().
* It also drops connection to all other shards involved in prefetch.
*/
static void
pageserver_disconnect(shardno_t shard_no)
@@ -685,6 +512,9 @@ pageserver_disconnect(shardno_t shard_no)
* whole prefetch queue, even for other pageservers. It should not
* cause big problems, because connection loss is supposed to be a
* rare event.
*
* Prefetch state should be reset even if page_servers[shard_no].conn == NULL,
* because prefetch request may be registered before connection is established.
*/
prefetch_on_ps_disconnect();
@@ -697,36 +527,37 @@ pageserver_disconnect(shardno_t shard_no)
static void
pageserver_disconnect_shard(shardno_t shard_no)
{
PageServer *shard = &page_servers[shard_no];
/*
* If anything goes wrong while we were sending a request, it's not clear
* what state the connection is in. For example, if we sent the request
* but didn't receive a response yet, we might receive the response some
* time later after we have already sent a new unrelated request. Close
* the connection to avoid getting confused.
* Similarly, even when we're in PS_DISCONNECTED, we may have junk to
* clean up: It is possible that we encountered an error allocating any
* of the wait event sets or the psql connection, or failed when we tried
* to attach wait events to the WaitEventSets.
*/
CLEANUP_AND_DISCONNECT(shard);
shard->state = PS_Disconnected;
if (page_servers[shard_no].conn)
{
neon_shard_log(shard_no, LOG, "dropping connection to page server due to error");
PQfinish(page_servers[shard_no].conn);
page_servers[shard_no].conn = NULL;
}
if (page_servers[shard_no].wes != NULL)
{
FreeWaitEventSet(page_servers[shard_no].wes);
page_servers[shard_no].wes = NULL;
}
}
static bool
pageserver_send(shardno_t shard_no, NeonRequest *request)
{
StringInfoData req_buff;
PageServer *shard = &page_servers[shard_no];
PGconn *pageserver_conn;
PGconn *pageserver_conn = page_servers[shard_no].conn;
/* If the connection was lost for some reason, reconnect */
if (shard->state == PS_Connected && PQstatus(shard->conn) == CONNECTION_BAD)
if (pageserver_conn && PQstatus(pageserver_conn) == CONNECTION_BAD)
{
neon_shard_log(shard_no, LOG, "pageserver_send disconnect bad connection");
pageserver_disconnect(shard_no);
pageserver_conn = NULL;
}
req_buff = nm_pack_request(request);
@@ -740,19 +571,17 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
* https://github.com/neondatabase/neon/issues/1138 So try to reestablish
* connection in case of failure.
*/
if (shard->state != PS_Connected)
if (!page_servers[shard_no].conn)
{
while (!pageserver_connect(shard_no, shard->n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
while (!pageserver_connect(shard_no, n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
{
HandleMainLoopInterrupts();
shard->n_reconnect_attempts += 1;
n_reconnect_attempts += 1;
}
shard->n_reconnect_attempts = 0;
} else {
Assert(shard->conn != NULL);
n_reconnect_attempts = 0;
}
pageserver_conn = shard->conn;
pageserver_conn = page_servers[shard_no].conn;
/*
* Send request.
@@ -761,17 +590,13 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
* should use async mode and check for interrupts while waiting. In
* practice, our requests are small enough to always fit in the output and
* TCP buffer.
*
* Note that this also will fail when the connection is in the
* PGRES_POLLING_WRITING state. It's kinda dirty to disconnect at this
* point, but on the grand scheme of things it's only a small issue.
*/
if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
pageserver_disconnect(shard_no);
neon_shard_log(shard_no, LOG, "pageserver_send disconnected: failed to send page request (try to reconnect): %s", msg);
neon_shard_log(shard_no, LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
pfree(msg);
pfree(req_buff.data);
return false;
@@ -786,7 +611,6 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
neon_shard_log(shard_no, PageStoreTrace, "sent request: %s", msg);
pfree(msg);
}
return true;
}
@@ -795,68 +619,58 @@ pageserver_receive(shardno_t shard_no)
{
StringInfoData resp_buff;
NeonResponse *resp;
PageServer *shard = &page_servers[shard_no];
PGconn *pageserver_conn = shard->conn;
/* read response */
int rc;
PGconn *pageserver_conn = page_servers[shard_no].conn;
if (shard->state != PS_Connected)
{
neon_shard_log(shard_no, LOG,
"pageserver_receive: returning NULL for non-connected pageserver connection: 0x%02x",
shard->state);
if (!pageserver_conn)
return NULL;
}
Assert(pageserver_conn);
rc = call_PQgetCopyData(shard_no, &resp_buff.data);
if (rc >= 0)
PG_TRY();
{
/* call_PQgetCopyData handles rc == 0 */
Assert(rc > 0);
/* read response */
int rc;
PG_TRY();
rc = call_PQgetCopyData(shard_no, &resp_buff.data);
if (rc >= 0)
{
resp_buff.len = rc;
resp_buff.cursor = 0;
resp = nm_unpack_response(&resp_buff);
PQfreemem(resp_buff.data);
if (message_level_is_interesting(PageStoreTrace))
{
char *msg = nm_to_string((NeonMessage *) resp);
neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
pfree(msg);
}
}
PG_CATCH();
else if (rc == -1)
{
neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due malformatted response");
neon_shard_log(shard_no, LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
pageserver_disconnect(shard_no);
PG_RE_THROW();
resp = NULL;
}
PG_END_TRY();
if (message_level_is_interesting(PageStoreTrace))
else if (rc == -2)
{
char *msg = nm_to_string((NeonMessage *) resp);
char *msg = pchomp(PQerrorMessage(pageserver_conn));
neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
pfree(msg);
pageserver_disconnect(shard_no);
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
}
else
{
pageserver_disconnect(shard_no);
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
}
}
else if (rc == -1)
PG_CATCH();
{
neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", pchomp(PQerrorMessage(pageserver_conn)));
neon_shard_log(shard_no, LOG, "pageserver_receive disconnect due to caught exception");
pageserver_disconnect(shard_no);
resp = NULL;
}
else if (rc == -2)
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
pageserver_disconnect(shard_no);
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: could not read COPY data: %s", msg);
}
else
{
pageserver_disconnect(shard_no);
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc);
PG_RE_THROW();
}
PG_END_TRY();
return (NeonResponse *) resp;
}
@@ -867,7 +681,7 @@ pageserver_flush(shardno_t shard_no)
{
PGconn *pageserver_conn = page_servers[shard_no].conn;
if (page_servers[shard_no].state != PS_Connected)
if (!pageserver_conn)
{
neon_shard_log(shard_no, WARNING, "Tried to flush while disconnected");
}
@@ -883,7 +697,6 @@ pageserver_flush(shardno_t shard_no)
return false;
}
}
return true;
}
@@ -1047,7 +860,7 @@ pg_init_libpagestore(void)
"Version of compute<->page server protocol",
NULL,
&neon_protocol_version,
2, /* use protocol version 2 */
1, /* default to old protocol for now */
1, /* min */
2, /* max */
PGC_SU_BACKEND,
@@ -1078,7 +891,5 @@ pg_init_libpagestore(void)
dbsize_hook = neon_dbsize;
}
memset(page_servers, 0, sizeof(page_servers));
lfc_init();
}

View File

@@ -45,7 +45,6 @@
*/
#include "postgres.h"
#include "access/parallel.h"
#include "access/xact.h"
#include "access/xlog.h"
#include "access/xlogdefs.h"
@@ -94,10 +93,6 @@ static char *hexdump_page(char *page);
const int SmgrTrace = DEBUG5;
#define NEON_PANIC_CONNECTION_STATE(shard_no, elvl, message, ...) \
neon_shard_log(shard_no, elvl, "Broken connection state: " message, \
##__VA_ARGS__)
page_server_api *page_server;
/* unlogged relation build states */
@@ -530,8 +525,6 @@ prefetch_flush_requests(void)
*
* NOTE: this function may indirectly update MyPState->pfs_hash; which
* invalidates any active pointers into the hash table.
* NOTE: callers should make sure they can handle query cancellations in this
* function's call path.
*/
static bool
prefetch_wait_for(uint64 ring_index)
@@ -567,8 +560,6 @@ prefetch_wait_for(uint64 ring_index)
*
* NOTE: this function may indirectly update MyPState->pfs_hash; which
* invalidates any active pointers into the hash table.
*
* NOTE: this does IO, and can get canceled out-of-line.
*/
static bool
prefetch_read(PrefetchRequest *slot)
@@ -580,14 +571,6 @@ prefetch_read(PrefetchRequest *slot)
Assert(slot->response == NULL);
Assert(slot->my_ring_index == MyPState->ring_receive);
if (slot->status != PRFS_REQUESTED ||
slot->response != NULL ||
slot->my_ring_index != MyPState->ring_receive)
neon_shard_log(slot->shard_no, ERROR,
"Incorrect prefetch read: status=%d response=%llx my=%llu receive=%llu",
slot->status, (size_t) (void *) slot->response,
slot->my_ring_index, MyPState->ring_receive);
old = MemoryContextSwitchTo(MyPState->errctx);
response = (NeonResponse *) page_server->receive(slot->shard_no);
MemoryContextSwitchTo(old);
@@ -605,11 +588,6 @@ prefetch_read(PrefetchRequest *slot)
}
else
{
neon_shard_log(slot->shard_no, WARNING,
"No response from reading prefetch entry %llu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
slot->my_ring_index,
RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)),
slot->buftag.forkNum, slot->buftag.blockNum);
return false;
}
}
@@ -624,7 +602,6 @@ void
prefetch_on_ps_disconnect(void)
{
MyPState->ring_flush = MyPState->ring_unused;
while (MyPState->ring_receive < MyPState->ring_unused)
{
PrefetchRequest *slot;
@@ -647,7 +624,6 @@ prefetch_on_ps_disconnect(void)
slot->status = PRFS_TAG_REMAINS;
MyPState->n_requests_inflight -= 1;
MyPState->ring_receive += 1;
prefetch_set_unused(ring_index);
}
}
@@ -714,8 +690,6 @@ static void
prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns)
{
bool found;
uint64 mySlotNo = slot->my_ring_index;
NeonGetPageRequest request = {
.req.tag = T_NeonGetPageRequest,
/* lsn and not_modified_since are filled in below */
@@ -724,8 +698,6 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
.blkno = slot->buftag.blockNum,
};
Assert(mySlotNo == MyPState->ring_unused);
if (force_request_lsns)
slot->request_lsns = *force_request_lsns;
else
@@ -738,11 +710,7 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
Assert(slot->response == NULL);
Assert(slot->my_ring_index == MyPState->ring_unused);
while (!page_server->send(slot->shard_no, (NeonRequest *) &request))
{
Assert(mySlotNo == MyPState->ring_unused);
/* loop */
}
while (!page_server->send(slot->shard_no, (NeonRequest *) &request));
/* update prefetch state */
MyPState->n_requests_inflight += 1;
@@ -753,6 +721,7 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
/* update slot state */
slot->status = PRFS_REQUESTED;
prfh_insert(MyPState->prf_hash, slot, &found);
Assert(!found);
}
@@ -924,10 +893,6 @@ Retry:
return ring_index;
}
/*
* Note: this function can get canceled and use a long jump to the next catch
* context. Take care.
*/
static NeonResponse *
page_server_request(void const *req)
{
@@ -959,38 +924,19 @@ page_server_request(void const *req)
* Current sharding model assumes that all metadata is present only at shard 0.
* We still need to call get_shard_no() to check if shard map is up-to-date.
*/
if (((NeonRequest *) req)->tag != T_NeonGetPageRequest ||
((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM)
if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM)
{
shard_no = 0;
}
do
{
PG_TRY();
{
while (!page_server->send(shard_no, (NeonRequest *) req)
|| !page_server->flush(shard_no))
{
/* do nothing */
}
consume_prefetch_responses();
resp = page_server->receive(shard_no);
}
PG_CATCH();
{
/*
* Cancellation in this code needs to be handled better at some
* point, but this currently seems fine for now.
*/
page_server->disconnect(shard_no);
PG_RE_THROW();
}
PG_END_TRY();
while (!page_server->send(shard_no, (NeonRequest *) req) || !page_server->flush(shard_no));
consume_prefetch_responses();
resp = page_server->receive(shard_no);
} while (resp == NULL);
return resp;
}
@@ -1402,10 +1348,6 @@ PageIsEmptyHeapPage(char *buffer)
return memcmp(buffer, empty_page.data, BLCKSZ) == 0;
}
/*
* A page is being evicted from the shared buffer cache. Update the
* last-written LSN of the page, and WAL-log it if needed.
*/
static void
#if PG_MAJORVERSION_NUM < 16
neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
@@ -1414,7 +1356,12 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
#endif
{
XLogRecPtr lsn = PageGetLSN((Page) buffer);
bool log_page;
if (ShutdownRequestPending)
return;
/* Don't log any pages if we're not allowed to do so. */
if (!XLogInsertAllowed())
return;
/*
* Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM
@@ -1423,21 +1370,9 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
* correctness, the non-logged updates are not critical. But we want to
* have a reasonably up-to-date VM and FSM in the page server.
*/
log_page = false;
if (force)
{
Assert(XLogInsertAllowed());
log_page = true;
}
else if (XLogInsertAllowed() &&
!ShutdownRequestPending &&
(forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM))
{
log_page = true;
}
if (log_page)
if ((force || forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM) && !RecoveryInProgress())
{
/* FSM is never WAL-logged and we don't care. */
XLogRecPtr recptr;
recptr = log_newpage_copy(&InfoFromSMgrRel(reln), forknum, blocknum,
@@ -1450,8 +1385,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, LSN_FORMAT_ARGS(lsn))));
}
if (lsn == InvalidXLogRecPtr)
else if (lsn == InvalidXLogRecPtr)
{
/*
* When PostgreSQL extends a relation, it calls smgrextend() with an
@@ -1487,31 +1421,19 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum)));
}
else if (forknum != FSM_FORKNUM && forknum != VISIBILITYMAP_FORKNUM)
else
{
/*
* Its a bad sign if there is a page with zero LSN in the buffer
* cache in a standby, too. However, PANICing seems like a cure
* worse than the disease, as the damage has likely already been
* done in the primary. So in a standby, make this an assertion,
* and in a release build just LOG the error and soldier on. We
* update the last-written LSN of the page with a conservative
* value in that case, which is the last replayed LSN.
*/
ereport(RecoveryInProgress() ? LOG : PANIC,
ereport(PANIC,
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
blocknum,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum)));
Assert(false);
lsn = GetXLogReplayRecPtr(NULL); /* in standby mode, soldier on */
}
}
else
{
ereport(SmgrTrace,
(errmsg(NEON_TAG "Evicting page %u of relation %u/%u/%u.%u with lsn=%X/%X",
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
blocknum,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, LSN_FORMAT_ARGS(lsn))));
@@ -1604,92 +1526,8 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
if (RecoveryInProgress())
{
/*---
* In broad strokes, a replica always requests the page at the current
* replay LSN. But looking closer, what exactly is the replay LSN? Is
* it the last replayed record, or the record being replayed? And does
* the startup process performing the replay need to do something
* differently than backends running queries? Let's take a closer look
* at the different scenarios:
*
* 1. Startup process reads a page, last_written_lsn is old.
*
* Read the old version of the page. We will apply the WAL record on
* it to bring it up-to-date.
*
* We could read the new version, with the changes from this WAL
* record already applied, to offload the work of replaying the record
* to the pageserver. The pageserver might not have received the WAL
* record yet, though, so a read of the old page version and applying
* the record ourselves is likely faster. Also, the redo function
* might be surprised if the changes have already applied. That's
* normal during crash recovery, but not in hot standby.
*
* 2. Startup process reads a page, last_written_lsn == record we're
* replaying.
*
* Can this happen? There are a few theoretical cases when it might:
*
* A) The redo function reads the same page twice. We had already read
* and applied the changes once, and now we're reading it for the
* second time. That would be a rather silly thing for a redo
* function to do, and I'm not aware of any that would do it.
*
* B) The redo function modifies multiple pages, and it already
* applied the changes to one of the pages, released the lock on
* it, and is now reading a second page. Furthermore, the first
* page was already evicted from the buffer cache, and also from
* the last-written LSN cache, so that the per-relation or global
* last-written LSN was already updated. All the WAL redo functions
* hold the locks on pages that they modify, until all the changes
* have been modified (?), which would make that impossible.
* However, we skip the locking, if the page isn't currently in the
* page cache (see neon_redo_read_buffer_filter below).
*
* Even if the one of the above cases were possible in theory, they
* would also require the pages being modified by the redo function to
* be immediately evicted from the page cache.
*
* So this probably does not happen in practice. But if it does, we
* request the new version, including the changes from the record
* being replayed. That seems like the correct behavior in any case.
*
* 3. Backend process reads a page with old last-written LSN
*
* Nothing special here. Read the old version.
*
* 4. Backend process reads a page with last_written_lsn == record being replayed
*
* This can happen, if the redo function has started to run, and saw
* that the page isn't present in the page cache (see
* neon_redo_read_buffer_filter below). Normally, in a normal
* Postgres server, the redo function would hold a lock on the page,
* so we would get blocked waiting the redo function to release the
* lock. To emulate that, wait for the WAL replay of the record to
* finish.
*/
/* Request the page at the end of the last fully replayed LSN. */
XLogRecPtr replay_lsn = GetXLogReplayRecPtr(NULL);
if (last_written_lsn > replay_lsn)
{
/* GetCurrentReplayRecPtr was introduced in v15 */
#if PG_VERSION_NUM >= 150000
Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL));
#endif
/*
* Cases 2 and 4. If this is a backend (case 4), the
* neon_read_at_lsn() call later will wait for the WAL record to be
* fully replayed.
*/
result.request_lsn = last_written_lsn;
}
else
{
/* cases 1 and 3 */
result.request_lsn = replay_lsn;
}
/* Request the page at the last replayed LSN. */
result.request_lsn = GetXLogReplayRecPtr(NULL);
result.not_modified_since = last_written_lsn;
result.effective_request_lsn = result.request_lsn;
Assert(last_written_lsn <= result.request_lsn);
@@ -1958,9 +1796,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
break;
default:
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
"Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x",
T_NeonExistsResponse, T_NeonErrorResponse, resp->tag);
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_exists", resp->tag);
}
pfree(resp);
return exists;
@@ -2412,7 +2248,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
/*
* Try to find prefetched page in the list of received pages.
*/
Retry:
Retry:
entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag);
if (entry != NULL)
@@ -2498,9 +2334,7 @@ Retry:
((NeonErrorResponse *) resp)->message)));
break;
default:
NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
"Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x",
T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag);
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_at_lsn", resp->tag);
}
/* buffer was used, clean up for later reuse */
@@ -2771,9 +2605,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
break;
default:
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
"Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x",
T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag);
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_nblocks", resp->tag);
}
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
@@ -2826,9 +2658,7 @@ neon_dbsize(Oid dbNode)
break;
default:
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
"Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x",
T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag);
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_dbsize", resp->tag);
}
neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
@@ -2992,14 +2822,10 @@ neon_start_unlogged_build(SMgrRelation reln)
reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED;
/*
* Create the local file. In a parallel build, the leader is expected to
* call this first and do it.
*
* FIXME: should we pass isRedo true to create the tablespace dir if it
* doesn't exist? Is it needed?
*/
if (!IsParallelWorker())
mdcreate(reln, MAIN_FORKNUM, false);
mdcreate(reln, MAIN_FORKNUM, false);
}
/*
@@ -3023,17 +2849,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1);
Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
/*
* In a parallel build, (only) the leader process performs the 2nd
* phase.
*/
if (IsParallelWorker())
{
unlogged_build_rel = NULL;
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
}
else
unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
}
/*
@@ -3167,9 +2983,7 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
break;
default:
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
"Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x",
T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag);
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_slru_segment", resp->tag);
}
pfree(resp);
@@ -3387,7 +3201,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
BufferTag tag;
uint32 hash;
LWLock *partitionLock;
int buf_id;
Buffer buffer;
bool no_redo_needed;
if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id))
@@ -3425,20 +3239,20 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
else
{
/* Try to find the relevant buffer */
buf_id = BufTableLookup(&tag, hash);
buffer = BufTableLookup(&tag, hash);
no_redo_needed = buf_id < 0;
no_redo_needed = buffer < 0;
}
/* In both cases st lwlsn past this WAL record */
SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);
/*
* we don't have the buffer in memory, update lwLsn past this record, also
* evict page from file cache
*/
if (no_redo_needed)
{
SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);
lfc_evict(rinfo, forknum, blkno);
}
LWLockRelease(partitionLock);

View File

@@ -1852,30 +1852,34 @@ static void
CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp)
{
hs->ts = 0;
hs->xmin = InvalidFullTransactionId;
hs->catalog_xmin = InvalidFullTransactionId;
hs->xmin.value = ~0; /* largest unsigned value */
hs->catalog_xmin.value = ~0; /* largest unsigned value */
for (int i = 0; i < wp->n_safekeepers; i++)
{
if (wp->safekeeper[i].state == SS_ACTIVE)
if (wp->safekeeper[i].appendResponse.hs.ts != 0)
{
HotStandbyFeedback *skhs = &wp->safekeeper[i].appendResponse.hs;
if (FullTransactionIdIsNormal(skhs->xmin)
&& (!FullTransactionIdIsValid(hs->xmin) || FullTransactionIdPrecedes(skhs->xmin, hs->xmin)))
&& FullTransactionIdPrecedes(skhs->xmin, hs->xmin))
{
hs->xmin = skhs->xmin;
hs->ts = skhs->ts;
}
if (FullTransactionIdIsNormal(skhs->catalog_xmin)
&& (!FullTransactionIdIsValid(hs->catalog_xmin) || FullTransactionIdPrecedes(skhs->catalog_xmin, hs->catalog_xmin)))
&& FullTransactionIdPrecedes(skhs->catalog_xmin, hs->xmin))
{
hs->catalog_xmin = skhs->catalog_xmin;
hs->ts = skhs->ts;
}
}
}
if (hs->xmin.value == ~0)
hs->xmin = InvalidFullTransactionId;
if (hs->catalog_xmin.value == ~0)
hs->catalog_xmin = InvalidFullTransactionId;
}
/*
@@ -1942,28 +1946,14 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk)
}
CombineHotStanbyFeedbacks(&hsFeedback, wp);
if (memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0)
if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0)
{
FullTransactionId xmin = hsFeedback.xmin;
FullTransactionId catalog_xmin = hsFeedback.catalog_xmin;
FullTransactionId next_xid = ReadNextFullTransactionId();
/*
* Page server is updating nextXid in checkpoint each 1024 transactions,
* so feedback xmin can be actually larger then nextXid and
* function TransactionIdInRecentPast return false in this case,
* preventing update of slot's xmin.
*/
if (FullTransactionIdPrecedes(next_xid, xmin))
xmin = next_xid;
if (FullTransactionIdPrecedes(next_xid, catalog_xmin))
catalog_xmin = next_xid;
agg_hs_feedback = hsFeedback;
elog(DEBUG2, "ProcessStandbyHSFeedback(xmin=%d, catalog_xmin=%d", XidFromFullTransactionId(hsFeedback.xmin), XidFromFullTransactionId(hsFeedback.catalog_xmin));
ProcessStandbyHSFeedback(hsFeedback.ts,
XidFromFullTransactionId(xmin),
EpochFromFullTransactionId(xmin),
XidFromFullTransactionId(catalog_xmin),
EpochFromFullTransactionId(catalog_xmin));
XidFromFullTransactionId(hsFeedback.xmin),
EpochFromFullTransactionId(hsFeedback.xmin),
XidFromFullTransactionId(hsFeedback.catalog_xmin),
EpochFromFullTransactionId(hsFeedback.catalog_xmin));
}
CheckGracefulShutdown(wp);

21
poetry.lock generated
View File

@@ -2405,7 +2405,6 @@ files = [
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -2530,13 +2529,13 @@ files = [
[[package]]
name = "requests"
version = "2.32.0"
version = "2.31.0"
description = "Python HTTP for Humans."
optional = false
python-versions = ">=3.8"
python-versions = ">=3.7"
files = [
{file = "requests-2.32.0-py3-none-any.whl", hash = "sha256:f2c3881dddb70d056c5bd7600a4fae312b2a300e39be6a118d30b90bd27262b5"},
{file = "requests-2.32.0.tar.gz", hash = "sha256:fa5490319474c82ef1d2c9bc459d3652e3ae4ef4c4ebdd18a21145a47ca4b6b8"},
{file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"},
{file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"},
]
[package.dependencies]
@@ -2960,16 +2959,6 @@ files = [
{file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
{file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
{file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
{file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
{file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
{file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
{file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -3207,4 +3196,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "16ebd6a46768be7f67dbdb4ee5903b167d94edc9965f29252f038c67e9e907b0"
content-hash = "dcde14c58a32bda5f123319a069352c458b3719f3c62977991eebb9803a46a9e"

View File

@@ -9,7 +9,6 @@ default = []
testing = []
[dependencies]
ahash.workspace = true
anyhow.workspace = true
async-compression.workspace = true
async-trait.workspace = true
@@ -25,10 +24,8 @@ camino.workspace = true
chrono.workspace = true
clap.workspace = true
consumption_metrics.workspace = true
crossbeam-deque.workspace = true
dashmap.workspace = true
env_logger.workspace = true
framed-websockets.workspace = true
futures.workspace = true
git-version.workspace = true
hashbrown.workspace = true
@@ -38,6 +35,7 @@ hmac.workspace = true
hostname.workspace = true
http.workspace = true
humantime.workspace = true
hyper-tungstenite.workspace = true
hyper.workspace = true
hyper1 = { package = "hyper", version = "1.2", features = ["server"] }
hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] }
@@ -54,6 +52,7 @@ opentelemetry.workspace = true
parking_lot.workspace = true
parquet.workspace = true
parquet_derive.workspace = true
pbkdf2 = { workspace = true, features = ["simple", "std"] }
pin-project-lite.workspace = true
postgres_backend.workspace = true
pq_proto.workspace = true
@@ -77,12 +76,12 @@ smol_str.workspace = true
smallvec.workspace = true
socket2.workspace = true
subtle.workspace = true
sync_wrapper.workspace = true
task-local-extensions.workspace = true
thiserror.workspace = true
tikv-jemallocator.workspace = true
tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] }
tokio-postgres.workspace = true
tokio-postgres-rustls.workspace = true
tokio-rustls.workspace = true
tokio-util.workspace = true
tokio = { workspace = true, features = ["signal"] }
@@ -97,6 +96,8 @@ utils.workspace = true
uuid.workspace = true
webpki-roots.workspace = true
x509-parser.workspace = true
native-tls.workspace = true
postgres-native-tls.workspace = true
postgres-protocol.workspace = true
redis.workspace = true
@@ -105,8 +106,6 @@ workspace_hack.workspace = true
[dev-dependencies]
camino-tempfile.workspace = true
fallible-iterator.workspace = true
tokio-tungstenite.workspace = true
pbkdf2 = { workspace = true, features = ["simple", "std"] }
rcgen.workspace = true
rstest.workspace = true
tokio-postgres-rustls.workspace = true

View File

@@ -365,10 +365,7 @@ async fn authenticate_with_secret(
config: &'static AuthenticationConfig,
) -> auth::Result<ComputeCredentials> {
if let Some(password) = unauthenticated_password {
let ep = EndpointIdInt::from(&info.endpoint);
let auth_outcome =
validate_password_and_exchange(&config.thread_pool, ep, &password, secret).await?;
let auth_outcome = validate_password_and_exchange(&password, secret).await?;
let keys = match auth_outcome {
crate::sasl::Outcome::Success(key) => key,
crate::sasl::Outcome::Failure(reason) => {
@@ -389,7 +386,7 @@ async fn authenticate_with_secret(
// Currently, we use it for websocket connections (latency).
if allow_cleartext {
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
return hacks::authenticate_cleartext(ctx, info, client, secret, config).await;
return hacks::authenticate_cleartext(ctx, info, client, secret).await;
}
// Finally, proceed with the main auth flow (SCRAM-based).
@@ -557,7 +554,7 @@ mod tests {
context::RequestMonitoring,
proxy::NeonOptions,
rate_limiter::{EndpointRateLimiter, RateBucketInfo},
scram::{threadpool::ThreadPool, ServerSecret},
scram::ServerSecret,
stream::{PqStream, Stream},
};
@@ -599,7 +596,6 @@ mod tests {
}
static CONFIG: Lazy<AuthenticationConfig> = Lazy::new(|| AuthenticationConfig {
thread_pool: ThreadPool::new(1),
scram_protocol_timeout: std::time::Duration::from_secs(5),
rate_limiter_enabled: true,
rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET),

View File

@@ -3,10 +3,8 @@ use super::{
};
use crate::{
auth::{self, AuthFlow},
config::AuthenticationConfig,
console::AuthSecret,
context::RequestMonitoring,
intern::EndpointIdInt,
sasl,
stream::{self, Stream},
};
@@ -22,7 +20,6 @@ pub async fn authenticate_cleartext(
info: ComputeUserInfo,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
secret: AuthSecret,
config: &'static AuthenticationConfig,
) -> auth::Result<ComputeCredentials> {
warn!("cleartext auth flow override is enabled, proceeding");
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
@@ -30,14 +27,8 @@ pub async fn authenticate_cleartext(
// pause the timer while we communicate with the client
let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
let ep = EndpointIdInt::from(&info.endpoint);
let auth_flow = AuthFlow::new(client)
.begin(auth::CleartextPassword {
secret,
endpoint: ep,
pool: config.thread_pool.clone(),
})
.begin(auth::CleartextPassword(secret))
.await?;
drop(paused);
// cleartext auth is only allowed to the ws/http protocol.

Some files were not shown because too many files have changed in this diff Show More