Compare commits

..

1 Commits

Author SHA1 Message Date
Alex Chi Z
a40eee31a7 refactor(pageserver): better k-merge implementation for tiered compaction
Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-05-20 17:13:40 -04:00
226 changed files with 4477 additions and 11311 deletions

View File

@@ -3,13 +3,13 @@ description: 'Create Branch using API'
inputs:
api_key:
description: 'Neon API key'
desctiption: 'Neon API key'
required: true
project_id:
description: 'ID of the Project to create Branch in'
desctiption: 'ID of the Project to create Branch in'
required: true
api_host:
description: 'Neon API host'
desctiption: 'Neon API host'
default: console-stage.neon.build
outputs:
dsn:

View File

@@ -3,16 +3,16 @@ description: 'Delete Branch using API'
inputs:
api_key:
description: 'Neon API key'
desctiption: 'Neon API key'
required: true
project_id:
description: 'ID of the Project which should be deleted'
desctiption: 'ID of the Project which should be deleted'
required: true
branch_id:
description: 'ID of the branch to delete'
desctiption: 'ID of the branch to delete'
required: true
api_host:
description: 'Neon API host'
desctiption: 'Neon API host'
default: console-stage.neon.build
runs:

View File

@@ -3,22 +3,22 @@ description: 'Create Neon Project using API'
inputs:
api_key:
description: 'Neon API key'
desctiption: 'Neon API key'
required: true
region_id:
description: 'Region ID, if not set the project will be created in the default region'
desctiption: 'Region ID, if not set the project will be created in the default region'
default: aws-us-east-2
postgres_version:
description: 'Postgres version; default is 15'
default: '15'
desctiption: 'Postgres version; default is 15'
default: 15
api_host:
description: 'Neon API host'
desctiption: 'Neon API host'
default: console-stage.neon.build
provisioner:
description: 'k8s-pod or k8s-neonvm'
desctiption: 'k8s-pod or k8s-neonvm'
default: 'k8s-pod'
compute_units:
description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
desctiption: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
default: '[1, 1]'
outputs:

View File

@@ -3,13 +3,13 @@ description: 'Delete Neon Project using API'
inputs:
api_key:
description: 'Neon API key'
desctiption: 'Neon API key'
required: true
project_id:
description: 'ID of the Project to delete'
desctiption: 'ID of the Project to delete'
required: true
api_host:
description: 'Neon API host'
desctiption: 'Neon API host'
default: console-stage.neon.build
runs:

View File

@@ -24,7 +24,7 @@ jobs:
actionlint:
needs: [ check-permissions ]
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: reviewdog/action-actionlint@v1
@@ -36,15 +36,3 @@ jobs:
fail_on_error: true
filter_mode: nofilter
level: error
- run: |
PAT='^\s*runs-on:.*-latest'
if grep -ERq $PAT .github/workflows
then
grep -ERl $PAT .github/workflows |\
while read -r f
do
l=$(grep -nE $PAT .github/workflows/release.yml | awk -F: '{print $1}' | head -1)
echo "::error file=$f,line=$l::Please, do not use ubuntu-latest images to run on, use LTS instead."
done
exit 1
fi

View File

@@ -44,7 +44,7 @@ jobs:
contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
@@ -60,7 +60,7 @@ jobs:
github.event.action == 'labeled' &&
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
@@ -109,7 +109,7 @@ jobs:
github.event.action == 'closed' &&
github.event.pull_request.head.repo.full_name != github.repository
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- name: Close PR and delete `ci-run/pr-${{ env.PR_NUMBER }}` branch

View File

@@ -38,11 +38,6 @@ on:
description: 'AWS-RDS and AWS-AURORA normally only run on Saturday. Set this to true to run them on every workflow_dispatch'
required: false
default: false
run_only_pgvector_tests:
type: boolean
description: 'Run pgvector tests but no other tests. If not set, all tests including pgvector tests will be run'
required: false
default: false
defaults:
run:
@@ -55,7 +50,6 @@ concurrency:
jobs:
bench:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
@@ -126,7 +120,6 @@ jobs:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
generate-matrices:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
# Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
#
# Available platforms:
@@ -137,7 +130,7 @@ jobs:
# - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
env:
RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }}
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
outputs:
pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }}
@@ -204,7 +197,6 @@ jobs:
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
pgbench-compare:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
needs: [ generate-matrices ]
strategy:
@@ -351,92 +343,6 @@ jobs:
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
pgbench-pgvector:
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
TEST_PG_BENCH_SCALES_MATRIX: "1"
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 16
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: "neon-captest-pgvector"
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
options: --init
steps:
- uses: actions/checkout@v4
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-release-artifact
path: /tmp/neon/
prefix: latest
- name: Add Postgres binaries to PATH
run: |
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
- name: Set up Connection String
id: set-up-connstr
run: |
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERIES=("SELECT version()")
QUERIES+=("SHOW neon.tenant_id")
QUERIES+=("SHOW neon.timeline_id")
for q in "${QUERIES[@]}"; do
psql ${CONNSTR} -c "${q}"
done
- name: Benchmark pgvector hnsw indexing
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance/test_perf_olap.py
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
- name: Benchmark pgvector hnsw queries
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_pgvector
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
- name: Create Allure report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Periodic perf testing neon-captest-pgvector: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
clickbench-compare:
# ClichBench DB for rds-aurora and rds-Postgres deployed to the same clusters
# we use for performance testing in pgbench-compare.
@@ -445,7 +351,7 @@ jobs:
#
# *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
# *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
if: ${{ !cancelled() }}
needs: [ generate-matrices, pgbench-compare ]
strategy:
@@ -549,7 +455,7 @@ jobs:
# We might change it after https://github.com/neondatabase/neon/issues/2900.
#
# *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
if: ${{ !cancelled() }}
needs: [ generate-matrices, clickbench-compare ]
strategy:
@@ -651,7 +557,7 @@ jobs:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
user-examples-compare:
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
if: ${{ !cancelled() }}
needs: [ generate-matrices, tpch-compare ]
strategy:

View File

@@ -88,7 +88,7 @@ jobs:
merge-images:
needs: [ build-image ]
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
env:
IMAGE_TAG: ${{ inputs.image-tag }}

View File

@@ -35,7 +35,7 @@ jobs:
cancel-previous-e2e-tests:
needs: [ check-permissions ]
if: github.event_name == 'pull_request'
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- name: Cancel previous e2e-tests runs for this PR
@@ -548,8 +548,8 @@ jobs:
report-benchmarks-failures:
needs: [ benchmarks, create-test-report ]
if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure'
runs-on: ubuntu-22.04
if: github.ref_name == 'main' && needs.benchmarks.result == 'failure'
runs-on: ubuntu-latest
steps:
- uses: slackapi/slack-github-action@v1
@@ -723,13 +723,9 @@ jobs:
uses: ./.github/workflows/trigger-e2e-tests.yml
secrets: inherit
neon-image-arch:
neon-image:
needs: [ check-permissions, build-build-tools-image, tag ]
strategy:
matrix:
arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
runs-on: [ self-hosted, gen3, large ]
steps:
- name: Checkout
@@ -751,6 +747,12 @@ jobs:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- uses: docker/login-action@v3
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- uses: docker/build-push-action@v5
with:
context: .
@@ -762,52 +764,25 @@ jobs:
push: true
pull: true
file: Dockerfile
cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }}
cache-to: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }},mode=max
cache-from: type=registry,ref=neondatabase/neon:cache
cache-to: type=registry,ref=neondatabase/neon:cache,mode=max
tags: |
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
neondatabase/neon:${{needs.tag.outputs.build-tag}}
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom
neon-image:
needs: [ neon-image-arch, tag ]
runs-on: ubuntu-22.04
steps:
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Create multi-arch image
run: |
docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-x64 \
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-arm64
- uses: docker/login-action@v3
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- name: Push multi-arch image to ECR
run: |
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{ needs.tag.outputs.build-tag }} \
neondatabase/neon:${{ needs.tag.outputs.build-tag }}
compute-node-image-arch:
compute-node-image:
needs: [ check-permissions, build-build-tools-image, tag ]
runs-on: [ self-hosted, gen3, large ]
strategy:
fail-fast: false
matrix:
version: [ v14, v15, v16 ]
arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
steps:
- name: Checkout
@@ -854,14 +829,15 @@ jobs:
push: true
pull: true
file: Dockerfile.compute-node
cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }}
cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache
cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache,mode=max
tags: |
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
- name: Build compute-tools image
# compute-tools are Postgres independent, so build it only once
if: matrix.version == 'v16'
if: ${{ matrix.version == 'v16' }}
uses: docker/build-push-action@v5
with:
target: compute-tools-image
@@ -875,57 +851,14 @@ jobs:
pull: true
file: Dockerfile.compute-node
tags: |
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }}
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom
compute-node-image:
needs: [ compute-node-image-arch, tag ]
runs-on: ubuntu-22.04
strategy:
matrix:
version: [ v14, v15, v16 ]
steps:
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Create multi-arch compute-node image
run: |
docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
- name: Create multi-arch compute-tools image
if: matrix.version == 'v16'
run: |
docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-arm64
- uses: docker/login-action@v3
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- name: Push multi-arch compute-node-${{ matrix.version }} image to ECR
run: |
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
- name: Push multi-arch compute-tools image to ECR
if: matrix.version == 'v16'
run: |
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
vm-compute-node-image:
needs: [ check-permissions, tag, compute-node-image ]
runs-on: [ self-hosted, gen3, large ]
@@ -933,8 +866,11 @@ jobs:
fail-fast: false
matrix:
version: [ v14, v15, v16 ]
defaults:
run:
shell: sh -eu {0}
env:
VM_BUILDER_VERSION: v0.29.3
VM_BUILDER_VERSION: v0.28.1
steps:
- name: Checkout
@@ -947,48 +883,26 @@ jobs:
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
chmod +x vm-builder
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
# Note: we need a separate pull step here because otherwise vm-builder will try to pull, and
# it won't have the proper authentication (written at v0.6.0)
- name: Pulling compute-node image
run: |
docker pull neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
- name: Build vm image
run: |
./vm-builder \
-spec=vm-image-spec.yaml \
-src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
-dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
-src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
-dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
- name: Pushing vm-compute-node image
run: |
docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom
docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
test-images:
needs: [ check-permissions, tag, neon-image, compute-node-image ]
strategy:
fail-fast: false
matrix:
arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
runs-on: [ self-hosted, gen3, small ]
steps:
- name: Checkout
@@ -1006,7 +920,7 @@ jobs:
- name: Verify image versions
shell: bash # ensure no set -e for better error messages
run: |
pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.tag.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
pageserver_version=$(docker run --rm 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
echo "Pageserver version string: $pageserver_version"
@@ -1032,52 +946,82 @@ jobs:
promote-images:
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
runs-on: ubuntu-22.04
env:
VERSIONS: v14 v15 v16
runs-on: [ self-hosted, gen3, small ]
container: golang:1.19-bullseye
# Don't add if-condition here.
# The job should always be run because we have dependant other jobs that shouldn't be skipped
steps:
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- uses: docker/login-action@v3
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- name: Copy vm-compute-node images to ECR
- name: Install Crane & ECR helper
run: |
for version in ${VERSIONS}; do
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} \
neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
done
go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
- name: Configure ECR login
run: |
mkdir /github/home/.docker/
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
- name: Copy vm-compute-node images to Docker Hub
run: |
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16
- name: Add latest tag to images
if: github.ref_name == 'main'
if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
run: |
for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do
docker buildx imagetools create -t $repo/neon:latest \
$repo/neon:${{ needs.tag.outputs.build-tag }}
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
docker buildx imagetools create -t $repo/compute-tools:latest \
$repo/compute-tools:${{ needs.tag.outputs.build-tag }}
- name: Push images to production ECR
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
run: |
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:latest
for version in ${VERSIONS}; do
docker buildx imagetools create -t $repo/compute-node-${version}:latest \
$repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }}
- name: Configure Docker Hub login
run: |
# ECR Credential Helper & Docker Hub don't work together in config, hence reset
echo "" > /github/home/.docker/config.json
crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io
docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \
$repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
done
done
- name: Push vm-compute-node to Docker Hub
run: |
crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}}
- name: Push latest tags to Docker Hub
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
run: |
crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
trigger-custom-extensions-build-and-wait:
needs: [ check-permissions, tag ]
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- name: Set PR's status to pending and request a remote CI test
run: |

View File

@@ -19,7 +19,7 @@ permissions: {}
jobs:
check-image:
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
outputs:
tag: ${{ steps.get-build-tools-tag.outputs.image-tag }}
found: ${{ steps.check-image.outputs.found }}

View File

@@ -16,7 +16,7 @@ permissions: {}
jobs:
check-permissions:
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- name: Disallow CI runs on PRs from forks
if: |

View File

@@ -9,7 +9,7 @@ on:
jobs:
cleanup:
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- name: Cleanup
run: |

View File

@@ -20,7 +20,7 @@ concurrency:
jobs:
test-postgres-client-libs:
# TODO: switch to gen2 runner, requires docker
runs-on: ubuntu-22.04
runs-on: [ ubuntu-latest ]
env:
DEFAULT_PG_VERSION: 14

View File

@@ -26,7 +26,7 @@ permissions: {}
jobs:
tag-image:
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
env:
FROM_TAG: ${{ inputs.from-tag }}

View File

@@ -19,7 +19,7 @@ on:
jobs:
notify:
runs-on: ubuntu-22.04
runs-on: [ ubuntu-latest ]
steps:
- uses: neondatabase/dev-actions/release-pr-notify@main

View File

@@ -26,7 +26,7 @@ defaults:
jobs:
create-storage-release-branch:
if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }}
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
permissions:
contents: write # for `git push`
@@ -53,7 +53,7 @@ jobs:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
cat << EOF > body.md
## Storage & Compute release ${RELEASE_DATE}
## Release ${RELEASE_DATE}
**Please merge this Pull Request using 'Create a merge commit' button**
EOF
@@ -65,7 +65,7 @@ jobs:
create-proxy-release-branch:
if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }}
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
permissions:
contents: write # for `git push`

View File

@@ -19,7 +19,7 @@ env:
jobs:
cancel-previous-e2e-tests:
if: github.event_name == 'pull_request'
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- name: Cancel previous e2e-tests runs for this PR
@@ -31,7 +31,7 @@ jobs:
--field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"
tag:
runs-on: ubuntu-22.04
runs-on: [ ubuntu-latest ]
outputs:
build-tag: ${{ steps.build-tag.outputs.tag }}
@@ -62,7 +62,7 @@ jobs:
trigger-e2e-tests:
needs: [ tag ]
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
env:
TAG: ${{ needs.tag.outputs.build-tag }}
steps:

190
Cargo.lock generated
View File

@@ -776,6 +776,7 @@ dependencies = [
"pin-project",
"serde",
"time",
"tz-rs",
"url",
"uuid",
]
@@ -1290,6 +1291,12 @@ dependencies = [
"tiny-keccak",
]
[[package]]
name = "const_fn"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fbdcdcb6d86f71c5e97409ad45898af11cbc995b4ee8112d59095a28d376c935"
[[package]]
name = "const_format"
version = "0.2.30"
@@ -1464,21 +1471,26 @@ dependencies = [
[[package]]
name = "crossbeam-deque"
version = "0.8.5"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d"
checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
dependencies = [
"cfg-if",
"crossbeam-epoch",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.18"
version = "0.9.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695"
dependencies = [
"autocfg",
"cfg-if",
"crossbeam-utils",
"memoffset 0.8.0",
"scopeguard",
]
[[package]]
@@ -1969,6 +1981,21 @@ version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]]
name = "foreign-types"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
dependencies = [
"foreign-types-shared",
]
[[package]]
name = "foreign-types-shared"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
[[package]]
name = "form_urlencoded"
version = "1.1.0"
@@ -2598,6 +2625,19 @@ dependencies = [
"tokio-io-timeout",
]
[[package]]
name = "hyper-tls"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905"
dependencies = [
"bytes",
"hyper 0.14.26",
"native-tls",
"tokio",
"tokio-native-tls",
]
[[package]]
name = "hyper-util"
version = "0.1.3"
@@ -2915,12 +2955,6 @@ version = "0.4.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c"
[[package]]
name = "linux-raw-sys"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0b5399f6804fbab912acbd8878ed3532d506b7c951b8f9f164ef90fef39e3f4"
[[package]]
name = "lock_api"
version = "0.4.10"
@@ -3139,6 +3173,24 @@ version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
[[package]]
name = "native-tls"
version = "0.2.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e"
dependencies = [
"lazy_static",
"libc",
"log",
"openssl",
"openssl-probe",
"openssl-sys",
"schannel",
"security-framework",
"security-framework-sys",
"tempfile",
]
[[package]]
name = "nix"
version = "0.25.1"
@@ -3309,6 +3361,15 @@ dependencies = [
"libc",
]
[[package]]
name = "num_threads"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44"
dependencies = [
"libc",
]
[[package]]
name = "oauth2"
version = "4.4.2"
@@ -3358,12 +3419,50 @@ version = "11.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
[[package]]
name = "openssl"
version = "0.10.60"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79a4c6c3a2b158f7f8f2a2fc5a969fa3a068df6fc9dbb4a43845436e3af7c800"
dependencies = [
"bitflags 2.4.1",
"cfg-if",
"foreign-types",
"libc",
"once_cell",
"openssl-macros",
"openssl-sys",
]
[[package]]
name = "openssl-macros"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.52",
]
[[package]]
name = "openssl-probe"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
[[package]]
name = "openssl-sys"
version = "0.9.96"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3812c071ba60da8b5677cc12bcb1d42989a65553772897a7e0355545a819838f"
dependencies = [
"cc",
"libc",
"pkg-config",
"vcpkg",
]
[[package]]
name = "opentelemetry"
version = "0.20.0"
@@ -3570,7 +3669,6 @@ dependencies = [
"serde",
"serde_json",
"svg_fmt",
"thiserror",
"tokio",
"tokio-util",
"toml_edit",
@@ -3863,9 +3961,9 @@ checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c"
[[package]]
name = "pbkdf2"
version = "0.12.2"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2"
checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31"
dependencies = [
"digest",
"hmac",
@@ -4012,6 +4110,17 @@ dependencies = [
"tokio-postgres",
]
[[package]]
name = "postgres-native-tls"
version = "0.5.0"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
dependencies = [
"native-tls",
"tokio",
"tokio-native-tls",
"tokio-postgres",
]
[[package]]
name = "postgres-protocol"
version = "0.6.4"
@@ -4120,7 +4229,6 @@ version = "0.1.0"
dependencies = [
"byteorder",
"bytes",
"itertools",
"pin-project-lite",
"postgres-protocol",
"rand 0.8.5",
@@ -4278,7 +4386,6 @@ dependencies = [
name = "proxy"
version = "0.1.0"
dependencies = [
"ahash",
"anyhow",
"async-compression",
"async-trait",
@@ -4295,7 +4402,6 @@ dependencies = [
"chrono",
"clap",
"consumption_metrics",
"crossbeam-deque",
"dashmap",
"env_logger",
"fallible-iterator",
@@ -4310,7 +4416,6 @@ dependencies = [
"http 1.1.0",
"http-body-util",
"humantime",
"humantime-serde",
"hyper 0.14.26",
"hyper 1.2.0",
"hyper-util",
@@ -4321,6 +4426,7 @@ dependencies = [
"md5",
"measured",
"metrics",
"native-tls",
"once_cell",
"opentelemetry",
"parking_lot 0.12.1",
@@ -4328,6 +4434,7 @@ dependencies = [
"parquet_derive",
"pbkdf2",
"pin-project-lite",
"postgres-native-tls",
"postgres-protocol",
"postgres_backend",
"pq_proto",
@@ -4346,7 +4453,6 @@ dependencies = [
"rstest",
"rustc-hash",
"rustls 0.22.4",
"rustls-native-certs 0.7.0",
"rustls-pemfile 2.1.1",
"scopeguard",
"serde",
@@ -4376,6 +4482,7 @@ dependencies = [
"utils",
"uuid",
"walkdir",
"webpki-roots 0.25.2",
"workspace_hack",
"x509-parser",
]
@@ -4682,21 +4789,20 @@ dependencies = [
"http 0.2.9",
"http-body 0.4.5",
"hyper 0.14.26",
"hyper-rustls 0.24.0",
"hyper-tls",
"ipnet",
"js-sys",
"log",
"mime",
"native-tls",
"once_cell",
"percent-encoding",
"pin-project-lite",
"rustls 0.21.11",
"rustls-pemfile 1.0.2",
"serde",
"serde_json",
"serde_urlencoded",
"tokio",
"tokio-rustls 0.24.0",
"tokio-native-tls",
"tokio-util",
"tower-service",
"url",
@@ -4704,7 +4810,6 @@ dependencies = [
"wasm-bindgen-futures",
"wasm-streams 0.3.0",
"web-sys",
"webpki-roots 0.25.2",
"winreg 0.50.0",
]
@@ -5129,24 +5234,21 @@ dependencies = [
"futures-util",
"hex",
"histogram",
"humantime",
"itertools",
"once_cell",
"native-tls",
"pageserver",
"pageserver_api",
"postgres-native-tls",
"postgres_ffi",
"rand 0.8.5",
"remote_storage",
"reqwest 0.12.4",
"rustls 0.22.4",
"rustls-native-certs 0.7.0",
"serde",
"serde_json",
"serde_with",
"thiserror",
"tokio",
"tokio-postgres",
"tokio-postgres-rustls",
"tokio-rustls 0.25.0",
"tokio-stream",
"tokio-util",
@@ -5820,7 +5922,6 @@ dependencies = [
"anyhow",
"clap",
"comfy-table",
"humantime",
"hyper 0.14.26",
"pageserver_api",
"pageserver_client",
@@ -6091,6 +6192,8 @@ checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc"
dependencies = [
"itoa",
"js-sys",
"libc",
"num_threads",
"serde",
"time-core",
"time-macros",
@@ -6166,7 +6269,7 @@ dependencies = [
[[package]]
name = "tokio-epoll-uring"
version = "0.1.0"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
dependencies = [
"futures",
"nix 0.26.4",
@@ -6200,6 +6303,16 @@ dependencies = [
"syn 2.0.52",
]
[[package]]
name = "tokio-native-tls"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2"
dependencies = [
"native-tls",
"tokio",
]
[[package]]
name = "tokio-postgres"
version = "0.7.7"
@@ -6606,6 +6719,15 @@ version = "1.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba"
[[package]]
name = "tz-rs"
version = "0.6.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33851b15c848fad2cf4b105c6bb66eb9512b6f6c44a4b13f57c53c73c707e2b4"
dependencies = [
"const_fn",
]
[[package]]
name = "uname"
version = "0.1.1"
@@ -6678,12 +6800,11 @@ dependencies = [
[[package]]
name = "uring-common"
version = "0.1.0"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
dependencies = [
"bytes",
"io-uring",
"libc",
"linux-raw-sys 0.6.4",
]
[[package]]
@@ -7352,7 +7473,6 @@ dependencies = [
name = "workspace_hack"
version = "0.1.0"
dependencies = [
"ahash",
"anyhow",
"aws-config",
"aws-runtime",
@@ -7511,9 +7631,9 @@ dependencies = [
[[package]]
name = "zeroize"
version = "1.7.0"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d"
checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9"
dependencies = [
"zeroize_derive",
]

View File

@@ -41,15 +41,14 @@ license = "Apache-2.0"
## All dependency versions, used in the project
[workspace.dependencies]
ahash = "0.8"
anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
atomic-take = "1.1.0"
azure_core = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] }
azure_identity = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
azure_storage = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
azure_storage_blobs = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
azure_core = "0.19"
azure_identity = "0.19"
azure_storage = "0.19"
azure_storage_blobs = "0.19"
flate2 = "1.0.26"
async-stream = "0.3"
async-trait = "0.1"
@@ -75,7 +74,6 @@ clap = { version = "4.0", features = ["derive"] }
comfy-table = "6.1"
const_format = "0.2"
crc32c = "0.6"
crossbeam-deque = "0.8.5"
crossbeam-utils = "0.8.5"
dashmap = { version = "5.5.0", features = ["raw-api"] }
either = "1.8"
@@ -114,6 +112,7 @@ md5 = "0.7.0"
measured = { version = "0.0.21", features=["lasso"] }
measured-process = { version = "0.0.21" }
memoffset = "0.8"
native-tls = "0.2"
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
notify = "6.0.0"
num_cpus = "1.15"
@@ -190,7 +189,7 @@ url = "2.2"
urlencoding = "2.1"
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
walkdir = "2.3.2"
rustls-native-certs = "0.7"
webpki-roots = "0.25"
x509-parser = "0.15"
## TODO replace this with tracing
@@ -199,6 +198,7 @@ log = "0.4"
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
@@ -239,7 +239,8 @@ tonic-build = "0.9"
[patch.crates-io]
# Needed to get `tokio-postgres-rustls` to depend on our fork.
# This is only needed for proxy's tests.
# TODO: we should probably fork `tokio-postgres-rustls` instead.
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
# bug fixes for UUID

View File

@@ -89,7 +89,7 @@ RUN apt update && \
# SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2
RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \
mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \
DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
make clean && cp -R /sfcgal/* /
@@ -98,7 +98,7 @@ ENV PATH "/usr/local/pgsql/bin:$PATH"
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \
mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \
mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
./autogen.sh && \
./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
@@ -124,7 +124,7 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postg
RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \
mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . && \
mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
mkdir build && cd build && \
cmake -DCMAKE_BUILD_TYPE=Release .. && \
make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -149,7 +149,7 @@ RUN apt update && \
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \
echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \
mkdir plv8-src && cd plv8-src && tar xzf ../plv8.tar.gz --strip-components=1 -C . && \
mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
# generate and copy upgrade scripts
mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \
cp upgrade/* /usr/local/pgsql/share/extension/ && \
@@ -194,7 +194,7 @@ RUN case "$(uname -m)" in \
RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \
echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \
mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . && \
mkdir h3-src && cd h3-src && tar xvzf ../h3.tar.gz --strip-components=1 -C . && \
mkdir build && cd build && \
cmake .. -DCMAKE_BUILD_TYPE=Release && \
make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -204,7 +204,7 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz
RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \
mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \
mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -222,7 +222,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \
echo "411d05beeb97e5a4abf17572bfcfbb5a68d98d1018918feff995f6ee3bb03e79 postgresql-unit.tar.gz" | sha256sum --check && \
mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
mkdir postgresql-unit-src && cd postgresql-unit-src && tar xvzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
# unit extension's "create extension" script relies on absolute install path to fill some reference tables.
@@ -243,15 +243,12 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY patches/pgvector.patch /pgvector.patch
# By default, pgvector Makefile uses `-march=native`. We don't want that,
# because we build the images on different machines than where we run them.
# Pass OPTFLAGS="" to remove it.
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.1.tar.gz -O pgvector.tar.gz && \
echo "fe6c8cb4e0cd1a8cb60f5badf9e1701e0fcabcfc260931c26d01e155c4dd21d1 pgvector.tar.gz" | sha256sum --check && \
mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \
echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \
mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
patch -p1 < /pgvector.patch && \
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
#########################################################################################
@@ -266,7 +263,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021
RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \
echo "cfdefb15007286f67d3d45510f04a6a7a495004be5b3aecb12cda667e774203f pgjwt.tar.gz" | sha256sum --check && \
mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \
mkdir pgjwt-src && cd pgjwt-src && tar xvzf ../pgjwt.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control
@@ -281,7 +278,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \
echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \
mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \
mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control
@@ -297,7 +294,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \
mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
mkdir pg_hashids-src && cd pg_hashids-src && tar xvzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control
@@ -313,7 +310,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \
mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
mkdir rum-src && cd rum-src && tar xvzf ../rum.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control
@@ -329,7 +326,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \
echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \
mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \
mkdir pgtap-src && cd pgtap-src && tar xvzf ../pgtap.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control
@@ -345,7 +342,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \
mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \
mkdir ip4r-src && cd ip4r-src && tar xvzf ../ip4r.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control
@@ -361,7 +358,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \
mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \
mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control
@@ -377,7 +374,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \
mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \
mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control
@@ -393,7 +390,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \
echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \
mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control
@@ -424,7 +421,7 @@ RUN case "${PG_VERSION}" in \
apt-get install -y cmake && \
wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \
mkdir timescaledb-src && cd timescaledb-src && tar xzf ../timescaledb.tar.gz --strip-components=1 -C . && \
mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
cd build && \
make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -462,7 +459,7 @@ RUN case "${PG_VERSION}" in \
esac && \
wget https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL${PG_HINT_PLAN_VERSION}.tar.gz -O pg_hint_plan.tar.gz && \
echo "${PG_HINT_PLAN_CHECKSUM} pg_hint_plan.tar.gz" | sha256sum --check && \
mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \
mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xvzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make install -j $(getconf _NPROCESSORS_ONLN) && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control
@@ -481,7 +478,7 @@ RUN apt-get update && \
apt-get install -y git libgtk2.0-dev libpq-dev libpam-dev libxslt-dev libkrb5-dev cmake && \
wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \
echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \
mkdir kq_imcx-src && cd kq_imcx-src && tar xzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
mkdir build && cd build && \
cmake -DCMAKE_BUILD_TYPE=Release .. && \
@@ -505,7 +502,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \
mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \
mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control
@@ -531,7 +528,7 @@ RUN apt-get update && \
ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \
mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \
mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \
cmake \
-D RDK_BUILD_CAIRO_SUPPORT=OFF \
-D RDK_BUILD_INCHI_SUPPORT=ON \
@@ -571,7 +568,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \
mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xvzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control
@@ -588,7 +585,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xvzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control
@@ -605,7 +602,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \
mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \
mkdir pg_semver-src && cd pg_semver-src && tar xvzf ../pg_semver.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control
@@ -631,7 +628,7 @@ RUN case "${PG_VERSION}" in \
esac && \
wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \
echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \
mkdir pg_embedding-src && cd pg_embedding-src && tar xzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install
@@ -647,7 +644,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \
mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \
mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
@@ -696,7 +693,7 @@ ARG PG_VERSION
RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \
echo "9118fc508a6e231e7a39acaa6f066fcd79af17a5db757b47d2eefbe14f7794f0 pg_jsonschema.tar.gz" | sha256sum --check && \
mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control
@@ -713,7 +710,7 @@ ARG PG_VERSION
RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \
echo "bd8dc7230282b3efa9ae5baf053a54151ed0e66881c7c53750e2d0c765776edc pg_graphql.tar.gz" | sha256sum --check && \
mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release && \
# it's needed to enable extension because it uses untrusted C language
@@ -733,7 +730,7 @@ ARG PG_VERSION
# 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023
RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \
mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
cargo pgrx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control
@@ -749,7 +746,7 @@ ARG PG_VERSION
RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \
echo "ee5db82945d2d9f2d15597a80cf32de9dca67b897f605beb830561705f12683c pgx_ulid.tar.gz" | sha256sum --check && \
mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
echo "******************* Apply a patch for Postgres 16 support; delete in the next release ******************" && \
wget https://github.com/pksunkara/pgx_ulid/commit/f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
patch -p1 < f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
@@ -771,7 +768,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install
@@ -787,7 +784,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \
mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
mkdir pg_ivm-src && cd pg_ivm-src && tar xvzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control
@@ -804,7 +801,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \
mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \
mkdir pg_partman-src && cd pg_partman-src && tar xvzf ../pg_partman.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control

View File

@@ -1,5 +1,3 @@
use std::path::Path;
use anyhow::{anyhow, Context};
use tracing::warn;
@@ -19,24 +17,17 @@ pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> {
.arg(size_bytes.to_string())
.spawn();
if matches!(&child_result, Err(e) if e.kind() == std::io::ErrorKind::NotFound) {
warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
return Ok(());
}
child_result
.context("spawn() failed")
.and_then(|mut child| child.wait().context("wait() failed"))
.and_then(|status| match status.success() {
true => Ok(()),
false => {
// The command failed. Maybe it was because the resize-swap file doesn't exist?
// The --once flag causes it to delete itself on success so we don't disable swap
// while postgres is running; maybe this is fine.
match Path::new(RESIZE_SWAP_BIN).try_exists() {
Err(_) | Ok(true) => Err(anyhow!("process exited with {status}")),
// The path doesn't exist; we're actually ok
Ok(false) => {
warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
Ok(())
},
}
}
false => Err(anyhow!("process exited with {status}")),
})
// wrap any prior error with the overall context that we couldn't run the command
.with_context(|| {

View File

@@ -243,13 +243,9 @@ impl StorageController {
anyhow::bail!("initdb failed with status {status}");
}
// Write a minimal config file:
// - Specify the port, since this is chosen dynamically
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
// the storage controller we don't want a slow local disk to interfere with that.
tokio::fs::write(
&pg_data_path.join("postgresql.conf"),
format!("port = {}\nfsync=off\n", self.postgres_port),
format!("port = {}", self.postgres_port),
)
.await?;
};

View File

@@ -9,7 +9,6 @@ license.workspace = true
anyhow.workspace = true
clap.workspace = true
comfy-table.workspace = true
humantime.workspace = true
hyper.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true

View File

@@ -7,9 +7,8 @@ use pageserver_api::{
TenantDescribeResponse, TenantPolicyRequest,
},
models::{
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
TenantShardSplitRequest, TenantShardSplitResponse,
LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest,
TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
},
shard::{ShardStripeSize, TenantShardId},
};
@@ -126,28 +125,6 @@ enum Command {
#[arg(long)]
tenant_id: TenantId,
},
/// Uncleanly drop a tenant from the storage controller: this doesn't delete anything from pageservers. Appropriate
/// if you e.g. used `tenant-warmup` by mistake on a tenant ID that doesn't really exist, or is in some other region.
TenantDrop {
#[arg(long)]
tenant_id: TenantId,
#[arg(long)]
unclean: bool,
},
NodeDrop {
#[arg(long)]
node_id: NodeId,
#[arg(long)]
unclean: bool,
},
TenantSetTimeBasedEviction {
#[arg(long)]
tenant_id: TenantId,
#[arg(long)]
period: humantime::Duration,
#[arg(long)]
threshold: humantime::Duration,
},
}
#[derive(Parser)]
@@ -697,46 +674,6 @@ async fn main() -> anyhow::Result<()> {
}
}
}
Command::TenantDrop { tenant_id, unclean } => {
if !unclean {
anyhow::bail!("This command is not a tenant deletion, and uncleanly drops all controller state for the tenant. If you know what you're doing, add `--unclean` to proceed.")
}
storcon_client
.dispatch::<(), ()>(
Method::POST,
format!("debug/v1/tenant/{tenant_id}/drop"),
None,
)
.await?;
}
Command::NodeDrop { node_id, unclean } => {
if !unclean {
anyhow::bail!("This command is not a clean node decommission, and uncleanly drops all controller state for the node, without checking if any tenants still refer to it. If you know what you're doing, add `--unclean` to proceed.")
}
storcon_client
.dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None)
.await?;
}
Command::TenantSetTimeBasedEviction {
tenant_id,
period,
threshold,
} => {
vps_client
.tenant_config(&TenantConfigRequest {
tenant_id,
config: TenantConfig {
eviction_policy: Some(EvictionPolicy::LayerAccessThreshold(
EvictionPolicyLayerAccessThreshold {
period: period.into(),
threshold: threshold.into(),
},
)),
..Default::default()
},
})
.await?;
}
}
Ok(())

View File

@@ -99,13 +99,6 @@ name = "async-executor"
[[bans.deny]]
name = "smol"
[[bans.deny]]
# We want to use rustls instead of the platform's native tls implementation.
name = "native-tls"
[[bans.deny]]
name = "openssl"
# This section is considered when running `cargo deny check sources`.
# More documentation about the 'sources' section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html

View File

@@ -1,4 +1,4 @@
ARG REPOSITORY=neondatabase
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
ARG COMPUTE_IMAGE=compute-node-v14
ARG TAG=latest

View File

@@ -8,6 +8,8 @@
# Their defaults point at DockerHub `neondatabase/neon:latest` image.`,
# to verify custom image builds (e.g pre-published ones).
# XXX: Current does not work on M1 macs due to x86_64 Docker images compiled only, and no seccomp support in M1 Docker emulation layer.
set -eux -o pipefail
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"

View File

@@ -1,7 +1,7 @@
use anyhow::{bail, Result};
use byteorder::{ByteOrder, BE};
use bytes::BufMut;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::RepOriginId;
use postgres_ffi::{Oid, TransactionId};
use serde::{Deserialize, Serialize};
use std::{fmt, ops::Range};
@@ -39,9 +39,6 @@ pub const RELATION_SIZE_PREFIX: u8 = 0x61;
/// The key prefix of AUX file keys.
pub const AUX_KEY_PREFIX: u8 = 0x62;
/// The key prefix of ReplOrigin keys.
pub const REPL_ORIGIN_KEY_PREFIX: u8 = 0x63;
/// Check if the key falls in the range of metadata keys.
pub const fn is_metadata_key_slice(key: &[u8]) -> bool {
key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX
@@ -56,8 +53,14 @@ impl Key {
/// Encode a metadata key to a storage key.
pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self {
assert!(is_metadata_key_slice(key), "key not in metadata key range");
// Metadata key space ends at 0x7F so it's fine to directly convert it to i128.
Self::from_i128(i128::from_be_bytes(*key))
Key {
field1: key[0],
field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32,
field3: u32::from_be_bytes(key[3..7].try_into().unwrap()),
field4: u32::from_be_bytes(key[7..11].try_into().unwrap()),
field5: key[11],
field6: u32::from_be_bytes(key[12..16].try_into().unwrap()),
}
}
/// Encode a metadata key to a storage key.
@@ -65,6 +68,17 @@ impl Key {
Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key"))
}
/// Extract a metadata key to a writer. The result should always be 16 bytes.
pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) {
writer.put_u8(self.field1);
assert!(self.field2 <= 0xFFFF);
writer.put_u16(self.field2 as u16);
writer.put_u32(self.field3);
writer.put_u32(self.field4);
writer.put_u8(self.field5);
writer.put_u32(self.field6);
}
/// Get the range of metadata keys.
pub const fn metadata_key_range() -> Range<Self> {
Key {
@@ -107,7 +121,7 @@ impl Key {
/// As long as Neon does not support tablespace (because of lack of access to local file system),
/// we can assume that only some predefined namespace OIDs are used which can fit in u16
pub fn to_i128(&self) -> i128 {
assert!(self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
(((self.field1 & 0x7F) as i128) << 120)
| (((self.field2 & 0xFFFF) as i128) << 104)
| ((self.field3 as i128) << 72)
@@ -161,7 +175,7 @@ impl Key {
}
/// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently.
/// Use [`Key::from_i128`] instead if you want to handle 16B keys (i.e., metadata keys).
/// Use [`Key::from_metadata_key`] instead.
pub fn from_slice(b: &[u8]) -> Self {
Key {
field1: b[0],
@@ -174,7 +188,7 @@ impl Key {
}
/// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently.
/// Use [`Key::to_i128`] instead if you want to get a 16B key (i.e., metadata keys).
/// Use [`Key::extract_metadata_key_to_writer`] instead.
pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
buf[0] = self.field1;
BE::write_u32(&mut buf[1..5], self.field2);
@@ -385,14 +399,7 @@ pub fn rel_size_to_key(rel: RelTag) -> Key {
field3: rel.dbnode,
field4: rel.relnode,
field5: rel.forknum,
field6: 0xffff_ffff,
}
}
impl Key {
#[inline(always)]
pub fn is_rel_size_key(&self) -> bool {
self.field1 == 0 && self.field6 == u32::MAX
field6: 0xffffffff,
}
}
@@ -433,25 +440,6 @@ pub fn slru_dir_to_key(kind: SlruKind) -> Key {
}
}
#[inline(always)]
pub fn slru_dir_kind(key: &Key) -> Option<Result<SlruKind, u32>> {
if key.field1 == 0x01
&& key.field3 == 0
&& key.field4 == 0
&& key.field5 == 0
&& key.field6 == 0
{
match key.field2 {
0 => Some(Ok(SlruKind::Clog)),
1 => Some(Ok(SlruKind::MultiXactMembers)),
2 => Some(Ok(SlruKind::MultiXactOffsets)),
x => Some(Err(x)),
}
} else {
None
}
}
#[inline(always)]
pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
Key {
@@ -480,17 +468,7 @@ pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
field3: 1,
field4: segno,
field5: 0,
field6: 0xffff_ffff,
}
}
impl Key {
pub fn is_slru_segment_size_key(&self) -> bool {
self.field1 == 0x01
&& self.field2 < 0x03
&& self.field3 == 0x01
&& self.field5 == 0
&& self.field6 == u32::MAX
field6: 0xffffffff,
}
}
@@ -591,37 +569,6 @@ pub const AUX_FILES_KEY: Key = Key {
field6: 2,
};
#[inline(always)]
pub fn repl_origin_key(origin_id: RepOriginId) -> Key {
Key {
field1: REPL_ORIGIN_KEY_PREFIX,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: origin_id as u32,
}
}
/// Get the range of replorigin keys.
pub fn repl_origin_key_range() -> Range<Key> {
Key {
field1: REPL_ORIGIN_KEY_PREFIX,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}..Key {
field1: REPL_ORIGIN_KEY_PREFIX,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0x10000,
}
}
// Reverse mappings for a few Keys.
// These are needed by WAL redo manager.
@@ -630,78 +577,73 @@ pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
/// Sparse keyspace range for vectored get. Missing key error will be ignored for this range.
pub const NON_INHERITED_SPARSE_RANGE: Range<Key> = Key::metadata_key_range();
impl Key {
// AUX_FILES currently stores only data for logical replication (slots etc), and
// we don't preserve these on a branch because safekeepers can't follow timeline
// switch (and generally it likely should be optional), so ignore these.
#[inline(always)]
pub fn is_inherited_key(self) -> bool {
!NON_INHERITED_RANGE.contains(&self) && !NON_INHERITED_SPARSE_RANGE.contains(&self)
}
// AUX_FILES currently stores only data for logical replication (slots etc), and
// we don't preserve these on a branch because safekeepers can't follow timeline
// switch (and generally it likely should be optional), so ignore these.
#[inline(always)]
pub fn is_inherited_key(key: Key) -> bool {
!NON_INHERITED_RANGE.contains(&key) && !NON_INHERITED_SPARSE_RANGE.contains(&key)
}
#[inline(always)]
pub fn is_rel_fsm_block_key(self) -> bool {
self.field1 == 0x00
&& self.field4 != 0
&& self.field5 == FSM_FORKNUM
&& self.field6 != 0xffffffff
}
#[inline(always)]
pub fn is_rel_fsm_block_key(key: Key) -> bool {
key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
}
#[inline(always)]
pub fn is_rel_vm_block_key(self) -> bool {
self.field1 == 0x00
&& self.field4 != 0
&& self.field5 == VISIBILITYMAP_FORKNUM
&& self.field6 != 0xffffffff
}
#[inline(always)]
pub fn is_rel_vm_block_key(key: Key) -> bool {
key.field1 == 0x00
&& key.field4 != 0
&& key.field5 == VISIBILITYMAP_FORKNUM
&& key.field6 != 0xffffffff
}
#[inline(always)]
pub fn to_slru_block(self) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
Ok(match self.field1 {
0x01 => {
let kind = match self.field2 {
0x00 => SlruKind::Clog,
0x01 => SlruKind::MultiXactMembers,
0x02 => SlruKind::MultiXactOffsets,
_ => anyhow::bail!("unrecognized slru kind 0x{:02x}", self.field2),
};
let segno = self.field4;
let blknum = self.field6;
#[inline(always)]
pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
Ok(match key.field1 {
0x01 => {
let kind = match key.field2 {
0x00 => SlruKind::Clog,
0x01 => SlruKind::MultiXactMembers,
0x02 => SlruKind::MultiXactOffsets,
_ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
};
let segno = key.field4;
let blknum = key.field6;
(kind, segno, blknum)
}
_ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1),
})
}
(kind, segno, blknum)
}
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
})
}
#[inline(always)]
pub fn is_slru_block_key(self) -> bool {
self.field1 == 0x01 // SLRU-related
&& self.field3 == 0x00000001 // but not SlruDir
&& self.field6 != 0xffffffff // and not SlruSegSize
}
#[inline(always)]
pub fn is_slru_block_key(key: Key) -> bool {
key.field1 == 0x01 // SLRU-related
&& key.field3 == 0x00000001 // but not SlruDir
&& key.field6 != 0xffffffff // and not SlruSegSize
}
#[inline(always)]
pub fn is_rel_block_key(&self) -> bool {
self.field1 == 0x00 && self.field4 != 0 && self.field6 != 0xffffffff
}
#[inline(always)]
pub fn is_rel_block_key(key: &Key) -> bool {
key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff
}
/// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`.
#[inline(always)]
pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> {
Ok(match self.field1 {
0x00 => (
RelTag {
spcnode: self.field2,
dbnode: self.field3,
relnode: self.field4,
forknum: self.field5,
},
self.field6,
),
_ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1),
})
}
/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`.
#[inline(always)]
pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
Ok(match key.field1 {
0x00 => (
RelTag {
spcnode: key.field2,
dbnode: key.field3,
relnode: key.field4,
forknum: key.field5,
},
key.field6,
),
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
})
}
impl std::str::FromStr for Key {
@@ -745,15 +687,10 @@ mod tests {
let mut metadata_key = vec![AUX_KEY_PREFIX];
metadata_key.extend_from_slice(&[0xFF; 15]);
let encoded_key = Key::from_metadata_key(&metadata_key);
let output_key = encoded_key.to_i128().to_be_bytes();
let mut output_key = Vec::new();
encoded_key.extract_metadata_key_to_writer(&mut output_key);
assert_eq!(metadata_key, output_key);
assert!(encoded_key.is_metadata_key());
assert!(is_metadata_key_slice(&metadata_key));
}
#[test]
fn test_possible_largest_key() {
Key::from_i128(0x7FFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF);
// TODO: put this key into the system and see if anything breaks.
}
}

View File

@@ -9,6 +9,7 @@ use std::{
collections::HashMap,
io::{BufRead, Read},
num::{NonZeroU64, NonZeroUsize},
str::FromStr,
sync::atomic::AtomicUsize,
time::{Duration, SystemTime},
};
@@ -161,22 +162,6 @@ impl std::fmt::Debug for TenantState {
}
}
/// A temporary lease to a specific lsn inside a timeline.
/// Access to the lsn is guaranteed by the pageserver until the expiration indicated by `valid_until`.
#[serde_as]
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct LsnLease {
#[serde_as(as = "SystemTimeAsRfc3339Millis")]
pub valid_until: SystemTime,
}
serde_with::serde_conv!(
SystemTimeAsRfc3339Millis,
SystemTime,
|time: &SystemTime| humantime::format_rfc3339_millis(*time).to_string(),
|value: String| -> Result<_, humantime::TimestampError> { humantime::parse_rfc3339(&value) }
);
/// The only [`TenantState`] variants we could be `TenantState::Activating` from.
#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum ActivatingFrom {
@@ -305,7 +290,7 @@ pub struct TenantConfig {
pub compaction_period: Option<String>,
pub compaction_threshold: Option<usize>,
// defer parsing compaction_algorithm, like eviction_policy
pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
pub compaction_algorithm: Option<CompactionAlgorithm>,
pub gc_horizon: Option<u64>,
pub gc_period: Option<String>,
pub image_creation_threshold: Option<usize>,
@@ -333,28 +318,14 @@ pub struct TenantConfig {
/// Unset -> V1
/// -> V2
/// -> CrossValidation -> V2
#[derive(
Eq,
PartialEq,
Debug,
Copy,
Clone,
strum_macros::EnumString,
strum_macros::Display,
serde_with::DeserializeFromStr,
serde_with::SerializeDisplay,
)]
#[strum(serialize_all = "kebab-case")]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum AuxFilePolicy {
/// V1 aux file policy: store everything in AUX_FILE_KEY
#[strum(ascii_case_insensitive)]
V1,
/// V2 aux file policy: store in the AUX_FILE keyspace
#[strum(ascii_case_insensitive)]
V2,
/// Cross validation runs both formats on the write path and does validation
/// on the read path.
#[strum(ascii_case_insensitive)]
CrossValidation,
}
@@ -420,6 +391,23 @@ impl AuxFilePolicy {
}
}
impl FromStr for AuxFilePolicy {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let s = s.to_lowercase();
if s == "v1" {
Ok(Self::V1)
} else if s == "v2" {
Ok(Self::V2)
} else if s == "crossvalidation" || s == "cross_validation" {
Ok(Self::CrossValidation)
} else {
anyhow::bail!("cannot parse {} to aux file policy", s)
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum EvictionPolicy {
@@ -438,28 +426,13 @@ impl EvictionPolicy {
}
}
#[derive(
Eq,
PartialEq,
Debug,
Copy,
Clone,
strum_macros::EnumString,
strum_macros::Display,
serde_with::DeserializeFromStr,
serde_with::SerializeDisplay,
)]
#[strum(serialize_all = "kebab-case")]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum CompactionAlgorithm {
Legacy,
Tiered,
}
#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
pub struct CompactionAlgorithmSettings {
pub kind: CompactionAlgorithm,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct EvictionPolicyLayerAccessThreshold {
#[serde(with = "humantime_serde")]
@@ -816,8 +789,6 @@ pub enum HistoricLayerInfo {
lsn_end: Lsn,
remote: bool,
access_stats: LayerAccessStats,
l0: bool,
},
Image {
layer_file_name: String,
@@ -1416,7 +1387,6 @@ impl PagestreamBeMessage {
#[cfg(test)]
mod tests {
use serde_json::json;
use std::str::FromStr;
use super::*;
@@ -1679,14 +1649,4 @@ mod tests {
AuxFilePolicy::V2
));
}
#[test]
fn test_aux_parse() {
assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2);
assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2);
assert_eq!(
AuxFilePolicy::from_str("cross-validation").unwrap(),
AuxFilePolicy::CrossValidation
);
}
}

View File

@@ -3,7 +3,7 @@ use std::cmp::Ordering;
use std::fmt;
use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID;
use postgres_ffi::relfile_utils::{forkname_to_number, forknumber_to_name, MAIN_FORKNUM};
use postgres_ffi::relfile_utils::forknumber_to_name;
use postgres_ffi::Oid;
///
@@ -68,57 +68,6 @@ impl fmt::Display for RelTag {
}
}
#[derive(Debug, thiserror::Error)]
pub enum ParseRelTagError {
#[error("invalid forknum")]
InvalidForknum(#[source] std::num::ParseIntError),
#[error("missing triplet member {}", .0)]
MissingTripletMember(usize),
#[error("invalid triplet member {}", .0)]
InvalidTripletMember(usize, #[source] std::num::ParseIntError),
}
impl std::str::FromStr for RelTag {
type Err = ParseRelTagError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
use ParseRelTagError::*;
// FIXME: in postgres logs this separator is dot
// Example:
// could not read block 2 in rel 1663/208101/2620.1 from page server at lsn 0/2431E6F0
// with a regex we could get this more painlessly
let (triplet, forknum) = match s.split_once('_').or_else(|| s.split_once('.')) {
Some((t, f)) => {
let forknum = forkname_to_number(Some(f));
let forknum = if let Ok(f) = forknum {
f
} else {
f.parse::<u8>().map_err(InvalidForknum)?
};
(t, Some(forknum))
}
None => (s, None),
};
let mut split = triplet
.splitn(3, '/')
.enumerate()
.map(|(i, s)| s.parse::<u32>().map_err(|e| InvalidTripletMember(i, e)));
let spcnode = split.next().ok_or(MissingTripletMember(0))??;
let dbnode = split.next().ok_or(MissingTripletMember(1))??;
let relnode = split.next().ok_or(MissingTripletMember(2))??;
Ok(RelTag {
spcnode,
forknum: forknum.unwrap_or(MAIN_FORKNUM),
dbnode,
relnode,
})
}
}
impl RelTag {
pub fn to_segfile_name(&self, segno: u32) -> String {
let mut name = if self.spcnode == GLOBALTABLESPACE_OID {

View File

@@ -1,6 +1,9 @@
use std::{ops::RangeInclusive, str::FromStr};
use crate::{key::Key, models::ShardParameters};
use crate::{
key::{is_rel_block_key, Key},
models::ShardParameters,
};
use hex::FromHex;
use postgres_ffi::relfile_utils::INIT_FORKNUM;
use serde::{Deserialize, Serialize};
@@ -425,12 +428,6 @@ impl<'de> Deserialize<'de> for TenantShardId {
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardStripeSize(pub u32);
impl Default for ShardStripeSize {
fn default() -> Self {
DEFAULT_STRIPE_SIZE
}
}
/// Layout version: for future upgrades where we might change how the key->shard mapping works
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardLayout(u8);
@@ -562,14 +559,6 @@ impl ShardIdentity {
}
}
/// Obtains the shard number and count combined into a `ShardIndex`.
pub fn shard_index(&self) -> ShardIndex {
ShardIndex {
shard_count: self.count,
shard_number: self.number,
}
}
pub fn shard_slug(&self) -> String {
if self.count > ShardCount(0) {
format!("-{:02x}{:02x}", self.number.0, self.count.0)
@@ -669,7 +658,7 @@ fn key_is_shard0(key: &Key) -> bool {
// because they must be included in basebackups.
let is_initfork = key.field5 == INIT_FORKNUM;
!key.is_rel_block_key() || is_initfork
!is_rel_block_key(key) || is_initfork
}
/// Provide the same result as the function in postgres `hashfn.h` with the same name
@@ -716,25 +705,6 @@ fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Ke
ShardNumber((hash % count.0 as u32) as u8)
}
/// For debugging, while not exposing the internals.
#[derive(Debug)]
#[allow(unused)] // used by debug formatting by pagectl
struct KeyShardingInfo {
shard0: bool,
shard_number: ShardNumber,
}
pub fn describe(
key: &Key,
shard_count: ShardCount,
stripe_size: ShardStripeSize,
) -> impl std::fmt::Debug {
KeyShardingInfo {
shard0: key_is_shard0(key),
shard_number: key_to_shard_number(shard_count, stripe_size, key),
}
}
#[cfg(test)]
mod tests {
use utils::Hex;

View File

@@ -178,13 +178,6 @@ impl PgConnectionConfig {
}
}
impl fmt::Display for PgConnectionConfig {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
// The password is intentionally hidden and not part of this display string.
write!(f, "postgresql://{}:{}", self.host, self.port)
}
}
impl fmt::Debug for PgConnectionConfig {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
// We want `password: Some(REDACTED-STRING)`, not `password: Some("REDACTED-STRING")`

View File

@@ -126,7 +126,6 @@ fn main() -> anyhow::Result<()> {
.allowlist_type("PageHeaderData")
.allowlist_type("DBState")
.allowlist_type("RelMapFile")
.allowlist_type("RepOriginId")
// Because structs are used for serialization, tell bindgen to emit
// explicit padding fields.
.explicit_padding(true)

View File

@@ -110,7 +110,6 @@ pub mod pg_constants;
pub mod relfile_utils;
// Export some widely used datatypes that are unlikely to change across Postgres versions
pub use v14::bindings::RepOriginId;
pub use v14::bindings::{uint32, uint64, Oid};
pub use v14::bindings::{BlockNumber, OffsetNumber};
pub use v14::bindings::{MultiXactId, TransactionId};

View File

@@ -102,7 +102,7 @@ pub const XACT_XINFO_HAS_SUBXACTS: u32 = 1u32 << 1;
pub const XACT_XINFO_HAS_RELFILENODES: u32 = 1u32 << 2;
pub const XACT_XINFO_HAS_INVALS: u32 = 1u32 << 3;
pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4;
pub const XACT_XINFO_HAS_ORIGIN: u32 = 1u32 << 5;
// pub const XACT_XINFO_HAS_ORIGIN: u32 = 1u32 << 5;
// pub const XACT_XINFO_HAS_AE_LOCKS: u32 = 1u32 << 6;
// pub const XACT_XINFO_HAS_GID: u32 = 1u32 << 7;
@@ -167,7 +167,6 @@ pub const RM_RELMAP_ID: u8 = 7;
pub const RM_STANDBY_ID: u8 = 8;
pub const RM_HEAP2_ID: u8 = 9;
pub const RM_HEAP_ID: u8 = 10;
pub const RM_REPLORIGIN_ID: u8 = 19;
pub const RM_LOGICALMSG_ID: u8 = 21;
// from neon_rmgr.h
@@ -224,10 +223,6 @@ pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
pub const XLP_LONG_HEADER: u16 = 0x0002;
/* From xlog.h */
pub const XLOG_REPLORIGIN_SET: u8 = 0x00;
pub const XLOG_REPLORIGIN_DROP: u8 = 0x10;
/* From replication/slot.h */
pub const REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN: usize = 4*4 /* offset of `slotdata` in ReplicationSlotOnDisk */
+ 64 /* NameData */ + 4*4;
@@ -242,9 +237,6 @@ pub const SLOTS_PER_FSM_PAGE: u32 = FSM_LEAF_NODES_PER_PAGE as u32;
pub const VM_HEAPBLOCKS_PER_PAGE: u32 =
(BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA) as u32 * (8 / 2); // MAPSIZE * (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
/* From origin.c */
pub const REPLICATION_STATE_MAGIC: u32 = 0x1257DADE;
// List of subdirectories inside pgdata.
// Copied from src/bin/initdb/initdb.c
pub const PGDATA_SUBDIRS: [&str; 22] = [

View File

@@ -373,29 +373,31 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
"SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
&[&(repeats as i32)],
)?;
info!(
"current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
client.pg_current_wal_insert_lsn()?,
XLOG_SIZE_OF_XLOG_RECORD
);
// Emit the XLOG_SWITCH
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
let xlog_switch_record_end: PgLsn =
client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
if u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
!= XLOG_SIZE_OF_XLOG_SHORT_PHD
{
warn!(
"XLOG_SWITCH message ended not on page boundary: {}, offset = {}, repeating",
xlog_switch_record_end,
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
);
continue;
}
return Ok(vec![before_xlog_switch, xlog_switch_record_end]);
break;
}
info!(
"current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
client.pg_current_wal_insert_lsn()?,
XLOG_SIZE_OF_XLOG_RECORD
);
// Emit the XLOG_SWITCH
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
let next_segment = PgLsn::from(0x0200_0000);
ensure!(
xlog_switch_record_end < next_segment,
"XLOG_SWITCH record ended on or after the expected segment boundary: {} > {}",
xlog_switch_record_end,
next_segment
);
ensure!(
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
"XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
xlog_switch_record_end,
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
);
Ok(vec![before_xlog_switch, xlog_switch_record_end])
}
}

View File

@@ -7,7 +7,6 @@ license.workspace = true
[dependencies]
bytes.workspace = true
byteorder.workspace = true
itertools.workspace = true
pin-project-lite.workspace = true
postgres-protocol.workspace = true
rand.workspace = true

View File

@@ -7,9 +7,8 @@ pub mod framed;
use byteorder::{BigEndian, ReadBytesExt};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use itertools::Itertools;
use serde::{Deserialize, Serialize};
use std::{borrow::Cow, fmt, io, str};
use std::{borrow::Cow, collections::HashMap, fmt, io, str};
// re-export for use in utils pageserver_feedback.rs
pub use postgres_protocol::PG_EPOCH;
@@ -51,37 +50,15 @@ pub enum FeStartupPacket {
},
}
#[derive(Debug, Clone, Default)]
pub struct StartupMessageParamsBuilder {
params: BytesMut,
}
impl StartupMessageParamsBuilder {
/// Set parameter's value by its name.
/// name and value must not contain a \0 byte
pub fn insert(&mut self, name: &str, value: &str) {
self.params.put(name.as_bytes());
self.params.put(&b"\0"[..]);
self.params.put(value.as_bytes());
self.params.put(&b"\0"[..]);
}
pub fn freeze(self) -> StartupMessageParams {
StartupMessageParams {
params: self.params.freeze(),
}
}
}
#[derive(Debug, Clone, Default)]
#[derive(Debug)]
pub struct StartupMessageParams {
params: Bytes,
params: HashMap<String, String>,
}
impl StartupMessageParams {
/// Get parameter's value by its name.
pub fn get(&self, name: &str) -> Option<&str> {
self.iter().find_map(|(k, v)| (k == name).then_some(v))
self.params.get(name).map(|s| s.as_str())
}
/// Split command-line options according to PostgreSQL's logic,
@@ -135,19 +112,15 @@ impl StartupMessageParams {
/// Iterate through key-value pairs in an arbitrary order.
pub fn iter(&self) -> impl Iterator<Item = (&str, &str)> {
let params =
std::str::from_utf8(&self.params).expect("should be validated as utf8 already");
params.split_terminator('\0').tuples()
self.params.iter().map(|(k, v)| (k.as_str(), v.as_str()))
}
// This function is mostly useful in tests.
#[doc(hidden)]
pub fn new<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> Self {
let mut b = StartupMessageParamsBuilder::default();
for (k, v) in pairs {
b.insert(k, v)
Self {
params: pairs.map(|(k, v)| (k.to_owned(), v.to_owned())).into(),
}
b.freeze()
}
}
@@ -372,21 +345,35 @@ impl FeStartupPacket {
(major_version, minor_version) => {
// StartupMessage
let s = str::from_utf8(&msg).map_err(|_e| {
ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned())
})?;
let s = s.strip_suffix('\0').ok_or_else(|| {
ProtocolError::Protocol(
"StartupMessage params: missing null terminator".to_string(),
)
})?;
// Parse pairs of null-terminated strings (key, value).
// See `postgres: ProcessStartupPacket, build_startup_packet`.
let mut tokens = str::from_utf8(&msg)
.map_err(|_e| {
ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned())
})?
.strip_suffix('\0') // drop packet's own null
.ok_or_else(|| {
ProtocolError::Protocol(
"StartupMessage params: missing null terminator".to_string(),
)
})?
.split_terminator('\0');
let mut params = HashMap::new();
while let Some(name) = tokens.next() {
let value = tokens.next().ok_or_else(|| {
ProtocolError::Protocol(
"StartupMessage params: key without value".to_string(),
)
})?;
params.insert(name.to_owned(), value.to_owned());
}
FeStartupPacket::StartupMessage {
major_version,
minor_version,
params: StartupMessageParams {
params: msg.slice_ref(s.as_bytes()),
},
params: StartupMessageParams { params },
}
}
};

View File

@@ -26,14 +26,14 @@ use futures::stream::Stream;
use futures_util::StreamExt;
use futures_util::TryStreamExt;
use http_types::{StatusCode, Url};
use scopeguard::ScopeGuard;
use tokio_util::sync::CancellationToken;
use tracing::debug;
use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind};
use crate::RemoteStorageActivity;
use crate::{
error::Cancelled, AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing,
ListingMode, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel,
error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download,
DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
TimeTravelError, TimeoutOrCancel,
};
pub struct AzureBlobStorage {
@@ -138,8 +138,6 @@ impl AzureBlobStorage {
let mut last_modified = None;
let mut metadata = HashMap::new();
let started_at = start_measuring_requests(kind);
let download = async {
let response = builder
// convert to concrete Pageable
@@ -203,22 +201,13 @@ impl AzureBlobStorage {
})
};
let download = tokio::select! {
tokio::select! {
bufs = download => bufs,
cancel_or_timeout = cancel_or_timeout => match cancel_or_timeout {
TimeoutOrCancel::Timeout => return Err(DownloadError::Timeout),
TimeoutOrCancel::Cancel => return Err(DownloadError::Cancelled),
TimeoutOrCancel::Timeout => Err(DownloadError::Timeout),
TimeoutOrCancel::Cancel => Err(DownloadError::Cancelled),
},
};
let started_at = ScopeGuard::into_inner(started_at);
let outcome = match &download {
Ok(_) => AttemptOutcome::Ok,
Err(_) => AttemptOutcome::Err,
};
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, outcome, started_at);
download
}
}
async fn permit(
@@ -352,10 +341,7 @@ impl RemoteStorage for AzureBlobStorage {
metadata: Option<StorageMetadata>,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let kind = RequestKind::Put;
let _permit = self.permit(kind, cancel).await?;
let started_at = start_measuring_requests(kind);
let _permit = self.permit(RequestKind::Put, cancel).await?;
let op = async {
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
@@ -379,25 +365,14 @@ impl RemoteStorage for AzureBlobStorage {
match fut.await {
Ok(Ok(_response)) => Ok(()),
Ok(Err(azure)) => Err(azure.into()),
Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()),
Err(_timeout) => Err(TimeoutOrCancel::Cancel.into()),
}
};
let res = tokio::select! {
tokio::select! {
res = op => res,
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
};
let outcome = match res {
Ok(_) => AttemptOutcome::Ok,
Err(_) => AttemptOutcome::Err,
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, outcome, started_at);
res
_ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()),
}
}
async fn download(
@@ -443,13 +418,12 @@ impl RemoteStorage for AzureBlobStorage {
paths: &'a [RemotePath],
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let kind = RequestKind::Delete;
let _permit = self.permit(kind, cancel).await?;
let started_at = start_measuring_requests(kind);
let _permit = self.permit(RequestKind::Delete, cancel).await?;
let op = async {
// TODO batch requests are not supported by the SDK
// TODO batch requests are also not supported by the SDK
// https://github.com/Azure/azure-sdk-for-rust/issues/1068
// https://github.com/Azure/azure-sdk-for-rust/issues/1249
for path in paths {
let blob_client = self.client.blob_client(self.relative_path_to_name(path));
@@ -474,16 +448,10 @@ impl RemoteStorage for AzureBlobStorage {
Ok(())
};
let res = tokio::select! {
tokio::select! {
res = op => res,
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &res, started_at);
res
_ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()),
}
}
async fn copy(
@@ -492,9 +460,7 @@ impl RemoteStorage for AzureBlobStorage {
to: &RemotePath,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let kind = RequestKind::Copy;
let _permit = self.permit(kind, cancel).await?;
let started_at = start_measuring_requests(kind);
let _permit = self.permit(RequestKind::Copy, cancel).await?;
let timeout = tokio::time::sleep(self.timeout);
@@ -538,21 +504,15 @@ impl RemoteStorage for AzureBlobStorage {
}
};
let res = tokio::select! {
tokio::select! {
res = op => res,
_ = cancel.cancelled() => return Err(anyhow::Error::new(TimeoutOrCancel::Cancel)),
_ = cancel.cancelled() => Err(anyhow::Error::new(TimeoutOrCancel::Cancel)),
_ = timeout => {
let e = anyhow::Error::new(TimeoutOrCancel::Timeout);
let e = e.context(format!("Timeout, last status: {copy_status:?}"));
Err(e)
},
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &res, started_at);
res
}
}
async fn time_travel_recover(
@@ -566,6 +526,10 @@ impl RemoteStorage for AzureBlobStorage {
// https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
Err(TimeTravelError::Unimplemented)
}
fn activity(&self) -> RemoteStorageActivity {
self.concurrency_limiter.activity()
}
}
pin_project_lite::pin_project! {

View File

@@ -12,7 +12,6 @@
mod azure_blob;
mod error;
mod local_fs;
mod metrics;
mod s3_bucket;
mod simulate_failures;
mod support;
@@ -122,8 +121,8 @@ impl RemotePath {
self.0.file_name()
}
pub fn join(&self, path: impl AsRef<Utf8Path>) -> Self {
Self(self.0.join(path))
pub fn join(&self, segment: &Utf8Path) -> Self {
Self(self.0.join(segment))
}
pub fn get_path(&self) -> &Utf8PathBuf {
@@ -264,6 +263,17 @@ pub trait RemoteStorage: Send + Sync + 'static {
done_if_after: SystemTime,
cancel: &CancellationToken,
) -> Result<(), TimeTravelError>;
/// Query how busy we currently are: may be used by callers which wish to politely
/// back off if there are already a lot of operations underway.
fn activity(&self) -> RemoteStorageActivity;
}
pub struct RemoteStorageActivity {
pub read_available: usize,
pub read_total: usize,
pub write_available: usize,
pub write_total: usize,
}
/// DownloadStream is sensitive to the timeout and cancellation used with the original
@@ -445,6 +455,15 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
}
}
}
pub fn activity(&self) -> RemoteStorageActivity {
match self {
Self::LocalFs(s) => s.activity(),
Self::AwsS3(s) => s.activity(),
Self::AzureBlob(s) => s.activity(),
Self::Unreliable(s) => s.activity(),
}
}
}
impl GenericRemoteStorage {
@@ -775,6 +794,9 @@ struct ConcurrencyLimiter {
// The helps to ensure we don't exceed the thresholds.
write: Arc<Semaphore>,
read: Arc<Semaphore>,
write_total: usize,
read_total: usize,
}
impl ConcurrencyLimiter {
@@ -803,10 +825,21 @@ impl ConcurrencyLimiter {
Arc::clone(self.for_kind(kind)).acquire_owned().await
}
fn activity(&self) -> RemoteStorageActivity {
RemoteStorageActivity {
read_available: self.read.available_permits(),
read_total: self.read_total,
write_available: self.write.available_permits(),
write_total: self.write_total,
}
}
fn new(limit: usize) -> ConcurrencyLimiter {
Self {
read: Arc::new(Semaphore::new(limit)),
write: Arc::new(Semaphore::new(limit)),
read_total: limit,
write_total: limit,
}
}
}

View File

@@ -23,8 +23,8 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken};
use utils::crashsafe::path_with_suffix_extension;
use crate::{
Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
REMOTE_STORAGE_PREFIX_SEPARATOR,
Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorageActivity,
TimeTravelError, TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR,
};
use super::{RemoteStorage, StorageMetadata};
@@ -605,6 +605,16 @@ impl RemoteStorage for LocalFs {
) -> Result<(), TimeTravelError> {
Err(TimeTravelError::Unimplemented)
}
fn activity(&self) -> RemoteStorageActivity {
// LocalFS has no concurrency limiting: give callers the impression that plenty of units are available
RemoteStorageActivity {
read_available: 16,
read_total: 16,
write_available: 16,
write_total: 16,
}
}
}
fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {

View File

@@ -46,16 +46,15 @@ use utils::backoff;
use super::StorageMetadata;
use crate::{
error::Cancelled,
metrics::{start_counting_cancelled_wait, start_measuring_requests},
support::PermitCarrying,
ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
S3Config, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE,
REMOTE_STORAGE_PREFIX_SEPARATOR,
error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError,
Listing, ListingMode, RemotePath, RemoteStorage, RemoteStorageActivity, S3Config,
TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
};
use crate::metrics::AttemptOutcome;
pub(super) use crate::metrics::RequestKind;
pub(super) mod metrics;
use self::metrics::AttemptOutcome;
pub(super) use self::metrics::RequestKind;
/// AWS S3 storage.
pub struct S3Bucket {
@@ -228,7 +227,7 @@ impl S3Bucket {
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
metrics::BUCKET_METRICS
.wait_seconds
.observe_elapsed(kind, started_at);
@@ -249,7 +248,7 @@ impl S3Bucket {
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
metrics::BUCKET_METRICS
.wait_seconds
.observe_elapsed(kind, started_at);
Ok(permit)
@@ -288,7 +287,7 @@ impl S3Bucket {
// Count this in the AttemptOutcome::Ok bucket, because 404 is not
// an error: we expect to sometimes fetch an object and find it missing,
// e.g. when probing for timeline indices.
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Ok,
started_at,
@@ -296,7 +295,7 @@ impl S3Bucket {
return Err(DownloadError::NotFound);
}
Err(e) => {
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Err,
started_at,
@@ -372,12 +371,12 @@ impl S3Bucket {
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &resp, started_at);
let resp = resp.context("request deletion")?;
crate::metrics::BUCKET_METRICS
metrics::BUCKET_METRICS
.deleted_objects_total
.inc_by(chunk.len() as u64);
@@ -436,14 +435,14 @@ pin_project_lite::pin_project! {
/// Times and tracks the outcome of the request.
struct TimedDownload<S> {
started_at: std::time::Instant,
outcome: AttemptOutcome,
outcome: metrics::AttemptOutcome,
#[pin]
inner: S
}
impl<S> PinnedDrop for TimedDownload<S> {
fn drop(mut this: Pin<&mut Self>) {
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
}
}
}
@@ -452,7 +451,7 @@ impl<S> TimedDownload<S> {
fn new(started_at: std::time::Instant, inner: S) -> Self {
TimedDownload {
started_at,
outcome: AttemptOutcome::Cancelled,
outcome: metrics::AttemptOutcome::Cancelled,
inner,
}
}
@@ -469,8 +468,8 @@ impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
let res = ready!(this.inner.poll_next(cx));
match &res {
Some(Ok(_)) => {}
Some(Err(_)) => *this.outcome = AttemptOutcome::Err,
None => *this.outcome = AttemptOutcome::Ok,
Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err,
None => *this.outcome = metrics::AttemptOutcome::Ok,
}
Poll::Ready(res)
@@ -544,7 +543,7 @@ impl RemoteStorage for S3Bucket {
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &response, started_at);
@@ -626,7 +625,7 @@ impl RemoteStorage for S3Bucket {
if let Ok(inner) = &res {
// do not incl. timeouts as errors in metrics but cancellations
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, inner, started_at);
}
@@ -674,7 +673,7 @@ impl RemoteStorage for S3Bucket {
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &res, started_at);
@@ -976,6 +975,32 @@ impl RemoteStorage for S3Bucket {
}
Ok(())
}
fn activity(&self) -> RemoteStorageActivity {
self.concurrency_limiter.activity()
}
}
/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
fn start_counting_cancelled_wait(
kind: RequestKind,
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
metrics::BUCKET_METRICS.cancelled_waits.get(kind).inc()
})
}
/// On drop (cancellation) add time to [`metrics::BucketMetrics::req_seconds`].
fn start_measuring_requests(
kind: RequestKind,
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Cancelled,
started_at,
)
})
}
// Save RAM and only store the needed data instead of the entire ObjectVersion/DeleteMarkerEntry

View File

@@ -15,7 +15,6 @@ pub(crate) enum RequestKind {
TimeTravel = 5,
}
use scopeguard::ScopeGuard;
use RequestKind::*;
impl RequestKind {
@@ -34,10 +33,10 @@ impl RequestKind {
}
}
pub(crate) struct RequestTyped<C>([C; 6]);
pub(super) struct RequestTyped<C>([C; 6]);
impl<C> RequestTyped<C> {
pub(crate) fn get(&self, kind: RequestKind) -> &C {
pub(super) fn get(&self, kind: RequestKind) -> &C {
&self.0[kind.as_index()]
}
@@ -59,19 +58,19 @@ impl<C> RequestTyped<C> {
}
impl RequestTyped<Histogram> {
pub(crate) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
pub(super) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
self.get(kind).observe(started_at.elapsed().as_secs_f64())
}
}
pub(crate) struct PassFailCancelledRequestTyped<C> {
pub(super) struct PassFailCancelledRequestTyped<C> {
success: RequestTyped<C>,
fail: RequestTyped<C>,
cancelled: RequestTyped<C>,
}
#[derive(Debug, Clone, Copy)]
pub(crate) enum AttemptOutcome {
pub(super) enum AttemptOutcome {
Ok,
Err,
Cancelled,
@@ -87,7 +86,7 @@ impl<T, E> From<&Result<T, E>> for AttemptOutcome {
}
impl AttemptOutcome {
pub(crate) fn as_str(&self) -> &'static str {
pub(super) fn as_str(&self) -> &'static str {
match self {
AttemptOutcome::Ok => "ok",
AttemptOutcome::Err => "err",
@@ -97,7 +96,7 @@ impl AttemptOutcome {
}
impl<C> PassFailCancelledRequestTyped<C> {
pub(crate) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
pub(super) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
let target = match outcome {
AttemptOutcome::Ok => &self.success,
AttemptOutcome::Err => &self.fail,
@@ -120,7 +119,7 @@ impl<C> PassFailCancelledRequestTyped<C> {
}
impl PassFailCancelledRequestTyped<Histogram> {
pub(crate) fn observe_elapsed(
pub(super) fn observe_elapsed(
&self,
kind: RequestKind,
outcome: impl Into<AttemptOutcome>,
@@ -131,44 +130,19 @@ impl PassFailCancelledRequestTyped<Histogram> {
}
}
/// On drop (cancellation) count towards [`BucketMetrics::cancelled_waits`].
pub(crate) fn start_counting_cancelled_wait(
kind: RequestKind,
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
crate::metrics::BUCKET_METRICS
.cancelled_waits
.get(kind)
.inc()
})
}
/// On drop (cancellation) add time to [`BucketMetrics::req_seconds`].
pub(crate) fn start_measuring_requests(
kind: RequestKind,
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Cancelled,
started_at,
)
})
}
pub(crate) struct BucketMetrics {
pub(super) struct BucketMetrics {
/// Full request duration until successful completion, error or cancellation.
pub(crate) req_seconds: PassFailCancelledRequestTyped<Histogram>,
pub(super) req_seconds: PassFailCancelledRequestTyped<Histogram>,
/// Total amount of seconds waited on queue.
pub(crate) wait_seconds: RequestTyped<Histogram>,
pub(super) wait_seconds: RequestTyped<Histogram>,
/// Track how many semaphore awaits were cancelled per request type.
///
/// This is in case cancellations are happening more than expected.
pub(crate) cancelled_waits: RequestTyped<IntCounter>,
pub(super) cancelled_waits: RequestTyped<IntCounter>,
/// Total amount of deleted objects in batches or single requests.
pub(crate) deleted_objects_total: IntCounter,
pub(super) deleted_objects_total: IntCounter,
}
impl Default for BucketMetrics {

View File

@@ -12,7 +12,7 @@ use tokio_util::sync::CancellationToken;
use crate::{
Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
StorageMetadata, TimeTravelError,
RemoteStorageActivity, StorageMetadata, TimeTravelError,
};
pub struct UnreliableWrapper {
@@ -213,4 +213,8 @@ impl RemoteStorage for UnreliableWrapper {
.time_travel_recover(prefix, timestamp, done_if_after, cancel)
.await
}
fn activity(&self) -> RemoteStorageActivity {
self.inner.activity()
}
}

View File

@@ -78,10 +78,6 @@ where
let e = Err(std::io::Error::from(e));
return Poll::Ready(Some(e));
}
} else {
// this would be perfectly valid behaviour for doing a graceful completion on the
// download for example, but not one we expect to do right now.
tracing::warn!("continuing polling after having cancelled or timeouted");
}
this.inner.poll_next(cx)
@@ -93,22 +89,13 @@ where
}
/// Fires only on the first cancel or timeout, not on both.
pub(crate) fn cancel_or_timeout(
pub(crate) async fn cancel_or_timeout(
timeout: Duration,
cancel: CancellationToken,
) -> impl std::future::Future<Output = TimeoutOrCancel> + 'static {
// futures are lazy, they don't do anything before being polled.
//
// "precalculate" the wanted deadline before returning the future, so that we can use pause
// failpoint to trigger a timeout in test.
let deadline = tokio::time::Instant::now() + timeout;
async move {
tokio::select! {
_ = tokio::time::sleep_until(deadline) => TimeoutOrCancel::Timeout,
_ = cancel.cancelled() => {
TimeoutOrCancel::Cancel
},
}
) -> TimeoutOrCancel {
tokio::select! {
_ = tokio::time::sleep(timeout) => TimeoutOrCancel::Timeout,
_ = cancel.cancelled() => TimeoutOrCancel::Cancel,
}
}
@@ -185,31 +172,4 @@ mod tests {
_ = tokio::time::sleep(Duration::from_secs(121)) => {},
}
}
#[tokio::test]
async fn notified_but_pollable_after() {
let inner = futures::stream::once(futures::future::ready(Ok(bytes::Bytes::from_static(
b"hello world",
))));
let timeout = Duration::from_secs(120);
let cancel = CancellationToken::new();
cancel.cancel();
let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner);
let mut stream = std::pin::pin!(stream);
let next = stream.next().await;
let ioe = next.unwrap().unwrap_err();
assert!(
matches!(
ioe.get_ref().unwrap().downcast_ref::<DownloadError>(),
Some(&DownloadError::Cancelled)
),
"{ioe:?}"
);
let next = stream.next().await;
let bytes = next.unwrap().unwrap();
assert_eq!(&b"hello world"[..], bytes);
}
}

View File

@@ -50,9 +50,6 @@ pub struct SkTimelineInfo {
pub safekeeper_connstr: Option<String>,
#[serde(default)]
pub http_connstr: Option<String>,
// Minimum of all active RO replicas flush LSN
#[serde(default = "lsn_invalid")]
pub standby_horizon: Lsn,
}
#[derive(Debug, Clone, Deserialize, Serialize)]

View File

@@ -9,33 +9,6 @@ use serde::{Deserialize, Serialize};
use tokio_util::sync::CancellationToken;
use tracing::*;
/// Declare a failpoint that can use the `pause` failpoint action.
/// We don't want to block the executor thread, hence, spawn_blocking + await.
#[macro_export]
macro_rules! pausable_failpoint {
($name:literal) => {
if cfg!(feature = "testing") {
tokio::task::spawn_blocking({
let current = tracing::Span::current();
move || {
let _entered = current.entered();
tracing::info!("at failpoint {}", $name);
fail::fail_point!($name);
}
})
.await
.expect("spawn_blocking");
}
};
($name:literal, $cond:expr) => {
if cfg!(feature = "testing") {
if $cond {
pausable_failpoint!($name)
}
}
};
}
/// use with fail::cfg("$name", "return(2000)")
///
/// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the

View File

@@ -3,9 +3,6 @@ use std::{fs, io, path::Path};
use anyhow::Context;
mod rename_noreplace;
pub use rename_noreplace::rename_noreplace;
pub trait PathExt {
/// Returns an error if `self` is not a directory.
fn is_empty_dir(&self) -> io::Result<bool>;

View File

@@ -1,109 +0,0 @@
use nix::NixPath;
/// Rename a file without replacing an existing file.
///
/// This is a wrapper around platform-specific APIs.
pub fn rename_noreplace<P1: ?Sized + NixPath, P2: ?Sized + NixPath>(
src: &P1,
dst: &P2,
) -> nix::Result<()> {
{
#[cfg(target_os = "linux")]
{
nix::fcntl::renameat2(
None,
src,
None,
dst,
nix::fcntl::RenameFlags::RENAME_NOREPLACE,
)
}
#[cfg(target_os = "macos")]
{
let res = src.with_nix_path(|src| {
dst.with_nix_path(|dst|
// SAFETY: `src` and `dst` are valid C strings as per the NixPath trait and they outlive the call to renamex_np.
unsafe {
nix::libc::renamex_np(src.as_ptr(), dst.as_ptr(), nix::libc::RENAME_EXCL)
})
})??;
nix::errno::Errno::result(res).map(drop)
}
#[cfg(not(any(target_os = "linux", target_os = "macos")))]
{
std::compile_error!("OS does not support no-replace renames");
}
}
}
#[cfg(test)]
mod test {
use std::{fs, path::PathBuf};
use super::*;
fn testdir() -> camino_tempfile::Utf8TempDir {
match crate::env::var("NEON_UTILS_RENAME_NOREPLACE_TESTDIR") {
Some(path) => {
let path: camino::Utf8PathBuf = path;
camino_tempfile::tempdir_in(path).unwrap()
}
None => camino_tempfile::tempdir().unwrap(),
}
}
#[test]
fn test_absolute_paths() {
let testdir = testdir();
println!("testdir: {}", testdir.path());
let src = testdir.path().join("src");
let dst = testdir.path().join("dst");
fs::write(&src, b"").unwrap();
fs::write(&dst, b"").unwrap();
let src = src.canonicalize().unwrap();
assert!(src.is_absolute());
let dst = dst.canonicalize().unwrap();
assert!(dst.is_absolute());
let result = rename_noreplace(&src, &dst);
assert_eq!(result.unwrap_err(), nix::Error::EEXIST);
}
#[test]
fn test_relative_paths() {
let testdir = testdir();
println!("testdir: {}", testdir.path());
// this is fine because we run in nextest => process per test
std::env::set_current_dir(testdir.path()).unwrap();
let src = PathBuf::from("src");
let dst = PathBuf::from("dst");
fs::write(&src, b"").unwrap();
fs::write(&dst, b"").unwrap();
let result = rename_noreplace(&src, &dst);
assert_eq!(result.unwrap_err(), nix::Error::EEXIST);
}
#[test]
fn test_works_when_not_exists() {
let testdir = testdir();
println!("testdir: {}", testdir.path());
let src = testdir.path().join("src");
let dst = testdir.path().join("dst");
fs::write(&src, b"content").unwrap();
rename_noreplace(src.as_std_path(), dst.as_std_path()).unwrap();
assert_eq!(
"content",
String::from_utf8(std::fs::read(&dst).unwrap()).unwrap()
);
}
}

View File

@@ -19,13 +19,13 @@
/// // right: [0x68; 1]
/// # fn serialize_something() -> Vec<u8> { "hello world".as_bytes().to_vec() }
/// ```
pub struct Hex<S>(pub S);
#[derive(PartialEq)]
pub struct Hex<'a>(pub &'a [u8]);
impl<S: AsRef<[u8]>> std::fmt::Debug for Hex<S> {
impl std::fmt::Debug for Hex<'_> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "[")?;
let chunks = self.0.as_ref().chunks(16);
for (i, c) in chunks.enumerate() {
for (i, c) in self.0.chunks(16).enumerate() {
if i > 0 && !c.is_empty() {
writeln!(f, ", ")?;
}
@@ -36,15 +36,6 @@ impl<S: AsRef<[u8]>> std::fmt::Debug for Hex<S> {
write!(f, "0x{b:02x}")?;
}
}
write!(f, "; {}]", self.0.as_ref().len())
}
}
impl<R: AsRef<[u8]>, L: AsRef<[u8]>> PartialEq<Hex<R>> for Hex<L> {
fn eq(&self, other: &Hex<R>) -> bool {
let left = self.0.as_ref();
let right = other.0.as_ref();
left == right
write!(f, "; {}]", self.0.len())
}
}

View File

@@ -135,8 +135,7 @@ impl Gate {
let started_at = std::time::Instant::now();
let mut do_close = std::pin::pin!(self.do_close());
// with 1s we rarely saw anything, let's try if we get more gate closing reasons with 100ms
let nag_after = Duration::from_millis(100);
let nag_after = Duration::from_secs(1);
let Err(_timeout) = tokio::time::timeout(nag_after, &mut do_close).await else {
return;

View File

@@ -496,9 +496,9 @@ mod tests {
// TODO: When updating Postgres versions, this test will cause
// problems. Postgres version in message needs updating.
//
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160002, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
vec![
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,

View File

@@ -6,14 +6,10 @@ use futures::future::BoxFuture;
use futures::{Stream, StreamExt};
use itertools::Itertools;
use pageserver_api::shard::ShardIdentity;
use pin_project_lite::pin_project;
use std::collections::BinaryHeap;
use std::collections::VecDeque;
use std::collections::{binary_heap, BinaryHeap};
use std::fmt::Display;
use std::future::Future;
use std::ops::{DerefMut, Range};
use std::pin::Pin;
use std::task::{ready, Poll};
use std::ops::Range;
use utils::lsn::Lsn;
pub const PAGE_SZ: u64 = 8192;
@@ -85,33 +81,6 @@ pub fn intersect_keyspace<K: Ord + Clone + Copy>(
ranges
}
/// Create a stream that iterates through all DeltaEntrys among all input
/// layers, in key-lsn order.
///
/// This is public because the create_delta() implementation likely wants to use this too
/// TODO: move to a more shared place
pub fn merge_delta_keys<'a, E: CompactionJobExecutor>(
layers: &'a [E::DeltaLayer],
ctx: &'a E::RequestContext,
) -> MergeDeltaKeys<'a, E> {
// Use a binary heap to merge the layers. Each input layer is initially
// represented by a LazyLoadLayer::Unloaded element, which uses the start of
// the layer's key range as the key. The first time a layer reaches the top
// of the heap, all the keys of the layer are loaded into a sorted vector.
//
// This helps to keep the memory usage reasonable: we only need to hold in
// memory the DeltaEntrys of the layers that overlap with the "current" key.
let mut heap: BinaryHeap<LazyLoadLayer<'a, E>> = BinaryHeap::new();
for l in layers {
heap.push(LazyLoadLayer::Unloaded(l));
}
MergeDeltaKeys {
heap,
ctx,
load_future: None,
}
}
pub async fn merge_delta_keys_buffered<'a, E: CompactionJobExecutor + 'a>(
layers: &'a [E::DeltaLayer],
ctx: &'a E::RequestContext,
@@ -129,104 +98,139 @@ pub async fn merge_delta_keys_buffered<'a, E: CompactionJobExecutor + 'a>(
Ok(stream)
}
enum LazyLoadLayer<'a, E: CompactionJobExecutor> {
Loaded(VecDeque<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>),
Unloaded(&'a E::DeltaLayer),
/// Wrapper type to make `dl.load_keys`` compile.
type LoadFuture<'a, E> = BoxFuture<'a, anyhow::Result<Vec<E>>>;
pub enum LayerIterator<'a, E: CompactionJobExecutor> {
Loaded(
VecDeque<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>,
&'a E::RequestContext,
),
Unloaded(&'a E::DeltaLayer, &'a E::RequestContext),
}
impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> {
fn min_key(&self) -> E::Key {
impl<'a, E: CompactionJobExecutor + 'a> LayerIterator<'a, E> {
pub fn new(delta_layer: &'a E::DeltaLayer, ctx: &'a E::RequestContext) -> Self {
Self::Unloaded(delta_layer, ctx)
}
pub fn key_lsn(&self) -> (E::Key, Lsn) {
match self {
Self::Loaded(entries) => entries.front().unwrap().key(),
Self::Unloaded(dl) => dl.key_range().start,
Self::Unloaded(dl, _) => (dl.key_range().start, dl.lsn_range().start),
Self::Loaded(entries, _) => entries.front().map(|x| (x.key(), x.lsn())).unwrap(),
}
}
fn min_lsn(&self) -> Lsn {
async fn load(&mut self) -> anyhow::Result<()> {
match self {
Self::Loaded(entries) => entries.front().unwrap().lsn(),
Self::Unloaded(dl) => dl.lsn_range().start,
Self::Unloaded(dl, ctx) => {
let unloaded_key_lsn = (dl.key_range().start, dl.lsn_range().start);
let fut: LoadFuture<
'a,
<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>,
> = Box::pin(dl.load_keys(ctx));
let keys = VecDeque::from(fut.await?);
assert_eq!(
keys.front().as_ref().map(|x| (x.key(), x.lsn())).unwrap(),
unloaded_key_lsn,
"unmatched start key_lsn"
);
*self = Self::Loaded(keys, ctx);
Ok(())
}
Self::Loaded(_, _) => Ok(()),
}
}
pub async fn entry(
&mut self,
) -> anyhow::Result<&<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>> {
self.load().await?;
let Self::Loaded(x, _) = self else {
unreachable!()
};
Ok(x.front().unwrap())
}
pub async fn next(
&mut self,
) -> anyhow::Result<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>> {
self.load().await?; // requires Box::pin to make it compile
let Self::Loaded(x, _) = self else {
unreachable!()
};
Ok(x.pop_front().expect("already reached the end"))
}
pub fn is_end(&self) -> bool {
match self {
Self::Unloaded(_, _) => false,
Self::Loaded(x, _) => x.is_empty(),
}
}
}
impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
impl<'a, E: CompactionJobExecutor + 'a> PartialOrd for LayerIterator<'a, E> {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> {
impl<'a, E: CompactionJobExecutor + 'a> Ord for LayerIterator<'a, E> {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
// reverse order so that we get a min-heap
(other.min_key(), other.min_lsn()).cmp(&(self.min_key(), self.min_lsn()))
// reverse comparison to get a min-heap
other.key_lsn().cmp(&self.key_lsn())
}
}
impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> {
impl<'a, E: CompactionJobExecutor + 'a> PartialEq for LayerIterator<'a, E> {
fn eq(&self, other: &Self) -> bool {
self.cmp(other) == std::cmp::Ordering::Equal
}
}
impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {}
type LoadFuture<'a, E> = BoxFuture<'a, anyhow::Result<Vec<E>>>;
impl<'a, E: CompactionJobExecutor + 'a> Eq for LayerIterator<'a, E> {}
// Stream returned by `merge_delta_keys`
pin_project! {
#[allow(clippy::type_complexity)]
pub struct MergeDeltaKeys<'a, E: CompactionJobExecutor> {
heap: BinaryHeap<LazyLoadLayer<'a, E>>,
#[pin]
load_future: Option<LoadFuture<'a, <E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>>,
ctx: &'a E::RequestContext,
}
pub struct DeltaMergeIterator<'a, E: CompactionJobExecutor> {
heap: BinaryHeap<LayerIterator<'a, E>>,
}
impl<'a, E> Stream for MergeDeltaKeys<'a, E>
where
E: CompactionJobExecutor + 'a,
{
type Item = anyhow::Result<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>;
impl<'a, E: CompactionJobExecutor + 'a> DeltaMergeIterator<'a, E> {
pub fn new(delta_layers: &'a [E::DeltaLayer], ctx: &'a E::RequestContext) -> Self {
let mut heap = BinaryHeap::new();
for dl in delta_layers {
heap.push(LayerIterator::new(dl, ctx));
}
Self { heap }
}
fn poll_next(
self: Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
) -> Poll<std::option::Option<<Self as futures::Stream>::Item>> {
let mut this = self.project();
loop {
if let Some(mut load_future) = this.load_future.as_mut().as_pin_mut() {
// We are waiting for loading the keys to finish
match ready!(load_future.as_mut().poll(cx)) {
Ok(entries) => {
this.load_future.set(None);
*this.heap.peek_mut().unwrap() =
LazyLoadLayer::Loaded(VecDeque::from(entries));
}
Err(e) => {
return Poll::Ready(Some(Err(e)));
}
pub fn is_end(&self) -> bool {
self.heap.is_empty()
}
/// The next key-lsn entry that will be returned by `next`.
pub fn key_lsn(&self) -> (E::Key, Lsn) {
self.heap.peek().expect("already reached the end").key_lsn()
}
/// Move to the next entry and return the current entry.
pub async fn next(
&mut self,
) -> anyhow::Result<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>> {
let Some(mut top) = self.heap.peek_mut() else {
panic!("already reached the end")
};
match top.next().await {
Ok(entry) => {
if top.is_end() {
binary_heap::PeekMut::pop(top);
}
Ok(entry)
}
// If the topmost layer in the heap hasn't been loaded yet, start
// loading it. Otherwise return the next entry from it and update
// the layer's position in the heap (this decreaseKey operation is
// performed implicitly when `top` is dropped).
if let Some(mut top) = this.heap.peek_mut() {
match top.deref_mut() {
LazyLoadLayer::Unloaded(ref mut l) => {
let fut = l.load_keys(this.ctx);
this.load_future.set(Some(Box::pin(fut)));
continue;
}
LazyLoadLayer::Loaded(ref mut entries) => {
let result = entries.pop_front().unwrap();
if entries.is_empty() {
std::collections::binary_heap::PeekMut::pop(top);
}
return Poll::Ready(Some(Ok(result)));
}
}
} else {
return Poll::Ready(None);
Err(e) => {
// pop the item if there is an error, otherwise it might cause further panic when binary heap compares it after `PeekMut` gets dropped.
binary_heap::PeekMut::pop(top);
Err(e)
}
}
}

View File

@@ -92,7 +92,9 @@ pub trait CompactionJobExecutor {
) -> impl Future<Output = anyhow::Result<()>> + Send;
}
pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display {
pub trait CompactionKey:
std::cmp::Ord + Clone + Copy + std::fmt::Display + std::fmt::Debug
{
const MIN: Self;
const MAX: Self;

View File

@@ -2,7 +2,6 @@ mod draw;
use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
use futures::StreamExt;
use pageserver_api::shard::ShardIdentity;
use rand::Rng;
use tracing::info;
@@ -15,7 +14,8 @@ use std::sync::Arc;
use std::sync::Mutex;
use crate::helpers::PAGE_SZ;
use crate::helpers::{merge_delta_keys, overlaps_with};
use crate::helpers::overlaps_with;
use crate::helpers::DeltaMergeIterator;
use crate::interface;
use crate::interface::CompactionLayer;
@@ -380,8 +380,8 @@ impl interface::CompactionLayer<Key> for MockLayer {
}
fn file_size(&self) -> u64 {
match self {
MockLayer::Delta(this) => this.file_size,
MockLayer::Image(this) => this.file_size,
MockLayer::Delta(this) => this.file_size(),
MockLayer::Image(this) => this.file_size(),
}
}
fn short_id(&self) -> String {
@@ -545,12 +545,11 @@ impl interface::CompactionJobExecutor for MockTimeline {
input_layers: &[Arc<MockDeltaLayer>],
ctx: &MockRequestContext,
) -> anyhow::Result<()> {
let mut key_value_stream =
std::pin::pin!(merge_delta_keys::<MockTimeline>(input_layers, ctx));
let mut key_value_stream = DeltaMergeIterator::<MockTimeline>::new(input_layers, ctx);
let mut records: Vec<MockRecord> = Vec::new();
let mut total_len = 2;
while let Some(delta_entry) = key_value_stream.next().await {
let delta_entry: MockRecord = delta_entry?;
while !key_value_stream.is_end() {
let delta_entry: MockRecord = key_value_stream.next().await?;
if key_range.contains(&delta_entry.key) && lsn_range.contains(&delta_entry.lsn) {
total_len += delta_entry.len;
records.push(delta_entry);

View File

@@ -17,7 +17,6 @@ pageserver = { path = ".." }
pageserver_api.workspace = true
remote_storage = { path = "../../libs/remote_storage" }
postgres_ffi.workspace = true
thiserror.workspace = true
tokio.workspace = true
tokio-util.workspace = true
toml_edit.workspace = true

View File

@@ -2,55 +2,36 @@ use std::collections::HashMap;
use anyhow::Context;
use camino::Utf8PathBuf;
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
use pageserver::tenant::storage_layer::LayerName;
use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
use utils::lsn::Lsn;
#[derive(clap::Subcommand)]
pub(crate) enum IndexPartCmd {
Dump {
path: Utf8PathBuf,
#[clap(long, value_enum, default_value = "all-json")]
what: What,
},
}
#[derive(clap::ValueEnum, Debug, Clone, Copy)]
pub(crate) enum What {
AllJson,
TimelineMetadataDebugString,
Dump { path: Utf8PathBuf },
}
pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
match cmd {
IndexPartCmd::Dump { path, what } => {
IndexPartCmd::Dump { path } => {
let bytes = tokio::fs::read(path).await.context("read file")?;
let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
#[derive(serde::Serialize)]
struct Output<'a> {
layer_metadata: &'a HashMap<LayerName, LayerFileMetadata>,
layer_metadata: &'a HashMap<LayerName, IndexLayerMetadata>,
disk_consistent_lsn: Lsn,
timeline_metadata: &'a TimelineMetadata,
}
match what {
What::AllJson => {
let output = Output {
layer_metadata: &des.layer_metadata,
disk_consistent_lsn: des.metadata.disk_consistent_lsn(),
timeline_metadata: &des.metadata,
};
let output =
serde_json::to_string_pretty(&output).context("serialize output")?;
println!("{output}");
}
What::TimelineMetadataDebugString => {
println!("{:#?}", des.metadata);
}
}
let output = Output {
layer_metadata: &des.layer_metadata,
disk_consistent_lsn: des.get_disk_consistent_lsn(),
timeline_metadata: &des.metadata,
};
let output = serde_json::to_string_pretty(&output).context("serialize output")?;
println!("{output}");
Ok(())
}
}

View File

@@ -1,475 +0,0 @@
use anyhow::Context;
use clap::Parser;
use pageserver_api::{
key::Key,
reltag::{BlockNumber, RelTag, SlruKind},
shard::{ShardCount, ShardStripeSize},
};
use std::str::FromStr;
#[derive(Parser)]
pub(super) struct DescribeKeyCommand {
/// Key material in one of the forms: hex, span attributes captured from log, reltag blocknum
input: Vec<String>,
/// The number of shards to calculate what Keys placement would be.
#[arg(long)]
shard_count: Option<CustomShardCount>,
/// The sharding stripe size.
///
/// The default is hardcoded. It makes no sense to provide this without providing
/// `--shard-count`.
#[arg(long, requires = "shard_count")]
stripe_size: Option<u32>,
}
/// Sharded shard count without unsharded count, which the actual ShardCount supports.
#[derive(Clone, Copy)]
pub(super) struct CustomShardCount(std::num::NonZeroU8);
#[derive(Debug, thiserror::Error)]
pub(super) enum InvalidShardCount {
#[error(transparent)]
ParsingFailed(#[from] std::num::ParseIntError),
#[error("too few shards")]
TooFewShards,
}
impl FromStr for CustomShardCount {
type Err = InvalidShardCount;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let inner: std::num::NonZeroU8 = s.parse()?;
if inner.get() < 2 {
Err(InvalidShardCount::TooFewShards)
} else {
Ok(CustomShardCount(inner))
}
}
}
impl From<CustomShardCount> for ShardCount {
fn from(value: CustomShardCount) -> Self {
ShardCount::new(value.0.get())
}
}
impl DescribeKeyCommand {
pub(super) fn execute(self) {
let DescribeKeyCommand {
input,
shard_count,
stripe_size,
} = self;
let material = KeyMaterial::try_from(input.as_slice()).unwrap();
let kind = material.kind();
let key = Key::from(material);
println!("parsed from {kind}: {key}:");
println!();
println!("{key:?}");
macro_rules! kind_query {
([$($name:ident),*$(,)?]) => {{[$(kind_query!($name)),*]}};
($name:ident) => {{
let s: &'static str = stringify!($name);
let s = s.strip_prefix("is_").unwrap_or(s);
let s = s.strip_suffix("_key").unwrap_or(s);
#[allow(clippy::needless_borrow)]
(s, key.$name())
}};
}
// the current characterization is a mess of these boolean queries and separate
// "recognization". I think it accurately represents how strictly we model the Key
// right now, but could of course be made less confusing.
let queries = kind_query!([
is_rel_block_key,
is_rel_vm_block_key,
is_rel_fsm_block_key,
is_slru_block_key,
is_inherited_key,
is_rel_size_key,
is_slru_segment_size_key,
]);
let recognized_kind = "recognized kind";
let metadata_key = "metadata key";
let shard_placement = "shard placement";
let longest = queries
.iter()
.map(|t| t.0)
.chain([recognized_kind, metadata_key, shard_placement])
.map(|s| s.len())
.max()
.unwrap();
let colon = 1;
let padding = 1;
for (name, is) in queries {
let width = longest - name.len() + colon + padding;
println!("{}{:width$}{}", name, ":", is);
}
let width = longest - recognized_kind.len() + colon + padding;
println!(
"{}{:width$}{:?}",
recognized_kind,
":",
RecognizedKeyKind::new(key),
);
if let Some(shard_count) = shard_count {
// seeing the sharding placement might be confusing, so leave it out unless shard
// count was given.
let stripe_size = stripe_size.map(ShardStripeSize).unwrap_or_default();
println!(
"# placement with shard_count: {} and stripe_size: {}:",
shard_count.0, stripe_size.0
);
let width = longest - shard_placement.len() + colon + padding;
println!(
"{}{:width$}{:?}",
shard_placement,
":",
pageserver_api::shard::describe(&key, shard_count.into(), stripe_size)
);
}
}
}
/// Hand-wavy "inputs we accept" for a key.
#[derive(Debug)]
pub(super) enum KeyMaterial {
Hex(Key),
String(SpanAttributesFromLogs),
Split(RelTag, BlockNumber),
}
impl KeyMaterial {
fn kind(&self) -> &'static str {
match self {
KeyMaterial::Hex(_) => "hex",
KeyMaterial::String(_) | KeyMaterial::Split(_, _) => "split",
}
}
}
impl From<KeyMaterial> for Key {
fn from(value: KeyMaterial) -> Self {
match value {
KeyMaterial::Hex(key) => key,
KeyMaterial::String(SpanAttributesFromLogs(rt, blocknum))
| KeyMaterial::Split(rt, blocknum) => {
pageserver_api::key::rel_block_to_key(rt, blocknum)
}
}
}
}
impl<S: AsRef<str>> TryFrom<&[S]> for KeyMaterial {
type Error = anyhow::Error;
fn try_from(value: &[S]) -> Result<Self, Self::Error> {
match value {
[] => anyhow::bail!(
"need 1..N positional arguments describing the key, try hex or a log line"
),
[one] => {
let one = one.as_ref();
let key = Key::from_hex(one).map(KeyMaterial::Hex);
let attrs = SpanAttributesFromLogs::from_str(one).map(KeyMaterial::String);
match (key, attrs) {
(Ok(key), _) => Ok(key),
(_, Ok(s)) => Ok(s),
(Err(e1), Err(e2)) => anyhow::bail!(
"failed to parse {one:?} as hex or span attributes:\n- {e1:#}\n- {e2:#}"
),
}
}
more => {
// assume going left to right one of these is a reltag and then we find a blocknum
// this works, because we don't have plain numbers at least right after reltag in
// logs. for some definition of "works".
let Some((reltag_at, reltag)) = more
.iter()
.map(AsRef::as_ref)
.enumerate()
.find_map(|(i, s)| {
s.split_once("rel=")
.map(|(_garbage, actual)| actual)
.unwrap_or(s)
.parse::<RelTag>()
.ok()
.map(|rt| (i, rt))
})
else {
anyhow::bail!("found no RelTag in arguments");
};
let Some(blocknum) = more
.iter()
.map(AsRef::as_ref)
.skip(reltag_at)
.find_map(|s| {
s.split_once("blkno=")
.map(|(_garbage, actual)| actual)
.unwrap_or(s)
.parse::<BlockNumber>()
.ok()
})
else {
anyhow::bail!("found no blocknum in arguments");
};
Ok(KeyMaterial::Split(reltag, blocknum))
}
}
}
}
#[derive(Debug)]
pub(super) struct SpanAttributesFromLogs(RelTag, BlockNumber);
impl std::str::FromStr for SpanAttributesFromLogs {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
// accept the span separator but do not require or fail if either is missing
// "whatever{rel=1663/16389/24615 blkno=1052204 req_lsn=FFFFFFFF/FFFFFFFF}"
let (_, reltag) = s
.split_once("rel=")
.ok_or_else(|| anyhow::anyhow!("cannot find 'rel='"))?;
let reltag = reltag.split_whitespace().next().unwrap();
let (_, blocknum) = s
.split_once("blkno=")
.ok_or_else(|| anyhow::anyhow!("cannot find 'blkno='"))?;
let blocknum = blocknum.split_whitespace().next().unwrap();
let reltag = reltag
.parse()
.with_context(|| format!("parse reltag from {reltag:?}"))?;
let blocknum = blocknum
.parse()
.with_context(|| format!("parse blocknum from {blocknum:?}"))?;
Ok(Self(reltag, blocknum))
}
}
#[derive(Debug)]
#[allow(dead_code)] // debug print is used
enum RecognizedKeyKind {
DbDir,
ControlFile,
Checkpoint,
AuxFilesV1,
SlruDir(Result<SlruKind, u32>),
RelMap(RelTagish<2>),
RelDir(RelTagish<2>),
AuxFileV2(Result<AuxFileV2, utils::Hex<[u8; 16]>>),
}
#[derive(Debug, PartialEq)]
#[allow(unused)]
enum AuxFileV2 {
Recognized(&'static str, utils::Hex<[u8; 13]>),
OtherWithPrefix(&'static str, utils::Hex<[u8; 13]>),
Other(utils::Hex<[u8; 13]>),
}
impl RecognizedKeyKind {
fn new(key: Key) -> Option<Self> {
use RecognizedKeyKind::{
AuxFilesV1, Checkpoint, ControlFile, DbDir, RelDir, RelMap, SlruDir,
};
let slru_dir_kind = pageserver_api::key::slru_dir_kind(&key);
Some(match key {
pageserver_api::key::DBDIR_KEY => DbDir,
pageserver_api::key::CONTROLFILE_KEY => ControlFile,
pageserver_api::key::CHECKPOINT_KEY => Checkpoint,
pageserver_api::key::AUX_FILES_KEY => AuxFilesV1,
_ if slru_dir_kind.is_some() => SlruDir(slru_dir_kind.unwrap()),
_ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 0 => {
RelMap([key.field2, key.field3].into())
}
_ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 1 => {
RelDir([key.field2, key.field3].into())
}
_ if key.is_metadata_key() => RecognizedKeyKind::AuxFileV2(
AuxFileV2::new(key).ok_or_else(|| utils::Hex(key.to_i128().to_be_bytes())),
),
_ => return None,
})
}
}
impl AuxFileV2 {
fn new(key: Key) -> Option<AuxFileV2> {
const EMPTY_HASH: [u8; 13] = {
let mut out = [0u8; 13];
let hash = pageserver::aux_file::fnv_hash(b"").to_be_bytes();
let mut i = 3;
while i < 16 {
out[i - 3] = hash[i];
i += 1;
}
out
};
let bytes = key.to_i128().to_be_bytes();
let hash = utils::Hex(<[u8; 13]>::try_from(&bytes[3..]).unwrap());
assert_eq!(EMPTY_HASH.len(), hash.0.len());
// TODO: we could probably find the preimages for the hashes
Some(match (bytes[1], bytes[2]) {
(1, 1) => AuxFileV2::Recognized("pg_logical/mappings/", hash),
(1, 2) => AuxFileV2::Recognized("pg_logical/snapshots/", hash),
(1, 3) if hash.0 == EMPTY_HASH => {
AuxFileV2::Recognized("pg_logical/replorigin_checkpoint", hash)
}
(2, 1) => AuxFileV2::Recognized("pg_replslot/", hash),
(1, 0xff) => AuxFileV2::OtherWithPrefix("pg_logical/", hash),
(0xff, 0xff) => AuxFileV2::Other(hash),
_ => return None,
})
}
}
/// Prefix of RelTag, currently only known use cases are the two item versions.
///
/// Renders like a reltag with `/`, nothing else.
struct RelTagish<const N: usize>([u32; N]);
impl<const N: usize> From<[u32; N]> for RelTagish<N> {
fn from(val: [u32; N]) -> Self {
RelTagish(val)
}
}
impl<const N: usize> std::fmt::Debug for RelTagish<N> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
use std::fmt::Write as _;
let mut first = true;
self.0.iter().try_for_each(|x| {
if !first {
f.write_char('/')?;
}
first = false;
write!(f, "{}", x)
})
}
}
#[cfg(test)]
mod tests {
use pageserver::aux_file::encode_aux_file_key;
use super::*;
#[test]
fn hex_is_key_material() {
let m = KeyMaterial::try_from(&["000000067F0000400200DF927900FFFFFFFF"][..]).unwrap();
assert!(matches!(m, KeyMaterial::Hex(_)), "{m:?}");
}
#[test]
fn single_positional_spanalike_is_key_material() {
// why is this needed? if you are checking many, then copypaste starts to appeal
let strings = [
(line!(), "2024-05-15T15:33:49.873906Z ERROR page_service_conn_main{peer_addr=A:B}:process_query{tenant_id=C timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm blkno=2 req_lsn=0/238D98C8}: error reading relation or page version: Read error: could not find data for key 000000067F00032CE5000000000000000001 (shard ShardNumber(0)) at LSN 0/1D0A16C1, request LSN 0/238D98C8, ancestor 0/0"),
(line!(), "rel=1663/208101/2620_fsm blkno=2"),
(line!(), "rel=1663/208101/2620.1 blkno=2"),
];
let mut first: Option<Key> = None;
for (line, example) in strings {
let m = KeyMaterial::try_from(&[example][..])
.unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}"));
let key = Key::from(m);
if let Some(first) = first {
assert_eq!(first, key);
} else {
first = Some(key);
}
}
// not supporting this is rather accidential, but I think the input parsing is lenient
// enough already
KeyMaterial::try_from(&["1663/208101/2620_fsm 2"][..]).unwrap_err();
}
#[test]
fn multiple_spanlike_args() {
let strings = [
(line!(), &["process_query{tenant_id=C", "timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm", "blkno=2", "req_lsn=0/238D98C8}"][..]),
(line!(), &["rel=1663/208101/2620_fsm", "blkno=2"][..]),
(line!(), &["1663/208101/2620_fsm", "2"][..]),
];
let mut first: Option<Key> = None;
for (line, example) in strings {
let m = KeyMaterial::try_from(example)
.unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}"));
let key = Key::from(m);
if let Some(first) = first {
assert_eq!(first, key);
} else {
first = Some(key);
}
}
}
#[test]
fn recognized_auxfiles() {
use AuxFileV2::*;
let empty = [
0x2e, 0x07, 0xbb, 0x01, 0x42, 0x62, 0xb8, 0x21, 0x75, 0x62, 0x95, 0xc5, 0x8d,
];
let foobar = [
0x62, 0x79, 0x3c, 0x64, 0xbf, 0x6f, 0x0d, 0x35, 0x97, 0xba, 0x44, 0x6f, 0x18,
];
#[rustfmt::skip]
let examples = [
(line!(), "pg_logical/mappings/foobar", Recognized("pg_logical/mappings/", utils::Hex(foobar))),
(line!(), "pg_logical/snapshots/foobar", Recognized("pg_logical/snapshots/", utils::Hex(foobar))),
(line!(), "pg_logical/replorigin_checkpoint", Recognized("pg_logical/replorigin_checkpoint", utils::Hex(empty))),
(line!(), "pg_logical/foobar", OtherWithPrefix("pg_logical/", utils::Hex(foobar))),
(line!(), "pg_replslot/foobar", Recognized("pg_replslot/", utils::Hex(foobar))),
(line!(), "foobar", Other(utils::Hex(foobar))),
];
for (line, path, expected) in examples {
let key = encode_aux_file_key(path);
let recognized =
AuxFileV2::new(key).unwrap_or_else(|| panic!("line {line} example failed"));
assert_eq!(recognized, expected);
}
assert_eq!(
AuxFileV2::new(Key::from_hex("600000102000000000000000000000000000").unwrap()),
None,
"example key has one too few 0 after 6 before 1"
);
}
}

View File

@@ -6,7 +6,6 @@
mod draw_timeline_dir;
mod index_part;
mod key;
mod layer_map_analyzer;
mod layers;
@@ -62,8 +61,6 @@ enum Commands {
AnalyzeLayerMap(AnalyzeLayerMapCmd),
#[command(subcommand)]
Layer(LayerCmd),
/// Debug print a hex key found from logs
Key(key::DescribeKeyCommand),
}
/// Read and update pageserver metadata file
@@ -186,7 +183,6 @@ async fn main() -> anyhow::Result<()> {
.time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel)
.await?;
}
Commands::Key(dkc) => dkc.execute(),
};
Ok(())
}

View File

@@ -5,7 +5,6 @@ use utils::lsn::Lsn;
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Instant;
/// Ingest aux files into the pageserver.
#[derive(clap::Parser)]
@@ -89,17 +88,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
println!("ingested {file_cnt} files");
}
for _ in 0..100 {
let start = Instant::now();
let files = mgmt_api_client
.list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
.await?;
println!(
"{} files found in {}s",
files.len(),
start.elapsed().as_secs_f64()
);
}
let files = mgmt_api_client
.list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
.await?;
println!("{} files found", files.len());
anyhow::Ok(())
}

View File

@@ -1,6 +1,6 @@
use anyhow::Context;
use camino::Utf8PathBuf;
use pageserver_api::key::Key;
use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key};
use pageserver_api::keyspace::KeySpaceAccum;
use pageserver_api::models::PagestreamGetPageRequest;
@@ -187,7 +187,7 @@ async fn main_impl(
for r in partitioning.keys.ranges.iter() {
let mut i = r.start;
while i != r.end {
if i.is_rel_block_key() {
if is_rel_block_key(&i) {
filtered.add_key(i);
}
i = i.next();
@@ -308,10 +308,9 @@ async fn main_impl(
let r = &ranges[weights.sample(&mut rng)];
let key: i128 = rng.gen_range(r.start..r.end);
let key = Key::from_i128(key);
assert!(key.is_rel_block_key());
let (rel_tag, block_no) = key
.to_rel_block()
.expect("we filter non-rel-block keys out above");
assert!(is_rel_block_key(&key));
let (rel_tag, block_no) =
key_to_rel_block(key).expect("we filter non-rel-block keys out above");
PagestreamGetPageRequest {
request_lsn: if rng.gen_bool(args.req_latest_probability) {
Lsn::MAX

View File

@@ -178,8 +178,7 @@ impl AuxFileSizeEstimator {
}
}
/// When generating base backup or doing initial logical size calculation
pub fn on_initial(&self, new_size: usize) {
pub fn on_base_backup(&self, new_size: usize) {
let mut guard = self.size.lock().unwrap();
*guard = Some(new_size as isize);
self.report(new_size as isize);

View File

@@ -13,7 +13,7 @@
use anyhow::{anyhow, Context};
use bytes::{BufMut, Bytes, BytesMut};
use fail::fail_point;
use pageserver_api::key::Key;
use pageserver_api::key::{key_to_slru_block, Key};
use postgres_ffi::pg_constants;
use std::fmt::Write as FmtWrite;
use std::time::SystemTime;
@@ -170,7 +170,7 @@ where
}
async fn add_block(&mut self, key: &Key, block: Bytes) -> Result<(), BasebackupError> {
let (kind, segno, _) = key.to_slru_block()?;
let (kind, segno, _) = key_to_slru_block(*key)?;
match kind {
SlruKind::Clog => {
@@ -362,13 +362,6 @@ where
));
info!("Replication slot {} restart LSN={}", path, restart_lsn);
min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn);
} else if path == "pg_logical/replorigin_checkpoint" {
// replorigin_checkoint is written only on compute shutdown, so it contains
// deteriorated values. So we generate our own version of this file for the particular LSN
// based on information about replorigins extracted from transaction commit records.
// In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all,
// but now we should handle (skip) it for backward compatibility.
continue;
}
let header = new_tar_header(&path, content.len() as u64)?;
self.ar
@@ -397,32 +390,6 @@ where
{
self.add_twophase_file(xid).await?;
}
let repl_origins = self
.timeline
.get_replorigins(self.lsn, self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
let n_origins = repl_origins.len();
if n_origins != 0 {
//
// Construct "pg_logical/replorigin_checkpoint" file based on information about replication origins
// extracted from transaction commit record. We are using this file to pass information about replication
// origins to compute to allow logical replication to restart from proper point.
//
let mut content = Vec::with_capacity(n_origins * 16 + 8);
content.extend_from_slice(&pg_constants::REPLICATION_STATE_MAGIC.to_le_bytes());
for (origin_id, origin_lsn) in repl_origins {
content.extend_from_slice(&origin_id.to_le_bytes());
content.extend_from_slice(&[0u8; 6]); // align to 8 bytes
content.extend_from_slice(&origin_lsn.0.to_le_bytes());
}
let crc32 = crc32c::crc32c(&content);
content.extend_from_slice(&crc32.to_le_bytes());
let header = new_tar_header("pg_logical/replorigin_checkpoint", content.len() as u64)?;
self.ar.append(&header, &*content).await.context(
"could not add pg_logical/replorigin_checkpoint file to basebackup tarball",
)?;
}
fail_point!("basebackup-before-control-file", |_| {
Err(BasebackupError::Server(anyhow!(

View File

@@ -99,6 +99,8 @@ pub mod defaults {
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "async";
///
/// Default built-in configuration file.
///
@@ -144,6 +146,8 @@ pub mod defaults {
#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
#walredo_process_kind = '{DEFAULT_WALREDO_PROCESS_KIND}'
[tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -296,6 +300,8 @@ pub struct PageServerConf {
///
/// Setting this to zero disables limits on total ephemeral layer size.
pub ephemeral_bytes_per_memory_kb: usize,
pub walredo_process_kind: crate::walredo::ProcessKind,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -401,6 +407,8 @@ struct PageServerConfigBuilder {
validate_vectored_get: BuilderValue<bool>,
ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
walredo_process_kind: BuilderValue<crate::walredo::ProcessKind>,
}
impl PageServerConfigBuilder {
@@ -489,6 +497,8 @@ impl PageServerConfigBuilder {
)),
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
walredo_process_kind: Set(DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap()),
}
}
}
@@ -676,6 +686,10 @@ impl PageServerConfigBuilder {
self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
}
pub fn get_walredo_process_kind(&mut self, value: crate::walredo::ProcessKind) {
self.walredo_process_kind = BuilderValue::Set(value);
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
let default = Self::default_values();
@@ -733,6 +747,7 @@ impl PageServerConfigBuilder {
max_vectored_read_bytes,
validate_vectored_get,
ephemeral_bytes_per_memory_kb,
walredo_process_kind,
}
CUSTOM LOGIC
{
@@ -1029,6 +1044,9 @@ impl PageServerConf {
"ephemeral_bytes_per_memory_kb" => {
builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
}
"walredo_process_kind" => {
builder.get_walredo_process_kind(parse_toml_from_str("walredo_process_kind", item)?)
}
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -1112,6 +1130,7 @@ impl PageServerConf {
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
}
}
}
@@ -1351,6 +1370,7 @@ background_task_maximum_delay = '334 s'
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
},
"Correct defaults should be used when no config values are provided"
);
@@ -1424,6 +1444,7 @@ background_task_maximum_delay = '334 s'
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
},
"Should be able to parse all basic config values correctly"
);

View File

@@ -358,7 +358,7 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re
// mean the synthetic size worker should terminate.
let shutting_down = matches!(
e.downcast_ref::<PageReconstructError>(),
Some(PageReconstructError::Cancelled)
Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
);
if !shutting_down {

View File

@@ -311,7 +311,7 @@ impl DeletionList {
result.extend(
timeline_layers
.into_iter()
.map(|l| timeline_remote_path.join(Utf8PathBuf::from(l))),
.map(|l| timeline_remote_path.join(&Utf8PathBuf::from(l))),
);
}
}

View File

@@ -534,7 +534,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
});
}
EvictionLayer::Secondary(layer) => {
let file_size = layer.metadata.file_size;
let file_size = layer.metadata.file_size();
js.spawn(async move {
layer
@@ -641,7 +641,7 @@ impl EvictionLayer {
pub(crate) fn get_file_size(&self) -> u64 {
match self {
Self::Attached(l) => l.layer_desc().file_size,
Self::Secondary(sl) => sl.metadata.file_size,
Self::Secondary(sl) => sl.metadata.file_size(),
}
}
}

View File

@@ -81,10 +81,8 @@ paths:
Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved.
404 means that deletion successfully finished"
responses:
"200":
description: Tenant was successfully deleted, or was already not found.
"404":
description: Tenant not found. This is a success result, equivalent to 200.
description: Tenant not found. This is the success path.
content:
application/json:
schema:
@@ -259,37 +257,6 @@ paths:
schema:
$ref: "#/components/schemas/LsnByTimestampResponse"
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease:
parameters:
- name: tenant_shard_id
in: path
required: true
schema:
type: string
- name: timeline_id
in: path
required: true
schema:
type: string
format: hex
post:
description: Obtain lease for the given LSN
parameters:
- name: lsn
in: query
required: true
schema:
type: string
format: hex
description: A LSN to obtain the lease for
responses:
"200":
description: OK
content:
application/json:
schema:
$ref: "#/components/schemas/LsnLease"
/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
parameters:
- name: tenant_id
@@ -614,80 +581,6 @@ paths:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor:
parameters:
- name: tenant_shard_id
in: path
required: true
schema:
type: string
- name: timeline_id
in: path
ŕequired: true
schema:
type: string
put:
description: |
Detach a timeline from its ancestor and reparent all ancestors timelines with lower `ancestor_lsn`.
Current implementation might not be retryable across failure cases, but will be enhanced in future.
Detaching should be expected to be expensive operation. Timeouts should be retried.
responses:
"200":
description: |
The timeline has been detached from it's ancestor (now or earlier), and at least the returned timelines have been reparented.
If any timelines were deleted after reparenting, they might not be on this list.
content:
application/json:
schema:
$ref: "#/components/schemas/AncestorDetached"
"400":
description: |
Number of early checks meaning the timeline cannot be detached now:
- the ancestor of timeline has an ancestor: not supported, see RFC
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"404":
description: Tenant or timeline not found.
content:
application/json:
schema:
$ref: "#/components/schemas/NotFoundError"
"409":
description: |
The timeline can never be detached:
- timeline has no ancestor, implying that the timeline has never had an ancestor
content:
application/json:
schema:
$ref: "#/components/schemas/ConflictError"
"500":
description: |
Transient error, for example, pageserver shutdown happened while
processing the request but we were unable to distinguish that. Must
be retried.
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: |
Temporarily unavailable, please retry. Possible reasons:
- another timeline detach for the same tenant is underway, please retry later
- detected shutdown error
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/:
get:
description: Get tenants list
@@ -1087,15 +980,6 @@ components:
type: string
enum: [past, present, future, nodata]
LsnLease:
type: object
required:
- valid_until
properties:
valid_until:
type: string
format: date-time
PageserverUtilization:
type: object
required:
@@ -1153,19 +1037,6 @@ components:
format: int64
description: How many bytes of layer content were in the latest layer heatmap
AncestorDetached:
type: object
required:
- reparented_timelines
properties:
reparented_timelines:
type: array
description: Set of reparented timeline ids
properties:
type: string
format: hex
description: TimelineId
Error:
type: object

View File

@@ -16,7 +16,6 @@ use hyper::header;
use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use metrics::launch_timestamp::LaunchTimestamp;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::models::IngestAuxFilesRequest;
use pageserver_api::models::ListAuxFilesRequest;
use pageserver_api::models::LocationConfig;
@@ -74,9 +73,7 @@ use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
use crate::tenant::storage_layer::LayerName;
use crate::tenant::timeline::CompactFlags;
use crate::tenant::timeline::CompactionError;
use crate::tenant::timeline::Timeline;
use crate::tenant::GetTimelineError;
use crate::tenant::SpawnMode;
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
use crate::{config::PageServerConf, tenant::mgr};
@@ -184,6 +181,9 @@ impl From<PageReconstructError> for ApiError {
PageReconstructError::Cancelled => {
ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
}
PageReconstructError::AncestorStopping(_) => {
ApiError::ResourceUnavailable(format!("{pre}").into())
}
PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
}
@@ -279,13 +279,6 @@ impl From<GetTenantError> for ApiError {
}
}
impl From<GetTimelineError> for ApiError {
fn from(gte: GetTimelineError) -> Self {
// Rationale: tenant is activated only after eligble timelines activate
ApiError::NotFound(gte.into())
}
}
impl From<GetActiveTenantError> for ApiError {
fn from(e: GetActiveTenantError) -> ApiError {
match e {
@@ -393,7 +386,7 @@ async fn build_timeline_info_common(
let guard = timeline.last_received_wal.lock().unwrap();
if let Some(info) = guard.as_ref() {
(
Some(format!("{}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only.
Some(format!("{:?}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only.
Some(info.last_received_msg_lsn),
Some(info.last_received_msg_ts),
)
@@ -650,7 +643,9 @@ async fn timeline_preserve_initdb_handler(
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
let timeline = tenant.get_timeline(timeline_id, false)?;
let timeline = tenant
.get_timeline(timeline_id, false)
.map_err(|e| ApiError::NotFound(e.into()))?;
timeline
.preserve_initdb_archive()
@@ -692,7 +687,9 @@ async fn timeline_detail_handler(
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let timeline = tenant.get_timeline(timeline_id, false)?;
let timeline = tenant
.get_timeline(timeline_id, false)
.map_err(|e| ApiError::NotFound(e.into()))?;
let timeline_info = build_timeline_info(
&timeline,
@@ -1073,7 +1070,7 @@ async fn tenant_delete_handler(
let state = get_state(&request);
let status = state
state
.tenant_manager
.delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT)
.instrument(info_span!("tenant_delete_handler",
@@ -1082,14 +1079,7 @@ async fn tenant_delete_handler(
))
.await?;
// Callers use 404 as success for deletions, for historical reasons.
if status == StatusCode::NOT_FOUND {
return Err(ApiError::NotFound(
anyhow::anyhow!("Deletion complete").into(),
));
}
json_response(status, ())
json_response(StatusCode::ACCEPTED, ())
}
/// HTTP endpoint to query the current tenant_size of a tenant.
@@ -1711,32 +1701,6 @@ async fn handle_tenant_break(
json_response(StatusCode::OK, ())
}
// Obtains an lsn lease on the given timeline.
async fn lsn_lease_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let lsn: Lsn = parse_query_param(&request, "lsn")?
.ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let state = get_state(&request);
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let result = timeline
.make_lsn_lease(lsn, &ctx)
.map_err(|e| ApiError::InternalServerError(e.context("lsn lease http handler")))?;
json_response(StatusCode::OK, result)
}
// Run GC immediately on given timeline.
async fn timeline_gc_handler(
mut request: Request<Body>,
@@ -1772,8 +1736,6 @@ async fn timeline_compact_handler(
if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
flags |= CompactFlags::ForceImageLayerCreation;
}
let wait_until_uploaded =
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
@@ -1782,9 +1744,6 @@ async fn timeline_compact_handler(
.compact(&cancel, flags, &ctx)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
if wait_until_uploaded {
timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
}
json_response(StatusCode::OK, ())
}
.instrument(info_span!("manual_compaction", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
@@ -1809,8 +1768,6 @@ async fn timeline_checkpoint_handler(
if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
flags |= CompactFlags::ForceImageLayerCreation;
}
let wait_until_uploaded =
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
@@ -1818,26 +1775,11 @@ async fn timeline_checkpoint_handler(
timeline
.freeze_and_flush()
.await
.map_err(|e| {
match e {
tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
other => ApiError::InternalServerError(other.into()),
}
})?;
.map_err(ApiError::InternalServerError)?;
timeline
.compact(&cancel, flags, &ctx)
.await
.map_err(|e|
match e {
CompactionError::ShuttingDown => ApiError::ShuttingDown,
CompactionError::Other(e) => ApiError::InternalServerError(e)
}
)?;
if wait_until_uploaded {
timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
}
.map_err(|e| ApiError::InternalServerError(e.into()))?;
json_response(StatusCode::OK, ())
}
@@ -1922,11 +1864,14 @@ async fn timeline_detach_ancestor_handler(
let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download);
let ctx = &ctx;
let timeline = tenant.get_timeline(timeline_id, true)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(|e| ApiError::NotFound(e.into()))?;
let (_guard, prepared) = timeline
.prepare_to_detach_from_ancestor(&tenant, options, ctx)
.await?;
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
let res = state
.tenant_manager
@@ -2060,7 +2005,9 @@ async fn active_timeline_of_active_tenant(
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
Ok(tenant.get_timeline(timeline_id, true)?)
tenant
.get_timeline(timeline_id, true)
.map_err(|e| ApiError::NotFound(e.into()))
}
async fn always_panic_handler(
@@ -2189,7 +2136,7 @@ async fn tenant_scan_remote_handler(
{
Ok((index_part, index_generation)) => {
tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)",
index_part.layer_metadata.len(), index_part.metadata.disk_consistent_lsn());
index_part.layer_metadata.len(), index_part.get_disk_consistent_lsn());
generation = std::cmp::max(generation, index_generation);
}
Err(DownloadError::NotFound) => {
@@ -2324,31 +2271,6 @@ async fn post_tracing_event_handler(
json_response(StatusCode::OK, ())
}
async fn force_aux_policy_switch_handler(
mut r: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
check_permission(&r, None)?;
let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&r, "timeline_id")?;
let policy: AuxFilePolicy = json_request(&mut r).await?;
let state = get_state(&r);
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
timeline
.do_switch_aux_policy(policy)
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
async fn put_io_engine_handler(
mut r: Request<Body>,
_cancel: CancellationToken,
@@ -2426,9 +2348,19 @@ async fn list_aux_files(
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let files = timeline.list_aux_files(body.lsn, &ctx).await?;
json_response(StatusCode::OK, files)
let process = || async move {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let files = timeline.list_aux_files(body.lsn, &ctx).await?;
Ok::<_, anyhow::Error>(files)
};
match process().await {
Ok(st) => json_response(StatusCode::OK, st),
Err(err) => json_response(
StatusCode::INTERNAL_SERVER_ERROR,
ApiError::InternalServerError(err).to_string(),
),
}
}
async fn ingest_aux_files(
@@ -2446,22 +2378,24 @@ async fn ingest_aux_files(
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let mut modification = timeline.begin_modification(
Lsn(timeline.get_last_record_lsn().0 + 8), /* advance LSN by 8 */
);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
for (fname, content) in body.aux_files {
modification
.put_file(&fname, content.as_bytes(), &ctx)
.await
.map_err(ApiError::InternalServerError)?;
}
modification
.commit(&ctx)
.await
.map_err(ApiError::InternalServerError)?;
let process = || async move {
let mut modification = timeline.begin_modification(Lsn(
timeline.get_last_record_lsn().0 + 8
) /* advance LSN by 8 */);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
for (fname, content) in body.aux_files {
modification
.put_file(&fname, content.as_bytes(), &ctx)
.await?;
}
modification.commit(&ctx).await?;
Ok::<_, anyhow::Error>(())
};
json_response(StatusCode::OK, ())
match process().await {
Ok(st) => json_response(StatusCode::OK, st),
Err(err) => Err(ApiError::InternalServerError(err)),
}
}
/// Report on the largest tenants on this pageserver, for the storage controller to identify
@@ -2767,10 +2701,6 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn",
|r| api_handler(r, get_timestamp_of_lsn_handler),
)
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease",
|r| api_handler(r, lsn_lease_handler),
)
.put(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc",
|r| api_handler(r, timeline_gc_handler),
@@ -2844,10 +2774,6 @@ pub fn make_router(
|r| api_handler(r, timeline_collect_keyspace),
)
.put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
.put(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
|r| api_handler(r, force_aux_policy_switch_handler),
)
.get("/v1/utilization", |r| api_handler(r, get_utilization))
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files",

View File

@@ -525,15 +525,6 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_standby_horizon",
"Standby apply LSN for which GC is hold off, by timeline.",
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_resident_physical_size",
@@ -1867,6 +1858,7 @@ pub(crate) struct WalIngestMetrics {
pub(crate) records_received: IntCounter,
pub(crate) records_committed: IntCounter,
pub(crate) records_filtered: IntCounter,
pub(crate) time_spent_on_ingest: Histogram,
}
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
@@ -1890,6 +1882,12 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
"Number of WAL records filtered out due to sharding"
)
.expect("failed to define a metric"),
time_spent_on_ingest: register_histogram!(
"pageserver_wal_ingest_put_value_seconds",
"Actual time spent on ingesting a record",
redo_histogram_time_buckets!(),
)
.expect("failed to define a metric"),
});
pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
@@ -2100,7 +2098,6 @@ pub(crate) struct TimelineMetrics {
pub garbage_collect_histo: StorageTimeMetrics,
pub find_gc_cutoffs_histo: StorageTimeMetrics,
pub last_record_gauge: IntGauge,
pub standby_horizon_gauge: IntGauge,
pub resident_physical_size_gauge: UIntGauge,
/// copy of LayeredTimeline.current_logical_size
pub current_logical_size_gauge: UIntGauge,
@@ -2170,9 +2167,6 @@ impl TimelineMetrics {
let last_record_gauge = LAST_RECORD_LSN
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let standby_horizon_gauge = STANDBY_HORIZON
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
@@ -2218,7 +2212,6 @@ impl TimelineMetrics {
find_gc_cutoffs_histo,
load_layer_map_histo,
last_record_gauge,
standby_horizon_gauge,
resident_physical_size_gauge,
current_logical_size_gauge,
aux_file_size_gauge,
@@ -2253,7 +2246,6 @@ impl TimelineMetrics {
let timeline_id = &self.timeline_id;
let shard_id = &self.shard_id;
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
{
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);

View File

@@ -19,7 +19,6 @@ use pageserver_api::models::{
};
use pageserver_api::shard::ShardIndex;
use pageserver_api::shard::ShardNumber;
use pageserver_api::shard::TenantShardId;
use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError};
use pq_proto::framed::ConnectionError;
use pq_proto::FeStartupPacket;
@@ -34,7 +33,6 @@ use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use std::time::Instant;
use std::time::SystemTime;
use tokio::io::AsyncWriteExt;
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_util::io::StreamReader;
@@ -66,7 +64,6 @@ use crate::tenant::mgr::GetTenantError;
use crate::tenant::mgr::ShardResolveResult;
use crate::tenant::mgr::ShardSelector;
use crate::tenant::mgr::TenantManager;
use crate::tenant::timeline::FlushLayerError;
use crate::tenant::timeline::WaitLsnError;
use crate::tenant::GetTimelineError;
use crate::tenant::PageReconstructError;
@@ -261,8 +258,6 @@ async fn page_service_conn_main(
socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms)));
let socket = std::pin::pin!(socket);
fail::fail_point!("ps::connection-start::pre-login");
// XXX: pgbackend.run() should take the connection_ctx,
// and create a child per-query context when it invokes process_query.
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
@@ -373,7 +368,7 @@ impl From<WaitLsnError> for PageStreamError {
match value {
e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e),
WaitLsnError::Shutdown => Self::Shutdown,
e @ WaitLsnError::BadState { .. } => Self::Reconnect(format!("{e}").into()),
WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()),
}
}
}
@@ -383,7 +378,7 @@ impl From<WaitLsnError> for QueryError {
match value {
e @ WaitLsnError::Timeout(_) => Self::Other(anyhow::Error::new(e)),
WaitLsnError::Shutdown => Self::Shutdown,
WaitLsnError::BadState { .. } => Self::Reconnect,
WaitLsnError::BadState => Self::Reconnect,
}
}
}
@@ -606,7 +601,6 @@ impl PageServerHandler {
};
trace!("query: {copy_data_bytes:?}");
fail::fail_point!("ps::handle-pagerequest-message");
// Trace request if needed
if let Some(t) = tracer.as_mut() {
@@ -621,7 +615,6 @@ impl PageServerHandler {
let (response, span) = match neon_fe_msg {
PagestreamFeMessage::Exists(req) => {
fail::fail_point!("ps::handle-pagerequest-message::exists");
let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn);
(
self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
@@ -631,7 +624,6 @@ impl PageServerHandler {
)
}
PagestreamFeMessage::Nblocks(req) => {
fail::fail_point!("ps::handle-pagerequest-message::nblocks");
let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
(
self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
@@ -641,7 +633,6 @@ impl PageServerHandler {
)
}
PagestreamFeMessage::GetPage(req) => {
fail::fail_point!("ps::handle-pagerequest-message::getpage");
// shard_id is filled in by the handler
let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn);
(
@@ -652,7 +643,6 @@ impl PageServerHandler {
)
}
PagestreamFeMessage::DbSize(req) => {
fail::fail_point!("ps::handle-pagerequest-message::dbsize");
let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
(
self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
@@ -662,7 +652,6 @@ impl PageServerHandler {
)
}
PagestreamFeMessage::GetSlruSegment(req) => {
fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
(
self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx)
@@ -831,10 +820,7 @@ impl PageServerHandler {
// We only want to persist the data, and it doesn't matter if it's in the
// shape of deltas or images.
info!("flushing layers");
timeline.freeze_and_flush().await.map_err(|e| match e {
FlushLayerError::Cancelled => QueryError::Shutdown,
other => QueryError::Other(other.into()),
})?;
timeline.freeze_and_flush().await?;
info!("done");
Ok(())
@@ -919,39 +905,6 @@ impl PageServerHandler {
}
}
#[instrument(skip_all, fields(shard_id, %lsn))]
async fn handle_make_lsn_lease<IO>(
&self,
pgb: &mut PostgresBackend<IO>,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<(), QueryError>
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{
let shard_selector = ShardSelector::Known(tenant_shard_id.to_index());
let timeline = self
.get_active_tenant_timeline(tenant_shard_id.tenant_id, timeline_id, shard_selector)
.await?;
let lease = timeline.make_lsn_lease(lsn, ctx)?;
let valid_until = lease
.valid_until
.duration_since(SystemTime::UNIX_EPOCH)
.map_err(|e| QueryError::Other(e.into()))?;
pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
b"valid_until",
)]))?
.write_message_noflush(&BeMessage::DataRow(&[Some(
&valid_until.as_millis().to_be_bytes(),
)]))?
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
Ok(())
}
#[instrument(skip_all, fields(shard_id))]
async fn handle_get_rel_exists_request(
&mut self,
@@ -1517,7 +1470,6 @@ where
_pgb: &mut PostgresBackend<IO>,
_sm: &FeStartupPacket,
) -> Result<(), QueryError> {
fail::fail_point!("ps::connection-start::startup-packet");
Ok(())
}
@@ -1532,12 +1484,11 @@ where
Err(QueryError::SimulatedConnectionError)
});
fail::fail_point!("ps::connection-start::process-query");
let ctx = self.connection_ctx.attached_child();
debug!("process query {query_string:?}");
let parts = query_string.split_whitespace().collect::<Vec<_>>();
if let Some(params) = parts.strip_prefix(&["pagestream_v2"]) {
if query_string.starts_with("pagestream_v2 ") {
let (_, params_raw) = query_string.split_at("pagestream_v2 ".len());
let params = params_raw.split(' ').collect::<Vec<_>>();
if params.len() != 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for pagestream command"
@@ -1562,7 +1513,9 @@ where
ctx,
)
.await?;
} else if let Some(params) = parts.strip_prefix(&["pagestream"]) {
} else if query_string.starts_with("pagestream ") {
let (_, params_raw) = query_string.split_at("pagestream ".len());
let params = params_raw.split(' ').collect::<Vec<_>>();
if params.len() != 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for pagestream command"
@@ -1587,7 +1540,10 @@ where
ctx,
)
.await?;
} else if let Some(params) = parts.strip_prefix(&["basebackup"]) {
} else if query_string.starts_with("basebackup ") {
let (_, params_raw) = query_string.split_at("basebackup ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
if params.len() < 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for basebackup command"
@@ -1605,23 +1561,26 @@ where
self.check_permission(Some(tenant_id))?;
let lsn = if let Some(lsn_str) = params.get(2) {
let lsn = if params.len() >= 3 {
Some(
Lsn::from_str(lsn_str)
.with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
)
} else {
None
};
let gzip = match params.get(3) {
Some(&"--gzip") => true,
None => false,
Some(third_param) => {
let gzip = if params.len() >= 4 {
if params[3] == "--gzip" {
true
} else {
return Err(QueryError::Other(anyhow::anyhow!(
"Parameter in position 3 unknown {third_param}",
)))
"Parameter in position 3 unknown {}",
params[3],
)));
}
} else {
false
};
let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
@@ -1645,7 +1604,10 @@ where
res?;
}
// return pair of prev_lsn and last_lsn
else if let Some(params) = parts.strip_prefix(&["get_last_record_rlsn"]) {
else if query_string.starts_with("get_last_record_rlsn ") {
let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
if params.len() != 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for get_last_record_rlsn command"
@@ -1687,7 +1649,10 @@ where
.await?;
}
// same as basebackup, but result includes relational data as well
else if let Some(params) = parts.strip_prefix(&["fullbackup"]) {
else if query_string.starts_with("fullbackup ") {
let (_, params_raw) = query_string.split_at("fullbackup ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
if params.len() < 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for fullbackup command"
@@ -1704,18 +1669,18 @@ where
.record("timeline_id", field::display(timeline_id));
// The caller is responsible for providing correct lsn and prev_lsn.
let lsn = if let Some(lsn_str) = params.get(2) {
let lsn = if params.len() > 2 {
Some(
Lsn::from_str(lsn_str)
.with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
)
} else {
None
};
let prev_lsn = if let Some(prev_lsn_str) = params.get(3) {
let prev_lsn = if params.len() > 3 {
Some(
Lsn::from_str(prev_lsn_str)
.with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?,
Lsn::from_str(params[3])
.with_context(|| format!("Failed to parse Lsn from {}", params[3]))?,
)
} else {
None
@@ -1748,7 +1713,8 @@ where
// 2. Run:
// cat my_backup/base.tar | psql -h $PAGESERVER \
// -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION"
let params = &parts[2..];
let (_, params_raw) = query_string.split_at("import basebackup ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
if params.len() != 5 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for import basebackup command"
@@ -1797,7 +1763,8 @@ where
//
// Files are scheduled to be persisted to remote storage, and the
// caller should poll the http api to check when that is done.
let params = &parts[2..];
let (_, params_raw) = query_string.split_at("import wal ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
if params.len() != 4 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for import wal command"
@@ -1835,45 +1802,10 @@ where
// important because psycopg2 executes "SET datestyle TO 'ISO'"
// on connect
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else if query_string.starts_with("lease lsn ") {
let params = &parts[2..];
if params.len() != 3 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number {} for lease lsn command",
params.len()
)));
}
let tenant_shard_id = TenantShardId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
tracing::Span::current()
.record("tenant_id", field::display(tenant_shard_id))
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_shard_id.tenant_id))?;
// The caller is responsible for providing correct lsn.
let lsn = Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
match self
.handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
.await
{
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
Err(e) => {
error!("error obtaining lsn lease for {lsn}: {e:?}");
pgb.write_message_noflush(&BeMessage::ErrorResponse(
&e.to_string(),
Some(e.pg_error_code()),
))?
}
};
} else if let Some(params) = parts.strip_prefix(&["show"]) {
} else if query_string.starts_with("show ") {
// show <tenant_id>
let (_, params_raw) = query_string.split_at("show ".len());
let params = params_raw.split(' ').collect::<Vec<_>>();
if params.len() != 1 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for config command"

View File

@@ -9,6 +9,7 @@
use super::tenant::{PageReconstructError, Timeline};
use crate::context::RequestContext;
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::metrics::WAL_INGEST;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
use crate::walrecord::NeonWalRecord;
use crate::{aux_file, repository::*};
@@ -17,8 +18,8 @@ use bytes::{Buf, Bytes, BytesMut};
use enum_map::Enum;
use itertools::Itertools;
use pageserver_api::key::{
dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
};
@@ -27,7 +28,7 @@ use pageserver_api::models::AuxFilePolicy;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::BLCKSZ;
use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId};
use postgres_ffi::{Oid, TimestampTz, TransactionId};
use serde::{Deserialize, Serialize};
use std::collections::{hash_map, HashMap, HashSet};
use std::ops::ControlFlow;
@@ -78,19 +79,11 @@ pub enum LsnForTimestamp {
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum CalculateLogicalSizeError {
pub enum CalculateLogicalSizeError {
#[error("cancelled")]
Cancelled,
/// Something went wrong while reading the metadata we use to calculate logical size
/// Note that cancellation variants of `PageReconstructError` are transformed to [`Self::Cancelled`]
/// in the `From` implementation for this variant.
#[error(transparent)]
PageRead(PageReconstructError),
/// Something went wrong deserializing metadata that we read to calculate logical size
#[error("decode error: {0}")]
Decode(#[from] DeserializeError),
Other(#[from] anyhow::Error),
}
#[derive(Debug, thiserror::Error)]
@@ -115,8 +108,10 @@ impl From<PageReconstructError> for CollectKeySpaceError {
impl From<PageReconstructError> for CalculateLogicalSizeError {
fn from(pre: PageReconstructError) -> Self {
match pre {
PageReconstructError::Cancelled => Self::Cancelled,
_ => Self::PageRead(pre),
PageReconstructError::AncestorStopping(_) | PageReconstructError::Cancelled => {
Self::Cancelled
}
_ => Self::Other(pre.into()),
}
}
}
@@ -718,22 +713,10 @@ impl Timeline {
result.insert(fname, content);
}
}
self.aux_file_size_estimator.on_initial(sz);
self.aux_file_size_estimator.on_base_backup(sz);
Ok(result)
}
pub(crate) async fn trigger_aux_file_size_computation(
&self,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<(), PageReconstructError> {
let current_policy = self.last_aux_file_policy.load();
if let Some(AuxFilePolicy::V2) | Some(AuxFilePolicy::CrossValidation) = current_policy {
self.list_aux_files_v2(lsn, ctx).await?;
}
Ok(())
}
pub(crate) async fn list_aux_files(
&self,
lsn: Lsn,
@@ -772,27 +755,6 @@ impl Timeline {
}
}
pub(crate) async fn get_replorigins(
&self,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<HashMap<RepOriginId, Lsn>, PageReconstructError> {
let kv = self
.scan(KeySpace::single(repl_origin_key_range()), lsn, ctx)
.await
.context("scan")?;
let mut result = HashMap::new();
for (k, v) in kv {
let v = v.context("get value")?;
let origin_id = k.field6 as RepOriginId;
let origin_lsn = Lsn::des(&v).unwrap();
if origin_lsn != Lsn::INVALID {
result.insert(origin_id, origin_lsn);
}
}
Ok(result)
}
/// Does the same as get_current_logical_size but counted on demand.
/// Used to initialize the logical size tracking on startup.
///
@@ -802,7 +764,7 @@ impl Timeline {
/// # Cancel-Safety
///
/// This method is cancellation-safe.
pub(crate) async fn get_current_logical_size_non_incremental(
pub async fn get_current_logical_size_non_incremental(
&self,
lsn: Lsn,
ctx: &RequestContext,
@@ -811,7 +773,7 @@ impl Timeline {
// Fetch list of database dirs and iterate them
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
let dbdir = DbDirectory::des(&buf)?;
let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
let mut total_size: u64 = 0;
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
@@ -918,9 +880,7 @@ impl Timeline {
Ok((
result.to_keyspace(),
/* AUX sparse key space */
SparseKeySpace(KeySpace {
ranges: vec![repl_origin_key_range(), Key::metadata_aux_key_range()],
}),
SparseKeySpace(KeySpace::single(Key::metadata_aux_key_range())),
))
}
@@ -1189,20 +1149,6 @@ impl<'a> DatadirModification<'a> {
Ok(())
}
pub async fn set_replorigin(
&mut self,
origin_id: RepOriginId,
origin_lsn: Lsn,
) -> anyhow::Result<()> {
let key = repl_origin_key(origin_id);
self.put(key, Value::Image(origin_lsn.ser().unwrap().into()));
Ok(())
}
pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> anyhow::Result<()> {
self.set_replorigin(origin_id, Lsn::INVALID).await
}
pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> {
self.put(CONTROLFILE_KEY, Value::Image(img));
Ok(())
@@ -1535,24 +1481,11 @@ impl<'a> DatadirModification<'a> {
// Allowed switch path:
// * no aux files -> v1/v2/cross-validation
// * cross-validation->v2
let current_policy = if current_policy.is_none() {
// This path will only be hit once per tenant: we will decide the final policy in this code block.
// The next call to `put_file` will always have `last_aux_file_policy != None`.
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
let aux_files_key_v1 = self.tline.list_aux_files_v1(lsn, ctx).await?;
if aux_files_key_v1.is_empty() {
None
} else {
self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
Some(AuxFilePolicy::V1)
}
} else {
current_policy
};
if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) {
self.tline.do_switch_aux_policy(switch_policy)?;
self.tline.last_aux_file_policy.store(Some(switch_policy));
self.tline
.remote_client
.schedule_index_upload_for_aux_file_policy_update(Some(switch_policy))?;
info!(current=?current_policy, next=?switch_policy, "switching aux file policy");
switch_policy
} else {
@@ -1607,7 +1540,7 @@ impl<'a> DatadirModification<'a> {
self.tline.aux_file_size_estimator.on_add(content.len());
new_files.push((path, content));
}
(None, true) => warn!("removing non-existing aux file: {}", path),
(None, true) => anyhow::bail!("removing non-existing aux file: {}", path),
}
let new_val = aux_file::encode_file_value(&new_files)?;
self.put(key, Value::Image(new_val.into()));
@@ -1661,7 +1594,8 @@ impl<'a> DatadirModification<'a> {
aux_files.dir = Some(dir);
}
Err(
e @ (PageReconstructError::Cancelled
e @ (PageReconstructError::AncestorStopping(_)
| PageReconstructError::Cancelled
| PageReconstructError::AncestorLsnTimeout(_)),
) => {
// Important that we do not interpret a shutdown error as "not found" and thereby
@@ -1733,7 +1667,7 @@ impl<'a> DatadirModification<'a> {
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
for (key, values) in self.pending_updates.drain() {
for (lsn, value) in values {
if key.is_rel_block_key() || key.is_slru_block_key() {
if is_rel_block_key(&key) || is_slru_block_key(key) {
// This bails out on first error without modifying pending_updates.
// That's Ok, cf this function's doc comment.
writer.put(key, lsn, &value, ctx).await?;
@@ -1768,6 +1702,8 @@ impl<'a> DatadirModification<'a> {
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
let mut writer = self.tline.writer().await;
let timer = WAL_INGEST.time_spent_on_ingest.start_timer();
let pending_nblocks = self.pending_nblocks;
self.pending_nblocks = 0;
@@ -1807,6 +1743,8 @@ impl<'a> DatadirModification<'a> {
writer.update_directory_entries_count(kind, count as u64);
}
timer.observe_duration();
Ok(())
}
@@ -1842,12 +1780,6 @@ impl<'a> DatadirModification<'a> {
self.tline.get(key, lsn, ctx).await
}
/// Only used during unit tests, force putting a key into the modification.
#[cfg(test)]
pub(crate) fn put_for_test(&mut self, key: Key, val: Value) {
self.put(key, val);
}
fn put(&mut self, key: Key, val: Value) {
let values = self.pending_updates.entry(key).or_default();
// Replace the previous value if it exists at the same lsn

File diff suppressed because it is too large Load Diff

View File

@@ -238,13 +238,10 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
io_buf,
Err(Error::new(
ErrorKind::Other,
format!("blob too large ({len} bytes)"),
format!("blob too large ({} bytes)", len),
)),
);
}
if len > 0x0fff_ffff {
tracing::warn!("writing blob above future limit ({len} bytes)");
}
let mut len_buf = (len as u32).to_be_bytes();
len_buf[0] |= 0x80;
io_buf.extend_from_slice(&len_buf[..]);

View File

@@ -11,7 +11,6 @@
use anyhow::bail;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::models::CompactionAlgorithm;
use pageserver_api::models::CompactionAlgorithmSettings;
use pageserver_api::models::EvictionPolicy;
use pageserver_api::models::{self, ThrottleConfig};
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
@@ -321,7 +320,7 @@ pub struct TenantConf {
pub compaction_period: Duration,
// Level0 delta layer threshold for compaction.
pub compaction_threshold: usize,
pub compaction_algorithm: CompactionAlgorithmSettings,
pub compaction_algorithm: CompactionAlgorithm,
// Determines how much history is retained, to allow
// branching and read replicas at an older point in time.
// The unit is #of bytes of WAL.
@@ -407,7 +406,7 @@ pub struct TenantConfOpt {
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
pub compaction_algorithm: Option<CompactionAlgorithm>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
@@ -498,9 +497,7 @@ impl TenantConfOpt {
.unwrap_or(global_conf.compaction_threshold),
compaction_algorithm: self
.compaction_algorithm
.as_ref()
.unwrap_or(&global_conf.compaction_algorithm)
.clone(),
.unwrap_or(global_conf.compaction_algorithm),
gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon),
gc_period: self.gc_period.unwrap_or(global_conf.gc_period),
image_creation_threshold: self
@@ -553,9 +550,7 @@ impl Default for TenantConf {
compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
.expect("cannot parse default compaction period"),
compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
compaction_algorithm: CompactionAlgorithmSettings {
kind: DEFAULT_COMPACTION_ALGORITHM,
},
compaction_algorithm: DEFAULT_COMPACTION_ALGORITHM,
gc_horizon: DEFAULT_GC_HORIZON,
gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
.expect("cannot parse default gc period"),

View File

@@ -8,7 +8,7 @@ use tokio::sync::OwnedMutexGuard;
use tokio_util::sync::CancellationToken;
use tracing::{error, instrument, Instrument};
use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId};
use crate::{
config::PageServerConf,
@@ -16,7 +16,6 @@ use crate::{
task_mgr::{self, TaskKind},
tenant::{
mgr::{TenantSlot, TenantsMapRemoveResult},
remote_timeline_client::remote_heatmap_path,
timeline::ShutdownMode,
},
};
@@ -532,25 +531,6 @@ impl DeleteTenantFlow {
}
}
// Remove top-level tenant objects that don't belong to a timeline, such as heatmap
let heatmap_path = remote_heatmap_path(&tenant.tenant_shard_id());
if let Some(Err(e)) = backoff::retry(
|| async {
remote_storage
.delete(&heatmap_path, &task_mgr::shutdown_token())
.await
},
TimeoutOrCancel::caused_by_cancel,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"remove_remote_tenant_heatmap",
&task_mgr::shutdown_token(),
)
.await
{
tracing::warn!("Failed to delete heatmap at {heatmap_path}: {e}");
}
let timelines_path = conf.timelines_path(&tenant.tenant_shard_id);
// May not exist if we fail in cleanup_remaining_fs_traces after removing it
if timelines_path.exists() {

View File

@@ -267,7 +267,7 @@ impl<'de> Deserialize<'de> for TimelineMetadata {
D: serde::Deserializer<'de>,
{
let bytes = Vec::<u8>::deserialize(deserializer)?;
Self::from_bytes(bytes.as_slice()).map_err(D::Error::custom)
Self::from_bytes(bytes.as_slice()).map_err(|e| D::Error::custom(format!("{e}")))
}
}
@@ -276,163 +276,13 @@ impl Serialize for TimelineMetadata {
where
S: Serializer,
{
let bytes = self.to_bytes().map_err(serde::ser::Error::custom)?;
let bytes = self
.to_bytes()
.map_err(|e| serde::ser::Error::custom(format!("{e}")))?;
bytes.serialize(serializer)
}
}
pub(crate) mod modern_serde {
use crate::tenant::metadata::METADATA_FORMAT_VERSION;
use super::{
TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader, METADATA_HDR_SIZE,
};
use serde::{Deserialize, Serialize};
pub(crate) fn deserialize<'de, D>(deserializer: D) -> Result<TimelineMetadata, D::Error>
where
D: serde::de::Deserializer<'de>,
{
// for legacy reasons versions 1-5 had TimelineMetadata serialized as a Vec<u8> field with
// BeSer.
struct Visitor;
impl<'d> serde::de::Visitor<'d> for Visitor {
type Value = TimelineMetadata;
fn expecting(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
f.write_str("BeSer bytes or json structure")
}
fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
where
A: serde::de::SeqAccess<'d>,
{
use serde::de::Error;
let de = serde::de::value::SeqAccessDeserializer::new(seq);
Vec::<u8>::deserialize(de)
.map(|v| TimelineMetadata::from_bytes(&v).map_err(A::Error::custom))?
}
fn visit_map<A>(self, map: A) -> Result<Self::Value, A::Error>
where
A: serde::de::MapAccess<'d>,
{
use serde::de::Error;
let de = serde::de::value::MapAccessDeserializer::new(map);
let body = TimelineMetadataBodyV2::deserialize(de)?;
// jump through hoops to calculate the crc32 so that TimelineMetadata::ne works
// across serialization versions
let mut sink = Crc32Sink::default();
<TimelineMetadataBodyV2 as utils::bin_ser::BeSer>::ser_into(&body, &mut sink)
.map_err(|e| A::Error::custom(Crc32CalculationFailed(e)))?;
let size = METADATA_HDR_SIZE + sink.count;
Ok(TimelineMetadata {
hdr: TimelineMetadataHeader {
checksum: sink.crc,
size: size as u16,
format_version: METADATA_FORMAT_VERSION,
},
body,
})
}
}
deserializer.deserialize_any(Visitor)
}
#[derive(Default)]
struct Crc32Sink {
crc: u32,
count: usize,
}
impl std::io::Write for Crc32Sink {
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
self.crc = crc32c::crc32c_append(self.crc, buf);
self.count += buf.len();
Ok(buf.len())
}
fn flush(&mut self) -> std::io::Result<()> {
Ok(())
}
}
#[derive(thiserror::Error)]
#[error("re-serializing for crc32 failed")]
struct Crc32CalculationFailed<E>(#[source] E);
// this should be true for one release, after that we can change it to false
// remember to check the IndexPart::metadata field TODO comment as well
const LEGACY_BINCODED_BYTES: bool = true;
#[derive(serde::Serialize)]
#[serde(transparent)]
struct LegacyPaddedBytes<'a>(&'a TimelineMetadata);
struct JustTheBodyV2<'a>(&'a TimelineMetadata);
impl serde::Serialize for JustTheBodyV2<'_> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
// header is not needed, upon reading we've upgraded all v1 to v2
self.0.body.serialize(serializer)
}
}
pub(crate) fn serialize<S>(
metadata: &TimelineMetadata,
serializer: S,
) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
// we cannot use TimelineMetadata::serialize for now because it'll do
// TimelineMetadata::to_bytes
if LEGACY_BINCODED_BYTES {
LegacyPaddedBytes(metadata).serialize(serializer)
} else {
JustTheBodyV2(metadata).serialize(serializer)
}
}
#[test]
fn deserializes_bytes_as_well_as_equivalent_body_v2() {
#[derive(serde::Deserialize, serde::Serialize)]
struct Wrapper(#[serde(deserialize_with = "deserialize")] TimelineMetadata);
let too_many_bytes = "[216,111,252,208,0,54,0,4,0,0,0,0,1,73,253,144,1,0,0,0,0,1,73,253,24,0,0,0,0,0,0,0,0,0,0,0,0,0,1,73,253,24,0,0,0,0,1,73,253,24,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]";
let wrapper_from_bytes = serde_json::from_str::<Wrapper>(too_many_bytes).unwrap();
let serialized = serde_json::to_value(JustTheBodyV2(&wrapper_from_bytes.0)).unwrap();
assert_eq!(
serialized,
serde_json::json! {{
"disk_consistent_lsn": "0/149FD90",
"prev_record_lsn": "0/149FD18",
"ancestor_timeline": null,
"ancestor_lsn": "0/0",
"latest_gc_cutoff_lsn": "0/149FD18",
"initdb_lsn": "0/149FD18",
"pg_version": 15
}}
);
let wrapper_from_json = serde_json::value::from_value::<Wrapper>(serialized).unwrap();
assert_eq!(wrapper_from_bytes.0, wrapper_from_json.0);
}
}
/// Parts of the metadata which are regularly modified.
pub(crate) struct MetadataUpdate {
disk_consistent_lsn: Lsn,

View File

@@ -3,12 +3,11 @@
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
use futures::StreamExt;
use hyper::StatusCode;
use itertools::Itertools;
use pageserver_api::key::Key;
use pageserver_api::models::LocationConfigMode;
use pageserver_api::shard::{
ShardCount, ShardIdentity, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId,
ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
};
use pageserver_api::upcall_api::ReAttachResponseTenant;
use rand::{distributions::Alphanumeric, Rng};
@@ -46,7 +45,7 @@ use crate::tenant::delete::DeleteTenantFlow;
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
use crate::tenant::storage_layer::inmemory_layer;
use crate::tenant::timeline::ShutdownMode;
use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState};
use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
use utils::crashsafe::path_with_suffix_extension;
@@ -55,7 +54,6 @@ use utils::generation::Generation;
use utils::id::{TenantId, TimelineId};
use super::delete::DeleteTenantError;
use super::remote_timeline_client::remote_tenant_path;
use super::secondary::SecondaryTenant;
use super::timeline::detach_ancestor::PreparedTimelineDetach;
use super::TenantSharedResources;
@@ -129,8 +127,6 @@ pub(crate) enum ShardSelector {
First,
/// Pick the shard that holds this key
Page(Key),
/// The shard ID is known: pick the given shard
Known(ShardIndex),
}
/// A convenience for use with the re_attach ControlPlaneClient function: rather
@@ -1371,7 +1367,7 @@ impl TenantManager {
&self,
tenant_shard_id: TenantShardId,
activation_timeout: Duration,
) -> Result<StatusCode, DeleteTenantError> {
) -> Result<(), DeleteTenantError> {
super::span::debug_assert_current_span_has_tenant_id();
// We acquire a SlotGuard during this function to protect against concurrent
// changes while the ::prepare phase of DeleteTenantFlow executes, but then
@@ -1384,79 +1380,18 @@ impl TenantManager {
//
// See https://github.com/neondatabase/neon/issues/5080
// Tenant deletion can happen two ways:
// - Legacy: called on an attached location. The attached Tenant object stays alive in Stopping
// state until deletion is complete.
// - New: called on a pageserver without an attached location. We proceed with deletion from
// remote storage.
//
// See https://github.com/neondatabase/neon/issues/5080 for more context on this transition.
let slot_guard =
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
match &slot_guard.old_value {
Some(TenantSlot::Attached(tenant)) => {
// Legacy deletion flow: the tenant remains attached, goes to Stopping state, and
// deletion will be resumed across restarts.
let tenant = tenant.clone();
return self
.delete_tenant_attached(slot_guard, tenant, activation_timeout)
.await;
// unwrap is safe because we used MustExist mode when acquiring
let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
TenantSlot::Attached(tenant) => tenant.clone(),
_ => {
// Express "not attached" as equivalent to "not found"
return Err(DeleteTenantError::NotAttached);
}
Some(TenantSlot::Secondary(secondary_tenant)) => {
secondary_tenant.shutdown().await;
let local_tenant_directory = self.conf.tenant_path(&tenant_shard_id);
let tmp_dir = safe_rename_tenant_dir(&local_tenant_directory)
.await
.with_context(|| {
format!("local tenant directory {local_tenant_directory:?} rename")
})?;
spawn_background_purge(tmp_dir);
}
Some(TenantSlot::InProgress(_)) => unreachable!(),
None => {}
};
// Fall through: local state for this tenant is no longer present, proceed with remote delete
let remote_path = remote_tenant_path(&tenant_shard_id);
let keys = match self
.resources
.remote_storage
.list(
Some(&remote_path),
remote_storage::ListingMode::NoDelimiter,
None,
&self.cancel,
)
.await
{
Ok(listing) => listing.keys,
Err(remote_storage::DownloadError::Cancelled) => {
return Err(DeleteTenantError::Cancelled)
}
Err(remote_storage::DownloadError::NotFound) => return Ok(StatusCode::NOT_FOUND),
Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
};
if keys.is_empty() {
tracing::info!("Remote storage already deleted");
} else {
tracing::info!("Deleting {} keys from remote storage", keys.len());
self.resources
.remote_storage
.delete_objects(&keys, &self.cancel)
.await?;
}
// Callers use 404 as success for deletions, for historical reasons.
Ok(StatusCode::NOT_FOUND)
}
async fn delete_tenant_attached(
&self,
slot_guard: SlotGuard,
tenant: Arc<Tenant>,
activation_timeout: Duration,
) -> Result<StatusCode, DeleteTenantError> {
match tenant.current_state() {
TenantState::Broken { .. } | TenantState::Stopping { .. } => {
// If deletion is already in progress, return success (the semantics of this
@@ -1466,7 +1401,7 @@ impl TenantManager {
// The `delete_progress` lock is held: deletion is already happening
// in the bacckground
slot_guard.revert();
return Ok(StatusCode::ACCEPTED);
return Ok(());
}
}
_ => {
@@ -1499,8 +1434,7 @@ impl TenantManager {
// The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow
slot_guard.revert();
let () = result?;
Ok(StatusCode::ACCEPTED)
result
}
#[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))]
@@ -2133,11 +2067,6 @@ impl TenantManager {
return ShardResolveResult::Found(tenant.clone());
}
}
ShardSelector::Known(shard)
if tenant.shard_identity.shard_index() == shard =>
{
return ShardResolveResult::Found(tenant.clone());
}
_ => continue,
}
}
@@ -2897,13 +2826,7 @@ pub(crate) async fn immediate_gc(
}
}
result.map_err(|e| match e {
GcError::TenantCancelled | GcError::TimelineCancelled => ApiError::ShuttingDown,
GcError::TimelineNotFound => {
ApiError::NotFound(anyhow::anyhow!("Timeline not found").into())
}
other => ApiError::InternalServerError(anyhow::anyhow!(other)),
})
result.map_err(ApiError::InternalServerError)
}
#[cfg(test)]

View File

@@ -91,7 +91,8 @@
//!
//! The *actual* remote state lags behind the *desired* remote state while
//! there are in-flight operations.
//! We keep track of the desired remote state in [`UploadQueueInitialized::dirty`].
//! We keep track of the desired remote state in
//! [`UploadQueueInitialized::latest_files`] and [`UploadQueueInitialized::latest_metadata`].
//! It is initialized based on the [`IndexPart`] that was passed during init
//! and updated with every `schedule_*` function call.
//! All this is necessary necessary to compute the future [`IndexPart`]s
@@ -114,7 +115,8 @@
//!
//! # Completion
//!
//! Once an operation has completed, we update [`UploadQueueInitialized::clean`] immediately,
//! Once an operation has completed, we update
//! [`UploadQueueInitialized::projected_remote_consistent_lsn`] immediately,
//! and submit a request through the DeletionQueue to update
//! [`UploadQueueInitialized::visible_remote_consistent_lsn`] after it has
//! validated that our generation is not stale. It is this visible value
@@ -195,7 +197,6 @@ pub(crate) use upload::upload_initdb_dir;
use utils::backoff::{
self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
};
use utils::pausable_failpoint;
use std::collections::{HashMap, VecDeque};
use std::sync::atomic::{AtomicU32, Ordering};
@@ -414,7 +415,6 @@ impl RemoteTimelineClient {
Ok(())
}
/// Returns `None` if nothing is yet uplodaded, `Some(disk_consistent_lsn)` otherwise.
pub fn remote_consistent_lsn_projected(&self) -> Option<Lsn> {
match &mut *self.upload_queue.lock().unwrap() {
UploadQueue::Uninitialized => None,
@@ -441,11 +441,13 @@ impl RemoteTimelineClient {
/// Returns true if this timeline was previously detached at this Lsn and the remote timeline
/// client is currently initialized.
pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool {
// technically this is a dirty read, but given how timeline detach ancestor is implemented
// via tenant restart, the lineage has always been uploaded.
self.upload_queue
.lock()
.unwrap()
.initialized_mut()
.map(|uq| uq.clean.0.lineage.is_previous_ancestor_lsn(lsn))
.map(|uq| uq.latest_lineage.is_previous_ancestor_lsn(lsn))
.unwrap_or(false)
}
@@ -454,6 +456,7 @@ impl RemoteTimelineClient {
current_remote_index_part
.layer_metadata
.values()
// If we don't have the file size for the layer, don't account for it in the metric.
.map(|ilmd| ilmd.file_size)
.sum()
} else {
@@ -581,9 +584,9 @@ impl RemoteTimelineClient {
// As documented in the struct definition, it's ok for latest_metadata to be
// ahead of what's _actually_ on the remote during index upload.
upload_queue.dirty.metadata = metadata.clone();
upload_queue.latest_metadata = metadata.clone();
self.schedule_index_upload(upload_queue)?;
self.schedule_index_upload(upload_queue);
Ok(())
}
@@ -602,9 +605,9 @@ impl RemoteTimelineClient {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
upload_queue.dirty.metadata.apply(update);
upload_queue.latest_metadata.apply(update);
self.schedule_index_upload(upload_queue)?;
self.schedule_index_upload(upload_queue);
Ok(())
}
@@ -616,8 +619,8 @@ impl RemoteTimelineClient {
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
upload_queue.dirty.last_aux_file_policy = last_aux_file_policy;
self.schedule_index_upload(upload_queue)?;
upload_queue.last_aux_file_policy = last_aux_file_policy;
self.schedule_index_upload(upload_queue);
Ok(())
}
///
@@ -635,44 +638,30 @@ impl RemoteTimelineClient {
let upload_queue = guard.initialized_mut()?;
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
self.schedule_index_upload(upload_queue)?;
self.schedule_index_upload(upload_queue);
}
Ok(())
}
/// Launch an index-file upload operation in the background (internal function)
fn schedule_index_upload(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
) -> anyhow::Result<()> {
let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn();
// fix up the duplicated field
upload_queue.dirty.disk_consistent_lsn = disk_consistent_lsn;
// make sure it serializes before doing it in perform_upload_task so that it doesn't
// look like a retryable error
let void = std::io::sink();
serde_json::to_writer(void, &upload_queue.dirty).context("serialize index_part.json")?;
let index_part = &upload_queue.dirty;
fn schedule_index_upload(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
info!(
"scheduling metadata upload up to consistent LSN {disk_consistent_lsn} with {} files ({} changed)",
index_part.layer_metadata.len(),
upload_queue.latest_files.len(),
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
);
let op = UploadOp::UploadMetadata {
uploaded: Box::new(index_part.clone()),
};
let index_part = IndexPart::from(&*upload_queue);
let op = UploadOp::UploadMetadata(Box::new(index_part), disk_consistent_lsn);
self.metric_begin(&op);
upload_queue.queued_operations.push_back(op);
upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;
// Launch the task immediately, if possible
self.launch_queued_tasks(upload_queue);
Ok(())
}
pub(crate) async fn schedule_reparenting_and_wait(
@@ -685,16 +674,16 @@ impl RemoteTimelineClient {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
let Some(prev) = upload_queue.dirty.metadata.ancestor_timeline() else {
let Some(prev) = upload_queue.latest_metadata.ancestor_timeline() else {
return Err(anyhow::anyhow!(
"cannot reparent without a current ancestor"
));
};
upload_queue.dirty.metadata.reparent(new_parent);
upload_queue.dirty.lineage.record_previous_ancestor(&prev);
upload_queue.latest_metadata.reparent(new_parent);
upload_queue.latest_lineage.record_previous_ancestor(&prev);
self.schedule_index_upload(upload_queue)?;
self.schedule_index_upload(upload_queue);
self.schedule_barrier0(upload_queue)
};
@@ -715,17 +704,16 @@ impl RemoteTimelineClient {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
upload_queue.dirty.metadata.detach_from_ancestor(&adopted);
upload_queue.dirty.lineage.record_detaching(&adopted);
upload_queue.latest_metadata.detach_from_ancestor(&adopted);
upload_queue.latest_lineage.record_detaching(&adopted);
for layer in layers {
upload_queue
.dirty
.layer_metadata
.latest_files
.insert(layer.layer_desc().layer_name(), layer.metadata());
}
self.schedule_index_upload(upload_queue)?;
self.schedule_index_upload(upload_queue);
let barrier = self.schedule_barrier0(upload_queue);
self.launch_queued_tasks(upload_queue);
@@ -757,8 +745,7 @@ impl RemoteTimelineClient {
let metadata = layer.metadata();
upload_queue
.dirty
.layer_metadata
.latest_files
.insert(layer.layer_desc().layer_name(), metadata.clone());
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
@@ -788,8 +775,8 @@ impl RemoteTimelineClient {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
let with_metadata = self
.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned())?;
let with_metadata =
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned());
self.schedule_deletion_of_unlinked0(upload_queue, with_metadata);
@@ -813,7 +800,7 @@ impl RemoteTimelineClient {
let names = gc_layers.iter().map(|x| x.layer_desc().layer_name());
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names)?;
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
self.launch_queued_tasks(upload_queue);
@@ -826,7 +813,7 @@ impl RemoteTimelineClient {
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
names: I,
) -> anyhow::Result<Vec<(LayerName, LayerFileMetadata)>>
) -> Vec<(LayerName, LayerFileMetadata)>
where
I: IntoIterator<Item = LayerName>,
{
@@ -836,7 +823,7 @@ impl RemoteTimelineClient {
let with_metadata: Vec<_> = names
.into_iter()
.filter_map(|name| {
let meta = upload_queue.dirty.layer_metadata.remove(&name);
let meta = upload_queue.latest_files.remove(&name);
if let Some(meta) = meta {
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
@@ -868,10 +855,10 @@ impl RemoteTimelineClient {
// index_part update, because that needs to be uploaded before we can actually delete the
// files.
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
self.schedule_index_upload(upload_queue)?;
self.schedule_index_upload(upload_queue);
}
Ok(with_metadata)
with_metadata
}
/// Schedules deletion for layer files which have previously been unlinked from the
@@ -962,7 +949,7 @@ impl RemoteTimelineClient {
let names = compacted_from.iter().map(|x| x.layer_desc().layer_name());
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names)?;
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
self.launch_queued_tasks(upload_queue);
Ok(())
@@ -1097,7 +1084,7 @@ impl RemoteTimelineClient {
let deleted_at = Utc::now().naive_utc();
stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at);
let mut index_part = stopped.upload_queue_for_deletion.dirty.clone();
let mut index_part = IndexPart::from(&stopped.upload_queue_for_deletion);
index_part.deleted_at = Some(deleted_at);
index_part
};
@@ -1205,7 +1192,7 @@ impl RemoteTimelineClient {
&self.storage_impl,
uploaded.local_path(),
&remote_path,
uploaded.metadata().file_size,
uploaded.metadata().file_size(),
cancel,
)
.await
@@ -1308,8 +1295,7 @@ impl RemoteTimelineClient {
stopped
.upload_queue_for_deletion
.dirty
.layer_metadata
.latest_files
.drain()
.map(|(file_name, meta)| {
remote_layer_path(
@@ -1446,7 +1432,7 @@ impl RemoteTimelineClient {
// Can always be scheduled.
true
}
UploadOp::UploadMetadata { .. } => {
UploadOp::UploadMetadata(_, _) => {
// These can only be performed after all the preceding operations
// have finished.
upload_queue.inprogress_tasks.is_empty()
@@ -1488,7 +1474,7 @@ impl RemoteTimelineClient {
UploadOp::UploadLayer(_, _) => {
upload_queue.num_inprogress_layer_uploads += 1;
}
UploadOp::UploadMetadata { .. } => {
UploadOp::UploadMetadata(_, _) => {
upload_queue.num_inprogress_metadata_uploads += 1;
}
UploadOp::Delete(_) => {
@@ -1587,7 +1573,7 @@ impl RemoteTimelineClient {
&self.storage_impl,
local_path,
&remote_path,
layer_metadata.file_size,
layer_metadata.file_size(),
&self.cancel,
)
.measure_remote_op(
@@ -1597,13 +1583,22 @@ impl RemoteTimelineClient {
)
.await
}
UploadOp::UploadMetadata { ref uploaded } => {
UploadOp::UploadMetadata(ref index_part, _lsn) => {
let mention_having_future_layers = if cfg!(feature = "testing") {
index_part
.layer_metadata
.keys()
.any(|x| x.is_in_future(*_lsn))
} else {
false
};
let res = upload::upload_index_part(
&self.storage_impl,
&self.tenant_shard_id,
&self.timeline_id,
self.generation,
uploaded,
index_part,
&self.cancel,
)
.measure_remote_op(
@@ -1613,21 +1608,10 @@ impl RemoteTimelineClient {
)
.await;
if res.is_ok() {
self.update_remote_physical_size_gauge(Some(uploaded));
let mention_having_future_layers = if cfg!(feature = "testing") {
uploaded
.layer_metadata
.keys()
.any(|x| x.is_in_future(uploaded.metadata.disk_consistent_lsn()))
} else {
false
};
self.update_remote_physical_size_gauge(Some(index_part));
if mention_having_future_layers {
// find rationale near crate::tenant::timeline::init::cleanup_future_layer
tracing::info!(
disk_consistent_lsn = %uploaded.metadata.disk_consistent_lsn(),
"uploaded an index_part.json with future layers -- this is ok! if shutdown now, expect future layer cleanup"
);
tracing::info!(disk_consistent_lsn=%_lsn, "uploaded an index_part.json with future layers -- this is ok! if shutdown now, expect future layer cleanup");
}
}
res
@@ -1728,23 +1712,11 @@ impl RemoteTimelineClient {
upload_queue.num_inprogress_layer_uploads -= 1;
None
}
UploadOp::UploadMetadata { ref uploaded } => {
UploadOp::UploadMetadata(_, lsn) => {
upload_queue.num_inprogress_metadata_uploads -= 1;
// XXX monotonicity check?
// the task id is reused as a monotonicity check for storing the "clean"
// IndexPart.
let last_updater = upload_queue.clean.1;
let is_later = last_updater.is_some_and(|task_id| task_id < task.task_id);
let monotone = is_later || last_updater.is_none();
assert!(monotone, "no two index uploads should be completing at the same time, prev={last_updater:?}, task.task_id={}", task.task_id);
// not taking ownership is wasteful
upload_queue.clean.0.clone_from(uploaded);
upload_queue.clean.1 = Some(task.task_id);
let lsn = upload_queue.clean.0.metadata.disk_consistent_lsn();
upload_queue.projected_remote_consistent_lsn = Some(lsn);
if self.generation.is_none() {
// Legacy mode: skip validating generation
upload_queue.visible_remote_consistent_lsn.store(lsn);
@@ -1796,9 +1768,9 @@ impl RemoteTimelineClient {
UploadOp::UploadLayer(_, m) => (
RemoteOpFileKind::Layer,
RemoteOpKind::Upload,
RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size),
RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size()),
),
UploadOp::UploadMetadata { .. } => (
UploadOp::UploadMetadata(_, _) => (
RemoteOpFileKind::Index,
RemoteOpKind::Upload,
DontTrackSize {
@@ -1874,9 +1846,11 @@ impl RemoteTimelineClient {
// Deletion is not really perf sensitive so there shouldnt be any problems with cloning a fraction of it.
let upload_queue_for_deletion = UploadQueueInitialized {
task_counter: 0,
dirty: initialized.dirty.clone(),
clean: initialized.clean.clone(),
latest_files: initialized.latest_files.clone(),
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: initialized.latest_metadata.clone(),
latest_lineage: initialized.latest_lineage.clone(),
projected_remote_consistent_lsn: None,
visible_remote_consistent_lsn: initialized
.visible_remote_consistent_lsn
.clone(),
@@ -1889,6 +1863,7 @@ impl RemoteTimelineClient {
dangling_files: HashMap::default(),
shutting_down: false,
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
last_aux_file_policy: initialized.last_aux_file_policy,
};
let upload_queue = std::mem::replace(

View File

@@ -28,7 +28,6 @@ use crate::TEMP_FILE_SUFFIX;
use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath};
use utils::crashsafe::path_with_suffix_extension;
use utils::id::{TenantId, TimelineId};
use utils::pausable_failpoint;
use super::index::{IndexPart, LayerFileMetadata};
use super::{
@@ -85,7 +84,7 @@ pub async fn download_layer_file<'a>(
)
.await?;
let expected = layer_metadata.file_size;
let expected = layer_metadata.file_size();
if expected != bytes_amount {
return Err(DownloadError::Other(anyhow!(
"According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file {temp_file_path:?}",
@@ -153,8 +152,6 @@ async fn download_object<'a>(
let download = storage.download(src_path, cancel).await?;
pausable_failpoint!("before-downloading-layer-stream-pausable");
let mut buf_writer =
tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
@@ -202,8 +199,6 @@ async fn download_object<'a>(
let mut download = storage.download(src_path, cancel).await?;
pausable_failpoint!("before-downloading-layer-stream-pausable");
// TODO: use vectored write (writev) once supported by tokio-epoll-uring.
// There's chunks_vectored() on the stream.
let (bytes_amount, destination_file) = async {

View File

@@ -11,11 +11,52 @@ use utils::id::TimelineId;
use crate::tenant::metadata::TimelineMetadata;
use crate::tenant::storage_layer::LayerName;
use crate::tenant::upload_queue::UploadQueueInitialized;
use crate::tenant::Generation;
use pageserver_api::shard::ShardIndex;
use utils::lsn::Lsn;
/// Metadata gathered for each of the layer files.
///
/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
/// might have less or more metadata depending if upgrading or rolling back an upgrade.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
//#[cfg_attr(test, derive(Default))]
pub struct LayerFileMetadata {
file_size: u64,
pub(crate) generation: Generation,
pub(crate) shard: ShardIndex,
}
impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
fn from(other: &IndexLayerMetadata) -> Self {
LayerFileMetadata {
file_size: other.file_size,
generation: other.generation,
shard: other.shard,
}
}
}
impl LayerFileMetadata {
pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
LayerFileMetadata {
file_size,
generation,
shard,
}
}
pub fn file_size(&self) -> u64 {
self.file_size
}
}
// TODO seems like another part of the remote storage file format
// compatibility issue, see https://github.com/neondatabase/neon/issues/3072
/// In-memory representation of an `index_part.json` file
///
/// Contains the data about all files in the timeline, present remotely and its metadata.
@@ -36,18 +77,14 @@ pub struct IndexPart {
///
/// Older versions of `IndexPart` will not have this property or have only a part of metadata
/// that latest version stores.
pub layer_metadata: HashMap<LayerName, LayerFileMetadata>,
pub layer_metadata: HashMap<LayerName, IndexLayerMetadata>,
// 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
// It's duplicated for convenience when reading the serialized structure, but is
// private because internally we would read from metadata instead.
pub(super) disk_consistent_lsn: Lsn,
disk_consistent_lsn: Lsn,
// TODO: later make this "rename" to "alias", rename field as "legacy_metadata"
#[serde(
rename = "metadata_bytes",
with = "crate::tenant::metadata::modern_serde"
)]
#[serde(rename = "metadata_bytes")]
pub metadata: TimelineMetadata,
#[serde(default)]
@@ -83,15 +120,26 @@ impl IndexPart {
pub const FILE_NAME: &'static str = "index_part.json";
pub(crate) fn empty(metadata: TimelineMetadata) -> Self {
IndexPart {
fn new(
layers_and_metadata: &HashMap<LayerName, LayerFileMetadata>,
disk_consistent_lsn: Lsn,
metadata: TimelineMetadata,
lineage: Lineage,
last_aux_file_policy: Option<AuxFilePolicy>,
) -> Self {
let layer_metadata = layers_and_metadata
.iter()
.map(|(k, v)| (k.to_owned(), IndexLayerMetadata::from(v)))
.collect();
Self {
version: Self::LATEST_VERSION,
layer_metadata: Default::default(),
disk_consistent_lsn: metadata.disk_consistent_lsn(),
layer_metadata,
disk_consistent_lsn,
metadata,
deleted_at: None,
lineage: Default::default(),
last_aux_file_policy: None,
lineage,
last_aux_file_policy,
}
}
@@ -101,7 +149,7 @@ impl IndexPart {
/// If you want this under normal operations, read it from self.metadata:
/// this method is just for the scrubber to use when validating an index.
pub fn duplicated_disk_consistent_lsn(&self) -> Lsn {
pub fn get_disk_consistent_lsn(&self) -> Lsn {
self.disk_consistent_lsn
}
@@ -115,7 +163,14 @@ impl IndexPart {
#[cfg(test)]
pub(crate) fn example() -> Self {
Self::empty(TimelineMetadata::example())
let example_metadata = TimelineMetadata::example();
Self::new(
&HashMap::new(),
example_metadata.disk_consistent_lsn(),
example_metadata,
Default::default(),
Some(AuxFilePolicy::V1),
)
}
pub(crate) fn last_aux_file_policy(&self) -> Option<AuxFilePolicy> {
@@ -123,12 +178,25 @@ impl IndexPart {
}
}
/// Metadata gathered for each of the layer files.
///
/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
/// might have less or more metadata depending if upgrading or rolling back an upgrade.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub struct LayerFileMetadata {
impl From<&UploadQueueInitialized> for IndexPart {
fn from(uq: &UploadQueueInitialized) -> Self {
let disk_consistent_lsn = uq.latest_metadata.disk_consistent_lsn();
let metadata = uq.latest_metadata.clone();
let lineage = uq.latest_lineage.clone();
Self::new(
&uq.latest_files,
disk_consistent_lsn,
metadata,
lineage,
uq.last_aux_file_policy,
)
}
}
/// Serialized form of [`LayerFileMetadata`].
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
pub struct IndexLayerMetadata {
pub file_size: u64,
#[serde(default = "Generation::none")]
@@ -140,12 +208,12 @@ pub struct LayerFileMetadata {
pub shard: ShardIndex,
}
impl LayerFileMetadata {
pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
LayerFileMetadata {
file_size,
generation,
shard,
impl From<&LayerFileMetadata> for IndexLayerMetadata {
fn from(other: &LayerFileMetadata) -> Self {
IndexLayerMetadata {
file_size: other.file_size,
generation: other.generation,
shard: other.shard,
}
}
}
@@ -208,10 +276,11 @@ impl Lineage {
/// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed
/// to start a read/write primary at this lsn".
///
/// Returns true if the Lsn was previously our branch point.
/// Returns true if the Lsn was previously a branch point.
pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool {
self.original_ancestor
.is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn)
.as_ref()
.is_some_and(|(_, ancestor_lsn, _)| lsn == *ancestor_lsn)
}
}
@@ -238,12 +307,12 @@ mod tests {
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
version: 1,
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
generation: Generation::none(),
shard: ShardIndex::unsharded()
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
// serde_json should always parse this but this might be a double with jq for
// example.
file_size: 9007199254741001,
@@ -280,12 +349,12 @@ mod tests {
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
version: 1,
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
generation: Generation::none(),
shard: ShardIndex::unsharded()
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
// serde_json should always parse this but this might be a double with jq for
// example.
file_size: 9007199254741001,
@@ -323,12 +392,12 @@ mod tests {
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
version: 2,
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
generation: Generation::none(),
shard: ShardIndex::unsharded()
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
// serde_json should always parse this but this might be a double with jq for
// example.
file_size: 9007199254741001,
@@ -411,12 +480,12 @@ mod tests {
let expected = IndexPart {
version: 4,
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
generation: Generation::none(),
shard: ShardIndex::unsharded()
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
// serde_json should always parse this but this might be a double with jq for
// example.
file_size: 9007199254741001,
@@ -453,12 +522,12 @@ mod tests {
let expected = IndexPart {
version: 5,
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), LayerFileMetadata {
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), IndexLayerMetadata {
file_size: 23289856,
generation: Generation::new(1),
shard: ShardIndex::unsharded(),
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), LayerFileMetadata {
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), IndexLayerMetadata {
file_size: 1015808,
generation: Generation::new(1),
shard: ShardIndex::unsharded(),
@@ -500,12 +569,12 @@ mod tests {
let expected = IndexPart {
version: 6,
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
generation: Generation::none(),
shard: ShardIndex::unsharded()
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
// serde_json should always parse this but this might be a double with jq for
// example.
file_size: 9007199254741001,

View File

@@ -1,7 +1,6 @@
//! Helper functions to upload files to remote storage with a RemoteStorage
use anyhow::{bail, Context};
use bytes::Bytes;
use camino::Utf8Path;
use fail::fail_point;
use pageserver_api::shard::TenantShardId;
@@ -10,12 +9,12 @@ use std::time::SystemTime;
use tokio::fs::{self, File};
use tokio::io::AsyncSeekExt;
use tokio_util::sync::CancellationToken;
use utils::{backoff, pausable_failpoint};
use utils::backoff;
use super::index::IndexPart;
use super::Generation;
use crate::tenant::remote_timeline_client::{
remote_index_path, remote_initdb_archive_path, remote_initdb_preserved_archive_path,
index::IndexPart, remote_index_path, remote_initdb_archive_path,
remote_initdb_preserved_archive_path,
};
use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError};
use utils::id::{TenantId, TimelineId};
@@ -28,7 +27,7 @@ pub(crate) async fn upload_index_part<'a>(
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
generation: Generation,
index_part: &IndexPart,
index_part: &'a IndexPart,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
tracing::trace!("uploading new index part");
@@ -38,16 +37,16 @@ pub(crate) async fn upload_index_part<'a>(
});
pausable_failpoint!("before-upload-index-pausable");
// FIXME: this error comes too late
let serialized = index_part.to_s3_bytes()?;
let serialized = Bytes::from(serialized);
let index_part_size = serialized.len();
let index_part_bytes = index_part
.to_s3_bytes()
.context("serialize index part file into bytes")?;
let index_part_size = index_part_bytes.len();
let index_part_bytes = bytes::Bytes::from(index_part_bytes);
let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation);
storage
.upload_storage_object(
futures::stream::once(futures::future::ready(Ok(serialized))),
futures::stream::once(futures::future::ready(Ok(index_part_bytes))),
index_part_size,
&remote_path,
cancel,

View File

@@ -187,7 +187,6 @@ impl SecondaryTenant {
};
let now = SystemTime::now();
tracing::info!("Evicting secondary layer");
let this = self.clone();

View File

@@ -45,10 +45,10 @@ use crate::tenant::{
use camino::Utf8PathBuf;
use chrono::format::{DelayedFormat, StrftimeItems};
use futures::Future;
use futures::{Future, StreamExt};
use pageserver_api::models::SecondaryProgress;
use pageserver_api::shard::TenantShardId;
use remote_storage::{DownloadError, Etag, GenericRemoteStorage};
use remote_storage::{DownloadError, Etag, GenericRemoteStorage, RemoteStorageActivity};
use tokio_util::sync::CancellationToken;
use tracing::{info_span, instrument, warn, Instrument};
@@ -67,6 +67,12 @@ use super::{
/// download, if the uploader populated it.
const DEFAULT_DOWNLOAD_INTERVAL: Duration = Duration::from_millis(60000);
/// Range of concurrency we may use when downloading layers within a timeline. This is independent
/// for each tenant we're downloading: the concurrency of _tenants_ is defined separately in
/// `PageServerConf::secondary_download_concurrency`
const MAX_LAYER_CONCURRENCY: usize = 16;
const MIN_LAYER_CONCURRENCY: usize = 1;
pub(super) async fn downloader_task(
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
@@ -75,18 +81,19 @@ pub(super) async fn downloader_task(
cancel: CancellationToken,
root_ctx: RequestContext,
) {
let concurrency = tenant_manager.get_conf().secondary_download_concurrency;
// How many tenants' secondary download operations we will run concurrently
let tenant_concurrency = tenant_manager.get_conf().secondary_download_concurrency;
let generator = SecondaryDownloader {
tenant_manager,
remote_storage,
root_ctx,
};
let mut scheduler = Scheduler::new(generator, concurrency);
let mut scheduler = Scheduler::new(generator, tenant_concurrency);
scheduler
.run(command_queue, background_jobs_can_start, cancel)
.instrument(info_span!("secondary_download_scheduler"))
.instrument(info_span!("secondary_downloads"))
.await
}
@@ -407,7 +414,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
tracing::warn!("Insufficient space while downloading. Will retry later.");
}
Err(UpdateError::Cancelled) => {
tracing::info!("Shut down while downloading");
tracing::debug!("Shut down while downloading");
},
Err(UpdateError::Deserialize(e)) => {
tracing::error!("Corrupt content while downloading tenant: {e}");
@@ -562,39 +569,6 @@ impl<'a> TenantDownloader<'a> {
heatmap.timelines.len()
);
// Get or initialize the local disk state for the timelines we will update
let mut timeline_states = HashMap::new();
for timeline in &heatmap.timelines {
let timeline_state = self
.secondary_state
.detail
.lock()
.unwrap()
.timelines
.get(&timeline.timeline_id)
.cloned();
let timeline_state = match timeline_state {
Some(t) => t,
None => {
// We have no existing state: need to scan local disk for layers first.
let timeline_state =
init_timeline_state(self.conf, tenant_shard_id, timeline).await;
// Re-acquire detail lock now that we're done with async load from local FS
self.secondary_state
.detail
.lock()
.unwrap()
.timelines
.insert(timeline.timeline_id, timeline_state.clone());
timeline_state
}
};
timeline_states.insert(timeline.timeline_id, timeline_state);
}
// Clean up any local layers that aren't in the heatmap. We do this first for all timelines, on the general
// principle that deletions should be done before writes wherever possible, and so that we can use this
// phase to initialize our SecondaryProgress.
@@ -605,10 +579,6 @@ impl<'a> TenantDownloader<'a> {
// Download the layers in the heatmap
for timeline in heatmap.timelines {
let timeline_state = timeline_states
.remove(&timeline.timeline_id)
.expect("Just populated above");
if self.secondary_state.cancel.is_cancelled() {
tracing::debug!(
"Cancelled before downloading timeline {}",
@@ -618,7 +588,7 @@ impl<'a> TenantDownloader<'a> {
}
let timeline_id = timeline.timeline_id;
self.download_timeline(timeline, timeline_state, ctx)
self.download_timeline(timeline, ctx)
.instrument(tracing::info_span!(
"secondary_download_timeline",
tenant_id=%tenant_shard_id.tenant_id,
@@ -639,22 +609,6 @@ impl<'a> TenantDownloader<'a> {
.unwrap_or(DEFAULT_DOWNLOAD_INTERVAL),
});
// Robustness: we should have updated progress properly, but in case we didn't, make sure
// we don't leave the tenant in a state where we claim to have successfully downloaded
// everything, but our progress is incomplete. The invariant here should be that if
// we have set `last_download` to this heatmap's etag, then the next time we see that
// etag we can safely do no work (i.e. we must be complete).
let mut progress = self.secondary_state.progress.lock().unwrap();
debug_assert!(progress.layers_downloaded == progress.layers_total);
debug_assert!(progress.bytes_downloaded == progress.bytes_total);
if progress.layers_downloaded != progress.layers_total
|| progress.bytes_downloaded != progress.bytes_total
{
tracing::warn!("Correcting drift in progress stats ({progress:?})");
progress.layers_downloaded = progress.layers_total;
progress.bytes_downloaded = progress.bytes_total;
}
Ok(())
}
@@ -709,7 +663,7 @@ impl<'a> TenantDownloader<'a> {
let mut layer_byte_count: u64 = timeline_state
.on_disk_layers
.values()
.map(|l| l.metadata.file_size)
.map(|l| l.metadata.file_size())
.sum();
// Remove on-disk layers that are no longer present in heatmap
@@ -720,7 +674,7 @@ impl<'a> TenantDownloader<'a> {
.get(layer_file_name)
.unwrap()
.metadata
.file_size;
.file_size();
let local_path = local_layer_path(
self.conf,
@@ -830,7 +784,6 @@ impl<'a> TenantDownloader<'a> {
async fn download_timeline(
&self,
timeline: HeatMapTimeline,
timeline_state: SecondaryDetailTimeline,
ctx: &RequestContext,
) -> Result<(), UpdateError> {
debug_assert_current_span_has_tenant_and_timeline_id();
@@ -839,8 +792,38 @@ impl<'a> TenantDownloader<'a> {
// Accumulate updates to the state
let mut touched = Vec::new();
// Clone a view of what layers already exist on disk
let timeline_state = self
.secondary_state
.detail
.lock()
.unwrap()
.timelines
.get(&timeline.timeline_id)
.cloned();
let timeline_state = match timeline_state {
Some(t) => t,
None => {
// We have no existing state: need to scan local disk for layers first.
let timeline_state =
init_timeline_state(self.conf, tenant_shard_id, &timeline).await;
// Re-acquire detail lock now that we're done with async load from local FS
self.secondary_state
.detail
.lock()
.unwrap()
.timelines
.insert(timeline.timeline_id, timeline_state.clone());
timeline_state
}
};
tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
let mut download_futs = Vec::new();
// Download heatmap layers that are not present on local disk, or update their
// access time if they are already present.
for layer in timeline.layers {
@@ -877,7 +860,9 @@ impl<'a> TenantDownloader<'a> {
}
}
if on_disk.metadata != layer.metadata || on_disk.access_time != layer.access_time {
if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
|| on_disk.access_time != layer.access_time
{
// We already have this layer on disk. Update its access time.
tracing::debug!(
"Access time updated for layer {}: {} -> {}",
@@ -909,19 +894,35 @@ impl<'a> TenantDownloader<'a> {
strftime(&layer.access_time),
strftime(evicted_at)
);
self.skip_layer(layer);
continue;
}
}
match self
.download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx)
.await?
{
Some(layer) => touched.push(layer),
None => {
// Not an error but we didn't download it: remote layer is missing. Don't add it to the list of
// things to consider touched.
download_futs.push(self.download_layer(
tenant_shard_id,
&timeline.timeline_id,
layer,
ctx,
));
}
// Break up layer downloads into chunks, so that for each chunk we can re-check how much
// concurrency to use based on activity level of remote storage.
while !download_futs.is_empty() {
let chunk =
download_futs.split_off(download_futs.len().saturating_sub(MAX_LAYER_CONCURRENCY));
let concurrency = Self::layer_concurrency(self.remote_storage.activity());
let mut result_stream = futures::stream::iter(chunk).buffered(concurrency);
let mut result_stream = std::pin::pin!(result_stream);
while let Some(result) = result_stream.next().await {
match result {
Err(e) => return Err(e),
Ok(None) => {
// No error, but we didn't download the layer. Don't mark it touched
}
Ok(Some(layer)) => touched.push(layer),
}
}
}
@@ -952,7 +953,7 @@ impl<'a> TenantDownloader<'a> {
tenant_shard_id,
&timeline.timeline_id,
t.name,
t.metadata.clone(),
LayerFileMetadata::from(&t.metadata),
t.access_time,
local_path,
));
@@ -964,15 +965,6 @@ impl<'a> TenantDownloader<'a> {
Ok(())
}
/// Call this during timeline download if a layer will _not_ be downloaded, to update progress statistics
fn skip_layer(&self, layer: HeatMapLayer) {
let mut progress = self.secondary_state.progress.lock().unwrap();
progress.layers_total = progress.layers_total.saturating_sub(1);
progress.bytes_total = progress
.bytes_total
.saturating_sub(layer.metadata.file_size);
}
async fn download_layer(
&self,
tenant_shard_id: &TenantShardId,
@@ -995,25 +987,19 @@ impl<'a> TenantDownloader<'a> {
);
// Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
tracing::info!(
"Starting download of layer {}, size {}",
layer.name,
layer.metadata.file_size
);
let downloaded_bytes = download_layer_file(
let downloaded_bytes = match download_layer_file(
self.conf,
self.remote_storage,
*tenant_shard_id,
*timeline_id,
&layer.name,
&layer.metadata,
&LayerFileMetadata::from(&layer.metadata),
&local_path,
&self.secondary_state.cancel,
ctx,
)
.await;
let downloaded_bytes = match downloaded_bytes {
.await
{
Ok(bytes) => bytes,
Err(DownloadError::NotFound) => {
// A heatmap might be out of date and refer to a layer that doesn't exist any more.
@@ -1023,8 +1009,6 @@ impl<'a> TenantDownloader<'a> {
"Skipped downloading missing layer {}, raced with compaction/gc?",
layer.name
);
self.skip_layer(layer);
return Ok(None);
}
Err(e) => return Err(e.into()),
@@ -1060,6 +1044,19 @@ impl<'a> TenantDownloader<'a> {
Ok(Some(layer))
}
/// Calculate the currently allowed parallelism of layer download tasks, based on activity level of the remote storage
fn layer_concurrency(activity: RemoteStorageActivity) -> usize {
// When less than 75% of units are available, use minimum concurrency. Else, do a linear mapping
// of our concurrency range to the units available within the remaining 25%.
let clamp_at = (activity.read_total * 3) / 4;
if activity.read_available > clamp_at {
(MAX_LAYER_CONCURRENCY * (activity.read_available - clamp_at))
/ (activity.read_total - clamp_at)
} else {
MIN_LAYER_CONCURRENCY
}
}
}
/// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline
@@ -1149,7 +1146,7 @@ async fn init_timeline_state(
tenant_shard_id,
&heatmap.timeline_id,
name,
remote_meta.metadata.clone(),
LayerFileMetadata::from(&remote_meta.metadata),
remote_meta.access_time,
file_path,
),
@@ -1183,3 +1180,58 @@ async fn init_timeline_state(
detail
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn layer_concurrency() {
// Totally idle
assert_eq!(
TenantDownloader::layer_concurrency(RemoteStorageActivity {
read_available: 16,
read_total: 16,
write_available: 16,
write_total: 16
}),
MAX_LAYER_CONCURRENCY
);
// Totally busy
assert_eq!(
TenantDownloader::layer_concurrency(RemoteStorageActivity {
read_available: 0,
read_total: 16,
write_available: 16,
write_total: 16
}),
MIN_LAYER_CONCURRENCY
);
// Edge of the range at which we interpolate
assert_eq!(
TenantDownloader::layer_concurrency(RemoteStorageActivity {
read_available: 12,
read_total: 16,
write_available: 16,
write_total: 16
}),
MIN_LAYER_CONCURRENCY
);
// Midpoint of the range in which we interpolate
assert_eq!(
TenantDownloader::layer_concurrency(RemoteStorageActivity {
read_available: 14,
read_total: 16,
write_available: 16,
write_total: 16
}),
MAX_LAYER_CONCURRENCY / 2
);
}
}

View File

@@ -1,6 +1,6 @@
use std::time::SystemTime;
use crate::tenant::{remote_timeline_client::index::LayerFileMetadata, storage_layer::LayerName};
use crate::tenant::{remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerName};
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr, TimestampSeconds};
@@ -38,7 +38,7 @@ pub(crate) struct HeatMapTimeline {
#[derive(Serialize, Deserialize)]
pub(crate) struct HeatMapLayer {
pub(super) name: LayerName,
pub(super) metadata: LayerFileMetadata,
pub(super) metadata: IndexLayerMetadata,
#[serde_as(as = "TimestampSeconds<i64>")]
pub(super) access_time: SystemTime,
@@ -49,7 +49,7 @@ pub(crate) struct HeatMapLayer {
impl HeatMapLayer {
pub(crate) fn new(
name: LayerName,
metadata: LayerFileMetadata,
metadata: IndexLayerMetadata,
access_time: SystemTime,
) -> Self {
Self {

View File

@@ -53,7 +53,7 @@ pub(super) async fn heatmap_uploader_task(
scheduler
.run(command_queue, background_jobs_can_start, cancel)
.instrument(info_span!("heatmap_upload_scheduler"))
.instrument(info_span!("heatmap_uploader"))
.await
}

View File

@@ -179,13 +179,6 @@ where
// Schedule some work, if concurrency limit permits it
self.spawn_pending();
// This message is printed every scheduling iteration as proof of liveness when looking at logs
tracing::info!(
"Status: {} tasks running, {} pending",
self.running.len(),
self.pending.len()
);
// Between scheduling iterations, we will:
// - Drain any complete tasks and spawn pending tasks
// - Handle incoming administrative commands
@@ -265,11 +258,7 @@ where
self.tasks.spawn(fut);
let replaced = self.running.insert(tenant_shard_id, in_progress);
debug_assert!(replaced.is_none());
if replaced.is_some() {
tracing::warn!(%tenant_shard_id, "Unexpectedly spawned a task when one was already running")
}
self.running.insert(tenant_shard_id, in_progress);
}
/// For all pending tenants that are elegible for execution, spawn their task.
@@ -279,9 +268,7 @@ where
while !self.pending.is_empty() && self.running.len() < self.concurrency {
// unwrap: loop condition includes !is_empty()
let pending = self.pending.pop_front().unwrap();
if !self.running.contains_key(pending.get_tenant_shard_id()) {
self.do_spawn(pending);
}
self.do_spawn(pending);
}
}
@@ -334,11 +321,7 @@ where
let tenant_shard_id = job.get_tenant_shard_id();
let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
tracing::info!(
tenant_id=%tenant_shard_id.tenant_id,
shard_id=%tenant_shard_id.shard_slug(),
"Command already running, waiting for it"
);
tracing::info!("Command already running, waiting for it");
barrier
} else {
let running = self.spawn_now(job);

View File

@@ -318,7 +318,7 @@ pub(crate) struct LayerFringe {
#[derive(Debug)]
struct LayerKeyspace {
layer: ReadableLayer,
target_keyspace: Vec<KeySpace>,
target_keyspace: KeySpace,
}
impl LayerFringe {
@@ -336,7 +336,6 @@ impl LayerFringe {
};
let removed = self.layers.remove_entry(&read_desc.layer_id);
match removed {
Some((
_,
@@ -344,15 +343,7 @@ impl LayerFringe {
layer,
target_keyspace,
},
)) => {
let mut keyspace = KeySpaceRandomAccum::new();
for ks in target_keyspace {
for part in ks.ranges {
keyspace.add_range(part);
}
}
Some((layer, keyspace.consume_keyspace(), read_desc.lsn_range))
}
)) => Some((layer, target_keyspace, read_desc.lsn_range)),
None => unreachable!("fringe internals are always consistent"),
}
}
@@ -367,7 +358,7 @@ impl LayerFringe {
let entry = self.layers.entry(layer_id.clone());
match entry {
Entry::Occupied(mut entry) => {
entry.get_mut().target_keyspace.push(keyspace);
entry.get_mut().target_keyspace.merge(&keyspace);
}
Entry::Vacant(entry) => {
self.planned_reads_by_lsn.push(ReadDesc {
@@ -376,7 +367,7 @@ impl LayerFringe {
});
entry.insert(LayerKeyspace {
layer,
target_keyspace: vec![keyspace],
target_keyspace: keyspace,
});
}
}

View File

@@ -478,23 +478,6 @@ impl DeltaLayerWriterInner {
key_end: Key,
timeline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<ResidentLayer> {
let temp_path = self.path.clone();
let result = self.finish0(key_end, timeline, ctx).await;
if result.is_err() {
tracing::info!(%temp_path, "cleaning up temporary file after error during writing");
if let Err(e) = std::fs::remove_file(&temp_path) {
tracing::warn!(error=%e, %temp_path, "error cleaning up temporary layer file after error during writing");
}
}
result
}
async fn finish0(
self,
key_end: Key,
timeline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<ResidentLayer> {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -668,11 +651,19 @@ impl DeltaLayerWriter {
timeline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<ResidentLayer> {
self.inner
.take()
.unwrap()
.finish(key_end, timeline, ctx)
.await
let inner = self.inner.take().unwrap();
let temp_path = inner.path.clone();
let result = inner.finish(key_end, timeline, ctx).await;
// The delta layer files can sometimes be really large. Clean them up.
if result.is_err() {
tracing::warn!(
"Cleaning up temporary delta file {temp_path} after error during writing"
);
if let Err(e) = std::fs::remove_file(&temp_path) {
tracing::warn!("Error cleaning up temporary delta layer file {temp_path}: {e:?}")
}
}
result
}
}

View File

@@ -47,7 +47,7 @@ use hex;
use itertools::Itertools;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::LayerAccessKind;
use pageserver_api::shard::{ShardIdentity, TenantShardId};
use pageserver_api::shard::TenantShardId;
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
use std::fs::File;
@@ -473,7 +473,7 @@ impl ImageLayerInner {
ctx: &RequestContext,
) -> Result<(), GetVectoredError> {
let reads = self
.plan_reads(keyspace, None, ctx)
.plan_reads(keyspace, ctx)
.await
.map_err(GetVectoredError::Other)?;
@@ -485,15 +485,9 @@ impl ImageLayerInner {
Ok(())
}
/// Traverse the layer's index to build read operations on the overlap of the input keyspace
/// and the keys in this layer.
///
/// If shard_identity is provided, it will be used to filter keys down to those stored on
/// this shard.
async fn plan_reads(
&self,
keyspace: KeySpace,
shard_identity: Option<&ShardIdentity>,
ctx: &RequestContext,
) -> anyhow::Result<Vec<VectoredRead>> {
let mut planner = VectoredReadPlanner::new(
@@ -513,6 +507,7 @@ impl ImageLayerInner {
for range in keyspace.ranges.iter() {
let mut range_end_handled = false;
let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
range.start.write_to_byte_slice(&mut search_key);
@@ -525,22 +520,12 @@ impl ImageLayerInner {
let key = Key::from_slice(&raw_key[..KEY_SIZE]);
assert!(key >= range.start);
let flag = if let Some(shard_identity) = shard_identity {
if shard_identity.is_key_disposable(&key) {
BlobFlag::Ignore
} else {
BlobFlag::None
}
} else {
BlobFlag::None
};
if key >= range.end {
planner.handle_range_end(offset);
range_end_handled = true;
break;
} else {
planner.handle(key, self.lsn, offset, flag);
planner.handle(key, self.lsn, offset, BlobFlag::None);
}
}
@@ -553,50 +538,6 @@ impl ImageLayerInner {
Ok(planner.finish())
}
/// Given a key range, select the parts of that range that should be retained by the ShardIdentity,
/// then execute vectored GET operations, passing the results of all read keys into the writer.
pub(super) async fn filter(
&self,
shard_identity: &ShardIdentity,
writer: &mut ImageLayerWriter,
ctx: &RequestContext,
) -> anyhow::Result<usize> {
// Fragment the range into the regions owned by this ShardIdentity
let plan = self
.plan_reads(
KeySpace {
// If asked for the total key space, plan_reads will give us all the keys in the layer
ranges: vec![Key::MIN..Key::MAX],
},
Some(shard_identity),
ctx,
)
.await?;
let vectored_blob_reader = VectoredBlobReader::new(&self.file);
let mut key_count = 0;
for read in plan.into_iter() {
let buf_size = read.size();
let buf = BytesMut::with_capacity(buf_size);
let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?;
let frozen_buf = blobs_buf.buf.freeze();
for meta in blobs_buf.blobs.iter() {
let img_buf = frozen_buf.slice(meta.start..meta.end);
key_count += 1;
writer
.put_image(meta.meta.key, img_buf, ctx)
.await
.context(format!("Storing key {}", meta.meta.key))?;
}
}
Ok(key_count)
}
async fn do_reads_and_update_state(
&self,
reads: Vec<VectoredRead>,
@@ -709,7 +650,7 @@ impl ImageLayerWriterInner {
lsn,
},
);
trace!("creating image layer {}", path);
info!("new image layer {path}");
let mut file = {
VirtualFile::open_with_options(
&path,
@@ -829,7 +770,7 @@ impl ImageLayerWriterInner {
// FIXME: why not carry the virtualfile here, it supports renaming?
let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
info!("created image layer {}", layer.local_path());
trace!("created image layer {}", layer.local_path());
Ok(layer)
}
@@ -914,196 +855,3 @@ impl Drop for ImageLayerWriter {
}
}
}
#[cfg(test)]
mod test {
use std::time::Duration;
use bytes::Bytes;
use pageserver_api::{
key::Key,
shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize},
};
use utils::{
generation::Generation,
id::{TenantId, TimelineId},
lsn::Lsn,
};
use crate::{
tenant::{config::TenantConf, harness::TenantHarness},
DEFAULT_PG_VERSION,
};
use super::ImageLayerWriter;
#[tokio::test]
async fn image_layer_rewrite() {
let tenant_conf = TenantConf {
gc_period: Duration::ZERO,
compaction_period: Duration::ZERO,
..TenantConf::default()
};
let tenant_id = TenantId::generate();
let mut gen = Generation::new(0xdead0001);
let mut get_next_gen = || {
let ret = gen;
gen = gen.next();
ret
};
// The LSN at which we will create an image layer to filter
let lsn = Lsn(0xdeadbeef0000);
let timeline_id = TimelineId::generate();
//
// Create an unsharded parent with a layer.
//
let harness = TenantHarness::create_custom(
"test_image_layer_rewrite--parent",
tenant_conf.clone(),
tenant_id,
ShardIdentity::unsharded(),
get_next_gen(),
)
.unwrap();
let (tenant, ctx) = harness.load().await;
let timeline = tenant
.create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx)
.await
.unwrap();
// This key range contains several 0x8000 page stripes, only one of which belongs to shard zero
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
let range = input_start..input_end;
// Build an image layer to filter
let resident = {
let mut writer = ImageLayerWriter::new(
harness.conf,
timeline_id,
harness.tenant_shard_id,
&range,
lsn,
&ctx,
)
.await
.unwrap();
let foo_img = Bytes::from_static(&[1, 2, 3, 4]);
let mut key = range.start;
while key < range.end {
writer.put_image(key, foo_img.clone(), &ctx).await.unwrap();
key = key.next();
}
writer.finish(&timeline, &ctx).await.unwrap()
};
let original_size = resident.metadata().file_size;
//
// Create child shards and do the rewrite, exercising filter().
// TODO: abstraction in TenantHarness for splits.
//
// Filter for various shards: this exercises cases like values at start of key range, end of key
// range, middle of key range.
let shard_count = ShardCount::new(4);
for shard_number in 0..shard_count.count() {
//
// mimic the shard split
//
let shard_identity = ShardIdentity::new(
ShardNumber(shard_number),
shard_count,
ShardStripeSize(0x8000),
)
.unwrap();
let harness = TenantHarness::create_custom(
Box::leak(Box::new(format!(
"test_image_layer_rewrite--child{}",
shard_identity.shard_slug()
))),
tenant_conf.clone(),
tenant_id,
shard_identity,
// NB: in reality, the shards would each fork off their own gen number sequence from the parent.
// But here, all we care about is that the gen number is unique.
get_next_gen(),
)
.unwrap();
let (tenant, ctx) = harness.load().await;
let timeline = tenant
.create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx)
.await
.unwrap();
//
// use filter() and make assertions
//
let mut filtered_writer = ImageLayerWriter::new(
harness.conf,
timeline_id,
harness.tenant_shard_id,
&range,
lsn,
&ctx,
)
.await
.unwrap();
let wrote_keys = resident
.filter(&shard_identity, &mut filtered_writer, &ctx)
.await
.unwrap();
let replacement = if wrote_keys > 0 {
Some(filtered_writer.finish(&timeline, &ctx).await.unwrap())
} else {
None
};
// This exact size and those below will need updating as/when the layer encoding changes, but
// should be deterministic for a given version of the format, as we used no randomness generating the input.
assert_eq!(original_size, 1597440);
match shard_number {
0 => {
// We should have written out just one stripe for our shard identity
assert_eq!(wrote_keys, 0x8000);
let replacement = replacement.unwrap();
// We should have dropped some of the data
assert!(replacement.metadata().file_size < original_size);
assert!(replacement.metadata().file_size > 0);
// Assert that we dropped ~3/4 of the data.
assert_eq!(replacement.metadata().file_size, 417792);
}
1 => {
// Shard 1 has no keys in our input range
assert_eq!(wrote_keys, 0x0);
assert!(replacement.is_none());
}
2 => {
// Shard 2 has one stripes in the input range
assert_eq!(wrote_keys, 0x8000);
let replacement = replacement.unwrap();
assert!(replacement.metadata().file_size < original_size);
assert!(replacement.metadata().file_size > 0);
assert_eq!(replacement.metadata().file_size, 417792);
}
3 => {
// Shard 3 has two stripes in the input range
assert_eq!(wrote_keys, 0x10000);
let replacement = replacement.unwrap();
assert!(replacement.metadata().file_size < original_size);
assert!(replacement.metadata().file_size > 0);
assert_eq!(replacement.metadata().file_size, 811008);
}
_ => unreachable!(),
}
}
}
}

View File

@@ -4,7 +4,7 @@ use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::{
HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
};
use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId};
use pageserver_api::shard::{ShardIndex, TenantShardId};
use std::ops::Range;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::{Arc, Weak};
@@ -12,7 +12,7 @@ use std::time::{Duration, SystemTime};
use tracing::Instrument;
use utils::id::TimelineId;
use utils::lsn::Lsn;
use utils::sync::{gate, heavier_once_cell};
use utils::sync::heavier_once_cell;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
@@ -23,10 +23,10 @@ use crate::tenant::timeline::GetVectoredError;
use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};
use super::delta_layer::{self, DeltaEntry};
use super::image_layer::{self};
use super::image_layer;
use super::{
AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerName, PersistentLayerDesc,
ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
};
use utils::generation::Generation;
@@ -161,7 +161,7 @@ impl Layer {
timeline.tenant_shard_id,
timeline.timeline_id,
file_name,
metadata.file_size,
metadata.file_size(),
);
let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted);
@@ -194,7 +194,7 @@ impl Layer {
timeline.tenant_shard_id,
timeline.timeline_id,
file_name,
metadata.file_size,
metadata.file_size(),
);
let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident);
@@ -227,7 +227,7 @@ impl Layer {
timeline
.metrics
.resident_physical_size_add(metadata.file_size);
.resident_physical_size_add(metadata.file_size());
ResidentLayer { downloaded, owner }
}
@@ -277,10 +277,9 @@ impl Layer {
let downloaded = resident.expect("just initialized");
// We never want to overwrite an existing file, so we use `RENAME_NOREPLACE`.
// TODO: this leaves the temp file in place if the rename fails, risking us running
// out of space. Should we clean it up here or does the calling context deal with this?
utils::fs_ext::rename_noreplace(temp_path.as_std_path(), owner.local_path().as_std_path())
// if the rename works, the path is as expected
// TODO: sync system call
std::fs::rename(temp_path, owner.local_path())
.with_context(|| format!("rename temporary file as correct path for {owner}"))?;
Ok(ResidentLayer { downloaded, owner })
@@ -367,10 +366,7 @@ impl Layer {
.0
.get_or_maybe_download(true, Some(ctx))
.await
.map_err(|err| match err {
DownloadError::DownloadCancelled => GetVectoredError::Cancelled,
other => GetVectoredError::Other(anyhow::anyhow!(other)),
})?;
.map_err(|err| GetVectoredError::Other(anyhow::anyhow!(err)))?;
self.0
.access_stats
@@ -1162,11 +1158,6 @@ impl LayerInner {
let consecutive_failures =
1 + self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
if timeline.cancel.is_cancelled() {
// If we're shutting down, drop out before logging the error
return Err(e);
}
tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
let backoff = utils::backoff::exponential_backoff_duration_seconds(
@@ -1273,7 +1264,6 @@ impl LayerInner {
lsn_end: lsn_range.end,
remote: !resident,
access_stats,
l0: crate::tenant::layer_map::LayerMap::is_l0(self.layer_desc()),
}
} else {
let lsn = self.desc.image_layer_lsn();
@@ -1342,7 +1332,7 @@ impl LayerInner {
is_good_to_continue(&rx.borrow_and_update())?;
let Ok(gate) = timeline.gate.enter() else {
let Ok(_gate) = timeline.gate.enter() else {
return Err(EvictionCancelled::TimelineGone);
};
@@ -1430,7 +1420,7 @@ impl LayerInner {
Self::spawn_blocking(move || {
let _span = span.entered();
let res = self.evict_blocking(&timeline, &gate, &permit);
let res = self.evict_blocking(&timeline, &permit);
let waiters = self.inner.initializer_count();
@@ -1456,7 +1446,6 @@ impl LayerInner {
fn evict_blocking(
&self,
timeline: &Timeline,
_gate: &gate::GateGuard,
_permit: &heavier_once_cell::InitPermit,
) -> Result<(), EvictionCancelled> {
// now accesses to `self.inner.get_or_init*` wait on the semaphore or the `_permit`
@@ -1811,15 +1800,16 @@ impl ResidentLayer {
use LayerKind::*;
let owner = &self.owner.0;
match self.downloaded.get(owner, ctx).await? {
Delta(ref d) => {
// this is valid because the DownloadedLayer::kind is a OnceCell, not a
// Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
// while it's being held.
owner
.access_stats
.record_access(LayerAccessKind::KeyIter, ctx);
// this is valid because the DownloadedLayer::kind is a OnceCell, not a
// Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
// while it's being held.
delta_layer::DeltaLayerInner::load_keys(d, ctx)
.await
.with_context(|| format!("Layer index is corrupted for {self}"))
@@ -1828,23 +1818,6 @@ impl ResidentLayer {
}
}
/// Read all they keys in this layer which match the ShardIdentity, and write them all to
/// the provided writer. Return the number of keys written.
#[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))]
pub(crate) async fn filter<'a>(
&'a self,
shard_identity: &ShardIdentity,
writer: &mut ImageLayerWriter,
ctx: &RequestContext,
) -> anyhow::Result<usize> {
use LayerKind::*;
match self.downloaded.get(&self.owner.0, ctx).await? {
Delta(_) => anyhow::bail!(format!("cannot filter() on a delta layer {self}")),
Image(i) => i.filter(shard_identity, writer, ctx).await,
}
}
/// Returns the amount of keys and values written to the writer.
pub(crate) async fn copy_delta_prefix(
&self,

View File

@@ -347,33 +347,37 @@ impl<'de> serde::de::Visitor<'de> for LayerNameVisitor {
mod test {
use super::*;
#[test]
fn image_layer_parse() {
fn image_layer_parse() -> anyhow::Result<()> {
let expected = LayerName::Image(ImageLayerName {
key_range: Key::from_i128(0)
..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
lsn: Lsn::from_hex("00000000014FED58").unwrap(),
});
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").unwrap();
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").map_err(|s| anyhow::anyhow!(s))?;
assert_eq!(parsed, expected,);
// Omitting generation suffix is valid
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").unwrap();
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").map_err(|s| anyhow::anyhow!(s))?;
assert_eq!(parsed, expected,);
Ok(())
}
#[test]
fn delta_layer_parse() {
fn delta_layer_parse() -> anyhow::Result<()> {
let expected = LayerName::Delta(DeltaLayerName {
key_range: Key::from_i128(0)
..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
lsn_range: Lsn::from_hex("00000000014FED58").unwrap()
..Lsn::from_hex("000000000154C481").unwrap(),
});
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").unwrap();
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").map_err(|s| anyhow::anyhow!(s))?;
assert_eq!(parsed, expected);
// Omitting generation suffix is valid
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").unwrap();
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").map_err(|s| anyhow::anyhow!(s))?;
assert_eq!(parsed, expected);
Ok(())
}
}

View File

@@ -17,7 +17,7 @@ use crate::tenant::{Tenant, TenantState};
use rand::Rng;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::{backoff, completion, pausable_failpoint};
use utils::{backoff, completion};
static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
once_cell::sync::Lazy::new(|| {
@@ -380,28 +380,21 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
let res = tenant
.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx)
.await;
match res {
Ok(_) => {
error_run_count = 0;
period
}
Err(crate::tenant::GcError::TenantCancelled) => {
return;
}
Err(e) => {
let wait_duration = backoff::exponential_backoff_duration_seconds(
error_run_count + 1,
1.0,
MAX_BACKOFF_SECS,
);
error_run_count += 1;
let wait_duration = Duration::from_secs_f64(wait_duration);
error!(
if let Err(e) = res {
let wait_duration = backoff::exponential_backoff_duration_seconds(
error_run_count + 1,
1.0,
MAX_BACKOFF_SECS,
);
error_run_count += 1;
let wait_duration = Duration::from_secs_f64(wait_duration);
error!(
"Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
);
wait_duration
}
wait_duration
} else {
error_run_count = 0;
period
}
};

File diff suppressed because it is too large Load Diff

View File

@@ -9,10 +9,7 @@ use std::ops::{Deref, Range};
use std::sync::Arc;
use super::layer_manager::LayerManager;
use super::{
CompactFlags, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
RecordedDuration, Timeline,
};
use super::{CompactFlags, DurationRecorder, ImageLayerCreationMode, RecordedDuration, Timeline};
use anyhow::{anyhow, Context};
use enumset::EnumSet;
@@ -25,13 +22,14 @@ use tracing::{debug, info, info_span, trace, warn, Instrument};
use utils::id::TimelineId;
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
use crate::page_cache;
use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome};
use crate::tenant::timeline::{drop_rlock, is_rel_fsm_block_key, is_rel_vm_block_key, Hole};
use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter};
use crate::tenant::timeline::{Layer, ResidentLayer};
use crate::tenant::DeltaLayer;
use crate::tenant::PageReconstructError;
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
use crate::{page_cache, ZERO_PAGE};
use crate::keyspace::KeySpace;
use crate::repository::Key;
@@ -133,7 +131,8 @@ impl Timeline {
},
&image_ctx,
)
.await?;
.await
.map_err(anyhow::Error::from)?;
self.upload_new_image_layers(image_layers)?;
partitioning.parts.len()
@@ -175,24 +174,13 @@ impl Timeline {
async fn compact_shard_ancestors(
self: &Arc<Self>,
rewrite_max: usize,
ctx: &RequestContext,
_ctx: &RequestContext,
) -> anyhow::Result<()> {
let mut drop_layers = Vec::new();
let mut layers_to_rewrite: Vec<Layer> = Vec::new();
let layers_to_rewrite: Vec<Layer> = Vec::new();
// We will use the Lsn cutoff of the last GC as a threshold for rewriting layers: if a
// layer is behind this Lsn, it indicates that the layer is being retained beyond the
// pitr_interval, for example because a branchpoint references it.
//
// Holding this read guard also blocks [`Self::gc_timeline`] from entering while we
// are rewriting layers.
let latest_gc_cutoff = self.get_latest_gc_cutoff_lsn();
tracing::info!(
"latest_gc_cutoff: {}, pitr cutoff {}",
*latest_gc_cutoff,
self.gc_info.read().unwrap().cutoffs.pitr
);
// We will use the PITR cutoff as a condition for rewriting layers.
let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.pitr;
let layers = self.layers.read().await;
for layer_desc in layers.layer_map().iter_historic_layers() {
@@ -251,9 +239,9 @@ impl Timeline {
// Don't bother re-writing a layer if it is within the PITR window: it will age-out eventually
// without incurring the I/O cost of a rewrite.
if layer_desc.get_lsn_range().end >= *latest_gc_cutoff {
debug!(%layer, "Skipping rewrite of layer still in GC window ({} >= {})",
layer_desc.get_lsn_range().end, *latest_gc_cutoff);
if layer_desc.get_lsn_range().end >= pitr_cutoff {
debug!(%layer, "Skipping rewrite of layer still in PITR window ({} >= {})",
layer_desc.get_lsn_range().end, pitr_cutoff);
continue;
}
@@ -263,10 +251,13 @@ impl Timeline {
continue;
}
// Only rewrite layers if their generations differ. This guarantees:
// - that local rewrite is safe, as local layer paths will differ between existing layer and rewritten one
// - that the layer is persistent in remote storage, as we only see old-generation'd layer via loading from remote storage
if layer.metadata().generation == self.generation {
// Only rewrite layers if they would have different remote paths: either they belong to this
// shard but an old generation, or they belonged to another shard. This also implicitly
// guarantees that the layer is persistent in remote storage (as only remote persistent
// layers are carried across shard splits, any local-only layer would be in the current generation)
if layer.metadata().generation == self.generation
&& layer.metadata().shard.shard_count == self.shard_identity.count
{
debug!(%layer, "Skipping rewrite, is not from old generation");
continue;
}
@@ -279,69 +270,18 @@ impl Timeline {
}
// Fall through: all our conditions for doing a rewrite passed.
layers_to_rewrite.push(layer);
// TODO: implement rewriting
tracing::debug!(%layer, "Would rewrite layer");
}
// Drop read lock on layer map before we start doing time-consuming I/O
// Drop the layers read lock: we will acquire it for write in [`Self::rewrite_layers`]
drop(layers);
let mut replace_image_layers = Vec::new();
for layer in layers_to_rewrite {
tracing::info!(layer=%layer, "Rewriting layer after shard split...");
let mut image_layer_writer = ImageLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
&layer.layer_desc().key_range,
layer.layer_desc().image_layer_lsn(),
ctx,
)
.await?;
// Safety of layer rewrites:
// - We are writing to a different local file path than we are reading from, so the old Layer
// cannot interfere with the new one.
// - In the page cache, contents for a particular VirtualFile are stored with a file_id that
// is different for two layers with the same name (in `ImageLayerInner::new` we always
// acquire a fresh id from [`crate::page_cache::next_file_id`]. So readers do not risk
// reading the index from one layer file, and then data blocks from the rewritten layer file.
// - Any readers that have a reference to the old layer will keep it alive until they are done
// with it. If they are trying to promote from remote storage, that will fail, but this is the same
// as for compaction generally: compaction is allowed to delete layers that readers might be trying to use.
// - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are:
// - GC, which at worst witnesses us "undelete" a layer that they just deleted.
// - ingestion, which only inserts layers, therefore cannot collide with us.
let resident = layer.download_and_keep_resident().await?;
let keys_written = resident
.filter(&self.shard_identity, &mut image_layer_writer, ctx)
.await?;
if keys_written > 0 {
let new_layer = image_layer_writer.finish(self, ctx).await?;
tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
layer.metadata().file_size,
new_layer.metadata().file_size);
replace_image_layers.push((layer, new_layer));
} else {
// Drop the old layer. Usually for this case we would already have noticed that
// the layer has no data for us with the ShardedRange check above, but
drop_layers.push(layer);
}
}
// At this point, we have replaced local layer files with their rewritten form, but not yet uploaded
// metadata to reflect that. If we restart here, the replaced layer files will look invalid (size mismatch
// to remote index) and be removed. This is inefficient but safe.
fail::fail_point!("compact-shard-ancestors-localonly");
// TODO: collect layers to rewrite
let replace_layers = Vec::new();
// Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage
self.rewrite_layers(replace_image_layers, drop_layers)
.await?;
fail::fail_point!("compact-shard-ancestors-enqueued");
self.rewrite_layers(replace_layers, drop_layers).await?;
// We wait for all uploads to complete before finishing this compaction stage. This is not
// necessary for correctness, but it simplifies testing, and avoids proceeding with another
@@ -349,8 +289,6 @@ impl Timeline {
// load.
self.remote_client.wait_completion().await?;
fail::fail_point!("compact-shard-ancestors-persistent");
Ok(())
}
@@ -421,6 +359,48 @@ impl Timeline {
return Ok(CompactLevel0Phase1Result::default());
}
// This failpoint is used together with `test_duplicate_layers` integration test.
// It returns the compaction result exactly the same layers as input to compaction.
// We want to ensure that this will not cause any problem when updating the layer map
// after the compaction is finished.
//
// Currently, there are two rare edge cases that will cause duplicated layers being
// inserted.
// 1. The compaction job is inturrupted / did not finish successfully. Assume we have file 1, 2, 3, 4, which
// is compacted to 5, but the page server is shut down, next time we start page server we will get a layer
// map containing 1, 2, 3, 4, and 5, whereas 5 has the same content as 4. If we trigger L0 compation at this
// point again, it is likely that we will get a file 6 which has the same content and the key range as 5,
// and this causes an overwrite. This is acceptable because the content is the same, and we should do a
// layer replace instead of the normal remove / upload process.
// 2. The input workload pattern creates exactly n files that are sorted, non-overlapping and is of target file
// size length. Compaction will likely create the same set of n files afterwards.
//
// This failpoint is a superset of both of the cases.
if cfg!(feature = "testing") {
let active = (|| {
::fail::fail_point!("compact-level0-phase1-return-same", |_| true);
false
})();
if active {
let mut new_layers = Vec::with_capacity(level0_deltas.len());
for delta in &level0_deltas {
// we are just faking these layers as being produced again for this failpoint
new_layers.push(
delta
.download_and_keep_resident()
.await
.context("download layer for failpoint")?,
);
}
tracing::info!("compact-level0-phase1-return-same"); // so that we can check if we hit the failpoint
return Ok(CompactLevel0Phase1Result {
new_layers,
deltas_to_compact: level0_deltas,
});
}
}
// Gather the files to compact in this iteration.
//
// Start with the oldest Level 0 delta file, and collect any other
@@ -1170,10 +1150,10 @@ impl TimelineAdaptor {
lsn: Lsn,
key_range: &Range<Key>,
ctx: &RequestContext,
) -> Result<(), CreateImageLayersError> {
) -> Result<(), PageReconstructError> {
let timer = self.timeline.metrics.create_images_time_histo.start_timer();
let image_layer_writer = ImageLayerWriter::new(
let mut image_layer_writer = ImageLayerWriter::new(
self.timeline.conf,
self.timeline.timeline_id,
self.timeline.tenant_shard_id,
@@ -1184,34 +1164,47 @@ impl TimelineAdaptor {
.await?;
fail_point!("image-layer-writer-fail-before-finish", |_| {
Err(CreateImageLayersError::Other(anyhow::anyhow!(
Err(PageReconstructError::Other(anyhow::anyhow!(
"failpoint image-layer-writer-fail-before-finish"
)))
});
let keyspace = KeySpace {
ranges: self.get_keyspace(key_range, lsn, ctx).await?,
};
// TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly
let start = Key::MIN;
let ImageLayerCreationOutcome {
image,
next_start_key: _,
} = self
.timeline
.create_image_layer_for_rel_blocks(
&keyspace,
image_layer_writer,
lsn,
ctx,
key_range.clone(),
start,
)
.await?;
if let Some(image_layer) = image {
self.new_images.push(image_layer);
let keyspace_ranges = self.get_keyspace(key_range, lsn, ctx).await?;
for range in &keyspace_ranges {
let mut key = range.start;
while key < range.end {
let img = match self.timeline.get(key, lsn, ctx).await {
Ok(img) => img,
Err(err) => {
// If we fail to reconstruct a VM or FSM page, we can zero the
// page without losing any actual user data. That seems better
// than failing repeatedly and getting stuck.
//
// We had a bug at one point, where we truncated the FSM and VM
// in the pageserver, but the Postgres didn't know about that
// and continued to generate incremental WAL records for pages
// that didn't exist in the pageserver. Trying to replay those
// WAL records failed to find the previous image of the page.
// This special case allows us to recover from that situation.
// See https://github.com/neondatabase/neon/issues/2601.
//
// Unfortunately we cannot do this for the main fork, or for
// any metadata keys, keys, as that would lead to actual data
// loss.
if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) {
warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}");
ZERO_PAGE.clone()
} else {
return Err(err);
}
}
};
image_layer_writer.put_image(key, img, ctx).await?;
key = key.next();
}
}
let image_layer = image_layer_writer.finish(&self.timeline, ctx).await?;
self.new_images.push(image_layer);
timer.stop_and_record();

View File

@@ -7,7 +7,7 @@ use anyhow::Context;
use pageserver_api::{models::TimelineState, shard::TenantShardId};
use tokio::sync::OwnedMutexGuard;
use tracing::{error, info, instrument, Instrument};
use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
use utils::{crashsafe, fs_ext, id::TimelineId};
use crate::{
config::PageServerConf,

View File

@@ -1,6 +1,6 @@
use std::sync::Arc;
use super::{layer_manager::LayerManager, FlushLayerError, Timeline};
use super::{layer_manager::LayerManager, Timeline};
use crate::{
context::{DownloadBehavior, RequestContext},
task_mgr::TaskKind,
@@ -12,7 +12,7 @@ use crate::{
};
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn};
use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn};
#[derive(Debug, thiserror::Error)]
pub(crate) enum Error {
@@ -23,7 +23,7 @@ pub(crate) enum Error {
#[error("shutting down, please retry later")]
ShuttingDown,
#[error("flushing failed")]
FlushAncestor(#[source] FlushLayerError),
FlushAncestor(#[source] anyhow::Error),
#[error("layer download failed")]
RewrittenDeltaDownloadFailed(#[source] anyhow::Error),
#[error("copying LSN prefix locally failed")]
@@ -41,27 +41,6 @@ pub(crate) enum Error {
Unexpected(#[source] anyhow::Error),
}
impl From<Error> for ApiError {
fn from(value: Error) -> Self {
match value {
e @ Error::NoAncestor => ApiError::Conflict(e.to_string()),
// TODO: ApiError converts the anyhow using debug formatting ... just stop using ApiError?
e @ Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", e)),
Error::ShuttingDown => ApiError::ShuttingDown,
Error::OtherTimelineDetachOngoing(_) => {
ApiError::ResourceUnavailable("other timeline detach is already ongoing".into())
}
// All of these contain shutdown errors, in fact, it's the most common
e @ Error::FlushAncestor(_)
| e @ Error::RewrittenDeltaDownloadFailed(_)
| e @ Error::CopyDeltaPrefix(_)
| e @ Error::UploadRewritten(_)
| e @ Error::CopyFailed(_)
| e @ Error::Unexpected(_) => ApiError::InternalServerError(e.into()),
}
}
}
pub(crate) struct PreparedTimelineDetach {
layers: Vec<Layer>,
}
@@ -96,11 +75,6 @@ pub(super) async fn prepare(
.as_ref()
.map(|tl| (tl.clone(), detached.ancestor_lsn))
else {
// TODO: check if we have already been detached; for this we need to read the stored data
// on remote client, for that we need a follow-up which makes uploads cheaper and maintains
// a projection of the commited data.
//
// the error is wrong per openapi
return Err(NoAncestor);
};
@@ -110,7 +84,7 @@ pub(super) async fn prepare(
if ancestor.ancestor_timeline.is_some() {
// non-technical requirement; we could flatten N ancestors just as easily but we chose
// not to, at least initially
// not to
return Err(TooManyAncestors);
}

View File

@@ -7,20 +7,19 @@ use crate::{
index::{IndexPart, LayerFileMetadata},
},
storage_layer::LayerName,
Generation,
},
};
use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
use std::{
collections::{hash_map, HashMap},
str::FromStr,
};
use pageserver_api::shard::ShardIndex;
use std::{collections::HashMap, str::FromStr};
use utils::lsn::Lsn;
/// Identified files in the timeline directory.
pub(super) enum Discovered {
/// The only one we care about
Layer(LayerName, LocalLayerFileMetadata),
Layer(LayerName, Utf8PathBuf, u64),
/// Old ephmeral files from previous launches, should be removed
Ephemeral(String),
/// Old temporary timeline files, unsure what these really are, should be removed
@@ -28,7 +27,7 @@ pub(super) enum Discovered {
/// Temporary on-demand download files, should be removed
TemporaryDownload(String),
/// Backup file from previously future layers
IgnoredBackup(Utf8PathBuf),
IgnoredBackup,
/// Unrecognized, warn about these
Unknown(String),
}
@@ -44,15 +43,12 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
let discovered = match LayerName::from_str(&file_name) {
Ok(file_name) => {
let file_size = direntry.metadata()?.len();
Discovered::Layer(
file_name,
LocalLayerFileMetadata::new(direntry.path().to_owned(), file_size),
)
Discovered::Layer(file_name, direntry.path().to_owned(), file_size)
}
Err(_) => {
if file_name.ends_with(".old") {
// ignore these
Discovered::IgnoredBackup(direntry.path().to_owned())
Discovered::IgnoredBackup
} else if remote_timeline_client::is_temp_download_file(direntry.path()) {
Discovered::TemporaryDownload(file_name)
} else if is_ephemeral_file(&file_name) {
@@ -75,32 +71,37 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
/// this structure extends it with metadata describing the layer's presence in local storage.
#[derive(Clone, Debug)]
pub(super) struct LocalLayerFileMetadata {
pub(super) file_size: u64,
pub(super) metadata: LayerFileMetadata,
pub(super) local_path: Utf8PathBuf,
}
impl LocalLayerFileMetadata {
pub fn new(local_path: Utf8PathBuf, file_size: u64) -> Self {
pub fn new(
local_path: Utf8PathBuf,
file_size: u64,
generation: Generation,
shard: ShardIndex,
) -> Self {
Self {
local_path,
file_size,
metadata: LayerFileMetadata::new(file_size, generation, shard),
}
}
}
/// For a layer that is present in remote metadata, this type describes how to handle
/// it during startup: it is either Resident (and we have some metadata about a local file),
/// or it is Evicted (and we only have remote metadata).
/// Decision on what to do with a layer file after considering its local and remote metadata.
#[derive(Clone, Debug)]
pub(super) enum Decision {
/// The layer is not present locally.
Evicted(LayerFileMetadata),
/// The layer is present locally, and metadata matches: we may hook up this layer to the
/// existing file in local storage.
Resident {
/// The layer is present locally, but local metadata does not match remote; we must
/// delete it and treat it as evicted.
UseRemote {
local: LocalLayerFileMetadata,
remote: LayerFileMetadata,
},
/// The layer is present locally, and metadata matches.
UseLocal(LocalLayerFileMetadata),
}
/// A layer needs to be left out of the layer map.
@@ -116,81 +117,77 @@ pub(super) enum DismissedLayer {
/// In order to make crash safe updates to layer map, we must dismiss layers which are only
/// found locally or not yet included in the remote `index_part.json`.
LocalOnly(LocalLayerFileMetadata),
/// The layer exists in remote storage but the local layer's metadata (e.g. file size)
/// does not match it
BadMetadata(LocalLayerFileMetadata),
}
/// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
pub(super) fn reconcile(
local_layers: Vec<(LayerName, LocalLayerFileMetadata)>,
discovered: Vec<(LayerName, Utf8PathBuf, u64)>,
index_part: Option<&IndexPart>,
disk_consistent_lsn: Lsn,
generation: Generation,
shard: ShardIndex,
) -> Vec<(LayerName, Result<Decision, DismissedLayer>)> {
let Some(index_part) = index_part else {
// If we have no remote metadata, no local layer files are considered valid to load
return local_layers
.into_iter()
.map(|(layer_name, local_metadata)| {
(layer_name, Err(DismissedLayer::LocalOnly(local_metadata)))
})
.collect();
};
use Decision::*;
let mut result = Vec::new();
// name => (local_metadata, remote_metadata)
type Collected =
HashMap<LayerName, (Option<LocalLayerFileMetadata>, Option<LayerFileMetadata>)>;
let mut remote_layers = HashMap::new();
let mut discovered = discovered
.into_iter()
.map(|(layer_name, local_path, file_size)| {
(
layer_name,
// The generation and shard here will be corrected to match IndexPart in the merge below, unless
// it is not in IndexPart, in which case using our current generation makes sense
// because it will be uploaded in this generation.
(
Some(LocalLayerFileMetadata::new(
local_path, file_size, generation, shard,
)),
None,
),
)
})
.collect::<Collected>();
// Construct Decisions for layers that are found locally, if they're in remote metadata. Otherwise
// construct DismissedLayers to get rid of them.
for (layer_name, local_metadata) in local_layers {
let Some(remote_metadata) = index_part.layer_metadata.get(&layer_name) else {
result.push((layer_name, Err(DismissedLayer::LocalOnly(local_metadata))));
continue;
};
if remote_metadata.file_size != local_metadata.file_size {
result.push((layer_name, Err(DismissedLayer::BadMetadata(local_metadata))));
continue;
}
remote_layers.insert(
layer_name,
Decision::Resident {
local: local_metadata,
remote: remote_metadata.clone(),
},
);
}
// Construct Decision for layers that were not found locally
// merge any index_part information, when available
index_part
.layer_metadata
.iter()
.as_ref()
.map(|ip| ip.layer_metadata.iter())
.into_iter()
.flatten()
.map(|(name, metadata)| (name, LayerFileMetadata::from(metadata)))
.for_each(|(name, metadata)| {
if let hash_map::Entry::Vacant(entry) = remote_layers.entry(name.clone()) {
entry.insert(Decision::Evicted(metadata.clone()));
if let Some(existing) = discovered.get_mut(name) {
existing.1 = Some(metadata);
} else {
discovered.insert(name.to_owned(), (None, Some(metadata)));
}
});
// For layers that were found in authoritative remote metadata, apply a final check that they are within
// the disk_consistent_lsn.
result.extend(remote_layers.into_iter().map(|(name, decision)| {
if name.is_in_future(disk_consistent_lsn) {
match decision {
Decision::Evicted(_remote) => (name, Err(DismissedLayer::Future { local: None })),
Decision::Resident {
local,
remote: _remote,
} => (name, Err(DismissedLayer::Future { local: Some(local) })),
}
} else {
(name, Ok(decision))
}
}));
discovered
.into_iter()
.map(|(name, (local, remote))| {
let decision = if name.is_in_future(disk_consistent_lsn) {
Err(DismissedLayer::Future { local })
} else {
match (local, remote) {
(Some(local), Some(remote)) if local.metadata != remote => {
Ok(UseRemote { local, remote })
}
(Some(x), Some(_)) => Ok(UseLocal(x)),
(None, Some(x)) => Ok(Evicted(x)),
(Some(x), None) => Err(DismissedLayer::LocalOnly(x)),
(None, None) => {
unreachable!("there must not be any non-local non-remote files")
}
}
};
result
(name, decision)
})
.collect::<Vec<_>>()
}
pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> {
@@ -199,15 +196,25 @@ pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> {
std::fs::remove_file(path).with_context(|| format!("failed to remove {kind} at {path}"))
}
pub(super) fn cleanup_local_file_for_remote(local: &LocalLayerFileMetadata) -> anyhow::Result<()> {
let local_size = local.file_size;
pub(super) fn cleanup_local_file_for_remote(
local: &LocalLayerFileMetadata,
remote: &LayerFileMetadata,
) -> anyhow::Result<()> {
let local_size = local.metadata.file_size();
let remote_size = remote.file_size();
let path = &local.local_path;
let file_name = path.file_name().expect("must be file path");
tracing::warn!(
"removing local file {file_name:?} because it has unexpected length {local_size};"
);
std::fs::remove_file(path).with_context(|| format!("failed to remove layer at {path}"))
let file_name = path.file_name().expect("must be file path");
tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
if let Err(err) = crate::tenant::timeline::rename_to_backup(path) {
assert!(
path.exists(),
"we would leave the local_layer without a file if this does not hold: {path}",
);
Err(err)
} else {
Ok(())
}
}
pub(super) fn cleanup_future_layer(
@@ -229,8 +236,8 @@ pub(super) fn cleanup_local_only_file(
) -> anyhow::Result<()> {
let kind = name.kind();
tracing::info!(
"found local-only {kind} layer {name} size {}",
local.file_size
"found local-only {kind} layer {name}, metadata {:?}",
local.metadata
);
std::fs::remove_file(&local.local_path)?;
Ok(())

View File

@@ -212,34 +212,13 @@ impl LayerManager {
&mut self,
rewrite_layers: &[(Layer, ResidentLayer)],
drop_layers: &[Layer],
metrics: &TimelineMetrics,
_metrics: &TimelineMetrics,
) {
let mut updates = self.layer_map.batch_update();
for (old_layer, new_layer) in rewrite_layers {
debug_assert_eq!(
old_layer.layer_desc().key_range,
new_layer.layer_desc().key_range
);
debug_assert_eq!(
old_layer.layer_desc().lsn_range,
new_layer.layer_desc().lsn_range
);
// Safety: we may never rewrite the same file in-place. Callers are responsible
// for ensuring that they only rewrite layers after something changes the path,
// such as an increment in the generation number.
assert_ne!(old_layer.local_path(), new_layer.local_path());
// TODO: implement rewrites (currently this code path only used for drops)
assert!(rewrite_layers.is_empty());
Self::delete_historic_layer(old_layer, &mut updates, &mut self.layer_fmgr);
Self::insert_historic_layer(
new_layer.as_ref().clone(),
&mut updates,
&mut self.layer_fmgr,
);
metrics.record_new_file_metrics(new_layer.layer_desc().file_size);
}
for l in drop_layers {
Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr);
}
@@ -255,13 +234,6 @@ impl LayerManager {
updates.flush()
}
#[cfg(test)]
pub(crate) fn force_insert_layer(&mut self, layer: ResidentLayer) {
let mut updates = self.layer_map.batch_update();
Self::insert_historic_layer(layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
updates.flush()
}
/// Helper function to insert a layer into the layer map and file manager.
fn insert_historic_layer(
layer: Layer,

View File

@@ -705,7 +705,6 @@ impl ConnectionManagerState {
commit_lsn: info.commit_lsn,
safekeeper_connstr: info.safekeeper_connstr,
availability_zone: info.availability_zone,
standby_horizon: info.standby_horizon,
}
}
MessageType::SafekeeperDiscoveryResponse => {
@@ -726,21 +725,6 @@ impl ConnectionManagerState {
WALRECEIVER_BROKER_UPDATES.inc();
trace!(
"safekeeper info update: standby_horizon(cutoff)={}",
timeline_update.standby_horizon
);
if timeline_update.standby_horizon != 0 {
// ignore reports from safekeepers not connected to replicas
self.timeline
.standby_horizon
.store(Lsn(timeline_update.standby_horizon));
self.timeline
.metrics
.standby_horizon_gauge
.set(timeline_update.standby_horizon as i64);
}
let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
let old_entry = self.wal_stream_candidates.insert(
new_safekeeper_id,
@@ -1110,7 +1094,6 @@ mod tests {
commit_lsn,
safekeeper_connstr: safekeeper_connstr.to_owned(),
availability_zone: None,
standby_horizon: 0,
},
latest_update,
}

Some files were not shown because too many files have changed in this diff Show More