mirror of
https://github.com/neondatabase/neon.git
synced 2026-02-05 11:40:37 +00:00
Compare commits
1 Commits
release-56
...
arpad/incr
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
eb0b80e3ea |
@@ -17,7 +17,6 @@
|
||||
!libs/
|
||||
!neon_local/
|
||||
!pageserver/
|
||||
!patches/
|
||||
!pgxn/
|
||||
!proxy/
|
||||
!s3_scrubber/
|
||||
|
||||
1
.github/actionlint.yml
vendored
1
.github/actionlint.yml
vendored
@@ -5,7 +5,6 @@ self-hosted-runner:
|
||||
- large
|
||||
- large-arm64
|
||||
- small
|
||||
- small-arm64
|
||||
- us-east-2
|
||||
config-variables:
|
||||
- REMOTE_STORAGE_AZURE_CONTAINER
|
||||
|
||||
@@ -3,13 +3,13 @@ description: 'Create Branch using API'
|
||||
|
||||
inputs:
|
||||
api_key:
|
||||
description: 'Neon API key'
|
||||
desctiption: 'Neon API key'
|
||||
required: true
|
||||
project_id:
|
||||
description: 'ID of the Project to create Branch in'
|
||||
desctiption: 'ID of the Project to create Branch in'
|
||||
required: true
|
||||
api_host:
|
||||
description: 'Neon API host'
|
||||
desctiption: 'Neon API host'
|
||||
default: console-stage.neon.build
|
||||
outputs:
|
||||
dsn:
|
||||
|
||||
@@ -3,16 +3,16 @@ description: 'Delete Branch using API'
|
||||
|
||||
inputs:
|
||||
api_key:
|
||||
description: 'Neon API key'
|
||||
desctiption: 'Neon API key'
|
||||
required: true
|
||||
project_id:
|
||||
description: 'ID of the Project which should be deleted'
|
||||
desctiption: 'ID of the Project which should be deleted'
|
||||
required: true
|
||||
branch_id:
|
||||
description: 'ID of the branch to delete'
|
||||
desctiption: 'ID of the branch to delete'
|
||||
required: true
|
||||
api_host:
|
||||
description: 'Neon API host'
|
||||
desctiption: 'Neon API host'
|
||||
default: console-stage.neon.build
|
||||
|
||||
runs:
|
||||
|
||||
14
.github/actions/neon-project-create/action.yml
vendored
14
.github/actions/neon-project-create/action.yml
vendored
@@ -3,22 +3,22 @@ description: 'Create Neon Project using API'
|
||||
|
||||
inputs:
|
||||
api_key:
|
||||
description: 'Neon API key'
|
||||
desctiption: 'Neon API key'
|
||||
required: true
|
||||
region_id:
|
||||
description: 'Region ID, if not set the project will be created in the default region'
|
||||
desctiption: 'Region ID, if not set the project will be created in the default region'
|
||||
default: aws-us-east-2
|
||||
postgres_version:
|
||||
description: 'Postgres version; default is 15'
|
||||
default: '15'
|
||||
desctiption: 'Postgres version; default is 15'
|
||||
default: 15
|
||||
api_host:
|
||||
description: 'Neon API host'
|
||||
desctiption: 'Neon API host'
|
||||
default: console-stage.neon.build
|
||||
provisioner:
|
||||
description: 'k8s-pod or k8s-neonvm'
|
||||
desctiption: 'k8s-pod or k8s-neonvm'
|
||||
default: 'k8s-pod'
|
||||
compute_units:
|
||||
description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
|
||||
desctiption: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
|
||||
default: '[1, 1]'
|
||||
|
||||
outputs:
|
||||
|
||||
@@ -3,13 +3,13 @@ description: 'Delete Neon Project using API'
|
||||
|
||||
inputs:
|
||||
api_key:
|
||||
description: 'Neon API key'
|
||||
desctiption: 'Neon API key'
|
||||
required: true
|
||||
project_id:
|
||||
description: 'ID of the Project to delete'
|
||||
desctiption: 'ID of the Project to delete'
|
||||
required: true
|
||||
api_host:
|
||||
description: 'Neon API host'
|
||||
desctiption: 'Neon API host'
|
||||
default: console-stage.neon.build
|
||||
|
||||
runs:
|
||||
|
||||
14
.github/workflows/actionlint.yml
vendored
14
.github/workflows/actionlint.yml
vendored
@@ -24,7 +24,7 @@ jobs:
|
||||
|
||||
actionlint:
|
||||
needs: [ check-permissions ]
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: reviewdog/action-actionlint@v1
|
||||
@@ -36,15 +36,3 @@ jobs:
|
||||
fail_on_error: true
|
||||
filter_mode: nofilter
|
||||
level: error
|
||||
- run: |
|
||||
PAT='^\s*runs-on:.*-latest'
|
||||
if grep -ERq $PAT .github/workflows
|
||||
then
|
||||
grep -ERl $PAT .github/workflows |\
|
||||
while read -r f
|
||||
do
|
||||
l=$(grep -nE $PAT .github/workflows/release.yml | awk -F: '{print $1}' | head -1)
|
||||
echo "::error file=$f,line=$l::Please, do not use ubuntu-latest images to run on, use LTS instead."
|
||||
done
|
||||
exit 1
|
||||
fi
|
||||
|
||||
6
.github/workflows/approved-for-ci-run.yml
vendored
6
.github/workflows/approved-for-ci-run.yml
vendored
@@ -44,7 +44,7 @@ jobs:
|
||||
contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
|
||||
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
|
||||
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
|
||||
@@ -60,7 +60,7 @@ jobs:
|
||||
github.event.action == 'labeled' &&
|
||||
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
|
||||
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
|
||||
@@ -109,7 +109,7 @@ jobs:
|
||||
github.event.action == 'closed' &&
|
||||
github.event.pull_request.head.repo.full_name != github.repository
|
||||
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Close PR and delete `ci-run/pr-${{ env.PR_NUMBER }}` branch
|
||||
|
||||
102
.github/workflows/benchmarking.yml
vendored
102
.github/workflows/benchmarking.yml
vendored
@@ -38,11 +38,6 @@ on:
|
||||
description: 'AWS-RDS and AWS-AURORA normally only run on Saturday. Set this to true to run them on every workflow_dispatch'
|
||||
required: false
|
||||
default: false
|
||||
run_only_pgvector_tests:
|
||||
type: boolean
|
||||
description: 'Run pgvector tests but no other tests. If not set, all tests including pgvector tests will be run'
|
||||
required: false
|
||||
default: false
|
||||
|
||||
defaults:
|
||||
run:
|
||||
@@ -55,7 +50,6 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
bench:
|
||||
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
|
||||
env:
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
|
||||
@@ -126,7 +120,6 @@ jobs:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
generate-matrices:
|
||||
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
|
||||
# Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
|
||||
#
|
||||
# Available platforms:
|
||||
@@ -137,7 +130,7 @@ jobs:
|
||||
# - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
|
||||
env:
|
||||
RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }}
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
|
||||
olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }}
|
||||
@@ -204,7 +197,6 @@ jobs:
|
||||
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||
|
||||
pgbench-compare:
|
||||
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
|
||||
needs: [ generate-matrices ]
|
||||
|
||||
strategy:
|
||||
@@ -351,92 +343,6 @@ jobs:
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
pgbench-pgvector:
|
||||
env:
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: "1"
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 16
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
PLATFORM: "neon-captest-pgvector"
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Add Postgres binaries to PATH
|
||||
run: |
|
||||
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
|
||||
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: Set up Connection String
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
QUERIES=("SELECT version()")
|
||||
QUERIES+=("SHOW neon.tenant_id")
|
||||
QUERIES+=("SHOW neon.timeline_id")
|
||||
|
||||
for q in "${QUERIES[@]}"; do
|
||||
psql ${CONNSTR} -c "${q}"
|
||||
done
|
||||
|
||||
- name: Benchmark pgvector hnsw indexing
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: performance/test_perf_olap.py
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
|
||||
- name: Benchmark pgvector hnsw queries
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: performance
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_pgvector
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
|
||||
- name: Create Allure report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: ${{ github.event.schedule && failure() }}
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: "Periodic perf testing neon-captest-pgvector: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
|
||||
clickbench-compare:
|
||||
# ClichBench DB for rds-aurora and rds-Postgres deployed to the same clusters
|
||||
# we use for performance testing in pgbench-compare.
|
||||
@@ -445,7 +351,7 @@ jobs:
|
||||
#
|
||||
# *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
|
||||
# *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
|
||||
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
|
||||
if: ${{ !cancelled() }}
|
||||
needs: [ generate-matrices, pgbench-compare ]
|
||||
|
||||
strategy:
|
||||
@@ -549,7 +455,7 @@ jobs:
|
||||
# We might change it after https://github.com/neondatabase/neon/issues/2900.
|
||||
#
|
||||
# *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
|
||||
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
|
||||
if: ${{ !cancelled() }}
|
||||
needs: [ generate-matrices, clickbench-compare ]
|
||||
|
||||
strategy:
|
||||
@@ -651,7 +557,7 @@ jobs:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
user-examples-compare:
|
||||
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
|
||||
if: ${{ !cancelled() }}
|
||||
needs: [ generate-matrices, tpch-compare ]
|
||||
|
||||
strategy:
|
||||
|
||||
@@ -88,7 +88,7 @@ jobs:
|
||||
|
||||
merge-images:
|
||||
needs: [ build-image ]
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
env:
|
||||
IMAGE_TAG: ${{ inputs.image-tag }}
|
||||
|
||||
254
.github/workflows/build_and_test.yml
vendored
254
.github/workflows/build_and_test.yml
vendored
@@ -35,7 +35,7 @@ jobs:
|
||||
cancel-previous-e2e-tests:
|
||||
needs: [ check-permissions ]
|
||||
if: github.event_name == 'pull_request'
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Cancel previous e2e-tests runs for this PR
|
||||
@@ -548,8 +548,8 @@ jobs:
|
||||
|
||||
report-benchmarks-failures:
|
||||
needs: [ benchmarks, create-test-report ]
|
||||
if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure'
|
||||
runs-on: ubuntu-22.04
|
||||
if: github.ref_name == 'main' && failure()
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: slackapi/slack-github-action@v1
|
||||
@@ -723,13 +723,9 @@ jobs:
|
||||
uses: ./.github/workflows/trigger-e2e-tests.yml
|
||||
secrets: inherit
|
||||
|
||||
neon-image-arch:
|
||||
neon-image:
|
||||
needs: [ check-permissions, build-build-tools-image, tag ]
|
||||
strategy:
|
||||
matrix:
|
||||
arch: [ x64, arm64 ]
|
||||
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -751,6 +747,12 @@ jobs:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
- uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
@@ -762,52 +764,25 @@ jobs:
|
||||
push: true
|
||||
pull: true
|
||||
file: Dockerfile
|
||||
cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }}
|
||||
cache-to: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }},mode=max
|
||||
cache-from: type=registry,ref=neondatabase/neon:cache
|
||||
cache-to: type=registry,ref=neondatabase/neon:cache,mode=max
|
||||
tags: |
|
||||
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
||||
369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
|
||||
neondatabase/neon:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Remove custom docker config directory
|
||||
if: always()
|
||||
run: |
|
||||
rm -rf .docker-custom
|
||||
|
||||
neon-image:
|
||||
needs: [ neon-image-arch, tag ]
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Create multi-arch image
|
||||
run: |
|
||||
docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-x64 \
|
||||
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-arm64
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
- name: Push multi-arch image to ECR
|
||||
run: |
|
||||
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/neon:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
compute-node-image-arch:
|
||||
compute-node-image:
|
||||
needs: [ check-permissions, build-build-tools-image, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
version: [ v14, v15, v16 ]
|
||||
arch: [ x64, arm64 ]
|
||||
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -854,14 +829,15 @@ jobs:
|
||||
push: true
|
||||
pull: true
|
||||
file: Dockerfile.compute-node
|
||||
cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }}
|
||||
cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
|
||||
cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache
|
||||
cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache,mode=max
|
||||
tags: |
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
||||
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Build compute-tools image
|
||||
# compute-tools are Postgres independent, so build it only once
|
||||
if: matrix.version == 'v16'
|
||||
if: ${{ matrix.version == 'v16' }}
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
target: compute-tools-image
|
||||
@@ -875,57 +851,14 @@ jobs:
|
||||
pull: true
|
||||
file: Dockerfile.compute-node
|
||||
tags: |
|
||||
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
||||
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }}
|
||||
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
- name: Remove custom docker config directory
|
||||
if: always()
|
||||
run: |
|
||||
rm -rf .docker-custom
|
||||
|
||||
compute-node-image:
|
||||
needs: [ compute-node-image-arch, tag ]
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
version: [ v14, v15, v16 ]
|
||||
|
||||
steps:
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Create multi-arch compute-node image
|
||||
run: |
|
||||
docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
|
||||
|
||||
- name: Create multi-arch compute-tools image
|
||||
if: matrix.version == 'v16'
|
||||
run: |
|
||||
docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \
|
||||
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-arm64
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
- name: Push multi-arch compute-node-${{ matrix.version }} image to ECR
|
||||
run: |
|
||||
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
- name: Push multi-arch compute-tools image to ECR
|
||||
if: matrix.version == 'v16'
|
||||
run: |
|
||||
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
vm-compute-node-image:
|
||||
needs: [ check-permissions, tag, compute-node-image ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
@@ -933,8 +866,11 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
version: [ v14, v15, v16 ]
|
||||
defaults:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
env:
|
||||
VM_BUILDER_VERSION: v0.29.3
|
||||
VM_BUILDER_VERSION: v0.28.1
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -947,48 +883,26 @@ jobs:
|
||||
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
|
||||
chmod +x vm-builder
|
||||
|
||||
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
||||
# The default value is ~/.docker
|
||||
- name: Set custom docker config directory
|
||||
run: |
|
||||
mkdir -p .docker-custom
|
||||
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
# Note: we need a separate pull step here because otherwise vm-builder will try to pull, and
|
||||
# it won't have the proper authentication (written at v0.6.0)
|
||||
- name: Pulling compute-node image
|
||||
run: |
|
||||
docker pull neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
|
||||
docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Build vm image
|
||||
run: |
|
||||
./vm-builder \
|
||||
-spec=vm-image-spec.yaml \
|
||||
-src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
|
||||
-dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
|
||||
-src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
|
||||
-dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Pushing vm-compute-node image
|
||||
run: |
|
||||
docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
- name: Remove custom docker config directory
|
||||
if: always()
|
||||
run: |
|
||||
rm -rf .docker-custom
|
||||
docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
test-images:
|
||||
needs: [ check-permissions, tag, neon-image, compute-node-image ]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
arch: [ x64, arm64 ]
|
||||
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -1006,7 +920,7 @@ jobs:
|
||||
- name: Verify image versions
|
||||
shell: bash # ensure no set -e for better error messages
|
||||
run: |
|
||||
pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.tag.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
|
||||
pageserver_version=$(docker run --rm 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
|
||||
|
||||
echo "Pageserver version string: $pageserver_version"
|
||||
|
||||
@@ -1032,52 +946,82 @@ jobs:
|
||||
|
||||
promote-images:
|
||||
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
env:
|
||||
VERSIONS: v14 v15 v16
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container: golang:1.19-bullseye
|
||||
# Don't add if-condition here.
|
||||
# The job should always be run because we have dependant other jobs that shouldn't be skipped
|
||||
|
||||
steps:
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
- name: Copy vm-compute-node images to ECR
|
||||
- name: Install Crane & ECR helper
|
||||
run: |
|
||||
for version in ${VERSIONS}; do
|
||||
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
|
||||
done
|
||||
go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
|
||||
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
|
||||
|
||||
- name: Configure ECR login
|
||||
run: |
|
||||
mkdir /github/home/.docker/
|
||||
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
|
||||
|
||||
- name: Copy vm-compute-node images to Docker Hub
|
||||
run: |
|
||||
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
|
||||
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
|
||||
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16
|
||||
|
||||
- name: Add latest tag to images
|
||||
if: github.ref_name == 'main'
|
||||
if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
|
||||
run: |
|
||||
for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do
|
||||
docker buildx imagetools create -t $repo/neon:latest \
|
||||
$repo/neon:${{ needs.tag.outputs.build-tag }}
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
|
||||
|
||||
docker buildx imagetools create -t $repo/compute-tools:latest \
|
||||
$repo/compute-tools:${{ needs.tag.outputs.build-tag }}
|
||||
- name: Push images to production ECR
|
||||
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
||||
run: |
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:latest
|
||||
|
||||
for version in ${VERSIONS}; do
|
||||
docker buildx imagetools create -t $repo/compute-node-${version}:latest \
|
||||
$repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }}
|
||||
- name: Configure Docker Hub login
|
||||
run: |
|
||||
# ECR Credential Helper & Docker Hub don't work together in config, hence reset
|
||||
echo "" > /github/home/.docker/config.json
|
||||
crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io
|
||||
|
||||
docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \
|
||||
$repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
|
||||
done
|
||||
done
|
||||
- name: Push vm-compute-node to Docker Hub
|
||||
run: |
|
||||
crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
|
||||
crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
|
||||
crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Push latest tags to Docker Hub
|
||||
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
||||
run: |
|
||||
crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
|
||||
|
||||
- name: Cleanup ECR folder
|
||||
run: rm -rf ~/.ecr
|
||||
|
||||
trigger-custom-extensions-build-and-wait:
|
||||
needs: [ check-permissions, tag ]
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Set PR's status to pending and request a remote CI test
|
||||
run: |
|
||||
|
||||
@@ -19,7 +19,7 @@ permissions: {}
|
||||
|
||||
jobs:
|
||||
check-image:
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
tag: ${{ steps.get-build-tools-tag.outputs.image-tag }}
|
||||
found: ${{ steps.check-image.outputs.found }}
|
||||
|
||||
2
.github/workflows/check-permissions.yml
vendored
2
.github/workflows/check-permissions.yml
vendored
@@ -16,7 +16,7 @@ permissions: {}
|
||||
|
||||
jobs:
|
||||
check-permissions:
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Disallow CI runs on PRs from forks
|
||||
if: |
|
||||
|
||||
@@ -9,7 +9,7 @@ on:
|
||||
|
||||
jobs:
|
||||
cleanup:
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Cleanup
|
||||
run: |
|
||||
|
||||
4
.github/workflows/neon_extra_builds.yml
vendored
4
.github/workflows/neon_extra_builds.yml
vendored
@@ -136,7 +136,7 @@ jobs:
|
||||
check-linux-arm-build:
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
timeout-minutes: 90
|
||||
runs-on: [ self-hosted, small-arm64 ]
|
||||
runs-on: [ self-hosted, large-arm64 ]
|
||||
|
||||
env:
|
||||
# Use release build only, to have less debug info around
|
||||
@@ -260,7 +260,7 @@ jobs:
|
||||
check-codestyle-rust-arm:
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
timeout-minutes: 90
|
||||
runs-on: [ self-hosted, small-arm64 ]
|
||||
runs-on: [ self-hosted, large-arm64 ]
|
||||
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
|
||||
2
.github/workflows/pg_clients.yml
vendored
2
.github/workflows/pg_clients.yml
vendored
@@ -20,7 +20,7 @@ concurrency:
|
||||
jobs:
|
||||
test-postgres-client-libs:
|
||||
# TODO: switch to gen2 runner, requires docker
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: [ ubuntu-latest ]
|
||||
|
||||
env:
|
||||
DEFAULT_PG_VERSION: 14
|
||||
|
||||
2
.github/workflows/pin-build-tools-image.yml
vendored
2
.github/workflows/pin-build-tools-image.yml
vendored
@@ -26,7 +26,7 @@ permissions: {}
|
||||
|
||||
jobs:
|
||||
tag-image:
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
env:
|
||||
FROM_TAG: ${{ inputs.from-tag }}
|
||||
|
||||
2
.github/workflows/release-notify.yml
vendored
2
.github/workflows/release-notify.yml
vendored
@@ -19,7 +19,7 @@ on:
|
||||
|
||||
jobs:
|
||||
notify:
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: [ ubuntu-latest ]
|
||||
|
||||
steps:
|
||||
- uses: neondatabase/dev-actions/release-pr-notify@main
|
||||
|
||||
6
.github/workflows/release.yml
vendored
6
.github/workflows/release.yml
vendored
@@ -26,7 +26,7 @@ defaults:
|
||||
jobs:
|
||||
create-storage-release-branch:
|
||||
if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }}
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
permissions:
|
||||
contents: write # for `git push`
|
||||
@@ -53,7 +53,7 @@ jobs:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
cat << EOF > body.md
|
||||
## Storage & Compute release ${RELEASE_DATE}
|
||||
## Release ${RELEASE_DATE}
|
||||
|
||||
**Please merge this Pull Request using 'Create a merge commit' button**
|
||||
EOF
|
||||
@@ -65,7 +65,7 @@ jobs:
|
||||
|
||||
create-proxy-release-branch:
|
||||
if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }}
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
permissions:
|
||||
contents: write # for `git push`
|
||||
|
||||
6
.github/workflows/trigger-e2e-tests.yml
vendored
6
.github/workflows/trigger-e2e-tests.yml
vendored
@@ -19,7 +19,7 @@ env:
|
||||
jobs:
|
||||
cancel-previous-e2e-tests:
|
||||
if: github.event_name == 'pull_request'
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Cancel previous e2e-tests runs for this PR
|
||||
@@ -31,7 +31,7 @@ jobs:
|
||||
--field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"
|
||||
|
||||
tag:
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: [ ubuntu-latest ]
|
||||
outputs:
|
||||
build-tag: ${{ steps.build-tag.outputs.tag }}
|
||||
|
||||
@@ -62,7 +62,7 @@ jobs:
|
||||
|
||||
trigger-e2e-tests:
|
||||
needs: [ tag ]
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
TAG: ${{ needs.tag.outputs.build-tag }}
|
||||
steps:
|
||||
|
||||
330
Cargo.lock
generated
330
Cargo.lock
generated
@@ -708,7 +708,7 @@ dependencies = [
|
||||
"sha1",
|
||||
"sync_wrapper",
|
||||
"tokio",
|
||||
"tokio-tungstenite",
|
||||
"tokio-tungstenite 0.20.0",
|
||||
"tower",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
@@ -776,6 +776,7 @@ dependencies = [
|
||||
"pin-project",
|
||||
"serde",
|
||||
"time",
|
||||
"tz-rs",
|
||||
"url",
|
||||
"uuid",
|
||||
]
|
||||
@@ -978,12 +979,6 @@ version = "3.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"
|
||||
|
||||
[[package]]
|
||||
name = "bytemuck"
|
||||
version = "1.16.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78834c15cb5d5efe3452d58b1e8ba890dd62d21907f867f383358198e56ebca5"
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
version = "1.4.3"
|
||||
@@ -1071,9 +1066,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "chrono"
|
||||
version = "0.4.38"
|
||||
version = "0.4.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
|
||||
checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38"
|
||||
dependencies = [
|
||||
"android-tzdata",
|
||||
"iana-time-zone",
|
||||
@@ -1081,7 +1076,7 @@ dependencies = [
|
||||
"num-traits",
|
||||
"serde",
|
||||
"wasm-bindgen",
|
||||
"windows-targets 0.52.4",
|
||||
"windows-targets 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1108,7 +1103,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b"
|
||||
dependencies = [
|
||||
"ciborium-io",
|
||||
"half 1.8.2",
|
||||
"half",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1238,10 +1233,8 @@ dependencies = [
|
||||
"serde_json",
|
||||
"signal-hook",
|
||||
"tar",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"tracing",
|
||||
@@ -1290,6 +1283,12 @@ dependencies = [
|
||||
"tiny-keccak",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "const_fn"
|
||||
version = "0.4.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fbdcdcb6d86f71c5e97409ad45898af11cbc995b4ee8112d59095a28d376c935"
|
||||
|
||||
[[package]]
|
||||
name = "const_format"
|
||||
version = "0.2.30"
|
||||
@@ -1464,21 +1463,26 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-deque"
|
||||
version = "0.8.5"
|
||||
version = "0.8.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d"
|
||||
checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"crossbeam-epoch",
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-epoch"
|
||||
version = "0.9.18"
|
||||
version = "0.9.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
|
||||
checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"cfg-if",
|
||||
"crossbeam-utils",
|
||||
"memoffset 0.8.0",
|
||||
"scopeguard",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1592,7 +1596,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"hashbrown 0.14.5",
|
||||
"hashbrown 0.14.0",
|
||||
"lock_api",
|
||||
"once_cell",
|
||||
"parking_lot_core 0.9.8",
|
||||
@@ -1969,6 +1973,21 @@ version = "1.0.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
|
||||
|
||||
[[package]]
|
||||
name = "foreign-types"
|
||||
version = "0.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
|
||||
dependencies = [
|
||||
"foreign-types-shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "foreign-types-shared"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
|
||||
|
||||
[[package]]
|
||||
name = "form_urlencoded"
|
||||
version = "1.1.0"
|
||||
@@ -1978,27 +1997,6 @@ dependencies = [
|
||||
"percent-encoding",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "framed-websockets"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/neondatabase/framed-websockets#34eff3d6f8cfccbc5f35e4f65314ff7328621127"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"bytemuck",
|
||||
"bytes",
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
"http-body-util",
|
||||
"hyper 1.2.0",
|
||||
"hyper-util",
|
||||
"pin-project",
|
||||
"rand 0.8.5",
|
||||
"sha1",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fs2"
|
||||
version = "0.4.3"
|
||||
@@ -2251,17 +2249,6 @@ version = "1.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"
|
||||
|
||||
[[package]]
|
||||
name = "half"
|
||||
version = "2.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"crunchy",
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hash32"
|
||||
version = "0.3.1"
|
||||
@@ -2288,9 +2275,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.14.5"
|
||||
version = "0.14.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
|
||||
checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"allocator-api2",
|
||||
@@ -2298,11 +2285,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "hashlink"
|
||||
version = "0.9.1"
|
||||
version = "0.8.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af"
|
||||
checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7"
|
||||
dependencies = [
|
||||
"hashbrown 0.14.5",
|
||||
"hashbrown 0.14.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2598,6 +2585,34 @@ dependencies = [
|
||||
"tokio-io-timeout",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper-tls"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"hyper 0.14.26",
|
||||
"native-tls",
|
||||
"tokio",
|
||||
"tokio-native-tls",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper-tungstenite"
|
||||
version = "0.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a343d17fe7885302ed7252767dc7bb83609a874b6ff581142241ec4b73957ad"
|
||||
dependencies = [
|
||||
"http-body-util",
|
||||
"hyper 1.2.0",
|
||||
"hyper-util",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
"tokio-tungstenite 0.21.0",
|
||||
"tungstenite 0.21.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper-util"
|
||||
version = "0.1.3"
|
||||
@@ -2675,7 +2690,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e"
|
||||
dependencies = [
|
||||
"equivalent",
|
||||
"hashbrown 0.14.5",
|
||||
"hashbrown 0.14.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2915,12 +2930,6 @@ version = "0.4.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.6.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f0b5399f6804fbab912acbd8878ed3532d506b7c951b8f9f164ef90fef39e3f4"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
version = "0.4.10"
|
||||
@@ -2943,7 +2952,7 @@ version = "0.12.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d3262e75e648fce39813cb56ac41f3c3e3f65217ebf3844d818d1f9398cfb0dc"
|
||||
dependencies = [
|
||||
"hashbrown 0.14.5",
|
||||
"hashbrown 0.14.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2996,7 +3005,7 @@ checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"crossbeam-utils",
|
||||
"hashbrown 0.14.5",
|
||||
"hashbrown 0.14.0",
|
||||
"itoa",
|
||||
"lasso",
|
||||
"measured-derive",
|
||||
@@ -3139,6 +3148,24 @@ version = "0.8.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
|
||||
|
||||
[[package]]
|
||||
name = "native-tls"
|
||||
version = "0.2.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
"libc",
|
||||
"log",
|
||||
"openssl",
|
||||
"openssl-probe",
|
||||
"openssl-sys",
|
||||
"schannel",
|
||||
"security-framework",
|
||||
"security-framework-sys",
|
||||
"tempfile",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nix"
|
||||
version = "0.25.1"
|
||||
@@ -3309,6 +3336,15 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num_threads"
|
||||
version = "0.1.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "oauth2"
|
||||
version = "4.4.2"
|
||||
@@ -3358,12 +3394,50 @@ version = "11.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
|
||||
|
||||
[[package]]
|
||||
name = "openssl"
|
||||
version = "0.10.60"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "79a4c6c3a2b158f7f8f2a2fc5a969fa3a068df6fc9dbb4a43845436e3af7c800"
|
||||
dependencies = [
|
||||
"bitflags 2.4.1",
|
||||
"cfg-if",
|
||||
"foreign-types",
|
||||
"libc",
|
||||
"once_cell",
|
||||
"openssl-macros",
|
||||
"openssl-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "openssl-macros"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "openssl-probe"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
|
||||
|
||||
[[package]]
|
||||
name = "openssl-sys"
|
||||
version = "0.9.96"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3812c071ba60da8b5677cc12bcb1d42989a65553772897a7e0355545a819838f"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
"pkg-config",
|
||||
"vcpkg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry"
|
||||
version = "0.20.0"
|
||||
@@ -3493,7 +3567,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79"
|
||||
dependencies = [
|
||||
"dlv-list",
|
||||
"hashbrown 0.14.5",
|
||||
"hashbrown 0.14.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3570,7 +3644,6 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_json",
|
||||
"svg_fmt",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
@@ -3815,14 +3888,13 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "parquet"
|
||||
version = "51.0.0"
|
||||
source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
|
||||
version = "49.0.0"
|
||||
source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"half 2.4.1",
|
||||
"hashbrown 0.14.5",
|
||||
"hashbrown 0.14.0",
|
||||
"num",
|
||||
"num-bigint",
|
||||
"paste",
|
||||
@@ -3830,13 +3902,12 @@ dependencies = [
|
||||
"thrift",
|
||||
"twox-hash",
|
||||
"zstd",
|
||||
"zstd-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parquet_derive"
|
||||
version = "51.0.0"
|
||||
source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
|
||||
version = "49.0.0"
|
||||
source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
|
||||
dependencies = [
|
||||
"parquet",
|
||||
"proc-macro2",
|
||||
@@ -3863,9 +3934,9 @@ checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c"
|
||||
|
||||
[[package]]
|
||||
name = "pbkdf2"
|
||||
version = "0.12.2"
|
||||
version = "0.12.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2"
|
||||
checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31"
|
||||
dependencies = [
|
||||
"digest",
|
||||
"hmac",
|
||||
@@ -4012,6 +4083,17 @@ dependencies = [
|
||||
"tokio-postgres",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postgres-native-tls"
|
||||
version = "0.5.0"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
dependencies = [
|
||||
"native-tls",
|
||||
"tokio",
|
||||
"tokio-native-tls",
|
||||
"tokio-postgres",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.4"
|
||||
@@ -4120,7 +4202,6 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"itertools",
|
||||
"pin-project-lite",
|
||||
"postgres-protocol",
|
||||
"rand 0.8.5",
|
||||
@@ -4278,7 +4359,6 @@ dependencies = [
|
||||
name = "proxy"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"anyhow",
|
||||
"async-compression",
|
||||
"async-trait",
|
||||
@@ -4295,14 +4375,12 @@ dependencies = [
|
||||
"chrono",
|
||||
"clap",
|
||||
"consumption_metrics",
|
||||
"crossbeam-deque",
|
||||
"dashmap",
|
||||
"env_logger",
|
||||
"fallible-iterator",
|
||||
"framed-websockets",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hashbrown 0.14.5",
|
||||
"hashbrown 0.13.2",
|
||||
"hashlink",
|
||||
"hex",
|
||||
"hmac",
|
||||
@@ -4310,9 +4388,9 @@ dependencies = [
|
||||
"http 1.1.0",
|
||||
"http-body-util",
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"hyper 0.14.26",
|
||||
"hyper 1.2.0",
|
||||
"hyper-tungstenite",
|
||||
"hyper-util",
|
||||
"indexmap 2.0.1",
|
||||
"ipnet",
|
||||
@@ -4321,6 +4399,7 @@ dependencies = [
|
||||
"md5",
|
||||
"measured",
|
||||
"metrics",
|
||||
"native-tls",
|
||||
"once_cell",
|
||||
"opentelemetry",
|
||||
"parking_lot 0.12.1",
|
||||
@@ -4328,6 +4407,7 @@ dependencies = [
|
||||
"parquet_derive",
|
||||
"pbkdf2",
|
||||
"pin-project-lite",
|
||||
"postgres-native-tls",
|
||||
"postgres-protocol",
|
||||
"postgres_backend",
|
||||
"pq_proto",
|
||||
@@ -4346,7 +4426,6 @@ dependencies = [
|
||||
"rstest",
|
||||
"rustc-hash",
|
||||
"rustls 0.22.4",
|
||||
"rustls-native-certs 0.7.0",
|
||||
"rustls-pemfile 2.1.1",
|
||||
"scopeguard",
|
||||
"serde",
|
||||
@@ -4356,6 +4435,7 @@ dependencies = [
|
||||
"smol_str",
|
||||
"socket2 0.5.5",
|
||||
"subtle",
|
||||
"sync_wrapper",
|
||||
"task-local-extensions",
|
||||
"thiserror",
|
||||
"tikv-jemalloc-ctl",
|
||||
@@ -4364,7 +4444,6 @@ dependencies = [
|
||||
"tokio-postgres",
|
||||
"tokio-postgres-rustls",
|
||||
"tokio-rustls 0.25.0",
|
||||
"tokio-tungstenite",
|
||||
"tokio-util",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
@@ -4376,6 +4455,7 @@ dependencies = [
|
||||
"utils",
|
||||
"uuid",
|
||||
"walkdir",
|
||||
"webpki-roots 0.25.2",
|
||||
"workspace_hack",
|
||||
"x509-parser",
|
||||
]
|
||||
@@ -4682,21 +4762,20 @@ dependencies = [
|
||||
"http 0.2.9",
|
||||
"http-body 0.4.5",
|
||||
"hyper 0.14.26",
|
||||
"hyper-rustls 0.24.0",
|
||||
"hyper-tls",
|
||||
"ipnet",
|
||||
"js-sys",
|
||||
"log",
|
||||
"mime",
|
||||
"native-tls",
|
||||
"once_cell",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"rustls 0.21.11",
|
||||
"rustls-pemfile 1.0.2",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_urlencoded",
|
||||
"tokio",
|
||||
"tokio-rustls 0.24.0",
|
||||
"tokio-native-tls",
|
||||
"tokio-util",
|
||||
"tower-service",
|
||||
"url",
|
||||
@@ -4704,7 +4783,6 @@ dependencies = [
|
||||
"wasm-bindgen-futures",
|
||||
"wasm-streams 0.3.0",
|
||||
"web-sys",
|
||||
"webpki-roots 0.25.2",
|
||||
"winreg 0.50.0",
|
||||
]
|
||||
|
||||
@@ -5130,22 +5208,20 @@ dependencies = [
|
||||
"hex",
|
||||
"histogram",
|
||||
"itertools",
|
||||
"once_cell",
|
||||
"native-tls",
|
||||
"pageserver",
|
||||
"pageserver_api",
|
||||
"postgres-native-tls",
|
||||
"postgres_ffi",
|
||||
"rand 0.8.5",
|
||||
"remote_storage",
|
||||
"reqwest 0.12.4",
|
||||
"rustls 0.22.4",
|
||||
"rustls-native-certs 0.7.0",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-postgres-rustls",
|
||||
"tokio-rustls 0.25.0",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
@@ -5876,7 +5952,7 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
|
||||
[[package]]
|
||||
name = "svg_fmt"
|
||||
version = "0.4.2"
|
||||
source = "git+https://github.com/nical/rust_debug?rev=28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4#28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4"
|
||||
source = "git+https://github.com/neondatabase/fork--nical--rust_debug?branch=neon#c1820b28664b5df68de7f043fccf2ed5d67b6ae8"
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
@@ -6089,6 +6165,8 @@ checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc"
|
||||
dependencies = [
|
||||
"itoa",
|
||||
"js-sys",
|
||||
"libc",
|
||||
"num_threads",
|
||||
"serde",
|
||||
"time-core",
|
||||
"time-macros",
|
||||
@@ -6164,7 +6242,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-epoll-uring"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
|
||||
dependencies = [
|
||||
"futures",
|
||||
"nix 0.26.4",
|
||||
@@ -6198,6 +6276,16 @@ dependencies = [
|
||||
"syn 2.0.52",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-native-tls"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2"
|
||||
dependencies = [
|
||||
"native-tls",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.7"
|
||||
@@ -6292,7 +6380,19 @@ dependencies = [
|
||||
"futures-util",
|
||||
"log",
|
||||
"tokio",
|
||||
"tungstenite",
|
||||
"tungstenite 0.20.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-tungstenite"
|
||||
version = "0.21.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c83b561d025642014097b66e6c1bb422783339e0909e4429cde4749d1990bc38"
|
||||
dependencies = [
|
||||
"futures-util",
|
||||
"log",
|
||||
"tokio",
|
||||
"tungstenite 0.21.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6306,7 +6406,7 @@ dependencies = [
|
||||
"futures-io",
|
||||
"futures-sink",
|
||||
"futures-util",
|
||||
"hashbrown 0.14.5",
|
||||
"hashbrown 0.14.0",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
"tracing",
|
||||
@@ -6588,6 +6688,25 @@ dependencies = [
|
||||
"utf-8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tungstenite"
|
||||
version = "0.21.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9ef1a641ea34f399a848dea702823bbecfb4c486f911735368f1f137cb8257e1"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"data-encoding",
|
||||
"http 1.1.0",
|
||||
"httparse",
|
||||
"log",
|
||||
"rand 0.8.5",
|
||||
"sha1",
|
||||
"thiserror",
|
||||
"url",
|
||||
"utf-8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "twox-hash"
|
||||
version = "1.6.3"
|
||||
@@ -6604,6 +6723,15 @@ version = "1.16.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba"
|
||||
|
||||
[[package]]
|
||||
name = "tz-rs"
|
||||
version = "0.6.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "33851b15c848fad2cf4b105c6bb66eb9512b6f6c44a4b13f57c53c73c707e2b4"
|
||||
dependencies = [
|
||||
"const_fn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "uname"
|
||||
version = "0.1.1"
|
||||
@@ -6676,12 +6804,11 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "uring-common"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"io-uring",
|
||||
"libc",
|
||||
"linux-raw-sys 0.6.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7350,7 +7477,6 @@ dependencies = [
|
||||
name = "workspace_hack"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"anyhow",
|
||||
"aws-config",
|
||||
"aws-runtime",
|
||||
@@ -7376,7 +7502,7 @@ dependencies = [
|
||||
"futures-sink",
|
||||
"futures-util",
|
||||
"getrandom 0.2.11",
|
||||
"hashbrown 0.14.5",
|
||||
"hashbrown 0.14.0",
|
||||
"hex",
|
||||
"hmac",
|
||||
"hyper 0.14.26",
|
||||
@@ -7509,9 +7635,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "zeroize"
|
||||
version = "1.7.0"
|
||||
version = "1.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d"
|
||||
checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9"
|
||||
dependencies = [
|
||||
"zeroize_derive",
|
||||
]
|
||||
|
||||
36
Cargo.toml
36
Cargo.toml
@@ -41,15 +41,14 @@ license = "Apache-2.0"
|
||||
|
||||
## All dependency versions, used in the project
|
||||
[workspace.dependencies]
|
||||
ahash = "0.8"
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
arc-swap = "1.6"
|
||||
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
|
||||
atomic-take = "1.1.0"
|
||||
azure_core = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] }
|
||||
azure_identity = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
|
||||
azure_storage = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
|
||||
azure_storage_blobs = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
|
||||
azure_core = "0.19"
|
||||
azure_identity = "0.19"
|
||||
azure_storage = "0.19"
|
||||
azure_storage_blobs = "0.19"
|
||||
flate2 = "1.0.26"
|
||||
async-stream = "0.3"
|
||||
async-trait = "0.1"
|
||||
@@ -75,7 +74,6 @@ clap = { version = "4.0", features = ["derive"] }
|
||||
comfy-table = "6.1"
|
||||
const_format = "0.2"
|
||||
crc32c = "0.6"
|
||||
crossbeam-deque = "0.8.5"
|
||||
crossbeam-utils = "0.8.5"
|
||||
dashmap = { version = "5.5.0", features = ["raw-api"] }
|
||||
either = "1.8"
|
||||
@@ -83,14 +81,13 @@ enum-map = "2.4.2"
|
||||
enumset = "1.0.12"
|
||||
fail = "0.5.0"
|
||||
fallible-iterator = "0.2"
|
||||
framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" }
|
||||
fs2 = "0.4.3"
|
||||
futures = "0.3"
|
||||
futures-core = "0.3"
|
||||
futures-util = "0.3"
|
||||
git-version = "0.3"
|
||||
hashbrown = "0.14"
|
||||
hashlink = "0.9.1"
|
||||
hashbrown = "0.13"
|
||||
hashlink = "0.8.4"
|
||||
hdrhistogram = "7.5.2"
|
||||
hex = "0.4"
|
||||
hex-literal = "0.4"
|
||||
@@ -101,7 +98,7 @@ http-types = { version = "2", default-features = false }
|
||||
humantime = "2.1"
|
||||
humantime-serde = "1.1.1"
|
||||
hyper = "0.14"
|
||||
tokio-tungstenite = "0.20.0"
|
||||
hyper-tungstenite = "0.13.0"
|
||||
indexmap = "2"
|
||||
inotify = "0.10.2"
|
||||
ipnet = "2.9.0"
|
||||
@@ -114,6 +111,7 @@ md5 = "0.7.0"
|
||||
measured = { version = "0.0.21", features=["lasso"] }
|
||||
measured-process = { version = "0.0.21" }
|
||||
memoffset = "0.8"
|
||||
native-tls = "0.2"
|
||||
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
|
||||
notify = "6.0.0"
|
||||
num_cpus = "1.15"
|
||||
@@ -123,8 +121,8 @@ opentelemetry = "0.20.0"
|
||||
opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-semantic-conventions = "0.12.0"
|
||||
parking_lot = "0.12"
|
||||
parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
|
||||
parquet_derive = "51.0.0"
|
||||
parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
|
||||
parquet_derive = "49.0.0"
|
||||
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
|
||||
pin-project-lite = "0.2"
|
||||
procfs = "0.14"
|
||||
@@ -160,8 +158,8 @@ socket2 = "0.5"
|
||||
strum = "0.24"
|
||||
strum_macros = "0.24"
|
||||
"subtle" = "2.5.0"
|
||||
# Our PR https://github.com/nical/rust_debug/pull/4 has been merged but no new version released yet
|
||||
svg_fmt = { git = "https://github.com/nical/rust_debug", rev = "28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" }
|
||||
# https://github.com/nical/rust_debug/pull/4
|
||||
svg_fmt = { git = "https://github.com/neondatabase/fork--nical--rust_debug", branch = "neon" }
|
||||
sync_wrapper = "0.1.2"
|
||||
tar = "0.4"
|
||||
task-local-extensions = "0.1.4"
|
||||
@@ -190,7 +188,7 @@ url = "2.2"
|
||||
urlencoding = "2.1"
|
||||
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
|
||||
walkdir = "2.3.2"
|
||||
rustls-native-certs = "0.7"
|
||||
webpki-roots = "0.25"
|
||||
x509-parser = "0.15"
|
||||
|
||||
## TODO replace this with tracing
|
||||
@@ -199,6 +197,7 @@ log = "0.4"
|
||||
|
||||
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
@@ -239,12 +238,13 @@ tonic-build = "0.9"
|
||||
|
||||
[patch.crates-io]
|
||||
|
||||
# Needed to get `tokio-postgres-rustls` to depend on our fork.
|
||||
# This is only needed for proxy's tests.
|
||||
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
|
||||
# bug fixes for UUID
|
||||
parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" }
|
||||
parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" }
|
||||
parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
|
||||
parquet_derive = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
|
||||
|
||||
################# Binary contents sections
|
||||
|
||||
|
||||
@@ -87,7 +87,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws
|
||||
&& rm awscliv2.zip
|
||||
|
||||
# Mold: A Modern Linker
|
||||
ENV MOLD_VERSION v2.31.0
|
||||
ENV MOLD_VERSION v2.4.0
|
||||
RUN set -e \
|
||||
&& git clone https://github.com/rui314/mold.git \
|
||||
&& mkdir mold/build \
|
||||
|
||||
@@ -241,17 +241,11 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
|
||||
FROM build-deps AS vector-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
COPY patches/pgvector.patch /pgvector.patch
|
||||
|
||||
# By default, pgvector Makefile uses `-march=native`. We don't want that,
|
||||
# because we build the images on different machines than where we run them.
|
||||
# Pass OPTFLAGS="" to remove it.
|
||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \
|
||||
echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \
|
||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
|
||||
echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
|
||||
mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
|
||||
patch -p1 < /pgvector.patch && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
|
||||
|
||||
#########################################################################################
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
[](https://neon.tech)
|
||||
|
||||
|
||||
[](https://neon.tech)
|
||||
|
||||
# Neon
|
||||
|
||||
|
||||
@@ -27,12 +27,10 @@ reqwest = { workspace = true, features = ["json"] }
|
||||
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
||||
tokio-postgres.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-opentelemetry.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
tracing-utils.workspace = true
|
||||
thiserror.workspace = true
|
||||
url.workspace = true
|
||||
|
||||
compute_api.workspace = true
|
||||
|
||||
@@ -1,116 +0,0 @@
|
||||
use compute_api::{
|
||||
responses::CatalogObjects,
|
||||
spec::{Database, Role},
|
||||
};
|
||||
use futures::Stream;
|
||||
use postgres::{Client, NoTls};
|
||||
use std::{path::Path, process::Stdio, result::Result, sync::Arc};
|
||||
use tokio::{
|
||||
io::{AsyncBufReadExt, BufReader},
|
||||
process::Command,
|
||||
task,
|
||||
};
|
||||
use tokio_stream::{self as stream, StreamExt};
|
||||
use tokio_util::codec::{BytesCodec, FramedRead};
|
||||
use tracing::warn;
|
||||
|
||||
use crate::{
|
||||
compute::ComputeNode,
|
||||
pg_helpers::{get_existing_dbs, get_existing_roles},
|
||||
};
|
||||
|
||||
pub async fn get_dbs_and_roles(compute: &Arc<ComputeNode>) -> anyhow::Result<CatalogObjects> {
|
||||
let connstr = compute.connstr.clone();
|
||||
task::spawn_blocking(move || {
|
||||
let mut client = Client::connect(connstr.as_str(), NoTls)?;
|
||||
let roles: Vec<Role>;
|
||||
{
|
||||
let mut xact = client.transaction()?;
|
||||
roles = get_existing_roles(&mut xact)?;
|
||||
}
|
||||
let databases: Vec<Database> = get_existing_dbs(&mut client)?.values().cloned().collect();
|
||||
|
||||
Ok(CatalogObjects { roles, databases })
|
||||
})
|
||||
.await?
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum SchemaDumpError {
|
||||
#[error("Database does not exist.")]
|
||||
DatabaseDoesNotExist,
|
||||
#[error("Failed to execute pg_dump.")]
|
||||
IO(#[from] std::io::Error),
|
||||
}
|
||||
|
||||
// It uses the pg_dump utility to dump the schema of the specified database.
|
||||
// The output is streamed back to the caller and supposed to be streamed via HTTP.
|
||||
//
|
||||
// Before return the result with the output, it checks that pg_dump produced any output.
|
||||
// If not, it tries to parse the stderr output to determine if the database does not exist
|
||||
// and special error is returned.
|
||||
//
|
||||
// To make sure that the process is killed when the caller drops the stream, we use tokio kill_on_drop feature.
|
||||
pub async fn get_database_schema(
|
||||
compute: &Arc<ComputeNode>,
|
||||
dbname: &str,
|
||||
) -> Result<impl Stream<Item = Result<bytes::Bytes, std::io::Error>>, SchemaDumpError> {
|
||||
let pgbin = &compute.pgbin;
|
||||
let basepath = Path::new(pgbin).parent().unwrap();
|
||||
let pgdump = basepath.join("pg_dump");
|
||||
let mut connstr = compute.connstr.clone();
|
||||
connstr.set_path(dbname);
|
||||
let mut cmd = Command::new(pgdump)
|
||||
.arg("--schema-only")
|
||||
.arg(connstr.as_str())
|
||||
.stdout(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.kill_on_drop(true)
|
||||
.spawn()?;
|
||||
|
||||
let stdout = cmd.stdout.take().ok_or_else(|| {
|
||||
std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stdout.")
|
||||
})?;
|
||||
|
||||
let stderr = cmd.stderr.take().ok_or_else(|| {
|
||||
std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stderr.")
|
||||
})?;
|
||||
|
||||
let mut stdout_reader = FramedRead::new(stdout, BytesCodec::new());
|
||||
let stderr_reader = BufReader::new(stderr);
|
||||
|
||||
let first_chunk = match stdout_reader.next().await {
|
||||
Some(Ok(bytes)) if !bytes.is_empty() => bytes,
|
||||
Some(Err(e)) => {
|
||||
return Err(SchemaDumpError::IO(e));
|
||||
}
|
||||
_ => {
|
||||
let mut lines = stderr_reader.lines();
|
||||
if let Some(line) = lines.next_line().await? {
|
||||
if line.contains(&format!("FATAL: database \"{}\" does not exist", dbname)) {
|
||||
return Err(SchemaDumpError::DatabaseDoesNotExist);
|
||||
}
|
||||
warn!("pg_dump stderr: {}", line)
|
||||
}
|
||||
tokio::spawn(async move {
|
||||
while let Ok(Some(line)) = lines.next_line().await {
|
||||
warn!("pg_dump stderr: {}", line)
|
||||
}
|
||||
});
|
||||
|
||||
return Err(SchemaDumpError::IO(std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
"failed to start pg_dump",
|
||||
)));
|
||||
}
|
||||
};
|
||||
let initial_stream = stream::once(Ok(first_chunk.freeze()));
|
||||
// Consume stderr and log warnings
|
||||
tokio::spawn(async move {
|
||||
let mut lines = stderr_reader.lines();
|
||||
while let Ok(Some(line)) = lines.next_line().await {
|
||||
warn!("pg_dump stderr: {}", line)
|
||||
}
|
||||
});
|
||||
Ok(initial_stream.chain(stdout_reader.map(|res| res.map(|b| b.freeze()))))
|
||||
}
|
||||
@@ -5,21 +5,17 @@ use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
||||
use crate::catalog::SchemaDumpError;
|
||||
use crate::catalog::{get_database_schema, get_dbs_and_roles};
|
||||
use crate::compute::forward_termination_signal;
|
||||
use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
|
||||
use compute_api::requests::ConfigurationRequest;
|
||||
use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
|
||||
|
||||
use anyhow::Result;
|
||||
use hyper::header::CONTENT_TYPE;
|
||||
use hyper::service::{make_service_fn, service_fn};
|
||||
use hyper::{Body, Method, Request, Response, Server, StatusCode};
|
||||
use tokio::task;
|
||||
use tracing::{error, info, warn};
|
||||
use tracing_utils::http::OtelName;
|
||||
use utils::http::request::must_get_query_param;
|
||||
|
||||
fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
|
||||
ComputeStatusResponse {
|
||||
@@ -137,34 +133,6 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
||||
}
|
||||
}
|
||||
|
||||
(&Method::GET, "/dbs_and_roles") => {
|
||||
info!("serving /dbs_and_roles GET request",);
|
||||
match get_dbs_and_roles(compute).await {
|
||||
Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
|
||||
Err(_) => {
|
||||
render_json_error("can't get dbs and roles", StatusCode::INTERNAL_SERVER_ERROR)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
(&Method::GET, "/database_schema") => {
|
||||
let database = match must_get_query_param(&req, "database") {
|
||||
Err(e) => return e.into_response(),
|
||||
Ok(database) => database,
|
||||
};
|
||||
info!("serving /database_schema GET request with database: {database}",);
|
||||
match get_database_schema(compute, &database).await {
|
||||
Ok(res) => render_plain(Body::wrap_stream(res)),
|
||||
Err(SchemaDumpError::DatabaseDoesNotExist) => {
|
||||
render_json_error("database does not exist", StatusCode::NOT_FOUND)
|
||||
}
|
||||
Err(e) => {
|
||||
error!("can't get schema dump: {}", e);
|
||||
render_json_error("can't get schema dump", StatusCode::INTERNAL_SERVER_ERROR)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// download extension files from remote extension storage on demand
|
||||
(&Method::POST, route) if route.starts_with("/extension_server/") => {
|
||||
info!("serving {:?} POST request", route);
|
||||
@@ -335,25 +303,10 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
|
||||
};
|
||||
Response::builder()
|
||||
.status(status)
|
||||
.header(CONTENT_TYPE, "application/json")
|
||||
.body(Body::from(serde_json::to_string(&error).unwrap()))
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn render_json(body: Body) -> Response<Body> {
|
||||
Response::builder()
|
||||
.header(CONTENT_TYPE, "application/json")
|
||||
.body(body)
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn render_plain(body: Body) -> Response<Body> {
|
||||
Response::builder()
|
||||
.header(CONTENT_TYPE, "text/plain")
|
||||
.body(body)
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
|
||||
{
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
|
||||
@@ -68,51 +68,6 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Info"
|
||||
|
||||
/dbs_and_roles:
|
||||
get:
|
||||
tags:
|
||||
- Info
|
||||
summary: Get databases and roles in the catalog.
|
||||
description: ""
|
||||
operationId: getDbsAndRoles
|
||||
responses:
|
||||
200:
|
||||
description: Compute schema objects
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/DbsAndRoles"
|
||||
|
||||
/database_schema:
|
||||
get:
|
||||
tags:
|
||||
- Info
|
||||
summary: Get schema dump
|
||||
parameters:
|
||||
- name: database
|
||||
in: query
|
||||
description: Database name to dump.
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
example: "postgres"
|
||||
description: Get schema dump in SQL format.
|
||||
operationId: getDatabaseSchema
|
||||
responses:
|
||||
200:
|
||||
description: Schema dump
|
||||
content:
|
||||
text/plain:
|
||||
schema:
|
||||
type: string
|
||||
description: Schema dump in SQL format.
|
||||
404:
|
||||
description: Non existing database.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
|
||||
/check_writability:
|
||||
post:
|
||||
tags:
|
||||
@@ -274,73 +229,6 @@ components:
|
||||
num_cpus:
|
||||
type: integer
|
||||
|
||||
DbsAndRoles:
|
||||
type: object
|
||||
description: Databases and Roles
|
||||
required:
|
||||
- roles
|
||||
- databases
|
||||
properties:
|
||||
roles:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/Role"
|
||||
databases:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/Database"
|
||||
|
||||
Database:
|
||||
type: object
|
||||
description: Database
|
||||
required:
|
||||
- name
|
||||
- owner
|
||||
- restrict_conn
|
||||
- invalid
|
||||
properties:
|
||||
name:
|
||||
type: string
|
||||
owner:
|
||||
type: string
|
||||
options:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/GenericOption"
|
||||
restrict_conn:
|
||||
type: boolean
|
||||
invalid:
|
||||
type: boolean
|
||||
|
||||
Role:
|
||||
type: object
|
||||
description: Role
|
||||
required:
|
||||
- name
|
||||
properties:
|
||||
name:
|
||||
type: string
|
||||
encrypted_password:
|
||||
type: string
|
||||
options:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/GenericOption"
|
||||
|
||||
GenericOption:
|
||||
type: object
|
||||
description: Schema Generic option
|
||||
required:
|
||||
- name
|
||||
- vartype
|
||||
properties:
|
||||
name:
|
||||
type: string
|
||||
value:
|
||||
type: string
|
||||
vartype:
|
||||
type: string
|
||||
|
||||
ComputeState:
|
||||
type: object
|
||||
required:
|
||||
|
||||
@@ -8,7 +8,6 @@ pub mod configurator;
|
||||
pub mod http;
|
||||
#[macro_use]
|
||||
pub mod logger;
|
||||
pub mod catalog;
|
||||
pub mod compute;
|
||||
pub mod extension_server;
|
||||
pub mod monitor;
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use tracing::warn;
|
||||
|
||||
@@ -19,24 +17,17 @@ pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> {
|
||||
.arg(size_bytes.to_string())
|
||||
.spawn();
|
||||
|
||||
if matches!(&child_result, Err(e) if e.kind() == std::io::ErrorKind::NotFound) {
|
||||
warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
child_result
|
||||
.context("spawn() failed")
|
||||
.and_then(|mut child| child.wait().context("wait() failed"))
|
||||
.and_then(|status| match status.success() {
|
||||
true => Ok(()),
|
||||
false => {
|
||||
// The command failed. Maybe it was because the resize-swap file doesn't exist?
|
||||
// The --once flag causes it to delete itself on success so we don't disable swap
|
||||
// while postgres is running; maybe this is fine.
|
||||
match Path::new(RESIZE_SWAP_BIN).try_exists() {
|
||||
Err(_) | Ok(true) => Err(anyhow!("process exited with {status}")),
|
||||
// The path doesn't exist; we're actually ok
|
||||
Ok(false) => {
|
||||
warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
|
||||
Ok(())
|
||||
},
|
||||
}
|
||||
}
|
||||
false => Err(anyhow!("process exited with {status}")),
|
||||
})
|
||||
// wrap any prior error with the overall context that we couldn't run the command
|
||||
.with_context(|| {
|
||||
|
||||
@@ -152,9 +152,6 @@ pub struct NeonStorageControllerConf {
|
||||
/// Heartbeat timeout before marking a node offline
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub max_unavailable: Duration,
|
||||
|
||||
/// Threshold for auto-splitting a tenant into shards
|
||||
pub split_threshold: Option<u64>,
|
||||
}
|
||||
|
||||
impl NeonStorageControllerConf {
|
||||
@@ -167,7 +164,6 @@ impl Default for NeonStorageControllerConf {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL,
|
||||
split_threshold: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -243,13 +243,9 @@ impl StorageController {
|
||||
anyhow::bail!("initdb failed with status {status}");
|
||||
}
|
||||
|
||||
// Write a minimal config file:
|
||||
// - Specify the port, since this is chosen dynamically
|
||||
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
|
||||
// the storage controller we don't want a slow local disk to interfere with that.
|
||||
tokio::fs::write(
|
||||
&pg_data_path.join("postgresql.conf"),
|
||||
format!("port = {}\nfsync=off\n", self.postgres_port),
|
||||
format!("port = {}", self.postgres_port),
|
||||
)
|
||||
.await?;
|
||||
};
|
||||
@@ -309,10 +305,6 @@ impl StorageController {
|
||||
));
|
||||
}
|
||||
|
||||
if let Some(split_threshold) = self.config.split_threshold.as_ref() {
|
||||
args.push(format!("--split-threshold={split_threshold}"))
|
||||
}
|
||||
|
||||
background_process::start_process(
|
||||
COMMAND,
|
||||
&self.env.base_data_dir,
|
||||
|
||||
@@ -99,13 +99,6 @@ name = "async-executor"
|
||||
[[bans.deny]]
|
||||
name = "smol"
|
||||
|
||||
[[bans.deny]]
|
||||
# We want to use rustls instead of the platform's native tls implementation.
|
||||
name = "native-tls"
|
||||
|
||||
[[bans.deny]]
|
||||
name = "openssl"
|
||||
|
||||
# This section is considered when running `cargo deny check sources`.
|
||||
# More documentation about the 'sources' section can be found here:
|
||||
# https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
ARG REPOSITORY=neondatabase
|
||||
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
ARG COMPUTE_IMAGE=compute-node-v14
|
||||
ARG TAG=latest
|
||||
|
||||
|
||||
@@ -8,6 +8,8 @@
|
||||
# Their defaults point at DockerHub `neondatabase/neon:latest` image.`,
|
||||
# to verify custom image builds (e.g pre-published ones).
|
||||
|
||||
# XXX: Current does not work on M1 macs due to x86_64 Docker images compiled only, and no seccomp support in M1 Docker emulation layer.
|
||||
|
||||
set -eux -o pipefail
|
||||
|
||||
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
use chrono::{DateTime, Utc};
|
||||
use serde::{Deserialize, Serialize, Serializer};
|
||||
|
||||
use crate::spec::{ComputeSpec, Database, Role};
|
||||
use crate::spec::ComputeSpec;
|
||||
|
||||
#[derive(Serialize, Debug, Deserialize)]
|
||||
pub struct GenericAPIError {
|
||||
@@ -113,12 +113,6 @@ pub struct ComputeMetrics {
|
||||
pub total_ext_download_size: u64,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default, Serialize)]
|
||||
pub struct CatalogObjects {
|
||||
pub roles: Vec<Role>,
|
||||
pub databases: Vec<Database>,
|
||||
}
|
||||
|
||||
/// Response of the `/computes/{compute_id}/spec` control-plane API.
|
||||
/// This is not actually a compute API response, so consider moving
|
||||
/// to a different place.
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use anyhow::{bail, Result};
|
||||
use byteorder::{ByteOrder, BE};
|
||||
use bytes::BufMut;
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi::{Oid, TransactionId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -52,8 +53,14 @@ impl Key {
|
||||
/// Encode a metadata key to a storage key.
|
||||
pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self {
|
||||
assert!(is_metadata_key_slice(key), "key not in metadata key range");
|
||||
// Metadata key space ends at 0x7F so it's fine to directly convert it to i128.
|
||||
Self::from_i128(i128::from_be_bytes(*key))
|
||||
Key {
|
||||
field1: key[0],
|
||||
field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32,
|
||||
field3: u32::from_be_bytes(key[3..7].try_into().unwrap()),
|
||||
field4: u32::from_be_bytes(key[7..11].try_into().unwrap()),
|
||||
field5: key[11],
|
||||
field6: u32::from_be_bytes(key[12..16].try_into().unwrap()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Encode a metadata key to a storage key.
|
||||
@@ -61,6 +68,17 @@ impl Key {
|
||||
Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key"))
|
||||
}
|
||||
|
||||
/// Extract a metadata key to a writer. The result should always be 16 bytes.
|
||||
pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) {
|
||||
writer.put_u8(self.field1);
|
||||
assert!(self.field2 <= 0xFFFF);
|
||||
writer.put_u16(self.field2 as u16);
|
||||
writer.put_u32(self.field3);
|
||||
writer.put_u32(self.field4);
|
||||
writer.put_u8(self.field5);
|
||||
writer.put_u32(self.field6);
|
||||
}
|
||||
|
||||
/// Get the range of metadata keys.
|
||||
pub const fn metadata_key_range() -> Range<Self> {
|
||||
Key {
|
||||
@@ -103,7 +121,7 @@ impl Key {
|
||||
/// As long as Neon does not support tablespace (because of lack of access to local file system),
|
||||
/// we can assume that only some predefined namespace OIDs are used which can fit in u16
|
||||
pub fn to_i128(&self) -> i128 {
|
||||
assert!(self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
|
||||
assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
|
||||
(((self.field1 & 0x7F) as i128) << 120)
|
||||
| (((self.field2 & 0xFFFF) as i128) << 104)
|
||||
| ((self.field3 as i128) << 72)
|
||||
@@ -157,7 +175,7 @@ impl Key {
|
||||
}
|
||||
|
||||
/// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently.
|
||||
/// Use [`Key::from_i128`] instead if you want to handle 16B keys (i.e., metadata keys).
|
||||
/// Use [`Key::from_metadata_key`] instead.
|
||||
pub fn from_slice(b: &[u8]) -> Self {
|
||||
Key {
|
||||
field1: b[0],
|
||||
@@ -170,7 +188,7 @@ impl Key {
|
||||
}
|
||||
|
||||
/// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently.
|
||||
/// Use [`Key::to_i128`] instead if you want to get a 16B key (i.e., metadata keys).
|
||||
/// Use [`Key::extract_metadata_key_to_writer`] instead.
|
||||
pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
|
||||
buf[0] = self.field1;
|
||||
BE::write_u32(&mut buf[1..5], self.field2);
|
||||
@@ -381,15 +399,10 @@ pub fn rel_size_to_key(rel: RelTag) -> Key {
|
||||
field3: rel.dbnode,
|
||||
field4: rel.relnode,
|
||||
field5: rel.forknum,
|
||||
field6: 0xffff_ffff,
|
||||
field6: 0xffffffff,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn is_rel_size_key(key: &Key) -> bool {
|
||||
key.field1 == 0 && key.field6 == u32::MAX
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn rel_key_range(rel: RelTag) -> Range<Key> {
|
||||
Key {
|
||||
@@ -427,25 +440,6 @@ pub fn slru_dir_to_key(kind: SlruKind) -> Key {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn slru_dir_kind(key: &Key) -> Option<Result<SlruKind, u32>> {
|
||||
if key.field1 == 0x01
|
||||
&& key.field3 == 0
|
||||
&& key.field4 == 0
|
||||
&& key.field5 == 0
|
||||
&& key.field6 == 0
|
||||
{
|
||||
match key.field2 {
|
||||
0 => Some(Ok(SlruKind::Clog)),
|
||||
1 => Some(Ok(SlruKind::MultiXactMembers)),
|
||||
2 => Some(Ok(SlruKind::MultiXactOffsets)),
|
||||
x => Some(Err(x)),
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
|
||||
Key {
|
||||
@@ -474,18 +468,10 @@ pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
|
||||
field3: 1,
|
||||
field4: segno,
|
||||
field5: 0,
|
||||
field6: 0xffff_ffff,
|
||||
field6: 0xffffffff,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_slru_segment_size_key(key: &Key) -> bool {
|
||||
key.field1 == 0x01
|
||||
&& key.field2 < 0x03
|
||||
&& key.field3 == 0x01
|
||||
&& key.field5 == 0
|
||||
&& key.field6 == u32::MAX
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range<Key> {
|
||||
let field2 = match kind {
|
||||
@@ -701,15 +687,10 @@ mod tests {
|
||||
let mut metadata_key = vec![AUX_KEY_PREFIX];
|
||||
metadata_key.extend_from_slice(&[0xFF; 15]);
|
||||
let encoded_key = Key::from_metadata_key(&metadata_key);
|
||||
let output_key = encoded_key.to_i128().to_be_bytes();
|
||||
let mut output_key = Vec::new();
|
||||
encoded_key.extract_metadata_key_to_writer(&mut output_key);
|
||||
assert_eq!(metadata_key, output_key);
|
||||
assert!(encoded_key.is_metadata_key());
|
||||
assert!(is_metadata_key_slice(&metadata_key));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_possible_largest_key() {
|
||||
Key::from_i128(0x7FFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF);
|
||||
// TODO: put this key into the system and see if anything breaks.
|
||||
}
|
||||
}
|
||||
|
||||
@@ -307,7 +307,7 @@ impl KeySpace {
|
||||
}
|
||||
|
||||
/// Merge another keyspace into the current one.
|
||||
/// Note: the keyspaces must not overlap (enforced via assertions). To merge overlapping key ranges, use `KeySpaceRandomAccum`.
|
||||
/// Note: the keyspaces must not ovelap (enforced via assertions)
|
||||
pub fn merge(&mut self, other: &KeySpace) {
|
||||
let all_ranges = self
|
||||
.ranges
|
||||
|
||||
@@ -9,7 +9,7 @@ use std::{
|
||||
collections::HashMap,
|
||||
io::{BufRead, Read},
|
||||
num::{NonZeroU64, NonZeroUsize},
|
||||
sync::atomic::AtomicUsize,
|
||||
str::FromStr,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
|
||||
@@ -161,22 +161,6 @@ impl std::fmt::Debug for TenantState {
|
||||
}
|
||||
}
|
||||
|
||||
/// A temporary lease to a specific lsn inside a timeline.
|
||||
/// Access to the lsn is guaranteed by the pageserver until the expiration indicated by `valid_until`.
|
||||
#[serde_as]
|
||||
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
|
||||
pub struct LsnLease {
|
||||
#[serde_as(as = "SystemTimeAsRfc3339Millis")]
|
||||
pub valid_until: SystemTime,
|
||||
}
|
||||
|
||||
serde_with::serde_conv!(
|
||||
SystemTimeAsRfc3339Millis,
|
||||
SystemTime,
|
||||
|time: &SystemTime| humantime::format_rfc3339_millis(*time).to_string(),
|
||||
|value: String| -> Result<_, humantime::TimestampError> { humantime::parse_rfc3339(&value) }
|
||||
);
|
||||
|
||||
/// The only [`TenantState`] variants we could be `TenantState::Activating` from.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub enum ActivatingFrom {
|
||||
@@ -305,7 +289,7 @@ pub struct TenantConfig {
|
||||
pub compaction_period: Option<String>,
|
||||
pub compaction_threshold: Option<usize>,
|
||||
// defer parsing compaction_algorithm, like eviction_policy
|
||||
pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
|
||||
pub compaction_algorithm: Option<CompactionAlgorithm>,
|
||||
pub gc_horizon: Option<u64>,
|
||||
pub gc_period: Option<String>,
|
||||
pub image_creation_threshold: Option<usize>,
|
||||
@@ -324,100 +308,28 @@ pub struct TenantConfig {
|
||||
pub switch_aux_file_policy: Option<AuxFilePolicy>,
|
||||
}
|
||||
|
||||
/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy`
|
||||
/// tenant config. When the first aux file written, the policy will be persisted in the
|
||||
/// `index_part.json` file and has a limited migration path.
|
||||
///
|
||||
/// Currently, we only allow the following migration path:
|
||||
///
|
||||
/// Unset -> V1
|
||||
/// -> V2
|
||||
/// -> CrossValidation -> V2
|
||||
#[derive(
|
||||
Eq,
|
||||
PartialEq,
|
||||
Debug,
|
||||
Copy,
|
||||
Clone,
|
||||
strum_macros::EnumString,
|
||||
strum_macros::Display,
|
||||
serde_with::DeserializeFromStr,
|
||||
serde_with::SerializeDisplay,
|
||||
)]
|
||||
#[strum(serialize_all = "kebab-case")]
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum AuxFilePolicy {
|
||||
/// V1 aux file policy: store everything in AUX_FILE_KEY
|
||||
#[strum(ascii_case_insensitive)]
|
||||
V1,
|
||||
/// V2 aux file policy: store in the AUX_FILE keyspace
|
||||
#[strum(ascii_case_insensitive)]
|
||||
V2,
|
||||
/// Cross validation runs both formats on the write path and does validation
|
||||
/// on the read path.
|
||||
#[strum(ascii_case_insensitive)]
|
||||
CrossValidation,
|
||||
}
|
||||
|
||||
impl AuxFilePolicy {
|
||||
pub fn is_valid_migration_path(from: Option<Self>, to: Self) -> bool {
|
||||
matches!(
|
||||
(from, to),
|
||||
(None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2)
|
||||
)
|
||||
}
|
||||
impl FromStr for AuxFilePolicy {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
/// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
|
||||
pub fn default_tenant_config() -> Self {
|
||||
Self::V1
|
||||
}
|
||||
}
|
||||
|
||||
/// The aux file policy memory flag. Users can store `Option<AuxFilePolicy>` into this atomic flag. 0 == unspecified.
|
||||
pub struct AtomicAuxFilePolicy(AtomicUsize);
|
||||
|
||||
impl AtomicAuxFilePolicy {
|
||||
pub fn new(policy: Option<AuxFilePolicy>) -> Self {
|
||||
Self(AtomicUsize::new(
|
||||
policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
|
||||
))
|
||||
}
|
||||
|
||||
pub fn load(&self) -> Option<AuxFilePolicy> {
|
||||
match self.0.load(std::sync::atomic::Ordering::Acquire) {
|
||||
0 => None,
|
||||
other => Some(AuxFilePolicy::from_usize(other)),
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
let s = s.to_lowercase();
|
||||
if s == "v1" {
|
||||
Ok(Self::V1)
|
||||
} else if s == "v2" {
|
||||
Ok(Self::V2)
|
||||
} else if s == "crossvalidation" || s == "cross_validation" {
|
||||
Ok(Self::CrossValidation)
|
||||
} else {
|
||||
anyhow::bail!("cannot parse {} to aux file policy", s)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn store(&self, policy: Option<AuxFilePolicy>) {
|
||||
self.0.store(
|
||||
policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
|
||||
std::sync::atomic::Ordering::Release,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
impl AuxFilePolicy {
|
||||
pub fn to_usize(self) -> usize {
|
||||
match self {
|
||||
Self::V1 => 1,
|
||||
Self::CrossValidation => 2,
|
||||
Self::V2 => 3,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn try_from_usize(this: usize) -> Option<Self> {
|
||||
match this {
|
||||
1 => Some(Self::V1),
|
||||
2 => Some(Self::CrossValidation),
|
||||
3 => Some(Self::V2),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_usize(this: usize) -> Self {
|
||||
Self::try_from_usize(this).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
@@ -438,28 +350,13 @@ impl EvictionPolicy {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(
|
||||
Eq,
|
||||
PartialEq,
|
||||
Debug,
|
||||
Copy,
|
||||
Clone,
|
||||
strum_macros::EnumString,
|
||||
strum_macros::Display,
|
||||
serde_with::DeserializeFromStr,
|
||||
serde_with::SerializeDisplay,
|
||||
)]
|
||||
#[strum(serialize_all = "kebab-case")]
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(tag = "kind")]
|
||||
pub enum CompactionAlgorithm {
|
||||
Legacy,
|
||||
Tiered,
|
||||
}
|
||||
|
||||
#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct CompactionAlgorithmSettings {
|
||||
pub kind: CompactionAlgorithm,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct EvictionPolicyLayerAccessThreshold {
|
||||
#[serde(with = "humantime_serde")]
|
||||
@@ -707,9 +604,6 @@ pub struct TimelineInfo {
|
||||
pub state: TimelineState,
|
||||
|
||||
pub walreceiver_status: String,
|
||||
|
||||
/// The last aux file policy being used on this timeline
|
||||
pub last_aux_file_policy: Option<AuxFilePolicy>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
@@ -816,8 +710,6 @@ pub enum HistoricLayerInfo {
|
||||
lsn_end: Lsn,
|
||||
remote: bool,
|
||||
access_stats: LayerAccessStats,
|
||||
|
||||
l0: bool,
|
||||
},
|
||||
Image {
|
||||
layer_file_name: String,
|
||||
@@ -853,16 +745,6 @@ impl HistoricLayerInfo {
|
||||
};
|
||||
*field = value;
|
||||
}
|
||||
pub fn layer_file_size(&self) -> u64 {
|
||||
match self {
|
||||
HistoricLayerInfo::Delta {
|
||||
layer_file_size, ..
|
||||
} => *layer_file_size,
|
||||
HistoricLayerInfo::Image {
|
||||
layer_file_size, ..
|
||||
} => *layer_file_size,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
@@ -870,16 +752,6 @@ pub struct DownloadRemoteLayersTaskSpawnRequest {
|
||||
pub max_concurrent_downloads: NonZeroUsize,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct IngestAuxFilesRequest {
|
||||
pub aux_files: HashMap<String, String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct ListAuxFilesRequest {
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct DownloadRemoteLayersTaskInfo {
|
||||
pub task_id: String,
|
||||
@@ -904,6 +776,9 @@ pub struct TimelineGcRequest {
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct WalRedoManagerProcessStatus {
|
||||
pub pid: u32,
|
||||
/// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`.
|
||||
/// `ProcessKind` are a transitory thing, so, they have no enum representation in `pageserver_api`.
|
||||
pub kind: Cow<'static, str>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
@@ -942,55 +817,6 @@ pub struct TenantScanRemoteStorageResponse {
|
||||
pub shards: Vec<TenantScanRemoteStorageShard>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum TenantSorting {
|
||||
ResidentSize,
|
||||
MaxLogicalSize,
|
||||
}
|
||||
|
||||
impl Default for TenantSorting {
|
||||
fn default() -> Self {
|
||||
Self::ResidentSize
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||
pub struct TopTenantShardsRequest {
|
||||
// How would you like to sort the tenants?
|
||||
pub order_by: TenantSorting,
|
||||
|
||||
// How many results?
|
||||
pub limit: usize,
|
||||
|
||||
// Omit tenants with more than this many shards (e.g. if this is the max number of shards
|
||||
// that the caller would ever split to)
|
||||
pub where_shards_lt: Option<ShardCount>,
|
||||
|
||||
// Omit tenants where the ordering metric is less than this (this is an optimization to
|
||||
// let us quickly exclude numerous tiny shards)
|
||||
pub where_gt: Option<u64>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
|
||||
pub struct TopTenantShardItem {
|
||||
pub id: TenantShardId,
|
||||
|
||||
/// Total size of layers on local disk for all timelines in this tenant
|
||||
pub resident_size: u64,
|
||||
|
||||
/// Total size of layers in remote storage for all timelines in this tenant
|
||||
pub physical_size: u64,
|
||||
|
||||
/// The largest logical size of a timeline within this tenant
|
||||
pub max_logical_size: u64,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Default)]
|
||||
pub struct TopTenantShardsResponse {
|
||||
pub shards: Vec<TopTenantShardItem>,
|
||||
}
|
||||
|
||||
pub mod virtual_file {
|
||||
#[derive(
|
||||
Copy,
|
||||
@@ -1416,7 +1242,6 @@ impl PagestreamBeMessage {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use serde_json::json;
|
||||
use std::str::FromStr;
|
||||
|
||||
use super::*;
|
||||
|
||||
@@ -1624,69 +1449,4 @@ mod tests {
|
||||
assert_eq!(actual, expected, "example on {line}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_aux_file_migration_path() {
|
||||
assert!(AuxFilePolicy::is_valid_migration_path(
|
||||
None,
|
||||
AuxFilePolicy::V1
|
||||
));
|
||||
assert!(AuxFilePolicy::is_valid_migration_path(
|
||||
None,
|
||||
AuxFilePolicy::V2
|
||||
));
|
||||
assert!(AuxFilePolicy::is_valid_migration_path(
|
||||
None,
|
||||
AuxFilePolicy::CrossValidation
|
||||
));
|
||||
// Self-migration is not a valid migration path, and the caller should handle it by itself.
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V1),
|
||||
AuxFilePolicy::V1
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V2),
|
||||
AuxFilePolicy::V2
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::CrossValidation),
|
||||
AuxFilePolicy::CrossValidation
|
||||
));
|
||||
// Migrations not allowed
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::CrossValidation),
|
||||
AuxFilePolicy::V1
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V1),
|
||||
AuxFilePolicy::V2
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V2),
|
||||
AuxFilePolicy::V1
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V2),
|
||||
AuxFilePolicy::CrossValidation
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V1),
|
||||
AuxFilePolicy::CrossValidation
|
||||
));
|
||||
// Migrations allowed
|
||||
assert!(AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::CrossValidation),
|
||||
AuxFilePolicy::V2
|
||||
));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_aux_parse() {
|
||||
assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2);
|
||||
assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2);
|
||||
assert_eq!(
|
||||
AuxFilePolicy::from_str("cross-validation").unwrap(),
|
||||
AuxFilePolicy::CrossValidation
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::cmp::Ordering;
|
||||
use std::fmt;
|
||||
|
||||
use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID;
|
||||
use postgres_ffi::relfile_utils::{forkname_to_number, forknumber_to_name, MAIN_FORKNUM};
|
||||
use postgres_ffi::relfile_utils::forknumber_to_name;
|
||||
use postgres_ffi::Oid;
|
||||
|
||||
///
|
||||
@@ -68,57 +68,6 @@ impl fmt::Display for RelTag {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum ParseRelTagError {
|
||||
#[error("invalid forknum")]
|
||||
InvalidForknum(#[source] std::num::ParseIntError),
|
||||
#[error("missing triplet member {}", .0)]
|
||||
MissingTripletMember(usize),
|
||||
#[error("invalid triplet member {}", .0)]
|
||||
InvalidTripletMember(usize, #[source] std::num::ParseIntError),
|
||||
}
|
||||
|
||||
impl std::str::FromStr for RelTag {
|
||||
type Err = ParseRelTagError;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
use ParseRelTagError::*;
|
||||
|
||||
// FIXME: in postgres logs this separator is dot
|
||||
// Example:
|
||||
// could not read block 2 in rel 1663/208101/2620.1 from page server at lsn 0/2431E6F0
|
||||
// with a regex we could get this more painlessly
|
||||
let (triplet, forknum) = match s.split_once('_').or_else(|| s.split_once('.')) {
|
||||
Some((t, f)) => {
|
||||
let forknum = forkname_to_number(Some(f));
|
||||
let forknum = if let Ok(f) = forknum {
|
||||
f
|
||||
} else {
|
||||
f.parse::<u8>().map_err(InvalidForknum)?
|
||||
};
|
||||
|
||||
(t, Some(forknum))
|
||||
}
|
||||
None => (s, None),
|
||||
};
|
||||
|
||||
let mut split = triplet
|
||||
.splitn(3, '/')
|
||||
.enumerate()
|
||||
.map(|(i, s)| s.parse::<u32>().map_err(|e| InvalidTripletMember(i, e)));
|
||||
let spcnode = split.next().ok_or(MissingTripletMember(0))??;
|
||||
let dbnode = split.next().ok_or(MissingTripletMember(1))??;
|
||||
let relnode = split.next().ok_or(MissingTripletMember(2))??;
|
||||
|
||||
Ok(RelTag {
|
||||
spcnode,
|
||||
forknum: forknum.unwrap_or(MAIN_FORKNUM),
|
||||
dbnode,
|
||||
relnode,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl RelTag {
|
||||
pub fn to_segfile_name(&self, segno: u32) -> String {
|
||||
let mut name = if self.spcnode == GLOBALTABLESPACE_OID {
|
||||
|
||||
@@ -125,7 +125,7 @@ impl ShardCount {
|
||||
|
||||
/// `v` may be zero, or the number of shards in the tenant. `v` is what
|
||||
/// [`Self::literal`] would return.
|
||||
pub const fn new(val: u8) -> Self {
|
||||
pub fn new(val: u8) -> Self {
|
||||
Self(val)
|
||||
}
|
||||
}
|
||||
@@ -428,12 +428,6 @@ impl<'de> Deserialize<'de> for TenantShardId {
|
||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
||||
pub struct ShardStripeSize(pub u32);
|
||||
|
||||
impl Default for ShardStripeSize {
|
||||
fn default() -> Self {
|
||||
DEFAULT_STRIPE_SIZE
|
||||
}
|
||||
}
|
||||
|
||||
/// Layout version: for future upgrades where we might change how the key->shard mapping works
|
||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
||||
pub struct ShardLayout(u8);
|
||||
@@ -565,14 +559,6 @@ impl ShardIdentity {
|
||||
}
|
||||
}
|
||||
|
||||
/// Obtains the shard number and count combined into a `ShardIndex`.
|
||||
pub fn shard_index(&self) -> ShardIndex {
|
||||
ShardIndex {
|
||||
shard_count: self.count,
|
||||
shard_number: self.number,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn shard_slug(&self) -> String {
|
||||
if self.count > ShardCount(0) {
|
||||
format!("-{:02x}{:02x}", self.number.0, self.count.0)
|
||||
@@ -719,25 +705,6 @@ fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Ke
|
||||
ShardNumber((hash % count.0 as u32) as u8)
|
||||
}
|
||||
|
||||
/// For debugging, while not exposing the internals.
|
||||
#[derive(Debug)]
|
||||
#[allow(unused)] // used by debug formatting by pagectl
|
||||
struct KeyShardingInfo {
|
||||
shard0: bool,
|
||||
shard_number: ShardNumber,
|
||||
}
|
||||
|
||||
pub fn describe(
|
||||
key: &Key,
|
||||
shard_count: ShardCount,
|
||||
stripe_size: ShardStripeSize,
|
||||
) -> impl std::fmt::Debug {
|
||||
KeyShardingInfo {
|
||||
shard0: key_is_shard0(key),
|
||||
shard_number: key_to_shard_number(shard_count, stripe_size, key),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use utils::Hex;
|
||||
|
||||
@@ -820,11 +820,10 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
Ok(ProcessMsgResult::Continue)
|
||||
}
|
||||
|
||||
/// - Log as info/error result of handling COPY stream and send back
|
||||
/// ErrorResponse if that makes sense.
|
||||
/// - Shutdown the stream if we got Terminate.
|
||||
/// - Then close the connection because we don't handle exiting from COPY
|
||||
/// stream normally.
|
||||
/// Log as info/error result of handling COPY stream and send back
|
||||
/// ErrorResponse if that makes sense. Shutdown the stream if we got
|
||||
/// Terminate. TODO: transition into waiting for Sync msg if we initiate the
|
||||
/// close.
|
||||
pub async fn handle_copy_stream_end(&mut self, end: CopyStreamHandlerEnd) {
|
||||
use CopyStreamHandlerEnd::*;
|
||||
|
||||
@@ -850,6 +849,10 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
}
|
||||
}
|
||||
|
||||
if let Terminate = &end {
|
||||
self.state = ProtoState::Closed;
|
||||
}
|
||||
|
||||
let err_to_send_and_errcode = match &end {
|
||||
ServerInitiated(_) => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)),
|
||||
Other(_) => Some((format!("{end:#}"), SQLSTATE_INTERNAL_ERROR)),
|
||||
@@ -879,12 +882,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
error!("failed to send ErrorResponse: {}", ee);
|
||||
}
|
||||
}
|
||||
|
||||
// Proper COPY stream finishing to continue using the connection is not
|
||||
// implemented at the server side (we don't need it so far). To prevent
|
||||
// further usages of the connection, close it.
|
||||
self.framed.shutdown().await.ok();
|
||||
self.state = ProtoState::Closed;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -178,13 +178,6 @@ impl PgConnectionConfig {
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for PgConnectionConfig {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
// The password is intentionally hidden and not part of this display string.
|
||||
write!(f, "postgresql://{}:{}", self.host, self.port)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for PgConnectionConfig {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
// We want `password: Some(REDACTED-STRING)`, not `password: Some("REDACTED-STRING")`
|
||||
|
||||
@@ -359,7 +359,7 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
|
||||
// Is there enough space on the page for another logical message and an
|
||||
// XLOG_SWITCH? If not, start over.
|
||||
let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64;
|
||||
if page_remain < base_size + XLOG_SIZE_OF_XLOG_RECORD as u64 {
|
||||
if page_remain < base_size - XLOG_SIZE_OF_XLOG_RECORD as u64 {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -373,29 +373,31 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
|
||||
"SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
|
||||
&[&(repeats as i32)],
|
||||
)?;
|
||||
info!(
|
||||
"current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
|
||||
client.pg_current_wal_insert_lsn()?,
|
||||
XLOG_SIZE_OF_XLOG_RECORD
|
||||
);
|
||||
|
||||
// Emit the XLOG_SWITCH
|
||||
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
|
||||
let xlog_switch_record_end: PgLsn =
|
||||
client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
|
||||
|
||||
if u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
|
||||
!= XLOG_SIZE_OF_XLOG_SHORT_PHD
|
||||
{
|
||||
warn!(
|
||||
"XLOG_SWITCH message ended not on page boundary: {}, offset = {}, repeating",
|
||||
xlog_switch_record_end,
|
||||
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
|
||||
);
|
||||
continue;
|
||||
}
|
||||
return Ok(vec![before_xlog_switch, xlog_switch_record_end]);
|
||||
break;
|
||||
}
|
||||
info!(
|
||||
"current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
|
||||
client.pg_current_wal_insert_lsn()?,
|
||||
XLOG_SIZE_OF_XLOG_RECORD
|
||||
);
|
||||
|
||||
// Emit the XLOG_SWITCH
|
||||
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
|
||||
let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
|
||||
let next_segment = PgLsn::from(0x0200_0000);
|
||||
ensure!(
|
||||
xlog_switch_record_end < next_segment,
|
||||
"XLOG_SWITCH record ended on or after the expected segment boundary: {} > {}",
|
||||
xlog_switch_record_end,
|
||||
next_segment
|
||||
);
|
||||
ensure!(
|
||||
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
|
||||
"XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
|
||||
xlog_switch_record_end,
|
||||
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
|
||||
);
|
||||
Ok(vec![before_xlog_switch, xlog_switch_record_end])
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -7,7 +7,6 @@ license.workspace = true
|
||||
[dependencies]
|
||||
bytes.workspace = true
|
||||
byteorder.workspace = true
|
||||
itertools.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
rand.workspace = true
|
||||
|
||||
@@ -7,9 +7,8 @@ pub mod framed;
|
||||
|
||||
use byteorder::{BigEndian, ReadBytesExt};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use itertools::Itertools;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{borrow::Cow, fmt, io, str};
|
||||
use std::{borrow::Cow, collections::HashMap, fmt, io, str};
|
||||
|
||||
// re-export for use in utils pageserver_feedback.rs
|
||||
pub use postgres_protocol::PG_EPOCH;
|
||||
@@ -51,37 +50,15 @@ pub enum FeStartupPacket {
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct StartupMessageParamsBuilder {
|
||||
params: BytesMut,
|
||||
}
|
||||
|
||||
impl StartupMessageParamsBuilder {
|
||||
/// Set parameter's value by its name.
|
||||
/// name and value must not contain a \0 byte
|
||||
pub fn insert(&mut self, name: &str, value: &str) {
|
||||
self.params.put(name.as_bytes());
|
||||
self.params.put(&b"\0"[..]);
|
||||
self.params.put(value.as_bytes());
|
||||
self.params.put(&b"\0"[..]);
|
||||
}
|
||||
|
||||
pub fn freeze(self) -> StartupMessageParams {
|
||||
StartupMessageParams {
|
||||
params: self.params.freeze(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
#[derive(Debug)]
|
||||
pub struct StartupMessageParams {
|
||||
params: Bytes,
|
||||
params: HashMap<String, String>,
|
||||
}
|
||||
|
||||
impl StartupMessageParams {
|
||||
/// Get parameter's value by its name.
|
||||
pub fn get(&self, name: &str) -> Option<&str> {
|
||||
self.iter().find_map(|(k, v)| (k == name).then_some(v))
|
||||
self.params.get(name).map(|s| s.as_str())
|
||||
}
|
||||
|
||||
/// Split command-line options according to PostgreSQL's logic,
|
||||
@@ -135,19 +112,15 @@ impl StartupMessageParams {
|
||||
|
||||
/// Iterate through key-value pairs in an arbitrary order.
|
||||
pub fn iter(&self) -> impl Iterator<Item = (&str, &str)> {
|
||||
let params =
|
||||
std::str::from_utf8(&self.params).expect("should be validated as utf8 already");
|
||||
params.split_terminator('\0').tuples()
|
||||
self.params.iter().map(|(k, v)| (k.as_str(), v.as_str()))
|
||||
}
|
||||
|
||||
// This function is mostly useful in tests.
|
||||
#[doc(hidden)]
|
||||
pub fn new<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> Self {
|
||||
let mut b = StartupMessageParamsBuilder::default();
|
||||
for (k, v) in pairs {
|
||||
b.insert(k, v)
|
||||
Self {
|
||||
params: pairs.map(|(k, v)| (k.to_owned(), v.to_owned())).into(),
|
||||
}
|
||||
b.freeze()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -372,21 +345,35 @@ impl FeStartupPacket {
|
||||
(major_version, minor_version) => {
|
||||
// StartupMessage
|
||||
|
||||
let s = str::from_utf8(&msg).map_err(|_e| {
|
||||
ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned())
|
||||
})?;
|
||||
let s = s.strip_suffix('\0').ok_or_else(|| {
|
||||
ProtocolError::Protocol(
|
||||
"StartupMessage params: missing null terminator".to_string(),
|
||||
)
|
||||
})?;
|
||||
// Parse pairs of null-terminated strings (key, value).
|
||||
// See `postgres: ProcessStartupPacket, build_startup_packet`.
|
||||
let mut tokens = str::from_utf8(&msg)
|
||||
.map_err(|_e| {
|
||||
ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned())
|
||||
})?
|
||||
.strip_suffix('\0') // drop packet's own null
|
||||
.ok_or_else(|| {
|
||||
ProtocolError::Protocol(
|
||||
"StartupMessage params: missing null terminator".to_string(),
|
||||
)
|
||||
})?
|
||||
.split_terminator('\0');
|
||||
|
||||
let mut params = HashMap::new();
|
||||
while let Some(name) = tokens.next() {
|
||||
let value = tokens.next().ok_or_else(|| {
|
||||
ProtocolError::Protocol(
|
||||
"StartupMessage params: key without value".to_string(),
|
||||
)
|
||||
})?;
|
||||
|
||||
params.insert(name.to_owned(), value.to_owned());
|
||||
}
|
||||
|
||||
FeStartupPacket::StartupMessage {
|
||||
major_version,
|
||||
minor_version,
|
||||
params: StartupMessageParams {
|
||||
params: msg.slice_ref(s.as_bytes()),
|
||||
},
|
||||
params: StartupMessageParams { params },
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -26,14 +26,13 @@ use futures::stream::Stream;
|
||||
use futures_util::StreamExt;
|
||||
use futures_util::TryStreamExt;
|
||||
use http_types::{StatusCode, Url};
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind};
|
||||
use crate::{
|
||||
error::Cancelled, AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing,
|
||||
ListingMode, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel,
|
||||
error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download,
|
||||
DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
|
||||
TimeTravelError, TimeoutOrCancel,
|
||||
};
|
||||
|
||||
pub struct AzureBlobStorage {
|
||||
@@ -138,8 +137,6 @@ impl AzureBlobStorage {
|
||||
let mut last_modified = None;
|
||||
let mut metadata = HashMap::new();
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let download = async {
|
||||
let response = builder
|
||||
// convert to concrete Pageable
|
||||
@@ -203,22 +200,13 @@ impl AzureBlobStorage {
|
||||
})
|
||||
};
|
||||
|
||||
let download = tokio::select! {
|
||||
tokio::select! {
|
||||
bufs = download => bufs,
|
||||
cancel_or_timeout = cancel_or_timeout => match cancel_or_timeout {
|
||||
TimeoutOrCancel::Timeout => return Err(DownloadError::Timeout),
|
||||
TimeoutOrCancel::Cancel => return Err(DownloadError::Cancelled),
|
||||
TimeoutOrCancel::Timeout => Err(DownloadError::Timeout),
|
||||
TimeoutOrCancel::Cancel => Err(DownloadError::Cancelled),
|
||||
},
|
||||
};
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
let outcome = match &download {
|
||||
Ok(_) => AttemptOutcome::Ok,
|
||||
Err(_) => AttemptOutcome::Err,
|
||||
};
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, outcome, started_at);
|
||||
download
|
||||
}
|
||||
}
|
||||
|
||||
async fn permit(
|
||||
@@ -352,10 +340,7 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
metadata: Option<StorageMetadata>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let kind = RequestKind::Put;
|
||||
let _permit = self.permit(kind, cancel).await?;
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
let _permit = self.permit(RequestKind::Put, cancel).await?;
|
||||
|
||||
let op = async {
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
|
||||
@@ -379,25 +364,14 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
match fut.await {
|
||||
Ok(Ok(_response)) => Ok(()),
|
||||
Ok(Err(azure)) => Err(azure.into()),
|
||||
Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()),
|
||||
Err(_timeout) => Err(TimeoutOrCancel::Cancel.into()),
|
||||
}
|
||||
};
|
||||
|
||||
let res = tokio::select! {
|
||||
tokio::select! {
|
||||
res = op => res,
|
||||
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
|
||||
};
|
||||
|
||||
let outcome = match res {
|
||||
Ok(_) => AttemptOutcome::Ok,
|
||||
Err(_) => AttemptOutcome::Err,
|
||||
};
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, outcome, started_at);
|
||||
|
||||
res
|
||||
_ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()),
|
||||
}
|
||||
}
|
||||
|
||||
async fn download(
|
||||
@@ -443,13 +417,12 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
paths: &'a [RemotePath],
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let kind = RequestKind::Delete;
|
||||
let _permit = self.permit(kind, cancel).await?;
|
||||
let started_at = start_measuring_requests(kind);
|
||||
let _permit = self.permit(RequestKind::Delete, cancel).await?;
|
||||
|
||||
let op = async {
|
||||
// TODO batch requests are not supported by the SDK
|
||||
// TODO batch requests are also not supported by the SDK
|
||||
// https://github.com/Azure/azure-sdk-for-rust/issues/1068
|
||||
// https://github.com/Azure/azure-sdk-for-rust/issues/1249
|
||||
for path in paths {
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(path));
|
||||
|
||||
@@ -474,16 +447,10 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
Ok(())
|
||||
};
|
||||
|
||||
let res = tokio::select! {
|
||||
tokio::select! {
|
||||
res = op => res,
|
||||
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
|
||||
};
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &res, started_at);
|
||||
res
|
||||
_ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()),
|
||||
}
|
||||
}
|
||||
|
||||
async fn copy(
|
||||
@@ -492,9 +459,7 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
to: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let kind = RequestKind::Copy;
|
||||
let _permit = self.permit(kind, cancel).await?;
|
||||
let started_at = start_measuring_requests(kind);
|
||||
let _permit = self.permit(RequestKind::Copy, cancel).await?;
|
||||
|
||||
let timeout = tokio::time::sleep(self.timeout);
|
||||
|
||||
@@ -538,21 +503,15 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
}
|
||||
};
|
||||
|
||||
let res = tokio::select! {
|
||||
tokio::select! {
|
||||
res = op => res,
|
||||
_ = cancel.cancelled() => return Err(anyhow::Error::new(TimeoutOrCancel::Cancel)),
|
||||
_ = cancel.cancelled() => Err(anyhow::Error::new(TimeoutOrCancel::Cancel)),
|
||||
_ = timeout => {
|
||||
let e = anyhow::Error::new(TimeoutOrCancel::Timeout);
|
||||
let e = e.context(format!("Timeout, last status: {copy_status:?}"));
|
||||
Err(e)
|
||||
},
|
||||
};
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &res, started_at);
|
||||
res
|
||||
}
|
||||
}
|
||||
|
||||
async fn time_travel_recover(
|
||||
|
||||
@@ -12,7 +12,6 @@
|
||||
mod azure_blob;
|
||||
mod error;
|
||||
mod local_fs;
|
||||
mod metrics;
|
||||
mod s3_bucket;
|
||||
mod simulate_failures;
|
||||
mod support;
|
||||
@@ -122,8 +121,8 @@ impl RemotePath {
|
||||
self.0.file_name()
|
||||
}
|
||||
|
||||
pub fn join(&self, path: impl AsRef<Utf8Path>) -> Self {
|
||||
Self(self.0.join(path))
|
||||
pub fn join(&self, segment: &Utf8Path) -> Self {
|
||||
Self(self.0.join(segment))
|
||||
}
|
||||
|
||||
pub fn get_path(&self) -> &Utf8PathBuf {
|
||||
|
||||
@@ -46,16 +46,15 @@ use utils::backoff;
|
||||
|
||||
use super::StorageMetadata;
|
||||
use crate::{
|
||||
error::Cancelled,
|
||||
metrics::{start_counting_cancelled_wait, start_measuring_requests},
|
||||
support::PermitCarrying,
|
||||
ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
|
||||
S3Config, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE,
|
||||
REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError,
|
||||
Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel,
|
||||
MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
};
|
||||
|
||||
use crate::metrics::AttemptOutcome;
|
||||
pub(super) use crate::metrics::RequestKind;
|
||||
pub(super) mod metrics;
|
||||
|
||||
use self::metrics::AttemptOutcome;
|
||||
pub(super) use self::metrics::RequestKind;
|
||||
|
||||
/// AWS S3 storage.
|
||||
pub struct S3Bucket {
|
||||
@@ -228,7 +227,7 @@ impl S3Bucket {
|
||||
};
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
crate::metrics::BUCKET_METRICS
|
||||
metrics::BUCKET_METRICS
|
||||
.wait_seconds
|
||||
.observe_elapsed(kind, started_at);
|
||||
|
||||
@@ -249,7 +248,7 @@ impl S3Bucket {
|
||||
};
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
crate::metrics::BUCKET_METRICS
|
||||
metrics::BUCKET_METRICS
|
||||
.wait_seconds
|
||||
.observe_elapsed(kind, started_at);
|
||||
Ok(permit)
|
||||
@@ -288,7 +287,7 @@ impl S3Bucket {
|
||||
// Count this in the AttemptOutcome::Ok bucket, because 404 is not
|
||||
// an error: we expect to sometimes fetch an object and find it missing,
|
||||
// e.g. when probing for timeline indices.
|
||||
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Ok,
|
||||
started_at,
|
||||
@@ -296,7 +295,7 @@ impl S3Bucket {
|
||||
return Err(DownloadError::NotFound);
|
||||
}
|
||||
Err(e) => {
|
||||
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Err,
|
||||
started_at,
|
||||
@@ -372,12 +371,12 @@ impl S3Bucket {
|
||||
};
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
crate::metrics::BUCKET_METRICS
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &resp, started_at);
|
||||
|
||||
let resp = resp.context("request deletion")?;
|
||||
crate::metrics::BUCKET_METRICS
|
||||
metrics::BUCKET_METRICS
|
||||
.deleted_objects_total
|
||||
.inc_by(chunk.len() as u64);
|
||||
|
||||
@@ -436,14 +435,14 @@ pin_project_lite::pin_project! {
|
||||
/// Times and tracks the outcome of the request.
|
||||
struct TimedDownload<S> {
|
||||
started_at: std::time::Instant,
|
||||
outcome: AttemptOutcome,
|
||||
outcome: metrics::AttemptOutcome,
|
||||
#[pin]
|
||||
inner: S
|
||||
}
|
||||
|
||||
impl<S> PinnedDrop for TimedDownload<S> {
|
||||
fn drop(mut this: Pin<&mut Self>) {
|
||||
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -452,7 +451,7 @@ impl<S> TimedDownload<S> {
|
||||
fn new(started_at: std::time::Instant, inner: S) -> Self {
|
||||
TimedDownload {
|
||||
started_at,
|
||||
outcome: AttemptOutcome::Cancelled,
|
||||
outcome: metrics::AttemptOutcome::Cancelled,
|
||||
inner,
|
||||
}
|
||||
}
|
||||
@@ -469,8 +468,8 @@ impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
|
||||
let res = ready!(this.inner.poll_next(cx));
|
||||
match &res {
|
||||
Some(Ok(_)) => {}
|
||||
Some(Err(_)) => *this.outcome = AttemptOutcome::Err,
|
||||
None => *this.outcome = AttemptOutcome::Ok,
|
||||
Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err,
|
||||
None => *this.outcome = metrics::AttemptOutcome::Ok,
|
||||
}
|
||||
|
||||
Poll::Ready(res)
|
||||
@@ -544,7 +543,7 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
|
||||
crate::metrics::BUCKET_METRICS
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &response, started_at);
|
||||
|
||||
@@ -626,7 +625,7 @@ impl RemoteStorage for S3Bucket {
|
||||
if let Ok(inner) = &res {
|
||||
// do not incl. timeouts as errors in metrics but cancellations
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
crate::metrics::BUCKET_METRICS
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, inner, started_at);
|
||||
}
|
||||
@@ -674,7 +673,7 @@ impl RemoteStorage for S3Bucket {
|
||||
};
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
crate::metrics::BUCKET_METRICS
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &res, started_at);
|
||||
|
||||
@@ -978,6 +977,28 @@ impl RemoteStorage for S3Bucket {
|
||||
}
|
||||
}
|
||||
|
||||
/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
|
||||
fn start_counting_cancelled_wait(
|
||||
kind: RequestKind,
|
||||
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
|
||||
scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
|
||||
metrics::BUCKET_METRICS.cancelled_waits.get(kind).inc()
|
||||
})
|
||||
}
|
||||
|
||||
/// On drop (cancellation) add time to [`metrics::BucketMetrics::req_seconds`].
|
||||
fn start_measuring_requests(
|
||||
kind: RequestKind,
|
||||
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
|
||||
scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Cancelled,
|
||||
started_at,
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
// Save RAM and only store the needed data instead of the entire ObjectVersion/DeleteMarkerEntry
|
||||
struct VerOrDelete {
|
||||
kind: VerOrDeleteKind,
|
||||
|
||||
@@ -15,7 +15,6 @@ pub(crate) enum RequestKind {
|
||||
TimeTravel = 5,
|
||||
}
|
||||
|
||||
use scopeguard::ScopeGuard;
|
||||
use RequestKind::*;
|
||||
|
||||
impl RequestKind {
|
||||
@@ -34,10 +33,10 @@ impl RequestKind {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct RequestTyped<C>([C; 6]);
|
||||
pub(super) struct RequestTyped<C>([C; 6]);
|
||||
|
||||
impl<C> RequestTyped<C> {
|
||||
pub(crate) fn get(&self, kind: RequestKind) -> &C {
|
||||
pub(super) fn get(&self, kind: RequestKind) -> &C {
|
||||
&self.0[kind.as_index()]
|
||||
}
|
||||
|
||||
@@ -59,19 +58,19 @@ impl<C> RequestTyped<C> {
|
||||
}
|
||||
|
||||
impl RequestTyped<Histogram> {
|
||||
pub(crate) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
|
||||
pub(super) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
|
||||
self.get(kind).observe(started_at.elapsed().as_secs_f64())
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct PassFailCancelledRequestTyped<C> {
|
||||
pub(super) struct PassFailCancelledRequestTyped<C> {
|
||||
success: RequestTyped<C>,
|
||||
fail: RequestTyped<C>,
|
||||
cancelled: RequestTyped<C>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub(crate) enum AttemptOutcome {
|
||||
pub(super) enum AttemptOutcome {
|
||||
Ok,
|
||||
Err,
|
||||
Cancelled,
|
||||
@@ -87,7 +86,7 @@ impl<T, E> From<&Result<T, E>> for AttemptOutcome {
|
||||
}
|
||||
|
||||
impl AttemptOutcome {
|
||||
pub(crate) fn as_str(&self) -> &'static str {
|
||||
pub(super) fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
AttemptOutcome::Ok => "ok",
|
||||
AttemptOutcome::Err => "err",
|
||||
@@ -97,7 +96,7 @@ impl AttemptOutcome {
|
||||
}
|
||||
|
||||
impl<C> PassFailCancelledRequestTyped<C> {
|
||||
pub(crate) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
|
||||
pub(super) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
|
||||
let target = match outcome {
|
||||
AttemptOutcome::Ok => &self.success,
|
||||
AttemptOutcome::Err => &self.fail,
|
||||
@@ -120,7 +119,7 @@ impl<C> PassFailCancelledRequestTyped<C> {
|
||||
}
|
||||
|
||||
impl PassFailCancelledRequestTyped<Histogram> {
|
||||
pub(crate) fn observe_elapsed(
|
||||
pub(super) fn observe_elapsed(
|
||||
&self,
|
||||
kind: RequestKind,
|
||||
outcome: impl Into<AttemptOutcome>,
|
||||
@@ -131,44 +130,19 @@ impl PassFailCancelledRequestTyped<Histogram> {
|
||||
}
|
||||
}
|
||||
|
||||
/// On drop (cancellation) count towards [`BucketMetrics::cancelled_waits`].
|
||||
pub(crate) fn start_counting_cancelled_wait(
|
||||
kind: RequestKind,
|
||||
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
|
||||
scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.cancelled_waits
|
||||
.get(kind)
|
||||
.inc()
|
||||
})
|
||||
}
|
||||
|
||||
/// On drop (cancellation) add time to [`BucketMetrics::req_seconds`].
|
||||
pub(crate) fn start_measuring_requests(
|
||||
kind: RequestKind,
|
||||
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
|
||||
scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
|
||||
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Cancelled,
|
||||
started_at,
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) struct BucketMetrics {
|
||||
pub(super) struct BucketMetrics {
|
||||
/// Full request duration until successful completion, error or cancellation.
|
||||
pub(crate) req_seconds: PassFailCancelledRequestTyped<Histogram>,
|
||||
pub(super) req_seconds: PassFailCancelledRequestTyped<Histogram>,
|
||||
/// Total amount of seconds waited on queue.
|
||||
pub(crate) wait_seconds: RequestTyped<Histogram>,
|
||||
pub(super) wait_seconds: RequestTyped<Histogram>,
|
||||
|
||||
/// Track how many semaphore awaits were cancelled per request type.
|
||||
///
|
||||
/// This is in case cancellations are happening more than expected.
|
||||
pub(crate) cancelled_waits: RequestTyped<IntCounter>,
|
||||
pub(super) cancelled_waits: RequestTyped<IntCounter>,
|
||||
|
||||
/// Total amount of deleted objects in batches or single requests.
|
||||
pub(crate) deleted_objects_total: IntCounter,
|
||||
pub(super) deleted_objects_total: IntCounter,
|
||||
}
|
||||
|
||||
impl Default for BucketMetrics {
|
||||
@@ -50,9 +50,6 @@ pub struct SkTimelineInfo {
|
||||
pub safekeeper_connstr: Option<String>,
|
||||
#[serde(default)]
|
||||
pub http_connstr: Option<String>,
|
||||
// Minimum of all active RO replicas flush LSN
|
||||
#[serde(default = "lsn_invalid")]
|
||||
pub standby_horizon: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
|
||||
@@ -9,33 +9,6 @@ use serde::{Deserialize, Serialize};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
|
||||
/// Declare a failpoint that can use the `pause` failpoint action.
|
||||
/// We don't want to block the executor thread, hence, spawn_blocking + await.
|
||||
#[macro_export]
|
||||
macro_rules! pausable_failpoint {
|
||||
($name:literal) => {
|
||||
if cfg!(feature = "testing") {
|
||||
tokio::task::spawn_blocking({
|
||||
let current = tracing::Span::current();
|
||||
move || {
|
||||
let _entered = current.entered();
|
||||
tracing::info!("at failpoint {}", $name);
|
||||
fail::fail_point!($name);
|
||||
}
|
||||
})
|
||||
.await
|
||||
.expect("spawn_blocking");
|
||||
}
|
||||
};
|
||||
($name:literal, $cond:expr) => {
|
||||
if cfg!(feature = "testing") {
|
||||
if $cond {
|
||||
pausable_failpoint!($name)
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/// use with fail::cfg("$name", "return(2000)")
|
||||
///
|
||||
/// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
|
||||
|
||||
@@ -19,13 +19,13 @@
|
||||
/// // right: [0x68; 1]
|
||||
/// # fn serialize_something() -> Vec<u8> { "hello world".as_bytes().to_vec() }
|
||||
/// ```
|
||||
pub struct Hex<S>(pub S);
|
||||
#[derive(PartialEq)]
|
||||
pub struct Hex<'a>(pub &'a [u8]);
|
||||
|
||||
impl<S: AsRef<[u8]>> std::fmt::Debug for Hex<S> {
|
||||
impl std::fmt::Debug for Hex<'_> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "[")?;
|
||||
let chunks = self.0.as_ref().chunks(16);
|
||||
for (i, c) in chunks.enumerate() {
|
||||
for (i, c) in self.0.chunks(16).enumerate() {
|
||||
if i > 0 && !c.is_empty() {
|
||||
writeln!(f, ", ")?;
|
||||
}
|
||||
@@ -36,15 +36,6 @@ impl<S: AsRef<[u8]>> std::fmt::Debug for Hex<S> {
|
||||
write!(f, "0x{b:02x}")?;
|
||||
}
|
||||
}
|
||||
write!(f, "; {}]", self.0.as_ref().len())
|
||||
}
|
||||
}
|
||||
|
||||
impl<R: AsRef<[u8]>, L: AsRef<[u8]>> PartialEq<Hex<R>> for Hex<L> {
|
||||
fn eq(&self, other: &Hex<R>) -> bool {
|
||||
let left = self.0.as_ref();
|
||||
let right = other.0.as_ref();
|
||||
|
||||
left == right
|
||||
write!(f, "; {}]", self.0.len())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -135,8 +135,7 @@ impl Gate {
|
||||
let started_at = std::time::Instant::now();
|
||||
let mut do_close = std::pin::pin!(self.do_close());
|
||||
|
||||
// with 1s we rarely saw anything, let's try if we get more gate closing reasons with 100ms
|
||||
let nag_after = Duration::from_millis(100);
|
||||
let nag_after = Duration::from_secs(1);
|
||||
|
||||
let Err(_timeout) = tokio::time::timeout(nag_after, &mut do_close).await else {
|
||||
return;
|
||||
|
||||
@@ -496,9 +496,9 @@ mod tests {
|
||||
// TODO: When updating Postgres versions, this test will cause
|
||||
// problems. Postgres version in message needs updating.
|
||||
//
|
||||
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
|
||||
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160002, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
|
||||
vec![
|
||||
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
|
||||
147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
|
||||
188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
|
||||
|
||||
@@ -30,27 +30,47 @@
|
||||
//! 2024-04-15 on i3en.3xlarge
|
||||
//!
|
||||
//! ```text
|
||||
//! short/1 time: [24.584 µs 24.737 µs 24.922 µs]
|
||||
//! short/2 time: [33.479 µs 33.660 µs 33.888 µs]
|
||||
//! short/4 time: [42.713 µs 43.046 µs 43.440 µs]
|
||||
//! short/8 time: [71.814 µs 72.478 µs 73.240 µs]
|
||||
//! short/16 time: [132.73 µs 134.45 µs 136.22 µs]
|
||||
//! short/32 time: [258.31 µs 260.73 µs 263.27 µs]
|
||||
//! short/64 time: [511.61 µs 514.44 µs 517.51 µs]
|
||||
//! short/128 time: [992.64 µs 998.23 µs 1.0042 ms]
|
||||
//! medium/1 time: [110.11 µs 110.50 µs 110.96 µs]
|
||||
//! medium/2 time: [153.06 µs 153.85 µs 154.99 µs]
|
||||
//! medium/4 time: [317.51 µs 319.92 µs 322.85 µs]
|
||||
//! medium/8 time: [638.30 µs 644.68 µs 652.12 µs]
|
||||
//! medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms]
|
||||
//! medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms]
|
||||
//! medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms]
|
||||
//! medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms]
|
||||
//! async-short/1 time: [24.584 µs 24.737 µs 24.922 µs]
|
||||
//! async-short/2 time: [33.479 µs 33.660 µs 33.888 µs]
|
||||
//! async-short/4 time: [42.713 µs 43.046 µs 43.440 µs]
|
||||
//! async-short/8 time: [71.814 µs 72.478 µs 73.240 µs]
|
||||
//! async-short/16 time: [132.73 µs 134.45 µs 136.22 µs]
|
||||
//! async-short/32 time: [258.31 µs 260.73 µs 263.27 µs]
|
||||
//! async-short/64 time: [511.61 µs 514.44 µs 517.51 µs]
|
||||
//! async-short/128 time: [992.64 µs 998.23 µs 1.0042 ms]
|
||||
//! async-medium/1 time: [110.11 µs 110.50 µs 110.96 µs]
|
||||
//! async-medium/2 time: [153.06 µs 153.85 µs 154.99 µs]
|
||||
//! async-medium/4 time: [317.51 µs 319.92 µs 322.85 µs]
|
||||
//! async-medium/8 time: [638.30 µs 644.68 µs 652.12 µs]
|
||||
//! async-medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms]
|
||||
//! async-medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms]
|
||||
//! async-medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms]
|
||||
//! async-medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms]
|
||||
//! sync-short/1 time: [25.503 µs 25.626 µs 25.771 µs]
|
||||
//! sync-short/2 time: [30.850 µs 31.013 µs 31.208 µs]
|
||||
//! sync-short/4 time: [45.543 µs 45.856 µs 46.193 µs]
|
||||
//! sync-short/8 time: [84.114 µs 84.639 µs 85.220 µs]
|
||||
//! sync-short/16 time: [185.22 µs 186.15 µs 187.13 µs]
|
||||
//! sync-short/32 time: [377.43 µs 378.87 µs 380.46 µs]
|
||||
//! sync-short/64 time: [756.49 µs 759.04 µs 761.70 µs]
|
||||
//! sync-short/128 time: [1.4825 ms 1.4874 ms 1.4923 ms]
|
||||
//! sync-medium/1 time: [105.66 µs 106.01 µs 106.43 µs]
|
||||
//! sync-medium/2 time: [153.10 µs 153.84 µs 154.72 µs]
|
||||
//! sync-medium/4 time: [327.13 µs 329.44 µs 332.27 µs]
|
||||
//! sync-medium/8 time: [654.26 µs 658.73 µs 663.63 µs]
|
||||
//! sync-medium/16 time: [1.2682 ms 1.2748 ms 1.2816 ms]
|
||||
//! sync-medium/32 time: [2.4456 ms 2.4595 ms 2.4731 ms]
|
||||
//! sync-medium/64 time: [4.6523 ms 4.6890 ms 4.7256 ms]
|
||||
//! sync-medium/128 time: [8.7215 ms 8.8323 ms 8.9344 ms]
|
||||
//! ```
|
||||
|
||||
use bytes::{Buf, Bytes};
|
||||
use criterion::{BenchmarkId, Criterion};
|
||||
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
|
||||
use pageserver::{
|
||||
config::PageServerConf,
|
||||
walrecord::NeonWalRecord,
|
||||
walredo::{PostgresRedoManager, ProcessKind},
|
||||
};
|
||||
use pageserver_api::{key::Key, shard::TenantShardId};
|
||||
use std::{
|
||||
sync::Arc,
|
||||
@@ -60,32 +80,39 @@ use tokio::{sync::Barrier, task::JoinSet};
|
||||
use utils::{id::TenantId, lsn::Lsn};
|
||||
|
||||
fn bench(c: &mut Criterion) {
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group("short");
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::short_input());
|
||||
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
|
||||
},
|
||||
);
|
||||
for process_kind in &[ProcessKind::Async, ProcessKind::Sync] {
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group(format!("{process_kind}-short"));
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::short_input());
|
||||
b.iter_custom(|iters| {
|
||||
bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
|
||||
});
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group("medium");
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::medium_input());
|
||||
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
|
||||
},
|
||||
);
|
||||
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group(format!("{process_kind}-medium"));
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::medium_input());
|
||||
b.iter_custom(|iters| {
|
||||
bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
|
||||
});
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -93,10 +120,16 @@ criterion::criterion_group!(benches, bench);
|
||||
criterion::criterion_main!(benches);
|
||||
|
||||
// Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
|
||||
fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
|
||||
fn bench_impl(
|
||||
process_kind: ProcessKind,
|
||||
redo_work: Arc<Request>,
|
||||
n_redos: u64,
|
||||
nclients: u64,
|
||||
) -> Duration {
|
||||
let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
|
||||
|
||||
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
||||
let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
||||
conf.walredo_process_kind = process_kind;
|
||||
let conf = Box::leak(Box::new(conf));
|
||||
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
|
||||
|
||||
@@ -125,13 +158,27 @@ fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration
|
||||
});
|
||||
}
|
||||
|
||||
rt.block_on(async move {
|
||||
let elapsed = rt.block_on(async move {
|
||||
let mut total_wallclock_time = Duration::ZERO;
|
||||
while let Some(res) = tasks.join_next().await {
|
||||
total_wallclock_time += res.unwrap();
|
||||
}
|
||||
total_wallclock_time
|
||||
})
|
||||
});
|
||||
|
||||
// consistency check to ensure process kind setting worked
|
||||
if nredos_per_client > 0 {
|
||||
assert_eq!(
|
||||
manager
|
||||
.status()
|
||||
.process
|
||||
.map(|p| p.kind)
|
||||
.expect("the benchmark work causes a walredo process to be spawned"),
|
||||
std::borrow::Cow::Borrowed(process_kind.into())
|
||||
);
|
||||
}
|
||||
|
||||
elapsed
|
||||
}
|
||||
|
||||
async fn client(
|
||||
|
||||
@@ -1,12 +1,8 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::{models::*, shard::TenantShardId};
|
||||
use reqwest::{IntoUrl, Method, StatusCode};
|
||||
use utils::{
|
||||
http::error::HttpErrorBody,
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
pub mod util;
|
||||
@@ -490,18 +486,6 @@ impl Client {
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn top_tenant_shards(
|
||||
&self,
|
||||
request: TopTenantShardsRequest,
|
||||
) -> Result<TopTenantShardsResponse> {
|
||||
let uri = format!("{}/v1/top_tenants", self.mgmt_api_endpoint);
|
||||
self.request(Method::POST, uri, request)
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn layer_map_info(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
@@ -565,57 +549,4 @@ impl Client {
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn ingest_aux_files(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
aux_files: HashMap<String, String>,
|
||||
) -> Result<bool> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{}/timeline/{}/ingest_aux_files",
|
||||
self.mgmt_api_endpoint, tenant_shard_id, timeline_id
|
||||
);
|
||||
let resp = self
|
||||
.request_noerror(Method::POST, &uri, IngestAuxFilesRequest { aux_files })
|
||||
.await?;
|
||||
match resp.status() {
|
||||
StatusCode::OK => Ok(true),
|
||||
status => Err(match resp.json::<HttpErrorBody>().await {
|
||||
Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
|
||||
Err(_) => {
|
||||
Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
|
||||
}
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn list_aux_files(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<HashMap<String, Bytes>> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{}/timeline/{}/list_aux_files",
|
||||
self.mgmt_api_endpoint, tenant_shard_id, timeline_id
|
||||
);
|
||||
let resp = self
|
||||
.request_noerror(Method::POST, &uri, ListAuxFilesRequest { lsn })
|
||||
.await?;
|
||||
match resp.status() {
|
||||
StatusCode::OK => {
|
||||
let resp: HashMap<String, Bytes> = resp.json().await.map_err(|e| {
|
||||
Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, format!("{e}"))
|
||||
})?;
|
||||
Ok(resp)
|
||||
}
|
||||
status => Err(match resp.json::<HttpErrorBody>().await {
|
||||
Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
|
||||
Err(_) => {
|
||||
Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
|
||||
}
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use clap::{Parser, Subcommand};
|
||||
use pageserver_compaction::helpers::PAGE_SZ;
|
||||
use pageserver_compaction::simulator::MockTimeline;
|
||||
use rand::Rng;
|
||||
use std::io::Write;
|
||||
@@ -52,7 +51,7 @@ async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()>
|
||||
let mut executor = MockTimeline::new();
|
||||
|
||||
// Convert the logical size in MB into a key range.
|
||||
let key_range = 0..((cmd.logical_size * 1024 * 1024) / PAGE_SZ);
|
||||
let key_range = 0..((cmd.logical_size * 1024 * 1024) / 8192);
|
||||
//let key_range = u64::MIN..u64::MAX;
|
||||
println!(
|
||||
"starting simulation with key range {:016X}-{:016X}",
|
||||
|
||||
@@ -25,7 +25,7 @@ use std::collections::{HashSet, VecDeque};
|
||||
use std::ops::Range;
|
||||
|
||||
use crate::helpers::{
|
||||
accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, PAGE_SZ,
|
||||
accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with,
|
||||
};
|
||||
use crate::interface::*;
|
||||
use utils::lsn::Lsn;
|
||||
@@ -379,7 +379,7 @@ where
|
||||
.get_keyspace(&job.key_range, job.lsn_range.end, ctx)
|
||||
.await?,
|
||||
&self.shard_identity,
|
||||
) * PAGE_SZ;
|
||||
) * 8192;
|
||||
|
||||
let wal_size = job
|
||||
.input_layers
|
||||
@@ -441,7 +441,7 @@ where
|
||||
let mut window = KeyspaceWindow::new(
|
||||
E::Key::MIN..E::Key::MAX,
|
||||
keyspace,
|
||||
self.target_file_size / PAGE_SZ,
|
||||
self.target_file_size / 8192,
|
||||
);
|
||||
while let Some(key_range) = window.choose_next_image(&self.shard_identity) {
|
||||
new_jobs.push(CompactionJob::<E> {
|
||||
@@ -530,6 +530,8 @@ where
|
||||
// If we have accumulated only a narrow band of keyspace, create an
|
||||
// image layer. Otherwise write a delta layer.
|
||||
|
||||
// FIXME: deal with the case of lots of values for same key
|
||||
|
||||
// FIXME: we are ignoring images here. Did we already divide the work
|
||||
// so that we won't encounter them here?
|
||||
|
||||
@@ -548,93 +550,38 @@ where
|
||||
let mut new_jobs = Vec::new();
|
||||
|
||||
// Slide a window through the keyspace
|
||||
let mut key_accum =
|
||||
std::pin::pin!(accum_key_values(key_value_stream, self.target_file_size));
|
||||
let mut key_accum = std::pin::pin!(accum_key_values(key_value_stream));
|
||||
let mut all_in_window: bool = false;
|
||||
let mut window = Window::new();
|
||||
|
||||
// Helper function to create a job for a new delta layer with given key-lsn
|
||||
// rectangle.
|
||||
let create_delta_job = |key_range, lsn_range: &Range<Lsn>, new_jobs: &mut Vec<_>| {
|
||||
// The inputs for the job are all the input layers of the original job that
|
||||
// overlap with the rectangle.
|
||||
let batch_layers: Vec<LayerId> = job
|
||||
.input_layers
|
||||
.iter()
|
||||
.filter(|layer_id| {
|
||||
overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range)
|
||||
})
|
||||
.cloned()
|
||||
.collect();
|
||||
assert!(!batch_layers.is_empty());
|
||||
new_jobs.push(CompactionJob {
|
||||
key_range,
|
||||
lsn_range: lsn_range.clone(),
|
||||
strategy: CompactionStrategy::CreateDelta,
|
||||
input_layers: batch_layers,
|
||||
completed: false,
|
||||
});
|
||||
};
|
||||
|
||||
loop {
|
||||
if all_in_window && window.is_empty() {
|
||||
if all_in_window && window.elems.is_empty() {
|
||||
// All done!
|
||||
break;
|
||||
}
|
||||
|
||||
// If we now have enough keyspace for next delta layer in the window, create a
|
||||
// new delta layer
|
||||
if let Some(key_range) = window.choose_next_delta(self.target_file_size, !all_in_window)
|
||||
{
|
||||
create_delta_job(key_range, &job.lsn_range, &mut new_jobs);
|
||||
continue;
|
||||
}
|
||||
assert!(!all_in_window);
|
||||
|
||||
// Process next key in the key space
|
||||
match key_accum.next().await.transpose()? {
|
||||
None => {
|
||||
all_in_window = true;
|
||||
}
|
||||
Some(next_key) if next_key.partition_lsns.is_empty() => {
|
||||
// Normal case: extend the window by the key
|
||||
let batch_layers: Vec<LayerId> = job
|
||||
.input_layers
|
||||
.iter()
|
||||
.filter(|layer_id| {
|
||||
overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range)
|
||||
})
|
||||
.cloned()
|
||||
.collect();
|
||||
assert!(!batch_layers.is_empty());
|
||||
new_jobs.push(CompactionJob {
|
||||
key_range,
|
||||
lsn_range: job.lsn_range.clone(),
|
||||
strategy: CompactionStrategy::CreateDelta,
|
||||
input_layers: batch_layers,
|
||||
completed: false,
|
||||
});
|
||||
} else {
|
||||
assert!(!all_in_window);
|
||||
if let Some(next_key) = key_accum.next().await.transpose()? {
|
||||
window.feed(next_key.key, next_key.size);
|
||||
}
|
||||
Some(next_key) => {
|
||||
// A key with too large size impact for a single delta layer. This
|
||||
// case occurs if you make a huge number of updates for a single key.
|
||||
//
|
||||
// Drain the window with has_more = false to make a clean cut before
|
||||
// the key, and then make dedicated delta layers for the single key.
|
||||
//
|
||||
// We cannot cluster the key with the others, because we don't want
|
||||
// layer files to overlap with each other in the lsn,key space (no
|
||||
// overlaps for the rectangles).
|
||||
let key = next_key.key;
|
||||
debug!("key {key} with size impact larger than the layer size");
|
||||
while !window.is_empty() {
|
||||
let has_more = false;
|
||||
let key_range = window.choose_next_delta(self.target_file_size, has_more)
|
||||
.expect("with has_more==false, choose_next_delta always returns something for a non-empty Window");
|
||||
create_delta_job(key_range, &job.lsn_range, &mut new_jobs);
|
||||
}
|
||||
|
||||
// Not really required: but here for future resilience:
|
||||
// We make a "gap" here, so any structure the window holds should
|
||||
// probably be reset.
|
||||
window = Window::new();
|
||||
|
||||
let mut prior_lsn = job.lsn_range.start;
|
||||
let mut lsn_ranges = Vec::new();
|
||||
for (lsn, _size) in next_key.partition_lsns.iter() {
|
||||
lsn_ranges.push(prior_lsn..*lsn);
|
||||
prior_lsn = *lsn;
|
||||
}
|
||||
lsn_ranges.push(prior_lsn..job.lsn_range.end);
|
||||
for lsn_range in lsn_ranges {
|
||||
let key_range = key..key.next();
|
||||
create_delta_job(key_range, &lsn_range, &mut new_jobs);
|
||||
}
|
||||
} else {
|
||||
all_in_window = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -663,8 +610,8 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
/// Sliding window through keyspace and values for image layer
|
||||
/// This is used by [`LevelCompactionState::cover_with_images`] to decide on good split points
|
||||
// Sliding window through keyspace and values
|
||||
// This is used by over_with_images to decide on good split points
|
||||
struct KeyspaceWindow<K> {
|
||||
head: KeyspaceWindowHead<K>,
|
||||
|
||||
@@ -804,9 +751,9 @@ struct WindowElement<K> {
|
||||
accum_size: u64,
|
||||
}
|
||||
|
||||
/// Sliding window through keyspace and values for delta layer tiling
|
||||
///
|
||||
/// This is used to decide which delta layer to write next.
|
||||
// Sliding window through keyspace and values
|
||||
//
|
||||
// This is used to decide what layer to write next, from the beginning of the window.
|
||||
struct Window<K> {
|
||||
elems: VecDeque<WindowElement<K>>,
|
||||
|
||||
@@ -830,13 +777,11 @@ where
|
||||
fn feed(&mut self, key: K, size: u64) {
|
||||
let last_size;
|
||||
if let Some(last) = self.elems.back_mut() {
|
||||
// We require the keys to be strictly increasing for the window.
|
||||
// Keys should already have been deduplicated by `accum_key_values`
|
||||
assert!(
|
||||
last.last_key < key,
|
||||
"last_key(={}) >= key(={key})",
|
||||
last.last_key
|
||||
);
|
||||
assert!(last.last_key <= key);
|
||||
if key == last.last_key {
|
||||
last.accum_size += size;
|
||||
return;
|
||||
}
|
||||
last_size = last.accum_size;
|
||||
} else {
|
||||
last_size = 0;
|
||||
@@ -858,10 +803,6 @@ where
|
||||
self.elems.front().unwrap().accum_size - self.splitoff_size
|
||||
}
|
||||
|
||||
fn is_empty(&self) -> bool {
|
||||
self.elems.is_empty()
|
||||
}
|
||||
|
||||
fn commit_upto(&mut self, mut upto: usize) {
|
||||
while upto > 1 {
|
||||
let popped = self.elems.pop_front().unwrap();
|
||||
@@ -924,7 +865,7 @@ where
|
||||
// If we're willing to stretch it up to 1.25 target size, could we
|
||||
// gobble up the rest of the work? This avoids creating very small
|
||||
// "tail" layers at the end of the keyspace
|
||||
if !has_more && self.remain_size() < target_size * 5 / 4 {
|
||||
if !has_more && self.remain_size() < target_size * 5 / 3 {
|
||||
self.commit_upto(self.elems.len());
|
||||
} else {
|
||||
let delta_split_at = self.find_size_split(target_size);
|
||||
|
||||
@@ -16,8 +16,6 @@ use std::pin::Pin;
|
||||
use std::task::{ready, Poll};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
pub const PAGE_SZ: u64 = 8192;
|
||||
|
||||
pub fn keyspace_total_size<K>(
|
||||
keyspace: &CompactionKeySpace<K>,
|
||||
shard_identity: &ShardIdentity,
|
||||
@@ -237,14 +235,9 @@ pub struct KeySize<K> {
|
||||
pub key: K,
|
||||
pub num_values: u64,
|
||||
pub size: u64,
|
||||
/// The lsns to partition at (if empty then no per-lsn partitioning)
|
||||
pub partition_lsns: Vec<(Lsn, u64)>,
|
||||
}
|
||||
|
||||
pub fn accum_key_values<'a, I, K, D, E>(
|
||||
input: I,
|
||||
target_size: u64,
|
||||
) -> impl Stream<Item = Result<KeySize<K>, E>>
|
||||
pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream<Item = Result<KeySize<K>, E>>
|
||||
where
|
||||
K: Eq + PartialOrd + Display + Copy,
|
||||
I: Stream<Item = Result<D, E>>,
|
||||
@@ -256,35 +249,25 @@ where
|
||||
|
||||
if let Some(first) = input.next().await {
|
||||
let first = first?;
|
||||
let mut part_size = first.size();
|
||||
let mut accum: KeySize<K> = KeySize {
|
||||
key: first.key(),
|
||||
num_values: 1,
|
||||
size: part_size,
|
||||
partition_lsns: Vec::new(),
|
||||
size: first.size(),
|
||||
};
|
||||
let mut last_key = accum.key;
|
||||
while let Some(this) = input.next().await {
|
||||
let this = this?;
|
||||
if this.key() == accum.key {
|
||||
let add_size = this.size();
|
||||
if part_size + add_size > target_size {
|
||||
accum.partition_lsns.push((this.lsn(), part_size));
|
||||
part_size = 0;
|
||||
}
|
||||
part_size += add_size;
|
||||
accum.size += add_size;
|
||||
accum.size += this.size();
|
||||
accum.num_values += 1;
|
||||
} else {
|
||||
assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key);
|
||||
last_key = accum.key;
|
||||
yield accum;
|
||||
part_size = this.size();
|
||||
accum = KeySize {
|
||||
key: this.key(),
|
||||
num_values: 1,
|
||||
size: part_size,
|
||||
partition_lsns: Vec::new(),
|
||||
size: this.size(),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -184,12 +184,6 @@ impl<L> Level<L> {
|
||||
}
|
||||
let mut events: Vec<Event<K>> = Vec::new();
|
||||
for (idx, l) in self.layers.iter().enumerate() {
|
||||
let key_range = l.key_range();
|
||||
if key_range.end == key_range.start.next() && l.is_delta() {
|
||||
// Ignore single-key delta layers as they can be stacked on top of each other
|
||||
// as that is the only way to cut further.
|
||||
continue;
|
||||
}
|
||||
events.push(Event {
|
||||
key: l.key_range().start,
|
||||
layer_idx: idx,
|
||||
|
||||
@@ -14,7 +14,6 @@ use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use std::sync::Mutex;
|
||||
|
||||
use crate::helpers::PAGE_SZ;
|
||||
use crate::helpers::{merge_delta_keys, overlaps_with};
|
||||
|
||||
use crate::interface;
|
||||
@@ -380,8 +379,8 @@ impl interface::CompactionLayer<Key> for MockLayer {
|
||||
}
|
||||
fn file_size(&self) -> u64 {
|
||||
match self {
|
||||
MockLayer::Delta(this) => this.file_size,
|
||||
MockLayer::Image(this) => this.file_size,
|
||||
MockLayer::Delta(this) => this.file_size(),
|
||||
MockLayer::Image(this) => this.file_size(),
|
||||
}
|
||||
}
|
||||
fn short_id(&self) -> String {
|
||||
@@ -510,7 +509,7 @@ impl interface::CompactionJobExecutor for MockTimeline {
|
||||
let new_layer = Arc::new(MockImageLayer {
|
||||
key_range: key_range.clone(),
|
||||
lsn_range: lsn..lsn,
|
||||
file_size: accum_size * PAGE_SZ,
|
||||
file_size: accum_size * 8192,
|
||||
deleted: Mutex::new(false),
|
||||
});
|
||||
info!(
|
||||
|
||||
@@ -20,6 +20,10 @@ pub(crate) fn setup_logging() {
|
||||
/// even if we produce an extremely narrow delta layer, spanning just that one
|
||||
/// key, we still too many records to fit in the target file size. We need to
|
||||
/// split in the LSN dimension too in that case.
|
||||
///
|
||||
/// TODO: The code to avoid this problem has not been implemented yet! So the
|
||||
/// assertion currently fails, but we need to make it not fail.
|
||||
#[ignore]
|
||||
#[tokio::test]
|
||||
async fn test_many_updates_for_single_key() {
|
||||
setup_logging();
|
||||
@@ -39,9 +43,9 @@ async fn test_many_updates_for_single_key() {
|
||||
}
|
||||
for l in executor.live_layers.iter() {
|
||||
assert!(l.file_size() < executor.target_file_size * 2);
|
||||
// Sanity check that none of the delta layers are empty either.
|
||||
// sanity check that none of the delta layers are stupidly small either
|
||||
if l.is_delta() {
|
||||
assert!(l.file_size() > 0);
|
||||
assert!(l.file_size() > executor.target_file_size / 2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,7 +17,6 @@ pageserver = { path = ".." }
|
||||
pageserver_api.workspace = true
|
||||
remote_storage = { path = "../../libs/remote_storage" }
|
||||
postgres_ffi.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-util.workspace = true
|
||||
toml_edit.workspace = true
|
||||
|
||||
@@ -52,6 +52,7 @@
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use pageserver::repository::Key;
|
||||
use pageserver::METADATA_FILE_NAME;
|
||||
use std::cmp::Ordering;
|
||||
use std::io::{self, BufRead};
|
||||
use std::path::PathBuf;
|
||||
@@ -82,11 +83,6 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
|
||||
let split: Vec<&str> = name.split("__").collect();
|
||||
let keys: Vec<&str> = split[0].split('-').collect();
|
||||
let mut lsns: Vec<&str> = split[1].split('-').collect();
|
||||
|
||||
if lsns.last().expect("should").len() == 8 {
|
||||
lsns.pop();
|
||||
}
|
||||
|
||||
if lsns.len() == 1 {
|
||||
lsns.push(lsns[0]);
|
||||
}
|
||||
@@ -158,6 +154,10 @@ pub fn main() -> Result<()> {
|
||||
let line = PathBuf::from_str(&line).unwrap();
|
||||
let filename = line.file_name().unwrap();
|
||||
let filename = filename.to_str().unwrap();
|
||||
if filename == METADATA_FILE_NAME {
|
||||
// Don't try and parse "metadata" like a key-lsn range
|
||||
continue;
|
||||
}
|
||||
let (key_range, lsn_range) = parse_filename(filename);
|
||||
files.push(Layer {
|
||||
filename: filename.to_owned(),
|
||||
|
||||
@@ -2,7 +2,7 @@ use std::collections::HashMap;
|
||||
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
|
||||
use pageserver::tenant::storage_layer::LayerName;
|
||||
use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
|
||||
use utils::lsn::Lsn;
|
||||
@@ -19,7 +19,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
|
||||
let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
|
||||
#[derive(serde::Serialize)]
|
||||
struct Output<'a> {
|
||||
layer_metadata: &'a HashMap<LayerName, LayerFileMetadata>,
|
||||
layer_metadata: &'a HashMap<LayerName, IndexLayerMetadata>,
|
||||
disk_consistent_lsn: Lsn,
|
||||
timeline_metadata: &'a TimelineMetadata,
|
||||
}
|
||||
|
||||
@@ -1,477 +0,0 @@
|
||||
use anyhow::Context;
|
||||
use clap::Parser;
|
||||
use pageserver_api::{
|
||||
key::Key,
|
||||
reltag::{BlockNumber, RelTag, SlruKind},
|
||||
shard::{ShardCount, ShardStripeSize},
|
||||
};
|
||||
use std::str::FromStr;
|
||||
|
||||
#[derive(Parser)]
|
||||
pub(super) struct DescribeKeyCommand {
|
||||
/// Key material in one of the forms: hex, span attributes captured from log, reltag blocknum
|
||||
input: Vec<String>,
|
||||
|
||||
/// The number of shards to calculate what Keys placement would be.
|
||||
#[arg(long)]
|
||||
shard_count: Option<CustomShardCount>,
|
||||
|
||||
/// The sharding stripe size.
|
||||
///
|
||||
/// The default is hardcoded. It makes no sense to provide this without providing
|
||||
/// `--shard-count`.
|
||||
#[arg(long, requires = "shard_count")]
|
||||
stripe_size: Option<u32>,
|
||||
}
|
||||
|
||||
/// Sharded shard count without unsharded count, which the actual ShardCount supports.
|
||||
#[derive(Clone, Copy)]
|
||||
pub(super) struct CustomShardCount(std::num::NonZeroU8);
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(super) enum InvalidShardCount {
|
||||
#[error(transparent)]
|
||||
ParsingFailed(#[from] std::num::ParseIntError),
|
||||
#[error("too few shards")]
|
||||
TooFewShards,
|
||||
}
|
||||
|
||||
impl FromStr for CustomShardCount {
|
||||
type Err = InvalidShardCount;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
let inner: std::num::NonZeroU8 = s.parse()?;
|
||||
if inner.get() < 2 {
|
||||
Err(InvalidShardCount::TooFewShards)
|
||||
} else {
|
||||
Ok(CustomShardCount(inner))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<CustomShardCount> for ShardCount {
|
||||
fn from(value: CustomShardCount) -> Self {
|
||||
ShardCount::new(value.0.get())
|
||||
}
|
||||
}
|
||||
|
||||
impl DescribeKeyCommand {
|
||||
pub(super) fn execute(self) {
|
||||
let DescribeKeyCommand {
|
||||
input,
|
||||
shard_count,
|
||||
stripe_size,
|
||||
} = self;
|
||||
|
||||
let material = KeyMaterial::try_from(input.as_slice()).unwrap();
|
||||
let kind = material.kind();
|
||||
let key = Key::from(material);
|
||||
|
||||
println!("parsed from {kind}: {key}:");
|
||||
println!();
|
||||
println!("{key:?}");
|
||||
|
||||
macro_rules! kind_query {
|
||||
($name:ident) => {{
|
||||
let s: &'static str = stringify!($name);
|
||||
let s = s.strip_prefix("is_").unwrap_or(s);
|
||||
let s = s.strip_suffix("_key").unwrap_or(s);
|
||||
|
||||
#[allow(clippy::needless_borrow)]
|
||||
(s, pageserver_api::key::$name(key))
|
||||
}};
|
||||
}
|
||||
|
||||
// the current characterization is a mess of these boolean queries and separate
|
||||
// "recognization". I think it accurately represents how strictly we model the Key
|
||||
// right now, but could of course be made less confusing.
|
||||
|
||||
let queries = [
|
||||
("rel_block", pageserver_api::key::is_rel_block_key(&key)),
|
||||
kind_query!(is_rel_vm_block_key),
|
||||
kind_query!(is_rel_fsm_block_key),
|
||||
kind_query!(is_slru_block_key),
|
||||
kind_query!(is_inherited_key),
|
||||
("rel_size", pageserver_api::key::is_rel_size_key(&key)),
|
||||
(
|
||||
"slru_segment_size",
|
||||
pageserver_api::key::is_slru_segment_size_key(&key),
|
||||
),
|
||||
];
|
||||
|
||||
let recognized_kind = "recognized kind";
|
||||
let metadata_key = "metadata key";
|
||||
let shard_placement = "shard placement";
|
||||
|
||||
let longest = queries
|
||||
.iter()
|
||||
.map(|t| t.0)
|
||||
.chain([recognized_kind, metadata_key, shard_placement])
|
||||
.map(|s| s.len())
|
||||
.max()
|
||||
.unwrap();
|
||||
|
||||
let colon = 1;
|
||||
let padding = 1;
|
||||
|
||||
for (name, is) in queries {
|
||||
let width = longest - name.len() + colon + padding;
|
||||
println!("{}{:width$}{}", name, ":", is);
|
||||
}
|
||||
|
||||
let width = longest - recognized_kind.len() + colon + padding;
|
||||
println!(
|
||||
"{}{:width$}{:?}",
|
||||
recognized_kind,
|
||||
":",
|
||||
RecognizedKeyKind::new(key),
|
||||
);
|
||||
|
||||
if let Some(shard_count) = shard_count {
|
||||
// seeing the sharding placement might be confusing, so leave it out unless shard
|
||||
// count was given.
|
||||
|
||||
let stripe_size = stripe_size.map(ShardStripeSize).unwrap_or_default();
|
||||
println!(
|
||||
"# placement with shard_count: {} and stripe_size: {}:",
|
||||
shard_count.0, stripe_size.0
|
||||
);
|
||||
let width = longest - shard_placement.len() + colon + padding;
|
||||
println!(
|
||||
"{}{:width$}{:?}",
|
||||
shard_placement,
|
||||
":",
|
||||
pageserver_api::shard::describe(&key, shard_count.into(), stripe_size)
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Hand-wavy "inputs we accept" for a key.
|
||||
#[derive(Debug)]
|
||||
pub(super) enum KeyMaterial {
|
||||
Hex(Key),
|
||||
String(SpanAttributesFromLogs),
|
||||
Split(RelTag, BlockNumber),
|
||||
}
|
||||
|
||||
impl KeyMaterial {
|
||||
fn kind(&self) -> &'static str {
|
||||
match self {
|
||||
KeyMaterial::Hex(_) => "hex",
|
||||
KeyMaterial::String(_) | KeyMaterial::Split(_, _) => "split",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<KeyMaterial> for Key {
|
||||
fn from(value: KeyMaterial) -> Self {
|
||||
match value {
|
||||
KeyMaterial::Hex(key) => key,
|
||||
KeyMaterial::String(SpanAttributesFromLogs(rt, blocknum))
|
||||
| KeyMaterial::Split(rt, blocknum) => {
|
||||
pageserver_api::key::rel_block_to_key(rt, blocknum)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: AsRef<str>> TryFrom<&[S]> for KeyMaterial {
|
||||
type Error = anyhow::Error;
|
||||
|
||||
fn try_from(value: &[S]) -> Result<Self, Self::Error> {
|
||||
match value {
|
||||
[] => anyhow::bail!(
|
||||
"need 1..N positional arguments describing the key, try hex or a log line"
|
||||
),
|
||||
[one] => {
|
||||
let one = one.as_ref();
|
||||
|
||||
let key = Key::from_hex(one).map(KeyMaterial::Hex);
|
||||
|
||||
let attrs = SpanAttributesFromLogs::from_str(one).map(KeyMaterial::String);
|
||||
|
||||
match (key, attrs) {
|
||||
(Ok(key), _) => Ok(key),
|
||||
(_, Ok(s)) => Ok(s),
|
||||
(Err(e1), Err(e2)) => anyhow::bail!(
|
||||
"failed to parse {one:?} as hex or span attributes:\n- {e1:#}\n- {e2:#}"
|
||||
),
|
||||
}
|
||||
}
|
||||
more => {
|
||||
// assume going left to right one of these is a reltag and then we find a blocknum
|
||||
// this works, because we don't have plain numbers at least right after reltag in
|
||||
// logs. for some definition of "works".
|
||||
|
||||
let Some((reltag_at, reltag)) = more
|
||||
.iter()
|
||||
.map(AsRef::as_ref)
|
||||
.enumerate()
|
||||
.find_map(|(i, s)| {
|
||||
s.split_once("rel=")
|
||||
.map(|(_garbage, actual)| actual)
|
||||
.unwrap_or(s)
|
||||
.parse::<RelTag>()
|
||||
.ok()
|
||||
.map(|rt| (i, rt))
|
||||
})
|
||||
else {
|
||||
anyhow::bail!("found no RelTag in arguments");
|
||||
};
|
||||
|
||||
let Some(blocknum) = more
|
||||
.iter()
|
||||
.map(AsRef::as_ref)
|
||||
.skip(reltag_at)
|
||||
.find_map(|s| {
|
||||
s.split_once("blkno=")
|
||||
.map(|(_garbage, actual)| actual)
|
||||
.unwrap_or(s)
|
||||
.parse::<BlockNumber>()
|
||||
.ok()
|
||||
})
|
||||
else {
|
||||
anyhow::bail!("found no blocknum in arguments");
|
||||
};
|
||||
|
||||
Ok(KeyMaterial::Split(reltag, blocknum))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(super) struct SpanAttributesFromLogs(RelTag, BlockNumber);
|
||||
|
||||
impl std::str::FromStr for SpanAttributesFromLogs {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
// accept the span separator but do not require or fail if either is missing
|
||||
// "whatever{rel=1663/16389/24615 blkno=1052204 req_lsn=FFFFFFFF/FFFFFFFF}"
|
||||
let (_, reltag) = s
|
||||
.split_once("rel=")
|
||||
.ok_or_else(|| anyhow::anyhow!("cannot find 'rel='"))?;
|
||||
let reltag = reltag.split_whitespace().next().unwrap();
|
||||
|
||||
let (_, blocknum) = s
|
||||
.split_once("blkno=")
|
||||
.ok_or_else(|| anyhow::anyhow!("cannot find 'blkno='"))?;
|
||||
let blocknum = blocknum.split_whitespace().next().unwrap();
|
||||
|
||||
let reltag = reltag
|
||||
.parse()
|
||||
.with_context(|| format!("parse reltag from {reltag:?}"))?;
|
||||
let blocknum = blocknum
|
||||
.parse()
|
||||
.with_context(|| format!("parse blocknum from {blocknum:?}"))?;
|
||||
|
||||
Ok(Self(reltag, blocknum))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
#[allow(dead_code)] // debug print is used
|
||||
enum RecognizedKeyKind {
|
||||
DbDir,
|
||||
ControlFile,
|
||||
Checkpoint,
|
||||
AuxFilesV1,
|
||||
SlruDir(Result<SlruKind, u32>),
|
||||
RelMap(RelTagish<2>),
|
||||
RelDir(RelTagish<2>),
|
||||
AuxFileV2(Result<AuxFileV2, utils::Hex<[u8; 16]>>),
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
#[allow(unused)]
|
||||
enum AuxFileV2 {
|
||||
Recognized(&'static str, utils::Hex<[u8; 13]>),
|
||||
OtherWithPrefix(&'static str, utils::Hex<[u8; 13]>),
|
||||
Other(utils::Hex<[u8; 13]>),
|
||||
}
|
||||
|
||||
impl RecognizedKeyKind {
|
||||
fn new(key: Key) -> Option<Self> {
|
||||
use RecognizedKeyKind::{
|
||||
AuxFilesV1, Checkpoint, ControlFile, DbDir, RelDir, RelMap, SlruDir,
|
||||
};
|
||||
|
||||
let slru_dir_kind = pageserver_api::key::slru_dir_kind(&key);
|
||||
|
||||
Some(match key {
|
||||
pageserver_api::key::DBDIR_KEY => DbDir,
|
||||
pageserver_api::key::CONTROLFILE_KEY => ControlFile,
|
||||
pageserver_api::key::CHECKPOINT_KEY => Checkpoint,
|
||||
pageserver_api::key::AUX_FILES_KEY => AuxFilesV1,
|
||||
_ if slru_dir_kind.is_some() => SlruDir(slru_dir_kind.unwrap()),
|
||||
_ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 0 => {
|
||||
RelMap([key.field2, key.field3].into())
|
||||
}
|
||||
_ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 1 => {
|
||||
RelDir([key.field2, key.field3].into())
|
||||
}
|
||||
_ if key.is_metadata_key() => RecognizedKeyKind::AuxFileV2(
|
||||
AuxFileV2::new(key).ok_or_else(|| utils::Hex(key.to_i128().to_be_bytes())),
|
||||
),
|
||||
_ => return None,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl AuxFileV2 {
|
||||
fn new(key: Key) -> Option<AuxFileV2> {
|
||||
const EMPTY_HASH: [u8; 13] = {
|
||||
let mut out = [0u8; 13];
|
||||
let hash = pageserver::aux_file::fnv_hash(b"").to_be_bytes();
|
||||
let mut i = 3;
|
||||
while i < 16 {
|
||||
out[i - 3] = hash[i];
|
||||
i += 1;
|
||||
}
|
||||
out
|
||||
};
|
||||
|
||||
let bytes = key.to_i128().to_be_bytes();
|
||||
let hash = utils::Hex(<[u8; 13]>::try_from(&bytes[3..]).unwrap());
|
||||
|
||||
assert_eq!(EMPTY_HASH.len(), hash.0.len());
|
||||
|
||||
// TODO: we could probably find the preimages for the hashes
|
||||
|
||||
Some(match (bytes[1], bytes[2]) {
|
||||
(1, 1) => AuxFileV2::Recognized("pg_logical/mappings/", hash),
|
||||
(1, 2) => AuxFileV2::Recognized("pg_logical/snapshots/", hash),
|
||||
(1, 3) if hash.0 == EMPTY_HASH => {
|
||||
AuxFileV2::Recognized("pg_logical/replorigin_checkpoint", hash)
|
||||
}
|
||||
(2, 1) => AuxFileV2::Recognized("pg_replslot/", hash),
|
||||
(1, 0xff) => AuxFileV2::OtherWithPrefix("pg_logical/", hash),
|
||||
(0xff, 0xff) => AuxFileV2::Other(hash),
|
||||
_ => return None,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Prefix of RelTag, currently only known use cases are the two item versions.
|
||||
///
|
||||
/// Renders like a reltag with `/`, nothing else.
|
||||
struct RelTagish<const N: usize>([u32; N]);
|
||||
|
||||
impl<const N: usize> From<[u32; N]> for RelTagish<N> {
|
||||
fn from(val: [u32; N]) -> Self {
|
||||
RelTagish(val)
|
||||
}
|
||||
}
|
||||
|
||||
impl<const N: usize> std::fmt::Debug for RelTagish<N> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
use std::fmt::Write as _;
|
||||
let mut first = true;
|
||||
self.0.iter().try_for_each(|x| {
|
||||
if !first {
|
||||
f.write_char('/')?;
|
||||
}
|
||||
first = false;
|
||||
write!(f, "{}", x)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use pageserver::aux_file::encode_aux_file_key;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn hex_is_key_material() {
|
||||
let m = KeyMaterial::try_from(&["000000067F0000400200DF927900FFFFFFFF"][..]).unwrap();
|
||||
assert!(matches!(m, KeyMaterial::Hex(_)), "{m:?}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn single_positional_spanalike_is_key_material() {
|
||||
// why is this needed? if you are checking many, then copypaste starts to appeal
|
||||
let strings = [
|
||||
(line!(), "2024-05-15T15:33:49.873906Z ERROR page_service_conn_main{peer_addr=A:B}:process_query{tenant_id=C timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm blkno=2 req_lsn=0/238D98C8}: error reading relation or page version: Read error: could not find data for key 000000067F00032CE5000000000000000001 (shard ShardNumber(0)) at LSN 0/1D0A16C1, request LSN 0/238D98C8, ancestor 0/0"),
|
||||
(line!(), "rel=1663/208101/2620_fsm blkno=2"),
|
||||
(line!(), "rel=1663/208101/2620.1 blkno=2"),
|
||||
];
|
||||
|
||||
let mut first: Option<Key> = None;
|
||||
|
||||
for (line, example) in strings {
|
||||
let m = KeyMaterial::try_from(&[example][..])
|
||||
.unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}"));
|
||||
let key = Key::from(m);
|
||||
if let Some(first) = first {
|
||||
assert_eq!(first, key);
|
||||
} else {
|
||||
first = Some(key);
|
||||
}
|
||||
}
|
||||
|
||||
// not supporting this is rather accidential, but I think the input parsing is lenient
|
||||
// enough already
|
||||
KeyMaterial::try_from(&["1663/208101/2620_fsm 2"][..]).unwrap_err();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn multiple_spanlike_args() {
|
||||
let strings = [
|
||||
(line!(), &["process_query{tenant_id=C", "timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm", "blkno=2", "req_lsn=0/238D98C8}"][..]),
|
||||
(line!(), &["rel=1663/208101/2620_fsm", "blkno=2"][..]),
|
||||
(line!(), &["1663/208101/2620_fsm", "2"][..]),
|
||||
];
|
||||
|
||||
let mut first: Option<Key> = None;
|
||||
|
||||
for (line, example) in strings {
|
||||
let m = KeyMaterial::try_from(example)
|
||||
.unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}"));
|
||||
let key = Key::from(m);
|
||||
if let Some(first) = first {
|
||||
assert_eq!(first, key);
|
||||
} else {
|
||||
first = Some(key);
|
||||
}
|
||||
}
|
||||
}
|
||||
#[test]
|
||||
fn recognized_auxfiles() {
|
||||
use AuxFileV2::*;
|
||||
|
||||
let empty = [
|
||||
0x2e, 0x07, 0xbb, 0x01, 0x42, 0x62, 0xb8, 0x21, 0x75, 0x62, 0x95, 0xc5, 0x8d,
|
||||
];
|
||||
let foobar = [
|
||||
0x62, 0x79, 0x3c, 0x64, 0xbf, 0x6f, 0x0d, 0x35, 0x97, 0xba, 0x44, 0x6f, 0x18,
|
||||
];
|
||||
|
||||
#[rustfmt::skip]
|
||||
let examples = [
|
||||
(line!(), "pg_logical/mappings/foobar", Recognized("pg_logical/mappings/", utils::Hex(foobar))),
|
||||
(line!(), "pg_logical/snapshots/foobar", Recognized("pg_logical/snapshots/", utils::Hex(foobar))),
|
||||
(line!(), "pg_logical/replorigin_checkpoint", Recognized("pg_logical/replorigin_checkpoint", utils::Hex(empty))),
|
||||
(line!(), "pg_logical/foobar", OtherWithPrefix("pg_logical/", utils::Hex(foobar))),
|
||||
(line!(), "pg_replslot/foobar", Recognized("pg_replslot/", utils::Hex(foobar))),
|
||||
(line!(), "foobar", Other(utils::Hex(foobar))),
|
||||
];
|
||||
|
||||
for (line, path, expected) in examples {
|
||||
let key = encode_aux_file_key(path);
|
||||
let recognized =
|
||||
AuxFileV2::new(key).unwrap_or_else(|| panic!("line {line} example failed"));
|
||||
|
||||
assert_eq!(recognized, expected);
|
||||
}
|
||||
|
||||
assert_eq!(
|
||||
AuxFileV2::new(Key::from_hex("600000102000000000000000000000000000").unwrap()),
|
||||
None,
|
||||
"example key has one too few 0 after 6 before 1"
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -6,7 +6,6 @@
|
||||
|
||||
mod draw_timeline_dir;
|
||||
mod index_part;
|
||||
mod key;
|
||||
mod layer_map_analyzer;
|
||||
mod layers;
|
||||
|
||||
@@ -62,8 +61,6 @@ enum Commands {
|
||||
AnalyzeLayerMap(AnalyzeLayerMapCmd),
|
||||
#[command(subcommand)]
|
||||
Layer(LayerCmd),
|
||||
/// Debug print a hex key found from logs
|
||||
Key(key::DescribeKeyCommand),
|
||||
}
|
||||
|
||||
/// Read and update pageserver metadata file
|
||||
@@ -186,7 +183,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
.time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel)
|
||||
.await?;
|
||||
}
|
||||
Commands::Key(dkc) => dkc.execute(),
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
@@ -223,7 +219,6 @@ fn handle_metadata(
|
||||
let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
|
||||
println!("Current metadata:\n{meta:?}");
|
||||
let mut update_meta = false;
|
||||
// TODO: simplify this part
|
||||
if let Some(disk_consistent_lsn) = disk_consistent_lsn {
|
||||
meta = TimelineMetadata::new(
|
||||
*disk_consistent_lsn,
|
||||
|
||||
@@ -1,105 +0,0 @@
|
||||
use pageserver_api::models::{AuxFilePolicy, TenantConfig, TenantConfigRequest};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
|
||||
/// Ingest aux files into the pageserver.
|
||||
#[derive(clap::Parser)]
|
||||
pub(crate) struct Args {
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
mgmt_api_endpoint: String,
|
||||
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
|
||||
page_service_connstring: String,
|
||||
#[clap(long)]
|
||||
pageserver_jwt: Option<String>,
|
||||
|
||||
targets: Option<Vec<TenantTimelineId>>,
|
||||
}
|
||||
|
||||
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
|
||||
let rt = tokio::runtime::Builder::new_multi_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let main_task = rt.spawn(main_impl(args));
|
||||
rt.block_on(main_task).unwrap()
|
||||
}
|
||||
|
||||
async fn main_impl(args: Args) -> anyhow::Result<()> {
|
||||
let args: &'static Args = Box::leak(Box::new(args));
|
||||
|
||||
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
|
||||
args.mgmt_api_endpoint.clone(),
|
||||
args.pageserver_jwt.as_deref(),
|
||||
));
|
||||
|
||||
// discover targets
|
||||
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
||||
&mgmt_api_client,
|
||||
crate::util::cli::targets::Spec {
|
||||
limit_to_first_n_targets: None,
|
||||
targets: {
|
||||
if let Some(targets) = &args.targets {
|
||||
if targets.len() != 1 {
|
||||
anyhow::bail!("must specify exactly one target");
|
||||
}
|
||||
Some(targets.clone())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
},
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
let timeline = timelines[0];
|
||||
let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
|
||||
let timeline_id = timeline.timeline_id;
|
||||
|
||||
println!("operating on timeline {}", timeline);
|
||||
|
||||
mgmt_api_client
|
||||
.tenant_config(&TenantConfigRequest {
|
||||
tenant_id: timeline.tenant_id,
|
||||
config: TenantConfig {
|
||||
switch_aux_file_policy: Some(AuxFilePolicy::V2),
|
||||
..Default::default()
|
||||
},
|
||||
})
|
||||
.await?;
|
||||
|
||||
for batch in 0..100 {
|
||||
let items = (0..100)
|
||||
.map(|id| {
|
||||
(
|
||||
format!("pg_logical/mappings/{:03}.{:03}", batch, id),
|
||||
format!("{:08}", id),
|
||||
)
|
||||
})
|
||||
.collect::<HashMap<_, _>>();
|
||||
let file_cnt = items.len();
|
||||
mgmt_api_client
|
||||
.ingest_aux_files(tenant_shard_id, timeline_id, items)
|
||||
.await?;
|
||||
println!("ingested {file_cnt} files");
|
||||
}
|
||||
|
||||
for _ in 0..100 {
|
||||
let start = Instant::now();
|
||||
let files = mgmt_api_client
|
||||
.list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
|
||||
.await?;
|
||||
println!(
|
||||
"{} files found in {}s",
|
||||
files.len(),
|
||||
start.elapsed().as_secs_f64()
|
||||
);
|
||||
}
|
||||
|
||||
anyhow::Ok(())
|
||||
}
|
||||
@@ -2,11 +2,9 @@ use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId};
|
||||
|
||||
use pageserver_client::mgmt_api;
|
||||
use rand::seq::SliceRandom;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, info};
|
||||
use utils::id::{TenantTimelineId, TimelineId};
|
||||
|
||||
use std::{f64, sync::Arc};
|
||||
use tokio::{
|
||||
sync::{mpsc, OwnedSemaphorePermit},
|
||||
task::JoinSet,
|
||||
@@ -14,7 +12,10 @@ use tokio::{
|
||||
|
||||
use std::{
|
||||
num::NonZeroUsize,
|
||||
sync::atomic::{AtomicU64, Ordering},
|
||||
sync::{
|
||||
atomic::{AtomicU64, Ordering},
|
||||
Arc,
|
||||
},
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
@@ -50,31 +51,19 @@ pub(crate) fn main(args: Args) -> anyhow::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
struct Output {
|
||||
downloads_count: u64,
|
||||
downloads_bytes: u64,
|
||||
evictions_count: u64,
|
||||
timeline_restarts: u64,
|
||||
#[serde(with = "humantime_serde")]
|
||||
runtime: Duration,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
struct LiveStats {
|
||||
evictions_count: AtomicU64,
|
||||
downloads_count: AtomicU64,
|
||||
downloads_bytes: AtomicU64,
|
||||
evictions: AtomicU64,
|
||||
downloads: AtomicU64,
|
||||
timeline_restarts: AtomicU64,
|
||||
}
|
||||
|
||||
impl LiveStats {
|
||||
fn eviction_done(&self) {
|
||||
self.evictions_count.fetch_add(1, Ordering::Relaxed);
|
||||
self.evictions.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
fn download_done(&self, size: u64) {
|
||||
self.downloads_count.fetch_add(1, Ordering::Relaxed);
|
||||
self.downloads_bytes.fetch_add(size, Ordering::Relaxed);
|
||||
fn download_done(&self) {
|
||||
self.downloads.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
fn timeline_restart_done(&self) {
|
||||
self.timeline_restarts.fetch_add(1, Ordering::Relaxed);
|
||||
@@ -103,49 +92,28 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
|
||||
)
|
||||
.await?;
|
||||
|
||||
let token = CancellationToken::new();
|
||||
let mut tasks = JoinSet::new();
|
||||
|
||||
let periodic_stats = Arc::new(LiveStats::default());
|
||||
let total_stats = Arc::new(LiveStats::default());
|
||||
|
||||
let start = Instant::now();
|
||||
let live_stats = Arc::new(LiveStats::default());
|
||||
tasks.spawn({
|
||||
let periodic_stats = Arc::clone(&periodic_stats);
|
||||
let total_stats = Arc::clone(&total_stats);
|
||||
let cloned_token = token.clone();
|
||||
let live_stats = Arc::clone(&live_stats);
|
||||
async move {
|
||||
let mut last_at = Instant::now();
|
||||
loop {
|
||||
if cloned_token.is_cancelled() {
|
||||
return;
|
||||
}
|
||||
tokio::time::sleep_until((last_at + Duration::from_secs(1)).into()).await;
|
||||
let now = Instant::now();
|
||||
let delta: Duration = now - last_at;
|
||||
last_at = now;
|
||||
|
||||
let LiveStats {
|
||||
evictions_count,
|
||||
downloads_count,
|
||||
downloads_bytes,
|
||||
evictions,
|
||||
downloads,
|
||||
timeline_restarts,
|
||||
} = &*periodic_stats;
|
||||
let evictions_count = evictions_count.swap(0, Ordering::Relaxed);
|
||||
let downloads_count = downloads_count.swap(0, Ordering::Relaxed);
|
||||
let downloads_bytes = downloads_bytes.swap(0, Ordering::Relaxed);
|
||||
} = &*live_stats;
|
||||
let evictions = evictions.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
|
||||
let downloads = downloads.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
|
||||
let timeline_restarts = timeline_restarts.swap(0, Ordering::Relaxed);
|
||||
|
||||
total_stats.evictions_count.fetch_add(evictions_count, Ordering::Relaxed);
|
||||
total_stats.downloads_count.fetch_add(downloads_count, Ordering::Relaxed);
|
||||
total_stats.downloads_bytes.fetch_add(downloads_bytes, Ordering::Relaxed);
|
||||
total_stats.timeline_restarts.fetch_add(timeline_restarts, Ordering::Relaxed);
|
||||
|
||||
let evictions_per_s = evictions_count as f64 / delta.as_secs_f64();
|
||||
let downloads_per_s = downloads_count as f64 / delta.as_secs_f64();
|
||||
let downloads_mibs_per_s = downloads_bytes as f64 / delta.as_secs_f64() / ((1 << 20) as f64);
|
||||
|
||||
info!("evictions={evictions_per_s:.2}/s downloads={downloads_per_s:.2}/s download_bytes={downloads_mibs_per_s:.2}MiB/s timeline_restarts={timeline_restarts}");
|
||||
info!("evictions={evictions:.2}/s downloads={downloads:.2}/s timeline_restarts={timeline_restarts}");
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -156,42 +124,14 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
|
||||
args,
|
||||
Arc::clone(&mgmt_api_client),
|
||||
tl,
|
||||
Arc::clone(&periodic_stats),
|
||||
token.clone(),
|
||||
Arc::clone(&live_stats),
|
||||
));
|
||||
}
|
||||
}
|
||||
if let Some(runtime) = args.runtime {
|
||||
tokio::spawn(async move {
|
||||
tokio::time::sleep(runtime.into()).await;
|
||||
token.cancel();
|
||||
});
|
||||
}
|
||||
|
||||
while let Some(res) = tasks.join_next().await {
|
||||
res.unwrap();
|
||||
}
|
||||
let end = Instant::now();
|
||||
let duration: Duration = end - start;
|
||||
|
||||
let output = {
|
||||
let LiveStats {
|
||||
evictions_count,
|
||||
downloads_count,
|
||||
downloads_bytes,
|
||||
timeline_restarts,
|
||||
} = &*total_stats;
|
||||
Output {
|
||||
downloads_count: downloads_count.load(Ordering::Relaxed),
|
||||
downloads_bytes: downloads_bytes.load(Ordering::Relaxed),
|
||||
evictions_count: evictions_count.load(Ordering::Relaxed),
|
||||
timeline_restarts: timeline_restarts.load(Ordering::Relaxed),
|
||||
runtime: duration,
|
||||
}
|
||||
};
|
||||
let output = serde_json::to_string_pretty(&output).unwrap();
|
||||
println!("{output}");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -200,7 +140,6 @@ async fn timeline_actor(
|
||||
mgmt_api_client: Arc<pageserver_client::mgmt_api::Client>,
|
||||
timeline: TenantTimelineId,
|
||||
live_stats: Arc<LiveStats>,
|
||||
token: CancellationToken,
|
||||
) {
|
||||
// TODO: support sharding
|
||||
let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
|
||||
@@ -210,7 +149,7 @@ async fn timeline_actor(
|
||||
layers: Vec<mpsc::Sender<OwnedSemaphorePermit>>,
|
||||
concurrency: Arc<tokio::sync::Semaphore>,
|
||||
}
|
||||
while !token.is_cancelled() {
|
||||
loop {
|
||||
debug!("restarting timeline");
|
||||
let layer_map_info = mgmt_api_client
|
||||
.layer_map_info(tenant_shard_id, timeline.timeline_id)
|
||||
@@ -246,7 +185,7 @@ async fn timeline_actor(
|
||||
|
||||
live_stats.timeline_restart_done();
|
||||
|
||||
while !token.is_cancelled() {
|
||||
loop {
|
||||
assert!(!timeline.joinset.is_empty());
|
||||
if let Some(res) = timeline.joinset.try_join_next() {
|
||||
debug!(?res, "a layer actor exited, should not happen");
|
||||
@@ -316,7 +255,7 @@ async fn layer_actor(
|
||||
.layer_ondemand_download(tenant_shard_id, timeline_id, layer.layer_file_name())
|
||||
.await
|
||||
.unwrap();
|
||||
live_stats.download_done(layer.layer_file_size());
|
||||
live_stats.download_done();
|
||||
did_it
|
||||
}
|
||||
};
|
||||
|
||||
@@ -14,7 +14,6 @@ mod util {
|
||||
|
||||
/// The pagebench CLI sub-commands, dispatched in [`main`] below.
|
||||
mod cmd {
|
||||
pub(super) mod aux_files;
|
||||
pub(super) mod basebackup;
|
||||
pub(super) mod getpage_latest_lsn;
|
||||
pub(super) mod ondemand_download_churn;
|
||||
@@ -28,7 +27,6 @@ enum Args {
|
||||
GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
|
||||
TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
|
||||
OndemandDownloadChurn(cmd::ondemand_download_churn::Args),
|
||||
AuxFiles(cmd::aux_files::Args),
|
||||
}
|
||||
|
||||
fn main() {
|
||||
@@ -48,7 +46,6 @@ fn main() {
|
||||
cmd::trigger_initial_size_calculation::main(args)
|
||||
}
|
||||
Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
|
||||
Args::AuxFiles(args) => cmd::aux_files::main(args),
|
||||
}
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
@@ -1,39 +1,15 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use ::metrics::IntGauge;
|
||||
use bytes::{Buf, BufMut, Bytes};
|
||||
use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE};
|
||||
use tracing::warn;
|
||||
|
||||
// BEGIN Copyright (c) 2017 Servo Contributors
|
||||
|
||||
/// Const version of FNV hash.
|
||||
#[inline]
|
||||
#[must_use]
|
||||
pub const fn fnv_hash(bytes: &[u8]) -> u128 {
|
||||
const INITIAL_STATE: u128 = 0x6c62272e07bb014262b821756295c58d;
|
||||
const PRIME: u128 = 0x0000000001000000000000000000013B;
|
||||
|
||||
let mut hash = INITIAL_STATE;
|
||||
let mut i = 0;
|
||||
while i < bytes.len() {
|
||||
hash ^= bytes[i] as u128;
|
||||
hash = hash.wrapping_mul(PRIME);
|
||||
i += 1;
|
||||
}
|
||||
hash
|
||||
}
|
||||
|
||||
// END Copyright (c) 2017 Servo Contributors
|
||||
|
||||
/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, least significant 13B of FNV hash].
|
||||
/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, first 13B of 128b xxhash].
|
||||
fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key {
|
||||
let mut key: [u8; 16] = [0; METADATA_KEY_SIZE];
|
||||
let hash = fnv_hash(data).to_be_bytes();
|
||||
let mut key = [0; METADATA_KEY_SIZE];
|
||||
let hash = twox_hash::xxh3::hash128(data).to_be_bytes();
|
||||
key[0] = AUX_KEY_PREFIX;
|
||||
key[1] = dir_level1;
|
||||
key[2] = dir_level2;
|
||||
key[3..16].copy_from_slice(&hash[3..16]);
|
||||
key[3..16].copy_from_slice(&hash[0..13]);
|
||||
Key::from_metadata_key_fixed_size(&key)
|
||||
}
|
||||
|
||||
@@ -164,55 +140,6 @@ pub fn encode_file_value(files: &[(&str, &[u8])]) -> anyhow::Result<Vec<u8>> {
|
||||
Ok(encoded)
|
||||
}
|
||||
|
||||
/// An estimation of the size of aux files.
|
||||
pub struct AuxFileSizeEstimator {
|
||||
aux_file_size_gauge: IntGauge,
|
||||
size: Arc<std::sync::Mutex<Option<isize>>>,
|
||||
}
|
||||
|
||||
impl AuxFileSizeEstimator {
|
||||
pub fn new(aux_file_size_gauge: IntGauge) -> Self {
|
||||
Self {
|
||||
aux_file_size_gauge,
|
||||
size: Arc::new(std::sync::Mutex::new(None)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn on_base_backup(&self, new_size: usize) {
|
||||
let mut guard = self.size.lock().unwrap();
|
||||
*guard = Some(new_size as isize);
|
||||
self.report(new_size as isize);
|
||||
}
|
||||
|
||||
pub fn on_add(&self, file_size: usize) {
|
||||
let mut guard = self.size.lock().unwrap();
|
||||
if let Some(size) = &mut *guard {
|
||||
*size += file_size as isize;
|
||||
self.report(*size);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn on_remove(&self, file_size: usize) {
|
||||
let mut guard = self.size.lock().unwrap();
|
||||
if let Some(size) = &mut *guard {
|
||||
*size -= file_size as isize;
|
||||
self.report(*size);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn on_update(&self, old_size: usize, new_size: usize) {
|
||||
let mut guard = self.size.lock().unwrap();
|
||||
if let Some(size) = &mut *guard {
|
||||
*size += new_size as isize - old_size as isize;
|
||||
self.report(*size);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn report(&self, size: isize) {
|
||||
self.aux_file_size_gauge.set(size as i64);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -221,19 +148,15 @@ mod tests {
|
||||
fn test_hash_portable() {
|
||||
// AUX file encoding requires the hash to be portable across all platforms. This test case checks
|
||||
// if the algorithm produces the same hash across different environments.
|
||||
|
||||
assert_eq!(
|
||||
265160408618497461376862998434862070044,
|
||||
super::fnv_hash("test1".as_bytes())
|
||||
305317690835051308206966631765527126151,
|
||||
twox_hash::xxh3::hash128("test1".as_bytes())
|
||||
);
|
||||
assert_eq!(
|
||||
295486155126299629456360817749600553988,
|
||||
super::fnv_hash("test/test2".as_bytes())
|
||||
);
|
||||
assert_eq!(
|
||||
144066263297769815596495629667062367629,
|
||||
super::fnv_hash("".as_bytes())
|
||||
85104974691013376326742244813280798847,
|
||||
twox_hash::xxh3::hash128("test/test2".as_bytes())
|
||||
);
|
||||
assert_eq!(0, twox_hash::xxh3::hash128("".as_bytes()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -241,28 +164,28 @@ mod tests {
|
||||
// To correct retrieve AUX files, the generated keys for the same file must be the same for all versions
|
||||
// of the page server.
|
||||
assert_eq!(
|
||||
"62000001017F8B83D94F7081693471ABF91C",
|
||||
encode_aux_file_key("pg_logical/mappings/test1").to_string(),
|
||||
"6200000101E5B20C5F8DD5AA3289D6D9EAFA",
|
||||
encode_aux_file_key("pg_logical/mappings/test1").to_string()
|
||||
);
|
||||
assert_eq!(
|
||||
"62000001027F8E83D94F7081693471ABFCCD",
|
||||
encode_aux_file_key("pg_logical/snapshots/test2").to_string(),
|
||||
"620000010239AAC544893139B26F501B97E6",
|
||||
encode_aux_file_key("pg_logical/snapshots/test2").to_string()
|
||||
);
|
||||
assert_eq!(
|
||||
"62000001032E07BB014262B821756295C58D",
|
||||
encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string(),
|
||||
"620000010300000000000000000000000000",
|
||||
encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string()
|
||||
);
|
||||
assert_eq!(
|
||||
"62000001FF4F38E1C74754E7D03C1A660178",
|
||||
encode_aux_file_key("pg_logical/unsupported").to_string(),
|
||||
"62000001FF8635AF2134B7266EC5B4189FD6",
|
||||
encode_aux_file_key("pg_logical/unsupported").to_string()
|
||||
);
|
||||
assert_eq!(
|
||||
"62000002017F8D83D94F7081693471ABFB92",
|
||||
"6200000201772D0E5D71DE14DA86142A1619",
|
||||
encode_aux_file_key("pg_replslot/test3").to_string()
|
||||
);
|
||||
assert_eq!(
|
||||
"620000FFFF2B6ECC8AEF93F643DC44F15E03",
|
||||
encode_aux_file_key("other_file_not_supported").to_string(),
|
||||
"620000FFFF1866EBEB53B807B26A2416F317",
|
||||
encode_aux_file_key("other_file_not_supported").to_string()
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -284,6 +284,7 @@ fn start_pageserver(
|
||||
))
|
||||
.unwrap();
|
||||
pageserver::preinitialize_metrics();
|
||||
pageserver::metrics::wal_redo::set_process_kind_metric(conf.walredo_process_kind);
|
||||
|
||||
// If any failpoints were set from FAILPOINTS environment variable,
|
||||
// print them to the log for debugging purposes
|
||||
@@ -382,7 +383,7 @@ fn start_pageserver(
|
||||
let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
|
||||
|
||||
// Set up remote storage client
|
||||
let remote_storage = create_remote_storage_client(conf)?;
|
||||
let remote_storage = Some(create_remote_storage_client(conf)?);
|
||||
|
||||
// Set up deletion queue
|
||||
let (deletion_queue, deletion_workers) = DeletionQueue::new(
|
||||
@@ -515,12 +516,16 @@ fn start_pageserver(
|
||||
}
|
||||
});
|
||||
|
||||
let secondary_controller = secondary::spawn_tasks(
|
||||
tenant_manager.clone(),
|
||||
remote_storage.clone(),
|
||||
background_jobs_barrier.clone(),
|
||||
shutdown_pageserver.clone(),
|
||||
);
|
||||
let secondary_controller = if let Some(remote_storage) = &remote_storage {
|
||||
secondary::spawn_tasks(
|
||||
tenant_manager.clone(),
|
||||
remote_storage.clone(),
|
||||
background_jobs_barrier.clone(),
|
||||
shutdown_pageserver.clone(),
|
||||
)
|
||||
} else {
|
||||
secondary::null_controller()
|
||||
};
|
||||
|
||||
// shared state between the disk-usage backed eviction background task and the http endpoint
|
||||
// that allows triggering disk-usage based eviction manually. note that the http endpoint
|
||||
@@ -528,13 +533,15 @@ fn start_pageserver(
|
||||
// been configured.
|
||||
let disk_usage_eviction_state: Arc<disk_usage_eviction_task::State> = Arc::default();
|
||||
|
||||
launch_disk_usage_global_eviction_task(
|
||||
conf,
|
||||
remote_storage.clone(),
|
||||
disk_usage_eviction_state.clone(),
|
||||
tenant_manager.clone(),
|
||||
background_jobs_barrier.clone(),
|
||||
)?;
|
||||
if let Some(remote_storage) = &remote_storage {
|
||||
launch_disk_usage_global_eviction_task(
|
||||
conf,
|
||||
remote_storage.clone(),
|
||||
disk_usage_eviction_state.clone(),
|
||||
tenant_manager.clone(),
|
||||
background_jobs_barrier.clone(),
|
||||
)?;
|
||||
}
|
||||
|
||||
// Start up the service to handle HTTP mgmt API request. We created the
|
||||
// listener earlier already.
|
||||
@@ -647,20 +654,17 @@ fn start_pageserver(
|
||||
None,
|
||||
"libpq endpoint listener",
|
||||
true,
|
||||
{
|
||||
let tenant_manager = tenant_manager.clone();
|
||||
async move {
|
||||
page_service::libpq_listener_main(
|
||||
tenant_manager,
|
||||
broker_client,
|
||||
pg_auth,
|
||||
pageserver_listener,
|
||||
conf.pg_auth_type,
|
||||
libpq_ctx,
|
||||
task_mgr::shutdown_token(),
|
||||
)
|
||||
.await
|
||||
}
|
||||
async move {
|
||||
page_service::libpq_listener_main(
|
||||
conf,
|
||||
broker_client,
|
||||
pg_auth,
|
||||
pageserver_listener,
|
||||
conf.pg_auth_type,
|
||||
libpq_ctx,
|
||||
task_mgr::shutdown_token(),
|
||||
)
|
||||
.await
|
||||
},
|
||||
);
|
||||
}
|
||||
@@ -689,7 +693,14 @@ fn start_pageserver(
|
||||
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
|
||||
// The plan is to change that over time.
|
||||
shutdown_pageserver.take();
|
||||
pageserver::shutdown_pageserver(&tenant_manager, deletion_queue.clone(), 0).await;
|
||||
let bg_remote_storage = remote_storage.clone();
|
||||
let bg_deletion_queue = deletion_queue.clone();
|
||||
pageserver::shutdown_pageserver(
|
||||
&tenant_manager,
|
||||
bg_remote_storage.map(|_| bg_deletion_queue),
|
||||
0,
|
||||
)
|
||||
.await;
|
||||
unreachable!()
|
||||
})
|
||||
}
|
||||
|
||||
@@ -99,7 +99,7 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
|
||||
|
||||
pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "async";
|
||||
pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "sync";
|
||||
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
|
||||
@@ -358,7 +358,7 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re
|
||||
// mean the synthetic size worker should terminate.
|
||||
let shutting_down = matches!(
|
||||
e.downcast_ref::<PageReconstructError>(),
|
||||
Some(PageReconstructError::Cancelled)
|
||||
Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
|
||||
);
|
||||
|
||||
if !shutting_down {
|
||||
|
||||
@@ -311,7 +311,7 @@ impl DeletionList {
|
||||
result.extend(
|
||||
timeline_layers
|
||||
.into_iter()
|
||||
.map(|l| timeline_remote_path.join(Utf8PathBuf::from(l))),
|
||||
.map(|l| timeline_remote_path.join(&Utf8PathBuf::from(l))),
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -632,7 +632,7 @@ impl DeletionQueue {
|
||||
///
|
||||
/// If remote_storage is None, then the returned workers will also be None.
|
||||
pub fn new<C>(
|
||||
remote_storage: GenericRemoteStorage,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
control_plane_client: Option<C>,
|
||||
conf: &'static PageServerConf,
|
||||
) -> (Self, Option<DeletionQueueWorkers<C>>)
|
||||
@@ -658,6 +658,23 @@ impl DeletionQueue {
|
||||
// longer to flush after Tenants have all been torn down.
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let remote_storage = match remote_storage {
|
||||
None => {
|
||||
return (
|
||||
Self {
|
||||
client: DeletionQueueClient {
|
||||
tx,
|
||||
executor_tx,
|
||||
lsn_table: lsn_table.clone(),
|
||||
},
|
||||
cancel,
|
||||
},
|
||||
None,
|
||||
)
|
||||
}
|
||||
Some(r) => r,
|
||||
};
|
||||
|
||||
(
|
||||
Self {
|
||||
client: DeletionQueueClient {
|
||||
@@ -748,7 +765,7 @@ mod test {
|
||||
/// Simulate a pageserver restart by destroying and recreating the deletion queue
|
||||
async fn restart(&mut self) {
|
||||
let (deletion_queue, workers) = DeletionQueue::new(
|
||||
self.storage.clone(),
|
||||
Some(self.storage.clone()),
|
||||
Some(self.mock_control_plane.clone()),
|
||||
self.harness.conf,
|
||||
);
|
||||
@@ -858,7 +875,7 @@ mod test {
|
||||
let mock_control_plane = MockControlPlane::new();
|
||||
|
||||
let (deletion_queue, worker) = DeletionQueue::new(
|
||||
storage.clone(),
|
||||
Some(storage.clone()),
|
||||
Some(mock_control_plane.clone()),
|
||||
harness.conf,
|
||||
);
|
||||
|
||||
@@ -534,12 +534,18 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
});
|
||||
}
|
||||
EvictionLayer::Secondary(layer) => {
|
||||
let file_size = layer.metadata.file_size;
|
||||
let file_size = layer.metadata.file_size();
|
||||
let tenant_manager = tenant_manager.clone();
|
||||
|
||||
js.spawn(async move {
|
||||
layer
|
||||
.secondary_tenant
|
||||
.evict_layer(layer.timeline_id, layer.name)
|
||||
.evict_layer(
|
||||
tenant_manager.get_conf(),
|
||||
layer.timeline_id,
|
||||
layer.name,
|
||||
layer.metadata,
|
||||
)
|
||||
.await;
|
||||
Ok(file_size)
|
||||
});
|
||||
@@ -641,7 +647,7 @@ impl EvictionLayer {
|
||||
pub(crate) fn get_file_size(&self) -> u64 {
|
||||
match self {
|
||||
Self::Attached(l) => l.layer_desc().file_size,
|
||||
Self::Secondary(sl) => sl.metadata.file_size,
|
||||
Self::Secondary(sl) => sl.metadata.file_size(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -257,37 +257,6 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/LsnByTimestampResponse"
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease:
|
||||
parameters:
|
||||
- name: tenant_shard_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
post:
|
||||
description: Obtain lease for the given LSN
|
||||
parameters:
|
||||
- name: lsn
|
||||
in: query
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
description: A LSN to obtain the lease for
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/LsnLease"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -612,80 +581,6 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor:
|
||||
parameters:
|
||||
- name: tenant_shard_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: timeline_id
|
||||
in: path
|
||||
ŕequired: true
|
||||
schema:
|
||||
type: string
|
||||
|
||||
put:
|
||||
description: |
|
||||
Detach a timeline from its ancestor and reparent all ancestors timelines with lower `ancestor_lsn`.
|
||||
Current implementation might not be retryable across failure cases, but will be enhanced in future.
|
||||
Detaching should be expected to be expensive operation. Timeouts should be retried.
|
||||
responses:
|
||||
"200":
|
||||
description: |
|
||||
The timeline has been detached from it's ancestor (now or earlier), and at least the returned timelines have been reparented.
|
||||
If any timelines were deleted after reparenting, they might not be on this list.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/AncestorDetached"
|
||||
|
||||
"400":
|
||||
description: |
|
||||
Number of early checks meaning the timeline cannot be detached now:
|
||||
- the ancestor of timeline has an ancestor: not supported, see RFC
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
"404":
|
||||
description: Tenant or timeline not found.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
|
||||
"409":
|
||||
description: |
|
||||
The timeline can never be detached:
|
||||
- timeline has no ancestor, implying that the timeline has never had an ancestor
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ConflictError"
|
||||
|
||||
"500":
|
||||
description: |
|
||||
Transient error, for example, pageserver shutdown happened while
|
||||
processing the request but we were unable to distinguish that. Must
|
||||
be retried.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
"503":
|
||||
description: |
|
||||
Temporarily unavailable, please retry. Possible reasons:
|
||||
- another timeline detach for the same tenant is underway, please retry later
|
||||
- detected shutdown error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/:
|
||||
get:
|
||||
description: Get tenants list
|
||||
@@ -1085,15 +980,6 @@ components:
|
||||
type: string
|
||||
enum: [past, present, future, nodata]
|
||||
|
||||
LsnLease:
|
||||
type: object
|
||||
required:
|
||||
- valid_until
|
||||
properties:
|
||||
valid_until:
|
||||
type: string
|
||||
format: date-time
|
||||
|
||||
PageserverUtilization:
|
||||
type: object
|
||||
required:
|
||||
@@ -1151,19 +1037,6 @@ components:
|
||||
format: int64
|
||||
description: How many bytes of layer content were in the latest layer heatmap
|
||||
|
||||
AncestorDetached:
|
||||
type: object
|
||||
required:
|
||||
- reparented_timelines
|
||||
properties:
|
||||
reparented_timelines:
|
||||
type: array
|
||||
description: Set of reparented timeline ids
|
||||
properties:
|
||||
type: string
|
||||
format: hex
|
||||
description: TimelineId
|
||||
|
||||
|
||||
Error:
|
||||
type: object
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
//!
|
||||
//! Management HTTP API
|
||||
//!
|
||||
use std::cmp::Reverse;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::collections::HashMap;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
@@ -16,9 +14,6 @@ use hyper::header;
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::models::IngestAuxFilesRequest;
|
||||
use pageserver_api::models::ListAuxFilesRequest;
|
||||
use pageserver_api::models::LocationConfig;
|
||||
use pageserver_api::models::LocationConfigListResponse;
|
||||
use pageserver_api::models::ShardParameters;
|
||||
@@ -29,11 +24,7 @@ use pageserver_api::models::TenantScanRemoteStorageShard;
|
||||
use pageserver_api::models::TenantShardLocation;
|
||||
use pageserver_api::models::TenantShardSplitRequest;
|
||||
use pageserver_api::models::TenantShardSplitResponse;
|
||||
use pageserver_api::models::TenantSorting;
|
||||
use pageserver_api::models::TenantState;
|
||||
use pageserver_api::models::TopTenantShardItem;
|
||||
use pageserver_api::models::TopTenantShardsRequest;
|
||||
use pageserver_api::models::TopTenantShardsResponse;
|
||||
use pageserver_api::models::{
|
||||
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
|
||||
TenantLoadRequest, TenantLocationConfigRequest,
|
||||
@@ -74,9 +65,7 @@ use crate::tenant::size::ModelInputs;
|
||||
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||
use crate::tenant::storage_layer::LayerName;
|
||||
use crate::tenant::timeline::CompactFlags;
|
||||
use crate::tenant::timeline::CompactionError;
|
||||
use crate::tenant::timeline::Timeline;
|
||||
use crate::tenant::GetTimelineError;
|
||||
use crate::tenant::SpawnMode;
|
||||
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
|
||||
use crate::{config::PageServerConf, tenant::mgr};
|
||||
@@ -115,7 +104,7 @@ pub struct State {
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
allowlist_routes: Vec<Uri>,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
@@ -129,7 +118,7 @@ impl State {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
@@ -184,6 +173,9 @@ impl From<PageReconstructError> for ApiError {
|
||||
PageReconstructError::Cancelled => {
|
||||
ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
|
||||
}
|
||||
PageReconstructError::AncestorStopping(_) => {
|
||||
ApiError::ResourceUnavailable(format!("{pre}").into())
|
||||
}
|
||||
PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
|
||||
PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
|
||||
}
|
||||
@@ -279,13 +271,6 @@ impl From<GetTenantError> for ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<GetTimelineError> for ApiError {
|
||||
fn from(gte: GetTimelineError) -> Self {
|
||||
// Rationale: tenant is activated only after eligble timelines activate
|
||||
ApiError::NotFound(gte.into())
|
||||
}
|
||||
}
|
||||
|
||||
impl From<GetActiveTenantError> for ApiError {
|
||||
fn from(e: GetActiveTenantError) -> ApiError {
|
||||
match e {
|
||||
@@ -393,7 +378,7 @@ async fn build_timeline_info_common(
|
||||
let guard = timeline.last_received_wal.lock().unwrap();
|
||||
if let Some(info) = guard.as_ref() {
|
||||
(
|
||||
Some(format!("{}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only.
|
||||
Some(format!("{:?}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only.
|
||||
Some(info.last_received_msg_lsn),
|
||||
Some(info.last_received_msg_ts),
|
||||
)
|
||||
@@ -448,8 +433,6 @@ async fn build_timeline_info_common(
|
||||
state,
|
||||
|
||||
walreceiver_status,
|
||||
|
||||
last_aux_file_policy: timeline.last_aux_file_policy.load(),
|
||||
};
|
||||
Ok(info)
|
||||
}
|
||||
@@ -650,7 +633,9 @@ async fn timeline_preserve_initdb_handler(
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
|
||||
let timeline = tenant.get_timeline(timeline_id, false)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, false)
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
|
||||
timeline
|
||||
.preserve_initdb_archive()
|
||||
@@ -692,7 +677,9 @@ async fn timeline_detail_handler(
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
let timeline = tenant.get_timeline(timeline_id, false)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, false)
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
|
||||
let timeline_info = build_timeline_info(
|
||||
&timeline,
|
||||
@@ -826,6 +813,12 @@ async fn tenant_attach_handler(
|
||||
|
||||
let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
|
||||
|
||||
if state.remote_storage.is_none() {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"attach_tenant is not possible because pageserver was configured without remote storage"
|
||||
)));
|
||||
}
|
||||
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
let shard_params = ShardParameters::default();
|
||||
let location_conf = LocationConf::attached_single(tenant_conf, generation, &shard_params);
|
||||
@@ -1650,6 +1643,12 @@ async fn tenant_time_travel_remote_storage_handler(
|
||||
)));
|
||||
}
|
||||
|
||||
let Some(storage) = state.remote_storage.as_ref() else {
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"remote storage not configured, cannot run time travel"
|
||||
)));
|
||||
};
|
||||
|
||||
if timestamp > done_if_after {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"The done_if_after timestamp comes before the timestamp to recover to"
|
||||
@@ -1659,7 +1658,7 @@ async fn tenant_time_travel_remote_storage_handler(
|
||||
tracing::info!("Issuing time travel request internally. timestamp={timestamp_raw}, done_if_after={done_if_after_raw}");
|
||||
|
||||
remote_timeline_client::upload::time_travel_recover_tenant(
|
||||
&state.remote_storage,
|
||||
storage,
|
||||
&tenant_shard_id,
|
||||
timestamp,
|
||||
done_if_after,
|
||||
@@ -1704,32 +1703,6 @@ async fn handle_tenant_break(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
// Obtains an lsn lease on the given timeline.
|
||||
async fn lsn_lease_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let lsn: Lsn = parse_query_param(&request, "lsn")?
|
||||
.ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
let result = timeline
|
||||
.make_lsn_lease(lsn, &ctx)
|
||||
.map_err(|e| ApiError::InternalServerError(e.context("lsn lease http handler")))?;
|
||||
|
||||
json_response(StatusCode::OK, result)
|
||||
}
|
||||
|
||||
// Run GC immediately on given timeline.
|
||||
async fn timeline_gc_handler(
|
||||
mut request: Request<Body>,
|
||||
@@ -1742,7 +1715,12 @@ async fn timeline_gc_handler(
|
||||
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let gc_result = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?;
|
||||
let wait_task_done = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx)?;
|
||||
let gc_result = wait_task_done
|
||||
.await
|
||||
.context("wait for gc task")
|
||||
.map_err(ApiError::InternalServerError)?
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, gc_result)
|
||||
}
|
||||
@@ -1765,8 +1743,6 @@ async fn timeline_compact_handler(
|
||||
if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
|
||||
flags |= CompactFlags::ForceImageLayerCreation;
|
||||
}
|
||||
let wait_until_uploaded =
|
||||
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
|
||||
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
@@ -1775,9 +1751,6 @@ async fn timeline_compact_handler(
|
||||
.compact(&cancel, flags, &ctx)
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
if wait_until_uploaded {
|
||||
timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
|
||||
}
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
.instrument(info_span!("manual_compaction", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
|
||||
@@ -1802,8 +1775,6 @@ async fn timeline_checkpoint_handler(
|
||||
if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
|
||||
flags |= CompactFlags::ForceImageLayerCreation;
|
||||
}
|
||||
let wait_until_uploaded =
|
||||
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
|
||||
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
@@ -1811,26 +1782,11 @@ async fn timeline_checkpoint_handler(
|
||||
timeline
|
||||
.freeze_and_flush()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
match e {
|
||||
tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
|
||||
other => ApiError::InternalServerError(other.into()),
|
||||
|
||||
}
|
||||
})?;
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
timeline
|
||||
.compact(&cancel, flags, &ctx)
|
||||
.await
|
||||
.map_err(|e|
|
||||
match e {
|
||||
CompactionError::ShuttingDown => ApiError::ShuttingDown,
|
||||
CompactionError::Other(e) => ApiError::InternalServerError(e)
|
||||
}
|
||||
)?;
|
||||
|
||||
if wait_until_uploaded {
|
||||
timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
|
||||
}
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
@@ -1915,11 +1871,14 @@ async fn timeline_detach_ancestor_handler(
|
||||
let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download);
|
||||
let ctx = &ctx;
|
||||
|
||||
let timeline = tenant.get_timeline(timeline_id, true)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
|
||||
let (_guard, prepared) = timeline
|
||||
.prepare_to_detach_from_ancestor(&tenant, options, ctx)
|
||||
.await?;
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
|
||||
let res = state
|
||||
.tenant_manager
|
||||
@@ -1949,6 +1908,11 @@ async fn deletion_queue_flush(
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let state = get_state(&r);
|
||||
|
||||
if state.remote_storage.is_none() {
|
||||
// Nothing to do if remote storage is disabled.
|
||||
return json_response(StatusCode::OK, ());
|
||||
}
|
||||
|
||||
let execute = parse_query_param(&r, "execute")?.unwrap_or(false);
|
||||
|
||||
let flush = async {
|
||||
@@ -2053,7 +2017,9 @@ async fn active_timeline_of_active_tenant(
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
Ok(tenant.get_timeline(timeline_id, true)?)
|
||||
tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| ApiError::NotFound(e.into()))
|
||||
}
|
||||
|
||||
async fn always_panic_handler(
|
||||
@@ -2111,11 +2077,18 @@ async fn disk_usage_eviction_run(
|
||||
};
|
||||
|
||||
let state = get_state(&r);
|
||||
|
||||
let Some(storage) = state.remote_storage.as_ref() else {
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"remote storage not configured, cannot run eviction iteration"
|
||||
)));
|
||||
};
|
||||
|
||||
let eviction_state = state.disk_usage_eviction_state.clone();
|
||||
|
||||
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
|
||||
&eviction_state,
|
||||
&state.remote_storage,
|
||||
storage,
|
||||
usage,
|
||||
&state.tenant_manager,
|
||||
config.eviction_order,
|
||||
@@ -2152,23 +2125,29 @@ async fn tenant_scan_remote_handler(
|
||||
let state = get_state(&request);
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
|
||||
let Some(remote_storage) = state.remote_storage.as_ref() else {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Remote storage not configured"
|
||||
)));
|
||||
};
|
||||
|
||||
let mut response = TenantScanRemoteStorageResponse::default();
|
||||
|
||||
let (shards, _other_keys) =
|
||||
list_remote_tenant_shards(&state.remote_storage, tenant_id, cancel.clone())
|
||||
list_remote_tenant_shards(remote_storage, tenant_id, cancel.clone())
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
|
||||
|
||||
for tenant_shard_id in shards {
|
||||
let (timeline_ids, _other_keys) =
|
||||
list_remote_timelines(&state.remote_storage, tenant_shard_id, cancel.clone())
|
||||
list_remote_timelines(remote_storage, tenant_shard_id, cancel.clone())
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
|
||||
|
||||
let mut generation = Generation::none();
|
||||
for timeline_id in timeline_ids {
|
||||
match download_index_part(
|
||||
&state.remote_storage,
|
||||
remote_storage,
|
||||
&tenant_shard_id,
|
||||
&timeline_id,
|
||||
Generation::MAX,
|
||||
@@ -2317,31 +2296,6 @@ async fn post_tracing_event_handler(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn force_aux_policy_switch_handler(
|
||||
mut r: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&r, None)?;
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&r, "timeline_id")?;
|
||||
let policy: AuxFilePolicy = json_request(&mut r).await?;
|
||||
|
||||
let state = get_state(&r);
|
||||
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
timeline
|
||||
.do_switch_aux_policy(policy)
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn put_io_engine_handler(
|
||||
mut r: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -2404,150 +2358,6 @@ async fn get_utilization(
|
||||
.map_err(ApiError::InternalServerError)
|
||||
}
|
||||
|
||||
async fn list_aux_files(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let body: ListAuxFilesRequest = json_request(&mut request).await?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let files = timeline.list_aux_files(body.lsn, &ctx).await?;
|
||||
json_response(StatusCode::OK, files)
|
||||
}
|
||||
|
||||
async fn ingest_aux_files(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let body: IngestAuxFilesRequest = json_request(&mut request).await?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
|
||||
let mut modification = timeline.begin_modification(
|
||||
Lsn(timeline.get_last_record_lsn().0 + 8), /* advance LSN by 8 */
|
||||
);
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
for (fname, content) in body.aux_files {
|
||||
modification
|
||||
.put_file(&fname, content.as_bytes(), &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
}
|
||||
modification
|
||||
.commit(&ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
/// Report on the largest tenants on this pageserver, for the storage controller to identify
|
||||
/// candidates for splitting
|
||||
async fn post_top_tenants(
|
||||
mut r: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&r, None)?;
|
||||
let request: TopTenantShardsRequest = json_request(&mut r).await?;
|
||||
let state = get_state(&r);
|
||||
|
||||
fn get_size_metric(sizes: &TopTenantShardItem, order_by: &TenantSorting) -> u64 {
|
||||
match order_by {
|
||||
TenantSorting::ResidentSize => sizes.resident_size,
|
||||
TenantSorting::MaxLogicalSize => sizes.max_logical_size,
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Eq, PartialEq)]
|
||||
struct HeapItem {
|
||||
metric: u64,
|
||||
sizes: TopTenantShardItem,
|
||||
}
|
||||
|
||||
impl PartialOrd for HeapItem {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
/// Heap items have reverse ordering on their metric: this enables using BinaryHeap, which
|
||||
/// supports popping the greatest item but not the smallest.
|
||||
impl Ord for HeapItem {
|
||||
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||
Reverse(self.metric).cmp(&Reverse(other.metric))
|
||||
}
|
||||
}
|
||||
|
||||
let mut top_n: BinaryHeap<HeapItem> = BinaryHeap::with_capacity(request.limit);
|
||||
|
||||
// FIXME: this is a lot of clones to take this tenant list
|
||||
for (tenant_shard_id, tenant_slot) in state.tenant_manager.list() {
|
||||
if let Some(shards_lt) = request.where_shards_lt {
|
||||
// Ignore tenants which already have >= this many shards
|
||||
if tenant_shard_id.shard_count >= shards_lt {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
let sizes = match tenant_slot {
|
||||
TenantSlot::Attached(tenant) => tenant.get_sizes(),
|
||||
TenantSlot::Secondary(_) | TenantSlot::InProgress(_) => {
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let metric = get_size_metric(&sizes, &request.order_by);
|
||||
|
||||
if let Some(gt) = request.where_gt {
|
||||
// Ignore tenants whose metric is <= the lower size threshold, to do less sorting work
|
||||
if metric <= gt {
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
match top_n.peek() {
|
||||
None => {
|
||||
// Top N list is empty: candidate becomes first member
|
||||
top_n.push(HeapItem { metric, sizes });
|
||||
}
|
||||
Some(i) if i.metric > metric && top_n.len() < request.limit => {
|
||||
// Lowest item in list is greater than our candidate, but we aren't at limit yet: push to end
|
||||
top_n.push(HeapItem { metric, sizes });
|
||||
}
|
||||
Some(i) if i.metric > metric => {
|
||||
// List is at limit and lowest value is greater than our candidate, drop it.
|
||||
}
|
||||
Some(_) => top_n.push(HeapItem { metric, sizes }),
|
||||
}
|
||||
|
||||
while top_n.len() > request.limit {
|
||||
top_n.pop();
|
||||
}
|
||||
}
|
||||
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
TopTenantShardsResponse {
|
||||
shards: top_n.into_iter().map(|i| i.sizes).collect(),
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
/// Common functionality of all the HTTP API handlers.
|
||||
///
|
||||
/// - Adds a tracing span to each request (by `request_span`)
|
||||
@@ -2760,10 +2570,6 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn",
|
||||
|r| api_handler(r, get_timestamp_of_lsn_handler),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease",
|
||||
|r| api_handler(r, lsn_lease_handler),
|
||||
)
|
||||
.put(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc",
|
||||
|r| api_handler(r, timeline_gc_handler),
|
||||
@@ -2837,19 +2643,6 @@ pub fn make_router(
|
||||
|r| api_handler(r, timeline_collect_keyspace),
|
||||
)
|
||||
.put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
|
||||
.put(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
|
||||
|r| api_handler(r, force_aux_policy_switch_handler),
|
||||
)
|
||||
.get("/v1/utilization", |r| api_handler(r, get_utilization))
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files",
|
||||
|r| testing_api_handler("ingest_aux_files", r, ingest_aux_files),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/list_aux_files",
|
||||
|r| testing_api_handler("list_aux_files", r, list_aux_files),
|
||||
)
|
||||
.post("/v1/top_tenants", |r| api_handler(r, post_top_tenants))
|
||||
.any(handler_404))
|
||||
}
|
||||
|
||||
@@ -57,7 +57,7 @@ pub use crate::metrics::preinitialize_metrics;
|
||||
#[tracing::instrument(skip_all, fields(%exit_code))]
|
||||
pub async fn shutdown_pageserver(
|
||||
tenant_manager: &TenantManager,
|
||||
mut deletion_queue: DeletionQueue,
|
||||
deletion_queue: Option<DeletionQueue>,
|
||||
exit_code: i32,
|
||||
) {
|
||||
use std::time::Duration;
|
||||
@@ -89,7 +89,9 @@ pub async fn shutdown_pageserver(
|
||||
.await;
|
||||
|
||||
// Best effort to persist any outstanding deletions, to avoid leaking objects
|
||||
deletion_queue.shutdown(Duration::from_secs(5)).await;
|
||||
if let Some(mut deletion_queue) = deletion_queue {
|
||||
deletion_queue.shutdown(Duration::from_secs(5)).await;
|
||||
}
|
||||
|
||||
// Shut down the HTTP endpoint last, so that you can still check the server's
|
||||
// status while it's shutting down.
|
||||
@@ -112,6 +114,10 @@ pub async fn shutdown_pageserver(
|
||||
std::process::exit(exit_code);
|
||||
}
|
||||
|
||||
/// The name of the metadata file pageserver creates per timeline.
|
||||
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>/metadata`.
|
||||
pub const METADATA_FILE_NAME: &str = "metadata";
|
||||
|
||||
/// Per-tenant configuration file.
|
||||
/// Full path: `tenants/<tenant_id>/config`.
|
||||
pub(crate) const TENANT_CONFIG_NAME: &str = "config";
|
||||
|
||||
@@ -525,15 +525,6 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_standby_horizon",
|
||||
"Standby apply LSN for which GC is hold off, by timeline.",
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_resident_physical_size",
|
||||
@@ -594,15 +585,6 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define current logical size metric")
|
||||
});
|
||||
|
||||
static AUX_FILE_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_aux_file_estimated_size",
|
||||
"The size of all aux files for a timeline in aux file v2 store.",
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) mod initial_logical_size {
|
||||
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
|
||||
use once_cell::sync::Lazy;
|
||||
@@ -1867,6 +1849,7 @@ pub(crate) struct WalIngestMetrics {
|
||||
pub(crate) records_received: IntCounter,
|
||||
pub(crate) records_committed: IntCounter,
|
||||
pub(crate) records_filtered: IntCounter,
|
||||
pub(crate) time_spent_on_ingest: Histogram,
|
||||
}
|
||||
|
||||
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
|
||||
@@ -1890,6 +1873,12 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
|
||||
"Number of WAL records filtered out due to sharding"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
time_spent_on_ingest: register_histogram!(
|
||||
"pageserver_wal_ingest_put_value_seconds",
|
||||
"Actual time spent on ingesting a record",
|
||||
redo_histogram_time_buckets!(),
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
});
|
||||
|
||||
pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
@@ -2001,6 +1990,29 @@ impl Default for WalRedoProcessCounters {
|
||||
pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
|
||||
Lazy::new(WalRedoProcessCounters::default);
|
||||
|
||||
#[cfg(not(test))]
|
||||
pub mod wal_redo {
|
||||
use super::*;
|
||||
|
||||
static PROCESS_KIND: Lazy<std::sync::Mutex<UIntGaugeVec>> = Lazy::new(|| {
|
||||
std::sync::Mutex::new(
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_wal_redo_process_kind",
|
||||
"The configured process kind for walredo",
|
||||
&["kind"],
|
||||
)
|
||||
.unwrap(),
|
||||
)
|
||||
});
|
||||
|
||||
pub fn set_process_kind_metric(kind: crate::walredo::ProcessKind) {
|
||||
// use guard to avoid races around the next two steps
|
||||
let guard = PROCESS_KIND.lock().unwrap();
|
||||
guard.reset();
|
||||
guard.with_label_values(&[&format!("{kind}")]).set(1);
|
||||
}
|
||||
}
|
||||
|
||||
/// Similar to `prometheus::HistogramTimer` but does not record on drop.
|
||||
pub(crate) struct StorageTimeMetricsTimer {
|
||||
metrics: StorageTimeMetrics,
|
||||
@@ -2100,11 +2112,9 @@ pub(crate) struct TimelineMetrics {
|
||||
pub garbage_collect_histo: StorageTimeMetrics,
|
||||
pub find_gc_cutoffs_histo: StorageTimeMetrics,
|
||||
pub last_record_gauge: IntGauge,
|
||||
pub standby_horizon_gauge: IntGauge,
|
||||
pub resident_physical_size_gauge: UIntGauge,
|
||||
resident_physical_size_gauge: UIntGauge,
|
||||
/// copy of LayeredTimeline.current_logical_size
|
||||
pub current_logical_size_gauge: UIntGauge,
|
||||
pub aux_file_size_gauge: IntGauge,
|
||||
pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
|
||||
pub evictions: IntCounter,
|
||||
pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
|
||||
@@ -2170,9 +2180,6 @@ impl TimelineMetrics {
|
||||
let last_record_gauge = LAST_RECORD_LSN
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
let standby_horizon_gauge = STANDBY_HORIZON
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
@@ -2180,9 +2187,6 @@ impl TimelineMetrics {
|
||||
let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
let aux_file_size_gauge = AUX_FILE_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
// TODO use impl Trait syntax here once we have ability to use it: https://github.com/rust-lang/rust/issues/63065
|
||||
let directory_entries_count_gauge_closure = {
|
||||
let tenant_shard_id = *tenant_shard_id;
|
||||
@@ -2218,10 +2222,8 @@ impl TimelineMetrics {
|
||||
find_gc_cutoffs_histo,
|
||||
load_layer_map_histo,
|
||||
last_record_gauge,
|
||||
standby_horizon_gauge,
|
||||
resident_physical_size_gauge,
|
||||
current_logical_size_gauge,
|
||||
aux_file_size_gauge,
|
||||
directory_entries_count_gauge,
|
||||
evictions,
|
||||
evictions_with_low_residence_duration: std::sync::RwLock::new(
|
||||
@@ -2253,7 +2255,6 @@ impl TimelineMetrics {
|
||||
let timeline_id = &self.timeline_id;
|
||||
let shard_id = &self.shard_id;
|
||||
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
{
|
||||
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
|
||||
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
@@ -2263,7 +2264,6 @@ impl TimelineMetrics {
|
||||
let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
}
|
||||
let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
|
||||
self.evictions_with_low_residence_duration
|
||||
.write()
|
||||
@@ -2320,7 +2320,6 @@ use pin_project_lite::pin_project;
|
||||
use std::collections::HashMap;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::pin::Pin;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::task::{Context, Poll};
|
||||
use std::time::{Duration, Instant};
|
||||
@@ -2330,35 +2329,35 @@ use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::mgr::TenantSlot;
|
||||
|
||||
/// Maintain a per timeline gauge in addition to the global gauge.
|
||||
pub(crate) struct PerTimelineRemotePhysicalSizeGauge {
|
||||
last_set: AtomicU64,
|
||||
struct PerTimelineRemotePhysicalSizeGauge {
|
||||
last_set: u64,
|
||||
gauge: UIntGauge,
|
||||
}
|
||||
|
||||
impl PerTimelineRemotePhysicalSizeGauge {
|
||||
fn new(per_timeline_gauge: UIntGauge) -> Self {
|
||||
Self {
|
||||
last_set: AtomicU64::new(0),
|
||||
last_set: per_timeline_gauge.get(),
|
||||
gauge: per_timeline_gauge,
|
||||
}
|
||||
}
|
||||
pub(crate) fn set(&self, sz: u64) {
|
||||
fn set(&mut self, sz: u64) {
|
||||
self.gauge.set(sz);
|
||||
let prev = self.last_set.swap(sz, std::sync::atomic::Ordering::Relaxed);
|
||||
if sz < prev {
|
||||
REMOTE_PHYSICAL_SIZE_GLOBAL.sub(prev - sz);
|
||||
if sz < self.last_set {
|
||||
REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set - sz);
|
||||
} else {
|
||||
REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - prev);
|
||||
REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - self.last_set);
|
||||
};
|
||||
self.last_set = sz;
|
||||
}
|
||||
pub(crate) fn get(&self) -> u64 {
|
||||
fn get(&self) -> u64 {
|
||||
self.gauge.get()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PerTimelineRemotePhysicalSizeGauge {
|
||||
fn drop(&mut self) {
|
||||
REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set.load(std::sync::atomic::Ordering::Relaxed));
|
||||
REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2366,7 +2365,7 @@ pub(crate) struct RemoteTimelineClientMetrics {
|
||||
tenant_id: String,
|
||||
shard_id: String,
|
||||
timeline_id: String,
|
||||
pub(crate) remote_physical_size_gauge: PerTimelineRemotePhysicalSizeGauge,
|
||||
remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
|
||||
calls: Mutex<HashMap<(&'static str, &'static str), IntCounterPair>>,
|
||||
bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
|
||||
bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
|
||||
@@ -2374,27 +2373,38 @@ pub(crate) struct RemoteTimelineClientMetrics {
|
||||
|
||||
impl RemoteTimelineClientMetrics {
|
||||
pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
|
||||
let tenant_id_str = tenant_shard_id.tenant_id.to_string();
|
||||
let shard_id_str = format!("{}", tenant_shard_id.shard_slug());
|
||||
let timeline_id_str = timeline_id.to_string();
|
||||
|
||||
let remote_physical_size_gauge = PerTimelineRemotePhysicalSizeGauge::new(
|
||||
REMOTE_PHYSICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str])
|
||||
.unwrap(),
|
||||
);
|
||||
|
||||
RemoteTimelineClientMetrics {
|
||||
tenant_id: tenant_id_str,
|
||||
shard_id: shard_id_str,
|
||||
timeline_id: timeline_id_str,
|
||||
tenant_id: tenant_shard_id.tenant_id.to_string(),
|
||||
shard_id: format!("{}", tenant_shard_id.shard_slug()),
|
||||
timeline_id: timeline_id.to_string(),
|
||||
calls: Mutex::new(HashMap::default()),
|
||||
bytes_started_counter: Mutex::new(HashMap::default()),
|
||||
bytes_finished_counter: Mutex::new(HashMap::default()),
|
||||
remote_physical_size_gauge,
|
||||
remote_physical_size_gauge: Mutex::new(None),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn remote_physical_size_set(&self, sz: u64) {
|
||||
let mut guard = self.remote_physical_size_gauge.lock().unwrap();
|
||||
let gauge = guard.get_or_insert_with(|| {
|
||||
PerTimelineRemotePhysicalSizeGauge::new(
|
||||
REMOTE_PHYSICAL_SIZE
|
||||
.get_metric_with_label_values(&[
|
||||
&self.tenant_id,
|
||||
&self.shard_id,
|
||||
&self.timeline_id,
|
||||
])
|
||||
.unwrap(),
|
||||
)
|
||||
});
|
||||
gauge.set(sz);
|
||||
}
|
||||
|
||||
pub(crate) fn remote_physical_size_get(&self) -> u64 {
|
||||
let guard = self.remote_physical_size_gauge.lock().unwrap();
|
||||
guard.as_ref().map(|gauge| gauge.get()).unwrap_or(0)
|
||||
}
|
||||
|
||||
pub fn remote_operation_time(
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
|
||||
@@ -19,7 +19,6 @@ use pageserver_api::models::{
|
||||
};
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
use pageserver_api::shard::ShardNumber;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError};
|
||||
use pq_proto::framed::ConnectionError;
|
||||
use pq_proto::FeStartupPacket;
|
||||
@@ -33,8 +32,6 @@ use std::str;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
use std::time::SystemTime;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_util::io::StreamReader;
|
||||
@@ -52,6 +49,7 @@ use utils::{
|
||||
use crate::auth::check_permission;
|
||||
use crate::basebackup;
|
||||
use crate::basebackup::BasebackupError;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::import_datadir::import_wal_from_tar;
|
||||
use crate::metrics;
|
||||
@@ -61,16 +59,13 @@ use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::mgr;
|
||||
use crate::tenant::mgr::get_active_tenant_with_timeout;
|
||||
use crate::tenant::mgr::GetActiveTenantError;
|
||||
use crate::tenant::mgr::GetTenantError;
|
||||
use crate::tenant::mgr::ShardResolveResult;
|
||||
use crate::tenant::mgr::ShardSelector;
|
||||
use crate::tenant::mgr::TenantManager;
|
||||
use crate::tenant::timeline::FlushLayerError;
|
||||
use crate::tenant::timeline::WaitLsnError;
|
||||
use crate::tenant::GetTimelineError;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::tenant::Tenant;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::trace::Tracer;
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
@@ -140,7 +135,7 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
|
||||
/// Listens for connections, and launches a new handler task for each.
|
||||
///
|
||||
pub async fn libpq_listener_main(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
conf: &'static PageServerConf,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
listener: TcpListener,
|
||||
@@ -185,7 +180,7 @@ pub async fn libpq_listener_main(
|
||||
"serving compute connection task",
|
||||
false,
|
||||
page_service_conn_main(
|
||||
tenant_manager.clone(),
|
||||
conf,
|
||||
broker_client.clone(),
|
||||
local_auth,
|
||||
socket,
|
||||
@@ -208,7 +203,7 @@ pub async fn libpq_listener_main(
|
||||
|
||||
#[instrument(skip_all, fields(peer_addr))]
|
||||
async fn page_service_conn_main(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
conf: &'static PageServerConf,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
socket: tokio::net::TcpStream,
|
||||
@@ -261,14 +256,11 @@ async fn page_service_conn_main(
|
||||
socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms)));
|
||||
let socket = std::pin::pin!(socket);
|
||||
|
||||
fail::fail_point!("ps::connection-start::pre-login");
|
||||
|
||||
// XXX: pgbackend.run() should take the connection_ctx,
|
||||
// and create a child per-query context when it invokes process_query.
|
||||
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
|
||||
// and create the per-query context in process_query ourselves.
|
||||
let mut conn_handler =
|
||||
PageServerHandler::new(tenant_manager, broker_client, auth, connection_ctx);
|
||||
let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx);
|
||||
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
|
||||
|
||||
match pgbackend
|
||||
@@ -299,12 +291,11 @@ struct HandlerTimeline {
|
||||
}
|
||||
|
||||
struct PageServerHandler {
|
||||
_conf: &'static PageServerConf,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
claims: Option<Claims>,
|
||||
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
|
||||
/// The context created for the lifetime of the connection
|
||||
/// services by this PageServerHandler.
|
||||
/// For each query received over the connection,
|
||||
@@ -373,7 +364,7 @@ impl From<WaitLsnError> for PageStreamError {
|
||||
match value {
|
||||
e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e),
|
||||
WaitLsnError::Shutdown => Self::Shutdown,
|
||||
e @ WaitLsnError::BadState { .. } => Self::Reconnect(format!("{e}").into()),
|
||||
WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -383,20 +374,20 @@ impl From<WaitLsnError> for QueryError {
|
||||
match value {
|
||||
e @ WaitLsnError::Timeout(_) => Self::Other(anyhow::Error::new(e)),
|
||||
WaitLsnError::Shutdown => Self::Shutdown,
|
||||
WaitLsnError::BadState { .. } => Self::Reconnect,
|
||||
WaitLsnError::BadState => Self::Reconnect,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PageServerHandler {
|
||||
pub fn new(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
conf: &'static PageServerConf,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
connection_ctx: RequestContext,
|
||||
) -> Self {
|
||||
PageServerHandler {
|
||||
tenant_manager,
|
||||
_conf: conf,
|
||||
broker_client,
|
||||
auth,
|
||||
claims: None,
|
||||
@@ -561,9 +552,13 @@ impl PageServerHandler {
|
||||
{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
|
||||
|
||||
let tenant = self
|
||||
.get_active_tenant_with_timeout(tenant_id, ShardSelector::First, ACTIVE_TENANT_TIMEOUT)
|
||||
.await?;
|
||||
let tenant = mgr::get_active_tenant_with_timeout(
|
||||
tenant_id,
|
||||
ShardSelector::First,
|
||||
ACTIVE_TENANT_TIMEOUT,
|
||||
&task_mgr::shutdown_token(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Make request tracer if needed
|
||||
let mut tracer = if tenant.get_trace_read_requests() {
|
||||
@@ -606,7 +601,6 @@ impl PageServerHandler {
|
||||
};
|
||||
|
||||
trace!("query: {copy_data_bytes:?}");
|
||||
fail::fail_point!("ps::handle-pagerequest-message");
|
||||
|
||||
// Trace request if needed
|
||||
if let Some(t) = tracer.as_mut() {
|
||||
@@ -621,7 +615,6 @@ impl PageServerHandler {
|
||||
|
||||
let (response, span) = match neon_fe_msg {
|
||||
PagestreamFeMessage::Exists(req) => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::exists");
|
||||
let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn);
|
||||
(
|
||||
self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
|
||||
@@ -631,7 +624,6 @@ impl PageServerHandler {
|
||||
)
|
||||
}
|
||||
PagestreamFeMessage::Nblocks(req) => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::nblocks");
|
||||
let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
|
||||
(
|
||||
self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
|
||||
@@ -641,7 +633,6 @@ impl PageServerHandler {
|
||||
)
|
||||
}
|
||||
PagestreamFeMessage::GetPage(req) => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::getpage");
|
||||
// shard_id is filled in by the handler
|
||||
let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn);
|
||||
(
|
||||
@@ -652,7 +643,6 @@ impl PageServerHandler {
|
||||
)
|
||||
}
|
||||
PagestreamFeMessage::DbSize(req) => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::dbsize");
|
||||
let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
|
||||
(
|
||||
self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
|
||||
@@ -662,7 +652,6 @@ impl PageServerHandler {
|
||||
)
|
||||
}
|
||||
PagestreamFeMessage::GetSlruSegment(req) => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
|
||||
let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
|
||||
(
|
||||
self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx)
|
||||
@@ -737,9 +726,13 @@ impl PageServerHandler {
|
||||
|
||||
// Create empty timeline
|
||||
info!("creating new timeline");
|
||||
let tenant = self
|
||||
.get_active_tenant_with_timeout(tenant_id, ShardSelector::Zero, ACTIVE_TENANT_TIMEOUT)
|
||||
.await?;
|
||||
let tenant = get_active_tenant_with_timeout(
|
||||
tenant_id,
|
||||
ShardSelector::Zero,
|
||||
ACTIVE_TENANT_TIMEOUT,
|
||||
&task_mgr::shutdown_token(),
|
||||
)
|
||||
.await?;
|
||||
let timeline = tenant
|
||||
.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
|
||||
.await?;
|
||||
@@ -831,10 +824,7 @@ impl PageServerHandler {
|
||||
// We only want to persist the data, and it doesn't matter if it's in the
|
||||
// shape of deltas or images.
|
||||
info!("flushing layers");
|
||||
timeline.freeze_and_flush().await.map_err(|e| match e {
|
||||
FlushLayerError::Cancelled => QueryError::Shutdown,
|
||||
other => QueryError::Other(other.into()),
|
||||
})?;
|
||||
timeline.freeze_and_flush().await?;
|
||||
|
||||
info!("done");
|
||||
Ok(())
|
||||
@@ -919,39 +909,6 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(shard_id, %lsn))]
|
||||
async fn handle_make_lsn_lease<IO>(
|
||||
&self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
let shard_selector = ShardSelector::Known(tenant_shard_id.to_index());
|
||||
let timeline = self
|
||||
.get_active_tenant_timeline(tenant_shard_id.tenant_id, timeline_id, shard_selector)
|
||||
.await?;
|
||||
let lease = timeline.make_lsn_lease(lsn, ctx)?;
|
||||
let valid_until = lease
|
||||
.valid_until
|
||||
.duration_since(SystemTime::UNIX_EPOCH)
|
||||
.map_err(|e| QueryError::Other(e.into()))?;
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
|
||||
b"valid_until",
|
||||
)]))?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[Some(
|
||||
&valid_until.as_millis().to_be_bytes(),
|
||||
)]))?
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(shard_id))]
|
||||
async fn handle_get_rel_exists_request(
|
||||
&mut self,
|
||||
@@ -1413,69 +1370,18 @@ impl PageServerHandler {
|
||||
timeline_id: TimelineId,
|
||||
selector: ShardSelector,
|
||||
) -> Result<Arc<Timeline>, GetActiveTimelineError> {
|
||||
let tenant = self
|
||||
.get_active_tenant_with_timeout(tenant_id, selector, ACTIVE_TENANT_TIMEOUT)
|
||||
.await
|
||||
.map_err(GetActiveTimelineError::Tenant)?;
|
||||
let tenant = get_active_tenant_with_timeout(
|
||||
tenant_id,
|
||||
selector,
|
||||
ACTIVE_TENANT_TIMEOUT,
|
||||
&task_mgr::shutdown_token(),
|
||||
)
|
||||
.await
|
||||
.map_err(GetActiveTimelineError::Tenant)?;
|
||||
let timeline = tenant.get_timeline(timeline_id, true)?;
|
||||
set_tracing_field_shard_id(&timeline);
|
||||
Ok(timeline)
|
||||
}
|
||||
|
||||
/// Get a shard's [`Tenant`] in its active state, if present. If we don't find the shard and some
|
||||
/// slots for this tenant are `InProgress` then we will wait.
|
||||
/// If we find the [`Tenant`] and it's not yet in state [`TenantState::Active`], we will wait.
|
||||
///
|
||||
/// `timeout` is used as a total timeout for the whole wait operation.
|
||||
async fn get_active_tenant_with_timeout(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
shard_selector: ShardSelector,
|
||||
timeout: Duration,
|
||||
) -> Result<Arc<Tenant>, GetActiveTenantError> {
|
||||
let wait_start = Instant::now();
|
||||
let deadline = wait_start + timeout;
|
||||
|
||||
// Resolve TenantId to TenantShardId. This is usually a quick one-shot thing, the loop is
|
||||
// for handling the rare case that the slot we're accessing is InProgress.
|
||||
let tenant_shard = loop {
|
||||
let resolved = self
|
||||
.tenant_manager
|
||||
.resolve_attached_shard(&tenant_id, shard_selector);
|
||||
match resolved {
|
||||
ShardResolveResult::Found(tenant_shard) => break tenant_shard,
|
||||
ShardResolveResult::NotFound => {
|
||||
return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound(
|
||||
tenant_id,
|
||||
)));
|
||||
}
|
||||
ShardResolveResult::InProgress(barrier) => {
|
||||
// We can't authoritatively answer right now: wait for InProgress state
|
||||
// to end, then try again
|
||||
tokio::select! {
|
||||
_ = self.await_connection_cancelled() => {
|
||||
return Err(GetActiveTenantError::Cancelled)
|
||||
},
|
||||
_ = barrier.wait() => {
|
||||
// The barrier completed: proceed around the loop to try looking up again
|
||||
},
|
||||
_ = tokio::time::sleep(deadline.duration_since(Instant::now())) => {
|
||||
return Err(GetActiveTenantError::WaitForActiveTimeout {
|
||||
latest_state: None,
|
||||
wait_time: timeout,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
tracing::debug!("Waiting for tenant to enter active state...");
|
||||
tenant_shard
|
||||
.wait_to_become_active(deadline.duration_since(Instant::now()))
|
||||
.await?;
|
||||
Ok(tenant_shard)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
@@ -1517,7 +1423,6 @@ where
|
||||
_pgb: &mut PostgresBackend<IO>,
|
||||
_sm: &FeStartupPacket,
|
||||
) -> Result<(), QueryError> {
|
||||
fail::fail_point!("ps::connection-start::startup-packet");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1532,12 +1437,11 @@ where
|
||||
Err(QueryError::SimulatedConnectionError)
|
||||
});
|
||||
|
||||
fail::fail_point!("ps::connection-start::process-query");
|
||||
|
||||
let ctx = self.connection_ctx.attached_child();
|
||||
debug!("process query {query_string:?}");
|
||||
let parts = query_string.split_whitespace().collect::<Vec<_>>();
|
||||
if let Some(params) = parts.strip_prefix(&["pagestream_v2"]) {
|
||||
if query_string.starts_with("pagestream_v2 ") {
|
||||
let (_, params_raw) = query_string.split_at("pagestream_v2 ".len());
|
||||
let params = params_raw.split(' ').collect::<Vec<_>>();
|
||||
if params.len() != 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for pagestream command"
|
||||
@@ -1562,7 +1466,9 @@ where
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
} else if let Some(params) = parts.strip_prefix(&["pagestream"]) {
|
||||
} else if query_string.starts_with("pagestream ") {
|
||||
let (_, params_raw) = query_string.split_at("pagestream ".len());
|
||||
let params = params_raw.split(' ').collect::<Vec<_>>();
|
||||
if params.len() != 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for pagestream command"
|
||||
@@ -1587,7 +1493,10 @@ where
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
} else if let Some(params) = parts.strip_prefix(&["basebackup"]) {
|
||||
} else if query_string.starts_with("basebackup ") {
|
||||
let (_, params_raw) = query_string.split_at("basebackup ".len());
|
||||
let params = params_raw.split_whitespace().collect::<Vec<_>>();
|
||||
|
||||
if params.len() < 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for basebackup command"
|
||||
@@ -1605,23 +1514,26 @@ where
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
let lsn = if let Some(lsn_str) = params.get(2) {
|
||||
let lsn = if params.len() >= 3 {
|
||||
Some(
|
||||
Lsn::from_str(lsn_str)
|
||||
.with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
|
||||
Lsn::from_str(params[2])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let gzip = match params.get(3) {
|
||||
Some(&"--gzip") => true,
|
||||
None => false,
|
||||
Some(third_param) => {
|
||||
let gzip = if params.len() >= 4 {
|
||||
if params[3] == "--gzip" {
|
||||
true
|
||||
} else {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"Parameter in position 3 unknown {third_param}",
|
||||
)))
|
||||
"Parameter in position 3 unknown {}",
|
||||
params[3],
|
||||
)));
|
||||
}
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
|
||||
@@ -1645,7 +1557,10 @@ where
|
||||
res?;
|
||||
}
|
||||
// return pair of prev_lsn and last_lsn
|
||||
else if let Some(params) = parts.strip_prefix(&["get_last_record_rlsn"]) {
|
||||
else if query_string.starts_with("get_last_record_rlsn ") {
|
||||
let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len());
|
||||
let params = params_raw.split_whitespace().collect::<Vec<_>>();
|
||||
|
||||
if params.len() != 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for get_last_record_rlsn command"
|
||||
@@ -1687,7 +1602,10 @@ where
|
||||
.await?;
|
||||
}
|
||||
// same as basebackup, but result includes relational data as well
|
||||
else if let Some(params) = parts.strip_prefix(&["fullbackup"]) {
|
||||
else if query_string.starts_with("fullbackup ") {
|
||||
let (_, params_raw) = query_string.split_at("fullbackup ".len());
|
||||
let params = params_raw.split_whitespace().collect::<Vec<_>>();
|
||||
|
||||
if params.len() < 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for fullbackup command"
|
||||
@@ -1704,18 +1622,18 @@ where
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
// The caller is responsible for providing correct lsn and prev_lsn.
|
||||
let lsn = if let Some(lsn_str) = params.get(2) {
|
||||
let lsn = if params.len() > 2 {
|
||||
Some(
|
||||
Lsn::from_str(lsn_str)
|
||||
.with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
|
||||
Lsn::from_str(params[2])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let prev_lsn = if let Some(prev_lsn_str) = params.get(3) {
|
||||
let prev_lsn = if params.len() > 3 {
|
||||
Some(
|
||||
Lsn::from_str(prev_lsn_str)
|
||||
.with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?,
|
||||
Lsn::from_str(params[3])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[3]))?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
@@ -1748,7 +1666,8 @@ where
|
||||
// 2. Run:
|
||||
// cat my_backup/base.tar | psql -h $PAGESERVER \
|
||||
// -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION"
|
||||
let params = &parts[2..];
|
||||
let (_, params_raw) = query_string.split_at("import basebackup ".len());
|
||||
let params = params_raw.split_whitespace().collect::<Vec<_>>();
|
||||
if params.len() != 5 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for import basebackup command"
|
||||
@@ -1797,7 +1716,8 @@ where
|
||||
//
|
||||
// Files are scheduled to be persisted to remote storage, and the
|
||||
// caller should poll the http api to check when that is done.
|
||||
let params = &parts[2..];
|
||||
let (_, params_raw) = query_string.split_at("import wal ".len());
|
||||
let params = params_raw.split_whitespace().collect::<Vec<_>>();
|
||||
if params.len() != 4 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for import wal command"
|
||||
@@ -1835,45 +1755,10 @@ where
|
||||
// important because psycopg2 executes "SET datestyle TO 'ISO'"
|
||||
// on connect
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("lease lsn ") {
|
||||
let params = &parts[2..];
|
||||
if params.len() != 3 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number {} for lease lsn command",
|
||||
params.len()
|
||||
)));
|
||||
}
|
||||
|
||||
let tenant_shard_id = TenantShardId::from_str(params[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_shard_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
// The caller is responsible for providing correct lsn.
|
||||
let lsn = Lsn::from_str(params[2])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
|
||||
|
||||
match self
|
||||
.handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
|
||||
.await
|
||||
{
|
||||
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
|
||||
Err(e) => {
|
||||
error!("error obtaining lsn lease for {lsn}: {e:?}");
|
||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(
|
||||
&e.to_string(),
|
||||
Some(e.pg_error_code()),
|
||||
))?
|
||||
}
|
||||
};
|
||||
} else if let Some(params) = parts.strip_prefix(&["show"]) {
|
||||
} else if query_string.starts_with("show ") {
|
||||
// show <tenant_id>
|
||||
let (_, params_raw) = query_string.split_at("show ".len());
|
||||
let params = params_raw.split(' ').collect::<Vec<_>>();
|
||||
if params.len() != 1 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for config command"
|
||||
@@ -1886,13 +1771,13 @@ where
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
let tenant = self
|
||||
.get_active_tenant_with_timeout(
|
||||
tenant_id,
|
||||
ShardSelector::Zero,
|
||||
ACTIVE_TENANT_TIMEOUT,
|
||||
)
|
||||
.await?;
|
||||
let tenant = get_active_tenant_with_timeout(
|
||||
tenant_id,
|
||||
ShardSelector::Zero,
|
||||
ACTIVE_TENANT_TIMEOUT,
|
||||
&task_mgr::shutdown_token(),
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[
|
||||
RowDescriptor::int8_col(b"checkpoint_distance"),
|
||||
RowDescriptor::int8_col(b"checkpoint_timeout"),
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
use super::tenant::{PageReconstructError, Timeline};
|
||||
use crate::context::RequestContext;
|
||||
use crate::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use crate::metrics::WAL_INGEST;
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use crate::{aux_file, repository::*};
|
||||
@@ -34,16 +35,12 @@ use std::ops::ControlFlow;
|
||||
use std::ops::Range;
|
||||
use strum::IntoEnumIterator;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, info, trace, warn};
|
||||
use tracing::{debug, trace, warn};
|
||||
use utils::bin_ser::DeserializeError;
|
||||
use utils::vec_map::{VecMap, VecMapOrdering};
|
||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
|
||||
/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
|
||||
pub const MAX_AUX_FILE_DELTAS: usize = 1024;
|
||||
|
||||
/// Max number of aux-file-related delta layers. The compaction will create a new image layer once this threshold is reached.
|
||||
pub const MAX_AUX_FILE_V2_DELTAS: usize = 64;
|
||||
const MAX_AUX_FILE_DELTAS: usize = 1024;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum LsnForTimestamp {
|
||||
@@ -78,19 +75,11 @@ pub enum LsnForTimestamp {
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum CalculateLogicalSizeError {
|
||||
pub enum CalculateLogicalSizeError {
|
||||
#[error("cancelled")]
|
||||
Cancelled,
|
||||
|
||||
/// Something went wrong while reading the metadata we use to calculate logical size
|
||||
/// Note that cancellation variants of `PageReconstructError` are transformed to [`Self::Cancelled`]
|
||||
/// in the `From` implementation for this variant.
|
||||
#[error(transparent)]
|
||||
PageRead(PageReconstructError),
|
||||
|
||||
/// Something went wrong deserializing metadata that we read to calculate logical size
|
||||
#[error("decode error: {0}")]
|
||||
Decode(#[from] DeserializeError),
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
@@ -115,8 +104,10 @@ impl From<PageReconstructError> for CollectKeySpaceError {
|
||||
impl From<PageReconstructError> for CalculateLogicalSizeError {
|
||||
fn from(pre: PageReconstructError) -> Self {
|
||||
match pre {
|
||||
PageReconstructError::Cancelled => Self::Cancelled,
|
||||
_ => Self::PageRead(pre),
|
||||
PageReconstructError::AncestorStopping(_) | PageReconstructError::Cancelled => {
|
||||
Self::Cancelled
|
||||
}
|
||||
_ => Self::Other(pre.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -708,17 +699,13 @@ impl Timeline {
|
||||
.await
|
||||
.context("scan")?;
|
||||
let mut result = HashMap::new();
|
||||
let mut sz = 0;
|
||||
for (_, v) in kv {
|
||||
let v = v.context("get value")?;
|
||||
let v = aux_file::decode_file_value_bytes(&v).context("value decode")?;
|
||||
for (fname, content) in v {
|
||||
sz += fname.len();
|
||||
sz += content.len();
|
||||
result.insert(fname, content);
|
||||
}
|
||||
}
|
||||
self.aux_file_size_estimator.on_base_backup(sz);
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
@@ -727,11 +714,10 @@ impl Timeline {
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
||||
let current_policy = self.last_aux_file_policy.load();
|
||||
match current_policy {
|
||||
Some(AuxFilePolicy::V1) | None => self.list_aux_files_v1(lsn, ctx).await,
|
||||
Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await,
|
||||
Some(AuxFilePolicy::CrossValidation) => {
|
||||
match self.get_switch_aux_file_policy() {
|
||||
AuxFilePolicy::V1 => self.list_aux_files_v1(lsn, ctx).await,
|
||||
AuxFilePolicy::V2 => self.list_aux_files_v2(lsn, ctx).await,
|
||||
AuxFilePolicy::CrossValidation => {
|
||||
let v1_result = self.list_aux_files_v1(lsn, ctx).await;
|
||||
let v2_result = self.list_aux_files_v2(lsn, ctx).await;
|
||||
match (v1_result, v2_result) {
|
||||
@@ -769,7 +755,7 @@ impl Timeline {
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
pub(crate) async fn get_current_logical_size_non_incremental(
|
||||
pub async fn get_current_logical_size_non_incremental(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
@@ -778,7 +764,7 @@ impl Timeline {
|
||||
|
||||
// Fetch list of database dirs and iterate them
|
||||
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
|
||||
let dbdir = DbDirectory::des(&buf)?;
|
||||
let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
|
||||
|
||||
let mut total_size: u64 = 0;
|
||||
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
|
||||
@@ -1479,40 +1465,7 @@ impl<'a> DatadirModification<'a> {
|
||||
content: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let switch_policy = self.tline.get_switch_aux_file_policy();
|
||||
|
||||
let policy = {
|
||||
let current_policy = self.tline.last_aux_file_policy.load();
|
||||
// Allowed switch path:
|
||||
// * no aux files -> v1/v2/cross-validation
|
||||
// * cross-validation->v2
|
||||
|
||||
let current_policy = if current_policy.is_none() {
|
||||
// This path will only be hit once per tenant: we will decide the final policy in this code block.
|
||||
// The next call to `put_file` will always have `last_aux_file_policy != None`.
|
||||
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
|
||||
let aux_files_key_v1 = self.tline.list_aux_files_v1(lsn, ctx).await?;
|
||||
if aux_files_key_v1.is_empty() {
|
||||
None
|
||||
} else {
|
||||
self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
|
||||
Some(AuxFilePolicy::V1)
|
||||
}
|
||||
} else {
|
||||
current_policy
|
||||
};
|
||||
|
||||
if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) {
|
||||
self.tline.do_switch_aux_policy(switch_policy)?;
|
||||
info!(current=?current_policy, next=?switch_policy, "switching aux file policy");
|
||||
switch_policy
|
||||
} else {
|
||||
// This branch handles non-valid migration path, and the case that switch_policy == current_policy.
|
||||
// And actually, because the migration path always allow unspecified -> *, this unwrap_or will never be hit.
|
||||
current_policy.unwrap_or(AuxFilePolicy::default_tenant_config())
|
||||
}
|
||||
};
|
||||
|
||||
let policy = self.tline.get_switch_aux_file_policy();
|
||||
if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy {
|
||||
let key = aux_file::encode_aux_file_key(path);
|
||||
// retrieve the key from the engine
|
||||
@@ -1521,45 +1474,23 @@ impl<'a> DatadirModification<'a> {
|
||||
Err(PageReconstructError::MissingKey(_)) => None,
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val {
|
||||
let files = if let Some(ref old_val) = old_val {
|
||||
aux_file::decode_file_value(old_val)?
|
||||
} else {
|
||||
Vec::new()
|
||||
};
|
||||
let mut other_files = Vec::with_capacity(files.len());
|
||||
let mut modifying_file = None;
|
||||
for file @ (p, content) in files {
|
||||
if path == p {
|
||||
assert!(
|
||||
modifying_file.is_none(),
|
||||
"duplicated entries found for {}",
|
||||
path
|
||||
);
|
||||
modifying_file = Some(content);
|
||||
} else {
|
||||
other_files.push(file);
|
||||
}
|
||||
}
|
||||
let mut new_files = other_files;
|
||||
match (modifying_file, content.is_empty()) {
|
||||
(Some(old_content), false) => {
|
||||
self.tline
|
||||
.aux_file_size_estimator
|
||||
.on_update(old_content.len(), content.len());
|
||||
new_files.push((path, content));
|
||||
}
|
||||
(Some(old_content), true) => {
|
||||
self.tline
|
||||
.aux_file_size_estimator
|
||||
.on_remove(old_content.len());
|
||||
// not adding the file key to the final `new_files` vec.
|
||||
}
|
||||
(None, false) => {
|
||||
self.tline.aux_file_size_estimator.on_add(content.len());
|
||||
new_files.push((path, content));
|
||||
}
|
||||
(None, true) => warn!("removing non-existing aux file: {}", path),
|
||||
}
|
||||
let new_files = if content.is_empty() {
|
||||
files
|
||||
.into_iter()
|
||||
.filter(|(p, _)| &path != p)
|
||||
.collect::<Vec<_>>()
|
||||
} else {
|
||||
files
|
||||
.into_iter()
|
||||
.filter(|(p, _)| &path != p)
|
||||
.chain(std::iter::once((path, content)))
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
let new_val = aux_file::encode_file_value(&new_files)?;
|
||||
self.put(key, Value::Image(new_val.into()));
|
||||
}
|
||||
@@ -1612,7 +1543,8 @@ impl<'a> DatadirModification<'a> {
|
||||
aux_files.dir = Some(dir);
|
||||
}
|
||||
Err(
|
||||
e @ (PageReconstructError::Cancelled
|
||||
e @ (PageReconstructError::AncestorStopping(_)
|
||||
| PageReconstructError::Cancelled
|
||||
| PageReconstructError::AncestorLsnTimeout(_)),
|
||||
) => {
|
||||
// Important that we do not interpret a shutdown error as "not found" and thereby
|
||||
@@ -1719,6 +1651,8 @@ impl<'a> DatadirModification<'a> {
|
||||
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
let mut writer = self.tline.writer().await;
|
||||
|
||||
let timer = WAL_INGEST.time_spent_on_ingest.start_timer();
|
||||
|
||||
let pending_nblocks = self.pending_nblocks;
|
||||
self.pending_nblocks = 0;
|
||||
|
||||
@@ -1758,6 +1692,8 @@ impl<'a> DatadirModification<'a> {
|
||||
writer.update_directory_entries_count(kind, count as u64);
|
||||
}
|
||||
|
||||
timer.observe_duration();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1793,12 +1729,6 @@ impl<'a> DatadirModification<'a> {
|
||||
self.tline.get(key, lsn, ctx).await
|
||||
}
|
||||
|
||||
/// Only used during unit tests, force putting a key into the modification.
|
||||
#[cfg(test)]
|
||||
pub(crate) fn put_for_test(&mut self, key: Key, val: Value) {
|
||||
self.put(key, val);
|
||||
}
|
||||
|
||||
fn put(&mut self, key: Key, val: Value) {
|
||||
let values = self.pending_updates.entry(key).or_default();
|
||||
// Replace the previous value if it exists at the same lsn
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -238,13 +238,10 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
io_buf,
|
||||
Err(Error::new(
|
||||
ErrorKind::Other,
|
||||
format!("blob too large ({len} bytes)"),
|
||||
format!("blob too large ({} bytes)", len),
|
||||
)),
|
||||
);
|
||||
}
|
||||
if len > 0x0fff_ffff {
|
||||
tracing::warn!("writing blob above future limit ({len} bytes)");
|
||||
}
|
||||
let mut len_buf = (len as u32).to_be_bytes();
|
||||
len_buf[0] |= 0x80;
|
||||
io_buf.extend_from_slice(&len_buf[..]);
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
use anyhow::bail;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::models::CompactionAlgorithm;
|
||||
use pageserver_api::models::CompactionAlgorithmSettings;
|
||||
use pageserver_api::models::EvictionPolicy;
|
||||
use pageserver_api::models::{self, ThrottleConfig};
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
|
||||
@@ -321,7 +320,7 @@ pub struct TenantConf {
|
||||
pub compaction_period: Duration,
|
||||
// Level0 delta layer threshold for compaction.
|
||||
pub compaction_threshold: usize,
|
||||
pub compaction_algorithm: CompactionAlgorithmSettings,
|
||||
pub compaction_algorithm: CompactionAlgorithm,
|
||||
// Determines how much history is retained, to allow
|
||||
// branching and read replicas at an older point in time.
|
||||
// The unit is #of bytes of WAL.
|
||||
@@ -374,8 +373,6 @@ pub struct TenantConf {
|
||||
|
||||
/// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into
|
||||
/// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
|
||||
/// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
|
||||
/// file is written.
|
||||
pub switch_aux_file_policy: AuxFilePolicy,
|
||||
}
|
||||
|
||||
@@ -407,7 +404,7 @@ pub struct TenantConfOpt {
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
|
||||
pub compaction_algorithm: Option<CompactionAlgorithm>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
@@ -498,9 +495,7 @@ impl TenantConfOpt {
|
||||
.unwrap_or(global_conf.compaction_threshold),
|
||||
compaction_algorithm: self
|
||||
.compaction_algorithm
|
||||
.as_ref()
|
||||
.unwrap_or(&global_conf.compaction_algorithm)
|
||||
.clone(),
|
||||
.unwrap_or(global_conf.compaction_algorithm),
|
||||
gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon),
|
||||
gc_period: self.gc_period.unwrap_or(global_conf.gc_period),
|
||||
image_creation_threshold: self
|
||||
@@ -553,9 +548,7 @@ impl Default for TenantConf {
|
||||
compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
|
||||
.expect("cannot parse default compaction period"),
|
||||
compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
|
||||
compaction_algorithm: CompactionAlgorithmSettings {
|
||||
kind: DEFAULT_COMPACTION_ALGORITHM,
|
||||
},
|
||||
compaction_algorithm: DEFAULT_COMPACTION_ALGORITHM,
|
||||
gc_horizon: DEFAULT_GC_HORIZON,
|
||||
gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
|
||||
.expect("cannot parse default gc period"),
|
||||
@@ -581,7 +574,7 @@ impl Default for TenantConf {
|
||||
lazy_slru_download: false,
|
||||
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
|
||||
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
|
||||
switch_aux_file_policy: AuxFilePolicy::default_tenant_config(),
|
||||
switch_aux_file_policy: AuxFilePolicy::V1,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,7 +8,7 @@ use tokio::sync::OwnedMutexGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, instrument, Instrument};
|
||||
|
||||
use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
|
||||
use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
@@ -181,23 +181,25 @@ async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), Del
|
||||
|
||||
async fn remove_tenant_remote_delete_mark(
|
||||
conf: &PageServerConf,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
remote_storage: Option<&GenericRemoteStorage>,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
|
||||
backoff::retry(
|
||||
|| async { remote_storage.delete(&path, cancel).await },
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"remove_tenant_remote_delete_mark",
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
|
||||
.and_then(|x| x)
|
||||
.context("remove_tenant_remote_delete_mark")?;
|
||||
if let Some(remote_storage) = remote_storage {
|
||||
let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
|
||||
backoff::retry(
|
||||
|| async { remote_storage.delete(&path, cancel).await },
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"remove_tenant_remote_delete_mark",
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
|
||||
.and_then(|x| x)
|
||||
.context("remove_tenant_remote_delete_mark")?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -295,7 +297,7 @@ impl DeleteTenantFlow {
|
||||
#[instrument(skip_all)]
|
||||
pub(crate) async fn run(
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
tenant: Arc<Tenant>,
|
||||
cancel: &CancellationToken,
|
||||
@@ -306,7 +308,9 @@ impl DeleteTenantFlow {
|
||||
|
||||
let mut guard = Self::prepare(&tenant).await?;
|
||||
|
||||
if let Err(e) = Self::run_inner(&mut guard, conf, &remote_storage, &tenant, cancel).await {
|
||||
if let Err(e) =
|
||||
Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant, cancel).await
|
||||
{
|
||||
tenant.set_broken(format!("{e:#}")).await;
|
||||
return Err(e);
|
||||
}
|
||||
@@ -323,7 +327,7 @@ impl DeleteTenantFlow {
|
||||
async fn run_inner(
|
||||
guard: &mut OwnedMutexGuard<Self>,
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
remote_storage: Option<&GenericRemoteStorage>,
|
||||
tenant: &Tenant,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
@@ -335,9 +339,14 @@ impl DeleteTenantFlow {
|
||||
))?
|
||||
});
|
||||
|
||||
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
|
||||
.await
|
||||
.context("remote_mark")?;
|
||||
// IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress) so wont contend.
|
||||
// Though sounds scary, different mark name?
|
||||
// Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
|
||||
if let Some(remote_storage) = &remote_storage {
|
||||
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
|
||||
.await
|
||||
.context("remote_mark")?
|
||||
}
|
||||
|
||||
fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
@@ -474,7 +483,7 @@ impl DeleteTenantFlow {
|
||||
fn schedule_background(
|
||||
guard: OwnedMutexGuard<Self>,
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
tenant: Arc<Tenant>,
|
||||
) {
|
||||
@@ -503,7 +512,7 @@ impl DeleteTenantFlow {
|
||||
async fn background(
|
||||
mut guard: OwnedMutexGuard<Self>,
|
||||
conf: &PageServerConf,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
tenant: &Arc<Tenant>,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
@@ -542,7 +551,7 @@ impl DeleteTenantFlow {
|
||||
|
||||
remove_tenant_remote_delete_mark(
|
||||
conf,
|
||||
&remote_storage,
|
||||
remote_storage.as_ref(),
|
||||
&tenant.tenant_shard_id,
|
||||
&task_mgr::shutdown_token(),
|
||||
)
|
||||
|
||||
@@ -7,7 +7,7 @@ use itertools::Itertools;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::models::LocationConfigMode;
|
||||
use pageserver_api::shard::{
|
||||
ShardCount, ShardIdentity, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId,
|
||||
ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
|
||||
};
|
||||
use pageserver_api::upcall_api::ReAttachResponseTenant;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
@@ -16,9 +16,10 @@ use std::cmp::Ordering;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::ops::Deref;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::time::{Duration, Instant};
|
||||
use sysinfo::SystemExt;
|
||||
use tokio::fs;
|
||||
use utils::timeout::{timeout_cancellable, TimeoutCancellableError};
|
||||
|
||||
use anyhow::Context;
|
||||
use once_cell::sync::Lazy;
|
||||
@@ -45,8 +46,8 @@ use crate::tenant::delete::DeleteTenantFlow;
|
||||
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
|
||||
use crate::tenant::storage_layer::inmemory_layer;
|
||||
use crate::tenant::timeline::ShutdownMode;
|
||||
use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState};
|
||||
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
|
||||
use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
|
||||
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};
|
||||
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::fs_ext::PathExt;
|
||||
@@ -118,7 +119,6 @@ pub(crate) enum TenantsMapRemoveResult {
|
||||
|
||||
/// When resolving a TenantId to a shard, we may be looking for the 0th
|
||||
/// shard, or we might be looking for whichever shard holds a particular page.
|
||||
#[derive(Copy, Clone)]
|
||||
pub(crate) enum ShardSelector {
|
||||
/// Only return the 0th shard, if it is present. If a non-0th shard is present,
|
||||
/// ignore it.
|
||||
@@ -127,8 +127,6 @@ pub(crate) enum ShardSelector {
|
||||
First,
|
||||
/// Pick the shard that holds this key
|
||||
Page(Key),
|
||||
/// The shard ID is known: pick the given shard
|
||||
Known(ShardIndex),
|
||||
}
|
||||
|
||||
/// A convenience for use with the re_attach ControlPlaneClient function: rather
|
||||
@@ -171,14 +169,6 @@ impl TenantStartupMode {
|
||||
}
|
||||
}
|
||||
|
||||
/// Result type for looking up a TenantId to a specific shard
|
||||
pub(crate) enum ShardResolveResult {
|
||||
NotFound,
|
||||
Found(Arc<Tenant>),
|
||||
// Wait for this barrrier, then query again
|
||||
InProgress(utils::completion::Barrier),
|
||||
}
|
||||
|
||||
impl TenantsMap {
|
||||
/// Convenience function for typical usage, where we want to get a `Tenant` object, for
|
||||
/// working with attached tenants. If the TenantId is in the map but in Secondary state,
|
||||
@@ -192,6 +182,51 @@ impl TenantsMap {
|
||||
}
|
||||
}
|
||||
|
||||
/// A page service client sends a TenantId, and to look up the correct Tenant we must
|
||||
/// resolve this to a fully qualified TenantShardId.
|
||||
fn resolve_attached_shard(
|
||||
&self,
|
||||
tenant_id: &TenantId,
|
||||
selector: ShardSelector,
|
||||
) -> Option<TenantShardId> {
|
||||
let mut want_shard = None;
|
||||
match self {
|
||||
TenantsMap::Initializing => None,
|
||||
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
|
||||
for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
|
||||
// Ignore all slots that don't contain an attached tenant
|
||||
let tenant = match &slot.1 {
|
||||
TenantSlot::Attached(t) => t,
|
||||
_ => continue,
|
||||
};
|
||||
|
||||
match selector {
|
||||
ShardSelector::First => return Some(*slot.0),
|
||||
ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
|
||||
return Some(*slot.0)
|
||||
}
|
||||
ShardSelector::Page(key) => {
|
||||
// First slot we see for this tenant, calculate the expected shard number
|
||||
// for the key: we will use this for checking if this and subsequent
|
||||
// slots contain the key, rather than recalculating the hash each time.
|
||||
if want_shard.is_none() {
|
||||
want_shard = Some(tenant.shard_identity.get_shard_number(&key));
|
||||
}
|
||||
|
||||
if Some(tenant.shard_identity.number) == want_shard {
|
||||
return Some(*slot.0);
|
||||
}
|
||||
}
|
||||
_ => continue,
|
||||
}
|
||||
}
|
||||
|
||||
// Fall through: we didn't find an acceptable shard
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Only for use from DeleteTenantFlow. This method directly removes a TenantSlot from the map.
|
||||
///
|
||||
/// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded
|
||||
@@ -356,17 +391,22 @@ async fn init_load_generations(
|
||||
// deletion list entries may still be valid. We provide that by pushing a recovery operation into
|
||||
// the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions
|
||||
// are processed, even though we don't block on recovery completing here.
|
||||
let attached_tenants = generations
|
||||
.iter()
|
||||
.flat_map(|(id, start_mode)| {
|
||||
match start_mode {
|
||||
TenantStartupMode::Attached((_mode, generation)) => Some(generation),
|
||||
TenantStartupMode::Secondary => None,
|
||||
}
|
||||
.map(|gen| (*id, *gen))
|
||||
})
|
||||
.collect();
|
||||
resources.deletion_queue_client.recover(attached_tenants)?;
|
||||
//
|
||||
// Must only do this if remote storage is enabled, otherwise deletion queue
|
||||
// is not running and channel push will fail.
|
||||
if resources.remote_storage.is_some() {
|
||||
let attached_tenants = generations
|
||||
.iter()
|
||||
.flat_map(|(id, start_mode)| {
|
||||
match start_mode {
|
||||
TenantStartupMode::Attached((_mode, generation)) => Some(generation),
|
||||
TenantStartupMode::Secondary => None,
|
||||
}
|
||||
.map(|gen| (*id, *gen))
|
||||
})
|
||||
.collect();
|
||||
resources.deletion_queue_client.recover(attached_tenants)?;
|
||||
}
|
||||
|
||||
Ok(Some(generations))
|
||||
}
|
||||
@@ -420,6 +460,53 @@ fn load_tenant_config(
|
||||
}
|
||||
};
|
||||
|
||||
// Clean up legacy `metadata` files.
|
||||
// Doing it here because every single tenant directory is visited here.
|
||||
// In any later code, there's different treatment of tenant dirs
|
||||
// ... depending on whether the tenant is in re-attach response or not
|
||||
// ... epending on whether the tenant is ignored or not
|
||||
assert_eq!(
|
||||
&conf.tenant_path(&tenant_shard_id),
|
||||
&tenant_dir_path,
|
||||
"later use of conf....path() methods would be dubious"
|
||||
);
|
||||
let timelines: Vec<TimelineId> = match conf.timelines_path(&tenant_shard_id).read_dir_utf8() {
|
||||
Ok(iter) => {
|
||||
let mut timelines = Vec::new();
|
||||
for res in iter {
|
||||
let p = res?;
|
||||
let Some(timeline_id) = p.file_name().parse::<TimelineId>().ok() else {
|
||||
// skip any entries that aren't TimelineId, such as
|
||||
// - *.___temp dirs
|
||||
// - unfinished initdb uploads (test_non_uploaded_root_timeline_is_deleted_after_restart)
|
||||
continue;
|
||||
};
|
||||
timelines.push(timeline_id);
|
||||
}
|
||||
timelines
|
||||
}
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => vec![],
|
||||
Err(e) => return Err(anyhow::anyhow!(e)),
|
||||
};
|
||||
for timeline_id in timelines {
|
||||
let timeline_path = &conf.timeline_path(&tenant_shard_id, &timeline_id);
|
||||
let metadata_path = timeline_path.join(METADATA_FILE_NAME);
|
||||
match std::fs::remove_file(&metadata_path) {
|
||||
Ok(()) => {
|
||||
crashsafe::fsync(timeline_path)
|
||||
.context("fsync timeline dir after removing legacy metadata file")?;
|
||||
info!("removed legacy metadata file at {metadata_path}");
|
||||
}
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
|
||||
// something removed the file earlier, or it was never there
|
||||
// We don't care, this software version doesn't write it again, so, we're good.
|
||||
}
|
||||
Err(e) => {
|
||||
anyhow::bail!("remove legacy metadata file: {e}: {metadata_path}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
|
||||
if tenant_ignore_mark_file.exists() {
|
||||
info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
|
||||
@@ -524,7 +611,6 @@ pub async fn init_tenant_mgr(
|
||||
TenantSlot::Attached(Tenant::create_broken_tenant(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
resources.remote_storage.clone(),
|
||||
format!("{}", e),
|
||||
)),
|
||||
);
|
||||
@@ -717,7 +803,6 @@ fn tenant_spawn(
|
||||
"Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
|
||||
);
|
||||
|
||||
let remote_storage = resources.remote_storage.clone();
|
||||
let tenant = match Tenant::spawn(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
@@ -732,7 +817,7 @@ fn tenant_spawn(
|
||||
Ok(tenant) => tenant,
|
||||
Err(e) => {
|
||||
error!("Failed to spawn tenant {tenant_shard_id}, reason: {e:#}");
|
||||
Tenant::create_broken_tenant(conf, tenant_shard_id, remote_storage, format!("{e:#}"))
|
||||
Tenant::create_broken_tenant(conf, tenant_shard_id, format!("{e:#}"))
|
||||
}
|
||||
};
|
||||
|
||||
@@ -2018,77 +2103,6 @@ impl TenantManager {
|
||||
|
||||
Ok(reparented)
|
||||
}
|
||||
|
||||
/// A page service client sends a TenantId, and to look up the correct Tenant we must
|
||||
/// resolve this to a fully qualified TenantShardId.
|
||||
///
|
||||
/// During shard splits: we shall see parent shards in InProgress state and skip them, and
|
||||
/// instead match on child shards which should appear in Attached state. Very early in a shard
|
||||
/// split, or in other cases where a shard is InProgress, we will return our own InProgress result
|
||||
/// to instruct the caller to wait for that to finish before querying again.
|
||||
pub(crate) fn resolve_attached_shard(
|
||||
&self,
|
||||
tenant_id: &TenantId,
|
||||
selector: ShardSelector,
|
||||
) -> ShardResolveResult {
|
||||
let tenants = self.tenants.read().unwrap();
|
||||
let mut want_shard = None;
|
||||
let mut any_in_progress = None;
|
||||
|
||||
match &*tenants {
|
||||
TenantsMap::Initializing => ShardResolveResult::NotFound,
|
||||
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
|
||||
for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
|
||||
// Ignore all slots that don't contain an attached tenant
|
||||
let tenant = match &slot.1 {
|
||||
TenantSlot::Attached(t) => t,
|
||||
TenantSlot::InProgress(barrier) => {
|
||||
// We might still find a usable shard, but in case we don't, remember that
|
||||
// we saw at least one InProgress slot, so that we can distinguish this case
|
||||
// from a simple NotFound in our return value.
|
||||
any_in_progress = Some(barrier.clone());
|
||||
continue;
|
||||
}
|
||||
_ => continue,
|
||||
};
|
||||
|
||||
match selector {
|
||||
ShardSelector::First => return ShardResolveResult::Found(tenant.clone()),
|
||||
ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
|
||||
return ShardResolveResult::Found(tenant.clone())
|
||||
}
|
||||
ShardSelector::Page(key) => {
|
||||
// First slot we see for this tenant, calculate the expected shard number
|
||||
// for the key: we will use this for checking if this and subsequent
|
||||
// slots contain the key, rather than recalculating the hash each time.
|
||||
if want_shard.is_none() {
|
||||
want_shard = Some(tenant.shard_identity.get_shard_number(&key));
|
||||
}
|
||||
|
||||
if Some(tenant.shard_identity.number) == want_shard {
|
||||
return ShardResolveResult::Found(tenant.clone());
|
||||
}
|
||||
}
|
||||
ShardSelector::Known(shard)
|
||||
if tenant.shard_identity.shard_index() == shard =>
|
||||
{
|
||||
return ShardResolveResult::Found(tenant.clone());
|
||||
}
|
||||
_ => continue,
|
||||
}
|
||||
}
|
||||
|
||||
// Fall through: we didn't find a slot that was in Attached state & matched our selector. If
|
||||
// we found one or more InProgress slot, indicate to caller that they should retry later. Otherwise
|
||||
// this requested shard simply isn't found.
|
||||
if let Some(barrier) = any_in_progress {
|
||||
ShardResolveResult::InProgress(barrier)
|
||||
} else {
|
||||
ShardResolveResult::NotFound
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
@@ -2137,6 +2151,105 @@ pub(crate) enum GetActiveTenantError {
|
||||
Broken(String),
|
||||
}
|
||||
|
||||
/// Get a [`Tenant`] in its active state. If the tenant_id is currently in [`TenantSlot::InProgress`]
|
||||
/// state, then wait for up to `timeout`. If the [`Tenant`] is not currently in [`TenantState::Active`],
|
||||
/// then wait for up to `timeout` (minus however long we waited for the slot).
|
||||
pub(crate) async fn get_active_tenant_with_timeout(
|
||||
tenant_id: TenantId,
|
||||
shard_selector: ShardSelector,
|
||||
timeout: Duration,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Arc<Tenant>, GetActiveTenantError> {
|
||||
enum WaitFor {
|
||||
Barrier(utils::completion::Barrier),
|
||||
Tenant(Arc<Tenant>),
|
||||
}
|
||||
|
||||
let wait_start = Instant::now();
|
||||
let deadline = wait_start + timeout;
|
||||
|
||||
let (wait_for, tenant_shard_id) = {
|
||||
let locked = TENANTS.read().unwrap();
|
||||
|
||||
// Resolve TenantId to TenantShardId
|
||||
let tenant_shard_id = locked
|
||||
.resolve_attached_shard(&tenant_id, shard_selector)
|
||||
.ok_or(GetActiveTenantError::NotFound(GetTenantError::NotFound(
|
||||
tenant_id,
|
||||
)))?;
|
||||
|
||||
let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
|
||||
.map_err(GetTenantError::MapState)?;
|
||||
match peek_slot {
|
||||
Some(TenantSlot::Attached(tenant)) => {
|
||||
match tenant.current_state() {
|
||||
TenantState::Active => {
|
||||
// Fast path: we don't need to do any async waiting.
|
||||
return Ok(tenant.clone());
|
||||
}
|
||||
_ => {
|
||||
tenant.activate_now();
|
||||
(WaitFor::Tenant(tenant.clone()), tenant_shard_id)
|
||||
}
|
||||
}
|
||||
}
|
||||
Some(TenantSlot::Secondary(_)) => {
|
||||
return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
|
||||
tenant_shard_id,
|
||||
)))
|
||||
}
|
||||
Some(TenantSlot::InProgress(barrier)) => {
|
||||
(WaitFor::Barrier(barrier.clone()), tenant_shard_id)
|
||||
}
|
||||
None => {
|
||||
return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound(
|
||||
tenant_id,
|
||||
)))
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let tenant = match wait_for {
|
||||
WaitFor::Barrier(barrier) => {
|
||||
tracing::debug!("Waiting for tenant InProgress state to pass...");
|
||||
timeout_cancellable(
|
||||
deadline.duration_since(Instant::now()),
|
||||
cancel,
|
||||
barrier.wait(),
|
||||
)
|
||||
.await
|
||||
.map_err(|e| match e {
|
||||
TimeoutCancellableError::Timeout => GetActiveTenantError::WaitForActiveTimeout {
|
||||
latest_state: None,
|
||||
wait_time: wait_start.elapsed(),
|
||||
},
|
||||
TimeoutCancellableError::Cancelled => GetActiveTenantError::Cancelled,
|
||||
})?;
|
||||
{
|
||||
let locked = TENANTS.read().unwrap();
|
||||
let peek_slot =
|
||||
tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
|
||||
.map_err(GetTenantError::MapState)?;
|
||||
match peek_slot {
|
||||
Some(TenantSlot::Attached(tenant)) => tenant.clone(),
|
||||
_ => {
|
||||
return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
|
||||
tenant_shard_id,
|
||||
)))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
WaitFor::Tenant(tenant) => tenant,
|
||||
};
|
||||
|
||||
tracing::debug!("Waiting for tenant to enter active state...");
|
||||
tenant
|
||||
.wait_to_become_active(deadline.duration_since(Instant::now()))
|
||||
.await?;
|
||||
Ok(tenant)
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum DeleteTimelineError {
|
||||
#[error("Tenant {0}")]
|
||||
@@ -2163,7 +2276,7 @@ pub(crate) async fn load_tenant(
|
||||
tenant_id: TenantId,
|
||||
generation: Generation,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), TenantMapInsertError> {
|
||||
@@ -2767,79 +2880,86 @@ use {
|
||||
utils::http::error::ApiError,
|
||||
};
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))]
|
||||
pub(crate) async fn immediate_gc(
|
||||
pub(crate) fn immediate_gc(
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
gc_req: TimelineGcRequest,
|
||||
cancel: CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<GcResult, ApiError> {
|
||||
let tenant = {
|
||||
let guard = TENANTS.read().unwrap();
|
||||
guard
|
||||
.get(&tenant_shard_id)
|
||||
.cloned()
|
||||
.with_context(|| format!("tenant {tenant_shard_id}"))
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?
|
||||
};
|
||||
) -> Result<tokio::sync::oneshot::Receiver<Result<GcResult, anyhow::Error>>, ApiError> {
|
||||
let guard = TENANTS.read().unwrap();
|
||||
|
||||
let tenant = guard
|
||||
.get(&tenant_shard_id)
|
||||
.cloned()
|
||||
.with_context(|| format!("tenant {tenant_shard_id}"))
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
|
||||
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
|
||||
// Use tenant's pitr setting
|
||||
let pitr = tenant.get_pitr_interval();
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
// Run in task_mgr to avoid race with tenant_detach operation
|
||||
let ctx: RequestContext =
|
||||
ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
|
||||
let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
|
||||
let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
|
||||
let span = info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);
|
||||
|
||||
let _gate_guard = tenant.gate.enter().map_err(|_| ApiError::ShuttingDown)?;
|
||||
// TODO: spawning is redundant now, need to hold the gate
|
||||
task_mgr::spawn(
|
||||
&tokio::runtime::Handle::current(),
|
||||
TaskKind::GarbageCollector,
|
||||
Some(tenant_shard_id),
|
||||
Some(timeline_id),
|
||||
&format!("timeline_gc_handler garbage collection run for tenant {tenant_shard_id} timeline {timeline_id}"),
|
||||
false,
|
||||
async move {
|
||||
fail::fail_point!("immediate_gc_task_pre");
|
||||
|
||||
fail::fail_point!("immediate_gc_task_pre");
|
||||
#[allow(unused_mut)]
|
||||
let mut result = tenant
|
||||
.gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
|
||||
.await;
|
||||
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
|
||||
// better once the types support it.
|
||||
|
||||
#[allow(unused_mut)]
|
||||
let mut result = tenant
|
||||
.gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
|
||||
.await;
|
||||
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
|
||||
// better once the types support it.
|
||||
#[cfg(feature = "testing")]
|
||||
{
|
||||
// we need to synchronize with drop completion for python tests without polling for
|
||||
// log messages
|
||||
if let Ok(result) = result.as_mut() {
|
||||
let mut js = tokio::task::JoinSet::new();
|
||||
for layer in std::mem::take(&mut result.doomed_layers) {
|
||||
js.spawn(layer.wait_drop());
|
||||
}
|
||||
tracing::info!(total = js.len(), "starting to wait for the gc'd layers to be dropped");
|
||||
while let Some(res) = js.join_next().await {
|
||||
res.expect("wait_drop should not panic");
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
{
|
||||
// we need to synchronize with drop completion for python tests without polling for
|
||||
// log messages
|
||||
if let Ok(result) = result.as_mut() {
|
||||
let mut js = tokio::task::JoinSet::new();
|
||||
for layer in std::mem::take(&mut result.doomed_layers) {
|
||||
js.spawn(layer.wait_drop());
|
||||
let timeline = tenant.get_timeline(timeline_id, false).ok();
|
||||
let rtc = timeline.as_ref().and_then(|x| x.remote_client.as_ref());
|
||||
|
||||
if let Some(rtc) = rtc {
|
||||
// layer drops schedule actions on remote timeline client to actually do the
|
||||
// deletions; don't care about the shutdown error, just exit fast
|
||||
drop(rtc.wait_completion().await);
|
||||
}
|
||||
}
|
||||
tracing::info!(
|
||||
total = js.len(),
|
||||
"starting to wait for the gc'd layers to be dropped"
|
||||
);
|
||||
while let Some(res) = js.join_next().await {
|
||||
res.expect("wait_drop should not panic");
|
||||
|
||||
match task_done.send(result) {
|
||||
Ok(_) => (),
|
||||
Err(result) => error!("failed to send gc result: {result:?}"),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
.instrument(span)
|
||||
);
|
||||
|
||||
let timeline = tenant.get_timeline(timeline_id, false).ok();
|
||||
let rtc = timeline.as_ref().map(|x| &x.remote_client);
|
||||
// drop the guard until after we've spawned the task so that timeline shutdown will wait for the task
|
||||
drop(guard);
|
||||
|
||||
if let Some(rtc) = rtc {
|
||||
// layer drops schedule actions on remote timeline client to actually do the
|
||||
// deletions; don't care about the shutdown error, just exit fast
|
||||
drop(rtc.wait_completion().await);
|
||||
}
|
||||
}
|
||||
|
||||
result.map_err(|e| match e {
|
||||
GcError::TenantCancelled | GcError::TimelineCancelled => ApiError::ShuttingDown,
|
||||
GcError::TimelineNotFound => {
|
||||
ApiError::NotFound(anyhow::anyhow!("Timeline not found").into())
|
||||
}
|
||||
other => ApiError::InternalServerError(anyhow::anyhow!(other)),
|
||||
})
|
||||
Ok(wait_task_done)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -189,7 +189,6 @@ use camino::Utf8Path;
|
||||
use chrono::{NaiveDateTime, Utc};
|
||||
|
||||
pub(crate) use download::download_initdb_tar_zst;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::shard::{ShardIndex, TenantShardId};
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -197,7 +196,6 @@ pub(crate) use upload::upload_initdb_dir;
|
||||
use utils::backoff::{
|
||||
self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
};
|
||||
use utils::pausable_failpoint;
|
||||
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
@@ -319,7 +317,7 @@ pub struct RemoteTimelineClient {
|
||||
|
||||
upload_queue: Mutex<UploadQueue>,
|
||||
|
||||
pub(crate) metrics: Arc<RemoteTimelineClientMetrics>,
|
||||
metrics: Arc<RemoteTimelineClientMetrics>,
|
||||
|
||||
storage_impl: GenericRemoteStorage,
|
||||
|
||||
@@ -463,11 +461,11 @@ impl RemoteTimelineClient {
|
||||
} else {
|
||||
0
|
||||
};
|
||||
self.metrics.remote_physical_size_gauge.set(size);
|
||||
self.metrics.remote_physical_size_set(size);
|
||||
}
|
||||
|
||||
pub fn get_remote_physical_size(&self) -> u64 {
|
||||
self.metrics.remote_physical_size_gauge.get()
|
||||
self.metrics.remote_physical_size_get()
|
||||
}
|
||||
|
||||
//
|
||||
@@ -520,7 +518,6 @@ impl RemoteTimelineClient {
|
||||
&self,
|
||||
layer_file_name: &LayerName,
|
||||
layer_metadata: &LayerFileMetadata,
|
||||
local_path: &Utf8Path,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<u64> {
|
||||
@@ -539,7 +536,6 @@ impl RemoteTimelineClient {
|
||||
self.timeline_id,
|
||||
layer_file_name,
|
||||
layer_metadata,
|
||||
local_path,
|
||||
cancel,
|
||||
ctx,
|
||||
)
|
||||
@@ -613,17 +609,6 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Launch an index-file upload operation in the background, with only aux_file_policy flag updated.
|
||||
pub(crate) fn schedule_index_upload_for_aux_file_policy_update(
|
||||
self: &Arc<Self>,
|
||||
last_aux_file_policy: Option<AuxFilePolicy>,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
upload_queue.last_aux_file_policy = last_aux_file_policy;
|
||||
self.schedule_index_upload(upload_queue);
|
||||
Ok(())
|
||||
}
|
||||
///
|
||||
/// Launch an index-file upload operation in the background, if necessary.
|
||||
///
|
||||
@@ -1142,11 +1127,6 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn is_deleting(&self) -> bool {
|
||||
let mut locked = self.upload_queue.lock().unwrap();
|
||||
locked.stopped_mut().is_ok()
|
||||
}
|
||||
|
||||
pub(crate) async fn preserve_initdb_archive(
|
||||
self: &Arc<Self>,
|
||||
tenant_id: &TenantId,
|
||||
@@ -1193,7 +1173,7 @@ impl RemoteTimelineClient {
|
||||
&self.storage_impl,
|
||||
uploaded.local_path(),
|
||||
&remote_path,
|
||||
uploaded.metadata().file_size,
|
||||
uploaded.metadata().file_size(),
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
@@ -1574,7 +1554,7 @@ impl RemoteTimelineClient {
|
||||
&self.storage_impl,
|
||||
local_path,
|
||||
&remote_path,
|
||||
layer_metadata.file_size,
|
||||
layer_metadata.file_size(),
|
||||
&self.cancel,
|
||||
)
|
||||
.measure_remote_op(
|
||||
@@ -1769,7 +1749,7 @@ impl RemoteTimelineClient {
|
||||
UploadOp::UploadLayer(_, m) => (
|
||||
RemoteOpFileKind::Layer,
|
||||
RemoteOpKind::Upload,
|
||||
RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size),
|
||||
RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size()),
|
||||
),
|
||||
UploadOp::UploadMetadata(_, _) => (
|
||||
RemoteOpFileKind::Index,
|
||||
@@ -1864,7 +1844,6 @@ impl RemoteTimelineClient {
|
||||
dangling_files: HashMap::default(),
|
||||
shutting_down: false,
|
||||
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
|
||||
last_aux_file_policy: initialized.last_aux_file_policy,
|
||||
};
|
||||
|
||||
let upload_queue = std::mem::replace(
|
||||
@@ -2153,7 +2132,7 @@ mod tests {
|
||||
tenant_ctx: _tenant_ctx,
|
||||
} = test_setup;
|
||||
|
||||
let client = &timeline.remote_client;
|
||||
let client = timeline.remote_client.as_ref().unwrap();
|
||||
|
||||
// Download back the index.json, and check that the list of files is correct
|
||||
let initial_index_part = match client
|
||||
@@ -2344,7 +2323,7 @@ mod tests {
|
||||
timeline,
|
||||
..
|
||||
} = TestSetup::new("metrics").await.unwrap();
|
||||
let client = &timeline.remote_client;
|
||||
let client = timeline.remote_client.as_ref().unwrap();
|
||||
|
||||
let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
|
||||
let local_path = local_layer_path(
|
||||
|
||||
@@ -21,6 +21,7 @@ use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
|
||||
use crate::tenant::storage_layer::layer::local_layer_path;
|
||||
use crate::tenant::storage_layer::LayerName;
|
||||
use crate::tenant::Generation;
|
||||
use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
|
||||
@@ -49,13 +50,19 @@ pub async fn download_layer_file<'a>(
|
||||
timeline_id: TimelineId,
|
||||
layer_file_name: &'a LayerName,
|
||||
layer_metadata: &'a LayerFileMetadata,
|
||||
local_path: &Utf8Path,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<u64, DownloadError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id);
|
||||
let local_path = local_layer_path(
|
||||
conf,
|
||||
&tenant_shard_id,
|
||||
&timeline_id,
|
||||
layer_file_name,
|
||||
&layer_metadata.generation,
|
||||
);
|
||||
|
||||
let remote_path = remote_layer_path(
|
||||
&tenant_shard_id.tenant_id,
|
||||
@@ -75,7 +82,7 @@ pub async fn download_layer_file<'a>(
|
||||
// For more context about durable_rename check this email from postgres mailing list:
|
||||
// https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
|
||||
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
|
||||
let temp_file_path = path_with_suffix_extension(local_path, TEMP_DOWNLOAD_EXTENSION);
|
||||
let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
|
||||
|
||||
let bytes_amount = download_retry(
|
||||
|| async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await },
|
||||
@@ -84,7 +91,7 @@ pub async fn download_layer_file<'a>(
|
||||
)
|
||||
.await?;
|
||||
|
||||
let expected = layer_metadata.file_size;
|
||||
let expected = layer_metadata.file_size();
|
||||
if expected != bytes_amount {
|
||||
return Err(DownloadError::Other(anyhow!(
|
||||
"According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file {temp_file_path:?}",
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use chrono::NaiveDateTime;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::TimelineId;
|
||||
|
||||
@@ -17,6 +16,46 @@ use pageserver_api::shard::ShardIndex;
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// Metadata gathered for each of the layer files.
|
||||
///
|
||||
/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
|
||||
/// might have less or more metadata depending if upgrading or rolling back an upgrade.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
|
||||
//#[cfg_attr(test, derive(Default))]
|
||||
pub struct LayerFileMetadata {
|
||||
file_size: u64,
|
||||
|
||||
pub(crate) generation: Generation,
|
||||
|
||||
pub(crate) shard: ShardIndex,
|
||||
}
|
||||
|
||||
impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
|
||||
fn from(other: &IndexLayerMetadata) -> Self {
|
||||
LayerFileMetadata {
|
||||
file_size: other.file_size,
|
||||
generation: other.generation,
|
||||
shard: other.shard,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl LayerFileMetadata {
|
||||
pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
|
||||
LayerFileMetadata {
|
||||
file_size,
|
||||
generation,
|
||||
shard,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn file_size(&self) -> u64 {
|
||||
self.file_size
|
||||
}
|
||||
}
|
||||
|
||||
// TODO seems like another part of the remote storage file format
|
||||
// compatibility issue, see https://github.com/neondatabase/neon/issues/3072
|
||||
/// In-memory representation of an `index_part.json` file
|
||||
///
|
||||
/// Contains the data about all files in the timeline, present remotely and its metadata.
|
||||
@@ -37,7 +76,7 @@ pub struct IndexPart {
|
||||
///
|
||||
/// Older versions of `IndexPart` will not have this property or have only a part of metadata
|
||||
/// that latest version stores.
|
||||
pub layer_metadata: HashMap<LayerName, LayerFileMetadata>,
|
||||
pub layer_metadata: HashMap<LayerName, IndexLayerMetadata>,
|
||||
|
||||
// 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
|
||||
// It's duplicated for convenience when reading the serialized structure, but is
|
||||
@@ -49,16 +88,6 @@ pub struct IndexPart {
|
||||
|
||||
#[serde(default)]
|
||||
pub(crate) lineage: Lineage,
|
||||
|
||||
/// Describes the kind of aux files stored in the timeline.
|
||||
///
|
||||
/// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable.
|
||||
/// A V1 setting after V2 files have been committed is not accepted.
|
||||
///
|
||||
/// None means no aux files have been written to the storage before the point
|
||||
/// when this flag is introduced.
|
||||
#[serde(skip_serializing_if = "Option::is_none", default)]
|
||||
pub(crate) last_aux_file_policy: Option<AuxFilePolicy>,
|
||||
}
|
||||
|
||||
impl IndexPart {
|
||||
@@ -72,11 +101,10 @@ impl IndexPart {
|
||||
/// is always generated from the keys of `layer_metadata`)
|
||||
/// - 4: timeline_layers is fully removed.
|
||||
/// - 5: lineage was added
|
||||
/// - 6: last_aux_file_policy is added.
|
||||
const LATEST_VERSION: usize = 6;
|
||||
const LATEST_VERSION: usize = 5;
|
||||
|
||||
// Versions we may see when reading from a bucket.
|
||||
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6];
|
||||
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5];
|
||||
|
||||
pub const FILE_NAME: &'static str = "index_part.json";
|
||||
|
||||
@@ -85,9 +113,11 @@ impl IndexPart {
|
||||
disk_consistent_lsn: Lsn,
|
||||
metadata: TimelineMetadata,
|
||||
lineage: Lineage,
|
||||
last_aux_file_policy: Option<AuxFilePolicy>,
|
||||
) -> Self {
|
||||
let layer_metadata = layers_and_metadata.clone();
|
||||
let layer_metadata = layers_and_metadata
|
||||
.iter()
|
||||
.map(|(k, v)| (k.to_owned(), IndexLayerMetadata::from(v)))
|
||||
.collect();
|
||||
|
||||
Self {
|
||||
version: Self::LATEST_VERSION,
|
||||
@@ -96,7 +126,6 @@ impl IndexPart {
|
||||
metadata,
|
||||
deleted_at: None,
|
||||
lineage,
|
||||
last_aux_file_policy,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -126,13 +155,8 @@ impl IndexPart {
|
||||
example_metadata.disk_consistent_lsn(),
|
||||
example_metadata,
|
||||
Default::default(),
|
||||
Some(AuxFilePolicy::V1),
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn last_aux_file_policy(&self) -> Option<AuxFilePolicy> {
|
||||
self.last_aux_file_policy
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&UploadQueueInitialized> for IndexPart {
|
||||
@@ -141,22 +165,13 @@ impl From<&UploadQueueInitialized> for IndexPart {
|
||||
let metadata = uq.latest_metadata.clone();
|
||||
let lineage = uq.latest_lineage.clone();
|
||||
|
||||
Self::new(
|
||||
&uq.latest_files,
|
||||
disk_consistent_lsn,
|
||||
metadata,
|
||||
lineage,
|
||||
uq.last_aux_file_policy,
|
||||
)
|
||||
Self::new(&uq.latest_files, disk_consistent_lsn, metadata, lineage)
|
||||
}
|
||||
}
|
||||
|
||||
/// Metadata gathered for each of the layer files.
|
||||
///
|
||||
/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
|
||||
/// might have less or more metadata depending if upgrading or rolling back an upgrade.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
|
||||
pub struct LayerFileMetadata {
|
||||
/// Serialized form of [`LayerFileMetadata`].
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
|
||||
pub struct IndexLayerMetadata {
|
||||
pub file_size: u64,
|
||||
|
||||
#[serde(default = "Generation::none")]
|
||||
@@ -168,12 +183,12 @@ pub struct LayerFileMetadata {
|
||||
pub shard: ShardIndex,
|
||||
}
|
||||
|
||||
impl LayerFileMetadata {
|
||||
pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
|
||||
LayerFileMetadata {
|
||||
file_size,
|
||||
generation,
|
||||
shard,
|
||||
impl From<&LayerFileMetadata> for IndexLayerMetadata {
|
||||
fn from(other: &LayerFileMetadata) -> Self {
|
||||
IndexLayerMetadata {
|
||||
file_size: other.file_size,
|
||||
generation: other.generation,
|
||||
shard: other.shard,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -267,12 +282,12 @@ mod tests {
|
||||
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
|
||||
version: 1,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
@@ -284,7 +299,6 @@ mod tests {
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: None,
|
||||
lineage: Lineage::default(),
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||
@@ -309,12 +323,12 @@ mod tests {
|
||||
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
|
||||
version: 1,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
@@ -326,7 +340,6 @@ mod tests {
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: None,
|
||||
lineage: Lineage::default(),
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||
@@ -352,12 +365,12 @@ mod tests {
|
||||
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
|
||||
version: 2,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
@@ -370,7 +383,6 @@ mod tests {
|
||||
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
|
||||
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
|
||||
lineage: Lineage::default(),
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||
@@ -416,7 +428,6 @@ mod tests {
|
||||
.unwrap(),
|
||||
deleted_at: None,
|
||||
lineage: Lineage::default(),
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap();
|
||||
@@ -440,12 +451,12 @@ mod tests {
|
||||
let expected = IndexPart {
|
||||
version: 4,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
@@ -457,7 +468,6 @@ mod tests {
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
lineage: Lineage::default(),
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||
@@ -482,12 +492,12 @@ mod tests {
|
||||
let expected = IndexPart {
|
||||
version: 5,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), LayerFileMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 23289856,
|
||||
generation: Generation::new(1),
|
||||
shard: ShardIndex::unsharded(),
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), LayerFileMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 1015808,
|
||||
generation: Generation::new(1),
|
||||
shard: ShardIndex::unsharded(),
|
||||
@@ -501,57 +511,6 @@ mod tests {
|
||||
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
|
||||
original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
|
||||
},
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn v6_indexpart_is_parsed() {
|
||||
let example = r#"{
|
||||
"version":6,
|
||||
"layer_metadata":{
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
|
||||
},
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
|
||||
"deleted_at": "2023-07-31T09:00:00.123",
|
||||
"lineage":{
|
||||
"original_ancestor":["e2bfd8c633d713d279e6fcd2bcc15b6d","0/15A7618","2024-05-07T18:52:36.322426563"],
|
||||
"reparenting_history":["e1bfd8c633d713d279e6fcd2bcc15b6d"]
|
||||
},
|
||||
"last_aux_file_policy": "V2"
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
version: 6,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
|
||||
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
|
||||
lineage: Lineage {
|
||||
reparenting_history_truncated: false,
|
||||
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
|
||||
original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
|
||||
},
|
||||
last_aux_file_policy: Some(AuxFilePolicy::V2),
|
||||
};
|
||||
|
||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||
|
||||
@@ -9,7 +9,7 @@ use std::time::SystemTime;
|
||||
use tokio::fs::{self, File};
|
||||
use tokio::io::AsyncSeekExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{backoff, pausable_failpoint};
|
||||
use utils::backoff;
|
||||
|
||||
use super::Generation;
|
||||
use crate::tenant::remote_timeline_client::{
|
||||
|
||||
@@ -6,9 +6,11 @@ mod scheduler;
|
||||
use std::{sync::Arc, time::SystemTime};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
context::RequestContext,
|
||||
disk_usage_eviction_task::DiskUsageEvictionInfo,
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
virtual_file::MaybeFatalIo,
|
||||
};
|
||||
|
||||
use self::{
|
||||
@@ -19,8 +21,9 @@ use self::{
|
||||
use super::{
|
||||
config::{SecondaryLocationConfig, TenantConfOpt},
|
||||
mgr::TenantManager,
|
||||
remote_timeline_client::LayerFileMetadata,
|
||||
span::debug_assert_current_span_has_tenant_id,
|
||||
storage_layer::LayerName,
|
||||
storage_layer::{layer::local_layer_path, LayerName},
|
||||
};
|
||||
|
||||
use pageserver_api::{
|
||||
@@ -175,7 +178,13 @@ impl SecondaryTenant {
|
||||
|
||||
/// Cancellation safe, but on cancellation the eviction will go through
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))]
|
||||
pub(crate) async fn evict_layer(self: &Arc<Self>, timeline_id: TimelineId, name: LayerName) {
|
||||
pub(crate) async fn evict_layer(
|
||||
self: &Arc<Self>,
|
||||
conf: &PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
name: LayerName,
|
||||
metadata: LayerFileMetadata,
|
||||
) {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let guard = match self.gate.enter() {
|
||||
@@ -187,13 +196,42 @@ impl SecondaryTenant {
|
||||
};
|
||||
|
||||
let now = SystemTime::now();
|
||||
tracing::info!("Evicting secondary layer");
|
||||
|
||||
let local_path = local_layer_path(
|
||||
conf,
|
||||
&self.tenant_shard_id,
|
||||
&timeline_id,
|
||||
&name,
|
||||
&metadata.generation,
|
||||
);
|
||||
|
||||
let this = self.clone();
|
||||
|
||||
// spawn it to be cancellation safe
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _guard = guard;
|
||||
// We tolerate ENOENT, because between planning eviction and executing
|
||||
// it, the secondary downloader could have seen an updated heatmap that
|
||||
// resulted in a layer being deleted.
|
||||
// Other local I/O errors are process-fatal: these should never happen.
|
||||
let deleted = std::fs::remove_file(local_path);
|
||||
|
||||
let not_found = deleted
|
||||
.as_ref()
|
||||
.is_err_and(|x| x.kind() == std::io::ErrorKind::NotFound);
|
||||
|
||||
let deleted = if not_found {
|
||||
false
|
||||
} else {
|
||||
deleted
|
||||
.map(|()| true)
|
||||
.fatal_err("Deleting layer during eviction")
|
||||
};
|
||||
|
||||
if !deleted {
|
||||
// skip updating accounting and putting perhaps later timestamp
|
||||
return;
|
||||
}
|
||||
|
||||
// Update the timeline's state. This does not have to be synchronized with
|
||||
// the download process, because:
|
||||
@@ -212,15 +250,8 @@ impl SecondaryTenant {
|
||||
// of the cache.
|
||||
let mut detail = this.detail.lock().unwrap();
|
||||
if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
|
||||
let removed = timeline_detail.on_disk_layers.remove(&name);
|
||||
|
||||
// We might race with removal of the same layer during downloads, if it was removed
|
||||
// from the heatmap. If we see that the OnDiskState is gone, then no need to
|
||||
// do a physical deletion or store in evicted_at.
|
||||
if let Some(removed) = removed {
|
||||
removed.remove_blocking();
|
||||
timeline_detail.evicted_at.insert(name, now);
|
||||
}
|
||||
timeline_detail.on_disk_layers.remove(&name);
|
||||
timeline_detail.evicted_at.insert(name, now);
|
||||
}
|
||||
})
|
||||
.await
|
||||
|
||||
@@ -26,7 +26,7 @@ use crate::{
|
||||
tasks::{warn_when_period_overrun, BackgroundLoopKind},
|
||||
},
|
||||
virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile},
|
||||
TEMP_FILE_SUFFIX,
|
||||
METADATA_FILE_NAME, TEMP_FILE_SUFFIX,
|
||||
};
|
||||
|
||||
use super::{
|
||||
@@ -62,10 +62,14 @@ use super::{
|
||||
CommandRequest, DownloadCommand,
|
||||
};
|
||||
|
||||
/// For each tenant, default period for how long must have passed since the last download_tenant call before
|
||||
/// calling it again. This default is replaced with the value of [`HeatMapTenant::upload_period_ms`] after first
|
||||
/// download, if the uploader populated it.
|
||||
const DEFAULT_DOWNLOAD_INTERVAL: Duration = Duration::from_millis(60000);
|
||||
/// For each tenant, how long must have passed since the last download_tenant call before
|
||||
/// calling it again. This is approximately the time by which local data is allowed
|
||||
/// to fall behind remote data.
|
||||
///
|
||||
/// TODO: this should just be a default, and the actual period should be controlled
|
||||
/// via the heatmap itself
|
||||
/// `<ttps://github.com/neondatabase/neon/issues/6200>`
|
||||
const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000);
|
||||
|
||||
pub(super) async fn downloader_task(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
@@ -86,7 +90,7 @@ pub(super) async fn downloader_task(
|
||||
|
||||
scheduler
|
||||
.run(command_queue, background_jobs_can_start, cancel)
|
||||
.instrument(info_span!("secondary_download_scheduler"))
|
||||
.instrument(info_span!("secondary_downloads"))
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -100,7 +104,6 @@ struct SecondaryDownloader {
|
||||
pub(super) struct OnDiskState {
|
||||
metadata: LayerFileMetadata,
|
||||
access_time: SystemTime,
|
||||
local_path: Utf8PathBuf,
|
||||
}
|
||||
|
||||
impl OnDiskState {
|
||||
@@ -111,26 +114,12 @@ impl OnDiskState {
|
||||
_ame: LayerName,
|
||||
metadata: LayerFileMetadata,
|
||||
access_time: SystemTime,
|
||||
local_path: Utf8PathBuf,
|
||||
) -> Self {
|
||||
Self {
|
||||
metadata,
|
||||
access_time,
|
||||
local_path,
|
||||
}
|
||||
}
|
||||
|
||||
// This is infallible, because all errors are either acceptable (ENOENT), or totally
|
||||
// unexpected (fatal).
|
||||
pub(super) fn remove_blocking(&self) {
|
||||
// We tolerate ENOENT, because between planning eviction and executing
|
||||
// it, the secondary downloader could have seen an updated heatmap that
|
||||
// resulted in a layer being deleted.
|
||||
// Other local I/O errors are process-fatal: these should never happen.
|
||||
std::fs::remove_file(&self.local_path)
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.fatal_err("Deleting secondary layer")
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
@@ -141,22 +130,14 @@ pub(super) struct SecondaryDetailTimeline {
|
||||
pub(super) evicted_at: HashMap<LayerName, SystemTime>,
|
||||
}
|
||||
|
||||
// Aspects of a heatmap that we remember after downloading it
|
||||
#[derive(Clone, Debug)]
|
||||
struct DownloadSummary {
|
||||
etag: Etag,
|
||||
#[allow(unused)]
|
||||
mtime: SystemTime,
|
||||
upload_period: Duration,
|
||||
}
|
||||
|
||||
/// This state is written by the secondary downloader, it is opaque
|
||||
/// to TenantManager
|
||||
#[derive(Debug)]
|
||||
pub(super) struct SecondaryDetail {
|
||||
pub(super) config: SecondaryLocationConfig,
|
||||
|
||||
last_download: Option<DownloadSummary>,
|
||||
last_download: Option<Instant>,
|
||||
last_etag: Option<Etag>,
|
||||
next_download: Option<Instant>,
|
||||
pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
|
||||
}
|
||||
@@ -186,6 +167,7 @@ impl SecondaryDetail {
|
||||
Self {
|
||||
config,
|
||||
last_download: None,
|
||||
last_etag: None,
|
||||
next_download: None,
|
||||
timelines: HashMap::new(),
|
||||
}
|
||||
@@ -239,8 +221,9 @@ impl SecondaryDetail {
|
||||
|
||||
struct PendingDownload {
|
||||
secondary_state: Arc<SecondaryTenant>,
|
||||
last_download: Option<DownloadSummary>,
|
||||
last_download: Option<Instant>,
|
||||
target_time: Option<Instant>,
|
||||
period: Option<Duration>,
|
||||
}
|
||||
|
||||
impl scheduler::PendingJob for PendingDownload {
|
||||
@@ -290,17 +273,10 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
|
||||
tracing::debug!("Secondary tenant download completed");
|
||||
|
||||
// Update freshened_at even if there was an error: we don't want errored tenants to implicitly
|
||||
// take priority to run again.
|
||||
let mut detail = secondary_state.detail.lock().unwrap();
|
||||
|
||||
let period = detail
|
||||
.last_download
|
||||
.as_ref()
|
||||
.map(|d| d.upload_period)
|
||||
.unwrap_or(DEFAULT_DOWNLOAD_INTERVAL);
|
||||
|
||||
// We advance next_download irrespective of errors: we don't want error cases to result in
|
||||
// expensive busy-polling.
|
||||
detail.next_download = Some(Instant::now() + period_jitter(period, 5));
|
||||
detail.next_download = Some(Instant::now() + period_jitter(DOWNLOAD_FRESHEN_INTERVAL, 5));
|
||||
}
|
||||
|
||||
async fn schedule(&mut self) -> SchedulingResult<PendingDownload> {
|
||||
@@ -333,11 +309,11 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
if detail.next_download.is_none() {
|
||||
// Initialize randomly in the range from 0 to our interval: this uniformly spreads the start times. Subsequent
|
||||
// rounds will use a smaller jitter to avoid accidentally synchronizing later.
|
||||
detail.next_download = Some(now.checked_add(period_warmup(DEFAULT_DOWNLOAD_INTERVAL)).expect(
|
||||
detail.next_download = Some(now.checked_add(period_warmup(DOWNLOAD_FRESHEN_INTERVAL)).expect(
|
||||
"Using our constant, which is known to be small compared with clock range",
|
||||
));
|
||||
}
|
||||
(detail.last_download.clone(), detail.next_download.unwrap())
|
||||
(detail.last_download, detail.next_download.unwrap())
|
||||
};
|
||||
|
||||
if now > next_download {
|
||||
@@ -345,6 +321,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
secondary_state: secondary_tenant,
|
||||
last_download,
|
||||
target_time: Some(next_download),
|
||||
period: Some(DOWNLOAD_FRESHEN_INTERVAL),
|
||||
})
|
||||
} else {
|
||||
None
|
||||
@@ -370,6 +347,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
|
||||
Ok(PendingDownload {
|
||||
target_time: None,
|
||||
period: None,
|
||||
last_download: None,
|
||||
secondary_state: tenant,
|
||||
})
|
||||
@@ -386,6 +364,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
secondary_state,
|
||||
last_download,
|
||||
target_time,
|
||||
period,
|
||||
} = job;
|
||||
|
||||
let (completion, barrier) = utils::completion::channel();
|
||||
@@ -407,7 +386,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
tracing::warn!("Insufficient space while downloading. Will retry later.");
|
||||
}
|
||||
Err(UpdateError::Cancelled) => {
|
||||
tracing::info!("Shut down while downloading");
|
||||
tracing::debug!("Shut down while downloading");
|
||||
},
|
||||
Err(UpdateError::Deserialize(e)) => {
|
||||
tracing::error!("Corrupt content while downloading tenant: {e}");
|
||||
@@ -422,15 +401,20 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
|
||||
// If the job had a target execution time, we may check our final execution
|
||||
// time against that for observability purposes.
|
||||
if let (Some(target_time), Some(last_download)) = (target_time, last_download) {
|
||||
// Elapsed time includes any scheduling lag as well as the execution of the job
|
||||
let elapsed = Instant::now().duration_since(target_time);
|
||||
if let (Some(target_time), Some(period)) = (target_time, period) {
|
||||
// Only track execution lag if this isn't our first download: otherwise, it is expected
|
||||
// that execution will have taken longer than our configured interval, for example
|
||||
// when starting up a pageserver and
|
||||
if last_download.is_some() {
|
||||
// Elapsed time includes any scheduling lag as well as the execution of the job
|
||||
let elapsed = Instant::now().duration_since(target_time);
|
||||
|
||||
warn_when_period_overrun(
|
||||
elapsed,
|
||||
last_download.upload_period,
|
||||
BackgroundLoopKind::SecondaryDownload,
|
||||
);
|
||||
warn_when_period_overrun(
|
||||
elapsed,
|
||||
period,
|
||||
BackgroundLoopKind::SecondaryDownload,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
CompleteDownload {
|
||||
@@ -519,12 +503,12 @@ impl<'a> TenantDownloader<'a> {
|
||||
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
|
||||
|
||||
// We will use the etag from last successful download to make the download conditional on changes
|
||||
let last_download = self
|
||||
let last_etag = self
|
||||
.secondary_state
|
||||
.detail
|
||||
.lock()
|
||||
.unwrap()
|
||||
.last_download
|
||||
.last_etag
|
||||
.clone();
|
||||
|
||||
// Download the tenant's heatmap
|
||||
@@ -533,7 +517,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
etag: heatmap_etag,
|
||||
bytes: heatmap_bytes,
|
||||
} = match tokio::select!(
|
||||
bytes = self.download_heatmap(last_download.as_ref().map(|d| &d.etag)) => {bytes?},
|
||||
bytes = self.download_heatmap(last_etag.as_ref()) => {bytes?},
|
||||
_ = self.secondary_state.cancel.cancelled() => return Ok(())
|
||||
) {
|
||||
HeatMapDownload::Unmodified => {
|
||||
@@ -562,39 +546,6 @@ impl<'a> TenantDownloader<'a> {
|
||||
heatmap.timelines.len()
|
||||
);
|
||||
|
||||
// Get or initialize the local disk state for the timelines we will update
|
||||
let mut timeline_states = HashMap::new();
|
||||
for timeline in &heatmap.timelines {
|
||||
let timeline_state = self
|
||||
.secondary_state
|
||||
.detail
|
||||
.lock()
|
||||
.unwrap()
|
||||
.timelines
|
||||
.get(&timeline.timeline_id)
|
||||
.cloned();
|
||||
|
||||
let timeline_state = match timeline_state {
|
||||
Some(t) => t,
|
||||
None => {
|
||||
// We have no existing state: need to scan local disk for layers first.
|
||||
let timeline_state =
|
||||
init_timeline_state(self.conf, tenant_shard_id, timeline).await;
|
||||
|
||||
// Re-acquire detail lock now that we're done with async load from local FS
|
||||
self.secondary_state
|
||||
.detail
|
||||
.lock()
|
||||
.unwrap()
|
||||
.timelines
|
||||
.insert(timeline.timeline_id, timeline_state.clone());
|
||||
timeline_state
|
||||
}
|
||||
};
|
||||
|
||||
timeline_states.insert(timeline.timeline_id, timeline_state);
|
||||
}
|
||||
|
||||
// Clean up any local layers that aren't in the heatmap. We do this first for all timelines, on the general
|
||||
// principle that deletions should be done before writes wherever possible, and so that we can use this
|
||||
// phase to initialize our SecondaryProgress.
|
||||
@@ -605,10 +556,6 @@ impl<'a> TenantDownloader<'a> {
|
||||
|
||||
// Download the layers in the heatmap
|
||||
for timeline in heatmap.timelines {
|
||||
let timeline_state = timeline_states
|
||||
.remove(&timeline.timeline_id)
|
||||
.expect("Just populated above");
|
||||
|
||||
if self.secondary_state.cancel.is_cancelled() {
|
||||
tracing::debug!(
|
||||
"Cancelled before downloading timeline {}",
|
||||
@@ -618,7 +565,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
}
|
||||
|
||||
let timeline_id = timeline.timeline_id;
|
||||
self.download_timeline(timeline, timeline_state, ctx)
|
||||
self.download_timeline(timeline, ctx)
|
||||
.instrument(tracing::info_span!(
|
||||
"secondary_download_timeline",
|
||||
tenant_id=%tenant_shard_id.tenant_id,
|
||||
@@ -630,30 +577,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
|
||||
// Only update last_etag after a full successful download: this way will not skip
|
||||
// the next download, even if the heatmap's actual etag is unchanged.
|
||||
self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary {
|
||||
etag: heatmap_etag,
|
||||
mtime: heatmap_mtime,
|
||||
upload_period: heatmap
|
||||
.upload_period_ms
|
||||
.map(|ms| Duration::from_millis(ms as u64))
|
||||
.unwrap_or(DEFAULT_DOWNLOAD_INTERVAL),
|
||||
});
|
||||
|
||||
// Robustness: we should have updated progress properly, but in case we didn't, make sure
|
||||
// we don't leave the tenant in a state where we claim to have successfully downloaded
|
||||
// everything, but our progress is incomplete. The invariant here should be that if
|
||||
// we have set `last_download` to this heatmap's etag, then the next time we see that
|
||||
// etag we can safely do no work (i.e. we must be complete).
|
||||
let mut progress = self.secondary_state.progress.lock().unwrap();
|
||||
debug_assert!(progress.layers_downloaded == progress.layers_total);
|
||||
debug_assert!(progress.bytes_downloaded == progress.bytes_total);
|
||||
if progress.layers_downloaded != progress.layers_total
|
||||
|| progress.bytes_downloaded != progress.bytes_total
|
||||
{
|
||||
tracing::warn!("Correcting drift in progress stats ({progress:?})");
|
||||
progress.layers_downloaded = progress.layers_total;
|
||||
progress.bytes_downloaded = progress.bytes_total;
|
||||
}
|
||||
self.secondary_state.detail.lock().unwrap().last_etag = Some(heatmap_etag);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -709,7 +633,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
let mut layer_byte_count: u64 = timeline_state
|
||||
.on_disk_layers
|
||||
.values()
|
||||
.map(|l| l.metadata.file_size)
|
||||
.map(|l| l.metadata.file_size())
|
||||
.sum();
|
||||
|
||||
// Remove on-disk layers that are no longer present in heatmap
|
||||
@@ -720,7 +644,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
.get(layer_file_name)
|
||||
.unwrap()
|
||||
.metadata
|
||||
.file_size;
|
||||
.file_size();
|
||||
|
||||
let local_path = local_layer_path(
|
||||
self.conf,
|
||||
@@ -830,7 +754,6 @@ impl<'a> TenantDownloader<'a> {
|
||||
async fn download_timeline(
|
||||
&self,
|
||||
timeline: HeatMapTimeline,
|
||||
timeline_state: SecondaryDetailTimeline,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), UpdateError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
@@ -839,6 +762,34 @@ impl<'a> TenantDownloader<'a> {
|
||||
// Accumulate updates to the state
|
||||
let mut touched = Vec::new();
|
||||
|
||||
// Clone a view of what layers already exist on disk
|
||||
let timeline_state = self
|
||||
.secondary_state
|
||||
.detail
|
||||
.lock()
|
||||
.unwrap()
|
||||
.timelines
|
||||
.get(&timeline.timeline_id)
|
||||
.cloned();
|
||||
|
||||
let timeline_state = match timeline_state {
|
||||
Some(t) => t,
|
||||
None => {
|
||||
// We have no existing state: need to scan local disk for layers first.
|
||||
let timeline_state =
|
||||
init_timeline_state(self.conf, tenant_shard_id, &timeline).await;
|
||||
|
||||
// Re-acquire detail lock now that we're done with async load from local FS
|
||||
self.secondary_state
|
||||
.detail
|
||||
.lock()
|
||||
.unwrap()
|
||||
.timelines
|
||||
.insert(timeline.timeline_id, timeline_state.clone());
|
||||
timeline_state
|
||||
}
|
||||
};
|
||||
|
||||
tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
|
||||
|
||||
// Download heatmap layers that are not present on local disk, or update their
|
||||
@@ -856,12 +807,20 @@ impl<'a> TenantDownloader<'a> {
|
||||
if cfg!(debug_assertions) {
|
||||
// Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
|
||||
// are already present on disk are really there.
|
||||
match tokio::fs::metadata(&on_disk.local_path).await {
|
||||
let local_path = local_layer_path(
|
||||
self.conf,
|
||||
tenant_shard_id,
|
||||
&timeline.timeline_id,
|
||||
&layer.name,
|
||||
&layer.metadata.generation,
|
||||
);
|
||||
|
||||
match tokio::fs::metadata(&local_path).await {
|
||||
Ok(meta) => {
|
||||
tracing::debug!(
|
||||
"Layer {} present at {}, size {}",
|
||||
layer.name,
|
||||
on_disk.local_path,
|
||||
local_path,
|
||||
meta.len(),
|
||||
);
|
||||
}
|
||||
@@ -869,7 +828,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
tracing::warn!(
|
||||
"Layer {} not found at {} ({})",
|
||||
layer.name,
|
||||
on_disk.local_path,
|
||||
local_path,
|
||||
e
|
||||
);
|
||||
debug_assert!(false);
|
||||
@@ -877,7 +836,9 @@ impl<'a> TenantDownloader<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
if on_disk.metadata != layer.metadata || on_disk.access_time != layer.access_time {
|
||||
if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
|
||||
|| on_disk.access_time != layer.access_time
|
||||
{
|
||||
// We already have this layer on disk. Update its access time.
|
||||
tracing::debug!(
|
||||
"Access time updated for layer {}: {} -> {}",
|
||||
@@ -909,21 +870,71 @@ impl<'a> TenantDownloader<'a> {
|
||||
strftime(&layer.access_time),
|
||||
strftime(evicted_at)
|
||||
);
|
||||
self.skip_layer(layer);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
match self
|
||||
.download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx)
|
||||
.await?
|
||||
// Failpoint for simulating slow remote storage
|
||||
failpoint_support::sleep_millis_async!(
|
||||
"secondary-layer-download-sleep",
|
||||
&self.secondary_state.cancel
|
||||
);
|
||||
|
||||
// Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
|
||||
let downloaded_bytes = match download_layer_file(
|
||||
self.conf,
|
||||
self.remote_storage,
|
||||
*tenant_shard_id,
|
||||
timeline.timeline_id,
|
||||
&layer.name,
|
||||
&LayerFileMetadata::from(&layer.metadata),
|
||||
&self.secondary_state.cancel,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Some(layer) => touched.push(layer),
|
||||
None => {
|
||||
// Not an error but we didn't download it: remote layer is missing. Don't add it to the list of
|
||||
// things to consider touched.
|
||||
Ok(bytes) => bytes,
|
||||
Err(DownloadError::NotFound) => {
|
||||
// A heatmap might be out of date and refer to a layer that doesn't exist any more.
|
||||
// This is harmless: continue to download the next layer. It is expected during compaction
|
||||
// GC.
|
||||
tracing::debug!(
|
||||
"Skipped downloading missing layer {}, raced with compaction/gc?",
|
||||
layer.name
|
||||
);
|
||||
continue;
|
||||
}
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
|
||||
if downloaded_bytes != layer.metadata.file_size {
|
||||
let local_path = local_layer_path(
|
||||
self.conf,
|
||||
tenant_shard_id,
|
||||
&timeline.timeline_id,
|
||||
&layer.name,
|
||||
&layer.metadata.generation,
|
||||
);
|
||||
|
||||
tracing::warn!(
|
||||
"Downloaded layer {} with unexpected size {} != {}. Removing download.",
|
||||
layer.name,
|
||||
downloaded_bytes,
|
||||
layer.metadata.file_size
|
||||
);
|
||||
|
||||
tokio::fs::remove_file(&local_path)
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)?;
|
||||
} else {
|
||||
tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes);
|
||||
let mut progress = self.secondary_state.progress.lock().unwrap();
|
||||
progress.bytes_downloaded += downloaded_bytes;
|
||||
progress.layers_downloaded += 1;
|
||||
}
|
||||
|
||||
SECONDARY_MODE.download_layer.inc();
|
||||
touched.push(layer)
|
||||
}
|
||||
|
||||
// Write updates to state to record layers we just downloaded or touched.
|
||||
@@ -940,21 +951,13 @@ impl<'a> TenantDownloader<'a> {
|
||||
v.get_mut().access_time = t.access_time;
|
||||
}
|
||||
Entry::Vacant(e) => {
|
||||
let local_path = local_layer_path(
|
||||
self.conf,
|
||||
tenant_shard_id,
|
||||
&timeline.timeline_id,
|
||||
&t.name,
|
||||
&t.metadata.generation,
|
||||
);
|
||||
e.insert(OnDiskState::new(
|
||||
self.conf,
|
||||
tenant_shard_id,
|
||||
&timeline.timeline_id,
|
||||
t.name,
|
||||
t.metadata.clone(),
|
||||
LayerFileMetadata::from(&t.metadata),
|
||||
t.access_time,
|
||||
local_path,
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -963,102 +966,6 @@ impl<'a> TenantDownloader<'a> {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Call this during timeline download if a layer will _not_ be downloaded, to update progress statistics
|
||||
fn skip_layer(&self, layer: HeatMapLayer) {
|
||||
let mut progress = self.secondary_state.progress.lock().unwrap();
|
||||
progress.layers_total = progress.layers_total.saturating_sub(1);
|
||||
progress.bytes_total = progress
|
||||
.bytes_total
|
||||
.saturating_sub(layer.metadata.file_size);
|
||||
}
|
||||
|
||||
async fn download_layer(
|
||||
&self,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
layer: HeatMapLayer,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Option<HeatMapLayer>, UpdateError> {
|
||||
// Failpoint for simulating slow remote storage
|
||||
failpoint_support::sleep_millis_async!(
|
||||
"secondary-layer-download-sleep",
|
||||
&self.secondary_state.cancel
|
||||
);
|
||||
|
||||
let local_path = local_layer_path(
|
||||
self.conf,
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
&layer.name,
|
||||
&layer.metadata.generation,
|
||||
);
|
||||
|
||||
// Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
|
||||
tracing::info!(
|
||||
"Starting download of layer {}, size {}",
|
||||
layer.name,
|
||||
layer.metadata.file_size
|
||||
);
|
||||
let downloaded_bytes = match download_layer_file(
|
||||
self.conf,
|
||||
self.remote_storage,
|
||||
*tenant_shard_id,
|
||||
*timeline_id,
|
||||
&layer.name,
|
||||
&layer.metadata,
|
||||
&local_path,
|
||||
&self.secondary_state.cancel,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(bytes) => bytes,
|
||||
Err(DownloadError::NotFound) => {
|
||||
// A heatmap might be out of date and refer to a layer that doesn't exist any more.
|
||||
// This is harmless: continue to download the next layer. It is expected during compaction
|
||||
// GC.
|
||||
tracing::debug!(
|
||||
"Skipped downloading missing layer {}, raced with compaction/gc?",
|
||||
layer.name
|
||||
);
|
||||
self.skip_layer(layer);
|
||||
|
||||
return Ok(None);
|
||||
}
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
|
||||
if downloaded_bytes != layer.metadata.file_size {
|
||||
let local_path = local_layer_path(
|
||||
self.conf,
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
&layer.name,
|
||||
&layer.metadata.generation,
|
||||
);
|
||||
|
||||
tracing::warn!(
|
||||
"Downloaded layer {} with unexpected size {} != {}. Removing download.",
|
||||
layer.name,
|
||||
downloaded_bytes,
|
||||
layer.metadata.file_size
|
||||
);
|
||||
|
||||
tokio::fs::remove_file(&local_path)
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)?;
|
||||
} else {
|
||||
tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes);
|
||||
let mut progress = self.secondary_state.progress.lock().unwrap();
|
||||
progress.bytes_downloaded += downloaded_bytes;
|
||||
progress.layers_downloaded += 1;
|
||||
}
|
||||
|
||||
SECONDARY_MODE.download_layer.inc();
|
||||
|
||||
Ok(Some(layer))
|
||||
}
|
||||
}
|
||||
|
||||
/// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline
|
||||
@@ -1108,7 +1015,11 @@ async fn init_timeline_state(
|
||||
.fatal_err(&format!("Read metadata on {}", file_path));
|
||||
|
||||
let file_name = file_path.file_name().expect("created it from the dentry");
|
||||
if crate::is_temporary(&file_path)
|
||||
if file_name == METADATA_FILE_NAME {
|
||||
// Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
|
||||
warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
|
||||
continue;
|
||||
} else if crate::is_temporary(&file_path)
|
||||
|| is_temp_download_file(&file_path)
|
||||
|| is_ephemeral_file(file_name)
|
||||
{
|
||||
@@ -1148,9 +1059,8 @@ async fn init_timeline_state(
|
||||
tenant_shard_id,
|
||||
&heatmap.timeline_id,
|
||||
name,
|
||||
remote_meta.metadata.clone(),
|
||||
LayerFileMetadata::from(&remote_meta.metadata),
|
||||
remote_meta.access_time,
|
||||
file_path,
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::time::SystemTime;
|
||||
|
||||
use crate::tenant::{remote_timeline_client::index::LayerFileMetadata, storage_layer::LayerName};
|
||||
use crate::tenant::{remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerName};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr, TimestampSeconds};
|
||||
@@ -15,14 +15,6 @@ pub(super) struct HeatMapTenant {
|
||||
pub(super) generation: Generation,
|
||||
|
||||
pub(super) timelines: Vec<HeatMapTimeline>,
|
||||
|
||||
/// Uploaders provide their own upload period in the heatmap, as a hint to downloaders
|
||||
/// of how frequently it is worthwhile to check for updates.
|
||||
///
|
||||
/// This is optional for backward compat, and because we sometimes might upload
|
||||
/// a heatmap explicitly via API for a tenant that has no periodic upload configured.
|
||||
#[serde(default)]
|
||||
pub(super) upload_period_ms: Option<u128>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
@@ -38,7 +30,7 @@ pub(crate) struct HeatMapTimeline {
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub(crate) struct HeatMapLayer {
|
||||
pub(super) name: LayerName,
|
||||
pub(super) metadata: LayerFileMetadata,
|
||||
pub(super) metadata: IndexLayerMetadata,
|
||||
|
||||
#[serde_as(as = "TimestampSeconds<i64>")]
|
||||
pub(super) access_time: SystemTime,
|
||||
@@ -49,7 +41,7 @@ pub(crate) struct HeatMapLayer {
|
||||
impl HeatMapLayer {
|
||||
pub(crate) fn new(
|
||||
name: LayerName,
|
||||
metadata: LayerFileMetadata,
|
||||
metadata: IndexLayerMetadata,
|
||||
access_time: SystemTime,
|
||||
) -> Self {
|
||||
Self {
|
||||
@@ -89,21 +81,4 @@ impl HeatMapTenant {
|
||||
|
||||
stats
|
||||
}
|
||||
|
||||
pub(crate) fn strip_atimes(self) -> Self {
|
||||
Self {
|
||||
timelines: self
|
||||
.timelines
|
||||
.into_iter()
|
||||
.map(|mut tl| {
|
||||
for layer in &mut tl.layers {
|
||||
layer.access_time = SystemTime::UNIX_EPOCH;
|
||||
}
|
||||
tl
|
||||
})
|
||||
.collect(),
|
||||
generation: self.generation,
|
||||
upload_period_ms: self.upload_period_ms,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -53,7 +53,7 @@ pub(super) async fn heatmap_uploader_task(
|
||||
|
||||
scheduler
|
||||
.run(command_queue, background_jobs_can_start, cancel)
|
||||
.instrument(info_span!("heatmap_upload_scheduler"))
|
||||
.instrument(info_span!("heatmap_uploader"))
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -80,7 +80,7 @@ impl RunningJob for WriteInProgress {
|
||||
|
||||
struct UploadPending {
|
||||
tenant: Arc<Tenant>,
|
||||
last_upload: Option<LastUploadState>,
|
||||
last_digest: Option<md5::Digest>,
|
||||
target_time: Option<Instant>,
|
||||
period: Option<Duration>,
|
||||
}
|
||||
@@ -94,7 +94,7 @@ impl scheduler::PendingJob for UploadPending {
|
||||
struct WriteComplete {
|
||||
tenant_shard_id: TenantShardId,
|
||||
completed_at: Instant,
|
||||
uploaded: Option<LastUploadState>,
|
||||
digest: Option<md5::Digest>,
|
||||
next_upload: Option<Instant>,
|
||||
}
|
||||
|
||||
@@ -115,7 +115,10 @@ struct UploaderTenantState {
|
||||
tenant: Weak<Tenant>,
|
||||
|
||||
/// Digest of the serialized heatmap that we last successfully uploaded
|
||||
last_upload_state: Option<LastUploadState>,
|
||||
///
|
||||
/// md5 is generally a bad hash. We use it because it's convenient for interop with AWS S3's ETag,
|
||||
/// which is also an md5sum.
|
||||
last_digest: Option<md5::Digest>,
|
||||
|
||||
/// When the last upload attempt completed (may have been successful or failed)
|
||||
last_upload: Option<Instant>,
|
||||
@@ -184,7 +187,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
|
||||
tenant: Arc::downgrade(&tenant),
|
||||
last_upload: None,
|
||||
next_upload: Some(now.checked_add(period_warmup(period)).unwrap_or(now)),
|
||||
last_upload_state: None,
|
||||
last_digest: None,
|
||||
});
|
||||
|
||||
// Decline to do the upload if insufficient time has passed
|
||||
@@ -192,10 +195,10 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
|
||||
return;
|
||||
}
|
||||
|
||||
let last_upload = state.last_upload_state.clone();
|
||||
let last_digest = state.last_digest;
|
||||
result.jobs.push(UploadPending {
|
||||
tenant,
|
||||
last_upload,
|
||||
last_digest,
|
||||
target_time: state.next_upload,
|
||||
period: Some(period),
|
||||
});
|
||||
@@ -215,7 +218,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
|
||||
) {
|
||||
let UploadPending {
|
||||
tenant,
|
||||
last_upload,
|
||||
last_digest,
|
||||
target_time,
|
||||
period,
|
||||
} = job;
|
||||
@@ -228,16 +231,16 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
|
||||
let _completion = completion;
|
||||
|
||||
let started_at = Instant::now();
|
||||
let uploaded = match upload_tenant_heatmap(remote_storage, &tenant, last_upload.clone()).await {
|
||||
Ok(UploadHeatmapOutcome::Uploaded(uploaded)) => {
|
||||
let digest = match upload_tenant_heatmap(remote_storage, &tenant, last_digest).await {
|
||||
Ok(UploadHeatmapOutcome::Uploaded(digest)) => {
|
||||
let duration = Instant::now().duration_since(started_at);
|
||||
SECONDARY_MODE
|
||||
.upload_heatmap_duration
|
||||
.observe(duration.as_secs_f64());
|
||||
SECONDARY_MODE.upload_heatmap.inc();
|
||||
Some(uploaded)
|
||||
Some(digest)
|
||||
}
|
||||
Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_upload,
|
||||
Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_digest,
|
||||
Err(UploadHeatmapError::Upload(e)) => {
|
||||
tracing::warn!(
|
||||
"Failed to upload heatmap for tenant {}: {e:#}",
|
||||
@@ -248,11 +251,11 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
|
||||
.upload_heatmap_duration
|
||||
.observe(duration.as_secs_f64());
|
||||
SECONDARY_MODE.upload_heatmap_errors.inc();
|
||||
last_upload
|
||||
last_digest
|
||||
}
|
||||
Err(UploadHeatmapError::Cancelled) => {
|
||||
tracing::info!("Cancelled heatmap upload, shutting down");
|
||||
last_upload
|
||||
last_digest
|
||||
}
|
||||
};
|
||||
|
||||
@@ -274,7 +277,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
|
||||
WriteComplete {
|
||||
tenant_shard_id: *tenant.get_tenant_shard_id(),
|
||||
completed_at: now,
|
||||
uploaded,
|
||||
digest,
|
||||
next_upload,
|
||||
}
|
||||
}.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
|
||||
@@ -296,7 +299,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
|
||||
|
||||
Ok(UploadPending {
|
||||
// Ignore our state for last digest: this forces an upload even if nothing has changed
|
||||
last_upload: None,
|
||||
last_digest: None,
|
||||
tenant,
|
||||
target_time: None,
|
||||
period: None,
|
||||
@@ -309,7 +312,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
|
||||
let WriteComplete {
|
||||
tenant_shard_id,
|
||||
completed_at,
|
||||
uploaded,
|
||||
digest,
|
||||
next_upload,
|
||||
} = completion;
|
||||
use std::collections::hash_map::Entry;
|
||||
@@ -319,7 +322,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
|
||||
}
|
||||
Entry::Occupied(mut entry) => {
|
||||
entry.get_mut().last_upload = Some(completed_at);
|
||||
entry.get_mut().last_upload_state = uploaded;
|
||||
entry.get_mut().last_digest = digest;
|
||||
entry.get_mut().next_upload = next_upload
|
||||
}
|
||||
}
|
||||
@@ -328,7 +331,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
|
||||
|
||||
enum UploadHeatmapOutcome {
|
||||
/// We successfully wrote to remote storage, with this digest.
|
||||
Uploaded(LastUploadState),
|
||||
Uploaded(md5::Digest),
|
||||
/// We did not upload because the heatmap digest was unchanged since the last upload
|
||||
NoChange,
|
||||
/// We skipped the upload for some reason, such as tenant/timeline not ready
|
||||
@@ -344,25 +347,12 @@ enum UploadHeatmapError {
|
||||
Upload(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
/// Digests describing the heatmap we most recently uploaded successfully.
|
||||
///
|
||||
/// md5 is generally a bad hash. We use it because it's convenient for interop with AWS S3's ETag,
|
||||
/// which is also an md5sum.
|
||||
#[derive(Clone)]
|
||||
struct LastUploadState {
|
||||
// Digest of json-encoded HeatMapTenant
|
||||
uploaded_digest: md5::Digest,
|
||||
|
||||
// Digest without atimes set.
|
||||
layers_only_digest: md5::Digest,
|
||||
}
|
||||
|
||||
/// The inner upload operation. This will skip if `last_digest` is Some and matches the digest
|
||||
/// of the object we would have uploaded.
|
||||
async fn upload_tenant_heatmap(
|
||||
remote_storage: GenericRemoteStorage,
|
||||
tenant: &Arc<Tenant>,
|
||||
last_upload: Option<LastUploadState>,
|
||||
last_digest: Option<md5::Digest>,
|
||||
) -> Result<UploadHeatmapOutcome, UploadHeatmapError> {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
|
||||
@@ -378,7 +368,6 @@ async fn upload_tenant_heatmap(
|
||||
let mut heatmap = HeatMapTenant {
|
||||
timelines: Vec::new(),
|
||||
generation,
|
||||
upload_period_ms: tenant.get_heatmap_period().map(|p| p.as_millis()),
|
||||
};
|
||||
let timelines = tenant.timelines.lock().unwrap().clone();
|
||||
|
||||
@@ -407,31 +396,15 @@ async fn upload_tenant_heatmap(
|
||||
|
||||
// Serialize the heatmap
|
||||
let bytes = serde_json::to_vec(&heatmap).map_err(|e| anyhow::anyhow!(e))?;
|
||||
let bytes = bytes::Bytes::from(bytes);
|
||||
let size = bytes.len();
|
||||
|
||||
// Drop out early if nothing changed since our last upload
|
||||
let digest = md5::compute(&bytes);
|
||||
if Some(&digest) == last_upload.as_ref().map(|d| &d.uploaded_digest) {
|
||||
if Some(digest) == last_digest {
|
||||
return Ok(UploadHeatmapOutcome::NoChange);
|
||||
}
|
||||
|
||||
// Calculate a digest that omits atimes, so that we can distinguish actual changes in
|
||||
// layers from changes only in atimes.
|
||||
let heatmap_size_bytes = heatmap.get_stats().bytes;
|
||||
let layers_only_bytes =
|
||||
serde_json::to_vec(&heatmap.strip_atimes()).map_err(|e| anyhow::anyhow!(e))?;
|
||||
let layers_only_digest = md5::compute(&layers_only_bytes);
|
||||
if heatmap_size_bytes < tenant.get_checkpoint_distance() {
|
||||
// For small tenants, skip upload if only atimes changed. This avoids doing frequent
|
||||
// uploads from long-idle tenants whose atimes are just incremented by periodic
|
||||
// size calculations.
|
||||
if Some(&layers_only_digest) == last_upload.as_ref().map(|d| &d.layers_only_digest) {
|
||||
return Ok(UploadHeatmapOutcome::NoChange);
|
||||
}
|
||||
}
|
||||
|
||||
let bytes = bytes::Bytes::from(bytes);
|
||||
let size = bytes.len();
|
||||
|
||||
let path = remote_heatmap_path(tenant.get_tenant_shard_id());
|
||||
|
||||
let cancel = &tenant.cancel;
|
||||
@@ -463,8 +436,5 @@ async fn upload_tenant_heatmap(
|
||||
|
||||
tracing::info!("Successfully uploaded {size} byte heatmap to {path}");
|
||||
|
||||
Ok(UploadHeatmapOutcome::Uploaded(LastUploadState {
|
||||
uploaded_digest: digest,
|
||||
layers_only_digest,
|
||||
}))
|
||||
Ok(UploadHeatmapOutcome::Uploaded(digest))
|
||||
}
|
||||
|
||||
@@ -179,13 +179,6 @@ where
|
||||
// Schedule some work, if concurrency limit permits it
|
||||
self.spawn_pending();
|
||||
|
||||
// This message is printed every scheduling iteration as proof of liveness when looking at logs
|
||||
tracing::info!(
|
||||
"Status: {} tasks running, {} pending",
|
||||
self.running.len(),
|
||||
self.pending.len()
|
||||
);
|
||||
|
||||
// Between scheduling iterations, we will:
|
||||
// - Drain any complete tasks and spawn pending tasks
|
||||
// - Handle incoming administrative commands
|
||||
@@ -265,11 +258,7 @@ where
|
||||
|
||||
self.tasks.spawn(fut);
|
||||
|
||||
let replaced = self.running.insert(tenant_shard_id, in_progress);
|
||||
debug_assert!(replaced.is_none());
|
||||
if replaced.is_some() {
|
||||
tracing::warn!(%tenant_shard_id, "Unexpectedly spawned a task when one was already running")
|
||||
}
|
||||
self.running.insert(tenant_shard_id, in_progress);
|
||||
}
|
||||
|
||||
/// For all pending tenants that are elegible for execution, spawn their task.
|
||||
@@ -279,9 +268,7 @@ where
|
||||
while !self.pending.is_empty() && self.running.len() < self.concurrency {
|
||||
// unwrap: loop condition includes !is_empty()
|
||||
let pending = self.pending.pop_front().unwrap();
|
||||
if !self.running.contains_key(pending.get_tenant_shard_id()) {
|
||||
self.do_spawn(pending);
|
||||
}
|
||||
self.do_spawn(pending);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -334,8 +321,7 @@ where
|
||||
|
||||
let tenant_shard_id = job.get_tenant_shard_id();
|
||||
let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
|
||||
tracing::info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
|
||||
"Command already running, waiting for it");
|
||||
tracing::info!("Command already running, waiting for it");
|
||||
barrier
|
||||
} else {
|
||||
let running = self.spawn_now(job);
|
||||
|
||||
@@ -113,20 +113,12 @@ impl From<VectoredValueReconstructState> for ValueReconstructState {
|
||||
}
|
||||
}
|
||||
|
||||
/// Bag of data accumulated during a vectored get..
|
||||
/// Bag of data accumulated during a vectored get
|
||||
pub(crate) struct ValuesReconstructState {
|
||||
/// The keys will be removed after `get_vectored` completes. The caller outside `Timeline`
|
||||
/// should not expect to get anything from this hashmap.
|
||||
pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,
|
||||
/// The keys which are already retrieved
|
||||
|
||||
keys_done: KeySpaceRandomAccum,
|
||||
|
||||
/// The keys covered by the image layers
|
||||
keys_with_image_coverage: Option<Range<Key>>,
|
||||
|
||||
// Statistics that are still accessible as a caller of `get_vectored_impl`.
|
||||
layers_visited: u32,
|
||||
delta_layers_visited: u32,
|
||||
}
|
||||
|
||||
impl ValuesReconstructState {
|
||||
@@ -134,9 +126,7 @@ impl ValuesReconstructState {
|
||||
Self {
|
||||
keys: HashMap::new(),
|
||||
keys_done: KeySpaceRandomAccum::new(),
|
||||
keys_with_image_coverage: None,
|
||||
layers_visited: 0,
|
||||
delta_layers_visited: 0,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -150,17 +140,8 @@ impl ValuesReconstructState {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) {
|
||||
pub(crate) fn on_layer_visited(&mut self) {
|
||||
self.layers_visited += 1;
|
||||
if let ReadableLayer::PersistentLayer(layer) = layer {
|
||||
if layer.layer_desc().is_delta() {
|
||||
self.delta_layers_visited += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_delta_layers_visited(&self) -> u32 {
|
||||
self.delta_layers_visited
|
||||
}
|
||||
|
||||
pub(crate) fn get_layers_visited(&self) -> u32 {
|
||||
@@ -190,16 +171,6 @@ impl ValuesReconstructState {
|
||||
}
|
||||
}
|
||||
|
||||
/// On hitting image layer, we can mark all keys in this range as done, because
|
||||
/// if the image layer does not contain a key, it is deleted/never added.
|
||||
pub(crate) fn on_image_layer_visited(&mut self, key_range: &Range<Key>) {
|
||||
let prev_val = self.keys_with_image_coverage.replace(key_range.clone());
|
||||
assert_eq!(
|
||||
prev_val, None,
|
||||
"should consume the keyspace before the next iteration"
|
||||
);
|
||||
}
|
||||
|
||||
/// Update the state collected for a given key.
|
||||
/// Returns true if this was the last value needed for the key and false otherwise.
|
||||
///
|
||||
@@ -262,12 +233,8 @@ impl ValuesReconstructState {
|
||||
|
||||
/// Returns the key space describing the keys that have
|
||||
/// been marked as completed since the last call to this function.
|
||||
/// Returns individual keys done, and the image layer coverage.
|
||||
pub(crate) fn consume_done_keys(&mut self) -> (KeySpace, Option<Range<Key>>) {
|
||||
(
|
||||
self.keys_done.consume_keyspace(),
|
||||
self.keys_with_image_coverage.take(),
|
||||
)
|
||||
pub(crate) fn consume_done_keys(&mut self) -> KeySpace {
|
||||
self.keys_done.consume_keyspace()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -318,7 +285,7 @@ pub(crate) struct LayerFringe {
|
||||
#[derive(Debug)]
|
||||
struct LayerKeyspace {
|
||||
layer: ReadableLayer,
|
||||
target_keyspace: Vec<KeySpace>,
|
||||
target_keyspace: KeySpace,
|
||||
}
|
||||
|
||||
impl LayerFringe {
|
||||
@@ -336,7 +303,6 @@ impl LayerFringe {
|
||||
};
|
||||
|
||||
let removed = self.layers.remove_entry(&read_desc.layer_id);
|
||||
|
||||
match removed {
|
||||
Some((
|
||||
_,
|
||||
@@ -344,15 +310,7 @@ impl LayerFringe {
|
||||
layer,
|
||||
target_keyspace,
|
||||
},
|
||||
)) => {
|
||||
let mut keyspace = KeySpaceRandomAccum::new();
|
||||
for ks in target_keyspace {
|
||||
for part in ks.ranges {
|
||||
keyspace.add_range(part);
|
||||
}
|
||||
}
|
||||
Some((layer, keyspace.consume_keyspace(), read_desc.lsn_range))
|
||||
}
|
||||
)) => Some((layer, target_keyspace, read_desc.lsn_range)),
|
||||
None => unreachable!("fringe internals are always consistent"),
|
||||
}
|
||||
}
|
||||
@@ -367,7 +325,7 @@ impl LayerFringe {
|
||||
let entry = self.layers.entry(layer_id.clone());
|
||||
match entry {
|
||||
Entry::Occupied(mut entry) => {
|
||||
entry.get_mut().target_keyspace.push(keyspace);
|
||||
entry.get_mut().target_keyspace.merge(&keyspace);
|
||||
}
|
||||
Entry::Vacant(entry) => {
|
||||
self.planned_reads_by_lsn.push(ReadDesc {
|
||||
@@ -376,7 +334,7 @@ impl LayerFringe {
|
||||
});
|
||||
entry.insert(LayerKeyspace {
|
||||
layer,
|
||||
target_keyspace: vec![keyspace],
|
||||
target_keyspace: keyspace,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -47,7 +47,7 @@ use hex;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::LayerAccessKind;
|
||||
use pageserver_api::shard::{ShardIdentity, TenantShardId};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fs::File;
|
||||
@@ -158,7 +158,6 @@ pub struct ImageLayerInner {
|
||||
index_start_blk: u32,
|
||||
index_root_blk: u32,
|
||||
|
||||
key_range: Range<Key>,
|
||||
lsn: Lsn,
|
||||
|
||||
file: VirtualFile,
|
||||
@@ -420,7 +419,6 @@ impl ImageLayerInner {
|
||||
file,
|
||||
file_id,
|
||||
max_vectored_read_bytes,
|
||||
key_range: actual_summary.key_range,
|
||||
}))
|
||||
}
|
||||
|
||||
@@ -473,27 +471,19 @@ impl ImageLayerInner {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
let reads = self
|
||||
.plan_reads(keyspace, None, ctx)
|
||||
.plan_reads(keyspace, ctx)
|
||||
.await
|
||||
.map_err(GetVectoredError::Other)?;
|
||||
|
||||
self.do_reads_and_update_state(reads, reconstruct_state, ctx)
|
||||
.await;
|
||||
|
||||
reconstruct_state.on_image_layer_visited(&self.key_range);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Traverse the layer's index to build read operations on the overlap of the input keyspace
|
||||
/// and the keys in this layer.
|
||||
///
|
||||
/// If shard_identity is provided, it will be used to filter keys down to those stored on
|
||||
/// this shard.
|
||||
async fn plan_reads(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
shard_identity: Option<&ShardIdentity>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<VectoredRead>> {
|
||||
let mut planner = VectoredReadPlanner::new(
|
||||
@@ -513,6 +503,7 @@ impl ImageLayerInner {
|
||||
|
||||
for range in keyspace.ranges.iter() {
|
||||
let mut range_end_handled = false;
|
||||
|
||||
let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
range.start.write_to_byte_slice(&mut search_key);
|
||||
|
||||
@@ -525,22 +516,12 @@ impl ImageLayerInner {
|
||||
let key = Key::from_slice(&raw_key[..KEY_SIZE]);
|
||||
assert!(key >= range.start);
|
||||
|
||||
let flag = if let Some(shard_identity) = shard_identity {
|
||||
if shard_identity.is_key_disposable(&key) {
|
||||
BlobFlag::Ignore
|
||||
} else {
|
||||
BlobFlag::None
|
||||
}
|
||||
} else {
|
||||
BlobFlag::None
|
||||
};
|
||||
|
||||
if key >= range.end {
|
||||
planner.handle_range_end(offset);
|
||||
range_end_handled = true;
|
||||
break;
|
||||
} else {
|
||||
planner.handle(key, self.lsn, offset, flag);
|
||||
planner.handle(key, self.lsn, offset, BlobFlag::None);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -553,50 +534,6 @@ impl ImageLayerInner {
|
||||
Ok(planner.finish())
|
||||
}
|
||||
|
||||
/// Given a key range, select the parts of that range that should be retained by the ShardIdentity,
|
||||
/// then execute vectored GET operations, passing the results of all read keys into the writer.
|
||||
pub(super) async fn filter(
|
||||
&self,
|
||||
shard_identity: &ShardIdentity,
|
||||
writer: &mut ImageLayerWriter,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<usize> {
|
||||
// Fragment the range into the regions owned by this ShardIdentity
|
||||
let plan = self
|
||||
.plan_reads(
|
||||
KeySpace {
|
||||
// If asked for the total key space, plan_reads will give us all the keys in the layer
|
||||
ranges: vec![Key::MIN..Key::MAX],
|
||||
},
|
||||
Some(shard_identity),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&self.file);
|
||||
let mut key_count = 0;
|
||||
for read in plan.into_iter() {
|
||||
let buf_size = read.size();
|
||||
|
||||
let buf = BytesMut::with_capacity(buf_size);
|
||||
let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?;
|
||||
|
||||
let frozen_buf = blobs_buf.buf.freeze();
|
||||
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let img_buf = frozen_buf.slice(meta.start..meta.end);
|
||||
|
||||
key_count += 1;
|
||||
writer
|
||||
.put_image(meta.meta.key, img_buf, ctx)
|
||||
.await
|
||||
.context(format!("Storing key {}", meta.meta.key))?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(key_count)
|
||||
}
|
||||
|
||||
async fn do_reads_and_update_state(
|
||||
&self,
|
||||
reads: Vec<VectoredRead>,
|
||||
@@ -709,7 +646,7 @@ impl ImageLayerWriterInner {
|
||||
lsn,
|
||||
},
|
||||
);
|
||||
trace!("creating image layer {}", path);
|
||||
info!("new image layer {path}");
|
||||
let mut file = {
|
||||
VirtualFile::open_with_options(
|
||||
&path,
|
||||
@@ -829,7 +766,7 @@ impl ImageLayerWriterInner {
|
||||
// FIXME: why not carry the virtualfile here, it supports renaming?
|
||||
let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
|
||||
|
||||
info!("created image layer {}", layer.local_path());
|
||||
trace!("created image layer {}", layer.local_path());
|
||||
|
||||
Ok(layer)
|
||||
}
|
||||
@@ -914,136 +851,3 @@ impl Drop for ImageLayerWriter {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::{
|
||||
key::Key,
|
||||
shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize},
|
||||
};
|
||||
use utils::{id::TimelineId, lsn::Lsn};
|
||||
|
||||
use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION};
|
||||
|
||||
use super::ImageLayerWriter;
|
||||
|
||||
#[tokio::test]
|
||||
async fn image_layer_rewrite() {
|
||||
let harness = TenantHarness::create("test_image_layer_rewrite").unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
// The LSN at which we will create an image layer to filter
|
||||
let lsn = Lsn(0xdeadbeef0000);
|
||||
|
||||
let timeline_id = TimelineId::generate();
|
||||
let timeline = tenant
|
||||
.create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// This key range contains several 0x8000 page stripes, only one of which belongs to shard zero
|
||||
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
|
||||
let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
|
||||
let range = input_start..input_end;
|
||||
|
||||
// Build an image layer to filter
|
||||
let resident = {
|
||||
let mut writer = ImageLayerWriter::new(
|
||||
harness.conf,
|
||||
timeline_id,
|
||||
harness.tenant_shard_id,
|
||||
&range,
|
||||
lsn,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let foo_img = Bytes::from_static(&[1, 2, 3, 4]);
|
||||
let mut key = range.start;
|
||||
while key < range.end {
|
||||
writer.put_image(key, foo_img.clone(), &ctx).await.unwrap();
|
||||
|
||||
key = key.next();
|
||||
}
|
||||
writer.finish(&timeline, &ctx).await.unwrap()
|
||||
};
|
||||
let original_size = resident.metadata().file_size;
|
||||
|
||||
// Filter for various shards: this exercises cases like values at start of key range, end of key
|
||||
// range, middle of key range.
|
||||
for shard_number in 0..4 {
|
||||
let mut filtered_writer = ImageLayerWriter::new(
|
||||
harness.conf,
|
||||
timeline_id,
|
||||
harness.tenant_shard_id,
|
||||
&range,
|
||||
lsn,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// TenantHarness gave us an unsharded tenant, but we'll use a sharded ShardIdentity
|
||||
// to exercise filter()
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(shard_number),
|
||||
ShardCount::new(4),
|
||||
ShardStripeSize(0x8000),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let wrote_keys = resident
|
||||
.filter(&shard_identity, &mut filtered_writer, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let replacement = if wrote_keys > 0 {
|
||||
Some(filtered_writer.finish(&timeline, &ctx).await.unwrap())
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// This exact size and those below will need updating as/when the layer encoding changes, but
|
||||
// should be deterministic for a given version of the format, as we used no randomness generating the input.
|
||||
assert_eq!(original_size, 1597440);
|
||||
|
||||
match shard_number {
|
||||
0 => {
|
||||
// We should have written out just one stripe for our shard identity
|
||||
assert_eq!(wrote_keys, 0x8000);
|
||||
let replacement = replacement.unwrap();
|
||||
|
||||
// We should have dropped some of the data
|
||||
assert!(replacement.metadata().file_size < original_size);
|
||||
assert!(replacement.metadata().file_size > 0);
|
||||
|
||||
// Assert that we dropped ~3/4 of the data.
|
||||
assert_eq!(replacement.metadata().file_size, 417792);
|
||||
}
|
||||
1 => {
|
||||
// Shard 1 has no keys in our input range
|
||||
assert_eq!(wrote_keys, 0x0);
|
||||
assert!(replacement.is_none());
|
||||
}
|
||||
2 => {
|
||||
// Shard 2 has one stripes in the input range
|
||||
assert_eq!(wrote_keys, 0x8000);
|
||||
let replacement = replacement.unwrap();
|
||||
assert!(replacement.metadata().file_size < original_size);
|
||||
assert!(replacement.metadata().file_size > 0);
|
||||
assert_eq!(replacement.metadata().file_size, 417792);
|
||||
}
|
||||
3 => {
|
||||
// Shard 3 has two stripes in the input range
|
||||
assert_eq!(wrote_keys, 0x10000);
|
||||
let replacement = replacement.unwrap();
|
||||
assert!(replacement.metadata().file_size < original_size);
|
||||
assert!(replacement.metadata().file_size > 0);
|
||||
assert_eq!(replacement.metadata().file_size, 811008);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user