Mirror of https://github.com/neondatabase/neon.git, synced 2026-01-26 14:50:36 +00:00

Compare commits: release-45...proxy-http (2 commits)

| Author | SHA1 | Date |
|---|---|---|
| | 6d2bbffdab | |
| | 7151bcc175 | |
@@ -1,2 +0,0 @@
[profile.default]
slow-timeout = "1m"
29 .github/workflows/benchmarking.yml (vendored)
@@ -11,7 +11,7 @@ on:
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 3 * * *' # run once a day, timezone is utc
- cron: '0 3 * * *' # run once a day, timezone is utc

workflow_dispatch: # adds ability to run this manually
inputs:
@@ -23,21 +23,6 @@ on:
type: boolean
description: 'Publish perf report. If not set, the report will be published only for the main branch'
required: false
collect_olap_explain:
type: boolean
description: 'Collect EXPLAIN ANALYZE for OLAP queries. If not set, EXPLAIN ANALYZE will not be collected'
required: false
default: false
collect_pg_stat_statements:
type: boolean
description: 'Collect pg_stat_statements for OLAP queries. If not set, pg_stat_statements will not be collected'
required: false
default: false
run_AWS_RDS_AND_AURORA:
type: boolean
description: 'AWS-RDS and AWS-AURORA normally only run on Saturday. Set this to true to run them on every workflow_dispatch'
required: false
default: false

defaults:
run:
@@ -128,8 +113,6 @@ jobs:
# - neon-captest-reuse: Reusing existing project
# - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
# - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
env:
RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }}
runs-on: ubuntu-latest
outputs:
pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
@@ -169,7 +152,7 @@ jobs:
]
}'

if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
{ "platform": "rds-aurora" }]')
fi
@@ -188,9 +171,9 @@ jobs:
]
}'

if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
{ "platform": "rds-aurora", "scale": "10" }]')
{ "platform": "rds-aurora", "scale": "10" }]')
fi

echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -354,8 +337,6 @@ jobs:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output
TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }}
TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }}
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: ${{ matrix.platform }}
@@ -418,8 +399,6 @@ jobs:
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain || 'false' }}
TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements || 'false' }}
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
TEST_OLAP_SCALE: 10
105 .github/workflows/build_and_push_docker_image.yml (vendored)
@@ -1,105 +0,0 @@
|
||||
name: Build and Push Docker Image
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
dockerfile-path:
|
||||
required: true
|
||||
type: string
|
||||
image-name:
|
||||
required: true
|
||||
type: string
|
||||
outputs:
|
||||
build-tools-tag:
|
||||
description: "tag generated for build tools"
|
||||
value: ${{ jobs.tag.outputs.build-tools-tag }}
|
||||
|
||||
jobs:
|
||||
check-if-build-tools-dockerfile-changed:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }}
|
||||
steps:
|
||||
- name: Check if Dockerfile.buildtools has changed
|
||||
id: dockerfile
|
||||
run: |
|
||||
if [[ "$GITHUB_EVENT_NAME" != "pull_request" ]]; then
|
||||
echo "docker_file_changed=false" >> $GITHUB_OUTPUT
|
||||
exit
|
||||
fi
|
||||
updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only)
|
||||
if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then
|
||||
echo "docker_file_changed=true" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
tag:
|
||||
runs-on: ubuntu-latest
|
||||
needs: [ check-if-build-tools-dockerfile-changed ]
|
||||
outputs:
|
||||
build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}}
|
||||
|
||||
steps:
|
||||
- name: Get buildtools tag
|
||||
env:
|
||||
DOCKERFILE_CHANGED: ${{ needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed }}
|
||||
run: |
|
||||
if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "${DOCKERFILE_CHANGED}" == "true" ]]; then
|
||||
IMAGE_TAG=$GITHUB_RUN_ID
|
||||
else
|
||||
IMAGE_TAG=pinned
|
||||
fi
|
||||
|
||||
echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
|
||||
shell: bash
|
||||
id: buildtools-tag
|
||||
|
||||
kaniko:
|
||||
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
|
||||
needs: [ tag, check-if-build-tools-dockerfile-changed ]
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container: gcr.io/kaniko-project/executor:v1.7.0-debug
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Configure ECR login
|
||||
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
|
||||
|
||||
- name: Kaniko build
|
||||
run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
|
||||
|
||||
kaniko-arm:
|
||||
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
|
||||
needs: [ tag, check-if-build-tools-dockerfile-changed ]
|
||||
runs-on: [ self-hosted, dev, arm64 ]
|
||||
container: gcr.io/kaniko-project/executor:v1.7.0-debug
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Configure ECR login
|
||||
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
|
||||
|
||||
- name: Kaniko build
|
||||
run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
|
||||
|
||||
manifest:
|
||||
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
|
||||
name: 'manifest'
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
needs:
|
||||
- tag
|
||||
- kaniko
|
||||
- kaniko-arm
|
||||
- check-if-build-tools-dockerfile-changed
|
||||
|
||||
steps:
|
||||
- name: Create manifest
|
||||
run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
|
||||
|
||||
- name: Push manifest
|
||||
run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}
|
||||
67 .github/workflows/build_and_test.yml (vendored)
@@ -44,6 +44,7 @@ jobs:
|
||||
|
||||
exit 1
|
||||
|
||||
|
||||
tag:
|
||||
needs: [ check-permissions ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
@@ -73,19 +74,11 @@ jobs:
|
||||
shell: bash
|
||||
id: build-tag
|
||||
|
||||
build-buildtools-image:
|
||||
needs: [ check-permissions ]
|
||||
uses: ./.github/workflows/build_and_push_docker_image.yml
|
||||
with:
|
||||
dockerfile-path: Dockerfile.buildtools
|
||||
image-name: build-tools
|
||||
secrets: inherit
|
||||
|
||||
check-codestyle-python:
|
||||
needs: [ check-permissions, build-buildtools-image ]
|
||||
needs: [ check-permissions ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -105,20 +98,20 @@ jobs:
|
||||
- name: Install Python deps
|
||||
run: ./scripts/pysync
|
||||
|
||||
- name: Run `ruff check` to ensure code format
|
||||
run: poetry run ruff check .
|
||||
- name: Run ruff to ensure code format
|
||||
run: poetry run ruff .
|
||||
|
||||
- name: Run `ruff format` to ensure code format
|
||||
run: poetry run ruff format --check .
|
||||
- name: Run black to ensure code format
|
||||
run: poetry run black --diff --check .
|
||||
|
||||
- name: Run mypy to check types
|
||||
run: poetry run mypy .
|
||||
|
||||
check-codestyle-rust:
|
||||
needs: [ check-permissions, build-buildtools-image ]
|
||||
needs: [ check-permissions ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -182,10 +175,10 @@ jobs:
|
||||
run: cargo deny check --hide-inclusion-graph
|
||||
|
||||
build-neon:
|
||||
needs: [ check-permissions, tag, build-buildtools-image ]
|
||||
needs: [ check-permissions, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -339,16 +332,16 @@ jobs:
|
||||
run: |
|
||||
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
|
||||
|
||||
- name: Run rust tests
|
||||
- name: Run cargo test
|
||||
run: |
|
||||
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
|
||||
${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES
|
||||
|
||||
# Run separate tests for real S3
|
||||
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
|
||||
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
|
||||
export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
|
||||
export REMOTE_STORAGE_S3_REGION=eu-central-1
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_s3)'
|
||||
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
|
||||
|
||||
# Run separate tests for real Azure Blob Storage
|
||||
# XXX: replace region with `eu-central-1`-like region
|
||||
@@ -358,7 +351,7 @@ jobs:
|
||||
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
|
||||
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_azure)'
|
||||
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
|
||||
|
||||
- name: Install rust binaries
|
||||
run: |
|
||||
@@ -415,10 +408,10 @@ jobs:
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
regress-tests:
|
||||
needs: [ check-permissions, build-neon, build-buildtools-image, tag ]
|
||||
needs: [ check-permissions, build-neon, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
# Default shared memory is 64mb
|
||||
options: --init --shm-size=512mb
|
||||
strategy:
|
||||
@@ -454,10 +447,10 @@ jobs:
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
benchmarks:
|
||||
needs: [ check-permissions, build-neon, build-buildtools-image ]
|
||||
needs: [ check-permissions, build-neon ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
# Default shared memory is 64mb
|
||||
options: --init --shm-size=512mb
|
||||
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
|
||||
@@ -486,12 +479,12 @@ jobs:
|
||||
# while coverage is currently collected for the debug ones
|
||||
|
||||
create-test-report:
|
||||
needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-buildtools-image ]
|
||||
needs: [ check-permissions, regress-tests, coverage-report, benchmarks ]
|
||||
if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
|
||||
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -533,10 +526,11 @@ jobs:
|
||||
})
|
||||
|
||||
coverage-report:
|
||||
needs: [ check-permissions, regress-tests, build-buildtools-image ]
|
||||
needs: [ check-permissions, regress-tests ]
|
||||
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -700,7 +694,7 @@ jobs:
|
||||
}"
|
||||
|
||||
neon-image:
|
||||
needs: [ check-permissions, build-buildtools-image, tag ]
|
||||
needs: [ check-permissions, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container: gcr.io/kaniko-project/executor:v1.9.2-debug
|
||||
defaults:
|
||||
@@ -739,7 +733,6 @@ jobs:
|
||||
--context .
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
--build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
|
||||
--build-arg TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
|
||||
--destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
|
||||
@@ -750,7 +743,7 @@ jobs:
|
||||
|
||||
compute-tools-image:
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
needs: [ check-permissions, build-buildtools-image, tag ]
|
||||
needs: [ check-permissions, tag ]
|
||||
container: gcr.io/kaniko-project/executor:v1.9.2-debug
|
||||
defaults:
|
||||
run:
|
||||
@@ -785,7 +778,6 @@ jobs:
|
||||
--context .
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
|
||||
--build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--dockerfile Dockerfile.compute-tools
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
|
||||
@@ -796,7 +788,7 @@ jobs:
|
||||
run: rm -rf ~/.ecr
|
||||
|
||||
compute-node-image:
|
||||
needs: [ check-permissions, build-buildtools-image, tag ]
|
||||
needs: [ check-permissions, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: gcr.io/kaniko-project/executor:v1.9.2-debug
|
||||
@@ -844,7 +836,6 @@ jobs:
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
--build-arg PG_VERSION=${{ matrix.version }}
|
||||
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
|
||||
--build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--dockerfile Dockerfile.compute-node
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
@@ -866,7 +857,7 @@ jobs:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
env:
|
||||
VM_BUILDER_VERSION: v0.21.0
|
||||
VM_BUILDER_VERSION: v0.19.0
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
|
||||
2 .github/workflows/neon_extra_builds.yml (vendored)
@@ -218,7 +218,7 @@ jobs:

# Run separate tests for real S3
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
export REMOTE_STORAGE_S3_REGION=eu-central-1
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
130 .github/workflows/update_build_tools_image.yml (vendored)
@@ -1,130 +0,0 @@
|
||||
name: 'Update build tools image tag'
|
||||
|
||||
# This workflow is used to update the tag of the build tools image in ECR.
|
||||
# The most common use case is adding/moving the `pinned` tag to the `${GITHUB_RUN_ID}` image.
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
from-tag:
|
||||
description: 'Source tag'
|
||||
required: true
|
||||
type: string
|
||||
to-tag:
|
||||
description: 'Destination tag'
|
||||
required: true
|
||||
type: string
|
||||
default: 'pinned'
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euo pipefail {0}
|
||||
|
||||
env:
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
tag-image:
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container: golang:1.19-bullseye
|
||||
|
||||
env:
|
||||
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
|
||||
FROM_TAG: ${{ inputs.from-tag }}
|
||||
TO_TAG: ${{ inputs.to-tag }}
|
||||
outputs:
|
||||
next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }}
|
||||
prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }}
|
||||
|
||||
steps:
|
||||
- name: Install Crane & ECR helper
|
||||
run: |
|
||||
go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
|
||||
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
|
||||
|
||||
- name: Configure ECR login
|
||||
run: |
|
||||
mkdir /github/home/.docker/
|
||||
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
|
||||
|
||||
- name: Get source image digest
|
||||
id: next-digest
|
||||
run: |
|
||||
NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true)
|
||||
if [ -z "${NEXT_DIGEST}" ]; then
|
||||
echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}"
|
||||
echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Get destination image digest (if already exists)
|
||||
id: prev-digest
|
||||
run: |
|
||||
PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true)
|
||||
if [ -z "${PREV_DIGEST}" ]; then
|
||||
echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)"
|
||||
else
|
||||
echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}"
|
||||
|
||||
echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Tag image
|
||||
run: |
|
||||
crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}"
|
||||
|
||||
rollback-tag-image:
|
||||
needs: tag-image
|
||||
if: ${{ !success() }}
|
||||
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container: golang:1.19-bullseye
|
||||
|
||||
env:
|
||||
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
|
||||
FROM_TAG: ${{ inputs.from-tag }}
|
||||
TO_TAG: ${{ inputs.to-tag }}
|
||||
|
||||
steps:
|
||||
- name: Install Crane & ECR helper
|
||||
run: |
|
||||
go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
|
||||
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
|
||||
|
||||
- name: Configure ECR login
|
||||
run: |
|
||||
mkdir /github/home/.docker/
|
||||
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
|
||||
|
||||
- name: Restore previous tag if needed
|
||||
run: |
|
||||
NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}"
|
||||
PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}"
|
||||
|
||||
if [ -z "${NEXT_DIGEST}" ]; then
|
||||
echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [ -z "${PREV_DIGEST}" ]; then
|
||||
# I guess we should delete the tag here/untag the image, but crane does not support it
|
||||
# - https://github.com/google/go-containerregistry/issues/999
|
||||
|
||||
echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback"
|
||||
|
||||
exit 0
|
||||
fi
|
||||
|
||||
CURRENT_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}")
|
||||
if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then
|
||||
crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}"
|
||||
|
||||
echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}"
|
||||
else
|
||||
echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored"
|
||||
fi
|
||||
1 .gitignore (vendored)
@@ -6,7 +6,6 @@ __pycache__/
test_output/
.vscode
.idea
neon.iml
/.neon
/integration_tests/.neon
@@ -70,17 +70,3 @@ We're using the following approach to make it work:
- The label gets removed automatically, so to run CI again with new changes, the label should be added again (after the review)

For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml)

## How do I add the "pinned" tag to a buildtools image?

We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup; currently, adding the `pinned` tag is a manual operation.

You can call it from the GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml,
or using the GitHub CLI:

```bash
gh workflow -R neondatabase/neon run update_build_tools_image.yml \
  -f from-tag=6254913013 \
  -f to-tag=pinned

# Default `-f to-tag` is `pinned`, so the parameter can be omitted.
```
241 Cargo.lock (generated)
@@ -190,9 +190,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "async-compression"
|
||||
version = "0.4.5"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bc2d0cfb2a7388d34f590e76686704c494ed7aaceed62ee1ba35cbf363abc2a5"
|
||||
checksum = "5b0122885821398cc923ece939e24d1056a2384ee719432397fa9db87230ff11"
|
||||
dependencies = [
|
||||
"flate2",
|
||||
"futures-core",
|
||||
@@ -233,7 +233,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.32",
|
||||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -244,7 +244,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.32",
|
||||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -881,7 +881,7 @@ dependencies = [
|
||||
"regex",
|
||||
"rustc-hash",
|
||||
"shlex",
|
||||
"syn 2.0.32",
|
||||
"syn 2.0.28",
|
||||
"which",
|
||||
]
|
||||
|
||||
@@ -1095,7 +1095,7 @@ dependencies = [
|
||||
"heck",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.32",
|
||||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1161,7 +1161,6 @@ dependencies = [
|
||||
"flate2",
|
||||
"futures",
|
||||
"hyper",
|
||||
"nix 0.26.2",
|
||||
"notify",
|
||||
"num_cpus",
|
||||
"opentelemetry",
|
||||
@@ -1169,10 +1168,8 @@ dependencies = [
|
||||
"regex",
|
||||
"remote_storage",
|
||||
"reqwest",
|
||||
"rust-ini",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"signal-hook",
|
||||
"tar",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
@@ -1204,26 +1201,6 @@ version = "0.9.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "28c122c3980598d243d63d9a704629a2d748d101f278052ff068be5a4423ab6f"
|
||||
|
||||
[[package]]
|
||||
name = "const-random"
|
||||
version = "0.1.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5aaf16c9c2c612020bcfd042e170f6e32de9b9d75adb5277cdbbd2e2c8c8299a"
|
||||
dependencies = [
|
||||
"const-random-macro",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "const-random-macro"
|
||||
version = "0.1.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e"
|
||||
dependencies = [
|
||||
"getrandom 0.2.11",
|
||||
"once_cell",
|
||||
"tiny-keccak",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "const_fn"
|
||||
version = "0.4.9"
|
||||
@@ -1268,19 +1245,16 @@ name = "control_plane"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"camino",
|
||||
"clap",
|
||||
"comfy-table",
|
||||
"compute_api",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hex",
|
||||
"hyper",
|
||||
"nix 0.26.2",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"postgres",
|
||||
"postgres_backend",
|
||||
"postgres_connection",
|
||||
@@ -1294,8 +1268,6 @@ dependencies = [
|
||||
"tar",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-util",
|
||||
"toml",
|
||||
"tracing",
|
||||
"url",
|
||||
@@ -1456,12 +1428,6 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crunchy"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
|
||||
|
||||
[[package]]
|
||||
name = "crypto-bigint"
|
||||
version = "0.4.9"
|
||||
@@ -1515,7 +1481,7 @@ dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"strsim",
|
||||
"syn 2.0.32",
|
||||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1526,7 +1492,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a"
|
||||
dependencies = [
|
||||
"darling_core",
|
||||
"quote",
|
||||
"syn 2.0.32",
|
||||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1601,16 +1567,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dlv-list"
|
||||
version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "442039f5147480ba31067cb00ada1adae6892028e40e45fc5de7b7df6dcc1b5f"
|
||||
dependencies = [
|
||||
"const-random",
|
||||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1704,7 +1661,7 @@ dependencies = [
|
||||
"darling",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.32",
|
||||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1958,7 +1915,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.32",
|
||||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2144,20 +2101,6 @@ dependencies = [
|
||||
"hashbrown 0.13.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hdrhistogram"
|
||||
version = "7.5.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"byteorder",
|
||||
"crossbeam-channel",
|
||||
"flate2",
|
||||
"nom",
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heapless"
|
||||
version = "0.8.0"
|
||||
@@ -2539,14 +2482,13 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "jsonwebtoken"
|
||||
version = "9.2.0"
|
||||
version = "8.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5c7ea04a7c5c055c175f189b6dc6ba036fd62306b58c66c9f6389036c503a3f4"
|
||||
checksum = "6971da4d9c3aa03c3d8f3ff0f4155b534aad021292003895a469716b2a230378"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"js-sys",
|
||||
"pem 3.0.3",
|
||||
"ring 0.17.6",
|
||||
"pem 1.1.1",
|
||||
"ring 0.16.20",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"simple_asn1",
|
||||
@@ -2959,7 +2901,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.32",
|
||||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3081,16 +3023,6 @@ dependencies = [
|
||||
"tokio-stream",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ordered-multimap"
|
||||
version = "0.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a4d6a8c22fc714f0c2373e6091bf6f5e9b37b1bc0b1184874b7e0a4e303d318f"
|
||||
dependencies = [
|
||||
"dlv-list",
|
||||
"hashbrown 0.14.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "os_info"
|
||||
version = "3.7.0"
|
||||
@@ -3119,28 +3051,6 @@ dependencies = [
|
||||
"sha2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pagebench"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
"futures",
|
||||
"hdrhistogram",
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"pageserver",
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"rand 0.8.5",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pagectl"
|
||||
version = "0.1.0"
|
||||
@@ -3230,7 +3140,6 @@ dependencies = [
|
||||
"tokio",
|
||||
"tokio-io-timeout",
|
||||
"tokio-postgres",
|
||||
"tokio-stream",
|
||||
"tokio-tar",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
@@ -3253,7 +3162,6 @@ dependencies = [
|
||||
"enum-map",
|
||||
"hex",
|
||||
"postgres_ffi",
|
||||
"rand 0.8.5",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
@@ -3264,27 +3172,6 @@ dependencies = [
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pageserver_client"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"bytes",
|
||||
"futures",
|
||||
"pageserver_api",
|
||||
"postgres",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parking"
|
||||
version = "2.1.1"
|
||||
@@ -3376,19 +3263,18 @@ checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
|
||||
|
||||
[[package]]
|
||||
name = "pem"
|
||||
version = "2.0.1"
|
||||
version = "1.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6b13fe415cdf3c8e44518e18a7c95a13431d9bdf6d15367d82b23c377fdd441a"
|
||||
checksum = "a8835c273a76a90455d7344889b0964598e3316e2a79ede8e36f16bdcf2228b8"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"serde",
|
||||
"base64 0.13.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pem"
|
||||
version = "3.0.3"
|
||||
version = "2.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1b8fcc794035347fb64beda2d3b462595dd2753e3f268d89c5aae77e8cf2c310"
|
||||
checksum = "6b13fe415cdf3c8e44518e18a7c95a13431d9bdf6d15367d82b23c377fdd441a"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"serde",
|
||||
@@ -3445,7 +3331,7 @@ checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.32",
|
||||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3652,7 +3538,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"syn 2.0.32",
|
||||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4260,20 +4146,10 @@ dependencies = [
|
||||
"regex",
|
||||
"relative-path",
|
||||
"rustc_version",
|
||||
"syn 2.0.32",
|
||||
"syn 2.0.28",
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rust-ini"
|
||||
version = "0.20.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3e0698206bcb8882bf2a9ecb4c1e7785db57ff052297085a6efd4fe42302068a"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"ordered-multimap",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustc-demangle"
|
||||
version = "0.1.23"
|
||||
@@ -4405,20 +4281,17 @@ dependencies = [
|
||||
"async-stream",
|
||||
"aws-config",
|
||||
"aws-sdk-s3",
|
||||
"aws-smithy-async",
|
||||
"bincode",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"clap",
|
||||
"crc32c",
|
||||
"either",
|
||||
"futures",
|
||||
"futures-util",
|
||||
"hex",
|
||||
"histogram",
|
||||
"itertools",
|
||||
"pageserver",
|
||||
"pageserver_api",
|
||||
"rand 0.8.5",
|
||||
"remote_storage",
|
||||
"reqwest",
|
||||
@@ -4451,7 +4324,6 @@ dependencies = [
|
||||
"clap",
|
||||
"const_format",
|
||||
"crc32c",
|
||||
"fail",
|
||||
"fs2",
|
||||
"futures",
|
||||
"git-version",
|
||||
@@ -4475,7 +4347,6 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
"sha2",
|
||||
"signal-hook",
|
||||
"storage_broker",
|
||||
"thiserror",
|
||||
@@ -4528,12 +4399,12 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
|
||||
|
||||
[[package]]
|
||||
name = "sct"
|
||||
version = "0.7.1"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414"
|
||||
checksum = "d53dcdb7c9f8158937a7981b48accfd39a43af418591a5d008c7b22b5e1b7ca4"
|
||||
dependencies = [
|
||||
"ring 0.17.6",
|
||||
"untrusted 0.9.0",
|
||||
"ring 0.16.20",
|
||||
"untrusted 0.7.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4709,7 +4580,7 @@ checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.32",
|
||||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4790,7 +4661,7 @@ dependencies = [
|
||||
"darling",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.32",
|
||||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5057,9 +4928,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "2.0.32"
|
||||
version = "2.0.28"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2"
|
||||
checksum = "04361975b3f5e348b2189d8dc55bc942f278b2d482a6a0365de5bdd62d351567"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -5189,7 +5060,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.32",
|
||||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5232,15 +5103,6 @@ dependencies = [
|
||||
"time-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tiny-keccak"
|
||||
version = "2.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237"
|
||||
dependencies = [
|
||||
"crunchy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinytemplate"
|
||||
version = "1.2.1"
|
||||
@@ -5316,7 +5178,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.32",
|
||||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5617,7 +5479,7 @@ checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.32",
|
||||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5884,7 +5746,6 @@ dependencies = [
|
||||
"chrono",
|
||||
"const_format",
|
||||
"criterion",
|
||||
"fail",
|
||||
"futures",
|
||||
"heapless",
|
||||
"hex",
|
||||
@@ -6063,7 +5924,7 @@ dependencies = [
|
||||
"once_cell",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.32",
|
||||
"syn 2.0.28",
|
||||
"wasm-bindgen-shared",
|
||||
]
|
||||
|
||||
@@ -6097,7 +5958,7 @@ checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.32",
|
||||
"syn 2.0.28",
|
||||
"wasm-bindgen-backend",
|
||||
"wasm-bindgen-shared",
|
||||
]
|
||||
@@ -6409,7 +6270,6 @@ dependencies = [
|
||||
"futures-io",
|
||||
"futures-sink",
|
||||
"futures-util",
|
||||
"getrandom 0.2.11",
|
||||
"hex",
|
||||
"hmac",
|
||||
"hyper",
|
||||
@@ -6421,7 +6281,6 @@ dependencies = [
|
||||
"num-bigint",
|
||||
"num-integer",
|
||||
"num-traits",
|
||||
"once_cell",
|
||||
"prost",
|
||||
"rand 0.8.5",
|
||||
"regex",
|
||||
@@ -6436,7 +6295,7 @@ dependencies = [
|
||||
"smallvec",
|
||||
"subtle",
|
||||
"syn 1.0.109",
|
||||
"syn 2.0.32",
|
||||
"syn 2.0.28",
|
||||
"time",
|
||||
"time-macros",
|
||||
"tokio",
|
||||
@@ -6498,22 +6357,22 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "zerocopy"
|
||||
version = "0.7.31"
|
||||
version = "0.7.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1c4061bedbb353041c12f413700357bec76df2c7e2ca8e4df8bac24c6bf68e3d"
|
||||
checksum = "7a7af71d8643341260a65f89fa60c0eeaa907f34544d8f6d9b0df72f069b5e74"
|
||||
dependencies = [
|
||||
"zerocopy-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerocopy-derive"
|
||||
version = "0.7.31"
|
||||
version = "0.7.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a"
|
||||
checksum = "9731702e2f0617ad526794ae28fbc6f6ca8849b5ba729666c2a5bc4b6ddee2cd"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.32",
|
||||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6524,28 +6383,30 @@ checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9"
|
||||
|
||||
[[package]]
|
||||
name = "zstd"
|
||||
version = "0.13.0"
|
||||
version = "0.12.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bffb3309596d527cfcba7dfc6ed6052f1d39dfbd7c867aa2e865e4a449c10110"
|
||||
checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c"
|
||||
dependencies = [
|
||||
"zstd-safe",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd-safe"
|
||||
version = "7.0.0"
|
||||
version = "6.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "43747c7422e2924c11144d5229878b98180ef8b06cca4ab5af37afc8a8d8ea3e"
|
||||
checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"zstd-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd-sys"
|
||||
version = "2.0.9+zstd.1.5.5"
|
||||
version = "2.0.8+zstd.1.5.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9e16efa8a874a0481a574084d34cc26fdb3b99627480f785888deb6386506656"
|
||||
checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
"pkg-config",
|
||||
]
|
||||
|
||||
@@ -5,8 +5,6 @@ members = [
"control_plane",
"pageserver",
"pageserver/ctl",
"pageserver/client",
"pageserver/pagebench",
"proxy",
"safekeeper",
"storage_broker",
@@ -80,7 +78,6 @@ futures-util = "0.3"
git-version = "0.3"
hashbrown = "0.13"
hashlink = "0.8.1"
hdrhistogram = "7.5.2"
hex = "0.4"
hex-literal = "0.4"
hmac = "0.12.1"
@@ -93,7 +90,7 @@ hyper-tungstenite = "0.11"
inotify = "0.10.2"
ipnet = "2.9.0"
itertools = "0.10"
jsonwebtoken = "9"
jsonwebtoken = "8"
libc = "0.2"
md5 = "0.7.0"
memoffset = "0.8"
@@ -185,7 +182,6 @@ compute_api = { version = "0.1", path = "./libs/compute_api/" }
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
metrics = { version = "0.1", path = "./libs/metrics/" }
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
pageserver_client = { path = "./pageserver/client" }
postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
@@ -3,7 +3,7 @@
### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used
### inside this image in the real deployments.
ARG REPOSITORY=neondatabase
ARG IMAGE=build-tools
ARG IMAGE=rust
ARG TAG=pinned

# Build Postgres
@@ -1,166 +0,0 @@
|
||||
FROM debian:bullseye-slim
|
||||
|
||||
# Add nonroot user
|
||||
RUN useradd -ms /bin/bash nonroot -b /home
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
|
||||
# System deps
|
||||
RUN set -e \
|
||||
&& apt update \
|
||||
&& apt install -y \
|
||||
autoconf \
|
||||
automake \
|
||||
bison \
|
||||
build-essential \
|
||||
ca-certificates \
|
||||
cmake \
|
||||
curl \
|
||||
flex \
|
||||
git \
|
||||
gnupg \
|
||||
gzip \
|
||||
jq \
|
||||
libcurl4-openssl-dev \
|
||||
libbz2-dev \
|
||||
libffi-dev \
|
||||
liblzma-dev \
|
||||
libncurses5-dev \
|
||||
libncursesw5-dev \
|
||||
libpq-dev \
|
||||
libreadline-dev \
|
||||
libseccomp-dev \
|
||||
libsqlite3-dev \
|
||||
libssl-dev \
|
||||
libstdc++-10-dev \
|
||||
libtool \
|
||||
libxml2-dev \
|
||||
libxmlsec1-dev \
|
||||
libxxhash-dev \
|
||||
lsof \
|
||||
make \
|
||||
netcat \
|
||||
net-tools \
|
||||
openssh-client \
|
||||
parallel \
|
||||
pkg-config \
|
||||
unzip \
|
||||
wget \
|
||||
xz-utils \
|
||||
zlib1g-dev \
|
||||
zstd \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# protobuf-compiler (protoc)
|
||||
ENV PROTOC_VERSION 25.1
|
||||
RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
|
||||
&& unzip -q protoc.zip -d protoc \
|
||||
&& mv protoc/bin/protoc /usr/local/bin/protoc \
|
||||
&& mv protoc/include/google /usr/local/include/google \
|
||||
&& rm -rf protoc.zip protoc
|
||||
|
||||
# LLVM
|
||||
ENV LLVM_VERSION=17
|
||||
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
|
||||
&& echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
|
||||
&& apt update \
|
||||
&& apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \
|
||||
&& bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# PostgreSQL 14
|
||||
RUN curl -fsSL 'https://www.postgresql.org/media/keys/ACCC4CF8.asc' | apt-key add - \
|
||||
&& echo 'deb http://apt.postgresql.org/pub/repos/apt bullseye-pgdg main' > /etc/apt/sources.list.d/pgdg.list \
|
||||
&& apt update \
|
||||
&& apt install -y postgresql-client-14 \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# AWS CLI
|
||||
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \
|
||||
&& unzip -q awscliv2.zip \
|
||||
&& ./aws/install \
|
||||
&& rm awscliv2.zip
|
||||
|
||||
# Mold: A Modern Linker
|
||||
ENV MOLD_VERSION v2.4.0
|
||||
RUN set -e \
|
||||
&& git clone https://github.com/rui314/mold.git \
|
||||
&& mkdir mold/build \
|
||||
&& cd mold/build \
|
||||
&& git checkout ${MOLD_VERSION} \
|
||||
&& cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=clang++ .. \
|
||||
&& cmake --build . -j $(nproc) \
|
||||
&& cmake --install . \
|
||||
&& cd .. \
|
||||
&& rm -rf mold
|
||||
|
||||
# LCOV
|
||||
# Build lcov from a fork:
|
||||
# It includes several bug fixes on top of the v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master)
|
||||
# And patches from us:
|
||||
# - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz)
|
||||
RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \
|
||||
&& wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \
|
||||
&& echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992 lcov.tar.gz" | sha256sum --check \
|
||||
&& mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \
|
||||
&& cd lcov \
|
||||
&& make install \
|
||||
&& rm -rf ../lcov.tar.gz
|
||||
|
||||
# Switch to nonroot user
|
||||
USER nonroot:nonroot
|
||||
WORKDIR /home/nonroot
|
||||
|
||||
# Python
|
||||
ENV PYTHON_VERSION=3.9.2 \
|
||||
PYENV_ROOT=/home/nonroot/.pyenv \
|
||||
PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
|
||||
RUN set -e \
|
||||
&& cd $HOME \
|
||||
&& curl -sSO https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer \
|
||||
&& chmod +x pyenv-installer \
|
||||
&& ./pyenv-installer \
|
||||
&& export PYENV_ROOT=/home/nonroot/.pyenv \
|
||||
&& export PATH="$PYENV_ROOT/bin:$PATH" \
|
||||
&& export PATH="$PYENV_ROOT/shims:$PATH" \
|
||||
&& pyenv install ${PYTHON_VERSION} \
|
||||
&& pyenv global ${PYTHON_VERSION} \
|
||||
&& python --version \
|
||||
&& pip install --upgrade pip \
|
||||
&& pip --version \
|
||||
&& pip install pipenv wheel poetry
|
||||
|
||||
# Switch to nonroot user (again)
|
||||
USER nonroot:nonroot
|
||||
WORKDIR /home/nonroot
|
||||
|
||||
# Rust
|
||||
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
|
||||
ENV RUSTC_VERSION=1.74.0
|
||||
ENV RUSTUP_HOME="/home/nonroot/.rustup"
|
||||
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
|
||||
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
|
||||
chmod +x rustup-init && \
|
||||
./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \
|
||||
rm rustup-init && \
|
||||
export PATH="$HOME/.cargo/bin:$PATH" && \
|
||||
. "$HOME/.cargo/env" && \
|
||||
cargo --version && rustup --version && \
|
||||
rustup component add llvm-tools-preview rustfmt clippy && \
|
||||
cargo install --git https://github.com/paritytech/cachepot && \
|
||||
cargo install rustfilt && \
|
||||
cargo install cargo-hakari && \
|
||||
cargo install cargo-deny && \
|
||||
cargo install cargo-hack && \
|
||||
cargo install cargo-nextest && \
|
||||
rm -rf /home/nonroot/.cargo/registry && \
|
||||
rm -rf /home/nonroot/.cargo/git
|
||||
ENV RUSTC_WRAPPER=cachepot
|
||||
|
||||
# Show versions
|
||||
RUN whoami \
|
||||
&& python --version \
|
||||
&& pip --version \
|
||||
&& cargo --version --verbose \
|
||||
&& rustup --version --verbose \
|
||||
&& rustc --version --verbose \
|
||||
&& clang --version
|
||||
@@ -1,6 +1,6 @@
ARG PG_VERSION
ARG REPOSITORY=neondatabase
ARG IMAGE=build-tools
ARG IMAGE=rust
ARG TAG=pinned
ARG BUILD_TAG
@@ -48,29 +48,7 @@ RUN cd postgres && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control && \
|
||||
# We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser.
|
||||
# In vanilla postgres this function is limited to Postgres role superuser.
|
||||
# In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases.
|
||||
# We could add the additional grant statements to the postgres repository but it would be hard to maintain,
|
||||
# whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
|
||||
# so we do it here.
|
||||
old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
|
||||
# the first loop is for pg_stat_statement extension version <= 1.6
|
||||
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
|
||||
filename=$(basename "$file"); \
|
||||
if echo "$old_list" | grep -q -F "$filename"; then \
|
||||
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
|
||||
fi; \
|
||||
done; \
|
||||
# the second loop is for pg_stat_statement extension versions >= 1.7,
|
||||
# where pg_stat_statement_reset() got 3 additional arguments
|
||||
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
|
||||
filename=$(basename "$file"); \
|
||||
if ! echo "$old_list" | grep -q -F "$filename"; then \
|
||||
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
|
||||
fi; \
|
||||
done
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -591,23 +569,6 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-semver-pg-build"
|
||||
# compile pg_semver extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-semver-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
|
||||
echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_semver-src && cd pg_semver-src && tar xvzf ../pg_semver.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-embedding-pg-build"
|
||||
@@ -807,7 +768,6 @@ COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
|
||||
COPY pgxn/ pgxn/
|
||||
|
||||
@@ -1,7 +1,7 @@
# First transient image to build compute_tools binaries
# NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
ARG REPOSITORY=neondatabase
ARG IMAGE=build-tools
ARG IMAGE=rust
ARG TAG=pinned
ARG BUILD_TAG
@@ -29,14 +29,13 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
```bash
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
libcurl4-openssl-dev openssl python3-poetry lsof libicu-dev
libcurl4-openssl-dev openssl python-poetry lsof libicu-dev
```
* On Fedora, these packages are needed:
```bash
dnf install flex bison readline-devel zlib-devel openssl-devel \
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
protobuf-devel libcurl-devel openssl poetry lsof libicu-devel libpq-devel python3-devel \
libffi-devel
protobuf-devel libcurl-devel openssl poetry lsof libicu-devel
```
* On Arch based systems, these packages are needed:
```bash
@@ -13,7 +13,6 @@ clap.workspace = true
flate2.workspace = true
futures.workspace = true
hyper = { workspace = true, features = ["full"] }
nix.workspace = true
notify.workspace = true
num_cpus.workspace = true
opentelemetry.workspace = true
@@ -21,7 +20,6 @@ postgres.workspace = true
regex.workspace = true
serde.workspace = true
serde_json.workspace = true
signal-hook.workspace = true
tar.workspace = true
reqwest = { workspace = true, features = ["json"] }
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
@@ -39,6 +37,5 @@ workspace_hack.workspace = true
toml_edit.workspace = true
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
zstd = "0.13"
zstd = "0.12.4"
bytes = "1.0"
rust-ini = "0.20.0"

@@ -31,31 +31,25 @@
//! -C 'postgresql://cloud_admin@localhost/postgres' \
//! -S /var/db/postgres/specs/current.json \
//! -b /usr/local/bin/postgres \
//! -r http://pg-ext-s3-gateway \
//! --pgbouncer-connstr 'host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable'
//! --pgbouncer-ini-path /etc/pgbouncer.ini \
//! -r http://pg-ext-s3-gateway
//! ```
//!

use std::collections::HashMap;
use std::fs::File;
use std::path::Path;
use std::process::exit;
use std::sync::atomic::Ordering;
use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
use std::{thread, time::Duration};

use anyhow::{Context, Result};
use chrono::Utc;
use clap::Arg;
use nix::sys::signal::{kill, Signal};
use signal_hook::consts::{SIGQUIT, SIGTERM};
use signal_hook::{consts::SIGINT, iterator::Signals};
use tracing::{error, info};
use url::Url;

use compute_api::responses::ComputeStatus;

use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID};
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
use compute_tools::configurator::launch_configurator;
use compute_tools::extension_server::get_pg_version;
use compute_tools::http::api::launch_http_server;
@@ -71,13 +65,6 @@ const BUILD_TAG_DEFAULT: &str = "latest";
fn main() -> Result<()> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;

    let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
    thread::spawn(move || {
        for sig in signals.forever() {
            handle_exit_signal(sig);
        }
    });

    let build_tag = option_env!("BUILD_TAG")
        .unwrap_or(BUILD_TAG_DEFAULT)
        .to_string();
@@ -112,9 +99,6 @@ fn main() -> Result<()> {
|
||||
let spec_json = matches.get_one::<String>("spec");
|
||||
let spec_path = matches.get_one::<String>("spec-path");
|
||||
|
||||
let pgbouncer_connstr = matches.get_one::<String>("pgbouncer-connstr");
|
||||
let pgbouncer_ini_path = matches.get_one::<String>("pgbouncer-ini-path");
|
||||
|
||||
// Extract OpenTelemetry context for the startup actions from the
|
||||
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
|
||||
// tracing context.
|
||||
@@ -225,8 +209,6 @@ fn main() -> Result<()> {
|
||||
ext_remote_storage: ext_remote_storage.map(|s| s.to_string()),
|
||||
ext_download_progress: RwLock::new(HashMap::new()),
|
||||
build_tag,
|
||||
pgbouncer_connstr: pgbouncer_connstr.map(|s| s.to_string()),
|
||||
pgbouncer_ini_path: pgbouncer_ini_path.map(|s| s.to_string()),
|
||||
};
|
||||
let compute = Arc::new(compute_node);
|
||||
|
||||
@@ -357,7 +339,6 @@ fn main() -> Result<()> {
|
||||
let ecode = pg
|
||||
.wait()
|
||||
.expect("failed to start waiting on Postgres process");
|
||||
PG_PID.store(0, Ordering::SeqCst);
|
||||
info!("Postgres exited with code {}, shutting down", ecode);
|
||||
exit_code = ecode.code()
|
||||
}
|
||||
@@ -512,41 +493,6 @@ fn cli() -> clap::Command {
|
||||
)
|
||||
.value_name("FILECACHE_CONNSTR"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pgbouncer-connstr")
|
||||
.long("pgbouncer-connstr")
|
||||
.default_value(
|
||||
"host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable",
|
||||
)
|
||||
.value_name("PGBOUNCER_CONNSTR"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pgbouncer-ini-path")
|
||||
.long("pgbouncer-ini-path")
|
||||
// Note: this doesn't match current path for pgbouncer.ini.
|
||||
// Until we fix it, we need to pass the path explicitly
|
||||
// or this will be effectively no-op.
|
||||
.default_value("/etc/pgbouncer.ini")
|
||||
.value_name("PGBOUNCER_INI_PATH"),
|
||||
)
|
||||
}
|
||||
|
||||
/// When compute_ctl is killed, send also termination signal to sync-safekeepers
/// to prevent leakage. TODO: it is better to convert compute_ctl to async and
/// wait for termination which would be easy then.
fn handle_exit_signal(sig: i32) {
    info!("received {sig} termination signal");
    let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
    if ss_pid != 0 {
        let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
        kill(ss_pid, Signal::SIGTERM).ok();
    }
    let pg_pid = PG_PID.load(Ordering::SeqCst);
    if pg_pid != 0 {
        let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
        kill(pg_pid, Signal::SIGTERM).ok();
    }
    exit(1);
}
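
The handler above only works because the spawning code stores the child PIDs in the `PG_PID` and `SYNC_SAFEKEEPERS_PID` atomics shown elsewhere in this diff. A minimal standalone sketch of that store-then-forward pattern (the `CHILD_PID` static and `forward_termination` name are illustrative, not part of compute_tools):

```rust
use std::sync::atomic::{AtomicU32, Ordering};

use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;

// Illustrative stand-in for PG_PID / SYNC_SAFEKEEPERS_PID above.
static CHILD_PID: AtomicU32 = AtomicU32::new(0);

/// Forward SIGTERM to the recorded child, if any. A stored PID of 0
/// means "no child is running", so nothing is sent in that case.
fn forward_termination(sig: i32) {
    eprintln!("received signal {sig}, forwarding SIGTERM to child");
    let pid = CHILD_PID.load(Ordering::SeqCst);
    if pid != 0 {
        // Ignore errors: the child may have exited already.
        kill(Pid::from_raw(pid as i32), Signal::SIGTERM).ok();
    }
}

fn main() {
    // After spawning a child process, its id would be published like:
    // CHILD_PID.store(child.id(), Ordering::SeqCst);
    forward_termination(15); // 15 == SIGTERM
}
```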
#[test]
|
||||
|
||||
@@ -6,10 +6,7 @@ use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
use std::process::{Command, Stdio};
|
||||
use std::str::FromStr;
|
||||
use std::sync::atomic::AtomicU32;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::{Condvar, Mutex, RwLock};
|
||||
use std::thread;
|
||||
use std::time::Instant;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
@@ -36,9 +33,6 @@ use crate::spec::*;
|
||||
use crate::sync_sk::{check_if_synced, ping_safekeeper};
|
||||
use crate::{config, extension_server};
|
||||
|
||||
pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0);
|
||||
pub static PG_PID: AtomicU32 = AtomicU32::new(0);
|
||||
|
||||
/// Compute node info shared across several `compute_ctl` threads.
|
||||
pub struct ComputeNode {
|
||||
// Url type maintains proper escaping
|
||||
@@ -70,10 +64,6 @@ pub struct ComputeNode {
|
||||
// key: ext_archive_name, value: started download time, download_completed?
|
||||
pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
|
||||
pub build_tag: String,
|
||||
// connection string to pgbouncer to change settings
|
||||
pub pgbouncer_connstr: Option<String>,
|
||||
// path to pgbouncer.ini to change settings
|
||||
pub pgbouncer_ini_path: Option<String>,
|
||||
}
|
||||
|
||||
// store some metrics about download size that might impact startup time
|
||||
@@ -506,7 +496,6 @@ impl ComputeNode {
|
||||
.stdout(Stdio::piped())
|
||||
.spawn()
|
||||
.expect("postgres --sync-safekeepers failed to start");
|
||||
SYNC_SAFEKEEPERS_PID.store(sync_handle.id(), Ordering::SeqCst);
|
||||
|
||||
// `postgres --sync-safekeepers` will print all log output to stderr and
|
||||
// final LSN to stdout. So we pipe only stdout, while stderr will be automatically
|
||||
@@ -514,7 +503,6 @@ impl ComputeNode {
|
||||
let sync_output = sync_handle
|
||||
.wait_with_output()
|
||||
.expect("postgres --sync-safekeepers failed");
|
||||
SYNC_SAFEKEEPERS_PID.store(0, Ordering::SeqCst);
|
||||
|
||||
if !sync_output.status.success() {
|
||||
anyhow::bail!(
|
||||
@@ -669,7 +657,6 @@ impl ComputeNode {
|
||||
})
|
||||
.spawn()
|
||||
.expect("cannot start postgres process");
|
||||
PG_PID.store(pg.id(), Ordering::SeqCst);
|
||||
|
||||
wait_for_postgres(&mut pg, pgdata_path)?;
|
||||
|
||||
@@ -750,31 +737,6 @@ impl ComputeNode {
|
||||
pub fn reconfigure(&self) -> Result<()> {
|
||||
let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;
|
||||
|
||||
if let Some(connstr) = &self.pgbouncer_connstr {
|
||||
info!("tuning pgbouncer with connstr: {:?}", connstr);
|
||||
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to create rt");
|
||||
|
||||
// Spawn a thread to do the tuning,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
let pgbouncer_settings = spec.pgbouncer_settings.clone();
|
||||
let connstr_clone = connstr.clone();
|
||||
let pgbouncer_ini_path = self.pgbouncer_ini_path.clone();
|
||||
let _handle = thread::spawn(move || {
|
||||
let res = rt.block_on(tune_pgbouncer(
|
||||
pgbouncer_settings,
|
||||
&connstr_clone,
|
||||
pgbouncer_ini_path,
|
||||
));
|
||||
if let Err(err) = res {
|
||||
error!("error while tuning pgbouncer: {err:?}");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Write new config
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
let postgresql_conf_path = pgdata_path.join("postgresql.conf");
|
||||
@@ -829,32 +791,6 @@ impl ComputeNode {
|
||||
pspec.timeline_id,
|
||||
);
|
||||
|
||||
// tune pgbouncer
|
||||
if let Some(connstr) = &self.pgbouncer_connstr {
|
||||
info!("tuning pgbouncer with connstr: {:?}", connstr);
|
||||
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to create rt");
|
||||
|
||||
// Spawn a thread to do the tuning,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
let pgbouncer_settings = pspec.spec.pgbouncer_settings.clone();
|
||||
let connstr_clone = connstr.clone();
|
||||
let pgbouncer_ini_path = self.pgbouncer_ini_path.clone();
|
||||
let _handle = thread::spawn(move || {
|
||||
let res = rt.block_on(tune_pgbouncer(
|
||||
pgbouncer_settings,
|
||||
&connstr_clone,
|
||||
pgbouncer_ini_path,
|
||||
));
|
||||
if let Err(err) = res {
|
||||
error!("error while tuning pgbouncer: {err:?}");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
info!(
|
||||
"start_compute spec.remote_extensions {:?}",
|
||||
pspec.spec.remote_extensions
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::{thread, time::Duration};
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use postgres::{Client, NoTls};
|
||||
use tracing::{debug, info, warn};
|
||||
use tracing::{debug, info};
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
|
||||
@@ -84,29 +84,6 @@ fn watch_compute_activity(compute: &ComputeNode) {
|
||||
}
|
||||
}
|
||||
|
||||
// If there are existing (logical) walsenders, do not suspend.
|
||||
//
|
||||
// walproposer doesn't currently show up in pg_stat_replication,
|
||||
// but protect if it will be
|
||||
let ws_count_query = "select count(*) from pg_stat_replication where application_name != 'walproposer';";
|
||||
match cli.query_one(ws_count_query, &[]) {
|
||||
Ok(r) => match r.try_get::<&str, i64>("count") {
|
||||
Ok(num_ws) => {
|
||||
if num_ws > 0 {
|
||||
last_active = Some(Utc::now());
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("failed to parse ws count: {:?}", e);
|
||||
continue;
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
warn!("failed to get list of walsenders: {:?}", e);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Update the last activity in the shared state if we got a more recent one.
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
// NB: `Some(<DateTime>)` is always greater than `None`.
|
||||
|
||||
@@ -9,11 +9,9 @@ use std::process::Child;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use ini::Ini;
|
||||
use notify::{RecursiveMode, Watcher};
|
||||
use postgres::{Client, Transaction};
|
||||
use tokio_postgres::NoTls;
|
||||
use tracing::{debug, error, info, instrument};
|
||||
use tracing::{debug, instrument};
|
||||
|
||||
use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
|
||||
|
||||
@@ -361,68 +359,3 @@ pub fn create_pgdata(pgdata: &str) -> Result<()> {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Update pgbouncer.ini with provided options
pub fn update_pgbouncer_ini(
    pgbouncer_config: HashMap<String, String>,
    pgbouncer_ini_path: &str,
) -> Result<()> {
    let mut conf = Ini::load_from_file(pgbouncer_ini_path)?;
    let section = conf.section_mut(Some("pgbouncer")).unwrap();

    for (option_name, value) in pgbouncer_config.iter() {
        section.insert(option_name, value);
    }

    conf.write_to_file(pgbouncer_ini_path)?;
    Ok(())
}
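
As a usage sketch for the helper above (the path and setting below are placeholders, not Neon's actual pgbouncer configuration):

```rust
use std::collections::HashMap;

// Assumes `update_pgbouncer_ini` from above is in scope.
fn bump_pool_size() -> anyhow::Result<()> {
    let mut settings = HashMap::new();
    settings.insert("default_pool_size".to_string(), "64".to_string());
    // Rewrites only the [pgbouncer] section, keeping other keys intact.
    update_pgbouncer_ini(settings, "/tmp/pgbouncer.ini")?;
    Ok(())
}
```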
/// Tune pgbouncer.
/// 1. Apply new config using pgbouncer admin console
/// 2. Add new values to pgbouncer.ini to preserve them after restart
pub async fn tune_pgbouncer(
    pgbouncer_settings: Option<HashMap<String, String>>,
    pgbouncer_connstr: &str,
    pgbouncer_ini_path: Option<String>,
) -> Result<()> {
    if let Some(pgbouncer_config) = pgbouncer_settings {
        // Apply new config
        let connect_result = tokio_postgres::connect(pgbouncer_connstr, NoTls).await;
        let (client, connection) = connect_result.unwrap();
        tokio::spawn(async move {
            if let Err(e) = connection.await {
                eprintln!("connection error: {}", e);
            }
        });

        for (option_name, value) in pgbouncer_config.iter() {
            info!(
                "Applying pgbouncer setting change: {} = {}",
                option_name, value
            );
            let query = format!("SET {} = {}", option_name, value);

            let result = client.simple_query(&query).await;

            info!("Applying pgbouncer setting change: {}", query);
            info!("pgbouncer setting change result: {:?}", result);

            if let Err(err) = result {
                // Don't fail on error, just print it into log
                error!(
                    "Failed to apply pgbouncer setting change: {}, {}",
                    query, err
                );
            };
        }

        // save values to pgbouncer.ini
        // so that they are preserved after pgbouncer restart
        if let Some(pgbouncer_ini_path) = pgbouncer_ini_path {
            update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?;
        }
    }

    Ok(())
}
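
Because `tune_pgbouncer` is async while its callers in `compute.rs` are synchronous, the call sites shown earlier in this diff spawn a helper thread that drives a current-thread Tokio runtime. A condensed sketch of that calling convention (connection string and settings are placeholders):

```rust
use std::collections::HashMap;
use std::thread;

// Sketch only: mirrors how `reconfigure`/`start_compute` above drive `tune_pgbouncer`.
fn tune_in_background(settings: Option<HashMap<String, String>>) {
    let connstr =
        "host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable".to_string();
    thread::spawn(move || {
        let rt = tokio::runtime::Builder::new_current_thread()
            .enable_all()
            .build()
            .expect("failed to create runtime");
        // Errors are logged and ignored so Postgres startup is never blocked on pgbouncer.
        if let Err(err) = rt.block_on(tune_pgbouncer(settings, &connstr, None)) {
            eprintln!("error while tuning pgbouncer: {err:?}");
        }
    });
}
```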
@@ -298,7 +298,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
// safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
|
||||
// from neon_superuser.
|
||||
let mut query: String = format!(
|
||||
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
|
||||
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB IN ROLE neon_superuser",
|
||||
name.pg_quote()
|
||||
);
|
||||
info!("role create query: '{}'", &query);
|
||||
@@ -370,49 +370,33 @@ pub fn handle_role_deletions(spec: &ComputeSpec, connstr: &str, client: &mut Cli
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn reassign_owned_objects_in_one_db(
|
||||
conf: Config,
|
||||
role_name: &PgIdent,
|
||||
db_owner: &PgIdent,
|
||||
) -> Result<()> {
|
||||
let mut client = conf.connect(NoTls)?;
|
||||
|
||||
// This will reassign all dependent objects to the db owner
|
||||
let reassign_query = format!(
|
||||
"REASSIGN OWNED BY {} TO {}",
|
||||
role_name.pg_quote(),
|
||||
db_owner.pg_quote()
|
||||
);
|
||||
info!(
|
||||
"reassigning objects owned by '{}' in db '{}' to '{}'",
|
||||
role_name,
|
||||
conf.get_dbname().unwrap_or(""),
|
||||
db_owner
|
||||
);
|
||||
client.simple_query(&reassign_query)?;
|
||||
|
||||
// This now will only drop privileges of the role
|
||||
let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote());
|
||||
client.simple_query(&drop_query)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Reassign all owned objects in all databases to the owner of the database.
|
||||
fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent) -> Result<()> {
|
||||
for db in &spec.cluster.databases {
|
||||
if db.owner != *role_name {
|
||||
let mut conf = Config::from_str(connstr)?;
|
||||
conf.dbname(&db.name);
|
||||
reassign_owned_objects_in_one_db(conf, role_name, &db.owner)?;
|
||||
|
||||
let mut client = conf.connect(NoTls)?;
|
||||
|
||||
// This will reassign all dependent objects to the db owner
|
||||
let reassign_query = format!(
|
||||
"REASSIGN OWNED BY {} TO {}",
|
||||
role_name.pg_quote(),
|
||||
db.owner.pg_quote()
|
||||
);
|
||||
info!(
|
||||
"reassigning objects owned by '{}' in db '{}' to '{}'",
|
||||
role_name, &db.name, &db.owner
|
||||
);
|
||||
client.simple_query(&reassign_query)?;
|
||||
|
||||
// This now will only drop privileges of the role
|
||||
let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote());
|
||||
client.simple_query(&drop_query)?;
|
||||
}
|
||||
}
|
||||
|
||||
// Also handle case when there are no databases in the spec.
|
||||
// In this case we need to reassign objects in the default database.
|
||||
let conf = Config::from_str(connstr)?;
|
||||
let db_owner = PgIdent::from_str("cloud_admin")?;
|
||||
reassign_owned_objects_in_one_db(conf, role_name, &db_owner)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -6,11 +6,9 @@ license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
async-trait.workspace = true
|
||||
camino.workspace = true
|
||||
clap.workspace = true
|
||||
comfy-table.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
nix.workspace = true
|
||||
once_cell.workspace = true
|
||||
@@ -26,11 +24,10 @@ tar.workspace = true
|
||||
thiserror.workspace = true
|
||||
toml.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-postgres.workspace = true
|
||||
tokio-util.workspace = true
|
||||
url.workspace = true
|
||||
# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
|
||||
# instead, so that recompile times are better.
|
||||
pageserver_api.workspace = true
|
||||
pageserver_client.workspace = true
|
||||
postgres_backend.workspace = true
|
||||
safekeeper_api.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
|
||||
@@ -9,7 +9,7 @@ pub struct AttachmentService {
|
||||
env: LocalEnv,
|
||||
listen: String,
|
||||
path: PathBuf,
|
||||
client: reqwest::Client,
|
||||
client: reqwest::blocking::Client,
|
||||
}
|
||||
|
||||
const COMMAND: &str = "attachment_service";
|
||||
@@ -53,7 +53,7 @@ impl AttachmentService {
|
||||
env: env.clone(),
|
||||
path,
|
||||
listen,
|
||||
client: reqwest::ClientBuilder::new()
|
||||
client: reqwest::blocking::ClientBuilder::new()
|
||||
.build()
|
||||
.expect("Failed to construct http client"),
|
||||
}
|
||||
@@ -64,7 +64,7 @@ impl AttachmentService {
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
pub async fn start(&self) -> anyhow::Result<Child> {
|
||||
pub fn start(&self) -> anyhow::Result<Child> {
|
||||
let path_str = self.path.to_string_lossy();
|
||||
|
||||
background_process::start_process(
|
||||
@@ -73,11 +73,10 @@ impl AttachmentService {
|
||||
&self.env.attachment_service_bin(),
|
||||
["-l", &self.listen, "-p", &path_str],
|
||||
[],
|
||||
background_process::InitialPidFile::Create(self.pid_file()),
|
||||
background_process::InitialPidFile::Create(&self.pid_file()),
|
||||
// TODO: a real status check
|
||||
|| async move { anyhow::Ok(true) },
|
||||
|| Ok(true),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||
@@ -85,7 +84,7 @@ impl AttachmentService {
|
||||
}
|
||||
|
||||
/// Call into the attach_hook API, for use before handing out attachments to pageservers
|
||||
pub async fn attach_hook(
|
||||
pub fn attach_hook(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
pageserver_id: NodeId,
|
||||
@@ -105,16 +104,16 @@ impl AttachmentService {
|
||||
node_id: Some(pageserver_id),
|
||||
};
|
||||
|
||||
let response = self.client.post(url).json(&request).send().await?;
|
||||
let response = self.client.post(url).json(&request).send()?;
|
||||
if response.status() != StatusCode::OK {
|
||||
return Err(anyhow!("Unexpected status {}", response.status()));
|
||||
}
|
||||
|
||||
let response = response.json::<AttachHookResponse>().await?;
|
||||
let response = response.json::<AttachHookResponse>()?;
|
||||
Ok(response.gen)
|
||||
}
|
||||
|
||||
pub async fn inspect(&self, tenant_id: TenantId) -> anyhow::Result<Option<(u32, NodeId)>> {
|
||||
pub fn inspect(&self, tenant_id: TenantId) -> anyhow::Result<Option<(u32, NodeId)>> {
|
||||
use hyper::StatusCode;
|
||||
|
||||
let url = self
|
||||
@@ -127,12 +126,12 @@ impl AttachmentService {
|
||||
|
||||
let request = InspectRequest { tenant_id };
|
||||
|
||||
let response = self.client.post(url).json(&request).send().await?;
|
||||
let response = self.client.post(url).json(&request).send()?;
|
||||
if response.status() != StatusCode::OK {
|
||||
return Err(anyhow!("Unexpected status {}", response.status()));
|
||||
}
|
||||
|
||||
let response = response.json::<InspectResponse>().await?;
|
||||
let response = response.json::<InspectResponse>()?;
|
||||
Ok(response.attachment)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,15 +44,15 @@ const NOTICE_AFTER_RETRIES: u64 = 50;

/// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates
/// it itself.
pub enum InitialPidFile {
pub enum InitialPidFile<'t> {
    /// Create a pidfile, to allow future CLI invocations to manipulate the process.
    Create(Utf8PathBuf),
    Create(&'t Utf8Path),
    /// The process will create the pidfile itself, need to wait for that event.
    Expect(Utf8PathBuf),
    Expect(&'t Utf8Path),
}
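
The two variants encode who owns the pidfile: with `Create` the launcher writes it before waiting, while with `Expect` the daemon is trusted to write its own and the launcher only polls for it. In the borrowed-path form shown above, construction might look like this (paths are illustrative):

```rust
use camino::Utf8Path;

// Illustrative only; mirrors how the broker and pageserver call sites build these values.
fn pid_file_modes() -> (InitialPidFile<'static>, InitialPidFile<'static>) {
    // The launcher creates and owns this pidfile itself.
    let created = InitialPidFile::Create(Utf8Path::new("/tmp/storage_broker.pid"));
    // The daemon writes this pidfile; the launcher just waits for it to appear.
    let expected = InitialPidFile::Expect(Utf8Path::new("/tmp/pageserver.pid"));
    (created, expected)
}
```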
/// Start a background child process using the parameters given.
|
||||
pub async fn start_process<F, Fut, AI, A, EI>(
|
||||
pub fn start_process<F, AI, A, EI>(
|
||||
process_name: &str,
|
||||
datadir: &Path,
|
||||
command: &Path,
|
||||
@@ -62,8 +62,7 @@ pub async fn start_process<F, Fut, AI, A, EI>(
|
||||
process_status_check: F,
|
||||
) -> anyhow::Result<Child>
|
||||
where
|
||||
F: Fn() -> Fut,
|
||||
Fut: std::future::Future<Output = anyhow::Result<bool>>,
|
||||
F: Fn() -> anyhow::Result<bool>,
|
||||
AI: IntoIterator<Item = A>,
|
||||
A: AsRef<OsStr>,
|
||||
// Not generic AsRef<OsStr>, otherwise empty `envs` prevents type inference
|
||||
@@ -90,7 +89,7 @@ where
|
||||
let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command));
|
||||
filled_cmd.envs(envs);
|
||||
|
||||
let pid_file_to_check = match &initial_pid_file {
|
||||
let pid_file_to_check = match initial_pid_file {
|
||||
InitialPidFile::Create(path) => {
|
||||
pre_exec_create_pidfile(filled_cmd, path);
|
||||
path
|
||||
@@ -108,7 +107,7 @@ where
|
||||
);
|
||||
|
||||
for retries in 0..RETRIES {
|
||||
match process_started(pid, pid_file_to_check, &process_status_check).await {
|
||||
match process_started(pid, Some(pid_file_to_check), &process_status_check) {
|
||||
Ok(true) => {
|
||||
println!("\n{process_name} started, pid: {pid}");
|
||||
return Ok(spawned_process);
|
||||
@@ -317,20 +316,22 @@ where
|
||||
cmd
|
||||
}
|
||||
|
||||
async fn process_started<F, Fut>(
|
||||
fn process_started<F>(
|
||||
pid: Pid,
|
||||
pid_file_to_check: &Utf8Path,
|
||||
pid_file_to_check: Option<&Utf8Path>,
|
||||
status_check: &F,
|
||||
) -> anyhow::Result<bool>
|
||||
where
|
||||
F: Fn() -> Fut,
|
||||
Fut: std::future::Future<Output = anyhow::Result<bool>>,
|
||||
F: Fn() -> anyhow::Result<bool>,
|
||||
{
|
||||
match status_check().await {
|
||||
Ok(true) => match pid_file::read(pid_file_to_check)? {
|
||||
PidFileRead::NotExist => Ok(false),
|
||||
PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid),
|
||||
PidFileRead::NotHeldByAnyProcess(_) => Ok(false),
|
||||
match status_check() {
|
||||
Ok(true) => match pid_file_to_check {
|
||||
Some(pid_file_path) => match pid_file::read(pid_file_path)? {
|
||||
PidFileRead::NotExist => Ok(false),
|
||||
PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid),
|
||||
PidFileRead::NotHeldByAnyProcess(_) => Ok(false),
|
||||
},
|
||||
None => Ok(true),
|
||||
},
|
||||
Ok(false) => Ok(false),
|
||||
Err(e) => anyhow::bail!("process failed to start: {e}"),
|
||||
|
||||
@@ -120,20 +120,15 @@ fn main() -> Result<()> {
|
||||
let mut env = LocalEnv::load_config().context("Error loading config")?;
|
||||
let original_env = env.clone();
|
||||
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let subcommand_result = match sub_name {
|
||||
"tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
|
||||
"timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
|
||||
"start" => rt.block_on(handle_start_all(sub_args, &env)),
|
||||
"tenant" => handle_tenant(sub_args, &mut env),
|
||||
"timeline" => handle_timeline(sub_args, &mut env),
|
||||
"start" => handle_start_all(sub_args, &env),
|
||||
"stop" => handle_stop_all(sub_args, &env),
|
||||
"pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
|
||||
"attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)),
|
||||
"safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
|
||||
"endpoint" => rt.block_on(handle_endpoint(sub_args, &env)),
|
||||
"pageserver" => handle_pageserver(sub_args, &env),
|
||||
"attachment_service" => handle_attachment_service(sub_args, &env),
|
||||
"safekeeper" => handle_safekeeper(sub_args, &env),
|
||||
"endpoint" => handle_endpoint(sub_args, &env),
|
||||
"mappings" => handle_mappings(sub_args, &mut env),
|
||||
"pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
|
||||
_ => bail!("unexpected subcommand {sub_name}"),
|
||||
@@ -274,13 +269,12 @@ fn print_timeline(
|
||||
|
||||
/// Returns a map of timeline IDs to timeline_id@lsn strings.
|
||||
/// Connects to the pageserver to query this information.
|
||||
async fn get_timeline_infos(
|
||||
fn get_timeline_infos(
|
||||
env: &local_env::LocalEnv,
|
||||
tenant_id: &TenantId,
|
||||
) -> Result<HashMap<TimelineId, TimelineInfo>> {
|
||||
Ok(get_default_pageserver(env)
|
||||
.timeline_list(tenant_id)
|
||||
.await?
|
||||
.timeline_list(tenant_id)?
|
||||
.into_iter()
|
||||
.map(|timeline_info| (timeline_info.timeline_id, timeline_info))
|
||||
.collect())
|
||||
@@ -379,14 +373,11 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
|
||||
.collect()
|
||||
}
|
||||
|
||||
async fn handle_tenant(
|
||||
tenant_match: &ArgMatches,
|
||||
env: &mut local_env::LocalEnv,
|
||||
) -> anyhow::Result<()> {
|
||||
fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
let pageserver = get_default_pageserver(env);
|
||||
match tenant_match.subcommand() {
|
||||
Some(("list", _)) => {
|
||||
for t in pageserver.tenant_list().await? {
|
||||
for t in pageserver.tenant_list()? {
|
||||
println!("{} {:?}", t.id, t.state);
|
||||
}
|
||||
}
|
||||
@@ -403,16 +394,12 @@ async fn handle_tenant(
|
||||
// We must register the tenant with the attachment service, so
|
||||
// that when the pageserver restarts, it will be re-attached.
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
attachment_service
|
||||
.attach_hook(tenant_id, pageserver.conf.id)
|
||||
.await?
|
||||
attachment_service.attach_hook(tenant_id, pageserver.conf.id)?
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
pageserver
|
||||
.tenant_create(tenant_id, generation, tenant_conf)
|
||||
.await?;
|
||||
pageserver.tenant_create(tenant_id, generation, tenant_conf)?;
|
||||
println!("tenant {tenant_id} successfully created on the pageserver");
|
||||
|
||||
// Create an initial timeline for the new tenant
|
||||
@@ -422,16 +409,14 @@ async fn handle_tenant(
|
||||
.copied()
|
||||
.context("Failed to parse postgres version from the argument string")?;
|
||||
|
||||
let timeline_info = pageserver
|
||||
.timeline_create(
|
||||
tenant_id,
|
||||
new_timeline_id,
|
||||
None,
|
||||
None,
|
||||
Some(pg_version),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
let timeline_info = pageserver.timeline_create(
|
||||
tenant_id,
|
||||
new_timeline_id,
|
||||
None,
|
||||
None,
|
||||
Some(pg_version),
|
||||
None,
|
||||
)?;
|
||||
let new_timeline_id = timeline_info.timeline_id;
|
||||
let last_record_lsn = timeline_info.last_record_lsn;
|
||||
|
||||
@@ -465,7 +450,6 @@ async fn handle_tenant(
|
||||
|
||||
pageserver
|
||||
.tenant_config(tenant_id, tenant_conf)
|
||||
.await
|
||||
.with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
|
||||
println!("tenant {tenant_id} successfully configured on the pageserver");
|
||||
}
|
||||
@@ -474,7 +458,7 @@ async fn handle_tenant(
|
||||
let new_pageserver = get_pageserver(env, matches)?;
|
||||
let new_pageserver_id = new_pageserver.conf.id;
|
||||
|
||||
migrate_tenant(env, tenant_id, new_pageserver).await?;
|
||||
migrate_tenant(env, tenant_id, new_pageserver)?;
|
||||
println!("tenant {tenant_id} migrated to {}", new_pageserver_id);
|
||||
}
|
||||
|
||||
@@ -484,13 +468,13 @@ async fn handle_tenant(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
|
||||
fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
|
||||
let pageserver = get_default_pageserver(env);
|
||||
|
||||
match timeline_match.subcommand() {
|
||||
Some(("list", list_match)) => {
|
||||
let tenant_id = get_tenant_id(list_match, env)?;
|
||||
let timelines = pageserver.timeline_list(&tenant_id).await?;
|
||||
let timelines = pageserver.timeline_list(&tenant_id)?;
|
||||
print_timelines_tree(timelines, env.timeline_name_mappings())?;
|
||||
}
|
||||
Some(("create", create_match)) => {
|
||||
@@ -506,16 +490,14 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
|
||||
let new_timeline_id_opt = parse_timeline_id(create_match)?;
|
||||
|
||||
let timeline_info = pageserver
|
||||
.timeline_create(
|
||||
tenant_id,
|
||||
new_timeline_id_opt,
|
||||
None,
|
||||
None,
|
||||
Some(pg_version),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
let timeline_info = pageserver.timeline_create(
|
||||
tenant_id,
|
||||
new_timeline_id_opt,
|
||||
None,
|
||||
None,
|
||||
Some(pg_version),
|
||||
None,
|
||||
)?;
|
||||
let new_timeline_id = timeline_info.timeline_id;
|
||||
|
||||
let last_record_lsn = timeline_info.last_record_lsn;
|
||||
@@ -560,9 +542,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
|
||||
let mut cplane = ComputeControlPlane::load(env.clone())?;
|
||||
println!("Importing timeline into pageserver ...");
|
||||
pageserver
|
||||
.timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)
|
||||
.await?;
|
||||
pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)?;
|
||||
env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?;
|
||||
|
||||
println!("Creating endpoint for imported timeline ...");
|
||||
@@ -598,16 +578,14 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
.map(|lsn_str| Lsn::from_str(lsn_str))
|
||||
.transpose()
|
||||
.context("Failed to parse ancestor start Lsn from the request")?;
|
||||
let timeline_info = pageserver
|
||||
.timeline_create(
|
||||
tenant_id,
|
||||
None,
|
||||
start_lsn,
|
||||
Some(ancestor_timeline_id),
|
||||
None,
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
let timeline_info = pageserver.timeline_create(
|
||||
tenant_id,
|
||||
None,
|
||||
start_lsn,
|
||||
Some(ancestor_timeline_id),
|
||||
None,
|
||||
None,
|
||||
)?;
|
||||
let new_timeline_id = timeline_info.timeline_id;
|
||||
|
||||
let last_record_lsn = timeline_info.last_record_lsn;
|
||||
@@ -626,7 +604,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
let (sub_name, sub_args) = match ep_match.subcommand() {
|
||||
Some(ep_subcommand_data) => ep_subcommand_data,
|
||||
None => bail!("no endpoint subcommand provided"),
|
||||
@@ -636,12 +614,10 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
match sub_name {
|
||||
"list" => {
|
||||
let tenant_id = get_tenant_id(sub_args, env)?;
|
||||
let timeline_infos = get_timeline_infos(env, &tenant_id)
|
||||
.await
|
||||
.unwrap_or_else(|e| {
|
||||
eprintln!("Failed to load timeline info: {}", e);
|
||||
HashMap::new()
|
||||
});
|
||||
let timeline_infos = get_timeline_infos(env, &tenant_id).unwrap_or_else(|e| {
|
||||
eprintln!("Failed to load timeline info: {}", e);
|
||||
HashMap::new()
|
||||
});
|
||||
|
||||
let timeline_name_mappings = env.timeline_name_mappings();
|
||||
|
||||
@@ -815,9 +791,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
};
|
||||
|
||||
println!("Starting existing endpoint {endpoint_id}...");
|
||||
endpoint
|
||||
.start(&auth_token, safekeepers, remote_ext_config)
|
||||
.await?;
|
||||
endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
|
||||
}
|
||||
"reconfigure" => {
|
||||
let endpoint_id = sub_args
|
||||
@@ -835,7 +809,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
} else {
|
||||
None
|
||||
};
|
||||
endpoint.reconfigure(pageserver_id).await?;
|
||||
endpoint.reconfigure(pageserver_id)?;
|
||||
}
|
||||
"stop" => {
|
||||
let endpoint_id = sub_args
|
||||
@@ -901,12 +875,11 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
|
||||
))
|
||||
}
|
||||
|
||||
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
match sub_match.subcommand() {
|
||||
Some(("start", subcommand_args)) => {
|
||||
if let Err(e) = get_pageserver(env, subcommand_args)?
|
||||
.start(&pageserver_config_overrides(subcommand_args))
|
||||
.await
|
||||
{
|
||||
eprintln!("pageserver start failed: {e}");
|
||||
exit(1);
|
||||
@@ -933,10 +906,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if let Err(e) = pageserver
|
||||
.start(&pageserver_config_overrides(subcommand_args))
|
||||
.await
|
||||
{
|
||||
if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) {
|
||||
eprintln!("pageserver start failed: {e}");
|
||||
exit(1);
|
||||
}
|
||||
@@ -950,17 +920,14 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if let Err(e) = pageserver
|
||||
.start(&pageserver_config_overrides(subcommand_args))
|
||||
.await
|
||||
{
|
||||
if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) {
|
||||
eprintln!("pageserver start failed: {e}");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
Some(("status", subcommand_args)) => {
|
||||
match get_pageserver(env, subcommand_args)?.check_status().await {
|
||||
match get_pageserver(env, subcommand_args)?.check_status() {
|
||||
Ok(_) => println!("Page server is up and running"),
|
||||
Err(err) => {
|
||||
eprintln!("Page server is not available: {}", err);
|
||||
@@ -975,14 +942,11 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_attachment_service(
|
||||
sub_match: &ArgMatches,
|
||||
env: &local_env::LocalEnv,
|
||||
) -> Result<()> {
|
||||
fn handle_attachment_service(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
let svc = AttachmentService::from_env(env);
|
||||
match sub_match.subcommand() {
|
||||
Some(("start", _start_match)) => {
|
||||
if let Err(e) = svc.start().await {
|
||||
if let Err(e) = svc.start() {
|
||||
eprintln!("start failed: {e}");
|
||||
exit(1);
|
||||
}
|
||||
@@ -1023,7 +987,7 @@ fn safekeeper_extra_opts(init_match: &ArgMatches) -> Vec<String> {
|
||||
.collect()
|
||||
}
|
||||
|
||||
async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
let (sub_name, sub_args) = match sub_match.subcommand() {
|
||||
Some(safekeeper_command_data) => safekeeper_command_data,
|
||||
None => bail!("no safekeeper subcommand provided"),
|
||||
@@ -1041,7 +1005,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
"start" => {
|
||||
let extra_opts = safekeeper_extra_opts(sub_args);
|
||||
|
||||
if let Err(e) = safekeeper.start(extra_opts).await {
|
||||
if let Err(e) = safekeeper.start(extra_opts) {
|
||||
eprintln!("safekeeper start failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
@@ -1067,7 +1031,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
}
|
||||
|
||||
let extra_opts = safekeeper_extra_opts(sub_args);
|
||||
if let Err(e) = safekeeper.start(extra_opts).await {
|
||||
if let Err(e) = safekeeper.start(extra_opts) {
|
||||
eprintln!("safekeeper start failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
@@ -1080,15 +1044,15 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
// Endpoints are not started automatically
|
||||
|
||||
broker::start_broker_process(env).await?;
|
||||
broker::start_broker_process(env)?;
|
||||
|
||||
// Only start the attachment service if the pageserver is configured to need it
|
||||
if env.control_plane_api.is_some() {
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
if let Err(e) = attachment_service.start().await {
|
||||
if let Err(e) = attachment_service.start() {
|
||||
eprintln!("attachment_service start failed: {:#}", e);
|
||||
try_stop_all(env, true);
|
||||
exit(1);
|
||||
@@ -1097,10 +1061,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
|
||||
for ps_conf in &env.pageservers {
|
||||
let pageserver = PageServerNode::from_env(env, ps_conf);
|
||||
if let Err(e) = pageserver
|
||||
.start(&pageserver_config_overrides(sub_match))
|
||||
.await
|
||||
{
|
||||
if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
|
||||
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
|
||||
try_stop_all(env, true);
|
||||
exit(1);
|
||||
@@ -1109,7 +1070,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
|
||||
for node in env.safekeepers.iter() {
|
||||
let safekeeper = SafekeeperNode::from_env(env, node);
|
||||
if let Err(e) = safekeeper.start(vec![]).await {
|
||||
if let Err(e) = safekeeper.start(vec![]) {
|
||||
eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
|
||||
try_stop_all(env, false);
|
||||
exit(1);
|
||||
|
||||
@@ -11,7 +11,7 @@ use camino::Utf8PathBuf;
|
||||
|
||||
use crate::{background_process, local_env};
|
||||
|
||||
pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
let broker = &env.broker;
|
||||
let listen_addr = &broker.listen_addr;
|
||||
|
||||
@@ -19,15 +19,15 @@ pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<(
|
||||
|
||||
let args = [format!("--listen-addr={listen_addr}")];
|
||||
|
||||
let client = reqwest::Client::new();
|
||||
let client = reqwest::blocking::Client::new();
|
||||
background_process::start_process(
|
||||
"storage_broker",
|
||||
&env.base_data_dir,
|
||||
&env.storage_broker_bin(),
|
||||
args,
|
||||
[],
|
||||
background_process::InitialPidFile::Create(storage_broker_pid_file_path(env)),
|
||||
|| async {
|
||||
background_process::InitialPidFile::Create(&storage_broker_pid_file_path(env)),
|
||||
|| {
|
||||
let url = broker.client_url();
|
||||
let status_url = url.join("status").with_context(|| {
|
||||
format!("Failed to append /status path to broker endpoint {url}")
|
||||
@@ -36,13 +36,12 @@ pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<(
|
||||
.get(status_url)
|
||||
.build()
|
||||
.with_context(|| format!("Failed to construct request to broker endpoint {url}"))?;
|
||||
match client.execute(request).await {
|
||||
match client.execute(request) {
|
||||
Ok(resp) => Ok(resp.status().is_success()),
|
||||
Err(_) => Ok(false),
|
||||
}
|
||||
},
|
||||
)
|
||||
.await
|
||||
.context("Failed to spawn storage_broker subprocess")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -46,8 +46,6 @@ use std::time::Duration;
|
||||
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use compute_api::spec::RemoteExtSpec;
|
||||
use nix::sys::signal::kill;
|
||||
use nix::sys::signal::Signal;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
|
||||
@@ -441,14 +439,11 @@ impl Endpoint {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> {
|
||||
fn wait_for_compute_ctl_to_exit(&self) -> Result<()> {
|
||||
// TODO use background_process::stop_process instead
|
||||
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
|
||||
let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
|
||||
let pid = nix::unistd::Pid::from_raw(pid as i32);
|
||||
if send_sigterm {
|
||||
kill(pid, Signal::SIGTERM).ok();
|
||||
}
|
||||
crate::background_process::wait_until_stopped("compute_ctl", pid)?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -469,7 +464,7 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn start(
|
||||
pub fn start(
|
||||
&self,
|
||||
auth_token: &Option<String>,
|
||||
safekeepers: Vec<NodeId>,
|
||||
@@ -542,7 +537,6 @@ impl Endpoint {
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token: auth_token.clone(),
|
||||
remote_extensions,
|
||||
pgbouncer_settings: None,
|
||||
};
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
|
||||
@@ -593,7 +587,7 @@ impl Endpoint {
|
||||
const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s
|
||||
loop {
|
||||
attempt += 1;
|
||||
match self.get_status().await {
|
||||
match self.get_status() {
|
||||
Ok(state) => {
|
||||
match state.status {
|
||||
ComputeStatus::Init => {
|
||||
@@ -635,8 +629,8 @@ impl Endpoint {
|
||||
}
|
||||
|
||||
// Call the /status HTTP API
|
||||
pub async fn get_status(&self) -> Result<ComputeState> {
|
||||
let client = reqwest::Client::new();
|
||||
pub fn get_status(&self) -> Result<ComputeState> {
|
||||
let client = reqwest::blocking::Client::new();
|
||||
|
||||
let response = client
|
||||
.request(
|
||||
@@ -647,17 +641,16 @@ impl Endpoint {
|
||||
self.http_address.port()
|
||||
),
|
||||
)
|
||||
.send()
|
||||
.await?;
|
||||
.send()?;
|
||||
|
||||
// Interpret the response
|
||||
let status = response.status();
|
||||
if !(status.is_client_error() || status.is_server_error()) {
|
||||
Ok(response.json().await?)
|
||||
Ok(response.json()?)
|
||||
} else {
|
||||
// reqwest does not export its error construction utility functions, so let's craft the message ourselves
|
||||
let url = response.url().to_owned();
|
||||
let msg = match response.text().await {
|
||||
let msg = match response.text() {
|
||||
Ok(err_body) => format!("Error: {}", err_body),
|
||||
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
|
||||
};
|
||||
@@ -665,7 +658,7 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
|
||||
pub fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
|
||||
let mut spec: ComputeSpec = {
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
let file = std::fs::File::open(spec_path)?;
|
||||
@@ -694,7 +687,7 @@ impl Endpoint {
|
||||
spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}"));
|
||||
}
|
||||
|
||||
let client = reqwest::Client::new();
|
||||
let client = reqwest::blocking::Client::new();
|
||||
let response = client
|
||||
.post(format!(
|
||||
"http://{}:{}/configure",
|
||||
@@ -705,15 +698,14 @@ impl Endpoint {
|
||||
"{{\"spec\":{}}}",
|
||||
serde_json::to_string_pretty(&spec)?
|
||||
))
|
||||
.send()
|
||||
.await?;
|
||||
.send()?;
|
||||
|
||||
let status = response.status();
|
||||
if !(status.is_client_error() || status.is_server_error()) {
|
||||
Ok(())
|
||||
} else {
|
||||
let url = response.url().to_owned();
|
||||
let msg = match response.text().await {
|
||||
let msg = match response.text() {
|
||||
Ok(err_body) => format!("Error: {}", err_body),
|
||||
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
|
||||
};
|
||||
@@ -738,15 +730,10 @@ impl Endpoint {
|
||||
&None,
|
||||
)?;
|
||||
|
||||
// Also wait for the compute_ctl process to die. It might have some
|
||||
// cleanup work to do after postgres stops, like syncing safekeepers,
|
||||
// etc.
|
||||
// Also wait for the compute_ctl process to die. It might have some cleanup
|
||||
// work to do after postgres stops, like syncing safekeepers, etc.
|
||||
//
|
||||
// If destroying, send it SIGTERM before waiting. Sometimes we do *not*
|
||||
// want this cleanup: tests intentionally do stop when majority of
|
||||
// safekeepers is down, so sync-safekeepers would hang otherwise. This
|
||||
// could be a separate flag though.
|
||||
self.wait_for_compute_ctl_to_exit(destroy)?;
|
||||
self.wait_for_compute_ctl_to_exit()?;
|
||||
if destroy {
|
||||
println!(
|
||||
"Destroying postgres data directory '{}'",
|
||||
|
||||
@@ -6,24 +6,28 @@
|
||||
//!
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::fs::File;
|
||||
use std::io::{BufReader, Write};
|
||||
use std::num::NonZeroU64;
|
||||
use std::path::PathBuf;
|
||||
use std::process::{Child, Command};
|
||||
use std::time::Duration;
|
||||
use std::{io, result};
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use camino::Utf8PathBuf;
|
||||
use futures::SinkExt;
|
||||
use pageserver_api::models::{self, LocationConfig, TenantInfo, TimelineInfo};
|
||||
use pageserver_api::models::{
|
||||
self, LocationConfig, TenantInfo, TenantLocationConfigRequest, TimelineInfo,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api;
|
||||
use postgres_backend::AuthType;
|
||||
use postgres_connection::{parse_host_port, PgConnectionConfig};
|
||||
use reqwest::blocking::{Client, RequestBuilder, Response};
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use thiserror::Error;
|
||||
use utils::auth::{Claims, Scope};
|
||||
use utils::{
|
||||
http::error::HttpErrorBody,
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
@@ -34,6 +38,45 @@ use crate::{background_process, local_env::LocalEnv};
|
||||
/// Directory within .neon which will be used by default for LocalFs remote storage.
|
||||
pub const PAGESERVER_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/pageserver";
|
||||
|
||||
#[derive(Error, Debug)]
pub enum PageserverHttpError {
    #[error("Reqwest error: {0}")]
    Transport(#[from] reqwest::Error),

    #[error("Error: {0}")]
    Response(String),
}

impl From<anyhow::Error> for PageserverHttpError {
    fn from(e: anyhow::Error) -> Self {
        Self::Response(e.to_string())
    }
}

type Result<T> = result::Result<T, PageserverHttpError>;

pub trait ResponseErrorMessageExt: Sized {
    fn error_from_body(self) -> Result<Self>;
}

impl ResponseErrorMessageExt for Response {
    fn error_from_body(self) -> Result<Self> {
        let status = self.status();
        if !(status.is_client_error() || status.is_server_error()) {
            return Ok(self);
        }

        // reqwest does not export its error construction utility functions, so let's craft the message ourselves
        let url = self.url().to_owned();
        Err(PageserverHttpError::Response(
            match self.json::<HttpErrorBody>() {
                Ok(err_body) => format!("Error: {}", err_body.msg),
                Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
            },
        ))
    }
}
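
With this extension trait in place, every blocking request in `PageServerNode` follows the same send-then-validate shape, as `check_status` further down shows. A condensed sketch of that shape (the URL is a placeholder and `Result` is the local alias defined above):

```rust
use reqwest::blocking::Client;

// Sketch only: relies on the `ResponseErrorMessageExt` trait and `Result` alias above.
fn ping_status(client: &Client) -> Result<()> {
    client
        .get("http://127.0.0.1:9898/v1/status")
        .send()? // transport failures become PageserverHttpError::Transport
        .error_from_body()?; // non-2xx responses become PageserverHttpError::Response
    Ok(())
}
```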
//
|
||||
// Control routines for pageserver.
|
||||
//
|
||||
@@ -44,7 +87,8 @@ pub struct PageServerNode {
|
||||
pub pg_connection_config: PgConnectionConfig,
|
||||
pub conf: PageServerConf,
|
||||
pub env: LocalEnv,
|
||||
pub http_client: mgmt_api::Client,
|
||||
pub http_client: Client,
|
||||
pub http_base_url: String,
|
||||
}
|
||||
|
||||
impl PageServerNode {
|
||||
@@ -56,19 +100,8 @@ impl PageServerNode {
|
||||
pg_connection_config: PgConnectionConfig::new_host_port(host, port),
|
||||
conf: conf.clone(),
|
||||
env: env.clone(),
|
||||
http_client: mgmt_api::Client::new(
|
||||
format!("http://{}", conf.listen_http_addr),
|
||||
{
|
||||
match conf.http_auth_type {
|
||||
AuthType::Trust => None,
|
||||
AuthType::NeonJWT => Some(
|
||||
env.generate_auth_token(&Claims::new(None, Scope::PageServerApi))
|
||||
.unwrap(),
|
||||
),
|
||||
}
|
||||
}
|
||||
.as_deref(),
|
||||
),
|
||||
http_client: Client::new(),
|
||||
http_base_url: format!("http://{}/v1", conf.listen_http_addr),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -149,8 +182,8 @@ impl PageServerNode {
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
|
||||
self.start_node(config_overrides, false).await
|
||||
pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
|
||||
self.start_node(config_overrides, false)
|
||||
}
|
||||
|
||||
fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
|
||||
@@ -191,12 +224,7 @@ impl PageServerNode {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn start_node(
|
||||
&self,
|
||||
config_overrides: &[&str],
|
||||
update_config: bool,
|
||||
) -> anyhow::Result<Child> {
|
||||
// TODO: using a thread here because start_process() is not async but we need to call check_status()
|
||||
fn start_node(&self, config_overrides: &[&str], update_config: bool) -> anyhow::Result<Child> {
|
||||
let datadir = self.repo_path();
|
||||
print!(
|
||||
"Starting pageserver node {} at '{}' in {:?}",
|
||||
@@ -204,7 +232,7 @@ impl PageServerNode {
|
||||
self.pg_connection_config.raw_address(),
|
||||
datadir
|
||||
);
|
||||
io::stdout().flush().context("flush stdout")?;
|
||||
io::stdout().flush()?;
|
||||
|
||||
let datadir_path_str = datadir.to_str().with_context(|| {
|
||||
format!(
|
||||
@@ -216,23 +244,20 @@ impl PageServerNode {
|
||||
if update_config {
|
||||
args.push(Cow::Borrowed("--update-config"));
|
||||
}
|
||||
|
||||
background_process::start_process(
|
||||
"pageserver",
|
||||
&datadir,
|
||||
&self.env.pageserver_bin(),
|
||||
args.iter().map(Cow::as_ref),
|
||||
self.pageserver_env_variables()?,
|
||||
background_process::InitialPidFile::Expect(self.pid_file()),
|
||||
|| async {
|
||||
let st = self.check_status().await;
|
||||
match st {
|
||||
Ok(()) => Ok(true),
|
||||
Err(mgmt_api::Error::ReceiveBody(_)) => Ok(false),
|
||||
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
|
||||
}
|
||||
background_process::InitialPidFile::Expect(&self.pid_file()),
|
||||
|| match self.check_status() {
|
||||
Ok(()) => Ok(true),
|
||||
Err(PageserverHttpError::Transport(_)) => Ok(false),
|
||||
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
|
||||
},
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
fn pageserver_basic_args<'a>(
|
||||
@@ -278,12 +303,7 @@ impl PageServerNode {
|
||||
background_process::stop_process(immediate, "pageserver", &self.pid_file())
|
||||
}
|
||||
|
||||
pub async fn page_server_psql_client(
|
||||
&self,
|
||||
) -> anyhow::Result<(
|
||||
tokio_postgres::Client,
|
||||
tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
|
||||
)> {
|
||||
pub fn page_server_psql_client(&self) -> anyhow::Result<postgres::Client> {
|
||||
let mut config = self.pg_connection_config.clone();
|
||||
if self.conf.pg_auth_type == AuthType::NeonJWT {
|
||||
let token = self
|
||||
@@ -291,18 +311,36 @@ impl PageServerNode {
|
||||
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
|
||||
config = config.set_password(Some(token));
|
||||
}
|
||||
Ok(config.connect_no_tls().await?)
|
||||
Ok(config.connect_no_tls()?)
|
||||
}
|
||||
|
||||
pub async fn check_status(&self) -> mgmt_api::Result<()> {
|
||||
self.http_client.status().await
|
||||
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> anyhow::Result<RequestBuilder> {
|
||||
let mut builder = self.http_client.request(method, url);
|
||||
if self.conf.http_auth_type == AuthType::NeonJWT {
|
||||
let token = self
|
||||
.env
|
||||
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
|
||||
builder = builder.bearer_auth(token)
|
||||
}
|
||||
Ok(builder)
|
||||
}
|
||||
|
||||
pub async fn tenant_list(&self) -> mgmt_api::Result<Vec<TenantInfo>> {
|
||||
self.http_client.list_tenants().await
|
||||
pub fn check_status(&self) -> Result<()> {
|
||||
self.http_request(Method::GET, format!("{}/status", self.http_base_url))?
|
||||
.send()?
|
||||
.error_from_body()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn tenant_create(
|
||||
pub fn tenant_list(&self) -> Result<Vec<TenantInfo>> {
|
||||
Ok(self
|
||||
.http_request(Method::GET, format!("{}/tenant", self.http_base_url))?
|
||||
.send()?
|
||||
.error_from_body()?
|
||||
.json()?)
|
||||
}
|
||||
|
||||
pub fn tenant_create(
|
||||
&self,
|
||||
new_tenant_id: TenantId,
|
||||
generation: Option<u32>,
|
||||
@@ -380,10 +418,23 @@ impl PageServerNode {
|
||||
if !settings.is_empty() {
|
||||
bail!("Unrecognized tenant settings: {settings:?}")
|
||||
}
|
||||
Ok(self.http_client.tenant_create(&request).await?)
|
||||
self.http_request(Method::POST, format!("{}/tenant", self.http_base_url))?
|
||||
.json(&request)
|
||||
.send()?
|
||||
.error_from_body()?
|
||||
.json::<Option<String>>()
|
||||
.with_context(|| {
|
||||
format!("Failed to parse tenant creation response for tenant id: {new_tenant_id:?}")
|
||||
})?
|
||||
.context("No tenant id was found in the tenant creation response")
|
||||
.and_then(|tenant_id_string| {
|
||||
tenant_id_string.parse().with_context(|| {
|
||||
format!("Failed to parse response string as tenant id: '{tenant_id_string}'")
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn tenant_config(
|
||||
pub fn tenant_config(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
mut settings: HashMap<&str, &str>,
|
||||
@@ -462,37 +513,54 @@ impl PageServerNode {
|
||||
bail!("Unrecognized tenant settings: {settings:?}")
|
||||
}
|
||||
|
||||
self.http_client
|
||||
.tenant_config(&models::TenantConfigRequest { tenant_id, config })
|
||||
.await?;
|
||||
self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))?
|
||||
.json(&models::TenantConfigRequest { tenant_id, config })
|
||||
.send()?
|
||||
.error_from_body()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn location_config(
|
||||
pub fn location_config(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
config: LocationConfig,
|
||||
flush_ms: Option<Duration>,
|
||||
) -> anyhow::Result<()> {
|
||||
Ok(self
|
||||
.http_client
|
||||
.location_config(tenant_id, config, flush_ms)
|
||||
.await?)
|
||||
let req_body = TenantLocationConfigRequest { tenant_id, config };
|
||||
|
||||
let path = format!(
|
||||
"{}/tenant/{}/location_config",
|
||||
self.http_base_url, tenant_id
|
||||
);
|
||||
let path = if let Some(flush_ms) = flush_ms {
|
||||
format!("{}?flush_ms={}", path, flush_ms.as_millis())
|
||||
} else {
|
||||
path
|
||||
};
|
||||
|
||||
self.http_request(Method::PUT, path)?
|
||||
.json(&req_body)
|
||||
.send()?
|
||||
.error_from_body()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result<Vec<TimelineInfo>> {
|
||||
Ok(self.http_client.list_timelines(*tenant_id).await?)
|
||||
pub fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result<Vec<TimelineInfo>> {
|
||||
let timeline_infos: Vec<TimelineInfo> = self
|
||||
.http_request(
|
||||
Method::GET,
|
||||
format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
|
||||
)?
|
||||
.send()?
|
||||
.error_from_body()?
|
||||
.json()?;
|
||||
|
||||
Ok(timeline_infos)
|
||||
}
|
||||
|
||||
pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> {
|
||||
Ok(self
|
||||
.http_client
|
||||
.tenant_secondary_download(*tenant_id)
|
||||
.await?)
|
||||
}
|
||||
|
||||
pub async fn timeline_create(
|
||||
pub fn timeline_create(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
new_timeline_id: Option<TimelineId>,
|
||||
@@ -503,14 +571,29 @@ impl PageServerNode {
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
// If timeline ID was not specified, generate one
|
||||
let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
|
||||
let req = models::TimelineCreateRequest {
|
||||
|
||||
self.http_request(
|
||||
Method::POST,
|
||||
format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
|
||||
)?
|
||||
.json(&models::TimelineCreateRequest {
|
||||
new_timeline_id,
|
||||
ancestor_start_lsn,
|
||||
ancestor_timeline_id,
|
||||
pg_version,
|
||||
existing_initdb_timeline_id,
|
||||
};
|
||||
Ok(self.http_client.timeline_create(tenant_id, &req).await?)
|
||||
})
|
||||
.send()?
|
||||
.error_from_body()?
|
||||
.json::<Option<TimelineInfo>>()
|
||||
.with_context(|| {
|
||||
format!("Failed to parse timeline creation response for tenant id: {tenant_id}")
|
||||
})?
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"No timeline id was found in the timeline creation response for tenant {tenant_id}"
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
/// Import a basebackup prepared using either:
|
||||
@@ -522,7 +605,7 @@ impl PageServerNode {
|
||||
/// * `timeline_id` - id to assign to imported timeline
|
||||
/// * `base` - (start lsn of basebackup, path to `base.tar` file)
|
||||
/// * `pg_wal` - if there's any wal to import: (end lsn, path to `pg_wal.tar`)
|
||||
pub async fn timeline_import(
|
||||
pub fn timeline_import(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
@@ -530,60 +613,36 @@ impl PageServerNode {
|
||||
pg_wal: Option<(Lsn, PathBuf)>,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<()> {
|
||||
let (client, conn) = self.page_server_psql_client().await?;
|
||||
// The connection object performs the actual communication with the database,
|
||||
// so spawn it off to run on its own.
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = conn.await {
|
||||
eprintln!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
tokio::pin!(client);
|
||||
let mut client = self.page_server_psql_client()?;
|
||||
|
||||
// Init base reader
|
||||
let (start_lsn, base_tarfile_path) = base;
|
||||
let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?;
|
||||
let base_tarfile = tokio_util::io::ReaderStream::new(base_tarfile);
|
||||
let base_tarfile = File::open(base_tarfile_path)?;
|
||||
let mut base_reader = BufReader::new(base_tarfile);
|
||||
|
||||
// Init wal reader if necessary
|
||||
let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal {
|
||||
let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?;
|
||||
let wal_reader = tokio_util::io::ReaderStream::new(wal_tarfile);
|
||||
let wal_tarfile = File::open(wal_tarfile_path)?;
|
||||
let wal_reader = BufReader::new(wal_tarfile);
|
||||
(end_lsn, Some(wal_reader))
|
||||
} else {
|
||||
(start_lsn, None)
|
||||
};
|
||||
|
||||
let copy_in = |reader, cmd| {
|
||||
let client = &client;
|
||||
async move {
|
||||
let writer = client.copy_in(&cmd).await?;
|
||||
let writer = std::pin::pin!(writer);
|
||||
let mut writer = writer.sink_map_err(|e| {
|
||||
std::io::Error::new(std::io::ErrorKind::Other, format!("{e}"))
|
||||
});
|
||||
let mut reader = std::pin::pin!(reader);
|
||||
writer.send_all(&mut reader).await?;
|
||||
writer.into_inner().finish().await?;
|
||||
anyhow::Ok(())
|
||||
}
|
||||
};
|
||||
|
||||
// Import base
|
||||
copy_in(
|
||||
base_tarfile,
|
||||
format!(
|
||||
"import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}"
|
||||
),
|
||||
)
|
||||
.await?;
|
||||
let import_cmd = format!(
|
||||
"import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}"
|
||||
);
|
||||
let mut writer = client.copy_in(&import_cmd)?;
|
||||
io::copy(&mut base_reader, &mut writer)?;
|
||||
writer.finish()?;
|
||||
|
||||
// Import wal if necessary
|
||||
if let Some(wal_reader) = wal_reader {
|
||||
copy_in(
|
||||
wal_reader,
|
||||
format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"),
|
||||
)
|
||||
.await?;
|
||||
if let Some(mut wal_reader) = wal_reader {
|
||||
let import_cmd = format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}");
|
||||
let mut writer = client.copy_in(&import_cmd)?;
|
||||
io::copy(&mut wal_reader, &mut writer)?;
|
||||
writer.finish()?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -13,6 +13,7 @@ use std::{io, result};
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use reqwest::blocking::{Client, RequestBuilder, Response};
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use thiserror::Error;
|
||||
use utils::{http::error::HttpErrorBody, id::NodeId};
|
||||
@@ -33,14 +34,12 @@ pub enum SafekeeperHttpError {
|
||||
|
||||
type Result<T> = result::Result<T, SafekeeperHttpError>;
|
||||
|
||||
#[async_trait::async_trait]
|
||||
pub trait ResponseErrorMessageExt: Sized {
|
||||
async fn error_from_body(self) -> Result<Self>;
|
||||
fn error_from_body(self) -> Result<Self>;
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ResponseErrorMessageExt for reqwest::Response {
|
||||
async fn error_from_body(self) -> Result<Self> {
|
||||
impl ResponseErrorMessageExt for Response {
|
||||
fn error_from_body(self) -> Result<Self> {
|
||||
let status = self.status();
|
||||
if !(status.is_client_error() || status.is_server_error()) {
|
||||
return Ok(self);
|
||||
@@ -49,7 +48,7 @@ impl ResponseErrorMessageExt for reqwest::Response {
|
||||
// reqwest does not export its error construction utility functions, so let's craft the message ourselves
|
||||
let url = self.url().to_owned();
|
||||
Err(SafekeeperHttpError::Response(
|
||||
match self.json::<HttpErrorBody>().await {
|
||||
match self.json::<HttpErrorBody>() {
|
||||
Ok(err_body) => format!("Error: {}", err_body.msg),
|
||||
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
|
||||
},
|
||||
@@ -70,7 +69,7 @@ pub struct SafekeeperNode {
|
||||
|
||||
pub pg_connection_config: PgConnectionConfig,
|
||||
pub env: LocalEnv,
|
||||
pub http_client: reqwest::Client,
|
||||
pub http_client: Client,
|
||||
pub http_base_url: String,
|
||||
}
|
||||
|
||||
@@ -81,7 +80,7 @@ impl SafekeeperNode {
|
||||
conf: conf.clone(),
|
||||
pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
|
||||
env: env.clone(),
|
||||
http_client: reqwest::Client::new(),
|
||||
http_client: Client::new(),
|
||||
http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
|
||||
}
|
||||
}
|
||||
@@ -104,7 +103,7 @@ impl SafekeeperNode {
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
|
||||
pub fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
|
||||
print!(
|
||||
"Starting safekeeper at '{}' in '{}'",
|
||||
self.pg_connection_config.raw_address(),
|
||||
@@ -192,16 +191,13 @@ impl SafekeeperNode {
|
||||
&self.env.safekeeper_bin(),
|
||||
&args,
|
||||
[],
|
||||
background_process::InitialPidFile::Expect(self.pid_file()),
|
||||
|| async {
|
||||
match self.check_status().await {
|
||||
Ok(()) => Ok(true),
|
||||
Err(SafekeeperHttpError::Transport(_)) => Ok(false),
|
||||
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
|
||||
}
|
||||
background_process::InitialPidFile::Expect(&self.pid_file()),
|
||||
|| match self.check_status() {
|
||||
Ok(()) => Ok(true),
|
||||
Err(SafekeeperHttpError::Transport(_)) => Ok(false),
|
||||
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
|
||||
},
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
///
|
||||
@@ -220,7 +216,7 @@ impl SafekeeperNode {
|
||||
)
|
||||
}
|
||||
|
||||
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> reqwest::RequestBuilder {
|
||||
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
|
||||
// TODO: authentication
|
||||
//if self.env.auth_type == AuthType::NeonJWT {
|
||||
// builder = builder.bearer_auth(&self.env.safekeeper_auth_token)
|
||||
@@ -228,12 +224,10 @@ impl SafekeeperNode {
|
||||
self.http_client.request(method, url)
|
||||
}
|
||||
|
||||
pub async fn check_status(&self) -> Result<()> {
|
||||
pub fn check_status(&self) -> Result<()> {
|
||||
self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status"))
|
||||
.send()
|
||||
.await?
|
||||
.error_from_body()
|
||||
.await?;
|
||||
.send()?
|
||||
.error_from_body()?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,7 +11,6 @@ use crate::{
|
||||
use pageserver_api::models::{
|
||||
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::collections::HashMap;
|
||||
use std::time::Duration;
|
||||
use utils::{
|
||||
@@ -20,11 +19,11 @@ use utils::{
|
||||
};
|
||||
|
||||
/// Given an attached pageserver, retrieve the LSN for all timelines
|
||||
async fn get_lsns(
|
||||
fn get_lsns(
|
||||
tenant_id: TenantId,
|
||||
pageserver: &PageServerNode,
|
||||
) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
|
||||
let timelines = pageserver.timeline_list(&tenant_id).await?;
|
||||
let timelines = pageserver.timeline_list(&tenant_id)?;
|
||||
Ok(timelines
|
||||
.into_iter()
|
||||
.map(|t| (t.timeline_id, t.last_record_lsn))
|
||||
@@ -33,17 +32,17 @@ async fn get_lsns(
|
||||
|
||||
/// Wait for the timeline LSNs on `pageserver` to catch up with or overtake
|
||||
/// `baseline`.
|
||||
async fn await_lsn(
|
||||
fn await_lsn(
|
||||
tenant_id: TenantId,
|
||||
pageserver: &PageServerNode,
|
||||
baseline: HashMap<TimelineId, Lsn>,
|
||||
) -> anyhow::Result<()> {
|
||||
loop {
|
||||
let latest = match get_lsns(tenant_id, pageserver).await {
|
||||
let latest = match get_lsns(tenant_id, pageserver) {
|
||||
Ok(l) => l,
|
||||
Err(_e) => {
|
||||
Err(e) => {
|
||||
println!(
|
||||
"🕑 Waiting for pageserver {} to activate...",
|
||||
"🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
|
||||
pageserver.conf.id
|
||||
);
|
||||
std::thread::sleep(Duration::from_millis(500));
|
||||
@@ -85,12 +84,12 @@ async fn await_lsn(
|
||||
/// - Coordinate attach/secondary/detach on pageservers
|
||||
/// - call into attachment_service for generations
|
||||
/// - reconfigure compute endpoints to point to new attached pageserver
|
||||
pub async fn migrate_tenant(
|
||||
pub fn migrate_tenant(
|
||||
env: &LocalEnv,
|
||||
tenant_id: TenantId,
|
||||
dest_ps: PageServerNode,
|
||||
) -> anyhow::Result<()> {
|
||||
println!("🤔 Checking existing status...");
|
||||
// Get a new generation
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
|
||||
fn build_location_config(
|
||||
@@ -109,18 +108,16 @@ pub async fn migrate_tenant(
|
||||
}
|
||||
}
|
||||
|
||||
let previous = attachment_service.inspect(tenant_id).await?;
|
||||
let previous = attachment_service.inspect(tenant_id)?;
|
||||
let mut baseline_lsns = None;
|
||||
if let Some((generation, origin_ps_id)) = &previous {
|
||||
let origin_ps = PageServerNode::from_env(env, env.get_pageserver_conf(*origin_ps_id)?);
|
||||
|
||||
if origin_ps_id == &dest_ps.conf.id {
|
||||
println!("🔁 Already attached to {origin_ps_id}, freshening...");
|
||||
let gen = attachment_service
|
||||
.attach_hook(tenant_id, dest_ps.conf.id)
|
||||
.await?;
|
||||
let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
|
||||
let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
|
||||
dest_ps.location_config(tenant_id, dest_conf, None).await?;
|
||||
dest_ps.location_config(tenant_id, dest_conf, None)?;
|
||||
println!("✅ Migration complete");
|
||||
return Ok(());
|
||||
}
|
||||
@@ -129,38 +126,20 @@ pub async fn migrate_tenant(
|
||||
|
||||
let stale_conf =
|
||||
build_location_config(LocationConfigMode::AttachedStale, Some(*generation), None);
|
||||
origin_ps
|
||||
.location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))
|
||||
.await?;
|
||||
origin_ps.location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))?;
|
||||
|
||||
baseline_lsns = Some(get_lsns(tenant_id, &origin_ps).await?);
|
||||
baseline_lsns = Some(get_lsns(tenant_id, &origin_ps)?);
|
||||
}
|
||||
|
||||
println!(
|
||||
"🔁 Downloading latest layers to destination pageserver {}",
|
||||
dest_ps.conf.id
|
||||
);
|
||||
match dest_ps
|
||||
.tenant_secondary_download(&TenantShardId::unsharded(tenant_id))
|
||||
.await
|
||||
{
|
||||
Ok(()) => {}
|
||||
Err(_) => {
|
||||
println!(" (skipping, destination wasn't in secondary mode)")
|
||||
}
|
||||
}
|
||||
|
||||
let gen = attachment_service
|
||||
.attach_hook(tenant_id, dest_ps.conf.id)
|
||||
.await?;
|
||||
let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
|
||||
let dest_conf = build_location_config(LocationConfigMode::AttachedMulti, gen, None);
|
||||
|
||||
println!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
|
||||
dest_ps.location_config(tenant_id, dest_conf, None).await?;
|
||||
dest_ps.location_config(tenant_id, dest_conf, None)?;
|
||||
|
||||
if let Some(baseline) = baseline_lsns {
|
||||
println!("🕑 Waiting for LSN to catch up...");
|
||||
await_lsn(tenant_id, &dest_ps, baseline).await?;
|
||||
await_lsn(tenant_id, &dest_ps, baseline)?;
|
||||
}
|
||||
|
||||
let cplane = ComputeControlPlane::load(env.clone())?;
|
||||
@@ -170,7 +149,7 @@ pub async fn migrate_tenant(
|
||||
"🔁 Reconfiguring endpoint {} to use pageserver {}",
|
||||
endpoint_name, dest_ps.conf.id
|
||||
);
|
||||
endpoint.reconfigure(Some(dest_ps.conf.id)).await?;
|
||||
endpoint.reconfigure(Some(dest_ps.conf.id))?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -180,7 +159,7 @@ pub async fn migrate_tenant(
|
||||
}
|
||||
|
||||
let other_ps = PageServerNode::from_env(env, other_ps_conf);
|
||||
let other_ps_tenants = other_ps.tenant_list().await?;
|
||||
let other_ps_tenants = other_ps.tenant_list()?;
|
||||
|
||||
// Check if this tenant is attached
|
||||
let found = other_ps_tenants
|
||||
@@ -202,9 +181,7 @@ pub async fn migrate_tenant(
|
||||
"💤 Switching to secondary mode on pageserver {}",
|
||||
other_ps.conf.id
|
||||
);
|
||||
other_ps
|
||||
.location_config(tenant_id, secondary_conf, None)
|
||||
.await?;
|
||||
other_ps.location_config(tenant_id, secondary_conf, None)?;
|
||||
}
|
||||
|
||||
println!(
|
||||
@@ -212,7 +189,7 @@ pub async fn migrate_tenant(
|
||||
dest_ps.conf.id
|
||||
);
|
||||
let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
|
||||
dest_ps.location_config(tenant_id, dest_conf, None).await?;
|
||||
dest_ps.location_config(tenant_id, dest_conf, None)?;
|
||||
|
||||
println!("✅ Migration complete");
|
||||
|
||||
|
||||
@@ -35,7 +35,6 @@ allow = [
|
||||
"Artistic-2.0",
|
||||
"BSD-2-Clause",
|
||||
"BSD-3-Clause",
|
||||
"CC0-1.0",
|
||||
"ISC",
|
||||
"MIT",
|
||||
"MPL-2.0",
|
||||
|
||||
@@ -1,197 +0,0 @@
# Per-Tenant GetPage@LSN Throttling

Author: Christian Schwarz
Date: Oct 24, 2023

## Summary

This RFC proposes per-tenant throttling of GetPage@LSN requests inside Pageserver
and the interactions with its client, i.e., the neon_smgr component in Compute.

The result of implementing & executing this RFC will be a fleet-wide upper limit for
**"the highest GetPage/second that Pageserver can support for a single tenant/shard"**.

## Background

### GetPage@LSN Request Flow

Pageserver exposes its `page_service.rs` as a libpq listener.
The Computes' `neon_smgr` module connects to that libpq listener.
Once a connection is established, the protocol allows Compute to request page images at a given LSN.
We call these requests GetPage@LSN requests, or GetPage requests for short.
Other request types can be sent, but these are low traffic compared to GetPage requests
and are not the concern of this RFC.

Pageserver associates one libpq connection with one tokio task.

Per connection/task, the pq protocol is handled by the common `postgres_backend` crate.
Its `run_message_loop` function invokes the `page_service`-specific `impl<IO> postgres_backend::Handler<IO> for PageServerHandler`.
Requests are processed in the order in which they arrive via the TCP-based pq protocol.
So, there is no concurrent request processing within one connection/task.

There is a degree of natural pipelining:
Compute can "fill the pipe" by sending more than one GetPage request into the libpq TCP stream.
And Pageserver can fill the pipe with responses in the other direction.
Both directions are subject to the limits of tx/rx buffers, nodelay, TCP flow control, etc.

### GetPage@LSN Access Pattern

The Compute has its own hierarchy of caches, specifically `shared_buffers` and the `local file cache` (LFC).
Compute only issues GetPage requests to Pageserver if it encounters a miss in these caches.

If the working set stops fitting into Compute's caches, requests to Pageserver increase sharply -- the Compute starts *thrashing*.

## Motivation

In INC-69, a tenant issued 155k GetPage/second for a period of 10 minutes and 60k GetPage/second for a period of 3h,
then dropped to ca. 18k GetPage/second for a period of 9h.

We noticed this because of an internal GetPage latency SLO burn rate alert, i.e.,
the request latency profile during this period significantly exceeded what was acceptable according to the internal SLO.

Sadly, we do not have the observability data to determine the impact of this tenant on other tenants on the same pageserver.

However, here are some illustrative data points for the 155k period:
The tenant was responsible for >= 99% of the GetPage traffic and, frankly, the overall activity on this Pageserver instance.
We were serving pages at 10 Gb/s (`155k x 8 kbyte (PAGE_SZ) per second is 1.12GiB/s = 9.4Gb/s`).
The CPU utilization of the instance was 75% user+system.
Pageserver page cache served 1.75M accesses/second at a hit rate of ca. 90%.
The hit rate for materialized pages was ca. 40%.
Curiously, IOPS to the Instance Store NVMe were very low, rarely exceeding 100.

The fact that the IOPS were so low and the materialized page cache hit rate was so high suggests that **this tenant's compute's caches were thrashing**.
The compute was of type `k8s-pod`; hence, auto-scaling could/would not have helped remediate the thrashing by provisioning more RAM.
The consequence was that the **thrashing translated into excessive GetPage requests against Pageserver**.

My claim is that it was **unhealthy to serve this workload at the pace we did**:
* it is likely that other tenants were/would have experienced high latencies (again, we sadly don't have per-tenant latency data to confirm this)
* more importantly, it was **unsustainable** to serve traffic at this pace for multiple reasons:
  * **predictability of performance**: when the working set grows, the pageserver materialized page cache hit rate drops.
    At some point, we're bound by the EC2 Instance Store NVMe drive's IOPS limit.
    The result is an **uneven** performance profile from the Compute perspective.
  * **economics**: Neon currently does not charge for IOPS, only capacity.
    **We cannot afford to undercut the market in IOPS/$ this drastically; it leads to adverse selection and perverse incentives.**
    For example, the 155k IOPS, which we served for 10min, would cost ca. 6.5k$/month when provisioned as an io2 EBS volume (the arithmetic is sketched right after this list).
    Even the 18k IOPS, which we served for 9h, would cost ca. 1.1k$/month when provisioned as an io2 EBS volume.
    We charge 0$.
    It could be economically advantageous to keep using a low-DRAM compute because Pageserver IOPS are fast enough and free.

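As a sanity check on those figures (a sketch, assuming io2's tiered provisioned-IOPS pricing of roughly $0.065, $0.046, and $0.032 per IOPS-month for the 0-32k, 32k-64k, and >64k tiers at the time of writing):

$$
155{,}000\ \text{IOPS}:\quad 32{,}000 \cdot 0.065 + 32{,}000 \cdot 0.046 + 91{,}000 \cdot 0.032 \approx \$6{,}464/\text{month}
$$

$$
18{,}000\ \text{IOPS}:\quad 18{,}000 \cdot 0.065 \approx \$1{,}170/\text{month}
$$
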
Note: It is helpful to think of Pageserver as a disk, because it's precisely where `neon_smgr` sits:
vanilla Postgres gets its pages from disk, Neon Postgres gets them from Pageserver.
So, regarding the above performance & economic arguments, it is fair to say that we currently provide an "as-fast-as-possible-IOPS" disk that we charge for only by capacity.

## Solution: Throttling GetPage Requests

**The consequence of the above analysis must be that Pageserver throttles GetPage@LSN requests**.
That is, unless we want to start charging for provisioned GetPage@LSN/second.
Throttling sets the correct incentive for a thrashing Compute to scale up its DRAM to the working set size.
Neon Autoscaling will make this easy, [eventually](https://github.com/neondatabase/neon/pull/3913).

## The Design Space

What remains is the question of *policy* and *mechanism*:

**Policy** concerns itself with the question of what limit applies to a given connection|timeline|tenant.
Candidates are:

* hard limit, same limit value per connection|timeline|tenant
  * Per-tenant will provide an upper bound for the impact of a tenant on a given Pageserver instance.
    This is a major operational pain point / risk right now.
* hard limit, configurable per connection|timeline|tenant
  * This outsources policy to console/control plane, with obvious advantages for flexible structuring of what service we offer to customers.
  * Note that this is not a mechanism to guarantee a minimum provisioned rate, i.e., this is not a mechanism to guarantee a certain QoS for a tenant.
* fair share among active connections|timelines|tenants per instance
  * example: each connection|timeline|tenant gets a fair fraction of the machine's GetPage/second capacity
  * NB: needs a definition of "active", and knowledge of the available GetPage/second capacity in advance
* ...

Regarding **mechanism**, it's clear that **backpressure** is the way to go.
However, we must choose between
* **implicit** backpressure through pq/TCP and
* **explicit** rejection of requests + retries with exponential backoff

Further, there is the question of how throttling GetPage@LSN will affect the **internal GetPage latency SLO**:
where do we measure the SLI for Pageserver's internal getpage latency SLO? Before or after the throttling?

And when we eventually move the measurement point into the Computes (to avoid coordinated omission),
how do we avoid counting throttling-induced latency toward the internal getpage latency SLI/SLO?

## Scope Of This RFC

**This RFC proposes introducing a hard GetPage@LSN/second limit per tenant, with the same value applying to each tenant on a Pageserver**.

This proposal is easy to implement and significantly de-risks operating large Pageservers,
based on the assumption that extremely-high-GetPage-rate episodes like the one from the "Motivation" section are uncorrelated between tenants.

For example, suppose we pick a limit that allows up to 10 tenants to go at limit rate.
Suppose our Pageserver can serve 100k GetPage/second total at a 100% page cache miss rate.
If each tenant gets a hard limit of 10k GetPage/second, we can serve up to 10 tenants at limit speed without latency degradation.

The mechanism for backpressure will be TCP-based implicit backpressure.
The compute team isn't concerned about prefetch queue depth.
Pageserver will implement it by delaying the reading of requests from the libpq connection(s).

The rate limit will be implemented using a per-tenant token bucket.
The bucket will be shared among all connections to the tenant.
The bucket implementation supports starvation-preventing `await`ing.
The current candidate for the implementation is [`leaky_bucket`](https://docs.rs/leaky-bucket/).
The getpage@lsn benchmark that's being added in https://github.com/neondatabase/neon/issues/5771
can be used to evaluate the overhead of sharing the bucket among connections of a tenant.
A possible technique to mitigate the impact of sharing the bucket would be to maintain a buffer of a few tokens per connection handler.
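
The following is a minimal sketch of that design, assuming the `leaky_bucket` 1.x builder API; the limit value, refill granularity, and the surrounding handler names are placeholders rather than the final implementation:

```rust
use std::{sync::Arc, time::Duration};

use leaky_bucket::RateLimiter;

/// One bucket per tenant, shared by all page_service connections of that tenant.
/// Refilling in small steps (here: every 10ms) smooths bursts; `fair(true)` keeps
/// FIFO ordering among waiters, which is the starvation-prevention property we rely on.
fn new_tenant_getpage_limiter(limit_per_second: usize) -> Arc<RateLimiter> {
    Arc::new(
        RateLimiter::builder()
            .max(limit_per_second) // bucket capacity, i.e. maximum burst
            .initial(limit_per_second) // start full so a cold connection isn't penalized
            .refill(limit_per_second / 100) // tokens added per refill interval
            .interval(Duration::from_millis(10)) // 100 refill intervals per second
            .fair(true)
            .build(),
    )
}

/// Sketch of where the handler would wait: before reading the next GetPage request
/// off the libpq connection, so that TCP backpressure builds up naturally while the
/// tenant is over its limit.
async fn before_reading_next_request(limiter: &RateLimiter) {
    limiter.acquire_one().await;
}
```

The per-connection token buffer mentioned above would amount to acquiring a handful of tokens at once (`limiter.acquire(n)`) and spending them locally before touching the shared bucket again.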

Regarding metrics / the internal GetPage latency SLO:
we will measure the GetPage latency SLO _after_ the throttler and introduce a new metric to measure the amount of throttling (see the sketch after this list), quantified by:
- a histogram that records the tenants' observations of queue depth before they start waiting (one such histogram per pageserver)
- a histogram that records the tenants' observations of time spent waiting (one such histogram per pageserver)
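
A sketch of how the wait-time histogram could be fed, using the `prometheus` crate directly for illustration; the metric name is hypothetical, and the real implementation would go through Pageserver's metrics wrapper:

```rust
use std::time::Instant;

use prometheus::{Histogram, HistogramOpts};

/// Hypothetical metric; the real name and bucket boundaries are to be decided during implementation.
fn throttle_wait_seconds() -> Histogram {
    Histogram::with_opts(HistogramOpts::new(
        "pageserver_getpage_throttle_wait_seconds",
        "Time GetPage requests spent waiting in the per-tenant throttle",
    ))
    .expect("valid histogram options")
}

/// Wrap the token acquisition so that only the time actually spent throttled is recorded,
/// keeping it out of the post-throttle GetPage latency SLI.
async fn acquire_and_record(limiter: &leaky_bucket::RateLimiter, wait_seconds: &Histogram) {
    let started = Instant::now();
    limiter.acquire_one().await;
    wait_seconds.observe(started.elapsed().as_secs_f64());
}
```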

Further observability measures:
- an INFO log message at a frequency of 1/min if the tenant/timeline/connection was throttled in the last minute.
  The message will identify the tenant/timeline/connection to allow correlation with compute logs/stats.

Rollout will happen as follows:
- deploy 1: implementation + config: disabled by default, with the ability to enable it per tenant through tenant_conf (see the sketch after this list)
- experimentation in staging and later production to study impact & interaction with auto-scaling
- determination of a sensible global default value
  - the value will be chosen as high as possible ...
  - ... but low enough to work towards this RFC's goal that one tenant should not be able to dominate a pageserver instance.
- deploy 2: implementation fixes if any + config: enabled by default with the aforementioned global default
  - reset of the experimental per-tenant overrides
- gain experience & lower the limit over time
  - we stop lowering the limit as soon as this RFC's goal is achieved, i.e.,
    once we decide that in practice the chosen value sufficiently de-risks operating large pageservers
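
For illustration only, such a per-tenant override could flow through the existing `neon_local` tenant-config path shown earlier in this change; the setting name below is hypothetical and assumes it has been added to the keys `tenant_config` recognizes:

```rust
use std::collections::HashMap;

use control_plane::pageserver::PageServerNode;
use utils::id::TenantId;

/// Hypothetical knob name; the real tenant_conf field will be defined during implementation.
fn enable_getpage_throttle(
    pageserver: &PageServerNode,
    tenant_id: TenantId,
) -> anyhow::Result<()> {
    let mut settings: HashMap<&str, &str> = HashMap::new();
    settings.insert("getpage_throttle_per_second", "10000");
    // Reuses PageServerNode::tenant_config, which PUTs {base}/tenant/config on the pageserver.
    pageserver.tenant_config(tenant_id, settings)
}
```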

The per-tenant override will remain for emergencies and testing.
But since Console doesn't preserve it during tenant migrations, it isn't durably configurable for the tenant.

Toward the upper layers of the Neon stack, the resulting limit will be
**"the highest GetPage/second that Pageserver can support for a single tenant"**.

### Rationale

We decided against error + retry because of worries about starvation.

## Future Work

Enable per-tenant emergency override of the limit via Console.
Should be part of a more general framework to specify tenant config overrides.
**NB:** this is **not** the right mechanism to _sell_ different max GetPage/second levels to users,
or to _auto-scale_ the GetPage/second levels. Such functionality will require a separate RFC that
concerns itself with GetPage/second capacity planning.

Compute-side metrics for GetPage latency.

Back-channel to inform Compute/Autoscaling/ControlPlane that the project is being throttled.

Compute-side neon_smgr improvements to avoid sending the same GetPage request multiple times if multiple backends experience a cache miss.

Dealing with read-only endpoints: users use read-only endpoints to scale reads for a single tenant.
Possibly there are also assumptions around read-only endpoints not affecting the primary read-write endpoint's performance.
With per-tenant rate limiting, we will not meet that expectation.
However, we can currently only throttle per tenant.
Soon, we will have sharding (#5505), which will apply the throttling on a per-shard basis.
But that's orthogonal to scaling reads: if many endpoints hit one shard, they share the same throttling limit.
To solve this properly, I think we'll need replicas for tenants / shards.
To performance-isolate a tenant's endpoints from each other, we'd then route them to different replicas.

@@ -129,13 +129,13 @@ Run `poetry shell` to activate the virtual environment.
Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`.

### Obligatory checks
We force code formatting via `ruff`, and type hints via `mypy`.
We force code formatting via `black`, `ruff`, and type hints via `mypy`.
Run the following commands in the repository's root (next to `pyproject.toml`):

```bash
poetry run ruff format . # All code is reformatted
poetry run ruff check .  # Python linter
poetry run mypy .        # Ensure there are no typing errors
poetry run black .       # All code is reformatted
poetry run ruff .        # Python linter
poetry run mypy .        # Ensure there are no typing errors
```

**WARNING**: do not run `mypy` from a directory other than the root of the repository.

@@ -73,8 +73,6 @@ pub struct ComputeSpec {
|
||||
|
||||
// information about available remote extensions
|
||||
pub remote_extensions: Option<RemoteExtSpec>,
|
||||
|
||||
pub pgbouncer_settings: Option<HashMap<String, String>>,
|
||||
}
|
||||
|
||||
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
|
||||
|
||||
@@ -243,9 +243,5 @@
|
||||
"public_extensions": [
|
||||
"postgis"
|
||||
]
|
||||
},
|
||||
"pgbouncer_settings": {
|
||||
"default_pool_size": "42",
|
||||
"pool_mode": "session"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,4 +24,3 @@ workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
bincode.workspace = true
|
||||
rand.workspace = true
|
||||
|
||||
@@ -142,39 +142,5 @@ impl Key {
|
||||
}
|
||||
|
||||
pub fn is_rel_block_key(key: &Key) -> bool {
|
||||
key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff
|
||||
}
|
||||
|
||||
impl std::str::FromStr for Key {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
|
||||
Self::from_hex(s)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::str::FromStr;
|
||||
|
||||
use crate::key::Key;
|
||||
|
||||
use rand::Rng;
|
||||
use rand::SeedableRng;
|
||||
|
||||
#[test]
|
||||
fn display_fromstr_bijection() {
|
||||
let mut rng = rand::rngs::StdRng::seed_from_u64(42);
|
||||
|
||||
let key = Key {
|
||||
field1: rng.gen(),
|
||||
field2: rng.gen(),
|
||||
field3: rng.gen(),
|
||||
field4: rng.gen(),
|
||||
field5: rng.gen(),
|
||||
field6: rng.gen(),
|
||||
};
|
||||
|
||||
assert_eq!(key, Key::from_str(&format!("{key}")).unwrap());
|
||||
}
|
||||
key.field1 == 0x00 && key.field4 != 0
|
||||
}
|
||||
|
||||
@@ -5,7 +5,6 @@ use const_format::formatcp;
|
||||
/// Public API types
|
||||
pub mod control_api;
|
||||
pub mod key;
|
||||
pub mod keyspace;
|
||||
pub mod models;
|
||||
pub mod reltag;
|
||||
pub mod shard;
|
||||
|
||||
@@ -1,8 +1,5 @@
|
||||
pub mod partitioning;
|
||||
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
io::Read,
|
||||
num::{NonZeroU64, NonZeroUsize},
|
||||
time::SystemTime,
|
||||
};
|
||||
@@ -20,7 +17,7 @@ use utils::{
|
||||
|
||||
use crate::{reltag::RelTag, shard::TenantShardId};
|
||||
use anyhow::bail;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
|
||||
/// The state of a tenant in this pageserver.
|
||||
///
|
||||
@@ -370,14 +367,6 @@ pub struct TenantInfo {
|
||||
pub attachment_status: TenantAttachmentStatus,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct TenantDetails {
|
||||
#[serde(flatten)]
|
||||
pub tenant_info: TenantInfo,
|
||||
|
||||
pub timelines: Vec<TimelineId>,
|
||||
}
|
||||
|
||||
/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct TimelineInfo {
|
||||
@@ -557,6 +546,19 @@ pub enum DownloadRemoteLayersTaskState {
|
||||
ShutDown,
|
||||
}
|
||||
|
||||
pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
|
||||
|
||||
/// Information for configuring a single fail point
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct FailpointConfig {
|
||||
/// Name of the fail point
|
||||
pub name: String,
|
||||
/// List of actions to take, using the format described in `fail::cfg`
|
||||
///
|
||||
/// We also support `actions = "exit"` to cause the fail point to immediately exit.
|
||||
pub actions: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct TimelineGcRequest {
|
||||
pub gc_horizon: Option<u64>,
|
||||
@@ -572,7 +574,6 @@ pub enum PagestreamFeMessage {
|
||||
}
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
#[derive(strum_macros::EnumProperty)]
|
||||
pub enum PagestreamBeMessage {
|
||||
Exists(PagestreamExistsResponse),
|
||||
Nblocks(PagestreamNblocksResponse),
|
||||
@@ -581,29 +582,6 @@ pub enum PagestreamBeMessage {
|
||||
DbSize(PagestreamDbSizeResponse),
|
||||
}
|
||||
|
||||
// Keep in sync with `pagestore_client.h`
|
||||
#[repr(u8)]
|
||||
enum PagestreamBeMessageTag {
|
||||
Exists = 100,
|
||||
Nblocks = 101,
|
||||
GetPage = 102,
|
||||
Error = 103,
|
||||
DbSize = 104,
|
||||
}
|
||||
impl TryFrom<u8> for PagestreamBeMessageTag {
|
||||
type Error = u8;
|
||||
fn try_from(value: u8) -> Result<Self, u8> {
|
||||
match value {
|
||||
100 => Ok(PagestreamBeMessageTag::Exists),
|
||||
101 => Ok(PagestreamBeMessageTag::Nblocks),
|
||||
102 => Ok(PagestreamBeMessageTag::GetPage),
|
||||
103 => Ok(PagestreamBeMessageTag::Error),
|
||||
104 => Ok(PagestreamBeMessageTag::DbSize),
|
||||
_ => Err(value),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub struct PagestreamExistsRequest {
|
||||
pub latest: bool,
|
||||
@@ -759,91 +737,35 @@ impl PagestreamBeMessage {
|
||||
pub fn serialize(&self) -> Bytes {
|
||||
let mut bytes = BytesMut::new();
|
||||
|
||||
use PagestreamBeMessageTag as Tag;
|
||||
match self {
|
||||
Self::Exists(resp) => {
|
||||
bytes.put_u8(Tag::Exists as u8);
|
||||
bytes.put_u8(100); /* tag from pagestore_client.h */
|
||||
bytes.put_u8(resp.exists as u8);
|
||||
}
|
||||
|
||||
Self::Nblocks(resp) => {
|
||||
bytes.put_u8(Tag::Nblocks as u8);
|
||||
bytes.put_u8(101); /* tag from pagestore_client.h */
|
||||
bytes.put_u32(resp.n_blocks);
|
||||
}
|
||||
|
||||
Self::GetPage(resp) => {
|
||||
bytes.put_u8(Tag::GetPage as u8);
|
||||
bytes.put_u8(102); /* tag from pagestore_client.h */
|
||||
bytes.put(&resp.page[..]);
|
||||
}
|
||||
|
||||
Self::Error(resp) => {
|
||||
bytes.put_u8(Tag::Error as u8);
|
||||
bytes.put_u8(103); /* tag from pagestore_client.h */
|
||||
bytes.put(resp.message.as_bytes());
|
||||
bytes.put_u8(0); // null terminator
|
||||
}
|
||||
Self::DbSize(resp) => {
|
||||
bytes.put_u8(Tag::DbSize as u8);
|
||||
bytes.put_u8(104); /* tag from pagestore_client.h */
|
||||
bytes.put_i64(resp.db_size);
|
||||
}
|
||||
}
|
||||
|
||||
bytes.into()
|
||||
}
|
||||
|
||||
pub fn deserialize(buf: Bytes) -> anyhow::Result<Self> {
|
||||
let mut buf = buf.reader();
|
||||
let msg_tag = buf.read_u8()?;
|
||||
|
||||
use PagestreamBeMessageTag as Tag;
|
||||
let ok =
|
||||
match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? {
|
||||
Tag::Exists => {
|
||||
let exists = buf.read_u8()?;
|
||||
Self::Exists(PagestreamExistsResponse {
|
||||
exists: exists != 0,
|
||||
})
|
||||
}
|
||||
Tag::Nblocks => {
|
||||
let n_blocks = buf.read_u32::<BigEndian>()?;
|
||||
Self::Nblocks(PagestreamNblocksResponse { n_blocks })
|
||||
}
|
||||
Tag::GetPage => {
|
||||
let mut page = vec![0; 8192]; // TODO: use MaybeUninit
|
||||
buf.read_exact(&mut page)?;
|
||||
PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() })
|
||||
}
|
||||
Tag::Error => {
|
||||
let buf = buf.get_ref();
|
||||
let cstr = std::ffi::CStr::from_bytes_until_nul(buf)?;
|
||||
let rust_str = cstr.to_str()?;
|
||||
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||
message: rust_str.to_owned(),
|
||||
})
|
||||
}
|
||||
Tag::DbSize => {
|
||||
let db_size = buf.read_i64::<BigEndian>()?;
|
||||
Self::DbSize(PagestreamDbSizeResponse { db_size })
|
||||
}
|
||||
};
|
||||
let remaining = buf.into_inner();
|
||||
if !remaining.is_empty() {
|
||||
anyhow::bail!(
|
||||
"remaining bytes in msg with tag={msg_tag}: {}",
|
||||
remaining.len()
|
||||
);
|
||||
}
|
||||
Ok(ok)
|
||||
}
|
||||
|
||||
pub fn kind(&self) -> &'static str {
|
||||
match self {
|
||||
Self::Exists(_) => "Exists",
|
||||
Self::Nblocks(_) => "Nblocks",
|
||||
Self::GetPage(_) => "GetPage",
|
||||
Self::Error(_) => "Error",
|
||||
Self::DbSize(_) => "DbSize",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -1,151 +0,0 @@
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub struct Partitioning {
|
||||
pub keys: crate::keyspace::KeySpace,
|
||||
|
||||
pub at_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl serde::Serialize for Partitioning {
|
||||
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
pub struct KeySpace<'a>(&'a crate::keyspace::KeySpace);
|
||||
|
||||
impl<'a> serde::Serialize for KeySpace<'a> {
|
||||
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
use serde::ser::SerializeSeq;
|
||||
let mut seq = serializer.serialize_seq(Some(self.0.ranges.len()))?;
|
||||
for kr in &self.0.ranges {
|
||||
seq.serialize_element(&KeyRange(kr))?;
|
||||
}
|
||||
seq.end()
|
||||
}
|
||||
}
|
||||
|
||||
use serde::ser::SerializeMap;
|
||||
let mut map = serializer.serialize_map(Some(2))?;
|
||||
map.serialize_key("keys")?;
|
||||
map.serialize_value(&KeySpace(&self.keys))?;
|
||||
map.serialize_key("at_lsn")?;
|
||||
map.serialize_value(&WithDisplay(&self.at_lsn))?;
|
||||
map.end()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct WithDisplay<'a, T>(&'a T);
|
||||
|
||||
impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> {
|
||||
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
serializer.collect_str(&self.0)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct KeyRange<'a>(&'a std::ops::Range<crate::key::Key>);
|
||||
|
||||
impl<'a> serde::Serialize for KeyRange<'a> {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
use serde::ser::SerializeTuple;
|
||||
let mut t = serializer.serialize_tuple(2)?;
|
||||
t.serialize_element(&WithDisplay(&self.0.start))?;
|
||||
t.serialize_element(&WithDisplay(&self.0.end))?;
|
||||
t.end()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> serde::Deserialize<'a> for Partitioning {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'a>,
|
||||
{
|
||||
pub struct KeySpace(crate::keyspace::KeySpace);
|
||||
|
||||
impl<'de> serde::Deserialize<'de> for KeySpace {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
#[serde_with::serde_as]
|
||||
#[derive(serde::Deserialize)]
|
||||
#[serde(transparent)]
|
||||
struct Key(#[serde_as(as = "serde_with::DisplayFromStr")] crate::key::Key);
|
||||
|
||||
#[serde_with::serde_as]
|
||||
#[derive(serde::Deserialize)]
|
||||
struct Range(Key, Key);
|
||||
|
||||
let ranges: Vec<Range> = serde::Deserialize::deserialize(deserializer)?;
|
||||
Ok(Self(crate::keyspace::KeySpace {
|
||||
ranges: ranges
|
||||
.into_iter()
|
||||
.map(|Range(start, end)| (start.0..end.0))
|
||||
.collect(),
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
#[serde_with::serde_as]
|
||||
#[derive(serde::Deserialize)]
|
||||
struct De {
|
||||
keys: KeySpace,
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
at_lsn: Lsn,
|
||||
}
|
||||
|
||||
let de: De = serde::Deserialize::deserialize(deserializer)?;
|
||||
Ok(Self {
|
||||
at_lsn: de.at_lsn,
|
||||
keys: de.keys.0,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_serialization_roundtrip() {
|
||||
let reference = r#"
|
||||
{
|
||||
"keys": [
|
||||
[
|
||||
"000000000000000000000000000000000000",
|
||||
"000000000000000000000000000000000001"
|
||||
],
|
||||
[
|
||||
"000000067F00000001000000000000000000",
|
||||
"000000067F00000001000000000000000002"
|
||||
],
|
||||
[
|
||||
"030000000000000000000000000000000000",
|
||||
"030000000000000000000000000000000003"
|
||||
]
|
||||
],
|
||||
"at_lsn": "0/2240160"
|
||||
}
|
||||
"#;
|
||||
|
||||
let de: Partitioning = serde_json::from_str(reference).unwrap();
|
||||
|
||||
let ser = serde_json::to_string(&de).unwrap();
|
||||
|
||||
let ser_de: serde_json::Value = serde_json::from_str(&ser).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
ser_de,
|
||||
serde_json::from_str::<'_, serde_json::Value>(reference).unwrap()
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -81,10 +81,6 @@ impl TenantShardId {
|
||||
pub fn is_zero(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0)
|
||||
}
|
||||
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
|
||||
}
|
||||
}
|
||||
|
||||
/// Formatting helper
|
||||
@@ -163,7 +159,7 @@ impl From<[u8; 18]> for TenantShardId {
|
||||
/// shard we're dealing with, but do not need to know the full ShardIdentity (because
|
||||
/// we won't be doing any page->shard mapping), and do not need to know the fully qualified
|
||||
/// TenantShardId.
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)]
|
||||
pub struct ShardIndex {
|
||||
pub shard_number: ShardNumber,
|
||||
pub shard_count: ShardCount,
|
||||
@@ -422,21 +418,6 @@ impl ShardIdentity {
|
||||
}
|
||||
}
|
||||
|
||||
/// Return true if the key should be discarded if found in this shard's
|
||||
/// data store, e.g. during compaction after a split
|
||||
pub fn is_key_disposable(&self, key: &Key) -> bool {
|
||||
if key_is_shard0(key) {
|
||||
// Q: Why can't we dispose of shard0 content if we're not shard 0?
|
||||
// A: because the WAL ingestion logic currently ingests some shard 0
|
||||
// content on all shards, even though it's only read on shard 0. If we
|
||||
// dropped it, then subsequent WAL ingest to these keys would encounter
|
||||
// an error.
|
||||
false
|
||||
} else {
|
||||
!self.is_key_local(key)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn shard_slug(&self) -> String {
|
||||
if self.count > ShardCount(0) {
|
||||
format!("-{:02x}{:02x}", self.number.0, self.count.0)
|
||||
@@ -530,7 +511,12 @@ fn key_is_shard0(key: &Key) -> bool {
|
||||
// relation pages are distributed to shards other than shard zero. Everything else gets
|
||||
// stored on shard 0. This guarantees that shard 0 can independently serve basebackup
|
||||
// requests, and any request other than those for particular blocks in relations.
|
||||
!is_rel_block_key(key)
|
||||
//
|
||||
// In this condition:
|
||||
// - is_rel_block_key includes only relations, i.e. excludes SLRU data and
|
||||
// all metadata.
|
||||
// - field6 is set to -1 for relation size pages.
|
||||
!(is_rel_block_key(key) && key.field6 != 0xffffffff)
|
||||
}
|
||||
|
||||
/// Provide the same result as the function in postgres `hashfn.h` with the same name
|
||||
|
||||
@@ -35,12 +35,6 @@ pub enum QueryError {
|
||||
/// We were instructed to shutdown while processing the query
|
||||
#[error("Shutting down")]
|
||||
Shutdown,
|
||||
/// Query handler indicated that client should reconnect
|
||||
#[error("Server requested reconnect")]
|
||||
Reconnect,
|
||||
/// Query named an entity that was not found
|
||||
#[error("Not found: {0}")]
|
||||
NotFound(std::borrow::Cow<'static, str>),
|
||||
/// Authentication failure
|
||||
#[error("Unauthorized: {0}")]
|
||||
Unauthorized(std::borrow::Cow<'static, str>),
|
||||
@@ -60,9 +54,9 @@ impl From<io::Error> for QueryError {
|
||||
impl QueryError {
|
||||
pub fn pg_error_code(&self) -> &'static [u8; 5] {
|
||||
match self {
|
||||
Self::Disconnected(_) | Self::SimulatedConnectionError | Self::Reconnect => b"08006", // connection failure
|
||||
Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure
|
||||
Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
|
||||
Self::Unauthorized(_) | Self::NotFound(_) => SQLSTATE_INTERNAL_ERROR,
|
||||
Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR,
|
||||
Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
|
||||
}
|
||||
}
|
||||
@@ -431,11 +425,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
info!("Stopped due to shutdown");
|
||||
Ok(())
|
||||
}
|
||||
Err(QueryError::Reconnect) => {
|
||||
// Dropping out of this loop implicitly disconnects
|
||||
info!("Stopped due to handler reconnect request");
|
||||
Ok(())
|
||||
}
|
||||
Err(QueryError::Disconnected(e)) => {
|
||||
info!("Disconnected ({e:#})");
|
||||
// Disconnection is not an error: we just use it that way internally to drop
|
||||
@@ -985,9 +974,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, I
|
||||
pub fn short_error(e: &QueryError) -> String {
|
||||
match e {
|
||||
QueryError::Disconnected(connection_error) => connection_error.to_string(),
|
||||
QueryError::Reconnect => "reconnect".to_string(),
|
||||
QueryError::Shutdown => "shutdown".to_string(),
|
||||
QueryError::NotFound(_) => "not found".to_string(),
|
||||
QueryError::Unauthorized(_e) => "JWT authentication error".to_string(),
|
||||
QueryError::SimulatedConnectionError => "simulated connection error".to_string(),
|
||||
QueryError::Other(e) => format!("{e:#}"),
|
||||
@@ -1009,15 +996,9 @@ fn log_query_error(query: &str, e: &QueryError) {
|
||||
QueryError::SimulatedConnectionError => {
|
||||
error!("query handler for query '{query}' failed due to a simulated connection error")
|
||||
}
|
||||
QueryError::Reconnect => {
|
||||
info!("query handler for '{query}' requested client to reconnect")
|
||||
}
|
||||
QueryError::Shutdown => {
|
||||
info!("query handler for '{query}' cancelled during tenant shutdown")
|
||||
}
|
||||
QueryError::NotFound(reason) => {
|
||||
info!("query handler for '{query}' entity not found: {reason}")
|
||||
}
|
||||
QueryError::Unauthorized(e) => {
|
||||
warn!("query handler for '{query}' failed with authentication error: {e}");
|
||||
}
|
||||
|
||||
@@ -163,18 +163,8 @@ impl PgConnectionConfig {
|
||||
}
|
||||
|
||||
/// Connect using postgres protocol with TLS disabled.
|
||||
pub async fn connect_no_tls(
|
||||
&self,
|
||||
) -> Result<
|
||||
(
|
||||
tokio_postgres::Client,
|
||||
tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
|
||||
),
|
||||
postgres::Error,
|
||||
> {
|
||||
self.to_tokio_postgres_config()
|
||||
.connect(postgres::NoTls)
|
||||
.await
|
||||
pub fn connect_no_tls(&self) -> Result<postgres::Client, postgres::Error> {
|
||||
postgres::Config::from(self.to_tokio_postgres_config()).connect(postgres::NoTls)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -117,8 +117,6 @@ impl AzureBlobStorage {
|
||||
) -> Result<Download, DownloadError> {
|
||||
let mut response = builder.into_stream();
|
||||
|
||||
let mut etag = None;
|
||||
let mut last_modified = None;
|
||||
let mut metadata = HashMap::new();
|
||||
// TODO give proper streaming response instead of buffering into RAM
|
||||
// https://github.com/neondatabase/neon/issues/5563
|
||||
@@ -126,13 +124,6 @@ impl AzureBlobStorage {
|
||||
let mut bufs = Vec::new();
|
||||
while let Some(part) = response.next().await {
|
||||
let part = part.map_err(to_download_error)?;
|
||||
let etag_str: &str = part.blob.properties.etag.as_ref();
|
||||
if etag.is_none() {
|
||||
etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
|
||||
}
|
||||
if last_modified.is_none() {
|
||||
last_modified = Some(part.blob.properties.last_modified.into());
|
||||
}
|
||||
if let Some(blob_meta) = part.blob.metadata {
|
||||
metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
|
||||
}
|
||||
@@ -145,8 +136,6 @@ impl AzureBlobStorage {
|
||||
}
|
||||
Ok(Download {
|
||||
download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
|
||||
etag,
|
||||
last_modified,
|
||||
metadata: Some(StorageMetadata(metadata)),
|
||||
})
|
||||
}
|
||||
@@ -322,12 +311,6 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn copy(&self, _from: &RemotePath, _to: &RemotePath) -> anyhow::Result<()> {
|
||||
Err(anyhow::anyhow!(
|
||||
"copy for azure blob storage is not implemented"
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
pin_project_lite::pin_project! {
|
||||
|
||||
@@ -14,9 +14,7 @@ mod local_fs;
|
||||
mod s3_bucket;
|
||||
mod simulate_failures;
|
||||
|
||||
use std::{
|
||||
collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc, time::SystemTime,
|
||||
};
|
||||
use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc};
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
@@ -207,18 +205,10 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;
|
||||
|
||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
|
||||
|
||||
/// Copy a remote object inside a bucket from one path to another.
|
||||
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>;
|
||||
}
|
||||
|
||||
pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
|
||||
pub struct Download {
|
||||
pub download_stream: DownloadStream,
|
||||
/// The last time the file was modified (`last-modified` HTTP header)
|
||||
pub last_modified: Option<SystemTime>,
|
||||
/// A way to identify this specific version of the resource (`etag` HTTP header)
|
||||
pub etag: Option<String>,
|
||||
pub download_stream: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>,
|
||||
/// Extra key-value data, associated with the current remote file.
|
||||
pub metadata: Option<StorageMetadata>,
|
||||
}
|
||||
@@ -377,15 +367,6 @@ impl GenericRemoteStorage {
|
||||
Self::Unreliable(s) => s.delete_objects(paths).await,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn copy_object(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.copy(from, to).await,
|
||||
Self::AwsS3(s) => s.copy(from, to).await,
|
||||
Self::AzureBlob(s) => s.copy(from, to).await,
|
||||
Self::Unreliable(s) => s.copy(from, to).await,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
@@ -672,7 +653,6 @@ impl ConcurrencyLimiter {
|
||||
RequestKind::Put => &self.write,
|
||||
RequestKind::List => &self.read,
|
||||
RequestKind::Delete => &self.write,
|
||||
RequestKind::Copy => &self.write,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@ use tokio_util::io::ReaderStream;
|
||||
use tracing::*;
|
||||
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
|
||||
|
||||
use crate::{Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath};
|
||||
use crate::{Download, DownloadError, Listing, ListingMode, RemotePath};
|
||||
|
||||
use super::{RemoteStorage, StorageMetadata};
|
||||
|
||||
@@ -331,8 +331,6 @@ impl RemoteStorage for LocalFs {
|
||||
.map_err(DownloadError::Other)?;
|
||||
Ok(Download {
|
||||
metadata,
|
||||
last_modified: None,
|
||||
etag: None,
|
||||
download_stream: Box::pin(source),
|
||||
})
|
||||
} else {
|
||||
@@ -374,17 +372,17 @@ impl RemoteStorage for LocalFs {
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let download_stream: DownloadStream = match end_exclusive {
|
||||
Some(end_exclusive) => Box::pin(ReaderStream::new(
|
||||
source.take(end_exclusive - start_inclusive),
|
||||
)),
|
||||
None => Box::pin(ReaderStream::new(source)),
|
||||
};
|
||||
Ok(Download {
|
||||
metadata,
|
||||
last_modified: None,
|
||||
etag: None,
|
||||
download_stream,
|
||||
Ok(match end_exclusive {
|
||||
Some(end_exclusive) => Download {
|
||||
metadata,
|
||||
download_stream: Box::pin(ReaderStream::new(
|
||||
source.take(end_exclusive - start_inclusive),
|
||||
)),
|
||||
},
|
||||
None => Download {
|
||||
metadata,
|
||||
download_stream: Box::pin(ReaderStream::new(source)),
|
||||
},
|
||||
})
|
||||
} else {
|
||||
Err(DownloadError::NotFound)
|
||||
@@ -409,20 +407,6 @@ impl RemoteStorage for LocalFs {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
|
||||
let from_path = from.with_base(&self.storage_root);
|
||||
let to_path = to.with_base(&self.storage_root);
|
||||
create_target_directory(&to_path).await?;
|
||||
fs::copy(&from_path, &to_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to copy file from '{from_path}' to '{to_path}'",
|
||||
from_path = from_path,
|
||||
to_path = to_path
|
||||
)
|
||||
})?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
|
||||
|
||||
@@ -16,7 +16,6 @@ use aws_config::{
|
||||
environment::credentials::EnvironmentVariableCredentialsProvider,
|
||||
imds::credentials::ImdsCredentialsProvider,
|
||||
meta::credentials::CredentialsProviderChain,
|
||||
profile::ProfileFileCredentialsProvider,
|
||||
provider_config::ProviderConfig,
|
||||
retry::{RetryConfigBuilder, RetryMode},
|
||||
web_identity_token::WebIdentityTokenCredentialsProvider,
|
||||
@@ -75,29 +74,20 @@ impl S3Bucket {
|
||||
|
||||
let region = Some(Region::new(aws_config.bucket_region.clone()));
|
||||
|
||||
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
|
||||
|
||||
let credentials_provider = {
|
||||
// uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
|
||||
CredentialsProviderChain::first_try(
|
||||
"env",
|
||||
EnvironmentVariableCredentialsProvider::new(),
|
||||
)
|
||||
// uses "AWS_PROFILE" / `aws sso login --profile <profile>`
|
||||
.or_else(
|
||||
"profile-sso",
|
||||
ProfileFileCredentialsProvider::builder()
|
||||
.configure(&provider_conf)
|
||||
.build(),
|
||||
)
|
||||
// uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
|
||||
// needed to access remote extensions bucket
|
||||
.or_else(
|
||||
"token",
|
||||
.or_else("token", {
|
||||
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
|
||||
WebIdentityTokenCredentialsProvider::builder()
|
||||
.configure(&provider_conf)
|
||||
.build(),
|
||||
)
|
||||
.build()
|
||||
})
|
||||
// uses imds v2
|
||||
.or_else("imds", ImdsCredentialsProvider::builder().build())
|
||||
};
|
||||
@@ -228,11 +218,17 @@ impl S3Bucket {
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
|
||||
if get_object.is_err() {
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Err,
|
||||
started_at,
|
||||
);
|
||||
}
|
||||
|
||||
match get_object {
|
||||
Ok(object_output) => {
|
||||
let metadata = object_output.metadata().cloned().map(StorageMetadata);
|
||||
let etag = object_output.e_tag.clone();
|
||||
let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
|
||||
|
||||
let body = object_output.body;
|
||||
let body = ByteStreamAsStream::from(body);
|
||||
@@ -241,33 +237,15 @@ impl S3Bucket {
|
||||
|
||||
Ok(Download {
|
||||
metadata,
|
||||
etag,
|
||||
last_modified,
|
||||
download_stream: Box::pin(body),
|
||||
})
|
||||
}
|
||||
Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
|
||||
// Count this in the AttemptOutcome::Ok bucket, because 404 is not
|
||||
// an error: we expect to sometimes fetch an object and find it missing,
|
||||
// e.g. when probing for timeline indices.
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Ok,
|
||||
started_at,
|
||||
);
|
||||
Err(DownloadError::NotFound)
|
||||
}
|
||||
Err(e) => {
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Err,
|
||||
started_at,
|
||||
);
|
||||
|
||||
Err(DownloadError::Other(
|
||||
anyhow::Error::new(e).context("download s3 object"),
|
||||
))
|
||||
}
|
||||
Err(e) => Err(DownloadError::Other(
|
||||
anyhow::Error::new(e).context("download s3 object"),
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -493,38 +471,6 @@ impl RemoteStorage for S3Bucket {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
|
||||
let kind = RequestKind::Copy;
|
||||
let _guard = self.permit(kind).await;
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
// we need to specify bucket_name as a prefix
|
||||
let copy_source = format!(
|
||||
"{}/{}",
|
||||
self.bucket_name,
|
||||
self.relative_path_to_s3_object(from)
|
||||
);
|
||||
|
||||
let res = self
|
||||
.client
|
||||
.copy_object()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.key(self.relative_path_to_s3_object(to))
|
||||
.copy_source(copy_source)
|
||||
.send()
|
||||
.await;
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &res, started_at);
|
||||
|
||||
res?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
|
||||
// if prefix is not none then download file `prefix/from`
|
||||
// if prefix is none then download file `from`
|
||||
|
||||
@@ -11,7 +11,6 @@ pub(crate) enum RequestKind {
|
||||
Put = 1,
|
||||
Delete = 2,
|
||||
List = 3,
|
||||
Copy = 4,
|
||||
}
|
||||
|
||||
use RequestKind::*;
|
||||
@@ -23,7 +22,6 @@ impl RequestKind {
|
||||
Put => "put_object",
|
||||
Delete => "delete_object",
|
||||
List => "list_objects",
|
||||
Copy => "copy_object",
|
||||
}
|
||||
}
|
||||
const fn as_index(&self) -> usize {
|
||||
@@ -31,7 +29,7 @@ impl RequestKind {
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) struct RequestTyped<C>([C; 5]);
|
||||
pub(super) struct RequestTyped<C>([C; 4]);
|
||||
|
||||
impl<C> RequestTyped<C> {
|
||||
pub(super) fn get(&self, kind: RequestKind) -> &C {
|
||||
@@ -40,8 +38,8 @@ impl<C> RequestTyped<C> {
|
||||
|
||||
fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
|
||||
use RequestKind::*;
|
||||
let mut it = [Get, Put, Delete, List, Copy].into_iter();
|
||||
let arr = std::array::from_fn::<C, 5, _>(|index| {
|
||||
let mut it = [Get, Put, Delete, List].into_iter();
|
||||
let arr = std::array::from_fn::<C, 4, _>(|index| {
|
||||
let next = it.next().unwrap();
|
||||
assert_eq!(index, next.as_index());
|
||||
f(next)
|
||||
|
||||
@@ -162,11 +162,4 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
|
||||
// copy is equivalent to download + upload
|
||||
self.attempt(RemoteOp::Download(from.clone()))?;
|
||||
self.attempt(RemoteOp::Upload(to.clone()))?;
|
||||
self.inner.copy_object(from, to).await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,200 +0,0 @@
|
||||
use std::collections::HashSet;
|
||||
use std::ops::ControlFlow;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8Path;
|
||||
use futures::stream::Stream;
|
||||
use once_cell::sync::OnceCell;
|
||||
use remote_storage::{Download, GenericRemoteStorage, RemotePath};
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
static LOGGING_DONE: OnceCell<()> = OnceCell::new();
|
||||
|
||||
pub(crate) fn upload_stream(
    content: std::borrow::Cow<'static, [u8]>,
) -> (
    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
    usize,
) {
    use std::borrow::Cow;

    let content = match content {
        Cow::Borrowed(x) => Bytes::from_static(x),
        Cow::Owned(vec) => Bytes::from(vec),
    };
    wrap_stream(content)
}

pub(crate) fn wrap_stream(
    content: bytes::Bytes,
) -> (
    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
    usize,
) {
    let len = content.len();
    let content = futures::future::ready(Ok(content));

    (futures::stream::once(content), len)
}

pub(crate) async fn download_to_vec(dl: Download) -> anyhow::Result<Vec<u8>> {
    let mut buf = Vec::new();
    tokio::io::copy_buf(
        &mut tokio_util::io::StreamReader::new(dl.download_stream),
        &mut buf,
    )
    .await?;
    Ok(buf)
}

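A minimal round-trip sketch using the helpers above (illustrative, not part of the diff): `upload_stream` turns an in-memory payload into the `(stream, length)` pair that `upload` expects, and `download_to_vec` collects the downloaded stream back into bytes. The client and the path literal are assumed for the example.

// Sketch only: upload a small payload and read it back with the helpers above.
pub(crate) async fn upload_then_download(client: &GenericRemoteStorage) -> anyhow::Result<()> {
    // Hypothetical key; the real tests use a per-run random prefix instead.
    let path = RemotePath::new(Utf8Path::new("example/roundtrip.txt"))?;

    let payload = b"example payload".to_vec();
    let (stream, len) = upload_stream(payload.clone().into());
    client.upload(stream, len, &path, None).await?;

    let downloaded = download_to_vec(client.download(&path).await?).await?;
    anyhow::ensure!(downloaded == payload, "downloaded bytes differ from uploaded bytes");
    Ok(())
}
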
// Uploads files `folder{j}/blob{i}.txt`. See test description for more details.
|
||||
pub(crate) async fn upload_simple_remote_data(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
upload_tasks_count: usize,
|
||||
) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
|
||||
info!("Creating {upload_tasks_count} remote files");
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
upload_tasks.spawn(async move {
|
||||
let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
|
||||
let blob_path = RemotePath::new(
|
||||
Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"),
|
||||
)
|
||||
.with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
|
||||
task_client.upload(data, len, &blob_path, None).await?;
|
||||
|
||||
Ok::<_, anyhow::Error>(blob_path)
|
||||
});
|
||||
}
|
||||
|
||||
let mut upload_tasks_failed = false;
|
||||
let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
|
||||
while let Some(task_run_result) = upload_tasks.join_next().await {
|
||||
match task_run_result
|
||||
.context("task join failed")
|
||||
.and_then(|task_result| task_result.context("upload task failed"))
|
||||
{
|
||||
Ok(upload_path) => {
|
||||
uploaded_blobs.insert(upload_path);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Upload task failed: {e:?}");
|
||||
upload_tasks_failed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if upload_tasks_failed {
|
||||
ControlFlow::Break(uploaded_blobs)
|
||||
} else {
|
||||
ControlFlow::Continue(uploaded_blobs)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn cleanup(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
objects_to_delete: HashSet<RemotePath>,
|
||||
) {
|
||||
info!(
|
||||
"Removing {} objects from the remote storage during cleanup",
|
||||
objects_to_delete.len()
|
||||
);
|
||||
let mut delete_tasks = JoinSet::new();
|
||||
for object_to_delete in objects_to_delete {
|
||||
let task_client = Arc::clone(client);
|
||||
delete_tasks.spawn(async move {
|
||||
debug!("Deleting remote item at path {object_to_delete:?}");
|
||||
task_client
|
||||
.delete(&object_to_delete)
|
||||
.await
|
||||
.with_context(|| format!("{object_to_delete:?} removal"))
|
||||
});
|
||||
}
|
||||
|
||||
while let Some(task_run_result) = delete_tasks.join_next().await {
|
||||
match task_run_result {
|
||||
Ok(task_result) => match task_result {
|
||||
Ok(()) => {}
|
||||
Err(e) => error!("Delete task failed: {e:?}"),
|
||||
},
|
||||
Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
pub(crate) struct Uploads {
|
||||
pub(crate) prefixes: HashSet<RemotePath>,
|
||||
pub(crate) blobs: HashSet<RemotePath>,
|
||||
}
|
||||
|
||||
pub(crate) async fn upload_remote_data(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
base_prefix_str: &'static str,
|
||||
upload_tasks_count: usize,
|
||||
) -> ControlFlow<Uploads, Uploads> {
|
||||
info!("Creating {upload_tasks_count} remote files");
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
upload_tasks.spawn(async move {
|
||||
let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
|
||||
let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
|
||||
.with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
|
||||
let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let (data, data_len) =
|
||||
upload_stream(format!("remote blob data {i}").into_bytes().into());
|
||||
task_client.upload(data, data_len, &blob_path, None).await?;
|
||||
|
||||
Ok::<_, anyhow::Error>((blob_prefix, blob_path))
|
||||
});
|
||||
}
|
||||
|
||||
let mut upload_tasks_failed = false;
|
||||
let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
|
||||
let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
|
||||
while let Some(task_run_result) = upload_tasks.join_next().await {
|
||||
match task_run_result
|
||||
.context("task join failed")
|
||||
.and_then(|task_result| task_result.context("upload task failed"))
|
||||
{
|
||||
Ok((upload_prefix, upload_path)) => {
|
||||
uploaded_prefixes.insert(upload_prefix);
|
||||
uploaded_blobs.insert(upload_path);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Upload task failed: {e:?}");
|
||||
upload_tasks_failed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let uploads = Uploads {
|
||||
prefixes: uploaded_prefixes,
|
||||
blobs: uploaded_blobs,
|
||||
};
|
||||
if upload_tasks_failed {
|
||||
ControlFlow::Break(uploads)
|
||||
} else {
|
||||
ControlFlow::Continue(uploads)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn ensure_logging_ready() {
|
||||
LOGGING_DONE.get_or_init(|| {
|
||||
utils::logging::init(
|
||||
utils::logging::LogFormat::Test,
|
||||
utils::logging::TracingErrorLayerEnablement::Disabled,
|
||||
utils::logging::Output::Stdout,
|
||||
)
|
||||
.expect("logging init failed");
|
||||
});
|
||||
}
|
||||
@@ -2,23 +2,23 @@ use std::collections::HashSet;
|
||||
use std::env;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::ops::ControlFlow;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::time::UNIX_EPOCH;
|
||||
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8Path;
|
||||
use futures::stream::Stream;
|
||||
use once_cell::sync::OnceCell;
|
||||
use remote_storage::{
|
||||
AzureConfig, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
|
||||
AzureConfig, Download, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
|
||||
};
|
||||
use test_context::{test_context, AsyncTestContext};
|
||||
use tracing::{debug, info};
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
mod common;
|
||||
|
||||
use common::{
|
||||
cleanup, download_to_vec, ensure_logging_ready, upload_remote_data, upload_simple_remote_data,
|
||||
upload_stream, wrap_stream,
|
||||
};
|
||||
static LOGGING_DONE: OnceCell<()> = OnceCell::new();
|
||||
|
||||
const ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_AZURE_REMOTE_STORAGE";
|
||||
|
||||
@@ -30,7 +30,7 @@ const BASE_PREFIX: &str = "test";
|
||||
/// If real Azure tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
/// default test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
///
/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`]
/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_azure_data`]
/// where
/// * `random_prefix_part` is set for the entire Azure client during the Azure client creation in [`create_azure_client`], to avoid interference between multiple test runs
/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
@@ -97,7 +97,7 @@ async fn azure_pagination_should_work(
|
||||
/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified. Test will skip real code and pass if env vars not set.
|
||||
/// See `Azure_pagination_should_work` for more information.
|
||||
///
|
||||
/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
|
||||
/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_azure_data`]
|
||||
/// Then performs the following queries:
|
||||
/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
|
||||
/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
|
||||
@@ -218,9 +218,18 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res
|
||||
|
||||
ctx.client.upload(data, len, &path, None).await?;
|
||||
|
||||
async fn download_and_compare(dl: Download) -> anyhow::Result<Vec<u8>> {
|
||||
let mut buf = Vec::new();
|
||||
tokio::io::copy_buf(
|
||||
&mut tokio_util::io::StreamReader::new(dl.download_stream),
|
||||
&mut buf,
|
||||
)
|
||||
.await?;
|
||||
Ok(buf)
|
||||
}
|
||||
// Normal download request
|
||||
let dl = ctx.client.download(&path).await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(&buf, &orig);
|
||||
|
||||
// Full range (end specified)
|
||||
@@ -228,12 +237,12 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res
|
||||
.client
|
||||
.download_byte_range(&path, 0, Some(len as u64))
|
||||
.await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(&buf, &orig);
|
||||
|
||||
// partial range (end specified)
|
||||
let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(&buf, &orig[4..10]);
|
||||
|
||||
// partial range (end beyond real end)
|
||||
@@ -241,17 +250,17 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res
|
||||
.client
|
||||
.download_byte_range(&path, 8, Some(len as u64 * 100))
|
||||
.await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(&buf, &orig[8..]);
|
||||
|
||||
// Partial range (end unspecified)
|
||||
let dl = ctx.client.download_byte_range(&path, 4, None).await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(&buf, &orig[4..]);
|
||||
|
||||
// Full range (end unspecified)
|
||||
let dl = ctx.client.download_byte_range(&path, 0, None).await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(&buf, &orig);
|
||||
|
||||
debug!("Cleanup: deleting file at path {path:?}");
|
||||
@@ -263,6 +272,17 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn ensure_logging_ready() {
|
||||
LOGGING_DONE.get_or_init(|| {
|
||||
utils::logging::init(
|
||||
utils::logging::LogFormat::Test,
|
||||
utils::logging::TracingErrorLayerEnablement::Disabled,
|
||||
utils::logging::Output::Stdout,
|
||||
)
|
||||
.expect("logging init failed");
|
||||
});
|
||||
}
|
||||
|
||||
struct EnabledAzure {
|
||||
client: Arc<GenericRemoteStorage>,
|
||||
base_prefix: &'static str,
|
||||
@@ -332,7 +352,7 @@ impl AsyncTestContext for MaybeEnabledAzureWithTestBlobs {
|
||||
|
||||
let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await;
|
||||
|
||||
match upload_remote_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
|
||||
match upload_azure_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
|
||||
ControlFlow::Continue(uploads) => {
|
||||
info!("Remote objects created successfully");
|
||||
|
||||
@@ -394,7 +414,7 @@ impl AsyncTestContext for MaybeEnabledAzureWithSimpleTestBlobs {
|
||||
|
||||
let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await;
|
||||
|
||||
match upload_simple_remote_data(&enabled.client, upload_tasks_count).await {
|
||||
match upload_simple_azure_data(&enabled.client, upload_tasks_count).await {
|
||||
ControlFlow::Continue(uploads) => {
|
||||
info!("Remote objects created successfully");
|
||||
|
||||
@@ -458,3 +478,166 @@ fn create_azure_client(
|
||||
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
|
||||
))
|
||||
}
|
||||
|
||||
struct Uploads {
|
||||
prefixes: HashSet<RemotePath>,
|
||||
blobs: HashSet<RemotePath>,
|
||||
}
|
||||
|
||||
async fn upload_azure_data(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
base_prefix_str: &'static str,
|
||||
upload_tasks_count: usize,
|
||||
) -> ControlFlow<Uploads, Uploads> {
|
||||
info!("Creating {upload_tasks_count} Azure files");
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
upload_tasks.spawn(async move {
|
||||
let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
|
||||
let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
|
||||
.with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
|
||||
let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
|
||||
task_client.upload(data, len, &blob_path, None).await?;
|
||||
|
||||
Ok::<_, anyhow::Error>((blob_prefix, blob_path))
|
||||
});
|
||||
}
|
||||
|
||||
let mut upload_tasks_failed = false;
|
||||
let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
|
||||
let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
|
||||
while let Some(task_run_result) = upload_tasks.join_next().await {
|
||||
match task_run_result
|
||||
.context("task join failed")
|
||||
.and_then(|task_result| task_result.context("upload task failed"))
|
||||
{
|
||||
Ok((upload_prefix, upload_path)) => {
|
||||
uploaded_prefixes.insert(upload_prefix);
|
||||
uploaded_blobs.insert(upload_path);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Upload task failed: {e:?}");
|
||||
upload_tasks_failed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let uploads = Uploads {
|
||||
prefixes: uploaded_prefixes,
|
||||
blobs: uploaded_blobs,
|
||||
};
|
||||
if upload_tasks_failed {
|
||||
ControlFlow::Break(uploads)
|
||||
} else {
|
||||
ControlFlow::Continue(uploads)
|
||||
}
|
||||
}
|
||||
|
||||
async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<RemotePath>) {
|
||||
info!(
|
||||
"Removing {} objects from the remote storage during cleanup",
|
||||
objects_to_delete.len()
|
||||
);
|
||||
let mut delete_tasks = JoinSet::new();
|
||||
for object_to_delete in objects_to_delete {
|
||||
let task_client = Arc::clone(client);
|
||||
delete_tasks.spawn(async move {
|
||||
debug!("Deleting remote item at path {object_to_delete:?}");
|
||||
task_client
|
||||
.delete(&object_to_delete)
|
||||
.await
|
||||
.with_context(|| format!("{object_to_delete:?} removal"))
|
||||
});
|
||||
}
|
||||
|
||||
while let Some(task_run_result) = delete_tasks.join_next().await {
|
||||
match task_run_result {
|
||||
Ok(task_result) => match task_result {
|
||||
Ok(()) => {}
|
||||
Err(e) => error!("Delete task failed: {e:?}"),
|
||||
},
|
||||
Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Uploads files `folder{j}/blob{i}.txt`. See test description for more details.
|
||||
async fn upload_simple_azure_data(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
upload_tasks_count: usize,
|
||||
) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
|
||||
info!("Creating {upload_tasks_count} Azure files");
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
upload_tasks.spawn(async move {
|
||||
let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
|
||||
let blob_path = RemotePath::new(
|
||||
Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"),
|
||||
)
|
||||
.with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
|
||||
task_client.upload(data, len, &blob_path, None).await?;
|
||||
|
||||
Ok::<_, anyhow::Error>(blob_path)
|
||||
});
|
||||
}
|
||||
|
||||
let mut upload_tasks_failed = false;
|
||||
let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
|
||||
while let Some(task_run_result) = upload_tasks.join_next().await {
|
||||
match task_run_result
|
||||
.context("task join failed")
|
||||
.and_then(|task_result| task_result.context("upload task failed"))
|
||||
{
|
||||
Ok(upload_path) => {
|
||||
uploaded_blobs.insert(upload_path);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Upload task failed: {e:?}");
|
||||
upload_tasks_failed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if upload_tasks_failed {
|
||||
ControlFlow::Break(uploaded_blobs)
|
||||
} else {
|
||||
ControlFlow::Continue(uploaded_blobs)
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: copy-pasted from test_real_s3, can't remember how to share a module which is not compiled
// to binary
|
||||
fn upload_stream(
|
||||
content: std::borrow::Cow<'static, [u8]>,
|
||||
) -> (
|
||||
impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
usize,
|
||||
) {
|
||||
use std::borrow::Cow;
|
||||
|
||||
let content = match content {
|
||||
Cow::Borrowed(x) => Bytes::from_static(x),
|
||||
Cow::Owned(vec) => Bytes::from(vec),
|
||||
};
|
||||
wrap_stream(content)
|
||||
}
|
||||
|
||||
fn wrap_stream(
|
||||
content: bytes::Bytes,
|
||||
) -> (
|
||||
impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
usize,
|
||||
) {
|
||||
let len = content.len();
|
||||
let content = futures::future::ready(Ok(content));
|
||||
|
||||
(futures::stream::once(content), len)
|
||||
}
|
||||
|
||||
@@ -2,23 +2,23 @@ use std::collections::HashSet;
|
||||
use std::env;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::ops::ControlFlow;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::time::UNIX_EPOCH;
|
||||
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8Path;
|
||||
use futures::stream::Stream;
|
||||
use once_cell::sync::OnceCell;
|
||||
use remote_storage::{
|
||||
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
|
||||
};
|
||||
use test_context::{test_context, AsyncTestContext};
|
||||
use tracing::{debug, info};
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
mod common;
|
||||
|
||||
use common::{
|
||||
cleanup, download_to_vec, ensure_logging_ready, upload_remote_data, upload_simple_remote_data,
|
||||
upload_stream, wrap_stream,
|
||||
};
|
||||
static LOGGING_DONE: OnceCell<()> = OnceCell::new();
|
||||
|
||||
const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
|
||||
|
||||
@@ -30,7 +30,7 @@ const BASE_PREFIX: &str = "test";
|
||||
/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
/// default test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
///
/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`]
/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_s3_data`]
/// where
/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid interference between multiple test runs
/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
@@ -95,7 +95,7 @@ async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> any
|
||||
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. Test will skip real code and pass if env vars not set.
|
||||
/// See `s3_pagination_should_work` for more information.
|
||||
///
|
||||
/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
|
||||
/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_s3_data`]
|
||||
/// Then performs the following queries:
|
||||
/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
|
||||
/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
|
||||
@@ -198,65 +198,15 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()>
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test_context(MaybeEnabledS3)]
|
||||
#[tokio::test]
|
||||
async fn s3_upload_download_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
|
||||
let MaybeEnabledS3::Enabled(ctx) = ctx else {
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());
|
||||
|
||||
let (data, len) = wrap_stream(orig.clone());
|
||||
|
||||
ctx.client.upload(data, len, &path, None).await?;
|
||||
|
||||
// Normal download request
|
||||
let dl = ctx.client.download(&path).await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig);
|
||||
|
||||
// Full range (end specified)
|
||||
let dl = ctx
|
||||
.client
|
||||
.download_byte_range(&path, 0, Some(len as u64))
|
||||
.await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig);
|
||||
|
||||
// partial range (end specified)
|
||||
let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig[4..10]);
|
||||
|
||||
// partial range (end beyond real end)
|
||||
let dl = ctx
|
||||
.client
|
||||
.download_byte_range(&path, 8, Some(len as u64 * 100))
|
||||
.await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig[8..]);
|
||||
|
||||
// Partial range (end unspecified)
|
||||
let dl = ctx.client.download_byte_range(&path, 4, None).await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig[4..]);
|
||||
|
||||
// Full range (end unspecified)
|
||||
let dl = ctx.client.download_byte_range(&path, 0, None).await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig);
|
||||
|
||||
debug!("Cleanup: deleting file at path {path:?}");
|
||||
ctx.client
|
||||
.delete(&path)
|
||||
.await
|
||||
.with_context(|| format!("{path:?} removal"))?;
|
||||
|
||||
Ok(())
|
||||
fn ensure_logging_ready() {
|
||||
LOGGING_DONE.get_or_init(|| {
|
||||
utils::logging::init(
|
||||
utils::logging::LogFormat::Test,
|
||||
utils::logging::TracingErrorLayerEnablement::Disabled,
|
||||
utils::logging::Output::Stdout,
|
||||
)
|
||||
.expect("logging init failed");
|
||||
});
|
||||
}
|
||||
|
||||
struct EnabledS3 {
|
||||
@@ -328,7 +278,7 @@ impl AsyncTestContext for MaybeEnabledS3WithTestBlobs {
|
||||
|
||||
let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await;
|
||||
|
||||
match upload_remote_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
|
||||
match upload_s3_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
|
||||
ControlFlow::Continue(uploads) => {
|
||||
info!("Remote objects created successfully");
|
||||
|
||||
@@ -390,7 +340,7 @@ impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {
|
||||
|
||||
let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await;
|
||||
|
||||
match upload_simple_remote_data(&enabled.client, upload_tasks_count).await {
|
||||
match upload_simple_s3_data(&enabled.client, upload_tasks_count).await {
|
||||
ControlFlow::Continue(uploads) => {
|
||||
info!("Remote objects created successfully");
|
||||
|
||||
@@ -453,3 +403,166 @@ fn create_s3_client(
|
||||
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
|
||||
))
|
||||
}
|
||||
|
||||
struct Uploads {
|
||||
prefixes: HashSet<RemotePath>,
|
||||
blobs: HashSet<RemotePath>,
|
||||
}
|
||||
|
||||
async fn upload_s3_data(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
base_prefix_str: &'static str,
|
||||
upload_tasks_count: usize,
|
||||
) -> ControlFlow<Uploads, Uploads> {
|
||||
info!("Creating {upload_tasks_count} S3 files");
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
upload_tasks.spawn(async move {
|
||||
let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
|
||||
let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
|
||||
.with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
|
||||
let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let (data, data_len) =
|
||||
upload_stream(format!("remote blob data {i}").into_bytes().into());
|
||||
task_client.upload(data, data_len, &blob_path, None).await?;
|
||||
|
||||
Ok::<_, anyhow::Error>((blob_prefix, blob_path))
|
||||
});
|
||||
}
|
||||
|
||||
let mut upload_tasks_failed = false;
|
||||
let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
|
||||
let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
|
||||
while let Some(task_run_result) = upload_tasks.join_next().await {
|
||||
match task_run_result
|
||||
.context("task join failed")
|
||||
.and_then(|task_result| task_result.context("upload task failed"))
|
||||
{
|
||||
Ok((upload_prefix, upload_path)) => {
|
||||
uploaded_prefixes.insert(upload_prefix);
|
||||
uploaded_blobs.insert(upload_path);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Upload task failed: {e:?}");
|
||||
upload_tasks_failed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let uploads = Uploads {
|
||||
prefixes: uploaded_prefixes,
|
||||
blobs: uploaded_blobs,
|
||||
};
|
||||
if upload_tasks_failed {
|
||||
ControlFlow::Break(uploads)
|
||||
} else {
|
||||
ControlFlow::Continue(uploads)
|
||||
}
|
||||
}
|
||||
|
||||
async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<RemotePath>) {
|
||||
info!(
|
||||
"Removing {} objects from the remote storage during cleanup",
|
||||
objects_to_delete.len()
|
||||
);
|
||||
let mut delete_tasks = JoinSet::new();
|
||||
for object_to_delete in objects_to_delete {
|
||||
let task_client = Arc::clone(client);
|
||||
delete_tasks.spawn(async move {
|
||||
debug!("Deleting remote item at path {object_to_delete:?}");
|
||||
task_client
|
||||
.delete(&object_to_delete)
|
||||
.await
|
||||
.with_context(|| format!("{object_to_delete:?} removal"))
|
||||
});
|
||||
}
|
||||
|
||||
while let Some(task_run_result) = delete_tasks.join_next().await {
|
||||
match task_run_result {
|
||||
Ok(task_result) => match task_result {
|
||||
Ok(()) => {}
|
||||
Err(e) => error!("Delete task failed: {e:?}"),
|
||||
},
|
||||
Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Uploads files `folder{j}/blob{i}.txt`. See test description for more details.
|
||||
async fn upload_simple_s3_data(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
upload_tasks_count: usize,
|
||||
) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
|
||||
info!("Creating {upload_tasks_count} S3 files");
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
upload_tasks.spawn(async move {
|
||||
let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
|
||||
let blob_path = RemotePath::new(
|
||||
Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"),
|
||||
)
|
||||
.with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let (data, data_len) =
|
||||
upload_stream(format!("remote blob data {i}").into_bytes().into());
|
||||
task_client.upload(data, data_len, &blob_path, None).await?;
|
||||
|
||||
Ok::<_, anyhow::Error>(blob_path)
|
||||
});
|
||||
}
|
||||
|
||||
let mut upload_tasks_failed = false;
|
||||
let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
|
||||
while let Some(task_run_result) = upload_tasks.join_next().await {
|
||||
match task_run_result
|
||||
.context("task join failed")
|
||||
.and_then(|task_result| task_result.context("upload task failed"))
|
||||
{
|
||||
Ok(upload_path) => {
|
||||
uploaded_blobs.insert(upload_path);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Upload task failed: {e:?}");
|
||||
upload_tasks_failed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if upload_tasks_failed {
|
||||
ControlFlow::Break(uploaded_blobs)
|
||||
} else {
|
||||
ControlFlow::Continue(uploaded_blobs)
|
||||
}
|
||||
}
|
||||
|
||||
fn upload_stream(
|
||||
content: std::borrow::Cow<'static, [u8]>,
|
||||
) -> (
|
||||
impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
usize,
|
||||
) {
|
||||
use std::borrow::Cow;
|
||||
|
||||
let content = match content {
|
||||
Cow::Borrowed(x) => Bytes::from_static(x),
|
||||
Cow::Owned(vec) => Bytes::from(vec),
|
||||
};
|
||||
wrap_stream(content)
|
||||
}
|
||||
|
||||
fn wrap_stream(
|
||||
content: bytes::Bytes,
|
||||
) -> (
|
||||
impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
usize,
|
||||
) {
|
||||
let len = content.len();
|
||||
let content = futures::future::ready(Ok(content));
|
||||
|
||||
(futures::stream::once(content), len)
|
||||
}
|
||||
|
||||
@@ -51,9 +51,3 @@ pub struct SkTimelineInfo {
|
||||
#[serde(default)]
|
||||
pub http_connstr: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct TimelineCopyRequest {
|
||||
pub target_timeline_id: TimelineId,
|
||||
pub until_lsn: Lsn,
|
||||
}
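
A hedged construction sketch for the new request type (not part of the diff; `TimelineId::generate()` is assumed to exist as it does for `TenantId`, the LSN literal is arbitrary, and `serde_json` is used only for illustration).

// Sketch only: build a copy request and serialize it as an HTTP request body.
fn example_copy_request_body() -> anyhow::Result<String> {
    let req = TimelineCopyRequest {
        target_timeline_id: TimelineId::generate(),
        until_lsn: Lsn(0x169AD58),
    };
    Ok(serde_json::to_string(&req)?)
}
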
|
||||
|
||||
@@ -4,12 +4,6 @@ version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
default = []
# Enables test-only APIs, including failpoints. In particular, enables the `fail_point!` macro,
# which adds some runtime cost to run tests on outage conditions
testing = ["fail/failpoints"]

[dependencies]
|
||||
arc-swap.workspace = true
|
||||
sentry.workspace = true
|
||||
@@ -22,7 +16,6 @@ chrono.workspace = true
|
||||
heapless.workspace = true
|
||||
hex = { workspace = true, features = ["serde"] }
|
||||
hyper = { workspace = true, features = ["full"] }
|
||||
fail.workspace = true
|
||||
futures = { workspace = true}
|
||||
jsonwebtoken.workspace = true
|
||||
nix.workspace = true
|
||||
|
||||
@@ -31,9 +31,6 @@ pub enum ApiError {
|
||||
#[error("Shutting down")]
|
||||
ShuttingDown,
|
||||
|
||||
#[error("Timeout")]
|
||||
Timeout(Cow<'static, str>),
|
||||
|
||||
#[error(transparent)]
|
||||
InternalServerError(anyhow::Error),
|
||||
}
|
||||
@@ -70,10 +67,6 @@ impl ApiError {
|
||||
err.to_string(),
|
||||
StatusCode::SERVICE_UNAVAILABLE,
|
||||
),
|
||||
ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status(
|
||||
err.to_string(),
|
||||
StatusCode::REQUEST_TIMEOUT,
|
||||
),
|
||||
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
|
||||
err.to_string(),
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
|
||||
@@ -83,10 +83,6 @@ pub mod timeout;
|
||||
|
||||
pub mod sync;
|
||||
|
||||
pub mod failpoint_support;
|
||||
|
||||
pub mod yielding_loop;
|
||||
|
||||
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
||||
///
|
||||
/// we have several cases:
|
||||
|
||||
@@ -366,49 +366,6 @@ impl MonotonicCounter<Lsn> for RecordLsn {
|
||||
}
|
||||
}
|
||||
|
||||
/// Implements [`rand::distributions::uniform::UniformSampler`] so we can sample [`Lsn`]s.
|
||||
///
|
||||
/// This is used by the `pagebench` pageserver benchmarking tool.
|
||||
pub struct LsnSampler(<u64 as rand::distributions::uniform::SampleUniform>::Sampler);
|
||||
|
||||
impl rand::distributions::uniform::SampleUniform for Lsn {
|
||||
type Sampler = LsnSampler;
|
||||
}
|
||||
|
||||
impl rand::distributions::uniform::UniformSampler for LsnSampler {
|
||||
type X = Lsn;
|
||||
|
||||
fn new<B1, B2>(low: B1, high: B2) -> Self
|
||||
where
|
||||
B1: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
|
||||
B2: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
|
||||
{
|
||||
Self(
|
||||
<u64 as rand::distributions::uniform::SampleUniform>::Sampler::new(
|
||||
low.borrow().0,
|
||||
high.borrow().0,
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
fn new_inclusive<B1, B2>(low: B1, high: B2) -> Self
|
||||
where
|
||||
B1: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
|
||||
B2: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
|
||||
{
|
||||
Self(
|
||||
<u64 as rand::distributions::uniform::SampleUniform>::Sampler::new_inclusive(
|
||||
low.borrow().0,
|
||||
high.borrow().0,
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
fn sample<R: rand::prelude::Rng + ?Sized>(&self, rng: &mut R) -> Self::X {
|
||||
Lsn(self.0.sample(rng))
|
||||
}
|
||||
}
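
With the sampler above in place, an `Lsn` can be drawn from a range like any primitive integer. A minimal sketch, assuming the `rand` 0.8 `gen_range` API:

// Sketch only: pick a random LSN in [low, high) using the sampler defined above.
fn random_lsn(low: Lsn, high: Lsn) -> Lsn {
    use rand::Rng;
    rand::thread_rng().gen_range(low..high)
}
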
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::bin_ser::BeSer;
|
||||
|
||||
@@ -15,12 +15,6 @@ pub struct Gate {
|
||||
name: String,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for Gate {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "Gate<{}>", self.name)
|
||||
}
|
||||
}
|
||||
|
||||
/// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
|
||||
/// not complete.
|
||||
#[derive(Debug)]
|
||||
|
||||
@@ -2,11 +2,8 @@ use std::time::Duration;
|
||||
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum TimeoutCancellableError {
|
||||
#[error("Timed out")]
|
||||
Timeout,
|
||||
#[error("Cancelled")]
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
|
||||
@@ -1,35 +0,0 @@
use tokio_util::sync::CancellationToken;

#[derive(thiserror::Error, Debug)]
pub enum YieldingLoopError {
    #[error("Cancelled")]
    Cancelled,
}

/// Helper for long synchronous loops, e.g. over all tenants in the system. Periodically
/// yields to avoid blocking the executor, and after resuming checks the provided
/// cancellation token to drop out promptly on shutdown.
#[inline(always)]
pub async fn yielding_loop<I, T, F>(
    interval: usize,
    cancel: &CancellationToken,
    iter: I,
    mut visitor: F,
) -> Result<(), YieldingLoopError>
where
    I: Iterator<Item = T>,
    F: FnMut(T),
{
    for (i, item) in iter.enumerate() {
        visitor(item);

        if (i + 1) % interval == 0 {
            tokio::task::yield_now().await;
            if cancel.is_cancelled() {
                return Err(YieldingLoopError::Cancelled);
            }
        }
    }

    Ok(())
}
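
A brief usage sketch for the helper above (illustrative, not part of the diff): the visitor runs synchronously for every item, and the loop yields back to the executor every `interval` items so a long scan cannot monopolize a worker thread.

// Sketch only: walk a large synchronous collection under a cancellation token.
async fn count_even(ids: Vec<u64>, cancel: &CancellationToken) -> Result<usize, YieldingLoopError> {
    let mut even = 0;
    // Yield every 1000 items and bail out promptly if shutdown was requested.
    yielding_loop(1000, cancel, ids.into_iter(), |id| {
        if id % 2 == 0 {
            even += 1;
        }
    })
    .await?;
    Ok(even)
}
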
@@ -1,2 +1 @@
|
||||
#include "postgres.h"
|
||||
#include "walproposer.h"
|
||||
|
||||
@@ -1,6 +1,3 @@
|
||||
//! Links with walproposer, pgcommon, pgport and runs bindgen on walproposer.h
|
||||
//! to generate Rust bindings for it.
|
||||
|
||||
use std::{env, path::PathBuf, process::Command};
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
|
||||
@@ -1,6 +1,3 @@
|
||||
//! A C-Rust shim: defines implementation of C walproposer API, assuming wp
|
||||
//! callback_data stores Box to some Rust implementation.
|
||||
|
||||
#![allow(dead_code)]
|
||||
|
||||
use std::ffi::CStr;
|
||||
@@ -8,12 +5,12 @@ use std::ffi::CString;
|
||||
|
||||
use crate::bindings::uint32;
|
||||
use crate::bindings::walproposer_api;
|
||||
use crate::bindings::NeonWALReadResult;
|
||||
use crate::bindings::PGAsyncReadResult;
|
||||
use crate::bindings::PGAsyncWriteResult;
|
||||
use crate::bindings::Safekeeper;
|
||||
use crate::bindings::Size;
|
||||
use crate::bindings::StringInfoData;
|
||||
use crate::bindings::TimeLineID;
|
||||
use crate::bindings::TimestampTz;
|
||||
use crate::bindings::WalProposer;
|
||||
use crate::bindings::WalProposerConnStatusType;
|
||||
@@ -178,11 +175,31 @@ extern "C" fn conn_blocking_write(
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool {
|
||||
extern "C" fn recovery_download(
|
||||
sk: *mut Safekeeper,
|
||||
_timeline: TimeLineID,
|
||||
startpos: XLogRecPtr,
|
||||
endpos: XLogRecPtr,
|
||||
) -> bool {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).recovery_download(&mut (*wp), &mut (*sk))
|
||||
(*api).recovery_download(&mut (*sk), startpos, endpos)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::unnecessary_cast)]
|
||||
extern "C" fn wal_read(
|
||||
sk: *mut Safekeeper,
|
||||
buf: *mut ::std::os::raw::c_char,
|
||||
startptr: XLogRecPtr,
|
||||
count: Size,
|
||||
) {
|
||||
unsafe {
|
||||
let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count);
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).wal_read(&mut (*sk), buf, startptr)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -194,28 +211,11 @@ extern "C" fn wal_reader_allocate(sk: *mut Safekeeper) {
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::unnecessary_cast)]
|
||||
extern "C" fn wal_read(
|
||||
sk: *mut Safekeeper,
|
||||
buf: *mut ::std::os::raw::c_char,
|
||||
startptr: XLogRecPtr,
|
||||
count: Size,
|
||||
_errmsg: *mut *mut ::std::os::raw::c_char,
|
||||
) -> NeonWALReadResult {
|
||||
extern "C" fn free_event_set(wp: *mut WalProposer) {
|
||||
unsafe {
|
||||
let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count);
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
// TODO: errmsg is not forwarded
|
||||
(*api).wal_read(&mut (*sk), buf, startptr)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn wal_reader_events(sk: *mut Safekeeper) -> uint32 {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).wal_reader_events(&mut (*sk))
|
||||
(*api).free_event_set(&mut (*wp));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -235,14 +235,6 @@ extern "C" fn update_event_set(sk: *mut Safekeeper, events: uint32) {
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn active_state_update_event_set(sk: *mut Safekeeper) {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).active_state_update_event_set(&mut (*sk));
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
@@ -251,14 +243,6 @@ extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) {
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn rm_safekeeper_event_set(sk: *mut Safekeeper) {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).rm_safekeeper_event_set(&mut (*sk));
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn wait_event_set(
|
||||
wp: *mut WalProposer,
|
||||
timeout: ::std::os::raw::c_long,
|
||||
@@ -326,6 +310,14 @@ extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLog
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn confirm_wal_streamed(wp: *mut WalProposer, lsn: XLogRecPtr) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).confirm_wal_streamed(&mut (*wp), lsn)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn log_internal(
|
||||
wp: *mut WalProposer,
|
||||
level: ::std::os::raw::c_int,
|
||||
@@ -340,6 +332,14 @@ extern "C" fn log_internal(
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn after_election(wp: *mut WalProposer) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).after_election(&mut (*wp))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum Level {
|
||||
Debug5,
|
||||
@@ -398,20 +398,20 @@ pub(crate) fn create_api() -> walproposer_api {
|
||||
conn_async_write: Some(conn_async_write),
|
||||
conn_blocking_write: Some(conn_blocking_write),
|
||||
recovery_download: Some(recovery_download),
|
||||
wal_reader_allocate: Some(wal_reader_allocate),
|
||||
wal_read: Some(wal_read),
|
||||
wal_reader_events: Some(wal_reader_events),
|
||||
wal_reader_allocate: Some(wal_reader_allocate),
|
||||
free_event_set: Some(free_event_set),
|
||||
init_event_set: Some(init_event_set),
|
||||
update_event_set: Some(update_event_set),
|
||||
active_state_update_event_set: Some(active_state_update_event_set),
|
||||
add_safekeeper_event_set: Some(add_safekeeper_event_set),
|
||||
rm_safekeeper_event_set: Some(rm_safekeeper_event_set),
|
||||
wait_event_set: Some(wait_event_set),
|
||||
strong_random: Some(strong_random),
|
||||
get_redo_start_lsn: Some(get_redo_start_lsn),
|
||||
finish_sync_safekeepers: Some(finish_sync_safekeepers),
|
||||
process_safekeeper_feedback: Some(process_safekeeper_feedback),
|
||||
confirm_wal_streamed: Some(confirm_wal_streamed),
|
||||
log_internal: Some(log_internal),
|
||||
after_election: Some(after_election),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -6,8 +6,8 @@ use utils::id::TenantTimelineId;
|
||||
use crate::{
|
||||
api_bindings::{create_api, take_vec_u8, Level},
|
||||
bindings::{
|
||||
NeonWALReadResult, Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate,
|
||||
WalProposerFree, WalProposerStart,
|
||||
Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, WalProposerFree,
|
||||
WalProposerStart,
|
||||
},
|
||||
};
|
||||
|
||||
@@ -86,19 +86,19 @@ pub trait ApiImpl {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn recovery_download(&self, _wp: &mut WalProposer, _sk: &mut Safekeeper) -> bool {
|
||||
fn recovery_download(&self, _sk: &mut Safekeeper, _startpos: u64, _endpos: u64) -> bool {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn wal_reader_allocate(&self, _sk: &mut Safekeeper) -> NeonWALReadResult {
|
||||
fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) -> NeonWALReadResult {
|
||||
fn wal_reader_allocate(&self, _sk: &mut Safekeeper) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn wal_reader_events(&self, _sk: &mut Safekeeper) -> u32 {
|
||||
fn free_event_set(&self, _wp: &mut WalProposer) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
@@ -110,18 +110,10 @@ pub trait ApiImpl {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn active_state_update_event_set(&self, _sk: &mut Safekeeper) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn add_safekeeper_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn rm_safekeeper_event_set(&self, _sk: &mut Safekeeper) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn wait_event_set(&self, _wp: &mut WalProposer, _timeout_millis: i64) -> WaitResult {
|
||||
todo!()
|
||||
}
|
||||
@@ -142,6 +134,10 @@ pub trait ApiImpl {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn confirm_wal_streamed(&self, _wp: &mut WalProposer, _lsn: u64) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn log_internal(&self, _wp: &mut WalProposer, _level: Level, _msg: &str) {
|
||||
todo!()
|
||||
}
|
||||
@@ -244,7 +240,6 @@ impl Drop for Wrapper {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use core::panic;
|
||||
use std::{
|
||||
cell::Cell,
|
||||
sync::{atomic::AtomicUsize, mpsc::sync_channel},
|
||||
@@ -252,7 +247,7 @@ mod tests {
|
||||
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper};
|
||||
use crate::{api_bindings::Level, walproposer::Wrapper};
|
||||
|
||||
use super::ApiImpl;
|
||||
|
||||
@@ -360,17 +355,12 @@ mod tests {
|
||||
true
|
||||
}
|
||||
|
||||
fn recovery_download(
|
||||
&self,
|
||||
_wp: &mut crate::bindings::WalProposer,
|
||||
_sk: &mut crate::bindings::Safekeeper,
|
||||
) -> bool {
|
||||
true
|
||||
fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) {
|
||||
println!("wal_reader_allocate")
|
||||
}
|
||||
|
||||
fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) -> NeonWALReadResult {
|
||||
println!("wal_reader_allocate");
|
||||
crate::bindings::NeonWALReadResult_NEON_WALREAD_SUCCESS
|
||||
fn free_event_set(&self, _: &mut crate::bindings::WalProposer) {
|
||||
println!("free_event_set")
|
||||
}
|
||||
|
||||
fn init_event_set(&self, _: &mut crate::bindings::WalProposer) {
|
||||
@@ -393,13 +383,6 @@ mod tests {
|
||||
self.wait_events.set(WaitEventsData { sk, event_mask });
|
||||
}
|
||||
|
||||
fn rm_safekeeper_event_set(&self, sk: &mut crate::bindings::Safekeeper) {
|
||||
println!(
|
||||
"rm_safekeeper_event_set, sk={:?}",
|
||||
sk as *mut crate::bindings::Safekeeper
|
||||
);
|
||||
}
|
||||
|
||||
fn wait_event_set(
|
||||
&self,
|
||||
_: &mut crate::bindings::WalProposer,
|
||||
@@ -425,7 +408,7 @@ mod tests {
|
||||
}
|
||||
|
||||
fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) {
|
||||
println!("wp_log[{}] {}", level, msg);
|
||||
println!("walprop_log[{}] {}", level, msg);
|
||||
}
|
||||
|
||||
fn after_election(&self, _wp: &mut crate::bindings::WalProposer) {
|
||||
|
||||
@@ -63,7 +63,6 @@ thiserror.workspace = true
|
||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
|
||||
tokio-io-timeout.workspace = true
|
||||
tokio-postgres.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
tokio-util.workspace = true
|
||||
toml_edit = { workspace = true, features = [ "serde" ] }
|
||||
tracing.workspace = true
|
||||
|
||||
@@ -13,7 +13,6 @@ use bytes::{Buf, Bytes};
|
||||
use pageserver::{
|
||||
config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use utils::{id::TenantId, lsn::Lsn};
|
||||
|
||||
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
|
||||
@@ -27,9 +26,9 @@ fn redo_scenarios(c: &mut Criterion) {
|
||||
|
||||
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
||||
let conf = Box::leak(Box::new(conf));
|
||||
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
|
||||
let tenant_id = TenantId::generate();
|
||||
|
||||
let manager = PostgresRedoManager::new(conf, tenant_shard_id);
|
||||
let manager = PostgresRedoManager::new(conf, tenant_id);
|
||||
|
||||
let manager = Arc::new(manager);
|
||||
|
||||
|
||||
@@ -1,22 +0,0 @@
|
||||
[package]
|
||||
name = "pageserver_client"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
pageserver_api.workspace = true
|
||||
thiserror.workspace = true
|
||||
async-trait.workspace = true
|
||||
reqwest.workspace = true
|
||||
utils.workspace = true
|
||||
serde.workspace = true
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
tokio-postgres.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
tokio.workspace = true
|
||||
futures.workspace = true
|
||||
tokio-util.workspace = true
|
||||
anyhow.workspace = true
|
||||
postgres.workspace = true
|
||||
bytes.workspace = true
|
||||
@@ -1,2 +0,0 @@
|
||||
pub mod mgmt_api;
|
||||
pub mod page_service;
|
||||
@@ -1,214 +0,0 @@
|
||||
use pageserver_api::{models::*, shard::TenantShardId};
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use utils::{
|
||||
http::error::HttpErrorBody,
|
||||
id::{TenantId, TimelineId},
|
||||
};
|
||||
|
||||
pub mod util;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Client {
|
||||
mgmt_api_endpoint: String,
|
||||
authorization_header: Option<String>,
|
||||
client: reqwest::Client,
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum Error {
|
||||
#[error("receive body: {0}")]
|
||||
ReceiveBody(reqwest::Error),
|
||||
|
||||
#[error("receive error body: {0}")]
|
||||
ReceiveErrorBody(String),
|
||||
|
||||
#[error("pageserver API: {0}")]
|
||||
ApiError(String),
|
||||
}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
#[async_trait::async_trait]
|
||||
pub trait ResponseErrorMessageExt: Sized {
|
||||
async fn error_from_body(self) -> Result<Self>;
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ResponseErrorMessageExt for reqwest::Response {
|
||||
async fn error_from_body(mut self) -> Result<Self> {
|
||||
let status = self.status();
|
||||
if !(status.is_client_error() || status.is_server_error()) {
|
||||
return Ok(self);
|
||||
}
|
||||
|
||||
let url = self.url().to_owned();
|
||||
Err(match self.json::<HttpErrorBody>().await {
|
||||
Ok(HttpErrorBody { msg }) => Error::ApiError(msg),
|
||||
Err(_) => {
|
||||
Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), url))
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl Client {
|
||||
pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
|
||||
Self {
|
||||
mgmt_api_endpoint,
|
||||
authorization_header: jwt.map(|jwt| format!("Bearer {jwt}")),
|
||||
client: reqwest::Client::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn list_tenants(&self) -> Result<Vec<pageserver_api::models::TenantInfo>> {
|
||||
let uri = format!("{}/v1/tenant", self.mgmt_api_endpoint);
|
||||
let resp = self.get(&uri).await?;
|
||||
resp.json().await.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn tenant_details(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<pageserver_api::models::TenantDetails> {
|
||||
let uri = format!("{}/v1/tenant/{tenant_id}", self.mgmt_api_endpoint);
|
||||
self.get(uri)
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn list_timelines(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<Vec<pageserver_api::models::TimelineInfo>> {
|
||||
let uri = format!("{}/v1/tenant/{tenant_id}/timeline", self.mgmt_api_endpoint);
|
||||
self.get(&uri)
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn timeline_info(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<pageserver_api::models::TimelineInfo> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
|
||||
self.mgmt_api_endpoint
|
||||
);
|
||||
self.get(&uri)
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn keyspace(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<pageserver_api::models::partitioning::Partitioning> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace",
|
||||
self.mgmt_api_endpoint
|
||||
);
|
||||
self.get(&uri)
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
async fn get<U: IntoUrl>(&self, uri: U) -> Result<reqwest::Response> {
|
||||
self.request(Method::GET, uri, ()).await
|
||||
}
|
||||
|
||||
async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
|
||||
&self,
|
||||
method: Method,
|
||||
uri: U,
|
||||
body: B,
|
||||
) -> Result<reqwest::Response> {
|
||||
let req = self.client.request(method, uri);
|
||||
let req = if let Some(value) = &self.authorization_header {
|
||||
req.header(reqwest::header::AUTHORIZATION, value)
|
||||
} else {
|
||||
req
|
||||
};
|
||||
let res = req.json(&body).send().await.map_err(Error::ReceiveBody)?;
|
||||
let response = res.error_from_body().await?;
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
pub async fn status(&self) -> Result<()> {
|
||||
let uri = format!("{}/v1/status", self.mgmt_api_endpoint);
|
||||
self.get(&uri).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn tenant_create(&self, req: &TenantCreateRequest) -> Result<TenantId> {
|
||||
let uri = format!("{}/v1/tenant", self.mgmt_api_endpoint);
|
||||
self.request(Method::POST, &uri, req)
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
|
||||
let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
|
||||
self.request(Method::PUT, &uri, req).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{}/secondary/download",
|
||||
self.mgmt_api_endpoint, tenant_id
|
||||
);
|
||||
self.request(Method::POST, &uri, ())
|
||||
.await?
|
||||
.error_for_status()
|
||||
.map(|_| ())
|
||||
.map_err(|e| Error::ApiError(format!("{}", e)))
|
||||
}
|
||||
|
||||
pub async fn location_config(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
config: LocationConfig,
|
||||
flush_ms: Option<std::time::Duration>,
|
||||
) -> Result<()> {
|
||||
let req_body = TenantLocationConfigRequest { tenant_id, config };
|
||||
let path = format!(
|
||||
"{}/v1/tenant/{}/location_config",
|
||||
self.mgmt_api_endpoint, tenant_id
|
||||
);
|
||||
let path = if let Some(flush_ms) = flush_ms {
|
||||
format!("{}?flush_ms={}", path, flush_ms.as_millis())
|
||||
} else {
|
||||
path
|
||||
};
|
||||
self.request(Method::PUT, &path, &req_body).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn timeline_create(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
req: &TimelineCreateRequest,
|
||||
) -> Result<TimelineInfo> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{}/timeline",
|
||||
self.mgmt_api_endpoint, tenant_id
|
||||
);
|
||||
self.request(Method::POST, &uri, req)
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
}
|
||||
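For orientation, a minimal sketch of how this management-API client is meant to be driven, assuming only the `Client` methods shown above and the `utils::id` types imported at the top of the file; the endpoint and JWT are placeholders, and the snippet is illustrative rather than part of the change:

use pageserver_client::mgmt_api::Client;

async fn print_timeline_counts(endpoint: &str, jwt: Option<&str>) -> anyhow::Result<()> {
    // Without a JWT the Authorization header is simply omitted.
    let client = Client::new(endpoint.to_string(), jwt);
    client.status().await?; // cheap liveness probe against /v1/status
    for tenant in client.list_tenants().await? {
        let timelines = client.list_timelines(tenant.id.tenant_id).await?;
        println!("{}: {} timeline(s)", tenant.id.tenant_id, timelines.len());
    }
    Ok(())
}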
@@ -1,49 +0,0 @@
//! Helpers to do common higher-level tasks with the [`Client`].

use std::sync::Arc;

use tokio::task::JoinSet;
use utils::id::{TenantId, TenantTimelineId};

use super::Client;

/// Retrieve a list of all of the pageserver's timelines.
///
/// Fails if there are sharded tenants present on the pageserver.
pub async fn get_pageserver_tenant_timelines_unsharded(
    api_client: &Arc<Client>,
) -> anyhow::Result<Vec<TenantTimelineId>> {
    let mut timelines: Vec<TenantTimelineId> = Vec::new();
    let mut tenants: Vec<TenantId> = Vec::new();
    for ti in api_client.list_tenants().await? {
        if !ti.id.is_unsharded() {
            anyhow::bail!(
                "only unsharded tenants are supported at this time: {}",
                ti.id
            );
        }
        tenants.push(ti.id.tenant_id)
    }
    let mut js = JoinSet::new();
    for tenant_id in tenants {
        js.spawn({
            let mgmt_api_client = Arc::clone(api_client);
            async move {
                (
                    tenant_id,
                    mgmt_api_client.tenant_details(tenant_id).await.unwrap(),
                )
            }
        });
    }
    while let Some(res) = js.join_next().await {
        let (tenant_id, details) = res.unwrap();
        for timeline_id in details.timelines {
            timelines.push(TenantTimelineId {
                tenant_id,
                timeline_id,
            });
        }
    }
    Ok(timelines)
}
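A hedged sketch of calling the helper above; the `Arc` wrapping mirrors how the pagebench commands later in this diff hold the client:

use std::sync::Arc;

use pageserver_client::mgmt_api::{self, util::get_pageserver_tenant_timelines_unsharded};

async fn discover_all(endpoint: String) -> anyhow::Result<()> {
    let client = Arc::new(mgmt_api::Client::new(endpoint, None));
    // Bails out if any tenant on the pageserver is sharded.
    let timelines = get_pageserver_tenant_timelines_unsharded(&client).await?;
    println!("found {} timelines", timelines.len());
    Ok(())
}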
@@ -1,144 +0,0 @@
|
||||
use std::pin::Pin;
|
||||
|
||||
use futures::SinkExt;
|
||||
use pageserver_api::{
|
||||
models::{
|
||||
PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest,
|
||||
PagestreamGetPageResponse,
|
||||
},
|
||||
reltag::RelTag,
|
||||
};
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_postgres::CopyOutStream;
|
||||
use tokio_stream::StreamExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
pub struct Client {
|
||||
client: tokio_postgres::Client,
|
||||
cancel_on_client_drop: Option<tokio_util::sync::DropGuard>,
|
||||
conn_task: JoinHandle<()>,
|
||||
}
|
||||
|
||||
pub struct BasebackupRequest {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub lsn: Option<Lsn>,
|
||||
pub gzip: bool,
|
||||
}
|
||||
|
||||
impl Client {
|
||||
pub async fn new(connstring: String) -> anyhow::Result<Self> {
|
||||
let (client, connection) = tokio_postgres::connect(&connstring, postgres::NoTls).await?;
|
||||
|
||||
let conn_task_cancel = CancellationToken::new();
|
||||
let conn_task = tokio::spawn({
|
||||
let conn_task_cancel = conn_task_cancel.clone();
|
||||
async move {
|
||||
tokio::select! {
|
||||
_ = conn_task_cancel.cancelled() => { }
|
||||
res = connection => {
|
||||
res.unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
Ok(Self {
|
||||
cancel_on_client_drop: Some(conn_task_cancel.drop_guard()),
|
||||
conn_task,
|
||||
client,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn pagestream(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> anyhow::Result<PagestreamClient> {
|
||||
let copy_both: tokio_postgres::CopyBothDuplex<bytes::Bytes> = self
|
||||
.client
|
||||
.copy_both_simple(&format!("pagestream {tenant_id} {timeline_id}"))
|
||||
.await?;
|
||||
let Client {
|
||||
cancel_on_client_drop,
|
||||
conn_task,
|
||||
client: _,
|
||||
} = self;
|
||||
Ok(PagestreamClient {
|
||||
copy_both: Box::pin(copy_both),
|
||||
conn_task,
|
||||
cancel_on_client_drop,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn basebackup(&self, req: &BasebackupRequest) -> anyhow::Result<CopyOutStream> {
|
||||
let BasebackupRequest {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
gzip,
|
||||
} = req;
|
||||
let mut args = Vec::with_capacity(5);
|
||||
args.push("basebackup".to_string());
|
||||
args.push(format!("{tenant_id}"));
|
||||
args.push(format!("{timeline_id}"));
|
||||
if let Some(lsn) = lsn {
|
||||
args.push(format!("{lsn}"));
|
||||
}
|
||||
if *gzip {
|
||||
args.push("--gzip".to_string())
|
||||
}
|
||||
Ok(self.client.copy_out(&args.join(" ")).await?)
|
||||
}
|
||||
}
|
||||
|
||||
/// Create using [`Client::pagestream`].
|
||||
pub struct PagestreamClient {
|
||||
copy_both: Pin<Box<tokio_postgres::CopyBothDuplex<bytes::Bytes>>>,
|
||||
cancel_on_client_drop: Option<tokio_util::sync::DropGuard>,
|
||||
conn_task: JoinHandle<()>,
|
||||
}
|
||||
|
||||
pub struct RelTagBlockNo {
|
||||
pub rel_tag: RelTag,
|
||||
pub block_no: u32,
|
||||
}
|
||||
|
||||
impl PagestreamClient {
|
||||
pub async fn shutdown(mut self) {
|
||||
let _ = self.cancel_on_client_drop.take();
|
||||
self.conn_task.await.unwrap();
|
||||
}
|
||||
|
||||
pub async fn getpage(
|
||||
&mut self,
|
||||
req: PagestreamGetPageRequest,
|
||||
) -> anyhow::Result<PagestreamGetPageResponse> {
|
||||
let req = PagestreamFeMessage::GetPage(req);
|
||||
let req: bytes::Bytes = req.serialize();
|
||||
// let mut req = tokio_util::io::ReaderStream::new(&req);
|
||||
let mut req = tokio_stream::once(Ok(req));
|
||||
|
||||
self.copy_both.send_all(&mut req).await?;
|
||||
|
||||
let next: Option<Result<bytes::Bytes, _>> = self.copy_both.next().await;
|
||||
let next: bytes::Bytes = next.unwrap()?;
|
||||
|
||||
let msg = PagestreamBeMessage::deserialize(next)?;
|
||||
match msg {
|
||||
PagestreamBeMessage::GetPage(p) => Ok(p),
|
||||
PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e),
|
||||
PagestreamBeMessage::Exists(_)
|
||||
| PagestreamBeMessage::Nblocks(_)
|
||||
| PagestreamBeMessage::DbSize(_) => {
|
||||
anyhow::bail!(
|
||||
"unexpected be message kind in response to getpage request: {}",
|
||||
msg.kind()
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
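As a rough usage sketch for the page-service client above (tenant/timeline IDs and the connection string are placeholders; error handling is simplified):

use futures::StreamExt;
use pageserver_client::page_service::{BasebackupRequest, Client};
use utils::id::{TenantId, TimelineId};

async fn fetch_basebackup(
    connstring: String,
    tenant_id: TenantId,
    timeline_id: TimelineId,
) -> anyhow::Result<()> {
    let client = Client::new(connstring).await?;
    // lsn: None requests a basebackup at the latest LSN known to the pageserver.
    let stream = client
        .basebackup(&BasebackupRequest {
            tenant_id,
            timeline_id,
            lsn: None,
            gzip: true,
        })
        .await?;
    let mut stream = Box::pin(stream); // CopyOutStream is not Unpin
    let mut total = 0usize;
    while let Some(chunk) = stream.next().await {
        total += chunk?.len();
    }
    println!("basebackup tar is {total} bytes");
    Ok(())
}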
@@ -1,26 +0,0 @@
[package]
name = "pagebench"
version = "0.1.0"
edition.workspace = true
license.workspace = true

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
anyhow.workspace = true
clap.workspace = true
futures.workspace = true
hdrhistogram.workspace = true
humantime.workspace = true
humantime-serde.workspace = true
rand.workspace = true
serde.workspace = true
serde_json.workspace = true
tracing.workspace = true
tokio.workspace = true

pageserver = { path = ".." }
pageserver_client.workspace = true
pageserver_api.workspace = true
utils = { path = "../../libs/utils/" }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
@@ -1,272 +0,0 @@
|
||||
use anyhow::Context;
|
||||
use pageserver_client::page_service::BasebackupRequest;
|
||||
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use rand::prelude::*;
|
||||
use tokio::sync::Barrier;
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{debug, info, instrument};
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::ops::Range;
|
||||
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::Instant;
|
||||
|
||||
use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
|
||||
use crate::util::{request_stats, tokio_thread_local_stats};
|
||||
|
||||
/// basebackup@LatestLSN
|
||||
#[derive(clap::Parser)]
|
||||
pub(crate) struct Args {
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
mgmt_api_endpoint: String,
|
||||
#[clap(long, default_value = "localhost:64000")]
|
||||
page_service_host_port: String,
|
||||
#[clap(long)]
|
||||
pageserver_jwt: Option<String>,
|
||||
#[clap(long, default_value = "1")]
|
||||
num_clients: NonZeroUsize,
|
||||
#[clap(long, default_value = "1.0")]
|
||||
gzip_probability: f64,
|
||||
#[clap(long)]
|
||||
runtime: Option<humantime::Duration>,
|
||||
#[clap(long)]
|
||||
limit_to_first_n_targets: Option<usize>,
|
||||
targets: Option<Vec<TenantTimelineId>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
struct LiveStats {
|
||||
completed_requests: AtomicU64,
|
||||
}
|
||||
|
||||
impl LiveStats {
|
||||
fn inc(&self) {
|
||||
self.completed_requests.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
struct Target {
|
||||
timeline: TenantTimelineId,
|
||||
lsn_range: Option<Range<Lsn>>,
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
struct Output {
|
||||
total: request_stats::Output,
|
||||
}
|
||||
|
||||
tokio_thread_local_stats::declare!(STATS: request_stats::Stats);
|
||||
|
||||
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
|
||||
tokio_thread_local_stats::main!(STATS, move |thread_local_stats| {
|
||||
main_impl(args, thread_local_stats)
|
||||
})
|
||||
}
|
||||
|
||||
async fn main_impl(
|
||||
args: Args,
|
||||
all_thread_local_stats: AllThreadLocalStats<request_stats::Stats>,
|
||||
) -> anyhow::Result<()> {
|
||||
let args: &'static Args = Box::leak(Box::new(args));
|
||||
|
||||
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
|
||||
args.mgmt_api_endpoint.clone(),
|
||||
args.pageserver_jwt.as_deref(),
|
||||
));
|
||||
|
||||
// discover targets
|
||||
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
||||
&mgmt_api_client,
|
||||
crate::util::cli::targets::Spec {
|
||||
limit_to_first_n_targets: args.limit_to_first_n_targets,
|
||||
targets: args.targets.clone(),
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
let mut js = JoinSet::new();
|
||||
for timeline in &timelines {
|
||||
js.spawn({
|
||||
let timeline = *timeline;
|
||||
// FIXME: this triggers initial logical size calculation
|
||||
// https://github.com/neondatabase/neon/issues/6168
|
||||
let info = mgmt_api_client
|
||||
.timeline_info(timeline.tenant_id, timeline.timeline_id)
|
||||
.await
|
||||
.unwrap();
|
||||
async move {
|
||||
anyhow::Ok(Target {
|
||||
timeline,
|
||||
// TODO: support lsn_range != latest LSN
|
||||
lsn_range: Some(info.last_record_lsn..(info.last_record_lsn + 1)),
|
||||
})
|
||||
}
|
||||
});
|
||||
}
|
||||
let mut all_targets: Vec<Target> = Vec::new();
|
||||
while let Some(res) = js.join_next().await {
|
||||
all_targets.push(res.unwrap().unwrap());
|
||||
}
|
||||
|
||||
let live_stats = Arc::new(LiveStats::default());
|
||||
|
||||
let num_client_tasks = timelines.len();
|
||||
let num_live_stats_dump = 1;
|
||||
let num_work_sender_tasks = 1;
|
||||
|
||||
let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
|
||||
num_client_tasks + num_live_stats_dump + num_work_sender_tasks,
|
||||
));
|
||||
let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks));
|
||||
|
||||
tokio::spawn({
|
||||
let stats = Arc::clone(&live_stats);
|
||||
let start_work_barrier = Arc::clone(&start_work_barrier);
|
||||
async move {
|
||||
start_work_barrier.wait().await;
|
||||
loop {
|
||||
let start = std::time::Instant::now();
|
||||
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
|
||||
let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
|
||||
let elapsed = start.elapsed();
|
||||
info!(
|
||||
"RPS: {:.0}",
|
||||
completed_requests as f64 / elapsed.as_secs_f64()
|
||||
);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
let mut work_senders = HashMap::new();
|
||||
let mut tasks = Vec::new();
|
||||
for tl in &timelines {
|
||||
let (sender, receiver) = tokio::sync::mpsc::channel(1); // TODO: not sure what the implications of this are
|
||||
work_senders.insert(tl, sender);
|
||||
tasks.push(tokio::spawn(client(
|
||||
args,
|
||||
*tl,
|
||||
Arc::clone(&start_work_barrier),
|
||||
receiver,
|
||||
Arc::clone(&all_work_done_barrier),
|
||||
Arc::clone(&live_stats),
|
||||
)));
|
||||
}
|
||||
|
||||
let work_sender = async move {
|
||||
start_work_barrier.wait().await;
|
||||
loop {
|
||||
let (timeline, work) = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let target = all_targets.choose(&mut rng).unwrap();
|
||||
let lsn = target.lsn_range.clone().map(|r| rng.gen_range(r));
|
||||
(
|
||||
target.timeline,
|
||||
Work {
|
||||
lsn,
|
||||
gzip: rng.gen_bool(args.gzip_probability),
|
||||
},
|
||||
)
|
||||
};
|
||||
let sender = work_senders.get(&timeline).unwrap();
|
||||
// TODO: what if this blocks?
|
||||
sender.send(work).await.ok().unwrap();
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(runtime) = args.runtime {
|
||||
match tokio::time::timeout(runtime.into(), work_sender).await {
|
||||
Ok(()) => unreachable!("work sender never terminates"),
|
||||
Err(_timeout) => {
|
||||
// this implicitly drops the work_senders, making all the clients exit
|
||||
}
|
||||
}
|
||||
} else {
|
||||
work_sender.await;
|
||||
unreachable!("work sender never terminates");
|
||||
}
|
||||
|
||||
for t in tasks {
|
||||
t.await.unwrap();
|
||||
}
|
||||
|
||||
let output = Output {
|
||||
total: {
|
||||
let mut agg_stats = request_stats::Stats::new();
|
||||
for stats in all_thread_local_stats.lock().unwrap().iter() {
|
||||
let stats = stats.lock().unwrap();
|
||||
agg_stats.add(&stats);
|
||||
}
|
||||
agg_stats.output()
|
||||
},
|
||||
};
|
||||
|
||||
let output = serde_json::to_string_pretty(&output).unwrap();
|
||||
println!("{output}");
|
||||
|
||||
anyhow::Ok(())
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
struct Work {
|
||||
lsn: Option<Lsn>,
|
||||
gzip: bool,
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn client(
|
||||
args: &'static Args,
|
||||
timeline: TenantTimelineId,
|
||||
start_work_barrier: Arc<Barrier>,
|
||||
mut work: tokio::sync::mpsc::Receiver<Work>,
|
||||
all_work_done_barrier: Arc<Barrier>,
|
||||
live_stats: Arc<LiveStats>,
|
||||
) {
|
||||
start_work_barrier.wait().await;
|
||||
|
||||
let client = pageserver_client::page_service::Client::new(crate::util::connstring::connstring(
|
||||
&args.page_service_host_port,
|
||||
args.pageserver_jwt.as_deref(),
|
||||
))
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
while let Some(Work { lsn, gzip }) = work.recv().await {
|
||||
let start = Instant::now();
|
||||
let copy_out_stream = client
|
||||
.basebackup(&BasebackupRequest {
|
||||
tenant_id: timeline.tenant_id,
|
||||
timeline_id: timeline.timeline_id,
|
||||
lsn,
|
||||
gzip,
|
||||
})
|
||||
.await
|
||||
.with_context(|| format!("start basebackup for {timeline}"))
|
||||
.unwrap();
|
||||
|
||||
use futures::StreamExt;
|
||||
let size = Arc::new(AtomicUsize::new(0));
|
||||
copy_out_stream
|
||||
.for_each({
|
||||
|r| {
|
||||
let size = Arc::clone(&size);
|
||||
async move {
|
||||
let size = Arc::clone(&size);
|
||||
size.fetch_add(r.unwrap().len(), Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
})
|
||||
.await;
|
||||
debug!("basebackup size is {} bytes", size.load(Ordering::Relaxed));
|
||||
let elapsed = start.elapsed();
|
||||
live_stats.inc();
|
||||
STATS.with(|stats| {
|
||||
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
all_work_done_barrier.wait().await;
|
||||
}
|
||||
@@ -1,351 +0,0 @@
|
||||
use anyhow::Context;
|
||||
use futures::future::join_all;
|
||||
use pageserver::pgdatadir_mapping::key_to_rel_block;
|
||||
use pageserver::repository;
|
||||
use pageserver_api::key::is_rel_block_key;
|
||||
use pageserver_api::models::PagestreamGetPageRequest;
|
||||
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use rand::prelude::*;
|
||||
use tokio::sync::Barrier;
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{info, instrument};
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::future::Future;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::pin::Pin;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
|
||||
use crate::util::{request_stats, tokio_thread_local_stats};
|
||||
|
||||
/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
|
||||
#[derive(clap::Parser)]
|
||||
pub(crate) struct Args {
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
mgmt_api_endpoint: String,
|
||||
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
|
||||
page_service_connstring: String,
|
||||
#[clap(long)]
|
||||
pageserver_jwt: Option<String>,
|
||||
#[clap(long, default_value = "1")]
|
||||
num_clients: NonZeroUsize,
|
||||
#[clap(long)]
|
||||
runtime: Option<humantime::Duration>,
|
||||
#[clap(long)]
|
||||
per_target_rate_limit: Option<usize>,
|
||||
/// Probability for sending `latest=true` in the request (uniform distribution).
|
||||
#[clap(long, default_value = "1")]
|
||||
req_latest_probability: f64,
|
||||
#[clap(long)]
|
||||
limit_to_first_n_targets: Option<usize>,
|
||||
targets: Option<Vec<TenantTimelineId>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
struct LiveStats {
|
||||
completed_requests: AtomicU64,
|
||||
}
|
||||
|
||||
impl LiveStats {
|
||||
fn inc(&self) {
|
||||
self.completed_requests.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct KeyRange {
|
||||
timeline: TenantTimelineId,
|
||||
timeline_lsn: Lsn,
|
||||
start: i128,
|
||||
end: i128,
|
||||
}
|
||||
|
||||
impl KeyRange {
|
||||
fn len(&self) -> i128 {
|
||||
self.end - self.start
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
struct Output {
|
||||
total: request_stats::Output,
|
||||
}
|
||||
|
||||
tokio_thread_local_stats::declare!(STATS: request_stats::Stats);
|
||||
|
||||
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
|
||||
tokio_thread_local_stats::main!(STATS, move |thread_local_stats| {
|
||||
main_impl(args, thread_local_stats)
|
||||
})
|
||||
}
|
||||
|
||||
async fn main_impl(
|
||||
args: Args,
|
||||
all_thread_local_stats: AllThreadLocalStats<request_stats::Stats>,
|
||||
) -> anyhow::Result<()> {
|
||||
let args: &'static Args = Box::leak(Box::new(args));
|
||||
|
||||
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
|
||||
args.mgmt_api_endpoint.clone(),
|
||||
args.pageserver_jwt.as_deref(),
|
||||
));
|
||||
|
||||
// discover targets
|
||||
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
||||
&mgmt_api_client,
|
||||
crate::util::cli::targets::Spec {
|
||||
limit_to_first_n_targets: args.limit_to_first_n_targets,
|
||||
targets: args.targets.clone(),
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
let mut js = JoinSet::new();
|
||||
for timeline in &timelines {
|
||||
js.spawn({
|
||||
let mgmt_api_client = Arc::clone(&mgmt_api_client);
|
||||
let timeline = *timeline;
|
||||
async move {
|
||||
let partitioning = mgmt_api_client
|
||||
.keyspace(timeline.tenant_id, timeline.timeline_id)
|
||||
.await?;
|
||||
let lsn = partitioning.at_lsn;
|
||||
|
||||
let ranges = partitioning
|
||||
.keys
|
||||
.ranges
|
||||
.iter()
|
||||
.filter_map(|r| {
|
||||
let start = r.start;
|
||||
let end = r.end;
|
||||
// filter out non-relblock keys
|
||||
match (is_rel_block_key(&start), is_rel_block_key(&end)) {
|
||||
(true, true) => Some(KeyRange {
|
||||
timeline,
|
||||
timeline_lsn: lsn,
|
||||
start: start.to_i128(),
|
||||
end: end.to_i128(),
|
||||
}),
|
||||
(true, false) | (false, true) => {
|
||||
unimplemented!("split up range")
|
||||
}
|
||||
(false, false) => None,
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
anyhow::Ok(ranges)
|
||||
}
|
||||
});
|
||||
}
|
||||
let mut all_ranges: Vec<KeyRange> = Vec::new();
|
||||
while let Some(res) = js.join_next().await {
|
||||
all_ranges.extend(res.unwrap().unwrap());
|
||||
}
|
||||
|
||||
let live_stats = Arc::new(LiveStats::default());
|
||||
|
||||
let num_client_tasks = timelines.len();
|
||||
let num_live_stats_dump = 1;
|
||||
let num_work_sender_tasks = 1;
|
||||
|
||||
let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
|
||||
num_client_tasks + num_live_stats_dump + num_work_sender_tasks,
|
||||
));
|
||||
let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks));
|
||||
|
||||
tokio::spawn({
|
||||
let stats = Arc::clone(&live_stats);
|
||||
let start_work_barrier = Arc::clone(&start_work_barrier);
|
||||
async move {
|
||||
start_work_barrier.wait().await;
|
||||
loop {
|
||||
let start = std::time::Instant::now();
|
||||
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
|
||||
let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
|
||||
let elapsed = start.elapsed();
|
||||
info!(
|
||||
"RPS: {:.0}",
|
||||
completed_requests as f64 / elapsed.as_secs_f64()
|
||||
);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
let mut work_senders = HashMap::new();
|
||||
let mut tasks = Vec::new();
|
||||
for tl in &timelines {
|
||||
let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are
|
||||
work_senders.insert(tl, sender);
|
||||
tasks.push(tokio::spawn(client(
|
||||
args,
|
||||
*tl,
|
||||
Arc::clone(&start_work_barrier),
|
||||
receiver,
|
||||
Arc::clone(&all_work_done_barrier),
|
||||
Arc::clone(&live_stats),
|
||||
)));
|
||||
}
|
||||
|
||||
let work_sender: Pin<Box<dyn Send + Future<Output = ()>>> = match args.per_target_rate_limit {
|
||||
None => Box::pin(async move {
|
||||
let weights = rand::distributions::weighted::WeightedIndex::new(
|
||||
all_ranges.iter().map(|v| v.len()),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
start_work_barrier.wait().await;
|
||||
|
||||
loop {
|
||||
let (timeline, req) = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let r = &all_ranges[weights.sample(&mut rng)];
|
||||
let key: i128 = rng.gen_range(r.start..r.end);
|
||||
let key = repository::Key::from_i128(key);
|
||||
let (rel_tag, block_no) =
|
||||
key_to_rel_block(key).expect("we filter non-rel-block keys out above");
|
||||
(
|
||||
r.timeline,
|
||||
PagestreamGetPageRequest {
|
||||
latest: rng.gen_bool(args.req_latest_probability),
|
||||
lsn: r.timeline_lsn,
|
||||
rel: rel_tag,
|
||||
blkno: block_no,
|
||||
},
|
||||
)
|
||||
};
|
||||
let sender = work_senders.get(&timeline).unwrap();
|
||||
// TODO: what if this blocks?
|
||||
sender.send(req).await.ok().unwrap();
|
||||
}
|
||||
}),
|
||||
Some(rps_limit) => Box::pin(async move {
|
||||
let period = Duration::from_secs_f64(1.0 / (rps_limit as f64));
|
||||
|
||||
let make_timeline_task: &dyn Fn(
|
||||
TenantTimelineId,
|
||||
)
|
||||
-> Pin<Box<dyn Send + Future<Output = ()>>> = &|timeline| {
|
||||
let sender = work_senders.get(&timeline).unwrap();
|
||||
let ranges: Vec<KeyRange> = all_ranges
|
||||
.iter()
|
||||
.filter(|r| r.timeline == timeline)
|
||||
.cloned()
|
||||
.collect();
|
||||
let weights = rand::distributions::weighted::WeightedIndex::new(
|
||||
ranges.iter().map(|v| v.len()),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
Box::pin(async move {
|
||||
let mut ticker = tokio::time::interval(period);
|
||||
ticker.set_missed_tick_behavior(
|
||||
/* TODO review this choice */
|
||||
tokio::time::MissedTickBehavior::Burst,
|
||||
);
|
||||
loop {
|
||||
ticker.tick().await;
|
||||
let req = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let r = &ranges[weights.sample(&mut rng)];
|
||||
let key: i128 = rng.gen_range(r.start..r.end);
|
||||
let key = repository::Key::from_i128(key);
|
||||
let (rel_tag, block_no) = key_to_rel_block(key)
|
||||
.expect("we filter non-rel-block keys out above");
|
||||
PagestreamGetPageRequest {
|
||||
latest: rng.gen_bool(args.req_latest_probability),
|
||||
lsn: r.timeline_lsn,
|
||||
rel: rel_tag,
|
||||
blkno: block_no,
|
||||
}
|
||||
};
|
||||
sender.send(req).await.ok().unwrap();
|
||||
}
|
||||
})
|
||||
};
|
||||
|
||||
let tasks: Vec<_> = work_senders
|
||||
.keys()
|
||||
.map(|tl| make_timeline_task(**tl))
|
||||
.collect();
|
||||
|
||||
start_work_barrier.wait().await;
|
||||
|
||||
join_all(tasks).await;
|
||||
}),
|
||||
};
|
||||
|
||||
if let Some(runtime) = args.runtime {
|
||||
match tokio::time::timeout(runtime.into(), work_sender).await {
|
||||
Ok(()) => unreachable!("work sender never terminates"),
|
||||
Err(_timeout) => {
|
||||
// this implicitly drops the work_senders, making all the clients exit
|
||||
}
|
||||
}
|
||||
} else {
|
||||
work_sender.await;
|
||||
unreachable!("work sender never terminates");
|
||||
}
|
||||
|
||||
for t in tasks {
|
||||
t.await.unwrap();
|
||||
}
|
||||
|
||||
let output = Output {
|
||||
total: {
|
||||
let mut agg_stats = request_stats::Stats::new();
|
||||
for stats in all_thread_local_stats.lock().unwrap().iter() {
|
||||
let stats = stats.lock().unwrap();
|
||||
agg_stats.add(&stats);
|
||||
}
|
||||
agg_stats.output()
|
||||
},
|
||||
};
|
||||
|
||||
let output = serde_json::to_string_pretty(&output).unwrap();
|
||||
println!("{output}");
|
||||
|
||||
anyhow::Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn client(
|
||||
args: &'static Args,
|
||||
timeline: TenantTimelineId,
|
||||
start_work_barrier: Arc<Barrier>,
|
||||
mut work: tokio::sync::mpsc::Receiver<PagestreamGetPageRequest>,
|
||||
all_work_done_barrier: Arc<Barrier>,
|
||||
live_stats: Arc<LiveStats>,
|
||||
) {
|
||||
start_work_barrier.wait().await;
|
||||
|
||||
let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
|
||||
.await
|
||||
.unwrap();
|
||||
let mut client = client
|
||||
.pagestream(timeline.tenant_id, timeline.timeline_id)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
while let Some(req) = work.recv().await {
|
||||
let start = Instant::now();
|
||||
client
|
||||
.getpage(req)
|
||||
.await
|
||||
.with_context(|| format!("getpage for {timeline}"))
|
||||
.unwrap();
|
||||
let elapsed = start.elapsed();
|
||||
live_stats.inc();
|
||||
STATS.with(|stats| {
|
||||
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
all_work_done_barrier.wait().await;
|
||||
}
|
||||
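The key-selection logic in the work-sender above reduces to: weight every keyspace range by its length, pick a range from that weighted distribution, then pick a key uniformly inside it. A standalone sketch of just that idea, with plain integers standing in for the pageserver `Key` type:

use rand::distributions::{Distribution, WeightedIndex};
use rand::Rng;

/// Pick a value uniformly across a set of disjoint half-open ranges.
fn sample_uniform_key(ranges: &[std::ops::Range<i128>], rng: &mut impl Rng) -> i128 {
    let weights = WeightedIndex::new(ranges.iter().map(|r| r.end - r.start)).unwrap();
    let r = &ranges[weights.sample(rng)];
    rng.gen_range(r.start..r.end)
}

fn main() {
    let ranges = vec![0..100, 1_000..1_010];
    let mut rng = rand::thread_rng();
    // ~91% of samples fall in the first range because it is ten times longer.
    println!("{}", sample_uniform_key(&ranges, &mut rng));
}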
@@ -1,85 +0,0 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use humantime::Duration;
|
||||
use tokio::task::JoinSet;
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
#[derive(clap::Parser)]
|
||||
pub(crate) struct Args {
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
mgmt_api_endpoint: String,
|
||||
#[clap(long, default_value = "localhost:64000")]
|
||||
page_service_host_port: String,
|
||||
#[clap(long)]
|
||||
pageserver_jwt: Option<String>,
|
||||
#[clap(
|
||||
long,
|
||||
help = "if specified, poll mgmt api to check whether init logical size calculation has completed"
|
||||
)]
|
||||
poll_for_completion: Option<Duration>,
|
||||
#[clap(long)]
|
||||
limit_to_first_n_targets: Option<usize>,
|
||||
targets: Option<Vec<TenantTimelineId>>,
|
||||
}
|
||||
|
||||
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
|
||||
let rt = tokio::runtime::Builder::new_multi_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let main_task = rt.spawn(main_impl(args));
|
||||
rt.block_on(main_task).unwrap()
|
||||
}
|
||||
|
||||
async fn main_impl(args: Args) -> anyhow::Result<()> {
|
||||
let args: &'static Args = Box::leak(Box::new(args));
|
||||
|
||||
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
|
||||
args.mgmt_api_endpoint.clone(),
|
||||
args.pageserver_jwt.as_deref(),
|
||||
));
|
||||
|
||||
// discover targets
|
||||
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
||||
&mgmt_api_client,
|
||||
crate::util::cli::targets::Spec {
|
||||
limit_to_first_n_targets: args.limit_to_first_n_targets,
|
||||
targets: args.targets.clone(),
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
// kick it off
|
||||
|
||||
let mut js = JoinSet::new();
|
||||
for tl in timelines {
|
||||
let mgmt_api_client = Arc::clone(&mgmt_api_client);
|
||||
js.spawn(async move {
|
||||
// TODO: API to explicitly trigger initial logical size computation.
|
||||
// Should probably also avoid making it a side effect of timeline details to trigger initial logical size calculation.
|
||||
// => https://github.com/neondatabase/neon/issues/6168
|
||||
let info = mgmt_api_client
|
||||
.timeline_info(tl.tenant_id, tl.timeline_id)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
if let Some(period) = args.poll_for_completion {
|
||||
let mut ticker = tokio::time::interval(period.into());
|
||||
ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
|
||||
let mut info = info;
|
||||
while !info.current_logical_size_is_accurate {
|
||||
ticker.tick().await;
|
||||
info = mgmt_api_client
|
||||
.timeline_info(tl.tenant_id, tl.timeline_id)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
while let Some(res) = js.join_next().await {
|
||||
let _: () = res.unwrap();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,48 +0,0 @@
use clap::Parser;
use utils::logging;

/// Re-usable pieces of code that aren't CLI-specific.
mod util {
    pub(crate) mod connstring;
    pub(crate) mod request_stats;
    #[macro_use]
    pub(crate) mod tokio_thread_local_stats;
    /// Re-usable pieces of CLI-specific code.
    pub(crate) mod cli {
        pub(crate) mod targets;
    }
}

/// The pagebench CLI sub-commands, dispatched in [`main`] below.
mod cmd {
    pub(super) mod basebackup;
    pub(super) mod getpage_latest_lsn;
    pub(super) mod trigger_initial_size_calculation;
}

/// Component-level performance test for pageserver.
#[derive(clap::Parser)]
enum Args {
    Basebackup(cmd::basebackup::Args),
    GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
    TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
}

fn main() {
    logging::init(
        logging::LogFormat::Plain,
        logging::TracingErrorLayerEnablement::Disabled,
        logging::Output::Stderr,
    )
    .unwrap();

    let args = Args::parse();
    match args {
        Args::Basebackup(args) => cmd::basebackup::main(args),
        Args::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args),
        Args::TriggerInitialSizeCalculation(args) => {
            cmd::trigger_initial_size_calculation::main(args)
        }
    }
    .unwrap()
}
@@ -1,34 +0,0 @@
use std::sync::Arc;

use pageserver_client::mgmt_api;
use tracing::info;
use utils::id::TenantTimelineId;

pub(crate) struct Spec {
    pub(crate) limit_to_first_n_targets: Option<usize>,
    pub(crate) targets: Option<Vec<TenantTimelineId>>,
}

pub(crate) async fn discover(
    api_client: &Arc<mgmt_api::Client>,
    spec: Spec,
) -> anyhow::Result<Vec<TenantTimelineId>> {
    let mut timelines = if let Some(targets) = spec.targets {
        targets
    } else {
        mgmt_api::util::get_pageserver_tenant_timelines_unsharded(api_client).await?
    };

    if let Some(limit) = spec.limit_to_first_n_targets {
        timelines.sort(); // for determinism
        timelines.truncate(limit);
        if timelines.len() < limit {
            anyhow::bail!("pageserver has less than limit_to_first_n_targets={limit} tenants");
        }
    }

    info!("timelines:\n{:?}", timelines);
    info!("number of timelines:\n{:?}", timelines.len());

    Ok(timelines)
}
@@ -1,8 +0,0 @@
pub(crate) fn connstring(host_port: &str, jwt: Option<&str>) -> String {
    let colon_and_jwt = if let Some(jwt) = jwt {
        format!(":{jwt}") // TODO: urlescape
    } else {
        String::new()
    };
    format!("postgres://postgres{colon_and_jwt}@{host_port}")
}
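The helper above is pure string concatenation; for example (note the TODO about escaping — a JWT containing `@` or `:` would yield a malformed URL):

#[test]
fn connstring_examples() {
    assert_eq!(
        connstring("localhost:64000", None),
        "postgres://postgres@localhost:64000"
    );
    assert_eq!(
        connstring("localhost:64000", Some("secret-token")),
        "postgres://postgres:secret-token@localhost:64000"
    );
}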
@@ -1,88 +0,0 @@
use std::time::Duration;

use anyhow::Context;

pub(crate) struct Stats {
    latency_histo: hdrhistogram::Histogram<u64>,
}

impl Stats {
    pub(crate) fn new() -> Self {
        Self {
            // Initialize with fixed bounds so that we panic at runtime instead of resizing the histogram,
            // which would skew the benchmark results.
            latency_histo: hdrhistogram::Histogram::new_with_bounds(1, 1_000_000_000, 3).unwrap(),
        }
    }
    pub(crate) fn observe(&mut self, latency: Duration) -> anyhow::Result<()> {
        let micros: u64 = latency
            .as_micros()
            .try_into()
            .context("latency greater than u64")?;
        self.latency_histo
            .record(micros)
            .context("add to histogram")?;
        Ok(())
    }
    pub(crate) fn output(&self) -> Output {
        let latency_percentiles = std::array::from_fn(|idx| {
            let micros = self
                .latency_histo
                .value_at_percentile(LATENCY_PERCENTILES[idx]);
            Duration::from_micros(micros)
        });
        Output {
            request_count: self.latency_histo.len(),
            latency_mean: Duration::from_micros(self.latency_histo.mean() as u64),
            latency_percentiles: LatencyPercentiles {
                latency_percentiles,
            },
        }
    }
    pub(crate) fn add(&mut self, other: &Self) {
        let Self {
            ref mut latency_histo,
        } = self;
        latency_histo.add(&other.latency_histo).unwrap();
    }
}

impl Default for Stats {
    fn default() -> Self {
        Self::new()
    }
}

const LATENCY_PERCENTILES: [f64; 4] = [95.0, 99.00, 99.90, 99.99];

struct LatencyPercentiles {
    latency_percentiles: [Duration; 4],
}

impl serde::Serialize for LatencyPercentiles {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::SerializeMap;
        let mut ser = serializer.serialize_map(Some(LATENCY_PERCENTILES.len()))?;
        // Pair each percentile label with its own measured duration.
        for (p, duration) in LATENCY_PERCENTILES.iter().zip(&self.latency_percentiles) {
            ser.serialize_entry(
                &format!("p{p}"),
                &format!("{}", &humantime::format_duration(*duration)),
            )?;
        }
        ser.end()
    }
}

#[derive(serde::Serialize)]
pub(crate) struct Output {
    request_count: u64,
    #[serde(with = "humantime_serde")]
    latency_mean: Duration,
    latency_percentiles: LatencyPercentiles,
}
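A small sketch of the intended call pattern for `Stats` — per-thread recording followed by aggregation — using only the types defined above:

use std::time::Duration;

fn aggregate_example() -> anyhow::Result<()> {
    let mut thread_a = Stats::new();
    let mut thread_b = Stats::new();
    thread_a.observe(Duration::from_micros(150))?;
    thread_b.observe(Duration::from_millis(2))?;

    // Merge the per-thread histograms and serialize the summary.
    let mut total = Stats::new();
    total.add(&thread_a);
    total.add(&thread_b);
    println!("{}", serde_json::to_string_pretty(&total.output())?);
    Ok(())
}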
@@ -1,45 +0,0 @@
pub(crate) type ThreadLocalStats<T> = Arc<Mutex<T>>;
pub(crate) type AllThreadLocalStats<T> = Arc<Mutex<Vec<ThreadLocalStats<T>>>>;

macro_rules! declare {
    ($THREAD_LOCAL_NAME:ident: $T:ty) => {
        thread_local! {
            pub static $THREAD_LOCAL_NAME: std::cell::RefCell<crate::util::tokio_thread_local_stats::ThreadLocalStats<$T>> = std::cell::RefCell::new(
                std::sync::Arc::new(std::sync::Mutex::new(Default::default()))
            );
        }
    };
}

use std::sync::{Arc, Mutex};

pub(crate) use declare;

macro_rules! main {
    ($THREAD_LOCAL_NAME:ident, $main_impl:expr) => {{
        let main_impl = $main_impl;
        let all = Arc::new(Mutex::new(Vec::new()));

        let rt = tokio::runtime::Builder::new_multi_thread()
            .on_thread_start({
                let all = Arc::clone(&all);
                move || {
                    // pre-initialize the thread local stats by accessing them
                    // (some stats like request_stats::Stats are quite costly to initialize,
                    // we don't want to pay that cost during the measurement period)
                    $THREAD_LOCAL_NAME.with(|stats| {
                        let stats: Arc<_> = Arc::clone(&*stats.borrow());
                        all.lock().unwrap().push(stats);
                    });
                }
            })
            .enable_all()
            .build()
            .unwrap();

        let main_task = rt.spawn(main_impl(all));
        rt.block_on(main_task).unwrap()
    }};
}

pub(crate) use main;
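The macros above implement a simple pattern: each worker thread owns an `Arc<Mutex<T>>` that is registered in a shared vector from the runtime's `on_thread_start` hook, so the main task can aggregate all thread-local state afterwards. A self-contained illustration of that pattern with a plain counter (hypothetical names, no pagebench types):

use std::sync::{Arc, Mutex};

thread_local! {
    static LOCAL: std::cell::RefCell<Arc<Mutex<u64>>> =
        std::cell::RefCell::new(Arc::new(Mutex::new(0)));
}

fn main() {
    let all: Arc<Mutex<Vec<Arc<Mutex<u64>>>>> = Arc::new(Mutex::new(Vec::new()));
    let rt = tokio::runtime::Builder::new_multi_thread()
        .worker_threads(2)
        .on_thread_start({
            let all = Arc::clone(&all);
            // Register this worker thread's counter with the shared registry.
            move || LOCAL.with(|c| all.lock().unwrap().push(Arc::clone(&*c.borrow())))
        })
        .enable_all()
        .build()
        .unwrap();
    rt.block_on(async {
        for _ in 0..100 {
            tokio::spawn(async { LOCAL.with(|c| *c.borrow().lock().unwrap() += 1) })
                .await
                .unwrap();
        }
    });
    let total: u64 = all.lock().unwrap().iter().map(|c| *c.lock().unwrap()).sum();
    println!("total = {total}"); // prints 100
}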
@@ -23,7 +23,6 @@ use tracing::*;
|
||||
use tokio_tar::{Builder, EntryType, Header};
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::pgdatadir_mapping::Version;
|
||||
use crate::tenant::Timeline;
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
|
||||
@@ -175,7 +174,7 @@ where
|
||||
] {
|
||||
for segno in self
|
||||
.timeline
|
||||
.list_slru_segments(kind, Version::Lsn(self.lsn), self.ctx)
|
||||
.list_slru_segments(kind, self.lsn, self.ctx)
|
||||
.await?
|
||||
{
|
||||
self.add_slru_segment(kind, segno).await?;
|
||||
@@ -193,7 +192,7 @@ where
|
||||
// Otherwise only include init forks of unlogged relations.
|
||||
let rels = self
|
||||
.timeline
|
||||
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
|
||||
.list_rels(spcnode, dbnode, self.lsn, self.ctx)
|
||||
.await?;
|
||||
for &rel in rels.iter() {
|
||||
// Send init fork as main fork to provide well formed empty
|
||||
@@ -268,7 +267,7 @@ where
|
||||
async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
|
||||
let nblocks = self
|
||||
.timeline
|
||||
.get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx)
|
||||
.get_rel_size(src, self.lsn, false, self.ctx)
|
||||
.await?;
|
||||
|
||||
// If the relation is empty, create an empty file
|
||||
@@ -289,7 +288,7 @@ where
|
||||
for blknum in startblk..endblk {
|
||||
let img = self
|
||||
.timeline
|
||||
.get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx)
|
||||
.get_rel_page_at_lsn(src, blknum, self.lsn, false, self.ctx)
|
||||
.await?;
|
||||
segment_data.extend_from_slice(&img[..]);
|
||||
}
|
||||
@@ -311,7 +310,7 @@ where
|
||||
async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
|
||||
let nblocks = self
|
||||
.timeline
|
||||
.get_slru_segment_size(slru, segno, Version::Lsn(self.lsn), self.ctx)
|
||||
.get_slru_segment_size(slru, segno, self.lsn, self.ctx)
|
||||
.await?;
|
||||
|
||||
let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
|
||||
@@ -353,7 +352,7 @@ where
|
||||
let relmap_img = if has_relmap_file {
|
||||
let img = self
|
||||
.timeline
|
||||
.get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
|
||||
.get_relmap_file(spcnode, dbnode, self.lsn, self.ctx)
|
||||
.await?;
|
||||
|
||||
ensure!(
|
||||
@@ -400,7 +399,7 @@ where
|
||||
if !has_relmap_file
|
||||
&& self
|
||||
.timeline
|
||||
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
|
||||
.list_rels(spcnode, dbnode, self.lsn, self.ctx)
|
||||
.await?
|
||||
.is_empty()
|
||||
{
|
||||
|
||||
@@ -31,7 +31,6 @@ use pageserver::{
|
||||
virtual_file,
|
||||
};
|
||||
use postgres_backend::AuthType;
|
||||
use utils::failpoint_support;
|
||||
use utils::logging::TracingErrorLayerEnablement;
|
||||
use utils::signals::ShutdownSignals;
|
||||
use utils::{
|
||||
@@ -127,7 +126,7 @@ fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
|
||||
// Initialize up failpoints support
|
||||
let scenario = failpoint_support::init();
|
||||
let scenario = pageserver::failpoint_support::init();
|
||||
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(conf.max_file_descriptors);
|
||||
|
||||
@@ -37,12 +37,10 @@ use crate::tenant::{
|
||||
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
|
||||
};
|
||||
use crate::{
|
||||
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
|
||||
TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
|
||||
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME,
|
||||
TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
|
||||
};
|
||||
|
||||
use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
|
||||
|
||||
pub mod defaults {
|
||||
use crate::tenant::config::defaults::*;
|
||||
use const_format::formatcp;
|
||||
@@ -63,8 +61,6 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_LOG_FORMAT: &str = "plain";
|
||||
|
||||
pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8;
|
||||
|
||||
pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize =
|
||||
super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
|
||||
|
||||
@@ -75,9 +71,6 @@ pub mod defaults {
|
||||
pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
|
||||
|
||||
pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
|
||||
pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1;
|
||||
|
||||
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
|
||||
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
@@ -91,7 +84,6 @@ pub mod defaults {
|
||||
#wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}'
|
||||
#wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}'
|
||||
|
||||
#page_cache_size = {DEFAULT_PAGE_CACHE_SIZE}
|
||||
#max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS}
|
||||
|
||||
# initial superuser role name to use when creating a new tenant
|
||||
@@ -102,7 +94,6 @@ pub mod defaults {
|
||||
#log_format = '{DEFAULT_LOG_FORMAT}'
|
||||
|
||||
#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
|
||||
#concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}'
|
||||
|
||||
#metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}'
|
||||
#cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
|
||||
@@ -112,8 +103,6 @@ pub mod defaults {
|
||||
|
||||
#background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}'
|
||||
|
||||
#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE}
|
||||
|
||||
[tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
|
||||
@@ -131,7 +120,6 @@ pub mod defaults {
|
||||
#gc_feedback = false
|
||||
|
||||
#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
|
||||
#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}
|
||||
|
||||
[remote_storage]
|
||||
|
||||
@@ -192,11 +180,6 @@ pub struct PageServerConf {
|
||||
|
||||
pub log_format: LogFormat,
|
||||
|
||||
/// Number of tenants which will be concurrently loaded from remote storage proactively on startup,
|
||||
/// does not limit tenants loaded in response to client I/O. A lower value implicitly deprioritizes
|
||||
/// loading such tenants, vs. other work in the system.
|
||||
pub concurrent_tenant_warmup: ConfigurableSemaphore,
|
||||
|
||||
/// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
|
||||
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
|
||||
/// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`.
|
||||
@@ -240,13 +223,6 @@ pub struct PageServerConf {
|
||||
/// How many heatmap uploads may be done concurrency: lower values implicitly deprioritize
|
||||
/// heatmap uploads vs. other remote storage operations.
|
||||
pub heatmap_upload_concurrency: usize,
|
||||
|
||||
/// How many remote storage downloads may be done for secondary tenants concurrently. Implicitly
|
||||
/// deprioritises secondary downloads vs. remote storage operations for attached tenants.
|
||||
pub secondary_download_concurrency: usize,
|
||||
|
||||
/// Maximum number of WAL records to be ingested and committed at the same time
|
||||
pub ingest_batch_size: u64,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -307,7 +283,6 @@ struct PageServerConfigBuilder {
|
||||
|
||||
log_format: BuilderValue<LogFormat>,
|
||||
|
||||
concurrent_tenant_warmup: BuilderValue<NonZeroUsize>,
|
||||
concurrent_tenant_size_logical_size_queries: BuilderValue<NonZeroUsize>,
|
||||
|
||||
metric_collection_interval: BuilderValue<Duration>,
|
||||
@@ -328,9 +303,6 @@ struct PageServerConfigBuilder {
|
||||
control_plane_emergency_mode: BuilderValue<bool>,
|
||||
|
||||
heatmap_upload_concurrency: BuilderValue<usize>,
|
||||
secondary_download_concurrency: BuilderValue<usize>,
|
||||
|
||||
ingest_batch_size: BuilderValue<u64>,
|
||||
}
|
||||
|
||||
impl Default for PageServerConfigBuilder {
|
||||
@@ -368,8 +340,6 @@ impl Default for PageServerConfigBuilder {
|
||||
.expect("cannot parse default keepalive interval")),
|
||||
log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
|
||||
|
||||
concurrent_tenant_warmup: Set(NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
|
||||
.expect("Invalid default constant")),
|
||||
concurrent_tenant_size_logical_size_queries: Set(
|
||||
ConfigurableSemaphore::DEFAULT_INITIAL,
|
||||
),
|
||||
@@ -403,9 +373,6 @@ impl Default for PageServerConfigBuilder {
|
||||
control_plane_emergency_mode: Set(false),
|
||||
|
||||
heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
|
||||
secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
|
||||
|
||||
ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -486,10 +453,6 @@ impl PageServerConfigBuilder {
|
||||
self.log_format = BuilderValue::Set(log_format)
|
||||
}
|
||||
|
||||
pub fn concurrent_tenant_warmup(&mut self, u: NonZeroUsize) {
|
||||
self.concurrent_tenant_warmup = BuilderValue::Set(u);
|
||||
}
|
||||
|
||||
pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: NonZeroUsize) {
|
||||
self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
|
||||
}
|
||||
@@ -554,18 +517,7 @@ impl PageServerConfigBuilder {
|
||||
self.heatmap_upload_concurrency = BuilderValue::Set(value)
|
||||
}
|
||||
|
||||
pub fn secondary_download_concurrency(&mut self, value: usize) {
|
||||
self.secondary_download_concurrency = BuilderValue::Set(value)
|
||||
}
|
||||
|
||||
pub fn ingest_batch_size(&mut self, ingest_batch_size: u64) {
|
||||
self.ingest_batch_size = BuilderValue::Set(ingest_batch_size)
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let concurrent_tenant_warmup = self
|
||||
.concurrent_tenant_warmup
|
||||
.ok_or(anyhow!("missing concurrent_tenant_warmup"))?;
|
||||
let concurrent_tenant_size_logical_size_queries = self
|
||||
.concurrent_tenant_size_logical_size_queries
|
||||
.ok_or(anyhow!(
|
||||
@@ -618,7 +570,6 @@ impl PageServerConfigBuilder {
|
||||
.broker_keepalive_interval
|
||||
.ok_or(anyhow!("No broker keepalive interval provided"))?,
|
||||
log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
|
||||
concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup),
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
|
||||
concurrent_tenant_size_logical_size_queries,
|
||||
),
|
||||
@@ -660,15 +611,10 @@ impl PageServerConfigBuilder {
|
||||
control_plane_emergency_mode: self
|
||||
.control_plane_emergency_mode
|
||||
.ok_or(anyhow!("missing control_plane_emergency_mode"))?,
|
||||
|
||||
heatmap_upload_concurrency: self
|
||||
.heatmap_upload_concurrency
|
||||
.ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
|
||||
secondary_download_concurrency: self
|
||||
.secondary_download_concurrency
|
||||
.ok_or(anyhow!("missing secondary_download_concurrency"))?,
|
||||
ingest_batch_size: self
|
||||
.ingest_batch_size
|
||||
.ok_or(anyhow!("missing ingest_batch_size"))?,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -726,11 +672,6 @@ impl PageServerConf {
|
||||
.join(TENANT_LOCATION_CONFIG_NAME)
|
||||
}
|
||||
|
||||
pub(crate) fn tenant_heatmap_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
|
||||
self.tenant_path(tenant_shard_id)
|
||||
.join(TENANT_HEATMAP_BASENAME)
|
||||
}
|
||||
|
||||
pub fn timelines_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
|
||||
self.tenant_path(tenant_shard_id)
|
||||
.join(TIMELINES_SEGMENT_NAME)
|
||||
@@ -866,11 +807,6 @@ impl PageServerConf {
|
||||
"log_format" => builder.log_format(
|
||||
LogFormat::from_config(&parse_toml_string(key, item)?)?
|
||||
),
|
||||
"concurrent_tenant_warmup" => builder.concurrent_tenant_warmup({
|
||||
let input = parse_toml_string(key, item)?;
|
||||
let permits = input.parse::<usize>().context("expected a number of initial permits, not {s:?}")?;
|
||||
NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?
|
||||
}),
|
||||
"concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({
|
||||
let input = parse_toml_string(key, item)?;
|
||||
let permits = input.parse::<usize>().context("expected a number of initial permits, not {s:?}")?;
|
||||
@@ -916,10 +852,6 @@ impl PageServerConf {
|
||||
"heatmap_upload_concurrency" => {
|
||||
builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize)
|
||||
},
|
||||
"secondary_download_concurrency" => {
|
||||
builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize)
|
||||
},
|
||||
"ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?),
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -972,10 +904,6 @@ impl PageServerConf {
|
||||
broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
|
||||
broker_keepalive_interval: Duration::from_secs(5000),
|
||||
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
|
||||
concurrent_tenant_warmup: ConfigurableSemaphore::new(
|
||||
NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
|
||||
.expect("Invalid default constant"),
|
||||
),
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(
|
||||
),
|
||||
@@ -991,8 +919,6 @@ impl PageServerConf {
|
||||
control_plane_api_token: None,
|
||||
control_plane_emergency_mode: false,
|
||||
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
|
||||
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
|
||||
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1196,9 +1122,6 @@ background_task_maximum_delay = '334 s'
|
||||
storage_broker::DEFAULT_KEEPALIVE_INTERVAL
|
||||
)?,
|
||||
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
|
||||
concurrent_tenant_warmup: ConfigurableSemaphore::new(
|
||||
NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap()
|
||||
),
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
eviction_task_immitated_concurrent_logical_size_queries:
|
||||
ConfigurableSemaphore::default(),
|
||||
@@ -1221,9 +1144,7 @@ background_task_maximum_delay = '334 s'
|
||||
control_plane_api: None,
|
||||
control_plane_api_token: None,
|
||||
control_plane_emergency_mode: false,
|
||||
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
|
||||
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
|
||||
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
|
||||
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -1267,9 +1188,6 @@ background_task_maximum_delay = '334 s'
|
||||
broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
|
||||
broker_keepalive_interval: Duration::from_secs(5),
|
||||
log_format: LogFormat::Json,
|
||||
concurrent_tenant_warmup: ConfigurableSemaphore::new(
|
||||
NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap()
|
||||
),
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
eviction_task_immitated_concurrent_logical_size_queries:
|
||||
ConfigurableSemaphore::default(),
|
||||
@@ -1284,9 +1202,7 @@ background_task_maximum_delay = '334 s'
|
||||
control_plane_api: None,
|
||||
control_plane_api_token: None,
|
||||
control_plane_emergency_mode: false,
|
||||
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
|
||||
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
|
||||
ingest_batch_size: 100,
|
||||
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
@@ -1516,7 +1432,6 @@ threshold = "20m"
|
||||
period: Duration::from_secs(10),
|
||||
#[cfg(feature = "testing")]
|
||||
mock_statvfs: None,
|
||||
eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed,
|
||||
})
|
||||
);
|
||||
match &conf.default_tenant_conf.eviction_policy {
|
||||
|
||||
@@ -74,45 +74,6 @@ pub struct DiskUsageEvictionTaskConfig {
    pub period: Duration,
    #[cfg(feature = "testing")]
    pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
    /// Select sorting for evicted layers
    #[serde(default)]
    pub eviction_order: EvictionOrder,
}

/// Selects the sort order for eviction candidates *after* per tenant `min_resident_size`
/// partitioning.
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "type", content = "args")]
pub enum EvictionOrder {
    /// Order the layers to be evicted by how recently they have been accessed in absolute
    /// time.
    ///
    /// This strategy is unfair when some tenants grow faster than others towards the slower
    /// growing.
    #[default]
    AbsoluteAccessed,

    /// Order the layers to be evicted by how recently they have been accessed relatively within
    /// the set of resident layers of a tenant.
    ///
    /// This strategy will evict layers more fairly but is untested.
    RelativeAccessed {
        #[serde(default)]
        highest_layer_count_loses_first: bool,
    },
}

impl EvictionOrder {
    /// Return true, if with [`Self::RelativeAccessed`] order the tenants with the highest layer
    /// counts should be the first ones to have their layers evicted.
    fn highest_layer_count_loses_first(&self) -> bool {
        match self {
            EvictionOrder::AbsoluteAccessed => false,
            EvictionOrder::RelativeAccessed {
                highest_layer_count_loses_first,
            } => *highest_layer_count_loses_first,
        }
    }
}
|
||||
#[derive(Default)]
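
Illustrative aside, not part of the diff: the `EvictionOrder` removed above is an adjacently tagged serde enum, so a configured value travels as a `{"type": ..., "args": ...}` object. A minimal sketch of that wire format, assuming the `serde` and `serde_json` crates:

```rust
// Sketch only: mirrors the enum shown above to demonstrate its JSON shape.
use serde::{Deserialize, Serialize};

#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "type", content = "args")]
enum EvictionOrder {
    #[default]
    AbsoluteAccessed,
    RelativeAccessed {
        #[serde(default)]
        highest_layer_count_loses_first: bool,
    },
}

fn main() {
    // Unit variant: only the tag is emitted.
    println!("{}", serde_json::to_string(&EvictionOrder::AbsoluteAccessed).unwrap());
    // Struct variant: the flag travels under "args" and defaults to false when omitted.
    let parsed: EvictionOrder =
        serde_json::from_str(r#"{"type": "RelativeAccessed", "args": {}}"#).unwrap();
    assert_eq!(
        parsed,
        EvictionOrder::RelativeAccessed { highest_layer_count_loses_first: false }
    );
}
```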
|
||||
@@ -231,14 +192,7 @@ async fn disk_usage_eviction_task_iteration(
|
||||
) -> anyhow::Result<()> {
|
||||
let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
|
||||
.context("get filesystem-level disk usage before evictions")?;
|
||||
let res = disk_usage_eviction_task_iteration_impl(
|
||||
state,
|
||||
storage,
|
||||
usage_pre,
|
||||
task_config.eviction_order,
|
||||
cancel,
|
||||
)
|
||||
.await;
|
||||
let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
|
||||
match res {
|
||||
Ok(outcome) => {
|
||||
debug!(?outcome, "disk_usage_eviction_iteration finished");
|
||||
@@ -324,7 +278,6 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
state: &State,
|
||||
_storage: &GenericRemoteStorage,
|
||||
usage_pre: U,
|
||||
eviction_order: EvictionOrder,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<IterationOutcome<U>> {
|
||||
// use tokio's mutex to get a Sync guard (instead of std::sync::Mutex)
|
||||
@@ -344,7 +297,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
"running disk usage based eviction due to pressure"
|
||||
);
|
||||
|
||||
let candidates = match collect_eviction_candidates(eviction_order, cancel).await? {
|
||||
let candidates = match collect_eviction_candidates(cancel).await? {
|
||||
EvictionCandidates::Cancelled => {
|
||||
return Ok(IterationOutcome::Cancelled);
|
||||
}
|
||||
@@ -354,16 +307,16 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
// Debug-log the list of candidates
|
||||
let now = SystemTime::now();
|
||||
for (i, (partition, candidate)) in candidates.iter().enumerate() {
|
||||
let nth = i + 1;
|
||||
let desc = candidate.layer.layer_desc();
|
||||
let total_candidates = candidates.len();
|
||||
let size = desc.file_size;
|
||||
let rel = candidate.relative_last_activity;
|
||||
debug!(
|
||||
"cand {nth}/{total_candidates}: size={size}, rel_last_activity={rel}, no_access_for={}us, partition={partition:?}, {}/{}/{}",
|
||||
"cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}",
|
||||
i + 1,
|
||||
candidates.len(),
|
||||
desc.file_size,
|
||||
now.duration_since(candidate.last_activity_ts)
|
||||
.unwrap()
|
||||
.as_micros(),
|
||||
partition,
|
||||
desc.tenant_shard_id,
|
||||
desc.timeline_id,
|
||||
candidate.layer,
|
||||
@@ -506,7 +459,6 @@ struct EvictionCandidate {
|
||||
timeline: Arc<Timeline>,
|
||||
layer: Layer,
|
||||
last_activity_ts: SystemTime,
|
||||
relative_last_activity: finite_f32::FiniteF32,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
|
||||
@@ -526,24 +478,24 @@ enum EvictionCandidates {
|
||||
/// order. A caller that evicts in that order, until pressure is relieved, implements
|
||||
/// the eviction policy outlined in the module comment.
|
||||
///
|
||||
/// # Example with EvictionOrder::AbsoluteAccessed
|
||||
/// # Example
|
||||
///
|
||||
/// Imagine that there are two tenants, A and B, with five layers each, a-e.
|
||||
/// Each layer has size 100, and both tenant's min_resident_size is 150.
|
||||
/// The eviction order would be
|
||||
///
|
||||
/// ```text
|
||||
/// partition last_activity_ts tenant/layer
|
||||
/// Above 18:30 A/c
|
||||
/// Above 19:00 A/b
|
||||
/// Above 18:29 B/c
|
||||
/// Above 19:05 B/b
|
||||
/// Above 20:00 B/a
|
||||
/// Above 20:03 A/a
|
||||
/// Below 20:30 A/d
|
||||
/// Below 20:40 B/d
|
||||
/// Below 20:45 B/e
|
||||
/// Below 20:58 A/e
|
||||
/// partition last_activity_ts tenant/layer
|
||||
/// Above 18:30 A/c
|
||||
/// Above 19:00 A/b
|
||||
/// Above 18:29 B/c
|
||||
/// Above 19:05 B/b
|
||||
/// Above 20:00 B/a
|
||||
/// Above 20:03 A/a
|
||||
/// Below 20:30 A/d
|
||||
/// Below 20:40 B/d
|
||||
/// Below 20:45 B/e
|
||||
/// Below 20:58 A/e
|
||||
/// ```
|
||||
///
|
||||
/// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`.
|
||||
@@ -553,77 +505,7 @@ enum EvictionCandidates {
|
||||
/// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition
|
||||
/// after exhausting the `Above` partition.
|
||||
/// So, we did not respect each tenant's min_resident_size.
|
||||
///
|
||||
/// # Example with EvictionOrder::RelativeAccessed
|
||||
///
|
||||
/// ```text
|
||||
/// partition relative_age last_activity_ts tenant/layer
|
||||
/// Above 0/4 18:30 A/c
|
||||
/// Above 0/4 18:29 B/c
|
||||
/// Above 1/4 19:00 A/b
|
||||
/// Above 1/4 19:05 B/b
|
||||
/// Above 2/4 20:00 B/a
|
||||
/// Above 2/4 20:03 A/a
|
||||
/// Below 3/4 20:30 A/d
|
||||
/// Below 3/4 20:40 B/d
|
||||
/// Below 4/4 20:45 B/e
|
||||
/// Below 4/4 20:58 A/e
|
||||
/// ```
|
||||
///
|
||||
/// With tenants having the same number of layers the picture does not change much. The same with
|
||||
/// A having many more layers **resident** (not all of them listed):
|
||||
///
|
||||
/// ```text
|
||||
/// Above 0/100 18:30 A/c
|
||||
/// Above 0/4 18:29 B/c
|
||||
/// Above 1/100 19:00 A/b
|
||||
/// Above 2/100 20:03 A/a
|
||||
/// Above 3/100 20:03 A/nth_3
|
||||
/// Above 4/100 20:03 A/nth_4
|
||||
/// ...
|
||||
/// Above 1/4 19:05 B/b
|
||||
/// Above 25/100 20:04 A/nth_25
|
||||
/// ...
|
||||
/// Above 2/4 20:00 B/a
|
||||
/// Above 50/100 20:10 A/nth_50
|
||||
/// ...
|
||||
/// Below 3/4 20:40 B/d
|
||||
/// Below 99/100 20:30 A/nth_99
|
||||
/// Below 4/4 20:45 B/e
|
||||
/// Below 100/100 20:58 A/nth_100
|
||||
/// ```
|
||||
///
|
||||
/// Now it's easier to see that because A has grown fast it has more layers to get evicted. What is
|
||||
/// difficult to see is what happens on the next round assuming the evicting 23 from the above list
|
||||
/// relieves the pressure (22 A layers gone, 1 B layers gone) but a new fast growing tenant C has
|
||||
/// appeared:
|
||||
///
|
||||
/// ```text
|
||||
/// Above 0/87 20:04 A/nth_23
|
||||
/// Above 0/3 19:05 B/b
|
||||
/// Above 0/50 20:59 C/nth_0
|
||||
/// Above 1/87 20:04 A/nth_24
|
||||
/// Above 1/50 21:00 C/nth_1
|
||||
/// Above 2/87 20:04 A/nth_25
|
||||
/// ...
|
||||
/// Above 16/50 21:02 C/nth_16
|
||||
/// Above 1/3 20:00 B/a
|
||||
/// Above 27/87 20:10 A/nth_50
|
||||
/// ...
|
||||
/// Below 2/3 20:40 B/d
|
||||
/// Below 49/50 21:05 C/nth_49
|
||||
/// Below 86/87 20:30 A/nth_99
|
||||
/// Below 3/3 20:45 B/e
|
||||
/// Below 50/50 21:05 C/nth_50
|
||||
/// Below 87/87 20:58 A/nth_100
|
||||
/// ```
|
||||
///
|
||||
/// Now relieving pressure with 23 layers would cost:
|
||||
/// - tenant A 14 layers
|
||||
/// - tenant B 1 layer
|
||||
/// - tenant C 8 layers
|
||||
async fn collect_eviction_candidates(
|
||||
eviction_order: EvictionOrder,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<EvictionCandidates> {
|
||||
// get a snapshot of the list of tenants
|
||||
@@ -709,63 +591,12 @@ async fn collect_eviction_candidates(
|
||||
tenant_candidates
|
||||
.sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts));
|
||||
let mut cumsum: i128 = 0;
|
||||
|
||||
// keeping the -1 or not decides if every tenant should lose their least recently accessed
|
||||
// layer OR if this should happen in the order of having highest layer count:
|
||||
let fudge = if eviction_order.highest_layer_count_loses_first() {
|
||||
// relative_age vs. tenant layer count:
|
||||
// - 0.1..=1.0 (10 layers)
|
||||
// - 0.01..=1.0 (100 layers)
|
||||
// - 0.001..=1.0 (1000 layers)
|
||||
//
|
||||
// leading to evicting less of the smallest tenants.
|
||||
0
|
||||
} else {
|
||||
// use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
|
||||
// layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
|
||||
// be that less than 10k layer evictions is enough, so we would not need to evict from
|
||||
// all tenants.
|
||||
//
|
||||
// as the tenant ordering is now deterministic this could hit the same tenants
|
||||
// disproportionately on multiple invocations. alternative could be to remember how many
|
||||
// layers did we evict last time from this tenant, and inject that as an additional
|
||||
// fudge here.
|
||||
1
|
||||
};
|
||||
|
||||
let total = tenant_candidates
|
||||
.len()
|
||||
.checked_sub(fudge)
|
||||
.filter(|&x| x > 0)
|
||||
// support 0 or 1 resident layer tenants as well
|
||||
.unwrap_or(1);
|
||||
let divider = total as f32;
|
||||
|
||||
for (i, (timeline, layer_info)) in tenant_candidates.into_iter().enumerate() {
|
||||
for (timeline, layer_info) in tenant_candidates.into_iter() {
|
||||
let file_size = layer_info.file_size();
|
||||
|
||||
// as we iterate this reverse sorted list, the most recently accessed layer will always
|
||||
// be 1.0; this is for us to evict it last.
|
||||
let relative_last_activity = if matches!(
|
||||
eviction_order,
|
||||
EvictionOrder::RelativeAccessed { .. }
|
||||
) {
|
||||
// another possibility: use buckets, like (256.0 * relative_last_activity) as u8 or
|
||||
// similarly for u16. unsure how it would help.
|
||||
finite_f32::FiniteF32::try_from_normalized((total - i) as f32 / divider)
|
||||
.unwrap_or_else(|val| {
|
||||
tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={i}, total={total}: {val}");
|
||||
finite_f32::FiniteF32::ZERO
|
||||
})
|
||||
} else {
|
||||
finite_f32::FiniteF32::ZERO
|
||||
};
|
||||
|
||||
let candidate = EvictionCandidate {
|
||||
timeline,
|
||||
last_activity_ts: layer_info.last_activity_ts,
|
||||
layer: layer_info.layer,
|
||||
relative_last_activity,
|
||||
};
|
||||
let partition = if cumsum > min_resident_size as i128 {
|
||||
MinResidentSizePartition::Above
|
||||
@@ -779,19 +610,8 @@ async fn collect_eviction_candidates(
|
||||
|
||||
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
|
||||
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
|
||||
|
||||
match eviction_order {
|
||||
EvictionOrder::AbsoluteAccessed => {
|
||||
candidates.sort_unstable_by_key(|(partition, candidate)| {
|
||||
(*partition, candidate.last_activity_ts)
|
||||
});
|
||||
}
|
||||
EvictionOrder::RelativeAccessed { .. } => {
|
||||
candidates.sort_unstable_by_key(|(partition, candidate)| {
|
||||
(*partition, candidate.relative_last_activity)
|
||||
});
|
||||
}
|
||||
}
|
||||
candidates
|
||||
.sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
|
||||
|
||||
Ok(EvictionCandidates::Finished(candidates))
|
||||
}
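
Illustrative aside, not part of the diff: the removed `fudge`/`relative_last_activity` logic above normalizes each tenant's reverse-sorted candidates into the 0.0..=1.0 range. A self-contained sketch of that arithmetic, with made-up layer counts:

```rust
// Sketch of the normalization described above: candidates are sorted most
// recently accessed first, and the i-th gets (total - i) / total, where
// total = layer_count - fudge, clamped to at least 1.
fn relative_ages(layer_count: usize, fudge: usize) -> Vec<f32> {
    let total = layer_count
        .checked_sub(fudge)
        .filter(|&x| x > 0)
        // support 0 or 1 resident layer tenants as well
        .unwrap_or(1);
    let divider = total as f32;
    (0..layer_count)
        .map(|i| total.saturating_sub(i) as f32 / divider)
        .collect()
}

fn main() {
    // fudge = 1: even the smallest tenant's oldest layer reaches 0.0
    assert_eq!(relative_ages(4, 1), vec![1.0, 2.0 / 3.0, 1.0 / 3.0, 0.0]);
    // fudge = 0 (highest_layer_count_loses_first): small tenants stay near 1.0
    assert_eq!(relative_ages(4, 0), vec![1.0, 0.75, 0.5, 0.25]);
}
```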
|
||||
@@ -820,66 +640,6 @@ impl std::ops::Deref for TimelineKey {
|
||||
}
|
||||
}
|
||||
|
||||
/// A totally ordered f32 subset we can use with sorting functions.
|
||||
mod finite_f32 {
|
||||
|
||||
/// A totally ordered f32 subset we can use with sorting functions.
|
||||
#[derive(Clone, Copy, PartialEq)]
|
||||
pub struct FiniteF32(f32);
|
||||
|
||||
impl std::fmt::Debug for FiniteF32 {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
std::fmt::Debug::fmt(&self.0, f)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for FiniteF32 {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
std::fmt::Display::fmt(&self.0, f)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::cmp::Eq for FiniteF32 {}
|
||||
|
||||
impl std::cmp::PartialOrd for FiniteF32 {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl std::cmp::Ord for FiniteF32 {
|
||||
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||
self.0.total_cmp(&other.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<f32> for FiniteF32 {
|
||||
type Error = f32;
|
||||
|
||||
fn try_from(value: f32) -> Result<Self, Self::Error> {
|
||||
if value.is_finite() {
|
||||
Ok(FiniteF32(value))
|
||||
} else {
|
||||
Err(value)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FiniteF32 {
|
||||
pub const ZERO: FiniteF32 = FiniteF32(0.0);
|
||||
|
||||
pub fn try_from_normalized(value: f32) -> Result<Self, f32> {
|
||||
if (0.0..=1.0).contains(&value) {
|
||||
// -0.0 is within the range, make sure it is assumed 0.0..=1.0
|
||||
let value = value.abs();
|
||||
Ok(FiniteF32(value))
|
||||
} else {
|
||||
Err(value)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
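
Illustrative aside, not part of the diff: the `FiniteF32` newtype above exists because plain `f32` is only `PartialOrd`; once NaN and infinities are rejected at construction, `f32::total_cmp` supplies the total order needed for sort keys. A tiny sketch:

```rust
// Sketch only: f32 cannot be used directly as a sort key, but total_cmp
// provides a total order over finite values.
fn main() {
    let mut xs = vec![0.5_f32, 0.0, 1.0, 0.25];
    xs.sort_by(|a, b| a.total_cmp(b));
    assert_eq!(xs, vec![0.0, 0.25, 0.5, 1.0]);
}
```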
|
||||
|
||||
mod filesystem_level_usage {
|
||||
use anyhow::Context;
|
||||
use camino::Utf8Path;
|
||||
@@ -961,7 +721,6 @@ mod filesystem_level_usage {
|
||||
|
||||
#[test]
|
||||
fn max_usage_pct_pressure() {
|
||||
use super::EvictionOrder;
|
||||
use super::Usage as _;
|
||||
use std::time::Duration;
|
||||
use utils::serde_percent::Percent;
|
||||
@@ -973,7 +732,6 @@ mod filesystem_level_usage {
|
||||
period: Duration::MAX,
|
||||
#[cfg(feature = "testing")]
|
||||
mock_statvfs: None,
|
||||
eviction_order: EvictionOrder::default(),
|
||||
},
|
||||
total_bytes: 100_000,
|
||||
avail_bytes: 0,
|
||||
|
||||
@@ -1,14 +1,3 @@
|
||||
//! Failpoint support code shared between pageserver and safekeepers.
|
||||
|
||||
use crate::http::{
|
||||
error::ApiError,
|
||||
json::{json_request, json_response},
|
||||
};
|
||||
use hyper::{Body, Request, Response, StatusCode};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
|
||||
/// use with fail::cfg("$name", "return(2000)")
|
||||
///
|
||||
/// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
|
||||
@@ -36,7 +25,7 @@ pub use __failpoint_sleep_millis_async as sleep_millis_async;
|
||||
// Helper function used by the macro. (A function has nicer scoping so we
|
||||
// don't need to decorate everything with "::")
|
||||
#[doc(hidden)]
|
||||
pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
|
||||
pub(crate) async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
|
||||
let millis = duration_str.parse::<u64>().unwrap();
|
||||
let d = std::time::Duration::from_millis(millis);
|
||||
|
||||
@@ -82,7 +71,7 @@ pub fn init() -> fail::FailScenario<'static> {
|
||||
scenario
|
||||
}
|
||||
|
||||
pub fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> {
|
||||
pub(crate) fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> {
|
||||
if actions == "exit" {
|
||||
fail::cfg_callback(name, exit_failpoint)
|
||||
} else {
|
||||
@@ -95,45 +84,3 @@ fn exit_failpoint() {
|
||||
tracing::info!("Exit requested by failpoint");
|
||||
std::process::exit(1);
|
||||
}
|
||||
|
||||
pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;

/// Information for configuring a single fail point
#[derive(Debug, Serialize, Deserialize)]
pub struct FailpointConfig {
    /// Name of the fail point
    pub name: String,
    /// List of actions to take, using the format described in `fail::cfg`
    ///
    /// We also support `actions = "exit"` to cause the fail point to immediately exit.
    pub actions: String,
}

/// Configure failpoints through http.
pub async fn failpoints_handler(
    mut request: Request<Body>,
    _cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
    if !fail::has_failpoints() {
        return Err(ApiError::BadRequest(anyhow::anyhow!(
            "Cannot manage failpoints because storage was compiled without failpoints support"
        )));
    }

    let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
    for fp in failpoints {
        info!("cfg failpoint: {} {}", fp.name, fp.actions);

        // We recognize one extra "action" that's not natively recognized
        // by the failpoints crate: exit, to immediately kill the process
        let cfg_result = apply_failpoint(&fp.name, &fp.actions);

        if let Err(err_msg) = cfg_result {
            return Err(ApiError::BadRequest(anyhow::anyhow!(
                "Failed to configure failpoints: {err_msg}"
            )));
        }
    }

    json_response(StatusCode::OK, ())
}
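
Illustrative aside, not part of the diff: `failpoints_handler` accepts a bare JSON array of `FailpointConfig` objects. A sketch of such a request body (the failpoint names are made up):

```rust
// Sketch only: builds the kind of payload the handler above would parse.
fn main() {
    let body = serde_json::json!([
        { "name": "before-upload-layer", "actions": "return" },
        { "name": "some-other-failpoint", "actions": "sleep(2000)" }
    ]);
    println!("{body}");
}
```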
|
||||
@@ -1,2 +1,4 @@
|
||||
pub mod routes;
|
||||
pub use routes::make_router;
|
||||
|
||||
pub use pageserver_api::models;
|
||||
|
||||
@@ -159,12 +159,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ConflictError"
|
||||
"412":
|
||||
description: Deletion may not proceed, tenant is not in Active state
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/PreconditionFailedError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
|
||||
@@ -14,7 +14,6 @@ use hyper::header;
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use pageserver_api::models::TenantDetails;
|
||||
use pageserver_api::models::{
|
||||
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
|
||||
TenantLoadRequest, TenantLocationConfigRequest,
|
||||
@@ -25,18 +24,20 @@ use tenant_size_model::{SizeResult, StorageModel};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::auth::JwtAuth;
|
||||
use utils::failpoint_support::failpoints_handler;
|
||||
use utils::http::endpoint::request_span;
|
||||
use utils::http::json::json_request_or_empty_body;
|
||||
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
||||
|
||||
use super::models::{
|
||||
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
|
||||
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
|
||||
};
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::{LocationConf, TenantConfOpt};
|
||||
use crate::tenant::mgr::GetActiveTenantError;
|
||||
use crate::tenant::mgr::{
|
||||
GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
|
||||
TenantSlotError, TenantSlotUpsertError, TenantStateError,
|
||||
@@ -49,10 +50,6 @@ use crate::tenant::timeline::Timeline;
|
||||
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
|
||||
use crate::{config::PageServerConf, tenant::mgr};
|
||||
use crate::{disk_usage_eviction_task, tenant};
|
||||
use pageserver_api::models::{
|
||||
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
|
||||
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
|
||||
};
|
||||
use utils::{
|
||||
auth::SwappableJwtAuth,
|
||||
generation::Generation,
|
||||
@@ -67,10 +64,8 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
// For APIs that require an Active tenant, how long should we block waiting for that state?
|
||||
// This is not functionally necessary (clients will retry), but avoids generating a lot of
|
||||
// failed API calls while tenants are activating.
|
||||
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
|
||||
// Imports only used for testing APIs
|
||||
use super::models::ConfigureFailpointsRequest;
|
||||
|
||||
pub struct State {
|
||||
conf: &'static PageServerConf,
|
||||
@@ -152,7 +147,6 @@ impl From<PageReconstructError> for ApiError {
|
||||
PageReconstructError::AncestorStopping(_) => {
|
||||
ApiError::ResourceUnavailable(format!("{pre}").into())
|
||||
}
|
||||
PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
|
||||
PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
|
||||
}
|
||||
}
|
||||
@@ -239,19 +233,6 @@ impl From<GetTenantError> for ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<GetActiveTenantError> for ApiError {
|
||||
fn from(e: GetActiveTenantError) -> ApiError {
|
||||
match e {
|
||||
GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)),
|
||||
GetActiveTenantError::Cancelled => ApiError::ShuttingDown,
|
||||
GetActiveTenantError::NotFound(gte) => gte.into(),
|
||||
GetActiveTenantError::WaitForActiveTimeout { .. } => {
|
||||
ApiError::ResourceUnavailable(format!("{}", e).into())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<SetNewTenantConfigError> for ApiError {
|
||||
fn from(e: SetNewTenantConfigError) -> ApiError {
|
||||
match e {
|
||||
@@ -307,7 +288,6 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
|
||||
SlotUpsertError(e) => e.into(),
|
||||
Other(o) => ApiError::InternalServerError(o),
|
||||
e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
|
||||
Cancelled => ApiError::ShuttingDown,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -455,10 +435,7 @@ async fn timeline_create_handler(
|
||||
let state = get_state(&request);
|
||||
|
||||
async {
|
||||
let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?;
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, true)?;
|
||||
match tenant.create_timeline(
|
||||
new_timeline_id,
|
||||
request_data.ancestor_timeline_id.map(TimelineId::from),
|
||||
@@ -593,6 +570,8 @@ async fn get_lsn_by_timestamp_handler(
|
||||
)));
|
||||
}
|
||||
|
||||
let version: Option<u8> = parse_query_param(&request, "version")?;
|
||||
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let timestamp_raw = must_get_query_param(&request, "timestamp")?;
|
||||
let timestamp = humantime::parse_rfc3339(&timestamp_raw)
|
||||
@@ -605,18 +584,31 @@ async fn get_lsn_by_timestamp_handler(
|
||||
let result = timeline
|
||||
.find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
|
||||
.await?;
|
||||
#[derive(serde::Serialize)]
|
||||
struct Result {
|
||||
lsn: Lsn,
|
||||
kind: &'static str,
|
||||
|
||||
if version.unwrap_or(0) > 1 {
|
||||
#[derive(serde::Serialize)]
|
||||
struct Result {
|
||||
lsn: Lsn,
|
||||
kind: &'static str,
|
||||
}
|
||||
let (lsn, kind) = match result {
|
||||
LsnForTimestamp::Present(lsn) => (lsn, "present"),
|
||||
LsnForTimestamp::Future(lsn) => (lsn, "future"),
|
||||
LsnForTimestamp::Past(lsn) => (lsn, "past"),
|
||||
LsnForTimestamp::NoData(lsn) => (lsn, "nodata"),
|
||||
};
|
||||
json_response(StatusCode::OK, Result { lsn, kind })
|
||||
} else {
|
||||
// FIXME: this is a temporary crutch not to break backwards compatibility
|
||||
// See https://github.com/neondatabase/neon/pull/5608
|
||||
let result = match result {
|
||||
LsnForTimestamp::Present(lsn) => format!("{lsn}"),
|
||||
LsnForTimestamp::Future(_lsn) => "future".into(),
|
||||
LsnForTimestamp::Past(_lsn) => "past".into(),
|
||||
LsnForTimestamp::NoData(_lsn) => "nodata".into(),
|
||||
};
|
||||
json_response(StatusCode::OK, result)
|
||||
}
|
||||
let (lsn, kind) = match result {
|
||||
LsnForTimestamp::Present(lsn) => (lsn, "present"),
|
||||
LsnForTimestamp::Future(lsn) => (lsn, "future"),
|
||||
LsnForTimestamp::Past(lsn) => (lsn, "past"),
|
||||
LsnForTimestamp::NoData(lsn) => (lsn, "nodata"),
|
||||
};
|
||||
json_response(StatusCode::OK, Result { lsn, kind })
|
||||
}
|
||||
|
||||
async fn get_timestamp_of_lsn_handler(
|
||||
@@ -702,23 +694,11 @@ async fn timeline_delete_handler(
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
let state = get_state(&request);
|
||||
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id, false)
|
||||
.map_err(|e| {
|
||||
match e {
|
||||
// GetTenantError has a built-in conversion to ApiError, but in this context we don't
|
||||
// want to treat missing tenants as 404, to avoid ambiguity with successful deletions.
|
||||
GetTenantError::NotFound(_) => ApiError::PreconditionFailed(
|
||||
"Requested tenant is missing".to_string().into_boxed_str(),
|
||||
),
|
||||
e => e.into(),
|
||||
}
|
||||
})?;
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
tenant.delete_timeline(timeline_id).instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id))
|
||||
state.tenant_manager.delete_timeline(tenant_shard_id, timeline_id, &ctx)
|
||||
.instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
@@ -858,14 +838,11 @@ async fn tenant_status(
|
||||
}
|
||||
|
||||
let state = tenant.current_state();
|
||||
Result::<_, ApiError>::Ok(TenantDetails {
|
||||
tenant_info: TenantInfo {
|
||||
id: tenant_shard_id,
|
||||
state: state.clone(),
|
||||
current_physical_size: Some(current_physical_size),
|
||||
attachment_status: state.attachment_status(),
|
||||
},
|
||||
timelines: tenant.list_timeline_ids(),
|
||||
Result::<_, ApiError>::Ok(TenantInfo {
|
||||
id: tenant_shard_id,
|
||||
state: state.clone(),
|
||||
current_physical_size: Some(current_physical_size),
|
||||
attachment_status: state.attachment_status(),
|
||||
})
|
||||
}
|
||||
.instrument(info_span!("tenant_status_handler",
|
||||
@@ -886,9 +863,7 @@ async fn tenant_delete_handler(
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
state
|
||||
.tenant_manager
|
||||
.delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT)
|
||||
mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_shard_id)
|
||||
.instrument(info_span!("tenant_delete_handler",
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
shard = %tenant_shard_id.shard_slug()
|
||||
@@ -1161,10 +1136,7 @@ async fn tenant_create_handler(
|
||||
|
||||
// We created the tenant. Existing API semantics are that the tenant
|
||||
// is Active when this function returns.
|
||||
if let res @ Err(_) = new_tenant
|
||||
.wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
|
||||
.await
|
||||
{
|
||||
if let res @ Err(_) = new_tenant.wait_to_become_active().await {
|
||||
// This shouldn't happen because we just created the tenant directory
|
||||
// in tenant::mgr::create_tenant, and there aren't any remote timelines
|
||||
// to load, so, nothing can really fail during load.
|
||||
@@ -1274,23 +1246,6 @@ async fn put_tenant_location_config_handler(
|
||||
// which is not a 400 but a 409.
|
||||
.map_err(ApiError::BadRequest)?;
|
||||
|
||||
if let Some(_flush_ms) = flush {
|
||||
match state
|
||||
.secondary_controller
|
||||
.upload_tenant(tenant_shard_id)
|
||||
.await
|
||||
{
|
||||
Ok(()) => {
|
||||
tracing::info!("Uploaded heatmap during flush");
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("Failed to flush heatmap: {e}");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
tracing::info!("No flush requested when configuring");
|
||||
}
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
@@ -1309,6 +1264,34 @@ async fn handle_tenant_break(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn failpoints_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
if !fail::has_failpoints() {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"Cannot manage failpoints because pageserver was compiled without failpoints support"
|
||||
)));
|
||||
}
|
||||
|
||||
let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
|
||||
for fp in failpoints {
|
||||
info!("cfg failpoint: {} {}", fp.name, fp.actions);
|
||||
|
||||
// We recognize one extra "action" that's not natively recognized
|
||||
// by the failpoints crate: exit, to immediately kill the process
|
||||
let cfg_result = crate::failpoint_support::apply_failpoint(&fp.name, &fp.actions);
|
||||
|
||||
if let Err(err_msg) = cfg_result {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"Failed to configure failpoints: {err_msg}"
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
// Run GC immediately on given timeline.
|
||||
async fn timeline_gc_handler(
|
||||
mut request: Request<Body>,
|
||||
@@ -1504,6 +1487,69 @@ async fn timeline_collect_keyspace(
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
struct Partitioning {
|
||||
keys: crate::keyspace::KeySpace,
|
||||
|
||||
at_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl serde::Serialize for Partitioning {
|
||||
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
use serde::ser::SerializeMap;
|
||||
let mut map = serializer.serialize_map(Some(2))?;
|
||||
map.serialize_key("keys")?;
|
||||
map.serialize_value(&KeySpace(&self.keys))?;
|
||||
map.serialize_key("at_lsn")?;
|
||||
map.serialize_value(&WithDisplay(&self.at_lsn))?;
|
||||
map.end()
|
||||
}
|
||||
}
|
||||
|
||||
struct WithDisplay<'a, T>(&'a T);
|
||||
|
||||
impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> {
|
||||
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
serializer.collect_str(&self.0)
|
||||
}
|
||||
}
|
||||
|
||||
struct KeySpace<'a>(&'a crate::keyspace::KeySpace);
|
||||
|
||||
impl<'a> serde::Serialize for KeySpace<'a> {
|
||||
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
use serde::ser::SerializeSeq;
|
||||
let mut seq = serializer.serialize_seq(Some(self.0.ranges.len()))?;
|
||||
for kr in &self.0.ranges {
|
||||
seq.serialize_element(&KeyRange(kr))?;
|
||||
}
|
||||
seq.end()
|
||||
}
|
||||
}
|
||||
|
||||
struct KeyRange<'a>(&'a std::ops::Range<crate::repository::Key>);
|
||||
|
||||
impl<'a> serde::Serialize for KeyRange<'a> {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
use serde::ser::SerializeTuple;
|
||||
let mut t = serializer.serialize_tuple(2)?;
|
||||
t.serialize_element(&WithDisplay(&self.0.start))?;
|
||||
t.serialize_element(&WithDisplay(&self.0.end))?;
|
||||
t.end()
|
||||
}
|
||||
}
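
Illustrative aside, not part of the diff: the custom `Partitioning` serializer above renders key ranges as `[start, end]` pairs of `Display` strings and the LSN as text, roughly like this (values made up):

```rust
// Sketch only: approximate output shape of the Partitioning serializer.
fn main() {
    let example = serde_json::json!({
        "keys": [["000000000000000000000000000000000000",
                  "010000000000000000000000000000000001"]],
        "at_lsn": "0/16960E8"
    });
    println!("{example}");
}
```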
|
||||
|
||||
let at_lsn: Option<Lsn> = parse_query_param(&request, "at_lsn")?;
|
||||
|
||||
async {
|
||||
@@ -1515,9 +1561,7 @@ async fn timeline_collect_keyspace(
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
|
||||
let res = pageserver_api::models::partitioning::Partitioning { keys, at_lsn };
|
||||
|
||||
json_response(StatusCode::OK, res)
|
||||
json_response(StatusCode::OK, Partitioning { keys, at_lsn })
|
||||
}
|
||||
.instrument(info_span!("timeline_collect_keyspace", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
|
||||
.await
|
||||
@@ -1557,22 +1601,19 @@ async fn disk_usage_eviction_run(
|
||||
struct Config {
|
||||
/// How many bytes to evict before reporting that pressure is relieved.
|
||||
evict_bytes: u64,
|
||||
|
||||
#[serde(default)]
|
||||
eviction_order: crate::disk_usage_eviction_task::EvictionOrder,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, serde::Serialize)]
|
||||
struct Usage {
|
||||
// remains unchanged after instantiation of the struct
|
||||
evict_bytes: u64,
|
||||
config: Config,
|
||||
// updated by `add_available_bytes`
|
||||
freed_bytes: u64,
|
||||
}
|
||||
|
||||
impl crate::disk_usage_eviction_task::Usage for Usage {
|
||||
fn has_pressure(&self) -> bool {
|
||||
self.evict_bytes > self.freed_bytes
|
||||
self.config.evict_bytes > self.freed_bytes
|
||||
}
|
||||
|
||||
fn add_available_bytes(&mut self, bytes: u64) {
|
||||
@@ -1583,7 +1624,7 @@ async fn disk_usage_eviction_run(
|
||||
let config = json_request::<Config>(&mut r).await?;
|
||||
|
||||
let usage = Usage {
|
||||
evict_bytes: config.evict_bytes,
|
||||
config,
|
||||
freed_bytes: 0,
|
||||
};
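
Illustrative aside, not part of the diff: a request body matching the `Config` struct above; `eviction_order` may be omitted because of `#[serde(default)]` (values made up):

```rust
// Sketch only: the evict_bytes/eviction_order payload the handler above parses.
fn main() {
    let body = serde_json::json!({
        "evict_bytes": 1_073_741_824u64,
        "eviction_order": {
            "type": "RelativeAccessed",
            "args": { "highest_layer_count_loses_first": true }
        }
    });
    println!("{body}");
}
```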
|
||||
|
||||
@@ -1598,11 +1639,7 @@ async fn disk_usage_eviction_run(
|
||||
let state = state.disk_usage_eviction_state.clone();
|
||||
|
||||
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
|
||||
&state,
|
||||
storage,
|
||||
usage,
|
||||
config.eviction_order,
|
||||
&cancel,
|
||||
&state, storage, usage, &cancel,
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -1628,21 +1665,6 @@ async fn secondary_upload_handler(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn secondary_download_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let state = get_state(&request);
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
state
|
||||
.secondary_controller
|
||||
.download_tenant(tenant_shard_id)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
json_response(
|
||||
StatusCode::NOT_FOUND,
|
||||
@@ -1911,9 +1933,6 @@ pub fn make_router(
|
||||
.put("/v1/deletion_queue/flush", |r| {
|
||||
api_handler(r, deletion_queue_flush)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_shard_id/secondary/download", |r| {
|
||||
api_handler(r, secondary_download_handler)
|
||||
})
|
||||
.put("/v1/tenant/:tenant_shard_id/break", |r| {
|
||||
testing_api_handler("set tenant state to broken", r, handle_tenant_break)
|
||||
})
|
||||
|
||||
@@ -21,7 +21,6 @@ use tracing::*;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::metrics::WAL_INGEST;
|
||||
use crate::pgdatadir_mapping::*;
|
||||
use crate::tenant::remote_timeline_client::INITDB_PATH;
|
||||
use crate::tenant::Timeline;
|
||||
@@ -313,16 +312,13 @@ async fn import_wal(
|
||||
waldecoder.feed_bytes(&buf);
|
||||
|
||||
let mut nrecords = 0;
|
||||
let mut modification = tline.begin_modification(last_lsn);
|
||||
let mut modification = tline.begin_modification(endpoint);
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
while last_lsn <= endpoint {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
walingest
|
||||
.ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
|
||||
.await?;
|
||||
WAL_INGEST.records_committed.inc();
|
||||
|
||||
modification.commit(ctx).await?;
|
||||
last_lsn = lsn;
|
||||
|
||||
nrecords += 1;
|
||||
@@ -452,14 +448,13 @@ pub async fn import_wal_from_tar(
|
||||
|
||||
waldecoder.feed_bytes(&bytes[offset..]);
|
||||
|
||||
let mut modification = tline.begin_modification(last_lsn);
|
||||
let mut modification = tline.begin_modification(end_lsn);
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
while last_lsn <= end_lsn {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
walingest
|
||||
.ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
|
||||
.await?;
|
||||
modification.commit(ctx).await?;
|
||||
last_lsn = lsn;
|
||||
|
||||
debug!("imported record at {} (end {})", lsn, end_lsn);
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
use crate::repository::{key_range_size, singleton_range, Key};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use std::ops::Range;
|
||||
|
||||
use crate::key::Key;
|
||||
|
||||
///
|
||||
/// Represents a set of Keys, in a compact form.
|
||||
///
|
||||
#[derive(Clone, Debug, Default, PartialEq, Eq)]
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct KeySpace {
|
||||
/// Contiguous ranges of keys that belong to the key space. In key order,
|
||||
/// and with no overlap.
|
||||
@@ -124,9 +123,6 @@ impl KeySpaceAccum {
|
||||
if range.start == accum.end {
|
||||
accum.end = range.end;
|
||||
} else {
|
||||
// TODO: to efficiently support small sharding stripe sizes, we should avoid starting
|
||||
// a new range here if the skipped region was all keys that don't belong on this shard.
|
||||
// (https://github.com/neondatabase/neon/issues/6247)
|
||||
assert!(range.start > accum.end);
|
||||
self.ranges.push(accum.clone());
|
||||
*accum = range;
|
||||
@@ -190,33 +186,6 @@ impl KeySpaceRandomAccum {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn key_range_size(key_range: &Range<Key>) -> u32 {
|
||||
let start = key_range.start;
|
||||
let end = key_range.end;
|
||||
|
||||
if end.field1 != start.field1
|
||||
|| end.field2 != start.field2
|
||||
|| end.field3 != start.field3
|
||||
|| end.field4 != start.field4
|
||||
{
|
||||
return u32::MAX;
|
||||
}
|
||||
|
||||
let start = (start.field5 as u64) << 32 | start.field6 as u64;
|
||||
let end = (end.field5 as u64) << 32 | end.field6 as u64;
|
||||
|
||||
let diff = end - start;
|
||||
if diff > u32::MAX as u64 {
|
||||
u32::MAX
|
||||
} else {
|
||||
diff as u32
|
||||
}
|
||||
}
|
||||
|
||||
pub fn singleton_range(key: Key) -> Range<Key> {
|
||||
key..key.next()
|
||||
}
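
Illustrative aside, not part of the diff: `key_range_size` above packs `field5`/`field6` into a u64 and takes the difference of the packed values, saturating at `u32::MAX`. A simplified sketch of that arithmetic (field widths simplified for the example):

```rust
// Sketch only: the packing and saturating difference used by key_range_size
// when the higher key fields are equal.
fn packed(field5: u32, field6: u32) -> u64 {
    (field5 as u64) << 32 | field6 as u64
}

fn main() {
    let start = packed(0, 10);
    let end = packed(0, 42);
    let diff = end - start;
    let size = if diff > u32::MAX as u64 { u32::MAX } else { diff as u32 };
    assert_eq!(size, 32);
}
```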
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -10,7 +10,7 @@ pub mod deletion_queue;
|
||||
pub mod disk_usage_eviction_task;
|
||||
pub mod http;
|
||||
pub mod import_datadir;
|
||||
pub use pageserver_api::keyspace;
|
||||
pub mod keyspace;
|
||||
pub mod metrics;
|
||||
pub mod page_cache;
|
||||
pub mod page_service;
|
||||
@@ -25,6 +25,8 @@ pub mod walingest;
|
||||
pub mod walrecord;
|
||||
pub mod walredo;
|
||||
|
||||
pub mod failpoint_support;
|
||||
|
||||
use crate::task_mgr::TaskKind;
|
||||
use camino::Utf8Path;
|
||||
use deletion_queue::DeletionQueue;
|
||||
@@ -117,10 +119,6 @@ pub const TENANT_CONFIG_NAME: &str = "config";
|
||||
/// Full path: `tenants/<tenant_id>/config`.
|
||||
pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
|
||||
|
||||
/// Per-tenant copy of their remote heatmap, downloaded into the local
|
||||
/// tenant path while in secondary mode.
|
||||
pub const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
|
||||
|
||||
/// A suffix used for various temporary files. Any temporary files found in the
|
||||
/// data directory at pageserver startup can be automatically removed.
|
||||
pub const TEMP_FILE_SUFFIX: &str = "___temp";
|
||||
|
||||
@@ -29,7 +29,7 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[
|
||||
// Metrics collected on operations on the storage repository.
|
||||
#[derive(Debug, EnumVariantNames, IntoStaticStr)]
|
||||
#[strum(serialize_all = "kebab_case")]
|
||||
pub(crate) enum StorageTimeOperation {
|
||||
pub enum StorageTimeOperation {
|
||||
#[strum(serialize = "layer flush")]
|
||||
LayerFlush,
|
||||
|
||||
@@ -55,7 +55,7 @@ pub(crate) enum StorageTimeOperation {
|
||||
CreateTenant,
|
||||
}
|
||||
|
||||
pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
|
||||
pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
|
||||
register_counter_vec!(
|
||||
"pageserver_storage_operations_seconds_sum",
|
||||
"Total time spent on storage operations with operation, tenant and timeline dimensions",
|
||||
@@ -64,7 +64,7 @@ pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(||
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_storage_operations_seconds_count",
|
||||
"Count of storage operations with operation, tenant and timeline dimensions",
|
||||
@@ -150,7 +150,7 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) struct PageCacheMetricsForTaskKind {
|
||||
pub struct PageCacheMetricsForTaskKind {
|
||||
pub read_accesses_materialized_page: IntCounter,
|
||||
pub read_accesses_immutable: IntCounter,
|
||||
|
||||
@@ -159,7 +159,7 @@ pub(crate) struct PageCacheMetricsForTaskKind {
|
||||
pub read_hits_materialized_page_older_lsn: IntCounter,
|
||||
}
|
||||
|
||||
pub(crate) struct PageCacheMetrics {
|
||||
pub struct PageCacheMetrics {
|
||||
map: EnumMap<TaskKind, EnumMap<PageContentKind, PageCacheMetricsForTaskKind>>,
|
||||
}
|
||||
|
||||
@@ -181,7 +181,7 @@ static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
|
||||
pub static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
|
||||
map: EnumMap::from_array(std::array::from_fn(|task_kind| {
|
||||
let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind);
|
||||
let task_kind: &'static str = task_kind.into();
|
||||
@@ -243,9 +243,10 @@ impl PageCacheMetrics {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct PageCacheSizeMetrics {
|
||||
pub struct PageCacheSizeMetrics {
|
||||
pub max_bytes: UIntGauge,
|
||||
|
||||
pub current_bytes_ephemeral: UIntGauge,
|
||||
pub current_bytes_immutable: UIntGauge,
|
||||
pub current_bytes_materialized_page: UIntGauge,
|
||||
}
|
||||
@@ -259,26 +260,31 @@ static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> =
|
||||
Lazy::new(|| PageCacheSizeMetrics {
|
||||
max_bytes: {
|
||||
register_uint_gauge!(
|
||||
"pageserver_page_cache_size_max_bytes",
|
||||
"Maximum size of the page cache in bytes"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
},
|
||||
current_bytes_immutable: {
|
||||
PAGE_CACHE_SIZE_CURRENT_BYTES
|
||||
.get_metric_with_label_values(&["immutable"])
|
||||
.unwrap()
|
||||
},
|
||||
current_bytes_materialized_page: {
|
||||
PAGE_CACHE_SIZE_CURRENT_BYTES
|
||||
.get_metric_with_label_values(&["materialized_page"])
|
||||
.unwrap()
|
||||
},
|
||||
});
|
||||
pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheSizeMetrics {
|
||||
max_bytes: {
|
||||
register_uint_gauge!(
|
||||
"pageserver_page_cache_size_max_bytes",
|
||||
"Maximum size of the page cache in bytes"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
},
|
||||
|
||||
current_bytes_ephemeral: {
|
||||
PAGE_CACHE_SIZE_CURRENT_BYTES
|
||||
.get_metric_with_label_values(&["ephemeral"])
|
||||
.unwrap()
|
||||
},
|
||||
current_bytes_immutable: {
|
||||
PAGE_CACHE_SIZE_CURRENT_BYTES
|
||||
.get_metric_with_label_values(&["immutable"])
|
||||
.unwrap()
|
||||
},
|
||||
current_bytes_materialized_page: {
|
||||
PAGE_CACHE_SIZE_CURRENT_BYTES
|
||||
.get_metric_with_label_values(&["materialized_page"])
|
||||
.unwrap()
|
||||
},
|
||||
});
|
||||
|
||||
pub(crate) mod page_cache_eviction_metrics {
|
||||
use std::num::NonZeroUsize;
|
||||
@@ -516,18 +522,14 @@ pub(crate) mod initial_logical_size {
|
||||
impl StartCalculation {
|
||||
pub(crate) fn first(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
|
||||
let circumstances_label: &'static str = circumstances.into();
|
||||
self.0
|
||||
.with_label_values(&["first", circumstances_label])
|
||||
.inc();
|
||||
self.0.with_label_values(&["first", circumstances_label]);
|
||||
OngoingCalculationGuard {
|
||||
inc_drop_calculation: Some(DROP_CALCULATION.first.clone()),
|
||||
}
|
||||
}
|
||||
pub(crate) fn retry(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
|
||||
let circumstances_label: &'static str = circumstances.into();
|
||||
self.0
|
||||
.with_label_values(&["retry", circumstances_label])
|
||||
.inc();
|
||||
self.0.with_label_values(&["retry", circumstances_label]);
|
||||
OngoingCalculationGuard {
|
||||
inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()),
|
||||
}
|
||||
@@ -682,65 +684,25 @@ pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
|
||||
.expect("Failed to register pageserver_startup_is_loading")
|
||||
});
|
||||
|
||||
/// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things
|
||||
/// like how long it took to load.
|
||||
///
|
||||
/// Note that these are process-global metrics, _not_ per-tenant metrics. Per-tenant
|
||||
/// metrics are rather expensive, and usually fine grained stuff makes more sense
|
||||
/// at a timeline level than tenant level.
|
||||
pub(crate) struct TenantMetrics {
|
||||
/// How long did tenants take to go from construction to active state?
|
||||
pub(crate) activation: Histogram,
|
||||
pub(crate) preload: Histogram,
|
||||
pub(crate) attach: Histogram,
|
||||
|
||||
/// How many tenants are included in the initial startup of the pageserver?
|
||||
pub(crate) startup_scheduled: IntCounter,
|
||||
pub(crate) startup_complete: IntCounter,
|
||||
}
|
||||
|
||||
pub(crate) static TENANT: Lazy<TenantMetrics> = Lazy::new(|| {
|
||||
TenantMetrics {
|
||||
activation: register_histogram!(
|
||||
/// How long did tenants take to go from construction to active state?
|
||||
pub(crate) static TENANT_ACTIVATION: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_tenant_activation_seconds",
|
||||
"Time taken by tenants to activate, in seconds",
|
||||
CRITICAL_OP_BUCKETS.into()
|
||||
)
|
||||
.expect("Failed to register metric"),
|
||||
preload: register_histogram!(
|
||||
"pageserver_tenant_preload_seconds",
|
||||
"Time taken by tenants to load remote metadata on startup/attach, in seconds",
|
||||
CRITICAL_OP_BUCKETS.into()
|
||||
)
|
||||
.expect("Failed to register metric"),
|
||||
attach: register_histogram!(
|
||||
"pageserver_tenant_attach_seconds",
|
||||
"Time taken by tenants to intialize, after remote metadata is already loaded",
|
||||
CRITICAL_OP_BUCKETS.into()
|
||||
)
|
||||
.expect("Failed to register metric"),
|
||||
startup_scheduled: register_int_counter!(
|
||||
"pageserver_tenant_startup_scheduled",
|
||||
"Number of tenants included in pageserver startup (doesn't count tenants attached later)"
|
||||
).expect("Failed to register metric"),
|
||||
startup_complete: register_int_counter!(
|
||||
"pageserver_tenant_startup_complete",
|
||||
"Number of tenants that have completed warm-up, or activated on-demand during initial startup: \
|
||||
should eventually reach `pageserver_tenant_startup_scheduled_total`. Does not include broken \
|
||||
tenants: such cases will lead to this metric never reaching the scheduled count."
|
||||
).expect("Failed to register metric"),
|
||||
}
|
||||
.expect("Failed to register pageserver_tenant_activation_seconds metric")
|
||||
});
|
||||
|
||||
/// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct EvictionsWithLowResidenceDuration {
|
||||
pub struct EvictionsWithLowResidenceDuration {
|
||||
data_source: &'static str,
|
||||
threshold: Duration,
|
||||
counter: Option<IntCounter>,
|
||||
}
|
||||
|
||||
pub(crate) struct EvictionsWithLowResidenceDurationBuilder {
|
||||
pub struct EvictionsWithLowResidenceDurationBuilder {
|
||||
data_source: &'static str,
|
||||
threshold: Duration,
|
||||
}
|
||||
@@ -1003,7 +965,7 @@ pub enum SmgrQueryType {
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct SmgrQueryTimePerTimeline {
|
||||
pub struct SmgrQueryTimePerTimeline {
|
||||
metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT],
|
||||
}
|
||||
|
||||
@@ -1017,62 +979,12 @@ static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static SMGR_QUERY_TIME_GLOBAL_BUCKETS: Lazy<Vec<f64>> = Lazy::new(|| {
|
||||
[
|
||||
1,
|
||||
10,
|
||||
20,
|
||||
40,
|
||||
60,
|
||||
80,
|
||||
100,
|
||||
200,
|
||||
300,
|
||||
400,
|
||||
500,
|
||||
600,
|
||||
700,
|
||||
800,
|
||||
900,
|
||||
1_000, // 1ms
|
||||
2_000,
|
||||
4_000,
|
||||
6_000,
|
||||
8_000,
|
||||
10_000, // 10ms
|
||||
20_000,
|
||||
40_000,
|
||||
60_000,
|
||||
80_000,
|
||||
100_000,
|
||||
200_000,
|
||||
400_000,
|
||||
600_000,
|
||||
800_000,
|
||||
1_000_000, // 1s
|
||||
2_000_000,
|
||||
4_000_000,
|
||||
6_000_000,
|
||||
8_000_000,
|
||||
10_000_000, // 10s
|
||||
20_000_000,
|
||||
50_000_000,
|
||||
100_000_000,
|
||||
200_000_000,
|
||||
1_000_000_000, // 1000s
|
||||
]
|
||||
.into_iter()
|
||||
.map(Duration::from_micros)
|
||||
.map(|d| d.as_secs_f64())
|
||||
.collect()
|
||||
});
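
Illustrative aside, not part of the diff: the bucket list removed above is written in microseconds and converted to seconds for Prometheus. A sketch of the conversion:

```rust
// Sketch only: microsecond bucket boundaries become fractional seconds.
use std::time::Duration;

fn main() {
    let buckets: Vec<f64> = [1u64, 1_000, 10_000, 1_000_000]
        .into_iter()
        .map(Duration::from_micros)
        .map(|d| d.as_secs_f64())
        .collect();
    assert_eq!(buckets, vec![0.000_001, 0.001, 0.01, 1.0]);
}
```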
|
||||
|
||||
static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_smgr_query_seconds_global",
|
||||
"Time spent on smgr query handling, aggregated by query type.",
|
||||
&["smgr_query_type"],
|
||||
SMGR_QUERY_TIME_GLOBAL_BUCKETS.clone(),
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -1175,8 +1087,8 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
|
||||
.map(|ms| (ms as f64) / 1000.0)
|
||||
});
|
||||
|
||||
pub(crate) struct BasebackupQueryTime(HistogramVec);
|
||||
pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
|
||||
pub struct BasebackupQueryTime(HistogramVec);
|
||||
pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
|
||||
BasebackupQueryTime({
|
||||
register_histogram_vec!(
|
||||
"pageserver_basebackup_query_seconds",
|
||||
@@ -1196,7 +1108,7 @@ impl DurationResultObserver for BasebackupQueryTime {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
pub static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_live_connections",
|
||||
"Number of live network connections",
|
||||
@@ -1363,8 +1275,6 @@ pub(crate) struct SecondaryModeMetrics {
|
||||
pub(crate) upload_heatmap: IntCounter,
|
||||
pub(crate) upload_heatmap_errors: IntCounter,
|
||||
pub(crate) upload_heatmap_duration: Histogram,
|
||||
pub(crate) download_heatmap: IntCounter,
|
||||
pub(crate) download_layer: IntCounter,
|
||||
}
|
||||
pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| SecondaryModeMetrics {
|
||||
upload_heatmap: register_int_counter!(
|
||||
@@ -1382,16 +1292,6 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| Seco
|
||||
"Time to build and upload a heatmap, including any waiting inside the S3 client"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
download_heatmap: register_int_counter!(
|
||||
"pageserver_secondary_download_heatmap",
|
||||
"Number of downloads of heatmaps by secondary mode locations"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
download_layer: register_int_counter!(
|
||||
"pageserver_secondary_download_layer",
|
||||
"Number of downloads of layers by secondary mode locations"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
});
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
@@ -1661,7 +1561,7 @@ pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
|
||||
Lazy::new(WalRedoProcessCounters::default);
|
||||
|
||||
/// Similar to `prometheus::HistogramTimer` but does not record on drop.
|
||||
pub(crate) struct StorageTimeMetricsTimer {
|
||||
pub struct StorageTimeMetricsTimer {
|
||||
metrics: StorageTimeMetrics,
|
||||
start: Instant,
|
||||
}
|
||||
@@ -1686,7 +1586,7 @@ impl StorageTimeMetricsTimer {
|
||||
/// Timing facilities for an globally histogrammed metric, which is supported by per tenant and
|
||||
/// timeline total sum and count.
|
||||
#[derive(Clone, Debug)]
|
||||
pub(crate) struct StorageTimeMetrics {
|
||||
pub struct StorageTimeMetrics {
|
||||
/// Sum of f64 seconds, per operation, tenant_id and timeline_id
|
||||
timeline_sum: Counter,
|
||||
/// Number of operations, per operation, tenant_id and timeline_id
|
||||
@@ -1725,7 +1625,7 @@ impl StorageTimeMetrics {
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct TimelineMetrics {
|
||||
pub struct TimelineMetrics {
|
||||
tenant_id: String,
|
||||
shard_id: String,
|
||||
timeline_id: String,
|
||||
@@ -1933,7 +1833,7 @@ impl Drop for PerTimelineRemotePhysicalSizeGauge {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct RemoteTimelineClientMetrics {
|
||||
pub struct RemoteTimelineClientMetrics {
|
||||
tenant_id: String,
|
||||
timeline_id: String,
|
||||
remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
|
||||
@@ -2231,7 +2131,7 @@ impl Drop for RemoteTimelineClientMetrics {
|
||||
|
||||
/// Wrapper future that measures the time spent by a remote storage operation,
|
||||
/// and records the time and success/failure as a prometheus metric.
|
||||
pub(crate) trait MeasureRemoteOp: Sized {
|
||||
pub trait MeasureRemoteOp: Sized {
|
||||
fn measure_remote_op(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
@@ -2256,7 +2156,7 @@ pub(crate) trait MeasureRemoteOp: Sized {
|
||||
impl<T: Sized> MeasureRemoteOp for T {}
|
||||
|
||||
pin_project! {
|
||||
pub(crate) struct MeasuredRemoteOp<F>
|
||||
pub struct MeasuredRemoteOp<F>
|
||||
{
|
||||
#[pin]
|
||||
inner: F,
|
||||
@@ -2313,9 +2213,6 @@ pub fn preinitialize_metrics() {
|
||||
// Deletion queue stats
|
||||
Lazy::force(&DELETION_QUEUE);
|
||||
|
||||
// Tenant stats
|
||||
Lazy::force(&TENANT);
|
||||
|
||||
// Tenant manager stats
|
||||
Lazy::force(&TENANT_MANAGER);
|
||||
|
||||
|
||||
@@ -25,7 +25,6 @@ use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, Qu
|
||||
use pq_proto::framed::ConnectionError;
|
||||
use pq_proto::FeStartupPacket;
|
||||
use pq_proto::{BeMessage, FeMessage, RowDescriptor};
|
||||
use std::borrow::Cow;
|
||||
use std::io;
|
||||
use std::net::TcpListener;
|
||||
use std::pin::pin;
|
||||
@@ -54,7 +53,7 @@ use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::import_datadir::import_wal_from_tar;
|
||||
use crate::metrics;
|
||||
use crate::metrics::LIVE_CONNECTIONS_COUNT;
|
||||
use crate::pgdatadir_mapping::{rel_block_to_key, Version};
|
||||
use crate::pgdatadir_mapping::rel_block_to_key;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
@@ -62,9 +61,6 @@ use crate::tenant::mgr;
|
||||
use crate::tenant::mgr::get_active_tenant_with_timeout;
|
||||
use crate::tenant::mgr::GetActiveTenantError;
|
||||
use crate::tenant::mgr::ShardSelector;
|
||||
use crate::tenant::timeline::WaitLsnError;
|
||||
use crate::tenant::GetTimelineError;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::trace::Tracer;
|
||||
|
||||
@@ -287,64 +283,6 @@ struct PageServerHandler {
connection_ctx: RequestContext,
}

#[derive(thiserror::Error, Debug)]
enum PageStreamError {
/// We encountered an error that should prompt the client to reconnect:
/// in practice this means we drop the connection without sending a response.
#[error("Reconnect required: {0}")]
Reconnect(Cow<'static, str>),

/// We were instructed to shutdown while processing the query
#[error("Shutting down")]
Shutdown,

/// Something went wrong reading a page: this likely indicates a pageserver bug
#[error("Read error: {0}")]
Read(PageReconstructError),

/// Ran out of time waiting for an LSN
#[error("LSN timeout: {0}")]
LsnTimeout(WaitLsnError),

/// The entity required to serve the request (tenant or timeline) is not found,
/// or is not found in a suitable state to serve a request.
#[error("Not found: {0}")]
NotFound(std::borrow::Cow<'static, str>),

/// Request asked for something that doesn't make sense, like an invalid LSN
#[error("Bad request: {0}")]
BadRequest(std::borrow::Cow<'static, str>),
}

impl From<PageReconstructError> for PageStreamError {
fn from(value: PageReconstructError) -> Self {
match value {
PageReconstructError::Cancelled => Self::Shutdown,
e => Self::Read(e),
}
}
}

impl From<GetActiveTimelineError> for PageStreamError {
fn from(value: GetActiveTimelineError) -> Self {
match value {
GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => Self::Shutdown,
GetActiveTimelineError::Tenant(e) => Self::NotFound(format!("{e}").into()),
GetActiveTimelineError::Timeline(e) => Self::NotFound(format!("{e}").into()),
}
}
}

impl From<WaitLsnError> for PageStreamError {
fn from(value: WaitLsnError) -> Self {
match value {
e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e),
WaitLsnError::Shutdown => Self::Shutdown,
WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()),
}
}
}

impl PageServerHandler {
pub fn new(
conf: &'static PageServerConf,
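The structured PageStreamError introduced above replaces anyhow-style errors in the page service handlers; its From impls let handlers use `?` and still decide at the top level whether to answer the client or silently drop the connection on shutdown. A minimal self-contained sketch of that pattern, with simplified stand-in types rather than the pageserver definitions:

    use std::borrow::Cow;

    #[allow(dead_code)]
    #[derive(Debug)]
    enum WaitError { Timeout, Shutdown }

    #[allow(dead_code)]
    #[derive(Debug)]
    enum StreamError {
        Reconnect(Cow<'static, str>),
        Shutdown,
        LsnTimeout(WaitError),
        BadRequest(Cow<'static, str>),
    }

    impl From<WaitError> for StreamError {
        fn from(e: WaitError) -> Self {
            match e {
                e @ WaitError::Timeout => StreamError::LsnTimeout(e),
                WaitError::Shutdown => StreamError::Shutdown,
            }
        }
    }

    fn wait_for_lsn(ok: bool) -> Result<u64, WaitError> {
        if ok { Ok(42) } else { Err(WaitError::Timeout) }
    }

    fn handle_request(ok: bool) -> Result<u64, StreamError> {
        // `?` converts WaitError into StreamError via the From impl above.
        let lsn = wait_for_lsn(ok)?;
        Ok(lsn)
    }

    fn main() {
        match handle_request(false) {
            Ok(lsn) => println!("serve page at lsn {lsn}"),
            Err(StreamError::Shutdown) => println!("shutting down: drop the connection without replying"),
            Err(e) => println!("send an error response to the client: {e:?}"),
        }
    }
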
@@ -490,7 +428,7 @@ impl PageServerHandler {
|
||||
// Check that the timeline exists
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| QueryError::NotFound(format!("{e}").into()))?;
|
||||
.map_err(|e| anyhow::anyhow!(e))?;
|
||||
|
||||
// Avoid starting new requests if the timeline has already started shutting down,
|
||||
// and block timeline shutdown until this request is complete, or drops out due
|
||||
@@ -582,44 +520,32 @@ impl PageServerHandler {
|
||||
}
|
||||
};
|
||||
|
||||
match response {
|
||||
Err(PageStreamError::Shutdown) => {
|
||||
if let Err(e) = &response {
|
||||
// Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
|
||||
// because wait_lsn etc will drop out
|
||||
// is_stopping(): [`Timeline::flush_and_shutdown`] has entered
|
||||
// is_canceled(): [`Timeline::shutdown`]` has entered
|
||||
if timeline.cancel.is_cancelled() || timeline.is_stopping() {
|
||||
// If we fail to fulfil a request during shutdown, which may be _because_ of
|
||||
// shutdown, then do not send the error to the client. Instead just drop the
|
||||
// connection.
|
||||
span.in_scope(|| info!("dropping connection due to shutdown"));
|
||||
span.in_scope(|| info!("dropped response during shutdown: {e:#}"));
|
||||
return Err(QueryError::Shutdown);
|
||||
}
|
||||
Err(PageStreamError::Reconnect(reason)) => {
|
||||
span.in_scope(|| info!("handler requested reconnect: {reason}"));
|
||||
return Err(QueryError::Reconnect);
|
||||
}
|
||||
Err(e) if timeline.cancel.is_cancelled() || timeline.is_stopping() => {
|
||||
// This branch accomodates code within request handlers that returns an anyhow::Error instead of a clean
|
||||
// shutdown error, this may be buried inside a PageReconstructError::Other for example.
|
||||
//
|
||||
// Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
|
||||
// because wait_lsn etc will drop out
|
||||
// is_stopping(): [`Timeline::flush_and_shutdown`] has entered
|
||||
// is_canceled(): [`Timeline::shutdown`]` has entered
|
||||
span.in_scope(|| info!("dropped error response during shutdown: {e:#}"));
|
||||
return Err(QueryError::Shutdown);
|
||||
}
|
||||
r => {
|
||||
let response_msg = r.unwrap_or_else(|e| {
|
||||
// print the all details to the log with {:#}, but for the client the
|
||||
// error message is enough. Do not log if shutting down, as the anyhow::Error
|
||||
// here includes cancellation which is not an error.
|
||||
span.in_scope(|| error!("error reading relation or page version: {:#}", e));
|
||||
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||
message: e.to_string(),
|
||||
})
|
||||
});
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
|
||||
self.flush_cancellable(pgb, &timeline.cancel).await?;
|
||||
}
|
||||
}
|
||||
|
||||
let response = response.unwrap_or_else(|e| {
|
||||
// print the all details to the log with {:#}, but for the client the
|
||||
// error message is enough. Do not log if shutting down, as the anyhow::Error
|
||||
// here includes cancellation which is not an error.
|
||||
span.in_scope(|| error!("error reading relation or page version: {:#}", e));
|
||||
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||
message: e.to_string(),
|
||||
})
|
||||
});
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
|
||||
self.flush_cancellable(pgb, &timeline.cancel).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -766,7 +692,7 @@ impl PageServerHandler {
|
||||
latest: bool,
|
||||
latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Lsn, PageStreamError> {
|
||||
) -> anyhow::Result<Lsn> {
|
||||
if latest {
|
||||
// Latest page version was requested. If LSN is given, it is a hint
|
||||
// to the page server that there have been no modifications to the
|
||||
@@ -797,19 +723,15 @@ impl PageServerHandler {
|
||||
}
|
||||
} else {
|
||||
if lsn == Lsn(0) {
|
||||
return Err(PageStreamError::BadRequest(
|
||||
"invalid LSN(0) in request".into(),
|
||||
));
|
||||
anyhow::bail!("invalid LSN(0) in request");
|
||||
}
|
||||
timeline.wait_lsn(lsn, ctx).await?;
|
||||
}
|
||||
|
||||
if lsn < **latest_gc_cutoff_lsn {
|
||||
return Err(PageStreamError::BadRequest(format!(
|
||||
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
|
||||
lsn, **latest_gc_cutoff_lsn
|
||||
).into()));
|
||||
}
|
||||
anyhow::ensure!(
|
||||
lsn >= **latest_gc_cutoff_lsn,
|
||||
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
|
||||
lsn, **latest_gc_cutoff_lsn
|
||||
);
|
||||
Ok(lsn)
|
||||
}
|
||||
|
||||
@@ -818,14 +740,14 @@ impl PageServerHandler {
|
||||
timeline: &Timeline,
|
||||
req: &PagestreamExistsRequest,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
) -> anyhow::Result<PagestreamBeMessage> {
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn =
|
||||
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
|
||||
.await?;
|
||||
|
||||
let exists = timeline
|
||||
.get_rel_exists(req.rel, Version::Lsn(lsn), req.latest, ctx)
|
||||
.get_rel_exists(req.rel, lsn, req.latest, ctx)
|
||||
.await?;
|
||||
|
||||
Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
|
||||
@@ -838,15 +760,13 @@ impl PageServerHandler {
|
||||
timeline: &Timeline,
|
||||
req: &PagestreamNblocksRequest,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
) -> anyhow::Result<PagestreamBeMessage> {
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn =
|
||||
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
|
||||
.await?;
|
||||
|
||||
let n_blocks = timeline
|
||||
.get_rel_size(req.rel, Version::Lsn(lsn), req.latest, ctx)
|
||||
.await?;
|
||||
let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest, ctx).await?;
|
||||
|
||||
Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
|
||||
n_blocks,
|
||||
@@ -858,20 +778,14 @@ impl PageServerHandler {
|
||||
timeline: &Timeline,
|
||||
req: &PagestreamDbSizeRequest,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
) -> anyhow::Result<PagestreamBeMessage> {
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn =
|
||||
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
|
||||
.await?;
|
||||
|
||||
let total_blocks = timeline
|
||||
.get_db_size(
|
||||
DEFAULTTABLESPACE_OID,
|
||||
req.dbnode,
|
||||
Version::Lsn(lsn),
|
||||
req.latest,
|
||||
ctx,
|
||||
)
|
||||
.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest, ctx)
|
||||
.await?;
|
||||
let db_size = total_blocks as i64 * BLCKSZ as i64;
|
||||
|
||||
@@ -880,35 +794,30 @@ impl PageServerHandler {
|
||||
}))
|
||||
}
|
||||
|
||||
async fn do_handle_get_page_at_lsn_request(
|
||||
&self,
|
||||
timeline: &Timeline,
|
||||
req: &PagestreamGetPageRequest,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn =
|
||||
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
|
||||
.await?;
|
||||
let page = timeline
|
||||
.get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
|
||||
.await?;
|
||||
|
||||
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
|
||||
page,
|
||||
}))
|
||||
}
|
||||
|
||||
async fn handle_get_page_at_lsn_request(
|
||||
&self,
|
||||
timeline: &Timeline,
|
||||
req: &PagestreamGetPageRequest,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
) -> anyhow::Result<PagestreamBeMessage> {
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn =
|
||||
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
|
||||
.await?;
|
||||
/*
|
||||
// Add a 1s delay to some requests. The delay helps the requests to
|
||||
// hit the race condition from github issue #1047 more easily.
|
||||
use rand::Rng;
|
||||
if rand::thread_rng().gen::<u8>() < 5 {
|
||||
std::thread::sleep(std::time::Duration::from_millis(1000));
|
||||
}
|
||||
*/
|
||||
|
||||
let key = rel_block_to_key(req.rel, req.blkno);
|
||||
if timeline.get_shard_identity().is_key_local(&key) {
|
||||
self.do_handle_get_page_at_lsn_request(timeline, req, ctx)
|
||||
.await
|
||||
let page = if timeline.get_shard_identity().is_key_local(&key) {
|
||||
timeline
|
||||
.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
|
||||
.await?
|
||||
} else {
|
||||
// The Tenant shard we looked up at connection start does not hold this particular
|
||||
// key: look for other shards in this tenant. This scenario occurs if a pageserver
|
||||
@@ -927,30 +836,30 @@ impl PageServerHandler {
|
||||
Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
|
||||
// We already know this tenant exists in general, because we resolved it at
|
||||
// start of connection. Getting a NotFound here indicates that the shard containing
|
||||
// the requested page is not present on this node: the client's knowledge of shard->pageserver
|
||||
// mapping is out of date.
|
||||
tracing::info!("Page request routed to wrong shard: my identity {:?}, should go to shard {}, key {}",
|
||||
timeline.get_shard_identity(), timeline.get_shard_identity().get_shard_number(&key).0, key);
|
||||
// Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
|
||||
// client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
|
||||
// and talk to a different pageserver.
|
||||
return Err(PageStreamError::Reconnect(
|
||||
"getpage@lsn request routed to wrong shard".into(),
|
||||
));
|
||||
// the requested page is not present on this node.
|
||||
|
||||
// TODO: this should be some kind of structured error that the client will understand,
|
||||
// so that it can block until its config is updated: this error is expected in the case
|
||||
// that the Tenant's shards' placements are being updated and the client hasn't been
|
||||
// informed yet.
|
||||
//
|
||||
// https://github.com/neondatabase/neon/issues/6038
|
||||
return Err(anyhow::anyhow!("Request routed to wrong shard"));
|
||||
}
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
|
||||
// Take a GateGuard for the duration of this request. If we were using our main Timeline object,
|
||||
// the GateGuard was already held over the whole connection.
|
||||
let _timeline_guard = timeline
|
||||
.gate
|
||||
.enter()
|
||||
.map_err(|_| PageStreamError::Shutdown)?;
|
||||
let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
|
||||
timeline
|
||||
.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
|
||||
.await?
|
||||
};
|
||||
|
||||
self.do_handle_get_page_at_lsn_request(&timeline, req, ctx)
|
||||
.await
|
||||
}
|
||||
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
|
||||
page,
|
||||
}))
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
@@ -1091,7 +1000,9 @@ impl PageServerHandler {
|
||||
)
|
||||
.await
|
||||
.map_err(GetActiveTimelineError::Tenant)?;
|
||||
let timeline = tenant.get_timeline(timeline_id, true)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
|
||||
Ok(timeline)
|
||||
}
|
||||
}
|
||||
@@ -1513,15 +1424,14 @@ enum GetActiveTimelineError {
|
||||
#[error(transparent)]
|
||||
Tenant(GetActiveTenantError),
|
||||
#[error(transparent)]
|
||||
Timeline(#[from] GetTimelineError),
|
||||
Timeline(anyhow::Error),
|
||||
}
|
||||
|
||||
impl From<GetActiveTimelineError> for QueryError {
|
||||
fn from(e: GetActiveTimelineError) -> Self {
|
||||
match e {
|
||||
GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => QueryError::Shutdown,
|
||||
GetActiveTimelineError::Tenant(e) => e.into(),
|
||||
GetActiveTimelineError::Timeline(e) => QueryError::NotFound(format!("{e}").into()),
|
||||
GetActiveTimelineError::Timeline(e) => QueryError::Other(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@ use crate::context::RequestContext;
|
||||
use crate::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use crate::repository::*;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::{ensure, Context};
|
||||
use anyhow::Context;
|
||||
use bytes::{Buf, Bytes};
|
||||
use pageserver_api::key::is_rel_block_key;
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
@@ -147,7 +147,6 @@ impl Timeline {
|
||||
{
|
||||
DatadirModification {
|
||||
tline: self,
|
||||
pending_lsns: Vec::new(),
|
||||
pending_updates: HashMap::new(),
|
||||
pending_deletions: Vec::new(),
|
||||
pending_nblocks: 0,
|
||||
@@ -160,11 +159,11 @@ impl Timeline {
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
/// Look up given page version.
|
||||
pub(crate) async fn get_rel_page_at_lsn(
|
||||
pub async fn get_rel_page_at_lsn(
|
||||
&self,
|
||||
tag: RelTag,
|
||||
blknum: BlockNumber,
|
||||
version: Version<'_>,
|
||||
lsn: Lsn,
|
||||
latest: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
@@ -174,47 +173,44 @@ impl Timeline {
|
||||
));
|
||||
}
|
||||
|
||||
let nblocks = self.get_rel_size(tag, version, latest, ctx).await?;
|
||||
let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?;
|
||||
if blknum >= nblocks {
|
||||
debug!(
|
||||
"read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
|
||||
tag,
|
||||
blknum,
|
||||
version.get_lsn(),
|
||||
nblocks
|
||||
tag, blknum, lsn, nblocks
|
||||
);
|
||||
return Ok(ZERO_PAGE.clone());
|
||||
}
|
||||
|
||||
let key = rel_block_to_key(tag, blknum);
|
||||
version.get(self, key, ctx).await
|
||||
self.get(key, lsn, ctx).await
|
||||
}
|
||||
|
||||
// Get size of a database in blocks
|
||||
pub(crate) async fn get_db_size(
|
||||
pub async fn get_db_size(
|
||||
&self,
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
version: Version<'_>,
|
||||
lsn: Lsn,
|
||||
latest: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<usize, PageReconstructError> {
|
||||
let mut total_blocks = 0;
|
||||
|
||||
let rels = self.list_rels(spcnode, dbnode, version, ctx).await?;
|
||||
let rels = self.list_rels(spcnode, dbnode, lsn, ctx).await?;
|
||||
|
||||
for rel in rels {
|
||||
let n_blocks = self.get_rel_size(rel, version, latest, ctx).await?;
|
||||
let n_blocks = self.get_rel_size(rel, lsn, latest, ctx).await?;
|
||||
total_blocks += n_blocks as usize;
|
||||
}
|
||||
Ok(total_blocks)
|
||||
}
|
||||
|
||||
/// Get size of a relation file
|
||||
pub(crate) async fn get_rel_size(
|
||||
pub async fn get_rel_size(
|
||||
&self,
|
||||
tag: RelTag,
|
||||
version: Version<'_>,
|
||||
lsn: Lsn,
|
||||
latest: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BlockNumber, PageReconstructError> {
|
||||
@@ -224,12 +220,12 @@ impl Timeline {
|
||||
));
|
||||
}
|
||||
|
||||
if let Some(nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
|
||||
if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
|
||||
return Ok(nblocks);
|
||||
}
|
||||
|
||||
if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
|
||||
&& !self.get_rel_exists(tag, version, latest, ctx).await?
|
||||
&& !self.get_rel_exists(tag, lsn, latest, ctx).await?
|
||||
{
|
||||
// FIXME: Postgres sometimes calls smgrcreate() to create
|
||||
// FSM, and smgrnblocks() on it immediately afterwards,
|
||||
@@ -239,7 +235,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
let key = rel_size_to_key(tag);
|
||||
let mut buf = version.get(self, key, ctx).await?;
|
||||
let mut buf = self.get(key, lsn, ctx).await?;
|
||||
let nblocks = buf.get_u32_le();
|
||||
|
||||
if latest {
|
||||
@@ -250,16 +246,16 @@ impl Timeline {
|
||||
// latest=true, then it can not cause cache corruption, because with latest=true
|
||||
// pageserver choose max(request_lsn, last_written_lsn) and so cached value will be
|
||||
// associated with most recent value of LSN.
|
||||
self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
|
||||
self.update_cached_rel_size(tag, lsn, nblocks);
|
||||
}
|
||||
Ok(nblocks)
|
||||
}
|
||||
|
||||
/// Does relation exist?
|
||||
pub(crate) async fn get_rel_exists(
|
||||
pub async fn get_rel_exists(
|
||||
&self,
|
||||
tag: RelTag,
|
||||
version: Version<'_>,
|
||||
lsn: Lsn,
|
||||
_latest: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<bool, PageReconstructError> {
|
||||
@@ -270,12 +266,12 @@ impl Timeline {
|
||||
}
|
||||
|
||||
// first try to lookup relation in cache
|
||||
if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
|
||||
if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) {
|
||||
return Ok(true);
|
||||
}
|
||||
// fetch directory listing
|
||||
let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
|
||||
let buf = version.get(self, key, ctx).await?;
|
||||
let buf = self.get(key, lsn, ctx).await?;
|
||||
|
||||
match RelDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => {
|
||||
@@ -291,16 +287,16 @@ impl Timeline {
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
pub(crate) async fn list_rels(
|
||||
pub async fn list_rels(
|
||||
&self,
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
version: Version<'_>,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<HashSet<RelTag>, PageReconstructError> {
|
||||
// fetch directory listing
|
||||
let key = rel_dir_to_key(spcnode, dbnode);
|
||||
let buf = version.get(self, key, ctx).await?;
|
||||
let buf = self.get(key, lsn, ctx).await?;
|
||||
|
||||
match RelDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => {
|
||||
@@ -319,7 +315,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Look up given SLRU page version.
|
||||
pub(crate) async fn get_slru_page_at_lsn(
|
||||
pub async fn get_slru_page_at_lsn(
|
||||
&self,
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
@@ -332,29 +328,29 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Get size of an SLRU segment
|
||||
pub(crate) async fn get_slru_segment_size(
|
||||
pub async fn get_slru_segment_size(
|
||||
&self,
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
version: Version<'_>,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BlockNumber, PageReconstructError> {
|
||||
let key = slru_segment_size_to_key(kind, segno);
|
||||
let mut buf = version.get(self, key, ctx).await?;
|
||||
let mut buf = self.get(key, lsn, ctx).await?;
|
||||
Ok(buf.get_u32_le())
|
||||
}
|
||||
|
||||
/// Get size of an SLRU segment
|
||||
pub(crate) async fn get_slru_segment_exists(
|
||||
pub async fn get_slru_segment_exists(
|
||||
&self,
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
version: Version<'_>,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<bool, PageReconstructError> {
|
||||
// fetch directory listing
|
||||
let key = slru_dir_to_key(kind);
|
||||
let buf = version.get(self, key, ctx).await?;
|
||||
let buf = self.get(key, lsn, ctx).await?;
|
||||
|
||||
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => {
|
||||
@@ -372,7 +368,7 @@ impl Timeline {
|
||||
/// so it's not well defined which LSN you get if there were multiple commits
|
||||
/// "in flight" at that point in time.
|
||||
///
|
||||
pub(crate) async fn find_lsn_for_timestamp(
|
||||
pub async fn find_lsn_for_timestamp(
|
||||
&self,
|
||||
search_timestamp: TimestampTz,
|
||||
cancel: &CancellationToken,
|
||||
@@ -452,7 +448,7 @@ impl Timeline {
|
||||
/// Additionally, sets 'found_smaller'/'found_Larger, if encounters any commits
|
||||
/// with a smaller/larger timestamp.
|
||||
///
|
||||
pub(crate) async fn is_latest_commit_timestamp_ge_than(
|
||||
pub async fn is_latest_commit_timestamp_ge_than(
|
||||
&self,
|
||||
search_timestamp: TimestampTz,
|
||||
probe_lsn: Lsn,
|
||||
@@ -475,7 +471,7 @@ impl Timeline {
|
||||
/// Obtain the possible timestamp range for the given lsn.
|
||||
///
|
||||
/// If the lsn has no timestamps, returns None. returns `(min, max, median)` if it has timestamps.
|
||||
pub(crate) async fn get_timestamp_for_lsn(
|
||||
pub async fn get_timestamp_for_lsn(
|
||||
&self,
|
||||
probe_lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
@@ -505,11 +501,11 @@ impl Timeline {
|
||||
mut f: impl FnMut(TimestampTz) -> ControlFlow<T>,
|
||||
) -> Result<T, PageReconstructError> {
|
||||
for segno in self
|
||||
.list_slru_segments(SlruKind::Clog, Version::Lsn(probe_lsn), ctx)
|
||||
.list_slru_segments(SlruKind::Clog, probe_lsn, ctx)
|
||||
.await?
|
||||
{
|
||||
let nblocks = self
|
||||
.get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx)
|
||||
.get_slru_segment_size(SlruKind::Clog, segno, probe_lsn, ctx)
|
||||
.await?;
|
||||
for blknum in (0..nblocks).rev() {
|
||||
let clog_page = self
|
||||
@@ -532,36 +528,36 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Get a list of SLRU segments
|
||||
pub(crate) async fn list_slru_segments(
|
||||
pub async fn list_slru_segments(
|
||||
&self,
|
||||
kind: SlruKind,
|
||||
version: Version<'_>,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<HashSet<u32>, PageReconstructError> {
|
||||
// fetch directory entry
|
||||
let key = slru_dir_to_key(kind);
|
||||
|
||||
let buf = version.get(self, key, ctx).await?;
|
||||
let buf = self.get(key, lsn, ctx).await?;
|
||||
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => Ok(dir.segments),
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn get_relmap_file(
|
||||
pub async fn get_relmap_file(
|
||||
&self,
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
version: Version<'_>,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
let key = relmap_file_key(spcnode, dbnode);
|
||||
|
||||
let buf = version.get(self, key, ctx).await?;
|
||||
let buf = self.get(key, lsn, ctx).await?;
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
pub(crate) async fn list_dbdirs(
|
||||
pub async fn list_dbdirs(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
@@ -575,7 +571,7 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn get_twophase_file(
|
||||
pub async fn get_twophase_file(
|
||||
&self,
|
||||
xid: TransactionId,
|
||||
lsn: Lsn,
|
||||
@@ -586,7 +582,7 @@ impl Timeline {
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
pub(crate) async fn list_twophase_files(
|
||||
pub async fn list_twophase_files(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
@@ -600,7 +596,7 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn get_control_file(
|
||||
pub async fn get_control_file(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
@@ -608,7 +604,7 @@ impl Timeline {
|
||||
self.get(CONTROLFILE_KEY, lsn, ctx).await
|
||||
}
|
||||
|
||||
pub(crate) async fn get_checkpoint(
|
||||
pub async fn get_checkpoint(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
@@ -616,7 +612,7 @@ impl Timeline {
|
||||
self.get(CHECKPOINT_KEY, lsn, ctx).await
|
||||
}
|
||||
|
||||
pub(crate) async fn list_aux_files(
|
||||
pub async fn list_aux_files(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
@@ -656,10 +652,7 @@ impl Timeline {
|
||||
|
||||
let mut total_size: u64 = 0;
|
||||
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
|
||||
for rel in self
|
||||
.list_rels(*spcnode, *dbnode, Version::Lsn(lsn), ctx)
|
||||
.await?
|
||||
{
|
||||
for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? {
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(CalculateLogicalSizeError::Cancelled);
|
||||
}
|
||||
@@ -699,7 +692,7 @@ impl Timeline {
|
||||
result.add_key(rel_dir_to_key(spcnode, dbnode));
|
||||
|
||||
let mut rels: Vec<RelTag> = self
|
||||
.list_rels(spcnode, dbnode, Version::Lsn(lsn), ctx)
|
||||
.list_rels(spcnode, dbnode, lsn, ctx)
|
||||
.await?
|
||||
.into_iter()
|
||||
.collect();
|
||||
@@ -806,39 +799,18 @@ pub struct DatadirModification<'a> {
|
||||
/// in the state in 'tline' yet.
|
||||
pub tline: &'a Timeline,
|
||||
|
||||
/// Current LSN of the modification
|
||||
lsn: Lsn,
|
||||
/// Lsn assigned by begin_modification
|
||||
pub lsn: Lsn,
|
||||
|
||||
// The modifications are not applied directly to the underlying key-value store.
|
||||
// The put-functions add the modifications here, and they are flushed to the
|
||||
// underlying key-value store by the 'finish' function.
|
||||
pending_lsns: Vec<Lsn>,
|
||||
pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
|
||||
pending_deletions: Vec<(Range<Key>, Lsn)>,
|
||||
pending_updates: HashMap<Key, Value>,
|
||||
pending_deletions: Vec<Range<Key>>,
|
||||
pending_nblocks: i64,
|
||||
}
|
||||
|
||||
impl<'a> DatadirModification<'a> {
|
||||
/// Get the current lsn
|
||||
pub(crate) fn get_lsn(&self) -> Lsn {
|
||||
self.lsn
|
||||
}
|
||||
|
||||
/// Set the current lsn
|
||||
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
|
||||
ensure!(
|
||||
lsn >= self.lsn,
|
||||
"setting an older lsn {} than {} is not allowed",
|
||||
lsn,
|
||||
self.lsn
|
||||
);
|
||||
if lsn > self.lsn {
|
||||
self.pending_lsns.push(self.lsn);
|
||||
self.lsn = lsn;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Initialize a completely new repository.
|
||||
///
|
||||
/// This inserts the directory metadata entries that are assumed to
|
||||
@@ -1012,9 +984,11 @@ impl<'a> DatadirModification<'a> {
|
||||
dbnode: Oid,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let req_lsn = self.tline.get_last_record_lsn();
|
||||
|
||||
let total_blocks = self
|
||||
.tline
|
||||
.get_db_size(spcnode, dbnode, Version::Modified(self), true, ctx)
|
||||
.get_db_size(spcnode, dbnode, req_lsn, true, ctx)
|
||||
.await?;
|
||||
|
||||
// Remove entry from dbdir
|
||||
@@ -1103,11 +1077,8 @@ impl<'a> DatadirModification<'a> {
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
if self
|
||||
.tline
|
||||
.get_rel_exists(rel, Version::Modified(self), true, ctx)
|
||||
.await?
|
||||
{
|
||||
let last_lsn = self.tline.get_last_record_lsn();
|
||||
if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? {
|
||||
let size_key = rel_size_to_key(rel);
|
||||
// Fetch the old size first
|
||||
let old_size = self.get(size_key, ctx).await?.get_u32_le();
|
||||
@@ -1352,23 +1323,17 @@ impl<'a> DatadirModification<'a> {
|
||||
let writer = self.tline.writer().await;
|
||||
|
||||
// Flush relation and SLRU data blocks, keep metadata.
|
||||
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
|
||||
for (key, values) in self.pending_updates.drain() {
|
||||
for (lsn, value) in values {
|
||||
if is_rel_block_key(&key) || is_slru_block_key(key) {
|
||||
// This bails out on first error without modifying pending_updates.
|
||||
// That's Ok, cf this function's doc comment.
|
||||
writer.put(key, lsn, &value, ctx).await?;
|
||||
} else {
|
||||
retained_pending_updates
|
||||
.entry(key)
|
||||
.or_default()
|
||||
.push((lsn, value));
|
||||
}
|
||||
let mut retained_pending_updates = HashMap::new();
|
||||
for (key, value) in self.pending_updates.drain() {
|
||||
if is_rel_block_key(&key) || is_slru_block_key(key) {
|
||||
// This bails out on first error without modifying pending_updates.
|
||||
// That's Ok, cf this function's doc comment.
|
||||
writer.put(key, self.lsn, &value, ctx).await?;
|
||||
} else {
|
||||
retained_pending_updates.insert(key, value);
|
||||
}
|
||||
}
|
||||
|
||||
self.pending_updates = retained_pending_updates;
|
||||
self.pending_updates.extend(retained_pending_updates);
|
||||
|
||||
if pending_nblocks != 0 {
|
||||
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
|
||||
@@ -1385,28 +1350,18 @@ impl<'a> DatadirModification<'a> {
|
||||
///
|
||||
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
let writer = self.tline.writer().await;
|
||||
|
||||
let lsn = self.lsn;
|
||||
let pending_nblocks = self.pending_nblocks;
|
||||
self.pending_nblocks = 0;
|
||||
|
||||
if !self.pending_updates.is_empty() {
|
||||
writer.put_batch(&self.pending_updates, ctx).await?;
|
||||
self.pending_updates.clear();
|
||||
for (key, value) in self.pending_updates.drain() {
|
||||
writer.put(key, lsn, &value, ctx).await?;
|
||||
}
|
||||
for key_range in self.pending_deletions.drain(..) {
|
||||
writer.delete(key_range, lsn).await?;
|
||||
}
|
||||
|
||||
if !self.pending_deletions.is_empty() {
|
||||
writer.delete_batch(&self.pending_deletions).await?;
|
||||
self.pending_deletions.clear();
|
||||
}
|
||||
|
||||
self.pending_lsns.push(self.lsn);
|
||||
for pending_lsn in self.pending_lsns.drain(..) {
|
||||
// Ideally, we should be able to call writer.finish_write() only once
|
||||
// with the highest LSN. However, the last_record_lsn variable in the
|
||||
// timeline keeps track of the latest LSN and the immediate previous LSN
|
||||
// so we need to record every LSN to not leave a gap between them.
|
||||
writer.finish_write(pending_lsn);
|
||||
}
|
||||
writer.finish_write(lsn);
|
||||
|
||||
if pending_nblocks != 0 {
|
||||
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
|
||||
@@ -1415,86 +1370,44 @@ impl<'a> DatadirModification<'a> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn len(&self) -> usize {
|
||||
self.pending_updates.len() + self.pending_deletions.len()
|
||||
pub(crate) fn is_empty(&self) -> bool {
|
||||
self.pending_updates.is_empty() && self.pending_deletions.is_empty()
|
||||
}
|
||||
|
||||
// Internal helper functions to batch the modifications
|
||||
|
||||
async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
|
||||
// Have we already updated the same key? Read the latest pending updated
|
||||
// Have we already updated the same key? Read the pending updated
|
||||
// version in that case.
|
||||
//
|
||||
// Note: we don't check pending_deletions. It is an error to request a
|
||||
// value that has been removed, deletion only avoids leaking storage.
|
||||
if let Some(values) = self.pending_updates.get(&key) {
|
||||
if let Some((_, value)) = values.last() {
|
||||
return if let Value::Image(img) = value {
|
||||
Ok(img.clone())
|
||||
} else {
|
||||
// Currently, we never need to read back a WAL record that we
|
||||
// inserted in the same "transaction". All the metadata updates
|
||||
// work directly with Images, and we never need to read actual
|
||||
// data pages. We could handle this if we had to, by calling
|
||||
// the walredo manager, but let's keep it simple for now.
|
||||
Err(PageReconstructError::from(anyhow::anyhow!(
|
||||
"unexpected pending WAL record"
|
||||
)))
|
||||
};
|
||||
if let Some(value) = self.pending_updates.get(&key) {
|
||||
if let Value::Image(img) = value {
|
||||
Ok(img.clone())
|
||||
} else {
|
||||
// Currently, we never need to read back a WAL record that we
|
||||
// inserted in the same "transaction". All the metadata updates
|
||||
// work directly with Images, and we never need to read actual
|
||||
// data pages. We could handle this if we had to, by calling
|
||||
// the walredo manager, but let's keep it simple for now.
|
||||
Err(PageReconstructError::from(anyhow::anyhow!(
|
||||
"unexpected pending WAL record"
|
||||
)))
|
||||
}
|
||||
} else {
|
||||
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
|
||||
self.tline.get(key, lsn, ctx).await
|
||||
}
|
||||
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
|
||||
self.tline.get(key, lsn, ctx).await
|
||||
}
|
||||
|
||||
fn put(&mut self, key: Key, val: Value) {
|
||||
let values = self.pending_updates.entry(key).or_default();
|
||||
// Replace the previous value if it exists at the same lsn
|
||||
if let Some((last_lsn, last_value)) = values.last_mut() {
|
||||
if *last_lsn == self.lsn {
|
||||
*last_value = val;
|
||||
return;
|
||||
}
|
||||
}
|
||||
values.push((self.lsn, val));
|
||||
self.pending_updates.insert(key, val);
|
||||
}
|
||||
|
||||
fn delete(&mut self, key_range: Range<Key>) {
trace!("DELETE {}-{}", key_range.start, key_range.end);
self.pending_deletions.push((key_range, self.lsn));
}
}

/// This struct facilitates accessing either a committed key from the timeline at a
/// specific LSN, or the latest uncommitted key from a pending modification.
/// During WAL ingestion, the records from multiple LSNs may be batched in the same
/// modification before being flushed to the timeline. Hence, the routines in WalIngest
/// need to look up the keys in the modification first before looking them up in the
/// timeline to not miss the latest updates.
#[derive(Clone, Copy)]
pub enum Version<'a> {
Lsn(Lsn),
Modified(&'a DatadirModification<'a>),
}

impl<'a> Version<'a> {
async fn get(
&self,
timeline: &Timeline,
key: Key,
ctx: &RequestContext,
) -> Result<Bytes, PageReconstructError> {
match self {
Version::Lsn(lsn) => timeline.get(key, *lsn, ctx).await,
Version::Modified(modification) => modification.get(key, ctx).await,
}
}

fn get_lsn(&self) -> Lsn {
match self {
Version::Lsn(lsn) => *lsn,
Version::Modified(modification) => modification.lsn,
}
self.pending_deletions.push(key_range);
}
}

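The Version<'_> type above lets read paths serve either a committed LSN or the still-pending contents of a DatadirModification. A toy stand-alone sketch of the same dispatch, using made-up Store and Modification types in place of Timeline and DatadirModification:

    use std::collections::HashMap;

    type Key = u64;
    type Lsn = u64;

    struct Store {
        committed: HashMap<(Key, Lsn), String>,
    }

    impl Store {
        fn get(&self, key: Key, lsn: Lsn) -> Option<&String> {
            self.committed.get(&(key, lsn))
        }
    }

    struct Modification {
        lsn: Lsn,
        pending: HashMap<Key, String>,
    }

    enum Version<'a> {
        Lsn(Lsn),
        Modified(&'a Modification),
    }

    impl<'a> Version<'a> {
        fn get(&self, store: &'a Store, key: Key) -> Option<&'a String> {
            match self {
                Version::Lsn(lsn) => store.get(key, *lsn),
                // Pending writes win over committed data, mirroring the WAL-ingest case.
                Version::Modified(m) => m.pending.get(&key).or_else(|| store.get(key, m.lsn)),
            }
        }
    }

    fn main() {
        let mut committed = HashMap::new();
        committed.insert((1, 10), "committed page".to_string());
        let store = Store { committed };

        let mut pending = HashMap::new();
        pending.insert(1, "pending page".to_string());
        let modification = Modification { lsn: 10, pending };

        assert_eq!(Version::Lsn(10).get(&store, 1).unwrap(), "committed page");
        assert_eq!(Version::Modified(&modification).get(&store, 1).unwrap(), "pending page");
        println!("ok");
    }
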
@@ -1863,7 +1776,6 @@ pub fn is_inherited_key(key: Key) -> bool {
|
||||
key != AUX_FILES_KEY
|
||||
}
|
||||
|
||||
/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`.
|
||||
pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
|
||||
Ok(match key.field1 {
|
||||
0x00 => (
|
||||
@@ -1878,6 +1790,7 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
|
||||
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn is_rel_fsm_block_key(key: Key) -> bool {
|
||||
key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
|
||||
}
|
||||
|
||||
@@ -2,11 +2,38 @@ use crate::walrecord::NeonWalRecord;
use anyhow::Result;
use bytes::Bytes;
use serde::{Deserialize, Serialize};
use std::ops::AddAssign;
use std::ops::{AddAssign, Range};
use std::time::Duration;

pub use pageserver_api::key::{Key, KEY_SIZE};

pub fn key_range_size(key_range: &Range<Key>) -> u32 {
let start = key_range.start;
let end = key_range.end;

if end.field1 != start.field1
|| end.field2 != start.field2
|| end.field3 != start.field3
|| end.field4 != start.field4
{
return u32::MAX;
}

let start = (start.field5 as u64) << 32 | start.field6 as u64;
let end = (end.field5 as u64) << 32 | end.field6 as u64;

let diff = end - start;
if diff > u32::MAX as u64 {
u32::MAX
} else {
diff as u32
}
}

pub fn singleton_range(key: Key) -> Range<Key> {
key..key.next()
}

/// A 'value' stored for a one Key.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(test, derive(PartialEq))]

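key_range_size above packs field5 and field6 into one u64 per endpoint and saturates the difference at u32::MAX when the higher fields differ or the span overflows. A small stand-alone check of that arithmetic, with toy (field5, field6) tuples standing in for Key:

    fn packed(field5: u32, field6: u32) -> u64 {
        (field5 as u64) << 32 | field6 as u64
    }

    fn range_size(start: (u32, u32), end: (u32, u32)) -> u32 {
        let diff = packed(end.0, end.1) - packed(start.0, start.1);
        if diff > u32::MAX as u64 { u32::MAX } else { diff as u32 }
    }

    fn main() {
        // Same field5, field6 differs by 16 blocks: size is 16.
        assert_eq!(range_size((0, 0), (0, 16)), 16);
        // Crossing field5 overflows u32, so the result saturates.
        assert_eq!(range_size((0, 0), (2, 0)), u32::MAX);
        println!("ok");
    }
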
@@ -147,7 +147,7 @@ pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(||
// else, but that has not been needed in a long time.
std::env::var("TOKIO_WORKER_THREADS")
.map(|s| s.parse::<usize>().unwrap())
.unwrap_or_else(|_e| usize::max(2, num_cpus::get()))
.unwrap_or_else(|_e| usize::max(1, num_cpus::get()))
});

#[derive(Debug, Clone, Copy)]
@@ -258,9 +258,6 @@ pub enum TaskKind {
/// See [`crate::disk_usage_eviction_task`].
DiskUsageEviction,

/// See [`crate::tenant::secondary`].
SecondaryDownloads,

/// See [`crate::tenant::secondary`].
SecondaryUploads,

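The worker-thread hunk above only changes the fallback floor (2 on one side, 1 on the other): an explicit TOKIO_WORKER_THREADS value wins, otherwise the CPU count is used, clamped to the floor. A stand-alone sketch of that rule; note the real code panics on an unparsable value, while this sketch ignores it:

    fn worker_threads(env_value: Option<&str>, num_cpus: usize, floor: usize) -> usize {
        env_value
            .and_then(|s| s.parse::<usize>().ok())
            .unwrap_or_else(|| usize::max(floor, num_cpus))
    }

    fn main() {
        assert_eq!(worker_threads(Some("8"), 4, 2), 8); // explicit override wins
        assert_eq!(worker_threads(None, 1, 2), 2);      // floor of 2
        assert_eq!(worker_threads(None, 1, 1), 1);      // floor of 1
        println!("ok");
    }
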
@@ -33,12 +33,9 @@ use tracing::*;
|
||||
use utils::backoff;
|
||||
use utils::completion;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::failpoint_support;
|
||||
use utils::fs_ext;
|
||||
use utils::sync::gate::Gate;
|
||||
use utils::sync::gate::GateGuard;
|
||||
use utils::timeout::timeout_cancellable;
|
||||
use utils::timeout::TimeoutCancellableError;
|
||||
|
||||
use self::config::AttachedLocationConfig;
|
||||
use self::config::AttachmentMode;
|
||||
@@ -56,14 +53,13 @@ use self::timeline::uninit::TimelineUninitMark;
|
||||
use self::timeline::uninit::UninitializedTimeline;
|
||||
use self::timeline::EvictionTaskTenantState;
|
||||
use self::timeline::TimelineResources;
|
||||
use self::timeline::WaitLsnError;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::deletion_queue::DeletionQueueError;
|
||||
use crate::import_datadir;
|
||||
use crate::is_uninit_mark;
|
||||
use crate::metrics::TENANT;
|
||||
use crate::metrics::TENANT_ACTIVATION;
|
||||
use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC};
|
||||
use crate::repository::GcResult;
|
||||
use crate::task_mgr;
|
||||
@@ -230,7 +226,7 @@ pub struct Tenant {
|
||||
|
||||
/// The value creation timestamp, used to measure activation delay, see:
|
||||
/// <https://github.com/neondatabase/neon/issues/4025>
|
||||
constructed_at: Instant,
|
||||
loading_started_at: Instant,
|
||||
|
||||
state: watch::Sender<TenantState>,
|
||||
|
||||
@@ -280,11 +276,6 @@ pub struct Tenant {
|
||||
|
||||
eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
|
||||
|
||||
/// If the tenant is in Activating state, notify this to encourage it
|
||||
/// to proceed to Active as soon as possible, rather than waiting for lazy
|
||||
/// background warmup.
|
||||
pub(crate) activate_now_sem: tokio::sync::Semaphore,
|
||||
|
||||
pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
|
||||
|
||||
// Cancellation token fires when we have entered shutdown(). This is a parent of
|
||||
@@ -596,9 +587,10 @@ impl Tenant {
|
||||
mode: SpawnMode,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Tenant>> {
|
||||
// TODO(sharding): make WalRedoManager shard-aware
|
||||
let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
tenant_shard_id.tenant_id,
|
||||
)));
|
||||
|
||||
let TenantSharedResources {
|
||||
@@ -630,14 +622,6 @@ impl Tenant {
|
||||
"attach tenant",
|
||||
false,
|
||||
async move {
|
||||
// Is this tenant being spawned as part of process startup?
|
||||
let starting_up = init_order.is_some();
|
||||
scopeguard::defer! {
|
||||
if starting_up {
|
||||
TENANT.startup_complete.inc();
|
||||
}
|
||||
}
|
||||
|
||||
// Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state.
|
||||
let make_broken =
|
||||
|t: &Tenant, err: anyhow::Error| {
|
||||
@@ -664,62 +648,8 @@ impl Tenant {
|
||||
.as_mut()
|
||||
.and_then(|x| x.initial_tenant_load_remote.take());
|
||||
|
||||
enum AttachType<'a> {
|
||||
// During pageserver startup, we are attaching this tenant lazily in the background
|
||||
Warmup(tokio::sync::SemaphorePermit<'a>),
|
||||
// During pageserver startup, we are attaching this tenant as soon as we can,
|
||||
// because a client tried to access it.
|
||||
OnDemand,
|
||||
// During normal operations after startup, we are attaching a tenant.
|
||||
Normal,
|
||||
}
|
||||
|
||||
// Before doing any I/O, wait for either or:
|
||||
// - A client to attempt to access to this tenant (on-demand loading)
|
||||
// - A permit to become available in the warmup semaphore (background warmup)
|
||||
//
|
||||
// Some-ness of init_order is how we know if we're attaching during startup or later
|
||||
// in process lifetime.
|
||||
let attach_type = if init_order.is_some() {
|
||||
tokio::select!(
|
||||
_ = tenant_clone.activate_now_sem.acquire() => {
|
||||
tracing::info!("Activating tenant (on-demand)");
|
||||
AttachType::OnDemand
|
||||
},
|
||||
permit_result = conf.concurrent_tenant_warmup.inner().acquire() => {
|
||||
match permit_result {
|
||||
Ok(p) => {
|
||||
tracing::info!("Activating tenant (warmup)");
|
||||
AttachType::Warmup(p)
|
||||
}
|
||||
Err(_) => {
|
||||
// This is unexpected: the warmup semaphore should stay alive
|
||||
// for the lifetime of init_order. Log a warning and proceed.
|
||||
tracing::warn!("warmup_limit semaphore unexpectedly closed");
|
||||
AttachType::Normal
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
_ = tenant_clone.cancel.cancelled() => {
|
||||
// This is safe, but should be pretty rare: it is interesting if a tenant
|
||||
// stayed in Activating for such a long time that shutdown found it in
|
||||
// that state.
|
||||
tracing::info!(state=%tenant_clone.current_state(), "Tenant shut down before activation");
|
||||
return Ok(());
|
||||
},
|
||||
)
|
||||
} else {
|
||||
AttachType::Normal
|
||||
};
|
||||
|
||||
let preload_timer = TENANT.preload.start_timer();
|
||||
let preload = match mode {
|
||||
SpawnMode::Create => {
|
||||
// Don't count the skipped preload into the histogram of preload durations
|
||||
preload_timer.stop_and_discard();
|
||||
None
|
||||
},
|
||||
SpawnMode::Create => {None},
|
||||
SpawnMode::Normal => {
|
||||
match &remote_storage {
|
||||
Some(remote_storage) => Some(
|
||||
@@ -729,11 +659,7 @@ impl Tenant {
|
||||
tracing::info_span!(parent: None, "attach_preload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()),
|
||||
)
|
||||
.await {
|
||||
Ok(p) => {
|
||||
preload_timer.observe_duration();
|
||||
p
|
||||
}
|
||||
,
|
||||
Ok(p) => p,
|
||||
Err(e) => {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(e));
|
||||
return Ok(());
|
||||
@@ -795,43 +721,15 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
// We will time the duration of the attach phase unless this is a creation (attach will do no work)
|
||||
let attach_timer = match mode {
|
||||
SpawnMode::Create => None,
|
||||
SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
|
||||
};
|
||||
match tenant_clone.attach(preload, &ctx).await {
|
||||
Ok(()) => {
|
||||
info!("attach finished, activating");
|
||||
if let Some(t)= attach_timer {t.observe_duration();}
|
||||
tenant_clone.activate(broker_client, None, &ctx);
|
||||
}
|
||||
Err(e) => {
|
||||
if let Some(t)= attach_timer {t.observe_duration();}
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(e));
|
||||
}
|
||||
}
|
||||
|
||||
// If we are doing an opportunistic warmup attachment at startup, initialize
|
||||
// logical size at the same time. This is better than starting a bunch of idle tenants
|
||||
// with cold caches and then coming back later to initialize their logical sizes.
|
||||
//
|
||||
// It also prevents the warmup proccess competing with the concurrency limit on
|
||||
// logical size calculations: if logical size calculation semaphore is saturated,
|
||||
// then warmup will wait for that before proceeding to the next tenant.
|
||||
if let AttachType::Warmup(_permit) = attach_type {
|
||||
let mut futs = FuturesUnordered::new();
|
||||
let timelines: Vec<_> = tenant_clone.timelines.lock().unwrap().values().cloned().collect();
|
||||
for t in timelines {
|
||||
futs.push(t.await_initial_logical_size())
|
||||
}
|
||||
tracing::info!("Waiting for initial logical sizes while warming up...");
|
||||
while futs.next().await.is_some() {
|
||||
|
||||
}
|
||||
tracing::info!("Warm-up complete");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
.instrument({
|
||||
@@ -891,7 +789,7 @@ impl Tenant {
|
||||
) -> anyhow::Result<()> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
failpoint_support::sleep_millis_async!("before-attaching-tenant");
|
||||
crate::failpoint_support::sleep_millis_async!("before-attaching-tenant");
|
||||
|
||||
let preload = match preload {
|
||||
Some(p) => p,
|
||||
@@ -1003,7 +901,7 @@ impl Tenant {
|
||||
// IndexPart is the source of truth.
|
||||
self.clean_up_timelines(&existent_timelines)?;
|
||||
|
||||
failpoint_support::sleep_millis_async!("attach-before-activate");
|
||||
crate::failpoint_support::sleep_millis_async!("attach-before-activate");
|
||||
|
||||
info!("Done");
|
||||
|
||||
@@ -1145,9 +1043,10 @@ impl Tenant {
|
||||
tenant_shard_id: TenantShardId,
|
||||
reason: String,
|
||||
) -> Arc<Tenant> {
|
||||
// TODO(sharding): make WalRedoManager shard-aware
|
||||
let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
tenant_shard_id.tenant_id,
|
||||
)));
|
||||
Arc::new(Tenant::new(
|
||||
TenantState::Broken {
|
||||
@@ -1552,10 +1451,6 @@ impl Tenant {
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn list_timeline_ids(&self) -> Vec<TimelineId> {
|
||||
self.timelines.lock().unwrap().keys().cloned().collect()
|
||||
}
|
||||
|
||||
/// This is used to create the initial 'main' timeline during bootstrapping,
|
||||
/// or when importing a new base backup. The caller is expected to load an
|
||||
/// initial image of the datadir to the new timeline after this.
|
||||
@@ -1759,15 +1654,7 @@ impl Tenant {
|
||||
// decoding the new WAL might need to look up previous pages, relation
|
||||
// sizes etc. and that would get confused if the previous page versions
|
||||
// are not in the repository yet.
|
||||
ancestor_timeline
|
||||
.wait_lsn(*lsn, ctx)
|
||||
.await
|
||||
.map_err(|e| match e {
|
||||
e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
|
||||
CreateTimelineError::AncestorLsn(anyhow::anyhow!(e))
|
||||
}
|
||||
WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown,
|
||||
})?;
|
||||
ancestor_timeline.wait_lsn(*lsn, ctx).await?;
|
||||
}
|
||||
|
||||
self.branch_timeline(
|
||||
@@ -1809,15 +1696,6 @@ impl Tenant {
|
||||
Ok(loaded_timeline)
|
||||
}
|
||||
|
||||
pub(crate) async fn delete_timeline(
|
||||
self: Arc<Self>,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
DeleteTimelineFlow::run(&self, timeline_id, false).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// perform one garbage collection iteration, removing old data files from disk.
|
||||
/// this function is periodically called by gc task.
|
||||
/// also it can be explicitly requested through page server api 'do_gc' command.
|
||||
@@ -1979,7 +1857,7 @@ impl Tenant {
|
||||
);
|
||||
*current_state = TenantState::Active;
|
||||
|
||||
let elapsed = self.constructed_at.elapsed();
|
||||
let elapsed = self.loading_started_at.elapsed();
|
||||
let total_timelines = timelines_accessor.len();
|
||||
|
||||
// log a lot of stuff, because some tenants sometimes suffer from user-visible
|
||||
@@ -1994,7 +1872,7 @@ impl Tenant {
|
||||
"activation attempt finished"
|
||||
);
|
||||
|
||||
TENANT.activation.observe(elapsed.as_secs_f64());
|
||||
TENANT_ACTIVATION.observe(elapsed.as_secs_f64());
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -2249,41 +2127,18 @@ impl Tenant {
|
||||
self.state.subscribe()
|
||||
}
|
||||
|
||||
/// The activate_now semaphore is initialized with zero units. As soon as
|
||||
/// we add a unit, waiters will be able to acquire a unit and proceed.
|
||||
pub(crate) fn activate_now(&self) {
|
||||
self.activate_now_sem.add_permits(1);
|
||||
}
|
||||
|
||||
pub(crate) async fn wait_to_become_active(
|
||||
&self,
|
||||
timeout: Duration,
|
||||
) -> Result<(), GetActiveTenantError> {
|
||||
pub(crate) async fn wait_to_become_active(&self) -> Result<(), GetActiveTenantError> {
|
||||
let mut receiver = self.state.subscribe();
|
||||
loop {
|
||||
let current_state = receiver.borrow_and_update().clone();
|
||||
match current_state {
|
||||
TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => {
|
||||
// in these states, there's a chance that we can reach ::Active
|
||||
self.activate_now();
|
||||
match timeout_cancellable(timeout, &self.cancel, receiver.changed()).await {
|
||||
Ok(r) => {
|
||||
r.map_err(
|
||||
|_e: tokio::sync::watch::error::RecvError|
|
||||
// Tenant existed but was dropped: report it as non-existent
|
||||
GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_shard_id.tenant_id))
|
||||
)?
|
||||
}
|
||||
Err(TimeoutCancellableError::Cancelled) => {
|
||||
return Err(GetActiveTenantError::Cancelled);
|
||||
}
|
||||
Err(TimeoutCancellableError::Timeout) => {
|
||||
return Err(GetActiveTenantError::WaitForActiveTimeout {
|
||||
latest_state: Some(self.current_state()),
|
||||
wait_time: timeout,
|
||||
});
|
||||
}
|
||||
}
|
||||
receiver.changed().await.map_err(
|
||||
|_e: tokio::sync::watch::error::RecvError|
|
||||
// Tenant existed but was dropped: report it as non-existent
|
||||
GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_shard_id.tenant_id))
|
||||
)?;
|
||||
}
|
||||
TenantState::Active { .. } => {
|
||||
return Ok(());
|
||||
@@ -2608,7 +2463,7 @@ impl Tenant {
|
||||
conf,
|
||||
// using now here is good enough approximation to catch tenants with really long
|
||||
// activation times.
|
||||
constructed_at: Instant::now(),
|
||||
loading_started_at: Instant::now(),
|
||||
tenant_conf: Arc::new(RwLock::new(attached_conf)),
|
||||
timelines: Mutex::new(HashMap::new()),
|
||||
timelines_creating: Mutex::new(HashSet::new()),
|
||||
@@ -2620,7 +2475,6 @@ impl Tenant {
|
||||
cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
|
||||
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
|
||||
eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
|
||||
activate_now_sem: tokio::sync::Semaphore::new(0),
|
||||
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
|
||||
cancel: CancellationToken::default(),
|
||||
gate: Gate::new(format!("Tenant<{tenant_shard_id}>")),
|
||||
@@ -2847,7 +2701,9 @@ impl Tenant {
|
||||
}
|
||||
};
|
||||
|
||||
failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
|
||||
crate::failpoint_support::sleep_millis_async!(
|
||||
"gc_iteration_internal_after_getting_gc_timelines"
|
||||
);
|
||||
|
||||
// If there is nothing to GC, we don't want any messages in the INFO log.
|
||||
if !gc_timelines.is_empty() {
|
||||
@@ -3140,7 +2996,6 @@ impl Tenant {
|
||||
|
||||
/// For unit tests, make this visible so that other modules can directly create timelines
|
||||
#[cfg(test)]
|
||||
#[tracing::instrument(fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))]
|
||||
pub(crate) async fn bootstrap_timeline_test(
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
@@ -3204,7 +3059,6 @@ impl Tenant {
|
||||
storage,
|
||||
&self.tenant_shard_id,
|
||||
&existing_initdb_timeline_id,
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
.context("download initdb tar")?;
|
||||
@@ -3245,7 +3099,6 @@ impl Tenant {
|
||||
&timeline_id,
|
||||
pgdata_zstd.try_clone().await?,
|
||||
tar_zst_size,
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
},
|
||||
@@ -3253,7 +3106,9 @@ impl Tenant {
|
||||
3,
|
||||
u32::MAX,
|
||||
"persist_initdb_tar_zst",
|
||||
backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
|
||||
backoff::Cancel::new(self.cancel.clone(), || {
|
||||
anyhow::anyhow!("initdb upload cancelled")
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
|
||||
|
||||
@@ -46,8 +46,6 @@ pub mod defaults {
|
||||
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
|
||||
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
|
||||
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
|
||||
|
||||
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
|
||||
@@ -48,9 +48,6 @@ pub(crate) enum DeleteTenantError {
|
||||
#[error("Timeline {0}")]
|
||||
Timeline(#[from] DeleteTimelineError),
|
||||
|
||||
#[error("Cancelled")]
|
||||
Cancelled,
|
||||
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
@@ -74,7 +71,6 @@ async fn create_remote_delete_mark(
|
||||
conf: &PageServerConf,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
|
||||
|
||||
@@ -91,7 +87,8 @@ async fn create_remote_delete_mark(
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"mark_upload",
|
||||
backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
|
||||
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
|
||||
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
|
||||
)
|
||||
.await
|
||||
.context("mark_upload")?;
|
||||
@@ -173,7 +170,6 @@ async fn remove_tenant_remote_delete_mark(
|
||||
conf: &PageServerConf,
|
||||
remote_storage: Option<&GenericRemoteStorage>,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
if let Some(remote_storage) = remote_storage {
|
||||
let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
|
||||
@@ -183,7 +179,8 @@ async fn remove_tenant_remote_delete_mark(
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"remove_tenant_remote_delete_mark",
|
||||
backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
|
||||
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
|
||||
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
|
||||
)
|
||||
.await
|
||||
.context("remove_tenant_remote_delete_mark")?;
|
||||
@@ -325,15 +322,9 @@ impl DeleteTenantFlow {
// Though sounds scary, different mark name?
// Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
if let Some(remote_storage) = &remote_storage {
create_remote_delete_mark(
conf,
remote_storage,
&tenant.tenant_shard_id,
// Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
&CancellationToken::new(),
)
.await
.context("remote_mark")?
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id)
.await
.context("remote_mark")?
}

fail::fail_point!("tenant-delete-before-create-local-mark", |_| {

@@ -533,14 +524,8 @@ impl DeleteTenantFlow {
.context("timelines dir not empty")?;
}

remove_tenant_remote_delete_mark(
conf,
remote_storage.as_ref(),
&tenant.tenant_shard_id,
// Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
&CancellationToken::new(),
)
.await?;
remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_shard_id)
.await?;

fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
Err(anyhow::anyhow!(

@@ -588,7 +573,7 @@ impl DeleteTenantFlow {
}
break;
}
TenantsMapRemoveResult::Occupied(TenantSlot::Secondary(_)) => {
TenantsMapRemoveResult::Occupied(TenantSlot::Secondary) => {
// This is unexpected: this secondary tenants should not have been created, and we
// are not in a position to shut it down from here.
tracing::warn!("Tenant transitioned to secondary mode while deleting!");
@@ -28,7 +28,7 @@ use crate::control_plane_client::{
ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
};
use crate::deletion_queue::DeletionQueueClient;
use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
use crate::metrics::TENANT_MANAGER as METRICS;
use crate::task_mgr::{self, TaskKind};
use crate::tenant::config::{
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, TenantConfOpt,

@@ -44,7 +44,7 @@ use utils::generation::Generation;
use utils::id::{TenantId, TimelineId};

use super::delete::DeleteTenantError;
use super::secondary::SecondaryTenant;
use super::timeline::delete::DeleteTimelineFlow;
use super::TenantSharedResources;

/// For a tenant that appears in TenantsMap, it may either be

@@ -58,7 +58,7 @@ use super::TenantSharedResources;
/// having a properly acquired generation (Secondary doesn't need a generation)
pub(crate) enum TenantSlot {
Attached(Arc<Tenant>),
Secondary(Arc<SecondaryTenant>),
Secondary,
/// In this state, other administrative operations acting on the TenantId should
/// block, or return a retry indicator equivalent to HTTP 503.
InProgress(utils::completion::Barrier),

@@ -68,7 +68,7 @@ impl std::fmt::Debug for TenantSlot {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::Attached(tenant) => write!(f, "Attached({})", tenant.current_state()),
Self::Secondary(_) => write!(f, "Secondary"),
Self::Secondary => write!(f, "Secondary"),
Self::InProgress(_) => write!(f, "InProgress"),
}
}

@@ -79,7 +79,7 @@ impl TenantSlot {
fn get_attached(&self) -> Option<&Arc<Tenant>> {
match self {
Self::Attached(t) => Some(t),
Self::Secondary(_) => None,
Self::Secondary => None,
Self::InProgress(_) => None,
}
}

@@ -131,7 +131,7 @@ impl TenantsMap {

/// A page service client sends a TenantId, and to look up the correct Tenant we must
/// resolve this to a fully qualified TenantShardId.
fn resolve_attached_shard(
fn resolve_shard(
&self,
tenant_id: &TenantId,
selector: ShardSelector,
@@ -141,27 +141,25 @@ impl TenantsMap {
TenantsMap::Initializing => None,
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
// Ignore all slots that don't contain an attached tenant
let tenant = match &slot.1 {
TenantSlot::Attached(t) => t,
_ => continue,
};

match selector {
ShardSelector::First => return Some(*slot.0),
ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
return Some(*slot.0)
}
ShardSelector::Page(key) => {
// First slot we see for this tenant, calculate the expected shard number
// for the key: we will use this for checking if this and subsequent
// slots contain the key, rather than recalculating the hash each time.
if want_shard.is_none() {
want_shard = Some(tenant.shard_identity.get_shard_number(&key));
}
if let Some(tenant) = slot.1.get_attached() {
// First slot we see for this tenant, calculate the expected shard number
// for the key: we will use this for checking if this and subsequent
// slots contain the key, rather than recalculating the hash each time.
if want_shard.is_none() {
want_shard = Some(tenant.shard_identity.get_shard_number(&key));
}

if Some(tenant.shard_identity.number) == want_shard {
return Some(*slot.0);
if Some(tenant.shard_identity.number) == want_shard {
return Some(*slot.0);
}
} else {
continue;
}
}
_ => continue,
@@ -432,13 +430,6 @@ pub async fn init_tenant_mgr(
let tenant_generations =
init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;

tracing::info!(
"Attaching {} tenants at startup, warming up {} at a time",
tenant_configs.len(),
conf.concurrent_tenant_warmup.initial_permits()
);
TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64);

// Construct `Tenant` objects and start them running
for (tenant_shard_id, location_conf) in tenant_configs {
let tenant_dir_path = conf.tenant_path(&tenant_shard_id);

@@ -467,18 +458,12 @@ pub async fn init_tenant_mgr(
*gen
} else {
match &location_conf.mode {
LocationMode::Secondary(secondary_config) => {
LocationMode::Secondary(_) => {
// We do not require the control plane's permission for secondary mode
// tenants, because they do no remote writes and hence require no
// generation number
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode");
tenants.insert(
tenant_shard_id,
TenantSlot::Secondary(SecondaryTenant::new(
tenant_shard_id,
secondary_config,
)),
);
tenants.insert(tenant_shard_id, TenantSlot::Secondary);
}
LocationMode::Attached(_) => {
// TODO: augment re-attach API to enable the control plane to

@@ -523,7 +508,10 @@ pub async fn init_tenant_mgr(
&ctx,
) {
Ok(tenant) => {
tenants.insert(tenant_shard_id, TenantSlot::Attached(tenant));
tenants.insert(
TenantShardId::unsharded(tenant.tenant_id()),
TenantSlot::Attached(tenant),
);
}
Err(e) => {
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
@@ -670,14 +658,8 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {

total_attached += 1;
}
TenantSlot::Secondary(state) => {
// We don't need to wait for this individually per-tenant: the
// downloader task will be waited on eventually, this cancel
// is just to encourage it to drop out if it is doing work
// for this tenant right now.
state.cancel.cancel();

shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary(state));
TenantSlot::Secondary => {
shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary);
}
TenantSlot::InProgress(notify) => {
// InProgress tenants are not visible in TenantsMap::ShuttingDown: we will
@@ -860,26 +842,21 @@ impl TenantManager {
Some(TenantSlot::InProgress(_)) => {
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
}
None | Some(TenantSlot::Secondary(_)) => {
None | Some(TenantSlot::Secondary) => {
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
}
}
}

pub(crate) fn get_secondary_tenant_shard(
pub(crate) async fn delete_timeline(
&self,
tenant_shard_id: TenantShardId,
) -> Option<Arc<SecondaryTenant>> {
let locked = self.tenants.read().unwrap();

let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
.ok()
.flatten();

match peek_slot {
Some(TenantSlot::Secondary(s)) => Some(s.clone()),
_ => None,
}
timeline_id: TimelineId,
_ctx: &RequestContext,
) -> Result<(), DeleteTimelineError> {
let tenant = self.get_attached_tenant_shard(tenant_shard_id, true)?;
DeleteTimelineFlow::run(&tenant, timeline_id, false).await?;
Ok(())
}

#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
@@ -893,15 +870,10 @@ impl TenantManager {
debug_assert_current_span_has_tenant_id();
info!("configuring tenant location to state {new_location_config:?}");

enum FastPathModified {
Attached(Arc<Tenant>),
Secondary(Arc<SecondaryTenant>),
}

// Special case fast-path for updates to existing slots: if our upsert is only updating configuration,
// Special case fast-path for updates to Tenant: if our upsert is only updating configuration,
// then we do not need to set the slot to InProgress, we can just call into the
// existng tenant.
let fast_path_taken = {
let modify_tenant = {
let locked = self.tenants.read().unwrap();
let peek_slot =
tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?;

@@ -915,19 +887,12 @@ impl TenantManager {
new_location_config.clone(),
)?);

Some(FastPathModified::Attached(tenant.clone()))
Some(tenant.clone())
} else {
// Different generations, fall through to general case
None
}
}
(
LocationMode::Secondary(secondary_conf),
Some(TenantSlot::Secondary(secondary_tenant)),
) => {
secondary_tenant.set_config(secondary_conf);
Some(FastPathModified::Secondary(secondary_tenant.clone()))
}
_ => {
// Not an Attached->Attached transition, fall through to general case
None
@@ -936,51 +901,34 @@ impl TenantManager {
};

// Fast-path continued: having dropped out of the self.tenants lock, do the async
// phase of writing config and/or waiting for flush, before returning.
match fast_path_taken {
Some(FastPathModified::Attached(tenant)) => {
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
.await
.map_err(SetNewTenantConfigError::Persist)?;

// Transition to AttachedStale means we may well hold a valid generation
// still, and have been requested to go stale as part of a migration. If
// the caller set `flush`, then flush to remote storage.
if let LocationMode::Attached(AttachedLocationConfig {
generation: _,
attach_mode: AttachmentMode::Stale,
}) = &new_location_config.mode
{
if let Some(flush_timeout) = flush {
match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await {
Ok(Err(e)) => {
return Err(e);
}
Ok(Ok(_)) => return Ok(()),
Err(_) => {
tracing::warn!(
// phase of waiting for flush, before returning.
if let Some(tenant) = modify_tenant {
// Transition to AttachedStale means we may well hold a valid generation
// still, and have been requested to go stale as part of a migration. If
// the caller set `flush`, then flush to remote storage.
if let LocationMode::Attached(AttachedLocationConfig {
generation: _,
attach_mode: AttachmentMode::Stale,
}) = &new_location_config.mode
{
if let Some(flush_timeout) = flush {
match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await {
Ok(Err(e)) => {
return Err(e);
}
Ok(Ok(_)) => return Ok(()),
Err(_) => {
tracing::warn!(
timeout_ms = flush_timeout.as_millis(),
"Timed out waiting for flush to remote storage, proceeding anyway."
)
}
}
}
}
}

return Ok(());
}
Some(FastPathModified::Secondary(_secondary_tenant)) => {
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
.await
.map_err(SetNewTenantConfigError::Persist)?;

return Ok(());
}
None => {
// Proceed with the general case procedure, where we will shutdown & remove any existing
// slot contents and replace with a fresh one
}
};
return Ok(());
}

// General case for upserts to TenantsMap, excluding the case above: we will substitute an
// InProgress value to the slot while we make whatever changes are required. The state for
@@ -989,73 +937,65 @@ impl TenantManager {
// not do significant I/O, and shutdowns should be prompt via cancellation tokens.
let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;

match slot_guard.get_old_value() {
Some(TenantSlot::Attached(tenant)) => {
// The case where we keep a Tenant alive was covered above in the special case
// for Attached->Attached transitions in the same generation. By this point,
// if we see an attached tenant we know it will be discarded and should be
// shut down.
let (_guard, progress) = utils::completion::channel();
if let Some(TenantSlot::Attached(tenant)) = slot_guard.get_old_value() {
// The case where we keep a Tenant alive was covered above in the special case
// for Attached->Attached transitions in the same generation. By this point,
// if we see an attached tenant we know it will be discarded and should be
// shut down.
let (_guard, progress) = utils::completion::channel();

match tenant.get_attach_mode() {
AttachmentMode::Single | AttachmentMode::Multi => {
// Before we leave our state as the presumed holder of the latest generation,
// flush any outstanding deletions to reduce the risk of leaking objects.
self.resources.deletion_queue_client.flush_advisory()
}
AttachmentMode::Stale => {
// If we're stale there's not point trying to flush deletions
}
};

info!("Shutting down attached tenant");
match tenant.shutdown(progress, false).await {
Ok(()) => {}
Err(barrier) => {
info!("Shutdown already in progress, waiting for it to complete");
barrier.wait().await;
}
match tenant.get_attach_mode() {
AttachmentMode::Single | AttachmentMode::Multi => {
// Before we leave our state as the presumed holder of the latest generation,
// flush any outstanding deletions to reduce the risk of leaking objects.
self.resources.deletion_queue_client.flush_advisory()
}
AttachmentMode::Stale => {
// If we're stale there's not point trying to flush deletions
}
};

info!("Shutting down attached tenant");
match tenant.shutdown(progress, false).await {
Ok(()) => {}
Err(barrier) => {
info!("Shutdown already in progress, waiting for it to complete");
barrier.wait().await;
}
slot_guard.drop_old_value().expect("We just shut it down");
}
Some(TenantSlot::Secondary(state)) => {
info!("Shutting down secondary tenant");
state.shutdown().await;
}
Some(TenantSlot::InProgress(_)) => {
// This should never happen: acquire_slot should error out
// if the contents of a slot were InProgress.
anyhow::bail!("Acquired an InProgress slot, this is a bug.")
}
None => {
// Slot was vacant, nothing needs shutting down.
}
slot_guard.drop_old_value().expect("We just shut it down");
}

let tenant_path = self.conf.tenant_path(&tenant_shard_id);
let timelines_path = self.conf.timelines_path(&tenant_shard_id);

// Directory structure is the same for attached and secondary modes:
// create it if it doesn't exist. Timeline load/creation expects the
// timelines/ subdir to already exist.
//
// Does not need to be fsync'd because local storage is just a cache.
tokio::fs::create_dir_all(&timelines_path)
.await
.with_context(|| format!("Creating {timelines_path}"))?;

// Before activating either secondary or attached mode, persist the
// configuration, so that on restart we will re-attach (or re-start
// secondary) on the tenant.
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
.await
.map_err(SetNewTenantConfigError::Persist)?;

let new_slot = match &new_location_config.mode {
LocationMode::Secondary(secondary_config) => {
TenantSlot::Secondary(SecondaryTenant::new(tenant_shard_id, secondary_config))
LocationMode::Secondary(_) => {
// Directory doesn't need to be fsync'd because if we crash it can
// safely be recreated next time this tenant location is configured.
tokio::fs::create_dir_all(&tenant_path)
.await
.with_context(|| format!("Creating {tenant_path}"))?;

Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
.await
.map_err(SetNewTenantConfigError::Persist)?;

TenantSlot::Secondary
}
LocationMode::Attached(_attach_config) => {
let timelines_path = self.conf.timelines_path(&tenant_shard_id);

// Directory doesn't need to be fsync'd because we do not depend on
// it to exist after crashes: it may be recreated when tenant is
// re-attached, see https://github.com/neondatabase/neon/issues/5550
tokio::fs::create_dir_all(&tenant_path)
.await
.with_context(|| format!("Creating {timelines_path}"))?;

Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
.await
.map_err(SetNewTenantConfigError::Persist)?;

let shard_identity = new_location_config.shard;
let tenant = tenant_spawn(
self.conf,
@@ -1167,95 +1107,6 @@ impl TenantManager {
.collect(),
}
}
// Do some synchronous work for all tenant slots in Secondary state. The provided
// callback should be small and fast, as it will be called inside the global
// TenantsMap lock.
pub(crate) fn foreach_secondary_tenants<F>(&self, mut func: F)
where
// TODO: let the callback return a hint to drop out of the loop early
F: FnMut(&TenantShardId, &Arc<SecondaryTenant>),
{
let locked = self.tenants.read().unwrap();

let map = match &*locked {
TenantsMap::Initializing | TenantsMap::ShuttingDown(_) => return,
TenantsMap::Open(m) => m,
};

for (tenant_id, slot) in map {
if let TenantSlot::Secondary(state) = slot {
// Only expose secondary tenants that are not currently shutting down
if !state.cancel.is_cancelled() {
func(tenant_id, state)
}
}
}
}

pub(crate) async fn delete_tenant(
&self,
tenant_shard_id: TenantShardId,
activation_timeout: Duration,
) -> Result<(), DeleteTenantError> {
// We acquire a SlotGuard during this function to protect against concurrent
// changes while the ::prepare phase of DeleteTenantFlow executes, but then
// have to return the Tenant to the map while the background deletion runs.
//
// TODO: refactor deletion to happen outside the lifetime of a Tenant.
// Currently, deletion requires a reference to the tenants map in order to
// keep the Tenant in the map until deletion is complete, and then remove
// it at the end.
//
// See https://github.com/neondatabase/neon/issues/5080

let slot_guard =
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;

// unwrap is safe because we used MustExist mode when acquiring
let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
TenantSlot::Attached(tenant) => tenant.clone(),
_ => {
// Express "not attached" as equivalent to "not found"
return Err(DeleteTenantError::NotAttached);
}
};

match tenant.current_state() {
TenantState::Broken { .. } | TenantState::Stopping { .. } => {
// If a tenant is broken or stopping, DeleteTenantFlow can
// handle it: broken tenants proceed to delete, stopping tenants
// are checked for deletion already in progress.
}
_ => {
tenant
.wait_to_become_active(activation_timeout)
.await
.map_err(|e| match e {
GetActiveTenantError::WillNotBecomeActive(_) => {
DeleteTenantError::InvalidState(tenant.current_state())
}
GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled,
GetActiveTenantError::NotFound(_) => DeleteTenantError::NotAttached,
GetActiveTenantError::WaitForActiveTimeout {
latest_state: _latest_state,
wait_time: _wait_time,
} => DeleteTenantError::InvalidState(tenant.current_state()),
})?;
}
}

let result = DeleteTenantFlow::run(
self.conf,
self.resources.remote_storage.clone(),
&TENANTS,
tenant,
)
.await;

// The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow
slot_guard.revert();
result
}
}

#[derive(Debug, thiserror::Error)]
@@ -1305,7 +1156,7 @@ pub(crate) fn get_tenant(
Some(TenantSlot::InProgress(_)) => {
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
}
None | Some(TenantSlot::Secondary(_)) => {
None | Some(TenantSlot::Secondary) => {
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
}
}
@@ -1357,11 +1208,9 @@ pub(crate) async fn get_active_tenant_with_timeout(
let locked = TENANTS.read().unwrap();

// Resolve TenantId to TenantShardId
let tenant_shard_id = locked
.resolve_attached_shard(&tenant_id, shard_selector)
.ok_or(GetActiveTenantError::NotFound(GetTenantError::NotFound(
tenant_id,
)))?;
let tenant_shard_id = locked.resolve_shard(&tenant_id, shard_selector).ok_or(
GetActiveTenantError::NotFound(GetTenantError::NotFound(tenant_id)),
)?;

let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
.map_err(GetTenantError::MapState)?;
@@ -1372,13 +1221,10 @@ pub(crate) async fn get_active_tenant_with_timeout(
// Fast path: we don't need to do any async waiting.
return Ok(tenant.clone());
}
_ => {
tenant.activate_now();
(WaitFor::Tenant(tenant.clone()), tenant_shard_id)
}
_ => (WaitFor::Tenant(tenant.clone()), tenant_shard_id),
}
}
Some(TenantSlot::Secondary(_)) => {
Some(TenantSlot::Secondary) => {
return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
tenant_id,
)))
@@ -1429,10 +1275,63 @@ pub(crate) async fn get_active_tenant_with_timeout(
};

tracing::debug!("Waiting for tenant to enter active state...");
tenant
.wait_to_become_active(deadline.duration_since(Instant::now()))
.await?;
Ok(tenant)
match timeout_cancellable(
deadline.duration_since(Instant::now()),
cancel,
tenant.wait_to_become_active(),
)
.await
{
Ok(Ok(())) => Ok(tenant),
Ok(Err(e)) => Err(e),
Err(TimeoutCancellableError::Timeout) => {
let latest_state = tenant.current_state();
if latest_state == TenantState::Active {
Ok(tenant)
} else {
Err(GetActiveTenantError::WaitForActiveTimeout {
latest_state: Some(latest_state),
wait_time: timeout,
})
}
}
Err(TimeoutCancellableError::Cancelled) => Err(GetActiveTenantError::Cancelled),
}
}

pub(crate) async fn delete_tenant(
conf: &'static PageServerConf,
remote_storage: Option<GenericRemoteStorage>,
tenant_shard_id: TenantShardId,
) -> Result<(), DeleteTenantError> {
// We acquire a SlotGuard during this function to protect against concurrent
// changes while the ::prepare phase of DeleteTenantFlow executes, but then
// have to return the Tenant to the map while the background deletion runs.
//
// TODO: refactor deletion to happen outside the lifetime of a Tenant.
// Currently, deletion requires a reference to the tenants map in order to
// keep the Tenant in the map until deletion is complete, and then remove
// it at the end.
//
// See https://github.com/neondatabase/neon/issues/5080

// TODO(sharding): make delete API sharding-aware
let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;

// unwrap is safe because we used MustExist mode when acquiring
let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
TenantSlot::Attached(tenant) => tenant.clone(),
_ => {
// Express "not attached" as equivalent to "not found"
return Err(DeleteTenantError::NotAttached);
}
};

let result = DeleteTenantFlow::run(conf, remote_storage, &TENANTS, tenant).await;

// The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow
slot_guard.revert();
result
}

#[derive(Debug, thiserror::Error)]
@@ -1642,7 +1541,7 @@ pub(crate) async fn list_tenants() -> Result<Vec<(TenantShardId, TenantState)>,
Ok(m.iter()
.filter_map(|(id, tenant)| match tenant {
TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())),
TenantSlot::Secondary(_) => None,
TenantSlot::Secondary => None,
TenantSlot::InProgress(_) => None,
})
.collect())
@@ -1899,7 +1798,11 @@ impl SlotGuard {
fn old_value_is_shutdown(&self) -> bool {
match self.old_value.as_ref() {
Some(TenantSlot::Attached(tenant)) => tenant.gate.close_complete(),
Some(TenantSlot::Secondary(secondary_tenant)) => secondary_tenant.gate.close_complete(),
Some(TenantSlot::Secondary) => {
// TODO: when adding secondary mode tenants, this will check for shutdown
// in the same way that we do for `Tenant` above
true
}
Some(TenantSlot::InProgress(_)) => {
// A SlotGuard cannot be constructed for a slot that was already InProgress
unreachable!()
@@ -2109,19 +2012,26 @@ where
let mut slot_guard =
tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?;

// allow pageserver shutdown to await for our completion
let (_guard, progress) = completion::channel();

// The SlotGuard allows us to manipulate the Tenant object without fear of some
// concurrent API request doing something else for the same tenant ID.
let attached_tenant = match slot_guard.get_old_value() {
Some(TenantSlot::Attached(tenant)) => {
Some(TenantSlot::Attached(t)) => Some(t),
_ => None,
};

// allow pageserver shutdown to await for our completion
let (_guard, progress) = completion::channel();

// If the tenant was attached, shut it down gracefully. For secondary
// locations this part is not necessary
match &attached_tenant {
Some(attached_tenant) => {
// whenever we remove a tenant from memory, we don't want to flush and wait for upload
let freeze_and_flush = false;

// shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
// that we can continue safely to cleanup.
match tenant.shutdown(progress, freeze_and_flush).await {
match attached_tenant.shutdown(progress, freeze_and_flush).await {
Ok(()) => {}
Err(_other) => {
// if pageserver shutdown or other detach/ignore is already ongoing, we don't want to

@@ -2130,19 +2040,11 @@ where
return Err(TenantStateError::IsStopping(tenant_shard_id.tenant_id));
}
}
Some(tenant)
}
Some(TenantSlot::Secondary(secondary_state)) => {
tracing::info!("Shutting down in secondary mode");
secondary_state.shutdown().await;
None
None => {
// Nothing to wait on when not attached, proceed.
}
Some(TenantSlot::InProgress(_)) => {
// Acquiring a slot guarantees its old value was not InProgress
unreachable!();
}
None => None,
};
}

match tenant_cleanup
.await
||||
Some files were not shown because too many files have changed in this diff.