mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-18 02:42:56 +00:00
Compare commits
7 Commits
proxy-forw
...
hack/compu
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
32d4e4914a | ||
|
|
d4d577e7ff | ||
|
|
f552aa05fa | ||
|
|
779badb7c5 | ||
|
|
e6eb548491 | ||
|
|
16e9eb2832 | ||
|
|
042686183b |
@@ -1,3 +1,17 @@
|
||||
# The binaries are really slow, if you compile them in 'dev' mode with the defaults.
|
||||
# Enable some optimizations even in 'dev' mode, to make tests faster. The basic
|
||||
# optimizations enabled by "opt-level=1" don't affect debuggability too much.
|
||||
#
|
||||
# See https://www.reddit.com/r/rust/comments/gvrgca/this_is_a_neat_trick_for_getting_good_runtime/
|
||||
#
|
||||
[profile.dev.package."*"]
|
||||
# Set the default for dependencies in Development mode.
|
||||
opt-level = 3
|
||||
|
||||
[profile.dev]
|
||||
# Turn on a small amount of optimization in Development mode.
|
||||
opt-level = 1
|
||||
|
||||
[build]
|
||||
# This is only present for local builds, as it will be overridden
|
||||
# by the RUSTDOCFLAGS env var in CI.
|
||||
|
||||
@@ -1,2 +0,0 @@
|
||||
[profile.default]
|
||||
slow-timeout = { period = "20s", terminate-after = 3 }
|
||||
@@ -1,28 +1,27 @@
|
||||
*
|
||||
|
||||
# Files
|
||||
!Cargo.lock
|
||||
!Cargo.toml
|
||||
!Makefile
|
||||
!rust-toolchain.toml
|
||||
!scripts/combine_control_files.py
|
||||
!scripts/ninstall.sh
|
||||
!vm-cgconfig.conf
|
||||
!Cargo.toml
|
||||
!Cargo.lock
|
||||
!Makefile
|
||||
|
||||
# Directories
|
||||
!.cargo/
|
||||
!.config/
|
||||
!compute_tools/
|
||||
!control_plane/
|
||||
!compute_tools/
|
||||
!libs/
|
||||
!neon_local/
|
||||
!pageserver/
|
||||
!patches/
|
||||
!pgxn/
|
||||
!proxy/
|
||||
!s3_scrubber/
|
||||
!safekeeper/
|
||||
!s3_scrubber/
|
||||
!storage_broker/
|
||||
!trace/
|
||||
!vendor/postgres-*/
|
||||
!vendor/postgres-v14/
|
||||
!vendor/postgres-v15/
|
||||
!vendor/postgres-v16/
|
||||
!workspace_hack/
|
||||
!neon_local/
|
||||
!scripts/ninstall.sh
|
||||
!scripts/combine_control_files.py
|
||||
!vm-cgconfig.conf
|
||||
|
||||
2
.github/actionlint.yml
vendored
2
.github/actionlint.yml
vendored
@@ -4,8 +4,6 @@ self-hosted-runner:
|
||||
- dev
|
||||
- gen3
|
||||
- large
|
||||
# Remove `macos-14` from the list after https://github.com/rhysd/actionlint/pull/392 is merged.
|
||||
- macos-14
|
||||
- small
|
||||
- us-east-2
|
||||
config-variables:
|
||||
|
||||
@@ -179,6 +179,23 @@ runs:
|
||||
aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
|
||||
fi
|
||||
|
||||
- name: Store Allure test stat in the DB
|
||||
if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
|
||||
shell: bash -euxo pipefail {0}
|
||||
env:
|
||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
REPORT_JSON_URL: ${{ steps.generate-report.outputs.report-json-url }}
|
||||
run: |
|
||||
export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR}
|
||||
|
||||
./scripts/pysync
|
||||
|
||||
poetry run python3 scripts/ingest_regress_test_result.py \
|
||||
--revision ${COMMIT_SHA} \
|
||||
--reference ${GITHUB_REF} \
|
||||
--build-type unified \
|
||||
--ingest ${WORKDIR}/report/data/suites.json
|
||||
|
||||
- name: Store Allure test stat in the DB (new)
|
||||
if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
29
.github/workflows/benchmarking.yml
vendored
29
.github/workflows/benchmarking.yml
vendored
@@ -11,7 +11,7 @@ on:
|
||||
# │ │ ┌───────────── day of the month (1 - 31)
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '0 3 * * *' # run once a day, timezone is utc
|
||||
- cron: '0 3 * * *' # run once a day, timezone is utc
|
||||
|
||||
workflow_dispatch: # adds ability to run this manually
|
||||
inputs:
|
||||
@@ -23,21 +23,6 @@ on:
|
||||
type: boolean
|
||||
description: 'Publish perf report. If not set, the report will be published only for the main branch'
|
||||
required: false
|
||||
collect_olap_explain:
|
||||
type: boolean
|
||||
description: 'Collect EXPLAIN ANALYZE for OLAP queries. If not set, EXPLAIN ANALYZE will not be collected'
|
||||
required: false
|
||||
default: false
|
||||
collect_pg_stat_statements:
|
||||
type: boolean
|
||||
description: 'Collect pg_stat_statements for OLAP queries. If not set, pg_stat_statements will not be collected'
|
||||
required: false
|
||||
default: false
|
||||
run_AWS_RDS_AND_AURORA:
|
||||
type: boolean
|
||||
description: 'AWS-RDS and AWS-AURORA normally only run on Saturday. Set this to true to run them on every workflow_dispatch'
|
||||
required: false
|
||||
default: false
|
||||
|
||||
defaults:
|
||||
run:
|
||||
@@ -128,8 +113,6 @@ jobs:
|
||||
# - neon-captest-reuse: Reusing existing project
|
||||
# - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
|
||||
# - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
|
||||
env:
|
||||
RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }}
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
|
||||
@@ -169,7 +152,7 @@ jobs:
|
||||
]
|
||||
}'
|
||||
|
||||
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
|
||||
if [ "$(date +%A)" = "Saturday" ]; then
|
||||
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
|
||||
{ "platform": "rds-aurora" }]')
|
||||
fi
|
||||
@@ -188,9 +171,9 @@ jobs:
|
||||
]
|
||||
}'
|
||||
|
||||
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
|
||||
if [ "$(date +%A)" = "Saturday" ]; then
|
||||
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
|
||||
{ "platform": "rds-aurora", "scale": "10" }]')
|
||||
{ "platform": "rds-aurora", "scale": "10" }]')
|
||||
fi
|
||||
|
||||
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||
@@ -354,8 +337,6 @@ jobs:
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 14
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }}
|
||||
TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }}
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
PLATFORM: ${{ matrix.platform }}
|
||||
@@ -418,8 +399,6 @@ jobs:
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain || 'false' }}
|
||||
TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements || 'false' }}
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
TEST_OLAP_SCALE: 10
|
||||
|
||||
|
||||
124
.github/workflows/build_and_push_docker_image.yml
vendored
124
.github/workflows/build_and_push_docker_image.yml
vendored
@@ -1,124 +0,0 @@
|
||||
name: Build and Push Docker Image
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
dockerfile-path:
|
||||
required: true
|
||||
type: string
|
||||
image-name:
|
||||
required: true
|
||||
type: string
|
||||
outputs:
|
||||
build-tools-tag:
|
||||
description: "tag generated for build tools"
|
||||
value: ${{ jobs.tag.outputs.build-tools-tag }}
|
||||
|
||||
jobs:
|
||||
check-if-build-tools-dockerfile-changed:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }}
|
||||
steps:
|
||||
- name: Check if Dockerfile.buildtools has changed
|
||||
id: dockerfile
|
||||
run: |
|
||||
if [[ "$GITHUB_EVENT_NAME" != "pull_request" ]]; then
|
||||
echo "docker_file_changed=false" >> $GITHUB_OUTPUT
|
||||
exit
|
||||
fi
|
||||
updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only)
|
||||
if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then
|
||||
echo "docker_file_changed=true" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
tag:
|
||||
runs-on: ubuntu-latest
|
||||
needs: [ check-if-build-tools-dockerfile-changed ]
|
||||
outputs:
|
||||
build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}}
|
||||
|
||||
steps:
|
||||
- name: Get buildtools tag
|
||||
env:
|
||||
DOCKERFILE_CHANGED: ${{ needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed }}
|
||||
run: |
|
||||
if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "${DOCKERFILE_CHANGED}" == "true" ]]; then
|
||||
IMAGE_TAG=$GITHUB_RUN_ID
|
||||
else
|
||||
IMAGE_TAG=pinned
|
||||
fi
|
||||
|
||||
echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
|
||||
shell: bash
|
||||
id: buildtools-tag
|
||||
|
||||
kaniko:
|
||||
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
|
||||
needs: [ tag, check-if-build-tools-dockerfile-changed ]
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container: gcr.io/kaniko-project/executor:v1.7.0-debug
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Configure ECR login
|
||||
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
|
||||
|
||||
- name: Kaniko build
|
||||
run: |
|
||||
/kaniko/executor \
|
||||
--reproducible \
|
||||
--snapshotMode=redo \
|
||||
--skip-unused-stages \
|
||||
--dockerfile ${{ inputs.dockerfile-path }} \
|
||||
--cache=true \
|
||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
|
||||
|
||||
kaniko-arm:
|
||||
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
|
||||
needs: [ tag, check-if-build-tools-dockerfile-changed ]
|
||||
runs-on: [ self-hosted, dev, arm64 ]
|
||||
container: gcr.io/kaniko-project/executor:v1.7.0-debug
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Configure ECR login
|
||||
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
|
||||
|
||||
- name: Kaniko build
|
||||
run: |
|
||||
/kaniko/executor \
|
||||
--reproducible \
|
||||
--snapshotMode=redo \
|
||||
--skip-unused-stages \
|
||||
--dockerfile ${{ inputs.dockerfile-path }} \
|
||||
--cache=true \
|
||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
|
||||
|
||||
manifest:
|
||||
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
|
||||
name: 'manifest'
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
needs:
|
||||
- tag
|
||||
- kaniko
|
||||
- kaniko-arm
|
||||
- check-if-build-tools-dockerfile-changed
|
||||
|
||||
steps:
|
||||
- name: Create manifest
|
||||
run: |
|
||||
docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} \
|
||||
--amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 \
|
||||
--amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
|
||||
|
||||
- name: Push manifest
|
||||
run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}
|
||||
142
.github/workflows/build_and_test.yml
vendored
142
.github/workflows/build_and_test.yml
vendored
@@ -21,8 +21,6 @@ env:
|
||||
COPT: '-Werror'
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
# A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
|
||||
E2E_CONCURRENCY_GROUP: ${{ github.repository }}-${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
|
||||
|
||||
jobs:
|
||||
check-permissions:
|
||||
@@ -46,19 +44,6 @@ jobs:
|
||||
|
||||
exit 1
|
||||
|
||||
cancel-previous-e2e-tests:
|
||||
needs: [ check-permissions ]
|
||||
if: github.event_name == 'pull_request'
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Cancel previous e2e-tests runs for this PR
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
gh workflow --repo neondatabase/cloud \
|
||||
run cancel-previous-in-concurrency-group.yml \
|
||||
--field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"
|
||||
|
||||
tag:
|
||||
needs: [ check-permissions ]
|
||||
@@ -89,19 +74,11 @@ jobs:
|
||||
shell: bash
|
||||
id: build-tag
|
||||
|
||||
build-buildtools-image:
|
||||
needs: [ check-permissions ]
|
||||
uses: ./.github/workflows/build_and_push_docker_image.yml
|
||||
with:
|
||||
dockerfile-path: Dockerfile.buildtools
|
||||
image-name: build-tools
|
||||
secrets: inherit
|
||||
|
||||
check-codestyle-python:
|
||||
needs: [ check-permissions, build-buildtools-image ]
|
||||
needs: [ check-permissions ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -121,20 +98,20 @@ jobs:
|
||||
- name: Install Python deps
|
||||
run: ./scripts/pysync
|
||||
|
||||
- name: Run `ruff check` to ensure code format
|
||||
run: poetry run ruff check .
|
||||
- name: Run ruff to ensure code format
|
||||
run: poetry run ruff .
|
||||
|
||||
- name: Run `ruff format` to ensure code format
|
||||
run: poetry run ruff format --check .
|
||||
- name: Run black to ensure code format
|
||||
run: poetry run black --diff --check .
|
||||
|
||||
- name: Run mypy to check types
|
||||
run: poetry run mypy .
|
||||
|
||||
check-codestyle-rust:
|
||||
needs: [ check-permissions, build-buildtools-image ]
|
||||
needs: [ check-permissions ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -198,15 +175,11 @@ jobs:
|
||||
run: cargo deny check --hide-inclusion-graph
|
||||
|
||||
build-neon:
|
||||
needs: [ check-permissions, tag, build-buildtools-image ]
|
||||
needs: [ check-permissions, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
# Raise locked memory limit for tokio-epoll-uring.
|
||||
# On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
|
||||
# io_uring will account the memory of the CQ and SQ as locked.
|
||||
# More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391
|
||||
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -226,10 +199,6 @@ jobs:
|
||||
#
|
||||
git config --global --add safe.directory ${{ github.workspace }}
|
||||
git config --global --add safe.directory ${GITHUB_WORKSPACE}
|
||||
for r in 14 15 16; do
|
||||
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
|
||||
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
|
||||
done
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
@@ -359,20 +328,16 @@ jobs:
|
||||
run: |
|
||||
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
|
||||
|
||||
- name: Run rust tests
|
||||
env:
|
||||
NEXTEST_RETRIES: 3
|
||||
- name: Run cargo test
|
||||
run: |
|
||||
for io_engine in std-fs tokio-epoll-uring ; do
|
||||
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
|
||||
done
|
||||
${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES
|
||||
|
||||
# Run separate tests for real S3
|
||||
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
|
||||
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
|
||||
export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
|
||||
export REMOTE_STORAGE_S3_REGION=eu-central-1
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_s3)'
|
||||
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
|
||||
|
||||
# Run separate tests for real Azure Blob Storage
|
||||
# XXX: replace region with `eu-central-1`-like region
|
||||
@@ -382,7 +347,7 @@ jobs:
|
||||
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
|
||||
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_azure)'
|
||||
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
|
||||
|
||||
- name: Install rust binaries
|
||||
run: |
|
||||
@@ -439,12 +404,12 @@ jobs:
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
regress-tests:
|
||||
needs: [ check-permissions, build-neon, build-buildtools-image, tag ]
|
||||
needs: [ check-permissions, build-neon ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
# for changed limits, see comments on `options:` earlier in this file
|
||||
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
# Default shared memory is 64mb
|
||||
options: --init --shm-size=512mb
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -471,25 +436,22 @@ jobs:
|
||||
env:
|
||||
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
|
||||
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
|
||||
|
||||
- name: Merge and upload coverage data
|
||||
if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
benchmarks:
|
||||
needs: [ check-permissions, build-neon, build-buildtools-image ]
|
||||
needs: [ check-permissions, build-neon ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
# for changed limits, see comments on `options:` earlier in this file
|
||||
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
# Default shared memory is 64mb
|
||||
options: --init --shm-size=512mb
|
||||
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
# the amount of groups (N) should be reflected in `extra_params: --splits N ...`
|
||||
pytest_split_group: [ 1, 2, 3, 4 ]
|
||||
build_type: [ release ]
|
||||
steps:
|
||||
@@ -503,22 +465,21 @@ jobs:
|
||||
test_selection: performance
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ github.ref_name == 'main' }}
|
||||
extra_params: --splits 4 --group ${{ matrix.pytest_split_group }}
|
||||
extra_params: --splits ${{ strategy.job-total }} --group ${{ matrix.pytest_split_group }}
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
|
||||
# XXX: no coverage data handling here, since benchmarks are run on release builds,
|
||||
# while coverage is currently collected for the debug ones
|
||||
|
||||
create-test-report:
|
||||
needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-buildtools-image ]
|
||||
needs: [ check-permissions, regress-tests, coverage-report, benchmarks ]
|
||||
if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
|
||||
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -531,6 +492,7 @@ jobs:
|
||||
with:
|
||||
store-test-results-into-db: true
|
||||
env:
|
||||
REGRESS_TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
|
||||
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
|
||||
- uses: actions/github-script@v6
|
||||
@@ -559,10 +521,11 @@ jobs:
|
||||
})
|
||||
|
||||
coverage-report:
|
||||
needs: [ check-permissions, regress-tests, build-buildtools-image ]
|
||||
needs: [ check-permissions, regress-tests ]
|
||||
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -608,6 +571,17 @@ jobs:
|
||||
--input-objects=/tmp/coverage/binaries.list \
|
||||
--format=lcov
|
||||
|
||||
- name: Upload coverage report
|
||||
id: upload-coverage-report
|
||||
env:
|
||||
BUCKET: neon-github-public-dev
|
||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
run: |
|
||||
aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://${BUCKET}/code-coverage/${COMMIT_SHA}
|
||||
|
||||
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html
|
||||
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Build coverage report NEW
|
||||
id: upload-coverage-report-new
|
||||
env:
|
||||
@@ -644,11 +618,21 @@ jobs:
|
||||
|
||||
- uses: actions/github-script@v6
|
||||
env:
|
||||
REPORT_URL: ${{ steps.upload-coverage-report.outputs.report-url }}
|
||||
REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }}
|
||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
with:
|
||||
script: |
|
||||
const { REPORT_URL_NEW, COMMIT_SHA } = process.env
|
||||
const { REPORT_URL, REPORT_URL_NEW, COMMIT_SHA } = process.env
|
||||
|
||||
await github.rest.repos.createCommitStatus({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
sha: `${COMMIT_SHA}`,
|
||||
state: 'success',
|
||||
target_url: `${REPORT_URL}`,
|
||||
context: 'Code coverage report',
|
||||
})
|
||||
|
||||
await github.rest.repos.createCommitStatus({
|
||||
owner: context.repo.owner,
|
||||
@@ -700,13 +684,12 @@ jobs:
|
||||
\"commit_hash\": \"$COMMIT_SHA\",
|
||||
\"remote_repo\": \"${{ github.repository }}\",
|
||||
\"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
|
||||
\"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
|
||||
\"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
|
||||
\"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\"
|
||||
}
|
||||
}"
|
||||
|
||||
neon-image:
|
||||
needs: [ check-permissions, build-buildtools-image, tag ]
|
||||
needs: [ check-permissions, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container: gcr.io/kaniko-project/executor:v1.9.2-debug
|
||||
defaults:
|
||||
@@ -745,7 +728,6 @@ jobs:
|
||||
--context .
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
--build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
|
||||
--build-arg TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
|
||||
--destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
|
||||
@@ -756,7 +738,7 @@ jobs:
|
||||
|
||||
compute-tools-image:
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
needs: [ check-permissions, build-buildtools-image, tag ]
|
||||
needs: [ check-permissions, tag ]
|
||||
container: gcr.io/kaniko-project/executor:v1.9.2-debug
|
||||
defaults:
|
||||
run:
|
||||
@@ -791,7 +773,6 @@ jobs:
|
||||
--context .
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
|
||||
--build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--dockerfile Dockerfile.compute-tools
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
|
||||
@@ -802,7 +783,7 @@ jobs:
|
||||
run: rm -rf ~/.ecr
|
||||
|
||||
compute-node-image:
|
||||
needs: [ check-permissions, build-buildtools-image, tag ]
|
||||
needs: [ check-permissions, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: gcr.io/kaniko-project/executor:v1.9.2-debug
|
||||
@@ -850,7 +831,6 @@ jobs:
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
--build-arg PG_VERSION=${{ matrix.version }}
|
||||
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
|
||||
--build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--dockerfile Dockerfile.compute-node
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
@@ -872,7 +852,7 @@ jobs:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
env:
|
||||
VM_BUILDER_VERSION: v0.21.0
|
||||
VM_BUILDER_VERSION: v0.19.0
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -1116,10 +1096,6 @@ jobs:
|
||||
#
|
||||
git config --global --add safe.directory ${{ github.workspace }}
|
||||
git config --global --add safe.directory ${GITHUB_WORKSPACE}
|
||||
for r in 14 15 16; do
|
||||
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
|
||||
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
|
||||
done
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
@@ -1137,7 +1113,7 @@ jobs:
|
||||
# TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f disclamerAcknowledged=true
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||
exit 1
|
||||
|
||||
46
.github/workflows/neon_extra_builds.yml
vendored
46
.github/workflows/neon_extra_builds.yml
vendored
@@ -26,7 +26,7 @@ jobs:
|
||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
|
||||
github.ref_name == 'main'
|
||||
timeout-minutes: 90
|
||||
runs-on: macos-14
|
||||
runs-on: macos-latest
|
||||
|
||||
env:
|
||||
# Use release build only, to have less debug info around
|
||||
@@ -60,21 +60,21 @@ jobs:
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: pg_install/v14
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v15 build
|
||||
id: cache_pg_15
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: pg_install/v15
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v16 build
|
||||
id: cache_pg_16
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: pg_install/v16
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Set extra env for macOS
|
||||
run: |
|
||||
@@ -89,7 +89,7 @@ jobs:
|
||||
!~/.cargo/registry/src
|
||||
~/.cargo/git
|
||||
target
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
|
||||
key: v1-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
|
||||
|
||||
- name: Build postgres v14
|
||||
if: steps.cache_pg_14.outputs.cache-hit != 'true'
|
||||
@@ -110,7 +110,7 @@ jobs:
|
||||
run: make walproposer-lib -j$(sysctl -n hw.ncpu)
|
||||
|
||||
- name: Run cargo build
|
||||
run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release
|
||||
run: cargo build --all --release
|
||||
|
||||
- name: Check that no warnings are produced
|
||||
run: ./run_clippy.sh
|
||||
@@ -124,12 +124,12 @@ jobs:
|
||||
# Hence keeping target/ (and general cache size) smaller
|
||||
BUILD_TYPE: release
|
||||
CARGO_FEATURES: --features testing
|
||||
CARGO_FLAGS: --release
|
||||
CARGO_FLAGS: --locked --release
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -142,10 +142,6 @@ jobs:
|
||||
#
|
||||
git config --global --add safe.directory ${{ github.workspace }}
|
||||
git config --global --add safe.directory ${GITHUB_WORKSPACE}
|
||||
for r in 14 15 16; do
|
||||
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
|
||||
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
|
||||
done
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
@@ -210,20 +206,18 @@ jobs:
|
||||
|
||||
- name: Run cargo build
|
||||
run: |
|
||||
mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests
|
||||
mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
|
||||
|
||||
- name: Run cargo test
|
||||
env:
|
||||
NEXTEST_RETRIES: 3
|
||||
run: |
|
||||
cargo nextest run $CARGO_FEATURES
|
||||
cargo test $CARGO_FLAGS $CARGO_FEATURES
|
||||
|
||||
# Run separate tests for real S3
|
||||
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
|
||||
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
|
||||
export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
|
||||
export REMOTE_STORAGE_S3_REGION=eu-central-1
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
cargo nextest run --package remote_storage --test test_real_s3
|
||||
cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
|
||||
|
||||
# Run separate tests for real Azure Blob Storage
|
||||
# XXX: replace region with `eu-central-1`-like region
|
||||
@@ -233,7 +227,7 @@ jobs:
|
||||
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
|
||||
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
cargo nextest run --package remote_storage --test test_real_azure
|
||||
cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
|
||||
|
||||
check-codestyle-rust-arm:
|
||||
timeout-minutes: 90
|
||||
@@ -244,20 +238,6 @@ jobs:
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- name: Fix git ownership
|
||||
run: |
|
||||
# Workaround for `fatal: detected dubious ownership in repository at ...`
|
||||
#
|
||||
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
|
||||
# Ref https://github.com/actions/checkout/issues/785
|
||||
#
|
||||
git config --global --add safe.directory ${{ github.workspace }}
|
||||
git config --global --add safe.directory ${GITHUB_WORKSPACE}
|
||||
for r in 14 15 16; do
|
||||
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
|
||||
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
|
||||
done
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
|
||||
2
.github/workflows/release.yml
vendored
2
.github/workflows/release.yml
vendored
@@ -2,7 +2,7 @@ name: Create Release Branch
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 6 * * 1'
|
||||
- cron: '0 7 * * 5'
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
|
||||
70
.github/workflows/update_build_tools_image.yml
vendored
70
.github/workflows/update_build_tools_image.yml
vendored
@@ -1,70 +0,0 @@
|
||||
name: 'Update build tools image tag'
|
||||
|
||||
# This workflow it used to update tag of build tools in ECR.
|
||||
# The most common use case is adding/moving `pinned` tag to `${GITHUB_RUN_IT}` image.
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
from-tag:
|
||||
description: 'Source tag'
|
||||
required: true
|
||||
type: string
|
||||
to-tag:
|
||||
description: 'Destination tag'
|
||||
required: true
|
||||
type: string
|
||||
default: 'pinned'
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euo pipefail {0}
|
||||
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
tag-image:
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
|
||||
env:
|
||||
ECR_IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
|
||||
DOCKER_HUB_IMAGE: docker.io/neondatabase/build-tools
|
||||
FROM_TAG: ${{ inputs.from-tag }}
|
||||
TO_TAG: ${{ inputs.to-tag }}
|
||||
|
||||
steps:
|
||||
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
||||
# The default value is ~/.docker
|
||||
- name: Set custom docker config directory
|
||||
run: |
|
||||
mkdir -p .docker-custom
|
||||
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
|
||||
|
||||
- uses: docker/login-action@v2
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- uses: docker/login-action@v2
|
||||
with:
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.21'
|
||||
|
||||
- name: Install crane
|
||||
run: |
|
||||
go install github.com/google/go-containerregistry/cmd/crane@a0658aa1d0cc7a7f1bcc4a3af9155335b6943f40 # v0.18.0
|
||||
|
||||
- name: Copy images
|
||||
run: |
|
||||
crane copy "${ECR_IMAGE}:${FROM_TAG}" "${ECR_IMAGE}:${TO_TAG}"
|
||||
crane copy "${ECR_IMAGE}:${FROM_TAG}" "${DOCKER_HUB_IMAGE}:${TO_TAG}"
|
||||
|
||||
- name: Remove custom docker config directory
|
||||
if: always()
|
||||
run: |
|
||||
rm -rf .docker-custom
|
||||
4
.gitignore
vendored
4
.gitignore
vendored
@@ -6,7 +6,6 @@ __pycache__/
|
||||
test_output/
|
||||
.vscode
|
||||
.idea
|
||||
neon.iml
|
||||
/.neon
|
||||
/integration_tests/.neon
|
||||
|
||||
@@ -19,6 +18,3 @@ neon.iml
|
||||
*.o
|
||||
*.so
|
||||
*.Po
|
||||
|
||||
# pgindent typedef lists
|
||||
*.list
|
||||
|
||||
@@ -9,24 +9,6 @@ refactoring, additional comments, and so forth. Let's try to raise the
|
||||
bar, and clean things up as we go. Try to leave code in a better shape
|
||||
than it was before.
|
||||
|
||||
## Pre-commit hook
|
||||
|
||||
We have a sample pre-commit hook in `pre-commit.py`.
|
||||
To set it up, run:
|
||||
|
||||
```bash
|
||||
ln -s ../../pre-commit.py .git/hooks/pre-commit
|
||||
```
|
||||
|
||||
This will run following checks on staged files before each commit:
|
||||
- `rustfmt`
|
||||
- checks for Python files, see [obligatory checks](/docs/sourcetree.md#obligatory-checks).
|
||||
|
||||
There is also a separate script `./run_clippy.sh` that runs `cargo clippy` on the whole project
|
||||
and `./scripts/reformat` that runs all formatting tools to ensure the project is up to date.
|
||||
|
||||
If you want to skip the hook, run `git commit` with `--no-verify` option.
|
||||
|
||||
## Submitting changes
|
||||
|
||||
1. Get at least one +1 on your PR before you push.
|
||||
@@ -70,17 +52,3 @@ We're using the following approach to make it work:
|
||||
- The label gets removed automatically, so to run CI again with new changes, the label should be added again (after the review)
|
||||
|
||||
For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml)
|
||||
|
||||
## How do I add the "pinned" tag to an buildtools image?
|
||||
We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup, currently adding the `pinned` tag is a manual operation.
|
||||
|
||||
You can call it from GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml,
|
||||
or using GitHub CLI:
|
||||
|
||||
```bash
|
||||
gh workflow -R neondatabase/neon run update_build_tools_image.yml \
|
||||
-f from-tag=6254913013 \
|
||||
-f to-tag=pinned \
|
||||
|
||||
# Default `-f to-tag` is `pinned`, so the parameter can be omitted.
|
||||
```
|
||||
1575
Cargo.lock
generated
1575
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
76
Cargo.toml
76
Cargo.toml
@@ -3,11 +3,8 @@ resolver = "2"
|
||||
members = [
|
||||
"compute_tools",
|
||||
"control_plane",
|
||||
"control_plane/attachment_service",
|
||||
"pageserver",
|
||||
"pageserver/ctl",
|
||||
"pageserver/client",
|
||||
"pageserver/pagebench",
|
||||
"proxy",
|
||||
"safekeeper",
|
||||
"storage_broker",
|
||||
@@ -40,19 +37,20 @@ license = "Apache-2.0"
|
||||
[workspace.dependencies]
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
arc-swap = "1.6"
|
||||
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
|
||||
azure_core = "0.18"
|
||||
azure_identity = "0.18"
|
||||
azure_storage = "0.18"
|
||||
azure_storage_blobs = "0.18"
|
||||
async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
|
||||
azure_core = "0.16"
|
||||
azure_identity = "0.16"
|
||||
azure_storage = "0.16"
|
||||
azure_storage_blobs = "0.16"
|
||||
flate2 = "1.0.26"
|
||||
async-stream = "0.3"
|
||||
async-trait = "0.1"
|
||||
aws-config = { version = "1.0", default-features = false, features=["rustls"] }
|
||||
aws-sdk-s3 = "1.0"
|
||||
aws-smithy-async = { version = "1.0", default-features = false, features=["rt-tokio"] }
|
||||
aws-smithy-types = "1.0"
|
||||
aws-credential-types = "1.0"
|
||||
aws-config = { version = "0.56", default-features = false, features=["rustls"] }
|
||||
aws-sdk-s3 = "0.29"
|
||||
aws-smithy-http = "0.56"
|
||||
aws-smithy-async = { version = "0.56", default-features = false, features=["rt-tokio"] }
|
||||
aws-credential-types = "0.56"
|
||||
aws-types = "0.56"
|
||||
axum = { version = "0.6.20", features = ["ws"] }
|
||||
base64 = "0.13.0"
|
||||
bincode = "1.3"
|
||||
@@ -64,6 +62,7 @@ camino = "1.1.6"
|
||||
cfg-if = "1.0.0"
|
||||
chrono = { version = "0.4", default-features = false, features = ["clock"] }
|
||||
clap = { version = "4.0", features = ["derive"] }
|
||||
close_fds = "0.3.2"
|
||||
comfy-table = "6.1"
|
||||
const_format = "0.2"
|
||||
crc32c = "0.6"
|
||||
@@ -80,7 +79,6 @@ futures-util = "0.3"
|
||||
git-version = "0.3"
|
||||
hashbrown = "0.13"
|
||||
hashlink = "0.8.1"
|
||||
hdrhistogram = "7.5.2"
|
||||
hex = "0.4"
|
||||
hex-literal = "0.4"
|
||||
hmac = "0.12.1"
|
||||
@@ -91,33 +89,29 @@ humantime-serde = "1.1.1"
|
||||
hyper = "0.14"
|
||||
hyper-tungstenite = "0.11"
|
||||
inotify = "0.10.2"
|
||||
ipnet = "2.9.0"
|
||||
itertools = "0.10"
|
||||
jsonwebtoken = "9"
|
||||
jsonwebtoken = "8"
|
||||
libc = "0.2"
|
||||
md5 = "0.7.0"
|
||||
memoffset = "0.8"
|
||||
native-tls = "0.2"
|
||||
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
|
||||
notify = "6.0.0"
|
||||
nix = "0.26"
|
||||
notify = "5.0.0"
|
||||
num_cpus = "1.15"
|
||||
num-traits = "0.2.15"
|
||||
once_cell = "1.13"
|
||||
opentelemetry = "0.20.0"
|
||||
opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-semantic-conventions = "0.12.0"
|
||||
opentelemetry = "0.19.0"
|
||||
opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-semantic-conventions = "0.11.0"
|
||||
parking_lot = "0.12"
|
||||
parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
|
||||
parquet_derive = "49.0.0"
|
||||
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
|
||||
pin-project-lite = "0.2"
|
||||
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
|
||||
prost = "0.11"
|
||||
rand = "0.8"
|
||||
redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] }
|
||||
regex = "1.10.2"
|
||||
regex = "1.4"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
|
||||
reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
|
||||
reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
|
||||
reqwest-middleware = "0.2.0"
|
||||
reqwest-retry = "0.2.2"
|
||||
routerify = "3"
|
||||
@@ -128,17 +122,14 @@ rustls-pemfile = "1"
|
||||
rustls-split = "0.3"
|
||||
scopeguard = "1.1"
|
||||
sysinfo = "0.29.2"
|
||||
sd-notify = "0.4.1"
|
||||
sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
serde_path_to_error = "0.1"
|
||||
serde_with = "2.0"
|
||||
serde_assert = "0.5.0"
|
||||
sha2 = "0.10.2"
|
||||
signal-hook = "0.3"
|
||||
smallvec = "1.11"
|
||||
smol_str = { version = "0.2.0", features = ["serde"] }
|
||||
socket2 = "0.5"
|
||||
strum = "0.24"
|
||||
strum_macros = "0.24"
|
||||
@@ -148,27 +139,23 @@ tar = "0.4"
|
||||
task-local-extensions = "0.1.4"
|
||||
test-context = "0.1"
|
||||
thiserror = "1.0"
|
||||
tikv-jemallocator = "0.5"
|
||||
tikv-jemalloc-ctl = "0.5"
|
||||
tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
|
||||
tokio = { version = "1.17", features = ["macros"] }
|
||||
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
|
||||
tokio-io-timeout = "1.2.0"
|
||||
tokio-postgres-rustls = "0.10.0"
|
||||
tokio-rustls = "0.24"
|
||||
tokio-stream = "0.1"
|
||||
tokio-tar = "0.3"
|
||||
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
|
||||
tokio-util = { version = "0.7", features = ["io"] }
|
||||
toml = "0.7"
|
||||
toml_edit = "0.19"
|
||||
tonic = {version = "0.9", features = ["tls", "tls-roots"]}
|
||||
tracing = "0.1"
|
||||
tracing-error = "0.2.0"
|
||||
tracing-opentelemetry = "0.20.0"
|
||||
tracing-opentelemetry = "0.19.0"
|
||||
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
|
||||
twox-hash = { version = "1.6.3", default-features = false }
|
||||
url = "2.2"
|
||||
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
|
||||
uuid = { version = "1.2", features = ["v4", "serde"] }
|
||||
walkdir = "2.3.2"
|
||||
webpki-roots = "0.25"
|
||||
x509-parser = "0.15"
|
||||
@@ -178,11 +165,11 @@ env_logger = "0.10"
|
||||
log = "0.4"
|
||||
|
||||
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
|
||||
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
|
||||
|
||||
## Other git libraries
|
||||
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
|
||||
@@ -192,7 +179,6 @@ compute_api = { version = "0.1", path = "./libs/compute_api/" }
|
||||
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
|
||||
metrics = { version = "0.1", path = "./libs/metrics/" }
|
||||
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
|
||||
pageserver_client = { path = "./pageserver/client" }
|
||||
postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
|
||||
postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
|
||||
postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
|
||||
@@ -220,11 +206,7 @@ tonic-build = "0.9"
|
||||
|
||||
# This is only needed for proxy's tests.
|
||||
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
|
||||
# bug fixes for UUID
|
||||
parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
|
||||
parquet_derive = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
|
||||
|
||||
################# Binary contents sections
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used
|
||||
### inside this image in the real deployments.
|
||||
ARG REPOSITORY=neondatabase
|
||||
ARG IMAGE=build-tools
|
||||
ARG IMAGE=rust
|
||||
ARG TAG=pinned
|
||||
|
||||
# Build Postgres
|
||||
@@ -53,7 +53,6 @@ RUN set -e \
|
||||
--bin pagectl \
|
||||
--bin safekeeper \
|
||||
--bin storage_broker \
|
||||
--bin attachment_service \
|
||||
--bin proxy \
|
||||
--bin neon_local \
|
||||
--locked --release \
|
||||
@@ -81,7 +80,6 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/attachment_service /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
|
||||
|
||||
|
||||
@@ -1,166 +0,0 @@
|
||||
FROM debian:bullseye-slim
|
||||
|
||||
# Add nonroot user
|
||||
RUN useradd -ms /bin/bash nonroot -b /home
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
|
||||
# System deps
|
||||
RUN set -e \
|
||||
&& apt update \
|
||||
&& apt install -y \
|
||||
autoconf \
|
||||
automake \
|
||||
bison \
|
||||
build-essential \
|
||||
ca-certificates \
|
||||
cmake \
|
||||
curl \
|
||||
flex \
|
||||
git \
|
||||
gnupg \
|
||||
gzip \
|
||||
jq \
|
||||
libcurl4-openssl-dev \
|
||||
libbz2-dev \
|
||||
libffi-dev \
|
||||
liblzma-dev \
|
||||
libncurses5-dev \
|
||||
libncursesw5-dev \
|
||||
libpq-dev \
|
||||
libreadline-dev \
|
||||
libseccomp-dev \
|
||||
libsqlite3-dev \
|
||||
libssl-dev \
|
||||
libstdc++-10-dev \
|
||||
libtool \
|
||||
libxml2-dev \
|
||||
libxmlsec1-dev \
|
||||
libxxhash-dev \
|
||||
lsof \
|
||||
make \
|
||||
netcat \
|
||||
net-tools \
|
||||
openssh-client \
|
||||
parallel \
|
||||
pkg-config \
|
||||
unzip \
|
||||
wget \
|
||||
xz-utils \
|
||||
zlib1g-dev \
|
||||
zstd \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# protobuf-compiler (protoc)
|
||||
ENV PROTOC_VERSION 25.1
|
||||
RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
|
||||
&& unzip -q protoc.zip -d protoc \
|
||||
&& mv protoc/bin/protoc /usr/local/bin/protoc \
|
||||
&& mv protoc/include/google /usr/local/include/google \
|
||||
&& rm -rf protoc.zip protoc
|
||||
|
||||
# LLVM
|
||||
ENV LLVM_VERSION=17
|
||||
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
|
||||
&& echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
|
||||
&& apt update \
|
||||
&& apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \
|
||||
&& bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# PostgreSQL 14
|
||||
RUN curl -fsSL 'https://www.postgresql.org/media/keys/ACCC4CF8.asc' | apt-key add - \
|
||||
&& echo 'deb http://apt.postgresql.org/pub/repos/apt bullseye-pgdg main' > /etc/apt/sources.list.d/pgdg.list \
|
||||
&& apt update \
|
||||
&& apt install -y postgresql-client-14 \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# AWS CLI
|
||||
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \
|
||||
&& unzip -q awscliv2.zip \
|
||||
&& ./aws/install \
|
||||
&& rm awscliv2.zip
|
||||
|
||||
# Mold: A Modern Linker
|
||||
ENV MOLD_VERSION v2.4.0
|
||||
RUN set -e \
|
||||
&& git clone https://github.com/rui314/mold.git \
|
||||
&& mkdir mold/build \
|
||||
&& cd mold/build \
|
||||
&& git checkout ${MOLD_VERSION} \
|
||||
&& cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=clang++ .. \
|
||||
&& cmake --build . -j $(nproc) \
|
||||
&& cmake --install . \
|
||||
&& cd .. \
|
||||
&& rm -rf mold
|
||||
|
||||
# LCOV
|
||||
# Build lcov from a fork:
|
||||
# It includes several bug fixes on top on v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master)
|
||||
# And patches from us:
|
||||
# - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz)
|
||||
RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \
|
||||
&& wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \
|
||||
&& echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992 lcov.tar.gz" | sha256sum --check \
|
||||
&& mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \
|
||||
&& cd lcov \
|
||||
&& make install \
|
||||
&& rm -rf ../lcov.tar.gz
|
||||
|
||||
# Switch to nonroot user
|
||||
USER nonroot:nonroot
|
||||
WORKDIR /home/nonroot
|
||||
|
||||
# Python
|
||||
ENV PYTHON_VERSION=3.9.2 \
|
||||
PYENV_ROOT=/home/nonroot/.pyenv \
|
||||
PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
|
||||
RUN set -e \
|
||||
&& cd $HOME \
|
||||
&& curl -sSO https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer \
|
||||
&& chmod +x pyenv-installer \
|
||||
&& ./pyenv-installer \
|
||||
&& export PYENV_ROOT=/home/nonroot/.pyenv \
|
||||
&& export PATH="$PYENV_ROOT/bin:$PATH" \
|
||||
&& export PATH="$PYENV_ROOT/shims:$PATH" \
|
||||
&& pyenv install ${PYTHON_VERSION} \
|
||||
&& pyenv global ${PYTHON_VERSION} \
|
||||
&& python --version \
|
||||
&& pip install --upgrade pip \
|
||||
&& pip --version \
|
||||
&& pip install pipenv wheel poetry
|
||||
|
||||
# Switch to nonroot user (again)
|
||||
USER nonroot:nonroot
|
||||
WORKDIR /home/nonroot
|
||||
|
||||
# Rust
|
||||
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
|
||||
ENV RUSTC_VERSION=1.75.0
|
||||
ENV RUSTUP_HOME="/home/nonroot/.rustup"
|
||||
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
|
||||
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
|
||||
chmod +x rustup-init && \
|
||||
./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \
|
||||
rm rustup-init && \
|
||||
export PATH="$HOME/.cargo/bin:$PATH" && \
|
||||
. "$HOME/.cargo/env" && \
|
||||
cargo --version && rustup --version && \
|
||||
rustup component add llvm-tools-preview rustfmt clippy && \
|
||||
cargo install --git https://github.com/paritytech/cachepot && \
|
||||
cargo install rustfilt && \
|
||||
cargo install cargo-hakari && \
|
||||
cargo install cargo-deny && \
|
||||
cargo install cargo-hack && \
|
||||
cargo install cargo-nextest && \
|
||||
rm -rf /home/nonroot/.cargo/registry && \
|
||||
rm -rf /home/nonroot/.cargo/git
|
||||
ENV RUSTC_WRAPPER=cachepot
|
||||
|
||||
# Show versions
|
||||
RUN whoami \
|
||||
&& python --version \
|
||||
&& pip --version \
|
||||
&& cargo --version --verbose \
|
||||
&& rustup --version --verbose \
|
||||
&& rustc --version --verbose \
|
||||
&& clang --version
|
||||
@@ -1,6 +1,6 @@
|
||||
ARG PG_VERSION
|
||||
ARG REPOSITORY=neondatabase
|
||||
ARG IMAGE=build-tools
|
||||
ARG IMAGE=rust
|
||||
ARG TAG=pinned
|
||||
ARG BUILD_TAG
|
||||
|
||||
@@ -48,29 +48,7 @@ RUN cd postgres && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control && \
|
||||
# We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser.
|
||||
# In vanilla postgres this function is limited to Postgres role superuser.
|
||||
# In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases.
|
||||
# We could add the additional grant statements to the postgres repository but it would be hard to maintain,
|
||||
# whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
|
||||
# so we do it here.
|
||||
old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
|
||||
# the first loop is for pg_stat_statement extension version <= 1.6
|
||||
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
|
||||
filename=$(basename "$file"); \
|
||||
if echo "$old_list" | grep -q -F "$filename"; then \
|
||||
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
|
||||
fi; \
|
||||
done; \
|
||||
# the second loop is for pg_stat_statement extension versions >= 1.7,
|
||||
# where pg_stat_statement_reset() got 3 additional arguments
|
||||
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
|
||||
filename=$(basename "$file"); \
|
||||
if ! echo "$old_list" | grep -q -F "$filename"; then \
|
||||
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
|
||||
fi; \
|
||||
done
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -143,24 +121,29 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti
|
||||
#########################################################################################
|
||||
FROM build-deps AS plv8-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN apt update && \
|
||||
apt install -y ninja-build python3-dev libncurses5 binutils clang
|
||||
|
||||
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \
|
||||
echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v14" | "v15") \
|
||||
export PLV8_VERSION=3.1.5 \
|
||||
export PLV8_CHECKSUM=1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 \
|
||||
;; \
|
||||
"v16") \
|
||||
export PLV8_VERSION=3.1.8 \
|
||||
export PLV8_CHECKSUM=92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 \
|
||||
;; \
|
||||
*) \
|
||||
echo "Export the valid PG_VERSION variable" && exit 1 \
|
||||
;; \
|
||||
esac && \
|
||||
wget https://github.com/plv8/plv8/archive/refs/tags/v${PLV8_VERSION}.tar.gz -O plv8.tar.gz && \
|
||||
echo "${PLV8_CHECKSUM} plv8.tar.gz" | sha256sum --check && \
|
||||
mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
|
||||
# generate and copy upgrade scripts
|
||||
mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \
|
||||
cp upgrade/* /usr/local/pgsql/share/extension/ && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
rm -rf /plv8-* && \
|
||||
find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
|
||||
# don't break computes with installed old version of plv8
|
||||
cd /usr/local/pgsql/lib/ && \
|
||||
ln -s plv8-3.1.10.so plv8-3.1.5.so && \
|
||||
ln -s plv8-3.1.10.so plv8-3.1.8.so && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control
|
||||
@@ -241,12 +224,9 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
|
||||
FROM build-deps AS vector-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
COPY patches/pgvector.patch /pgvector.patch
|
||||
|
||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.6.0.tar.gz -O pgvector.tar.gz && \
|
||||
echo "b0cf4ba1ab016335ac8fb1cada0d2106235889a194fffeece217c5bda90b2f19 pgvector.tar.gz" | sha256sum --check && \
|
||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
|
||||
echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
|
||||
mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
|
||||
patch -p1 < /pgvector.patch && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
|
||||
@@ -413,9 +393,7 @@ RUN case "${PG_VERSION}" in \
|
||||
export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
|
||||
;; \
|
||||
*) \
|
||||
export TIMESCALEDB_VERSION=2.13.0 \
|
||||
export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \
|
||||
;; \
|
||||
echo "TimescaleDB not supported on this PostgreSQL version. See https://github.com/timescale/timescaledb/issues/5752" && exit 0;; \
|
||||
esac && \
|
||||
apt-get update && \
|
||||
apt-get install -y cmake && \
|
||||
@@ -523,7 +501,8 @@ RUN apt-get update && \
|
||||
libboost-regex1.74-dev \
|
||||
libboost-serialization1.74-dev \
|
||||
libboost-system1.74-dev \
|
||||
libeigen3-dev
|
||||
libeigen3-dev \
|
||||
libfreetype6-dev
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
|
||||
RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
|
||||
@@ -548,8 +527,6 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.
|
||||
-D PostgreSQL_TYPE_INCLUDE_DIR=`pg_config --includedir-server` \
|
||||
-D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \
|
||||
-D RDK_INSTALL_INTREE=OFF \
|
||||
-D RDK_INSTALL_COMIC_FONTS=OFF \
|
||||
-D RDK_BUILD_FREETYPE_SUPPORT=OFF \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
. && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
@@ -590,23 +567,6 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-semver-pg-build"
|
||||
# compile pg_semver extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-semver-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
|
||||
echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_semver-src && cd pg_semver-src && tar xvzf ../pg_semver.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-embedding-pg-build"
|
||||
@@ -616,7 +576,6 @@ RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O
|
||||
FROM build-deps AS pg-embedding-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ARG PG_VERSION
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v14" | "v15") \
|
||||
@@ -757,20 +716,20 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "wal2json-build"
|
||||
# Compile "wal2json" extension
|
||||
# Layer "pg-wait-sampling-pg-build"
|
||||
# compile pg_wait_sampling extension
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM build-deps AS wal2json-pg-build
|
||||
FROM build-deps AS pg-wait-sampling-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
|
||||
echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
|
||||
mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install
|
||||
RUN wget https://github.com/postgrespro/pg_wait_sampling/archive/refs/tags/v1.1.5.tar.gz -O pg_wait_sampling.tar.gz && \
|
||||
echo 'a03da6a413f5652ce470a3635ed6ebba528c74cb26aa4cfced8aff8a8441f81ec6dd657ff62cd6ce96a4e6ce02cad9f2519ae9525367ece60497aa20faafde5c pg_wait_sampling.tar.gz' | sha512sum -c && \
|
||||
mkdir pg_wait_sampling-src && cd pg_wait_sampling-src && tar xvzf ../pg_wait_sampling.tar.gz --strip-components=1 -C . && \
|
||||
make USE_PGXS=1 -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make USE_PGXS=1 -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_wait_sampling.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -779,8 +738,6 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS neon-pg-ext-build
|
||||
ARG PG_VERSION
|
||||
|
||||
# Public extensions
|
||||
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=postgis-build /sfcgal/* /
|
||||
@@ -809,9 +766,8 @@ COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
|
||||
COPY --from=pg-wait-sampling-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY pgxn/ pgxn/
|
||||
|
||||
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
@@ -885,10 +841,8 @@ FROM debian:bullseye-slim
|
||||
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
|
||||
echo "postgres:test_console_pass" | chpasswd && \
|
||||
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
|
||||
mkdir /var/db/postgres/pgbouncer && \
|
||||
chown -R postgres:postgres /var/db/postgres && \
|
||||
chmod 0750 /var/db/postgres/compute && \
|
||||
chmod 0750 /var/db/postgres/pgbouncer && \
|
||||
echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig && \
|
||||
# create folder for file cache
|
||||
mkdir -p -m 777 /neon/cache
|
||||
@@ -904,7 +858,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
|
||||
# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
|
||||
# libxml2, libxslt1.1 for xml2
|
||||
# libzstd1 for zstd
|
||||
# libboost* for rdkit
|
||||
# libboost*, libfreetype6, and zlib1g for rdkit
|
||||
# ca-certificates for communicating with s3 by compute_ctl
|
||||
RUN apt update && \
|
||||
apt install --no-install-recommends -y \
|
||||
@@ -917,6 +871,7 @@ RUN apt update && \
|
||||
libboost-serialization1.74.0 \
|
||||
libboost-system1.74.0 \
|
||||
libossp-uuid16 \
|
||||
libfreetype6 \
|
||||
libgeos-c1v5 \
|
||||
libgdal28 \
|
||||
libproj19 \
|
||||
@@ -928,6 +883,7 @@ RUN apt update && \
|
||||
libcurl4-openssl-dev \
|
||||
locales \
|
||||
procps \
|
||||
zlib1g \
|
||||
ca-certificates && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
|
||||
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# First transient image to build compute_tools binaries
|
||||
# NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
|
||||
ARG REPOSITORY=neondatabase
|
||||
ARG IMAGE=build-tools
|
||||
ARG IMAGE=rust
|
||||
ARG TAG=pinned
|
||||
ARG BUILD_TAG
|
||||
|
||||
|
||||
44
Makefile
44
Makefile
@@ -51,8 +51,6 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS))
|
||||
CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
|
||||
# Force cargo not to print progress bar
|
||||
CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
|
||||
# Set PQ_LIB_DIR to make sure `attachment_service` get linked with bundled libpq (through diesel)
|
||||
CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib
|
||||
|
||||
#
|
||||
# Top level Makefile to build Neon and PostgreSQL
|
||||
@@ -176,10 +174,10 @@ neon-pg-ext-clean-%:
|
||||
|
||||
# Build walproposer as a static library. walproposer source code is located
|
||||
# in the pgxn/neon directory.
|
||||
#
|
||||
#
|
||||
# We also need to include libpgport.a and libpgcommon.a, because walproposer
|
||||
# uses some functions from those libraries.
|
||||
#
|
||||
#
|
||||
# Some object files are removed from libpgport.a and libpgcommon.a because
|
||||
# they depend on openssl and other libraries that are not included in our
|
||||
# Rust build.
|
||||
@@ -262,44 +260,6 @@ distclean:
|
||||
fmt:
|
||||
./pre-commit.py --fix-inplace
|
||||
|
||||
postgres-%-pg-bsd-indent: postgres-%
|
||||
+@echo "Compiling pg_bsd_indent"
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/
|
||||
|
||||
# Create typedef list for the core. Note that generally it should be combined with
|
||||
# buildfarm one to cover platform specific stuff.
|
||||
# https://wiki.postgresql.org/wiki/Running_pgindent_on_non-core_code_or_development_code
|
||||
postgres-%-typedefs.list: postgres-%
|
||||
$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/find_typedef $(POSTGRES_INSTALL_DIR)/$*/bin > $@
|
||||
|
||||
# Indent postgres. See src/tools/pgindent/README for details.
|
||||
.PHONY: postgres-%-pgindent
|
||||
postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
|
||||
+@echo merge with buildfarm typedef to cover all platforms
|
||||
+@echo note: I first tried to download from pgbuildfarm.org, but for unclear reason e.g. \
|
||||
REL_16_STABLE list misses PGSemaphoreData
|
||||
# wget -q -O - "http://www.pgbuildfarm.org/cgi-bin/typedefs.pl?branch=REL_16_STABLE" |\
|
||||
# cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
|
||||
cat $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/typedefs.list |\
|
||||
cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
|
||||
+@echo note: you might want to run it on selected files/dirs instead.
|
||||
INDENT=$(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/pg_bsd_indent \
|
||||
$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/pgindent --typedefs postgres-$*-typedefs-full.list \
|
||||
$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/ \
|
||||
--excludes $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/exclude_file_patterns
|
||||
rm -f pg*.BAK
|
||||
|
||||
# Indent pxgn/neon.
|
||||
.PHONY: pgindent
|
||||
neon-pgindent: postgres-v16-pg-bsd-indent neon-pg-ext-v16
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
|
||||
FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/find_typedef \
|
||||
INDENT=$(POSTGRES_INSTALL_DIR)/build/v16/src/tools/pg_bsd_indent/pg_bsd_indent \
|
||||
PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/pgindent/pgindent \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-v16 \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent
|
||||
|
||||
|
||||
.PHONY: setup-pre-commit-hook
|
||||
setup-pre-commit-hook:
|
||||
ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit
|
||||
|
||||
33
README.md
33
README.md
@@ -14,8 +14,8 @@ Alternatively, compile and run the project [locally](#running-local-installation
|
||||
A Neon installation consists of compute nodes and the Neon storage engine. Compute nodes are stateless PostgreSQL nodes backed by the Neon storage engine.
|
||||
|
||||
The Neon storage engine consists of two major components:
|
||||
- Pageserver: Scalable storage backend for the compute nodes.
|
||||
- Safekeepers: The safekeepers form a redundant WAL service that received WAL from the compute node, and stores it durably until it has been processed by the pageserver and uploaded to cloud storage.
|
||||
- Pageserver. Scalable storage backend for the compute nodes.
|
||||
- Safekeepers. The safekeepers form a redundant WAL service that received WAL from the compute node, and stores it durably until it has been processed by the pageserver and uploaded to cloud storage.
|
||||
|
||||
See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more information.
|
||||
|
||||
@@ -29,14 +29,13 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
|
||||
```bash
|
||||
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
|
||||
libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
|
||||
libcurl4-openssl-dev openssl python3-poetry lsof libicu-dev
|
||||
libcurl4-openssl-dev openssl python-poetry lsof libicu-dev
|
||||
```
|
||||
* On Fedora, these packages are needed:
|
||||
```bash
|
||||
dnf install flex bison readline-devel zlib-devel openssl-devel \
|
||||
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
|
||||
protobuf-devel libcurl-devel openssl poetry lsof libicu-devel libpq-devel python3-devel \
|
||||
libffi-devel
|
||||
protobuf-devel libcurl-devel openssl poetry lsof libicu-devel
|
||||
```
|
||||
* On Arch based systems, these packages are needed:
|
||||
```bash
|
||||
@@ -81,9 +80,9 @@ The project uses [rust toolchain file](./rust-toolchain.toml) to define the vers
|
||||
|
||||
This file is automatically picked up by [`rustup`](https://rust-lang.github.io/rustup/overrides.html#the-toolchain-file) that installs (if absent) and uses the toolchain version pinned in the file.
|
||||
|
||||
rustup users who want to build with another toolchain can use the [`rustup override`](https://rust-lang.github.io/rustup/overrides.html#directory-overrides) command to set a specific toolchain for the project's directory.
|
||||
rustup users who want to build with another toolchain can use [`rustup override`](https://rust-lang.github.io/rustup/overrides.html#directory-overrides) command to set a specific toolchain for the project's directory.
|
||||
|
||||
non-rustup users most probably are not getting the same toolchain automatically from the file, so are responsible to manually verify that their toolchain matches the version in the file.
|
||||
non-rustup users most probably are not getting the same toolchain automatically from the file, so are responsible to manually verify their toolchain matches the version in the file.
|
||||
Newer rustc versions most probably will work fine, yet older ones might not be supported due to some new features used by the project or the crates.
|
||||
|
||||
#### Building on Linux
|
||||
@@ -124,7 +123,7 @@ make -j`sysctl -n hw.logicalcpu` -s
|
||||
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively.
|
||||
|
||||
To run the integration tests or Python scripts (not required to use the code), install
|
||||
Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory.
|
||||
Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory.
|
||||
|
||||
|
||||
#### Running neon database
|
||||
@@ -150,9 +149,6 @@ tenant 9ef87a5bf0d92544f6fafeeb3239695c successfully created on the pageserver
|
||||
Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c
|
||||
Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one
|
||||
|
||||
# create postgres compute node
|
||||
> cargo neon endpoint create main
|
||||
|
||||
# start postgres compute node
|
||||
> cargo neon endpoint start main
|
||||
Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ...
|
||||
@@ -166,7 +162,7 @@ Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55432/postgres'
|
||||
|
||||
2. Now, it is possible to connect to postgres and run some queries:
|
||||
```text
|
||||
> psql -p 55432 -h 127.0.0.1 -U cloud_admin postgres
|
||||
> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres
|
||||
postgres=# CREATE TABLE t(key int primary key, value text);
|
||||
CREATE TABLE
|
||||
postgres=# insert into t values(1,1);
|
||||
@@ -189,11 +185,8 @@ Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant:
|
||||
(L) main [de200bd42b49cc1814412c7e592dd6e9]
|
||||
(L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601]
|
||||
|
||||
# create postgres on that branch
|
||||
> cargo neon endpoint create migration_check --branch-name migration_check
|
||||
|
||||
# start postgres on that branch
|
||||
> cargo neon endpoint start migration_check
|
||||
> cargo neon endpoint start migration_check --branch-name migration_check
|
||||
Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
|
||||
Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres'
|
||||
|
||||
@@ -205,7 +198,7 @@ Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres'
|
||||
|
||||
# this new postgres instance will have all the data from 'main' postgres,
|
||||
# but all modifications would not affect data in original postgres
|
||||
> psql -p 55434 -h 127.0.0.1 -U cloud_admin postgres
|
||||
> psql -p55434 -h 127.0.0.1 -U cloud_admin postgres
|
||||
postgres=# select * from t;
|
||||
key | value
|
||||
-----+-------
|
||||
@@ -216,7 +209,7 @@ postgres=# insert into t values(2,2);
|
||||
INSERT 0 1
|
||||
|
||||
# check that the new change doesn't affect the 'main' postgres
|
||||
> psql -p 55432 -h 127.0.0.1 -U cloud_admin postgres
|
||||
> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres
|
||||
postgres=# select * from t;
|
||||
key | value
|
||||
-----+-------
|
||||
@@ -224,7 +217,7 @@ postgres=# select * from t;
|
||||
(1 row)
|
||||
```
|
||||
|
||||
4. If you want to run tests afterwards (see below), you must stop all the running pageserver, safekeeper, and postgres instances
|
||||
4. If you want to run tests afterward (see below), you must stop all the running of the pageserver, safekeeper, and postgres instances
|
||||
you have just started. You can terminate them all with one command:
|
||||
```sh
|
||||
> cargo neon stop
|
||||
@@ -243,7 +236,7 @@ CARGO_BUILD_FLAGS="--features=testing" make
|
||||
```
|
||||
|
||||
By default, this runs both debug and release modes, and all supported postgres versions. When
|
||||
testing locally, it is convenient to run just one set of permutations, like this:
|
||||
testing locally, it is convenient to run just run one set of permutations, like this:
|
||||
|
||||
```sh
|
||||
DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
|
||||
|
||||
@@ -13,7 +13,6 @@ clap.workspace = true
|
||||
flate2.workspace = true
|
||||
futures.workspace = true
|
||||
hyper = { workspace = true, features = ["full"] }
|
||||
nix.workspace = true
|
||||
notify.workspace = true
|
||||
num_cpus.workspace = true
|
||||
opentelemetry.workspace = true
|
||||
@@ -21,7 +20,6 @@ postgres.workspace = true
|
||||
regex.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
signal-hook.workspace = true
|
||||
tar.workspace = true
|
||||
reqwest = { workspace = true, features = ["json"] }
|
||||
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
||||
@@ -39,6 +37,4 @@ workspace_hack.workspace = true
|
||||
toml_edit.workspace = true
|
||||
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
|
||||
vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
|
||||
zstd = "0.13"
|
||||
bytes = "1.0"
|
||||
rust-ini = "0.20.0"
|
||||
zstd = "0.12.4"
|
||||
|
||||
@@ -31,31 +31,27 @@
|
||||
//! -C 'postgresql://cloud_admin@localhost/postgres' \
|
||||
//! -S /var/db/postgres/specs/current.json \
|
||||
//! -b /usr/local/bin/postgres \
|
||||
//! -r http://pg-ext-s3-gateway \
|
||||
//! -r {"bucket": "neon-dev-extensions-eu-central-1", "region": "eu-central-1"}
|
||||
//! ```
|
||||
//!
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::path::Path;
|
||||
use std::process::exit;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
|
||||
use std::{thread, time::Duration};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::Utc;
|
||||
use clap::Arg;
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use signal_hook::consts::{SIGQUIT, SIGTERM};
|
||||
use signal_hook::{consts::SIGINT, iterator::Signals};
|
||||
use tracing::{error, info};
|
||||
use url::Url;
|
||||
|
||||
use compute_api::responses::ComputeStatus;
|
||||
|
||||
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID};
|
||||
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
|
||||
use compute_tools::configurator::launch_configurator;
|
||||
use compute_tools::extension_server::get_pg_version;
|
||||
use compute_tools::extension_server::{get_pg_version, init_remote_storage};
|
||||
use compute_tools::http::api::launch_http_server;
|
||||
use compute_tools::logger::*;
|
||||
use compute_tools::monitor::launch_monitor;
|
||||
@@ -64,18 +60,11 @@ use compute_tools::spec::*;
|
||||
|
||||
// this is an arbitrary build tag. Fine as a default / for testing purposes
|
||||
// in-case of not-set environment var
|
||||
const BUILD_TAG_DEFAULT: &str = "latest";
|
||||
const BUILD_TAG_DEFAULT: &str = "5670669815";
|
||||
|
||||
fn main() -> Result<()> {
|
||||
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
|
||||
|
||||
let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
|
||||
thread::spawn(move || {
|
||||
for sig in signals.forever() {
|
||||
handle_exit_signal(sig);
|
||||
}
|
||||
});
|
||||
|
||||
let build_tag = option_env!("BUILD_TAG")
|
||||
.unwrap_or(BUILD_TAG_DEFAULT)
|
||||
.to_string();
|
||||
@@ -85,18 +74,10 @@ fn main() -> Result<()> {
|
||||
let pgbin_default = String::from("postgres");
|
||||
let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
|
||||
|
||||
let ext_remote_storage = matches
|
||||
.get_one::<String>("remote-ext-config")
|
||||
// Compatibility hack: if the control plane specified any remote-ext-config
|
||||
// use the default value for extension storage proxy gateway.
|
||||
// Remove this once the control plane is updated to pass the gateway URL
|
||||
.map(|conf| {
|
||||
if conf.starts_with("http") {
|
||||
conf.trim_end_matches('/')
|
||||
} else {
|
||||
"http://pg-ext-s3-gateway"
|
||||
}
|
||||
});
|
||||
let remote_ext_config = matches.get_one::<String>("remote-ext-config");
|
||||
let ext_remote_storage = remote_ext_config.map(|x| {
|
||||
init_remote_storage(x).expect("cannot initialize remote extension storage from config")
|
||||
});
|
||||
|
||||
let http_port = *matches
|
||||
.get_one::<u16>("http-port")
|
||||
@@ -217,16 +198,16 @@ fn main() -> Result<()> {
|
||||
live_config_allowed,
|
||||
state: Mutex::new(new_state),
|
||||
state_changed: Condvar::new(),
|
||||
ext_remote_storage: ext_remote_storage.map(|s| s.to_string()),
|
||||
ext_remote_storage,
|
||||
ext_download_progress: RwLock::new(HashMap::new()),
|
||||
build_tag,
|
||||
};
|
||||
let compute = Arc::new(compute_node);
|
||||
|
||||
// If this is a pooled VM, prewarm before starting HTTP server and becoming
|
||||
// available for binding. Prewarming helps Postgres start quicker later,
|
||||
// available for binding. Prewarming helps postgres start quicker later,
|
||||
// because QEMU will already have it's memory allocated from the host, and
|
||||
// the necessary binaries will already be cached.
|
||||
// the necessary binaries will alreaady be cached.
|
||||
if !spec_set {
|
||||
compute.prewarm_postgres()?;
|
||||
}
|
||||
@@ -269,11 +250,6 @@ fn main() -> Result<()> {
|
||||
|
||||
state.status = ComputeStatus::Init;
|
||||
compute.state_changed.notify_all();
|
||||
|
||||
info!(
|
||||
"running compute with features: {:?}",
|
||||
state.pspec.as_ref().unwrap().spec.features
|
||||
);
|
||||
drop(state);
|
||||
|
||||
// Launch remaining service threads
|
||||
@@ -286,17 +262,11 @@ fn main() -> Result<()> {
|
||||
let pg = match compute.start_compute(extension_server_port) {
|
||||
Ok(pg) => Some(pg),
|
||||
Err(err) => {
|
||||
error!("could not start the compute node: {:#}", err);
|
||||
error!("could not start the compute node: {:?}", err);
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
state.error = Some(format!("{:?}", err));
|
||||
state.status = ComputeStatus::Failed;
|
||||
// Notify others that Postgres failed to start. In case of configuring the
|
||||
// empty compute, it's likely that API handler is still waiting for compute
|
||||
// state change. With this we will notify it that compute is in Failed state,
|
||||
// so control plane will know about it earlier and record proper error instead
|
||||
// of timeout.
|
||||
compute.state_changed.notify_all();
|
||||
drop(state); // unlock
|
||||
drop(state);
|
||||
delay_exit = true;
|
||||
None
|
||||
}
|
||||
@@ -348,20 +318,13 @@ fn main() -> Result<()> {
|
||||
|
||||
// Wait for the child Postgres process forever. In this state Ctrl+C will
|
||||
// propagate to Postgres and it will be shut down as well.
|
||||
if let Some((mut pg, logs_handle)) = pg {
|
||||
if let Some(mut pg) = pg {
|
||||
// Startup is finished, exit the startup tracing span
|
||||
drop(startup_context_guard);
|
||||
|
||||
let ecode = pg
|
||||
.wait()
|
||||
.expect("failed to start waiting on Postgres process");
|
||||
PG_PID.store(0, Ordering::SeqCst);
|
||||
|
||||
// Process has exited, so we can join the logs thread.
|
||||
let _ = logs_handle
|
||||
.join()
|
||||
.map_err(|e| tracing::error!("log thread panicked: {:?}", e));
|
||||
|
||||
info!("Postgres exited with code {}, shutting down", ecode);
|
||||
exit_code = ecode.code()
|
||||
}
|
||||
@@ -516,24 +479,13 @@ fn cli() -> clap::Command {
|
||||
)
|
||||
.value_name("FILECACHE_CONNSTR"),
|
||||
)
|
||||
}
|
||||
|
||||
/// When compute_ctl is killed, send also termination signal to sync-safekeepers
|
||||
/// to prevent leakage. TODO: it is better to convert compute_ctl to async and
|
||||
/// wait for termination which would be easy then.
|
||||
fn handle_exit_signal(sig: i32) {
|
||||
info!("received {sig} termination signal");
|
||||
let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
|
||||
if ss_pid != 0 {
|
||||
let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
|
||||
kill(ss_pid, Signal::SIGTERM).ok();
|
||||
}
|
||||
let pg_pid = PG_PID.load(Ordering::SeqCst);
|
||||
if pg_pid != 0 {
|
||||
let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
|
||||
kill(pg_pid, Signal::SIGTERM).ok();
|
||||
}
|
||||
exit(1);
|
||||
.arg(
|
||||
// DEPRECATED, NO LONGER DOES ANYTHING.
|
||||
// See https://github.com/neondatabase/cloud/issues/7516
|
||||
Arg::new("file-cache-on-disk")
|
||||
.long("file-cache-on-disk")
|
||||
.action(clap::ArgAction::SetTrue),
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -2,14 +2,12 @@ use std::collections::HashMap;
|
||||
use std::env;
|
||||
use std::fs;
|
||||
use std::io::BufRead;
|
||||
use std::io::Write;
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
use std::process::{Command, Stdio};
|
||||
use std::str::FromStr;
|
||||
use std::sync::atomic::AtomicU32;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::{Condvar, Mutex, RwLock};
|
||||
use std::thread;
|
||||
use std::time::Instant;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
@@ -17,29 +15,26 @@ use chrono::{DateTime, Utc};
|
||||
use futures::future::join_all;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::StreamExt;
|
||||
use notify::event;
|
||||
use postgres::{Client, NoTls};
|
||||
use tokio;
|
||||
use tokio_postgres;
|
||||
use tracing::{debug, error, info, instrument, warn};
|
||||
use tracing::{error, info, instrument, warn};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use compute_api::responses::{ComputeMetrics, ComputeStatus};
|
||||
use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec};
|
||||
use compute_api::spec::{ComputeMode, ComputeSpec};
|
||||
use utils::measured_stream::MeasuredReader;
|
||||
|
||||
use remote_storage::{DownloadError, RemotePath};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
|
||||
|
||||
use crate::checker::create_availability_check_data;
|
||||
use crate::logger::inlinify;
|
||||
use crate::pg_helpers::*;
|
||||
use crate::spec::*;
|
||||
use crate::sync_sk::{check_if_synced, ping_safekeeper};
|
||||
use crate::{config, extension_server};
|
||||
|
||||
pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0);
|
||||
pub static PG_PID: AtomicU32 = AtomicU32::new(0);
|
||||
|
||||
/// Compute node info shared across several `compute_ctl` threads.
|
||||
pub struct ComputeNode {
|
||||
// Url type maintains proper escaping
|
||||
@@ -66,8 +61,8 @@ pub struct ComputeNode {
|
||||
pub state: Mutex<ComputeState>,
|
||||
/// `Condvar` to allow notifying waiters about state changes.
|
||||
pub state_changed: Condvar,
|
||||
/// the address of extension storage proxy gateway
|
||||
pub ext_remote_storage: Option<String>,
|
||||
/// the S3 bucket that we search for extensions in
|
||||
pub ext_remote_storage: Option<GenericRemoteStorage>,
|
||||
// key: ext_archive_name, value: started download time, download_completed?
|
||||
pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
|
||||
pub build_tag: String,
|
||||
@@ -259,7 +254,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
|
||||
IF NOT EXISTS (
|
||||
SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
|
||||
THEN
|
||||
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
|
||||
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION IN ROLE pg_read_all_data, pg_write_all_data;
|
||||
IF array_length(roles, 1) IS NOT NULL THEN
|
||||
EXECUTE format('GRANT neon_superuser TO %s',
|
||||
array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));
|
||||
@@ -276,7 +271,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
|
||||
$$;"#,
|
||||
roles_decl, database_decl,
|
||||
);
|
||||
info!("Neon superuser created: {}", inlinify(&query));
|
||||
info!("Neon superuser created:\n{}", &query);
|
||||
client
|
||||
.simple_query(&query)
|
||||
.map_err(|e| anyhow::anyhow!(e).context(query))?;
|
||||
@@ -284,17 +279,6 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
|
||||
}
|
||||
|
||||
impl ComputeNode {
|
||||
/// Check that compute node has corresponding feature enabled.
|
||||
pub fn has_feature(&self, feature: ComputeFeature) -> bool {
|
||||
let state = self.state.lock().unwrap();
|
||||
|
||||
if let Some(s) = state.pspec.as_ref() {
|
||||
s.spec.features.contains(&feature)
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_status(&self, status: ComputeStatus) {
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.status = status;
|
||||
@@ -319,7 +303,7 @@ impl ComputeNode {
|
||||
// Get basebackup from the libpq connection to pageserver using `connstr` and
|
||||
// unarchive it to `pgdata` directory overriding all its previous content.
|
||||
#[instrument(skip_all, fields(%lsn))]
|
||||
fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
|
||||
fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
|
||||
let spec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let start_time = Instant::now();
|
||||
|
||||
@@ -390,34 +374,6 @@ impl ComputeNode {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Gets the basebackup in a retry loop
|
||||
#[instrument(skip_all, fields(%lsn))]
|
||||
pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
|
||||
let mut retry_period_ms = 500;
|
||||
let mut attempts = 0;
|
||||
let max_attempts = 5;
|
||||
loop {
|
||||
let result = self.try_get_basebackup(compute_state, lsn);
|
||||
match result {
|
||||
Ok(_) => {
|
||||
return result;
|
||||
}
|
||||
Err(ref e) if attempts < max_attempts => {
|
||||
warn!(
|
||||
"Failed to get basebackup: {} (attempt {}/{})",
|
||||
e, attempts, max_attempts
|
||||
);
|
||||
std::thread::sleep(std::time::Duration::from_millis(retry_period_ms));
|
||||
retry_period_ms *= 2;
|
||||
}
|
||||
Err(_) => {
|
||||
return result;
|
||||
}
|
||||
}
|
||||
attempts += 1;
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn check_safekeepers_synced_async(
|
||||
&self,
|
||||
compute_state: &ComputeState,
|
||||
@@ -520,7 +476,7 @@ impl ComputeNode {
|
||||
pub fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
|
||||
let start_time = Utc::now();
|
||||
|
||||
let mut sync_handle = maybe_cgexec(&self.pgbin)
|
||||
let sync_handle = maybe_cgexec(&self.pgbin)
|
||||
.args(["--sync-safekeepers"])
|
||||
.env("PGDATA", &self.pgdata) // we cannot use -D in this mode
|
||||
.envs(if let Some(storage_auth_token) = &storage_auth_token {
|
||||
@@ -529,29 +485,15 @@ impl ComputeNode {
|
||||
vec![]
|
||||
})
|
||||
.stdout(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.spawn()
|
||||
.expect("postgres --sync-safekeepers failed to start");
|
||||
SYNC_SAFEKEEPERS_PID.store(sync_handle.id(), Ordering::SeqCst);
|
||||
|
||||
// `postgres --sync-safekeepers` will print all log output to stderr and
|
||||
// final LSN to stdout. So we leave stdout to collect LSN, while stderr logs
|
||||
// will be collected in a child thread.
|
||||
let stderr = sync_handle
|
||||
.stderr
|
||||
.take()
|
||||
.expect("stderr should be captured");
|
||||
let logs_handle = handle_postgres_logs(stderr);
|
||||
|
||||
// final LSN to stdout. So we pipe only stdout, while stderr will be automatically
|
||||
// redirected to the caller output.
|
||||
let sync_output = sync_handle
|
||||
.wait_with_output()
|
||||
.expect("postgres --sync-safekeepers failed");
|
||||
SYNC_SAFEKEEPERS_PID.store(0, Ordering::SeqCst);
|
||||
|
||||
// Process has exited, so we can join the logs thread.
|
||||
let _ = logs_handle
|
||||
.join()
|
||||
.map_err(|e| tracing::error!("log thread panicked: {:?}", e));
|
||||
|
||||
if !sync_output.status.success() {
|
||||
anyhow::bail!(
|
||||
@@ -689,12 +631,11 @@ impl ComputeNode {
|
||||
|
||||
/// Start Postgres as a child process and manage DBs/roles.
|
||||
/// After that this will hang waiting on the postmaster process to exit.
|
||||
/// Returns a handle to the child process and a handle to the logs thread.
|
||||
#[instrument(skip_all)]
|
||||
pub fn start_postgres(
|
||||
&self,
|
||||
storage_auth_token: Option<String>,
|
||||
) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> {
|
||||
) -> Result<std::process::Child> {
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
|
||||
// Run postgres as a child process.
|
||||
@@ -708,15 +649,30 @@ impl ComputeNode {
|
||||
.stderr(Stdio::piped())
|
||||
.spawn()
|
||||
.expect("cannot start postgres process");
|
||||
PG_PID.store(pg.id(), Ordering::SeqCst);
|
||||
|
||||
// Start a thread to collect logs from stderr.
|
||||
let stderr = pg.stderr.take().expect("stderr should be captured");
|
||||
let logs_handle = handle_postgres_logs(stderr);
|
||||
let stderr = pg.stderr.take().unwrap();
|
||||
std::thread::spawn(move || {
|
||||
let reader = std::io::BufReader::new(stderr);
|
||||
let mut last_lines = vec![];
|
||||
for line in reader.lines() {
|
||||
if let Ok(line) = line {
|
||||
if line.starts_with("2023-") {
|
||||
// print all lines from the previous postgres instance
|
||||
let combined = format!("PG:{}\n", last_lines.join("\u{200B}"));
|
||||
let res = std::io::stderr().lock().write_all(combined.as_bytes());
|
||||
if let Err(e) = res {
|
||||
error!("failed to write to stderr: {}", e);
|
||||
}
|
||||
last_lines.clear();
|
||||
}
|
||||
last_lines.push(line);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
wait_for_postgres(&mut pg, pgdata_path)?;
|
||||
|
||||
Ok((pg, logs_handle))
|
||||
Ok(pg)
|
||||
}
|
||||
|
||||
/// Do initial configuration of the already started Postgres.
|
||||
@@ -728,14 +684,13 @@ impl ComputeNode {
|
||||
// In this case we need to connect with old `zenith_admin` name
|
||||
// and create new user. We cannot simply rename connected user,
|
||||
// but we can create a new one and grant it all privileges.
|
||||
let connstr = self.connstr.clone();
|
||||
let mut client = match Client::connect(connstr.as_str(), NoTls) {
|
||||
let mut client = match Client::connect(self.connstr.as_str(), NoTls) {
|
||||
Err(e) => {
|
||||
info!(
|
||||
"cannot connect to postgres: {}, retrying with `zenith_admin` username",
|
||||
e
|
||||
);
|
||||
let mut zenith_admin_connstr = connstr.clone();
|
||||
let mut zenith_admin_connstr = self.connstr.clone();
|
||||
|
||||
zenith_admin_connstr
|
||||
.set_username("zenith_admin")
|
||||
@@ -748,8 +703,8 @@ impl ComputeNode {
|
||||
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
|
||||
drop(client);
|
||||
|
||||
// reconnect with connstring with expected name
|
||||
Client::connect(connstr.as_str(), NoTls)?
|
||||
// reconnect with connsting with expected name
|
||||
Client::connect(self.connstr.as_str(), NoTls)?
|
||||
}
|
||||
Ok(client) => client,
|
||||
};
|
||||
@@ -763,21 +718,14 @@ impl ComputeNode {
|
||||
cleanup_instance(&mut client)?;
|
||||
handle_roles(spec, &mut client)?;
|
||||
handle_databases(spec, &mut client)?;
|
||||
handle_role_deletions(spec, connstr.as_str(), &mut client)?;
|
||||
handle_grants(spec, &mut client, connstr.as_str())?;
|
||||
handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
|
||||
handle_grants(spec, &mut client, self.connstr.as_str())?;
|
||||
handle_extensions(spec, &mut client)?;
|
||||
handle_extension_neon(&mut client)?;
|
||||
create_availability_check_data(&mut client)?;
|
||||
|
||||
// 'Close' connection
|
||||
drop(client);
|
||||
|
||||
if self.has_feature(ComputeFeature::Migrations) {
|
||||
thread::spawn(move || {
|
||||
let mut client = Client::connect(connstr.as_str(), NoTls)?;
|
||||
handle_migrations(&mut client)
|
||||
});
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -800,33 +748,9 @@ impl ComputeNode {
|
||||
pub fn reconfigure(&self) -> Result<()> {
|
||||
let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;
|
||||
|
||||
if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings {
|
||||
info!("tuning pgbouncer");
|
||||
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to create rt");
|
||||
|
||||
// Spawn a thread to do the tuning,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
let pgbouncer_settings = pgbouncer_settings.clone();
|
||||
let _handle = thread::spawn(move || {
|
||||
let res = rt.block_on(tune_pgbouncer(pgbouncer_settings));
|
||||
if let Err(err) = res {
|
||||
error!("error while tuning pgbouncer: {err:?}");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Write new config
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
let postgresql_conf_path = pgdata_path.join("postgresql.conf");
|
||||
config::write_postgres_conf(&postgresql_conf_path, &spec, None)?;
|
||||
// temporarily reset max_cluster_size in config
|
||||
// to avoid the possibility of hitting the limit, while we are reconfiguring:
|
||||
// creating new extensions, roles, etc...
|
||||
config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
|
||||
self.pg_reload_conf()?;
|
||||
|
||||
let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
|
||||
@@ -841,20 +765,11 @@ impl ComputeNode {
|
||||
handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
|
||||
handle_grants(&spec, &mut client, self.connstr.as_str())?;
|
||||
handle_extensions(&spec, &mut client)?;
|
||||
handle_extension_neon(&mut client)?;
|
||||
// We can skip handle_migrations here because a new migration can only appear
|
||||
// if we have a new version of the compute_ctl binary, which can only happen
|
||||
// if compute got restarted, in which case we'll end up inside of apply_config
|
||||
// instead of reconfigure.
|
||||
}
|
||||
|
||||
// 'Close' connection
|
||||
drop(client);
|
||||
|
||||
// reset max_cluster_size in config back to original value and reload config
|
||||
config::compute_ctl_temp_override_remove(pgdata_path)?;
|
||||
self.pg_reload_conf()?;
|
||||
|
||||
let unknown_op = "unknown".to_string();
|
||||
let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op);
|
||||
info!(
|
||||
@@ -866,10 +781,7 @@ impl ComputeNode {
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
pub fn start_compute(
|
||||
&self,
|
||||
extension_server_port: u16,
|
||||
) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> {
|
||||
pub fn start_compute(&self, extension_server_port: u16) -> Result<std::process::Child> {
|
||||
let compute_state = self.state.lock().unwrap().clone();
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
info!(
|
||||
@@ -880,26 +792,6 @@ impl ComputeNode {
|
||||
pspec.timeline_id,
|
||||
);
|
||||
|
||||
// tune pgbouncer
|
||||
if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings {
|
||||
info!("tuning pgbouncer");
|
||||
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to create rt");
|
||||
|
||||
// Spawn a thread to do the tuning,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
let pgbouncer_settings = pgbouncer_settings.clone();
|
||||
let _handle = thread::spawn(move || {
|
||||
let res = rt.block_on(tune_pgbouncer(pgbouncer_settings));
|
||||
if let Err(err) = res {
|
||||
error!("error while tuning pgbouncer: {err:?}");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
info!(
|
||||
"start_compute spec.remote_extensions {:?}",
|
||||
pspec.spec.remote_extensions
|
||||
@@ -934,21 +826,11 @@ impl ComputeNode {
|
||||
self.prepare_pgdata(&compute_state, extension_server_port)?;
|
||||
|
||||
let start_time = Utc::now();
|
||||
let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?;
|
||||
let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
|
||||
|
||||
let config_time = Utc::now();
|
||||
if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
// temporarily reset max_cluster_size in config
|
||||
// to avoid the possibility of hitting the limit, while we are applying config:
|
||||
// creating new extensions, roles, etc...
|
||||
config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
|
||||
self.pg_reload_conf()?;
|
||||
|
||||
self.apply_config(&compute_state)?;
|
||||
|
||||
config::compute_ctl_temp_override_remove(pgdata_path)?;
|
||||
self.pg_reload_conf()?;
|
||||
}
|
||||
|
||||
let startup_end_time = Utc::now();
|
||||
@@ -984,17 +866,7 @@ impl ComputeNode {
|
||||
};
|
||||
info!(?metrics, "compute start finished");
|
||||
|
||||
Ok(pg_process)
|
||||
}
|
||||
|
||||
/// Update the `last_active` in the shared state, but ensure that it's a more recent one.
|
||||
pub fn update_last_active(&self, last_active: Option<DateTime<Utc>>) {
|
||||
let mut state = self.state.lock().unwrap();
|
||||
// NB: `Some(<DateTime>)` is always greater than `None`.
|
||||
if last_active > state.last_active {
|
||||
state.last_active = last_active;
|
||||
debug!("set the last compute activity time to: {:?}", last_active);
|
||||
}
|
||||
Ok(pg)
|
||||
}
|
||||
|
||||
// Look for core dumps and collect backtraces.
|
||||
@@ -1106,12 +978,12 @@ LIMIT 100",
|
||||
real_ext_name: String,
|
||||
ext_path: RemotePath,
|
||||
) -> Result<u64, DownloadError> {
|
||||
let ext_remote_storage =
|
||||
self.ext_remote_storage
|
||||
.as_ref()
|
||||
.ok_or(DownloadError::BadInput(anyhow::anyhow!(
|
||||
"Remote extensions storage is not configured",
|
||||
)))?;
|
||||
let remote_storage = self
|
||||
.ext_remote_storage
|
||||
.as_ref()
|
||||
.ok_or(DownloadError::BadInput(anyhow::anyhow!(
|
||||
"Remote extensions storage is not configured",
|
||||
)))?;
|
||||
|
||||
let ext_archive_name = ext_path.object_name().expect("bad path");
|
||||
|
||||
@@ -1167,7 +1039,7 @@ LIMIT 100",
|
||||
let download_size = extension_server::download_extension(
|
||||
&real_ext_name,
|
||||
&ext_path,
|
||||
ext_remote_storage,
|
||||
remote_storage,
|
||||
&self.pgbin,
|
||||
)
|
||||
.await
|
||||
|
||||
@@ -93,25 +93,5 @@ pub fn write_postgres_conf(
|
||||
writeln!(file, "neon.extension_server_port={}", port)?;
|
||||
}
|
||||
|
||||
// This is essential to keep this line at the end of the file,
|
||||
// because it is intended to override any settings above.
|
||||
writeln!(file, "include_if_exists = 'compute_ctl_temp_override.conf'")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// create file compute_ctl_temp_override.conf in pgdata_dir
|
||||
/// add provided options to this file
|
||||
pub fn compute_ctl_temp_override_create(pgdata_path: &Path, options: &str) -> Result<()> {
|
||||
let path = pgdata_path.join("compute_ctl_temp_override.conf");
|
||||
let mut file = File::create(path)?;
|
||||
write!(file, "{}", options)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// remove file compute_ctl_temp_override.conf in pgdata_dir
|
||||
pub fn compute_ctl_temp_override_remove(pgdata_path: &Path) -> Result<()> {
|
||||
let path = pgdata_path.join("compute_ctl_temp_override.conf");
|
||||
std::fs::remove_file(path)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -71,16 +71,18 @@ More specifically, here is an example ext_index.json
|
||||
}
|
||||
}
|
||||
*/
|
||||
use anyhow::Context;
|
||||
use anyhow::{self, Result};
|
||||
use anyhow::{bail, Context};
|
||||
use bytes::Bytes;
|
||||
use compute_api::spec::RemoteExtSpec;
|
||||
use regex::Regex;
|
||||
use remote_storage::*;
|
||||
use reqwest::StatusCode;
|
||||
use serde_json;
|
||||
use std::io::Read;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::path::Path;
|
||||
use std::str;
|
||||
use tar::Archive;
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tracing::info;
|
||||
use tracing::log::warn;
|
||||
use zstd::stream::read::Decoder;
|
||||
@@ -136,31 +138,23 @@ fn parse_pg_version(human_version: &str) -> &str {
|
||||
pub async fn download_extension(
|
||||
ext_name: &str,
|
||||
ext_path: &RemotePath,
|
||||
ext_remote_storage: &str,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
pgbin: &str,
|
||||
) -> Result<u64> {
|
||||
info!("Download extension {:?} from {:?}", ext_name, ext_path);
|
||||
|
||||
// TODO add retry logic
|
||||
let download_buffer =
|
||||
match download_extension_tar(ext_remote_storage, &ext_path.to_string()).await {
|
||||
Ok(buffer) => buffer,
|
||||
Err(error_message) => {
|
||||
return Err(anyhow::anyhow!(
|
||||
"error downloading extension {:?}: {:?}",
|
||||
ext_name,
|
||||
error_message
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
let mut download = remote_storage.download(ext_path).await?;
|
||||
let mut download_buffer = Vec::new();
|
||||
download
|
||||
.download_stream
|
||||
.read_to_end(&mut download_buffer)
|
||||
.await?;
|
||||
let download_size = download_buffer.len() as u64;
|
||||
info!("Download size {:?}", download_size);
|
||||
// it's unclear whether it is more performant to decompress into memory or not
|
||||
// TODO: decompressing into memory can be avoided
|
||||
let decoder = Decoder::new(download_buffer.as_ref())?;
|
||||
let mut archive = Archive::new(decoder);
|
||||
|
||||
let mut decoder = Decoder::new(download_buffer.as_slice())?;
|
||||
let mut decompress_buffer = Vec::new();
|
||||
decoder.read_to_end(&mut decompress_buffer)?;
|
||||
let mut archive = Archive::new(decompress_buffer.as_slice());
|
||||
let unzip_dest = pgbin
|
||||
.strip_suffix("/bin/postgres")
|
||||
.expect("bad pgbin")
|
||||
@@ -228,32 +222,29 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
|
||||
}
|
||||
}
|
||||
|
||||
// Do request to extension storage proxy, i.e.
|
||||
// curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst
|
||||
// using HHTP GET
|
||||
// and return the response body as bytes
|
||||
//
|
||||
async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result<Bytes> {
|
||||
let uri = format!("{}/{}", ext_remote_storage, ext_path);
|
||||
|
||||
info!("Download extension {:?} from uri {:?}", ext_path, uri);
|
||||
|
||||
let resp = reqwest::get(uri).await?;
|
||||
|
||||
match resp.status() {
|
||||
StatusCode::OK => match resp.bytes().await {
|
||||
Ok(resp) => {
|
||||
info!("Download extension {:?} completed successfully", ext_path);
|
||||
Ok(resp)
|
||||
}
|
||||
Err(e) => bail!("could not deserialize remote extension response: {}", e),
|
||||
},
|
||||
StatusCode::SERVICE_UNAVAILABLE => bail!("remote extension is temporarily unavailable"),
|
||||
_ => bail!(
|
||||
"unexpected remote extension response status code: {}",
|
||||
resp.status()
|
||||
),
|
||||
// This function initializes the necessary structs to use remote storage
|
||||
pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
|
||||
#[derive(Debug, serde::Deserialize)]
|
||||
struct RemoteExtJson {
|
||||
bucket: String,
|
||||
region: String,
|
||||
endpoint: Option<String>,
|
||||
prefix: Option<String>,
|
||||
}
|
||||
let remote_ext_json = serde_json::from_str::<RemoteExtJson>(remote_ext_config)?;
|
||||
|
||||
let config = S3Config {
|
||||
bucket_name: remote_ext_json.bucket,
|
||||
bucket_region: remote_ext_json.region,
|
||||
prefix_in_bucket: remote_ext_json.prefix,
|
||||
endpoint: remote_ext_json.endpoint,
|
||||
concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"),
|
||||
max_keys_per_list_response: None,
|
||||
};
|
||||
let config = RemoteStorageConfig {
|
||||
storage: RemoteStorageKind::AwsS3(config),
|
||||
};
|
||||
GenericRemoteStorage::from_config(&config)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -123,7 +123,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
||||
}
|
||||
}
|
||||
|
||||
// download extension files from remote extension storage on demand
|
||||
// download extension files from S3 on demand
|
||||
(&Method::POST, route) if route.starts_with("/extension_server/") => {
|
||||
info!("serving {:?} POST request", route);
|
||||
info!("req.uri {:?}", req.uri());
|
||||
@@ -227,7 +227,7 @@ async fn handle_configure_request(
|
||||
|
||||
let parsed_spec = match ParsedSpec::try_from(spec) {
|
||||
Ok(ps) => ps,
|
||||
Err(msg) => return Err((msg, StatusCode::BAD_REQUEST)),
|
||||
Err(msg) => return Err((msg, StatusCode::PRECONDITION_FAILED)),
|
||||
};
|
||||
|
||||
// XXX: wrap state update under lock in code blocks. Otherwise,
|
||||
|
||||
@@ -156,17 +156,17 @@ paths:
|
||||
description: Error text or 'OK' if download succeeded.
|
||||
example: "OK"
|
||||
400:
|
||||
description: Request is invalid.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
description: Request is invalid.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
500:
|
||||
description: Extension download request failed.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
description: Extension download request failed.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
|
||||
components:
|
||||
securitySchemes:
|
||||
|
||||
@@ -38,9 +38,3 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Replace all newline characters with a special character to make it
|
||||
/// easier to grep for log messages.
|
||||
pub fn inlinify(s: &str) -> String {
|
||||
s.replace('\n', "\u{200B}")
|
||||
}
|
||||
|
||||
@@ -3,165 +3,97 @@ use std::{thread, time::Duration};
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use postgres::{Client, NoTls};
|
||||
use tracing::{debug, error, info, warn};
|
||||
use tracing::{debug, info};
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
use compute_api::responses::ComputeStatus;
|
||||
use compute_api::spec::ComputeFeature;
|
||||
|
||||
const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
|
||||
|
||||
// Spin in a loop and figure out the last activity time in the Postgres.
|
||||
// Then update it in the shared state. This function never errors out.
|
||||
// NB: the only expected panic is at `Mutex` unwrap(), all other errors
|
||||
// should be handled gracefully.
|
||||
// XXX: the only expected panic is at `RwLock` unwrap().
|
||||
fn watch_compute_activity(compute: &ComputeNode) {
|
||||
// Suppose that `connstr` doesn't change
|
||||
let connstr = compute.connstr.as_str();
|
||||
|
||||
// During startup and configuration we connect to every Postgres database,
|
||||
// but we don't want to count this as some user activity. So wait until
|
||||
// the compute fully started before monitoring activity.
|
||||
wait_for_postgres_start(compute);
|
||||
|
||||
// Define `client` outside of the loop to reuse existing connection if it's active.
|
||||
let mut client = Client::connect(connstr, NoTls);
|
||||
|
||||
let mut sleep = false;
|
||||
let mut prev_active_time: Option<f64> = None;
|
||||
let mut prev_sessions: Option<i64> = None;
|
||||
|
||||
if compute.has_feature(ComputeFeature::ActivityMonitorExperimental) {
|
||||
info!("starting experimental activity monitor for {}", connstr);
|
||||
} else {
|
||||
info!("starting activity monitor for {}", connstr);
|
||||
}
|
||||
info!("watching Postgres activity at {}", connstr);
|
||||
|
||||
loop {
|
||||
// We use `continue` a lot, so it's more convenient to sleep at the top of the loop.
|
||||
// But skip the first sleep, so we can connect to Postgres immediately.
|
||||
if sleep {
|
||||
// Should be outside of the mutex lock to allow others to read while we sleep.
|
||||
thread::sleep(MONITOR_CHECK_INTERVAL);
|
||||
} else {
|
||||
sleep = true;
|
||||
}
|
||||
// Should be outside of the write lock to allow others to read while we sleep.
|
||||
thread::sleep(MONITOR_CHECK_INTERVAL);
|
||||
|
||||
match &mut client {
|
||||
Ok(cli) => {
|
||||
if cli.is_closed() {
|
||||
info!("connection to Postgres is closed, trying to reconnect");
|
||||
info!("connection to postgres closed, trying to reconnect");
|
||||
|
||||
// Connection is closed, reconnect and try again.
|
||||
client = Client::connect(connstr, NoTls);
|
||||
continue;
|
||||
}
|
||||
|
||||
// This is a new logic, only enable if the feature flag is set.
|
||||
// TODO: remove this once we are sure that it works OR drop it altogether.
|
||||
if compute.has_feature(ComputeFeature::ActivityMonitorExperimental) {
|
||||
// First, check if the total active time or sessions across all databases has changed.
|
||||
// If it did, it means that user executed some queries. In theory, it can even go down if
|
||||
// some databases were dropped, but it's still a user activity.
|
||||
match get_database_stats(cli) {
|
||||
Ok((active_time, sessions)) => {
|
||||
let mut detected_activity = false;
|
||||
// Get all running client backends except ourself, use RFC3339 DateTime format.
|
||||
let backends = cli
|
||||
.query(
|
||||
"SELECT state, to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"') AS state_change
|
||||
FROM pg_stat_activity
|
||||
WHERE backend_type = 'client backend'
|
||||
AND pid != pg_backend_pid()
|
||||
AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors?
|
||||
&[],
|
||||
);
|
||||
let mut last_active = compute.state.lock().unwrap().last_active;
|
||||
|
||||
prev_active_time = match prev_active_time {
|
||||
Some(prev_active_time) => {
|
||||
if active_time != prev_active_time {
|
||||
detected_activity = true;
|
||||
}
|
||||
Some(active_time)
|
||||
}
|
||||
None => Some(active_time),
|
||||
if let Ok(backs) = backends {
|
||||
let mut idle_backs: Vec<DateTime<Utc>> = vec![];
|
||||
|
||||
for b in backs.into_iter() {
|
||||
let state: String = match b.try_get("state") {
|
||||
Ok(state) => state,
|
||||
Err(_) => continue,
|
||||
};
|
||||
|
||||
if state == "idle" {
|
||||
let change: String = match b.try_get("state_change") {
|
||||
Ok(state_change) => state_change,
|
||||
Err(_) => continue,
|
||||
};
|
||||
prev_sessions = match prev_sessions {
|
||||
Some(prev_sessions) => {
|
||||
if sessions != prev_sessions {
|
||||
detected_activity = true;
|
||||
}
|
||||
Some(sessions)
|
||||
let change = DateTime::parse_from_rfc3339(&change);
|
||||
match change {
|
||||
Ok(t) => idle_backs.push(t.with_timezone(&Utc)),
|
||||
Err(e) => {
|
||||
info!("cannot parse backend state_change DateTime: {}", e);
|
||||
continue;
|
||||
}
|
||||
None => Some(sessions),
|
||||
};
|
||||
|
||||
if detected_activity {
|
||||
// Update the last active time and continue, we don't need to
|
||||
// check backends state change.
|
||||
compute.update_last_active(Some(Utc::now()));
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
// Found non-idle backend, so the last activity is NOW.
|
||||
// Save it and exit the for loop. Also clear the idle backend
|
||||
// `state_change` timestamps array as it doesn't matter now.
|
||||
last_active = Some(Utc::now());
|
||||
idle_backs.clear();
|
||||
break;
|
||||
}
|
||||
Err(e) => {
|
||||
error!("could not get database statistics: {}", e);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Get idle backend `state_change` with the max timestamp.
|
||||
if let Some(last) = idle_backs.iter().max() {
|
||||
last_active = Some(*last);
|
||||
}
|
||||
}
|
||||
|
||||
// Second, if database statistics is the same, check all backends state change,
|
||||
// maybe there is some with more recent activity. `get_backends_state_change()`
|
||||
// can return None or stale timestamp, so it's `compute.update_last_active()`
|
||||
// responsibility to check if the new timestamp is more recent than the current one.
|
||||
// This helps us to discover new sessions, that did nothing yet.
|
||||
match get_backends_state_change(cli) {
|
||||
Ok(last_active) => {
|
||||
compute.update_last_active(last_active);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("could not get backends state change: {}", e);
|
||||
}
|
||||
}
|
||||
|
||||
// Finally, if there are existing (logical) walsenders, do not suspend.
|
||||
//
|
||||
// walproposer doesn't currently show up in pg_stat_replication,
|
||||
// but protect if it will be
|
||||
let ws_count_query = "select count(*) from pg_stat_replication where application_name != 'walproposer';";
|
||||
match cli.query_one(ws_count_query, &[]) {
|
||||
Ok(r) => match r.try_get::<&str, i64>("count") {
|
||||
Ok(num_ws) => {
|
||||
if num_ws > 0 {
|
||||
compute.update_last_active(Some(Utc::now()));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("failed to parse walsenders count: {:?}", e);
|
||||
continue;
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
warn!("failed to get list of walsenders: {:?}", e);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
//
|
||||
// Do not suspend compute if autovacuum is running
|
||||
//
|
||||
let autovacuum_count_query = "select count(*) from pg_stat_activity where backend_type = 'autovacuum worker'";
|
||||
match cli.query_one(autovacuum_count_query, &[]) {
|
||||
Ok(r) => match r.try_get::<&str, i64>("count") {
|
||||
Ok(num_workers) => {
|
||||
if num_workers > 0 {
|
||||
compute.update_last_active(Some(Utc::now()));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("failed to parse autovacuum workers count: {:?}", e);
|
||||
continue;
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
warn!("failed to get list of autovacuum workers: {:?}", e);
|
||||
continue;
|
||||
}
|
||||
// Update the last activity in the shared state if we got a more recent one.
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
// NB: `Some(<DateTime>)` is always greater than `None`.
|
||||
if last_active > state.last_active {
|
||||
state.last_active = last_active;
|
||||
debug!("set the last compute activity time to: {:?}", last_active);
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
debug!("could not connect to Postgres: {}, retrying", e);
|
||||
debug!("cannot connect to postgres: {}, retrying", e);
|
||||
|
||||
// Establish a new connection and try again.
|
||||
client = Client::connect(connstr, NoTls);
|
||||
@@ -170,124 +102,12 @@ fn watch_compute_activity(compute: &ComputeNode) {
|
||||
}
|
||||
}
|
||||
|
||||
// Hang on condition variable waiting until the compute status is `Running`.
|
||||
fn wait_for_postgres_start(compute: &ComputeNode) {
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
while state.status != ComputeStatus::Running {
|
||||
info!("compute is not running, waiting before monitoring activity");
|
||||
state = compute.state_changed.wait(state).unwrap();
|
||||
|
||||
if state.status == ComputeStatus::Running {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Figure out the total active time and sessions across all non-system databases.
|
||||
// Returned tuple is `(active_time, sessions)`.
|
||||
// It can return `0.0` active time or `0` sessions, which means no user databases exist OR
|
||||
// it was a start with skipped `pg_catalog` updates and user didn't do any queries
|
||||
// (or open any sessions) yet.
|
||||
fn get_database_stats(cli: &mut Client) -> anyhow::Result<(f64, i64)> {
|
||||
// Filter out `postgres` database as `compute_ctl` and other monitoring tools
|
||||
// like `postgres_exporter` use it to query Postgres statistics.
|
||||
// Use explicit 8 bytes type casts to match Rust types.
|
||||
let stats = cli.query_one(
|
||||
"SELECT coalesce(sum(active_time), 0.0)::float8 AS total_active_time,
|
||||
coalesce(sum(sessions), 0)::bigint AS total_sessions
|
||||
FROM pg_stat_database
|
||||
WHERE datname NOT IN (
|
||||
'postgres',
|
||||
'template0',
|
||||
'template1'
|
||||
);",
|
||||
&[],
|
||||
);
|
||||
let stats = match stats {
|
||||
Ok(stats) => stats,
|
||||
Err(e) => {
|
||||
return Err(anyhow::anyhow!("could not query active_time: {}", e));
|
||||
}
|
||||
};
|
||||
|
||||
let active_time: f64 = match stats.try_get("total_active_time") {
|
||||
Ok(active_time) => active_time,
|
||||
Err(e) => return Err(anyhow::anyhow!("could not get total_active_time: {}", e)),
|
||||
};
|
||||
|
||||
let sessions: i64 = match stats.try_get("total_sessions") {
|
||||
Ok(sessions) => sessions,
|
||||
Err(e) => return Err(anyhow::anyhow!("could not get total_sessions: {}", e)),
|
||||
};
|
||||
|
||||
Ok((active_time, sessions))
|
||||
}
|
||||
|
||||
// Figure out the most recent state change time across all client backends.
|
||||
// If there is currently active backend, timestamp will be `Utc::now()`.
|
||||
// It can return `None`, which means no client backends exist or we were
|
||||
// unable to parse the timestamp.
|
||||
fn get_backends_state_change(cli: &mut Client) -> anyhow::Result<Option<DateTime<Utc>>> {
|
||||
let mut last_active: Option<DateTime<Utc>> = None;
|
||||
// Get all running client backends except ourself, use RFC3339 DateTime format.
|
||||
let backends = cli.query(
|
||||
"SELECT state, to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"') AS state_change
|
||||
FROM pg_stat_activity
|
||||
WHERE backend_type = 'client backend'
|
||||
AND pid != pg_backend_pid()
|
||||
AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors?
|
||||
&[],
|
||||
);
|
||||
|
||||
match backends {
|
||||
Ok(backs) => {
|
||||
let mut idle_backs: Vec<DateTime<Utc>> = vec![];
|
||||
|
||||
for b in backs.into_iter() {
|
||||
let state: String = match b.try_get("state") {
|
||||
Ok(state) => state,
|
||||
Err(_) => continue,
|
||||
};
|
||||
|
||||
if state == "idle" {
|
||||
let change: String = match b.try_get("state_change") {
|
||||
Ok(state_change) => state_change,
|
||||
Err(_) => continue,
|
||||
};
|
||||
let change = DateTime::parse_from_rfc3339(&change);
|
||||
match change {
|
||||
Ok(t) => idle_backs.push(t.with_timezone(&Utc)),
|
||||
Err(e) => {
|
||||
info!("cannot parse backend state_change DateTime: {}", e);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Found non-idle backend, so the last activity is NOW.
|
||||
// Return immediately, no need to check other backends.
|
||||
return Ok(Some(Utc::now()));
|
||||
}
|
||||
}
|
||||
|
||||
// Get idle backend `state_change` with the max timestamp.
|
||||
if let Some(last) = idle_backs.iter().max() {
|
||||
last_active = Some(*last);
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(anyhow::anyhow!("could not query backends: {}", e));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(last_active)
|
||||
}
|
||||
|
||||
/// Launch a separate compute monitor thread and return its `JoinHandle`.
|
||||
pub fn launch_monitor(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
|
||||
let compute = Arc::clone(compute);
|
||||
pub fn launch_monitor(state: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
|
||||
let state = Arc::clone(state);
|
||||
|
||||
thread::Builder::new()
|
||||
.name("compute-monitor".into())
|
||||
.spawn(move || watch_compute_activity(&compute))
|
||||
.spawn(move || watch_compute_activity(&state))
|
||||
.expect("cannot launch compute monitor thread")
|
||||
}
|
||||
|
||||
@@ -6,17 +6,12 @@ use std::io::{BufRead, BufReader};
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
use std::process::Child;
|
||||
use std::thread::JoinHandle;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use ini::Ini;
|
||||
use notify::{RecursiveMode, Watcher};
|
||||
use postgres::{Client, Transaction};
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
use tokio::time::timeout;
|
||||
use tokio_postgres::NoTls;
|
||||
use tracing::{debug, error, info, instrument};
|
||||
use tracing::{debug, instrument};
|
||||
|
||||
use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
|
||||
|
||||
@@ -198,11 +193,16 @@ impl Escaping for PgIdent {
|
||||
/// Build a list of existing Postgres roles
|
||||
pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
|
||||
let postgres_roles = xact
|
||||
.query("SELECT rolname, rolpassword FROM pg_catalog.pg_authid", &[])?
|
||||
.query(
|
||||
"SELECT rolname, rolpassword, rolreplication, rolbypassrls FROM pg_catalog.pg_authid",
|
||||
&[],
|
||||
)?
|
||||
.iter()
|
||||
.map(|row| Role {
|
||||
name: row.get("rolname"),
|
||||
encrypted_password: row.get("rolpassword"),
|
||||
replication: Some(row.get("rolreplication")),
|
||||
bypassrls: Some(row.get("rolbypassrls")),
|
||||
options: None,
|
||||
})
|
||||
.collect();
|
||||
@@ -364,172 +364,3 @@ pub fn create_pgdata(pgdata: &str) -> Result<()> {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Update pgbouncer.ini with provided options
|
||||
fn update_pgbouncer_ini(
|
||||
pgbouncer_config: HashMap<String, String>,
|
||||
pgbouncer_ini_path: &str,
|
||||
) -> Result<()> {
|
||||
let mut conf = Ini::load_from_file(pgbouncer_ini_path)?;
|
||||
let section = conf.section_mut(Some("pgbouncer")).unwrap();
|
||||
|
||||
for (option_name, value) in pgbouncer_config.iter() {
|
||||
section.insert(option_name, value);
|
||||
debug!(
|
||||
"Updating pgbouncer.ini with new values {}={}",
|
||||
option_name, value
|
||||
);
|
||||
}
|
||||
|
||||
conf.write_to_file(pgbouncer_ini_path)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Tune pgbouncer.
|
||||
/// 1. Apply new config using pgbouncer admin console
|
||||
/// 2. Add new values to pgbouncer.ini to preserve them after restart
|
||||
pub async fn tune_pgbouncer(pgbouncer_config: HashMap<String, String>) -> Result<()> {
|
||||
let pgbouncer_connstr = if std::env::var_os("AUTOSCALING").is_some() {
|
||||
// for VMs use pgbouncer specific way to connect to
|
||||
// pgbouncer admin console without password
|
||||
// when pgbouncer is running under the same user.
|
||||
"host=/tmp port=6432 dbname=pgbouncer user=pgbouncer".to_string()
|
||||
} else {
|
||||
// for k8s use normal connection string with password
|
||||
// to connect to pgbouncer admin console
|
||||
let mut pgbouncer_connstr =
|
||||
"host=localhost port=6432 dbname=pgbouncer user=postgres sslmode=disable".to_string();
|
||||
if let Ok(pass) = std::env::var("PGBOUNCER_PASSWORD") {
|
||||
pgbouncer_connstr.push_str(format!(" password={}", pass).as_str());
|
||||
}
|
||||
pgbouncer_connstr
|
||||
};
|
||||
|
||||
info!(
|
||||
"Connecting to pgbouncer with connection string: {}",
|
||||
pgbouncer_connstr
|
||||
);
|
||||
|
||||
// connect to pgbouncer, retrying several times
|
||||
// because pgbouncer may not be ready yet
|
||||
let mut retries = 3;
|
||||
let client = loop {
|
||||
match tokio_postgres::connect(&pgbouncer_connstr, NoTls).await {
|
||||
Ok((client, connection)) => {
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection.await {
|
||||
eprintln!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
break client;
|
||||
}
|
||||
Err(e) => {
|
||||
if retries == 0 {
|
||||
return Err(e.into());
|
||||
}
|
||||
error!("Failed to connect to pgbouncer: pgbouncer_connstr {}", e);
|
||||
retries -= 1;
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Apply new config
|
||||
for (option_name, value) in pgbouncer_config.iter() {
|
||||
let query = format!("SET {}={}", option_name, value);
|
||||
// keep this log line for debugging purposes
|
||||
info!("Applying pgbouncer setting change: {}", query);
|
||||
|
||||
if let Err(err) = client.simple_query(&query).await {
|
||||
// Don't fail on error, just print it into log
|
||||
error!(
|
||||
"Failed to apply pgbouncer setting change: {}, {}",
|
||||
query, err
|
||||
);
|
||||
};
|
||||
}
|
||||
|
||||
// save values to pgbouncer.ini
|
||||
// so that they are preserved after pgbouncer restart
|
||||
let pgbouncer_ini_path = if std::env::var_os("AUTOSCALING").is_some() {
|
||||
// in VMs we use /etc/pgbouncer.ini
|
||||
"/etc/pgbouncer.ini".to_string()
|
||||
} else {
|
||||
// in pods we use /var/db/postgres/pgbouncer/pgbouncer.ini
|
||||
// this is a shared volume between pgbouncer and postgres containers
|
||||
// FIXME: fix permissions for this file
|
||||
"/var/db/postgres/pgbouncer/pgbouncer.ini".to_string()
|
||||
};
|
||||
update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Spawn a thread that will read Postgres logs from `stderr`, join multiline logs
|
||||
/// and send them to the logger. In the future we may also want to add context to
|
||||
/// these logs.
|
||||
pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()> {
|
||||
std::thread::spawn(move || {
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to build tokio runtime");
|
||||
|
||||
let res = runtime.block_on(async move {
|
||||
let stderr = tokio::process::ChildStderr::from_std(stderr)?;
|
||||
handle_postgres_logs_async(stderr).await
|
||||
});
|
||||
if let Err(e) = res {
|
||||
tracing::error!("error while processing postgres logs: {}", e);
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// Read Postgres logs from `stderr` until EOF. Buffer is flushed on one of the following conditions:
|
||||
/// - next line starts with timestamp
|
||||
/// - EOF
|
||||
/// - no new lines were written for the last second
|
||||
async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Result<()> {
|
||||
let mut lines = tokio::io::BufReader::new(stderr).lines();
|
||||
let timeout_duration = Duration::from_millis(100);
|
||||
let ts_regex =
|
||||
regex::Regex::new(r"^\d+-\d{2}-\d{2} \d{2}:\d{2}:\d{2}").expect("regex is valid");
|
||||
|
||||
let mut buf = vec![];
|
||||
loop {
|
||||
let next_line = timeout(timeout_duration, lines.next_line()).await;
|
||||
|
||||
// we should flush lines from the buffer if we cannot continue reading multiline message
|
||||
let should_flush_buf = match next_line {
|
||||
// Flushing if new line starts with timestamp
|
||||
Ok(Ok(Some(ref line))) => ts_regex.is_match(line),
|
||||
// Flushing on EOF, timeout or error
|
||||
_ => true,
|
||||
};
|
||||
|
||||
if !buf.is_empty() && should_flush_buf {
|
||||
// join multiline message into a single line, separated by unicode Zero Width Space.
|
||||
// "PG:" suffix is used to distinguish postgres logs from other logs.
|
||||
let combined = format!("PG:{}\n", buf.join("\u{200B}"));
|
||||
buf.clear();
|
||||
|
||||
// sync write to stderr to avoid interleaving with other logs
|
||||
use std::io::Write;
|
||||
let res = std::io::stderr().lock().write_all(combined.as_bytes());
|
||||
if let Err(e) = res {
|
||||
tracing::error!("error while writing to stderr: {}", e);
|
||||
}
|
||||
}
|
||||
|
||||
// if not timeout, append line to the buffer
|
||||
if next_line.is_ok() {
|
||||
match next_line?? {
|
||||
Some(line) => buf.push(line),
|
||||
// EOF
|
||||
None => break,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -9,7 +9,6 @@ use reqwest::StatusCode;
|
||||
use tracing::{error, info, info_span, instrument, span_enabled, warn, Level};
|
||||
|
||||
use crate::config;
|
||||
use crate::logger::inlinify;
|
||||
use crate::params::PG_HBA_ALL_MD5;
|
||||
use crate::pg_helpers::*;
|
||||
|
||||
@@ -119,6 +118,19 @@ pub fn get_spec_from_control_plane(
|
||||
spec
|
||||
}
|
||||
|
||||
/// It takes cluster specification and does the following:
|
||||
/// - Serialize cluster config and put it into `postgresql.conf` completely rewriting the file.
|
||||
/// - Update `pg_hba.conf` to allow external connections.
|
||||
pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
|
||||
// File `postgresql.conf` is no longer included into `basebackup`, so just
|
||||
// always write all config into it creating new file.
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;
|
||||
|
||||
update_pg_hba(pgdata_path)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Check `pg_hba.conf` and update if needed to allow external connections.
|
||||
pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
|
||||
// XXX: consider making it a part of spec.json
|
||||
@@ -190,20 +202,18 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
|
||||
// Print a list of existing Postgres roles (only in debug mode)
|
||||
if span_enabled!(Level::INFO) {
|
||||
let mut vec = Vec::new();
|
||||
info!("postgres roles:");
|
||||
for r in &existing_roles {
|
||||
vec.push(format!(
|
||||
"{}:{}",
|
||||
info!(
|
||||
" - {}:{}",
|
||||
r.name,
|
||||
if r.encrypted_password.is_some() {
|
||||
"[FILTERED]"
|
||||
} else {
|
||||
"(null)"
|
||||
}
|
||||
));
|
||||
);
|
||||
}
|
||||
|
||||
info!("postgres roles (total {}): {:?}", vec.len(), vec);
|
||||
}
|
||||
|
||||
// Process delta operations first
|
||||
@@ -241,10 +251,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
// Refresh Postgres roles info to handle possible roles renaming
|
||||
let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;
|
||||
|
||||
info!(
|
||||
"handling cluster spec roles (total {})",
|
||||
spec.cluster.roles.len()
|
||||
);
|
||||
info!("cluster spec roles:");
|
||||
for role in &spec.cluster.roles {
|
||||
let name = &role.name;
|
||||
// XXX: with a limited number of roles it is fine, but consider making it a HashMap
|
||||
@@ -258,6 +265,8 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
let action = if let Some(r) = pg_role {
|
||||
if (r.encrypted_password.is_none() && role.encrypted_password.is_some())
|
||||
|| (r.encrypted_password.is_some() && role.encrypted_password.is_none())
|
||||
|| !r.bypassrls.unwrap_or(false)
|
||||
|| !r.replication.unwrap_or(false)
|
||||
{
|
||||
RoleAction::Update
|
||||
} else if let Some(pg_pwd) = &r.encrypted_password {
|
||||
@@ -289,25 +298,17 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
match action {
|
||||
RoleAction::None => {}
|
||||
RoleAction::Update => {
|
||||
// This can be run on /every/ role! Not just ones created through the console.
|
||||
// This means that if you add some funny ALTER here that adds a permission,
|
||||
// this will get run even on user-created roles! This will result in different
|
||||
// behavior before and after a spec gets reapplied. The below ALTER as it stands
|
||||
// now only grants LOGIN and changes the password. Please do not allow this branch
|
||||
// to do anything silly.
|
||||
let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
|
||||
let mut query: String =
|
||||
format!("ALTER ROLE {} BYPASSRLS REPLICATION", name.pg_quote());
|
||||
query.push_str(&role.to_pg_options());
|
||||
xact.execute(query.as_str(), &[])?;
|
||||
}
|
||||
RoleAction::Create => {
|
||||
// This branch only runs when roles are created through the console, so it is
|
||||
// safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
|
||||
// from neon_superuser.
|
||||
let mut query: String = format!(
|
||||
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
|
||||
"CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
|
||||
name.pg_quote()
|
||||
);
|
||||
info!("running role create query: '{}'", &query);
|
||||
info!("role create query: '{}'", &query);
|
||||
query.push_str(&role.to_pg_options());
|
||||
xact.execute(query.as_str(), &[])?;
|
||||
}
|
||||
@@ -324,7 +325,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
RoleAction::Create => " -> create",
|
||||
RoleAction::Update => " -> update",
|
||||
};
|
||||
info!(" - {}:{}{}", name, pwd, action_str);
|
||||
info!(" - {}:{}{}", name, pwd, action_str);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -376,49 +377,33 @@ pub fn handle_role_deletions(spec: &ComputeSpec, connstr: &str, client: &mut Cli
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn reassign_owned_objects_in_one_db(
|
||||
conf: Config,
|
||||
role_name: &PgIdent,
|
||||
db_owner: &PgIdent,
|
||||
) -> Result<()> {
|
||||
let mut client = conf.connect(NoTls)?;
|
||||
|
||||
// This will reassign all dependent objects to the db owner
|
||||
let reassign_query = format!(
|
||||
"REASSIGN OWNED BY {} TO {}",
|
||||
role_name.pg_quote(),
|
||||
db_owner.pg_quote()
|
||||
);
|
||||
info!(
|
||||
"reassigning objects owned by '{}' in db '{}' to '{}'",
|
||||
role_name,
|
||||
conf.get_dbname().unwrap_or(""),
|
||||
db_owner
|
||||
);
|
||||
client.simple_query(&reassign_query)?;
|
||||
|
||||
// This now will only drop privileges of the role
|
||||
let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote());
|
||||
client.simple_query(&drop_query)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Reassign all owned objects in all databases to the owner of the database.
|
||||
fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent) -> Result<()> {
|
||||
for db in &spec.cluster.databases {
|
||||
if db.owner != *role_name {
|
||||
let mut conf = Config::from_str(connstr)?;
|
||||
conf.dbname(&db.name);
|
||||
reassign_owned_objects_in_one_db(conf, role_name, &db.owner)?;
|
||||
|
||||
let mut client = conf.connect(NoTls)?;
|
||||
|
||||
// This will reassign all dependent objects to the db owner
|
||||
let reassign_query = format!(
|
||||
"REASSIGN OWNED BY {} TO {}",
|
||||
role_name.pg_quote(),
|
||||
db.owner.pg_quote()
|
||||
);
|
||||
info!(
|
||||
"reassigning objects owned by '{}' in db '{}' to '{}'",
|
||||
role_name, &db.name, &db.owner
|
||||
);
|
||||
client.simple_query(&reassign_query)?;
|
||||
|
||||
// This now will only drop privileges of the role
|
||||
let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote());
|
||||
client.simple_query(&drop_query)?;
|
||||
}
|
||||
}
|
||||
|
||||
// Also handle case when there are no databases in the spec.
|
||||
// In this case we need to reassign objects in the default database.
|
||||
let conf = Config::from_str(connstr)?;
|
||||
let db_owner = PgIdent::from_str("cloud_admin")?;
|
||||
reassign_owned_objects_in_one_db(conf, role_name, &db_owner)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -433,11 +418,10 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
|
||||
// Print a list of existing Postgres databases (only in debug mode)
|
||||
if span_enabled!(Level::INFO) {
|
||||
let mut vec = Vec::new();
|
||||
info!("postgres databases:");
|
||||
for (dbname, db) in &existing_dbs {
|
||||
vec.push(format!("{}:{}", dbname, db.owner));
|
||||
info!(" {}:{}", dbname, db.owner);
|
||||
}
|
||||
info!("postgres databases (total {}): {:?}", vec.len(), vec);
|
||||
}
|
||||
|
||||
// Process delta operations first
|
||||
@@ -509,10 +493,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
// Refresh Postgres databases info to handle possible renames
|
||||
let existing_dbs = get_existing_dbs(client)?;
|
||||
|
||||
info!(
|
||||
"handling cluster spec databases (total {})",
|
||||
spec.cluster.databases.len()
|
||||
);
|
||||
info!("cluster spec databases:");
|
||||
for db in &spec.cluster.databases {
|
||||
let name = &db.name;
|
||||
let pg_db = existing_dbs.get(name);
|
||||
@@ -571,7 +552,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
DatabaseAction::Create => " -> create",
|
||||
DatabaseAction::Update => " -> update",
|
||||
};
|
||||
info!(" - {}:{}{}", db.name, db.owner, action_str);
|
||||
info!(" - {}:{}{}", db.name, db.owner, action_str);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -672,11 +653,7 @@ pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) ->
|
||||
$$;"
|
||||
.to_string();
|
||||
|
||||
info!(
|
||||
"grant query for db {} : {}",
|
||||
&db.name,
|
||||
inlinify(&grant_query)
|
||||
);
|
||||
info!("grant query for db {} : {}", &db.name, &grant_query);
|
||||
db_client.simple_query(&grant_query)?;
|
||||
}
|
||||
|
||||
@@ -693,121 +670,13 @@ pub fn handle_extensions(spec: &ComputeSpec, client: &mut Client) -> Result<()>
|
||||
info!("creating system extensions with query: {}", query);
|
||||
client.simple_query(query)?;
|
||||
}
|
||||
if libs.contains("pg_wait_sampling") {
|
||||
// Create extension only if this compute really needs it
|
||||
let query = "CREATE EXTENSION IF NOT EXISTS pg_wait_sampling";
|
||||
info!("creating system extensions with query: {}", query);
|
||||
client.simple_query(query)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Run CREATE and ALTER EXTENSION neon UPDATE for postgres database
|
||||
#[instrument(skip_all)]
|
||||
pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
|
||||
info!("handle extension neon");
|
||||
|
||||
let mut query = "CREATE SCHEMA IF NOT EXISTS neon";
|
||||
client.simple_query(query)?;
|
||||
|
||||
query = "CREATE EXTENSION IF NOT EXISTS neon WITH SCHEMA neon";
|
||||
info!("create neon extension with query: {}", query);
|
||||
client.simple_query(query)?;
|
||||
|
||||
query = "UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'";
|
||||
client.simple_query(query)?;
|
||||
|
||||
query = "ALTER EXTENSION neon SET SCHEMA neon";
|
||||
info!("alter neon extension schema with query: {}", query);
|
||||
client.simple_query(query)?;
|
||||
|
||||
// this will be a no-op if extension is already up to date,
|
||||
// which may happen in two cases:
|
||||
// - extension was just installed
|
||||
// - extension was already installed and is up to date
|
||||
let query = "ALTER EXTENSION neon UPDATE";
|
||||
info!("update neon extension schema with query: {}", query);
|
||||
client.simple_query(query)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
pub fn handle_migrations(client: &mut Client) -> Result<()> {
|
||||
info!("handle migrations");
|
||||
|
||||
// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
// !BE SURE TO ONLY ADD MIGRATIONS TO THE END OF THIS ARRAY. IF YOU DO NOT, VERY VERY BAD THINGS MAY HAPPEN!
|
||||
// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
|
||||
let migrations = [
|
||||
"ALTER ROLE neon_superuser BYPASSRLS",
|
||||
r#"
|
||||
DO $$
|
||||
DECLARE
|
||||
role_name text;
|
||||
BEGIN
|
||||
FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member')
|
||||
LOOP
|
||||
RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
|
||||
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
|
||||
END LOOP;
|
||||
|
||||
FOR role_name IN SELECT rolname FROM pg_roles
|
||||
WHERE
|
||||
NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_')
|
||||
LOOP
|
||||
RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
|
||||
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
|
||||
END LOOP;
|
||||
END $$;
|
||||
"#,
|
||||
r#"
|
||||
DO $$
|
||||
BEGIN
|
||||
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
|
||||
EXECUTE 'GRANT pg_create_subscription TO neon_superuser';
|
||||
END IF;
|
||||
END
|
||||
$$;"#,
|
||||
];
|
||||
|
||||
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
||||
client.simple_query(query)?;
|
||||
|
||||
query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
|
||||
client.simple_query(query)?;
|
||||
|
||||
query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
|
||||
client.simple_query(query)?;
|
||||
|
||||
query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
|
||||
client.simple_query(query)?;
|
||||
|
||||
query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
|
||||
client.simple_query(query)?;
|
||||
|
||||
query = "SELECT id FROM neon_migration.migration_id";
|
||||
let row = client.query_one(query, &[])?;
|
||||
let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
|
||||
let starting_migration_id = current_migration;
|
||||
|
||||
query = "BEGIN";
|
||||
client.simple_query(query)?;
|
||||
|
||||
while current_migration < migrations.len() {
|
||||
info!("Running migration:\n{}\n", migrations[current_migration]);
|
||||
client.simple_query(migrations[current_migration])?;
|
||||
current_migration += 1;
|
||||
}
|
||||
let setval = format!(
|
||||
"UPDATE neon_migration.migration_id SET id={}",
|
||||
migrations.len()
|
||||
);
|
||||
client.simple_query(&setval)?;
|
||||
|
||||
query = "COMMIT";
|
||||
client.simple_query(query)?;
|
||||
|
||||
info!(
|
||||
"Ran {} migrations",
|
||||
(migrations.len() - starting_migration_id)
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -6,13 +6,9 @@ license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
async-trait.workspace = true
|
||||
camino.workspace = true
|
||||
clap.workspace = true
|
||||
comfy-table.workspace = true
|
||||
diesel = { version = "2.1.4", features = ["postgres"]}
|
||||
diesel_migrations = { version = "2.1.0", features = ["postgres"]}
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
nix.workspace = true
|
||||
once_cell.workspace = true
|
||||
@@ -21,7 +17,6 @@ hex.workspace = true
|
||||
hyper.workspace = true
|
||||
regex.workspace = true
|
||||
reqwest = { workspace = true, features = ["blocking", "json"] }
|
||||
scopeguard.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
serde_with.workspace = true
|
||||
@@ -29,11 +24,10 @@ tar.workspace = true
|
||||
thiserror.workspace = true
|
||||
toml.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-postgres.workspace = true
|
||||
tokio-util.workspace = true
|
||||
url.workspace = true
|
||||
# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
|
||||
# instead, so that recompile times are better.
|
||||
pageserver_api.workspace = true
|
||||
pageserver_client.workspace = true
|
||||
postgres_backend.workspace = true
|
||||
safekeeper_api.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
|
||||
@@ -1,30 +0,0 @@
|
||||
[package]
|
||||
name = "attachment_service"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
camino.workspace = true
|
||||
clap.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
hyper.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_client.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
diesel = { version = "2.1.4", features = ["serde_json", "postgres"] }
|
||||
|
||||
utils = { path = "../../libs/utils/" }
|
||||
metrics = { path = "../../libs/metrics/" }
|
||||
control_plane = { path = ".." }
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
@@ -1,6 +0,0 @@
|
||||
-- This file was automatically created by Diesel to setup helper functions
|
||||
-- and other internal bookkeeping. This file is safe to edit, any future
|
||||
-- changes will be added to existing projects as new migrations.
|
||||
|
||||
DROP FUNCTION IF EXISTS diesel_manage_updated_at(_tbl regclass);
|
||||
DROP FUNCTION IF EXISTS diesel_set_updated_at();
|
||||
@@ -1,36 +0,0 @@
|
||||
-- This file was automatically created by Diesel to setup helper functions
|
||||
-- and other internal bookkeeping. This file is safe to edit, any future
|
||||
-- changes will be added to existing projects as new migrations.
|
||||
|
||||
|
||||
|
||||
|
||||
-- Sets up a trigger for the given table to automatically set a column called
|
||||
-- `updated_at` whenever the row is modified (unless `updated_at` was included
|
||||
-- in the modified columns)
|
||||
--
|
||||
-- # Example
|
||||
--
|
||||
-- ```sql
|
||||
-- CREATE TABLE users (id SERIAL PRIMARY KEY, updated_at TIMESTAMP NOT NULL DEFAULT NOW());
|
||||
--
|
||||
-- SELECT diesel_manage_updated_at('users');
|
||||
-- ```
|
||||
CREATE OR REPLACE FUNCTION diesel_manage_updated_at(_tbl regclass) RETURNS VOID AS $$
|
||||
BEGIN
|
||||
EXECUTE format('CREATE TRIGGER set_updated_at BEFORE UPDATE ON %s
|
||||
FOR EACH ROW EXECUTE PROCEDURE diesel_set_updated_at()', _tbl);
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
|
||||
CREATE OR REPLACE FUNCTION diesel_set_updated_at() RETURNS trigger AS $$
|
||||
BEGIN
|
||||
IF (
|
||||
NEW IS DISTINCT FROM OLD AND
|
||||
NEW.updated_at IS NOT DISTINCT FROM OLD.updated_at
|
||||
) THEN
|
||||
NEW.updated_at := current_timestamp;
|
||||
END IF;
|
||||
RETURN NEW;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
@@ -1 +0,0 @@
|
||||
DROP TABLE tenant_shards;
|
||||
@@ -1,12 +0,0 @@
|
||||
CREATE TABLE tenant_shards (
|
||||
tenant_id VARCHAR NOT NULL,
|
||||
shard_number INTEGER NOT NULL,
|
||||
shard_count INTEGER NOT NULL,
|
||||
PRIMARY KEY(tenant_id, shard_number, shard_count),
|
||||
shard_stripe_size INTEGER NOT NULL,
|
||||
generation INTEGER NOT NULL,
|
||||
generation_pageserver BIGINT NOT NULL,
|
||||
placement_policy VARCHAR NOT NULL,
|
||||
-- config is JSON encoded, opaque to the database.
|
||||
config TEXT NOT NULL
|
||||
);
|
||||
@@ -1 +0,0 @@
|
||||
DROP TABLE nodes;
|
||||
@@ -1,10 +0,0 @@
|
||||
CREATE TABLE nodes (
|
||||
node_id BIGINT PRIMARY KEY NOT NULL,
|
||||
|
||||
scheduling_policy VARCHAR NOT NULL,
|
||||
|
||||
listen_http_addr VARCHAR NOT NULL,
|
||||
listen_http_port INTEGER NOT NULL,
|
||||
listen_pg_addr VARCHAR NOT NULL,
|
||||
listen_pg_port INTEGER NOT NULL
|
||||
);
|
||||
@@ -1,116 +0,0 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use control_plane::endpoint::ComputeControlPlane;
|
||||
use control_plane::local_env::LocalEnv;
|
||||
use pageserver_api::shard::{ShardCount, ShardIndex, TenantShardId};
|
||||
use postgres_connection::parse_host_port;
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
pub(super) struct ComputeHookTenant {
|
||||
shards: Vec<(ShardIndex, NodeId)>,
|
||||
}
|
||||
|
||||
impl ComputeHookTenant {
|
||||
pub(super) async fn maybe_reconfigure(&mut self, tenant_id: TenantId) -> anyhow::Result<()> {
|
||||
// Find the highest shard count and drop any shards that aren't
|
||||
// for that shard count.
|
||||
let shard_count = self.shards.iter().map(|(k, _v)| k.shard_count).max();
|
||||
let Some(shard_count) = shard_count else {
|
||||
// No shards, nothing to do.
|
||||
tracing::info!("ComputeHookTenant::maybe_reconfigure: no shards");
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
self.shards.retain(|(k, _v)| k.shard_count == shard_count);
|
||||
self.shards
|
||||
.sort_by_key(|(shard, _node_id)| shard.shard_number);
|
||||
|
||||
if self.shards.len() == shard_count.0 as usize || shard_count == ShardCount(0) {
|
||||
// We have pageservers for all the shards: proceed to reconfigure compute
|
||||
let env = match LocalEnv::load_config() {
|
||||
Ok(e) => e,
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
"Couldn't load neon_local config, skipping compute update ({e})"
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
let cplane = ComputeControlPlane::load(env.clone())
|
||||
.expect("Error loading compute control plane");
|
||||
|
||||
let compute_pageservers = self
|
||||
.shards
|
||||
.iter()
|
||||
.map(|(_shard, node_id)| {
|
||||
let ps_conf = env
|
||||
.get_pageserver_conf(*node_id)
|
||||
.expect("Unknown pageserver");
|
||||
let (pg_host, pg_port) = parse_host_port(&ps_conf.listen_pg_addr)
|
||||
.expect("Unable to parse listen_pg_addr");
|
||||
(pg_host, pg_port.unwrap_or(5432))
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
for (endpoint_name, endpoint) in &cplane.endpoints {
|
||||
if endpoint.tenant_id == tenant_id && endpoint.status() == "running" {
|
||||
tracing::info!("🔁 Reconfiguring endpoint {}", endpoint_name,);
|
||||
endpoint.reconfigure(compute_pageservers.clone()).await?;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
tracing::info!(
|
||||
"ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
|
||||
self.shards.len(),
|
||||
shard_count.0
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// The compute hook is a destination for notifications about changes to tenant:pageserver
|
||||
/// mapping. It aggregates updates for the shards in a tenant, and when appropriate reconfigures
|
||||
/// the compute connection string.
|
||||
pub(super) struct ComputeHook {
|
||||
state: tokio::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
|
||||
}
|
||||
|
||||
impl ComputeHook {
|
||||
pub(super) fn new() -> Self {
|
||||
Self {
|
||||
state: Default::default(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) async fn notify(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
node_id: NodeId,
|
||||
) -> anyhow::Result<()> {
|
||||
tracing::info!("ComputeHook::notify: {}->{}", tenant_shard_id, node_id);
|
||||
let mut locked = self.state.lock().await;
|
||||
let entry = locked
|
||||
.entry(tenant_shard_id.tenant_id)
|
||||
.or_insert_with(|| ComputeHookTenant { shards: Vec::new() });
|
||||
|
||||
let shard_index = ShardIndex {
|
||||
shard_count: tenant_shard_id.shard_count,
|
||||
shard_number: tenant_shard_id.shard_number,
|
||||
};
|
||||
|
||||
let mut set = false;
|
||||
for (existing_shard, existing_node) in &mut entry.shards {
|
||||
if *existing_shard == shard_index {
|
||||
*existing_node = node_id;
|
||||
set = true;
|
||||
}
|
||||
}
|
||||
if !set {
|
||||
entry.shards.push((shard_index, node_id));
|
||||
}
|
||||
|
||||
entry.maybe_reconfigure(tenant_shard_id.tenant_id).await
|
||||
}
|
||||
}
|
||||
@@ -1,430 +0,0 @@
|
||||
use crate::reconciler::ReconcileError;
|
||||
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
|
||||
use hyper::{Body, Request, Response};
|
||||
use hyper::{StatusCode, Uri};
|
||||
use pageserver_api::models::{
|
||||
TenantCreateRequest, TenantLocationConfigRequest, TimelineCreateRequest,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
use utils::auth::SwappableJwtAuth;
|
||||
use utils::http::endpoint::{auth_middleware, request_span};
|
||||
use utils::http::request::parse_request_param;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use utils::{
|
||||
http::{
|
||||
endpoint::{self},
|
||||
error::ApiError,
|
||||
json::{json_request, json_response},
|
||||
RequestExt, RouterBuilder,
|
||||
},
|
||||
id::NodeId,
|
||||
};
|
||||
|
||||
use pageserver_api::control_api::{ReAttachRequest, ValidateRequest};
|
||||
|
||||
use control_plane::attachment_service::{
|
||||
AttachHookRequest, InspectRequest, NodeConfigureRequest, NodeRegisterRequest,
|
||||
TenantShardMigrateRequest,
|
||||
};
|
||||
|
||||
/// State available to HTTP request handlers
|
||||
#[derive(Clone)]
|
||||
pub struct HttpState {
|
||||
service: Arc<crate::service::Service>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
allowlist_routes: Vec<Uri>,
|
||||
}
|
||||
|
||||
impl HttpState {
|
||||
pub fn new(service: Arc<crate::service::Service>, auth: Option<Arc<SwappableJwtAuth>>) -> Self {
|
||||
let allowlist_routes = ["/status"]
|
||||
.iter()
|
||||
.map(|v| v.parse().unwrap())
|
||||
.collect::<Vec<_>>();
|
||||
Self {
|
||||
service,
|
||||
auth,
|
||||
allowlist_routes,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn get_state(request: &Request<Body>) -> &HttpState {
|
||||
request
|
||||
.data::<Arc<HttpState>>()
|
||||
.expect("unknown state type")
|
||||
.as_ref()
|
||||
}
|
||||
|
||||
/// Pageserver calls into this on startup, to learn which tenants it should attach
|
||||
async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
|
||||
let state = get_state(&req);
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
state
|
||||
.service
|
||||
.re_attach(reattach_req)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?,
|
||||
)
|
||||
}
|
||||
|
||||
/// Pageserver calls into this before doing deletions, to confirm that it still
|
||||
/// holds the latest generation for the tenants with deletions enqueued
|
||||
async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let validate_req = json_request::<ValidateRequest>(&mut req).await?;
|
||||
let state = get_state(&req);
|
||||
json_response(StatusCode::OK, state.service.validate(validate_req))
|
||||
}
|
||||
|
||||
/// Call into this before attaching a tenant to a pageserver, to acquire a generation number
|
||||
/// (in the real control plane this is unnecessary, because the same program is managing
|
||||
/// generation numbers and doing attachments).
|
||||
async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
|
||||
let state = get_state(&req);
|
||||
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
state
|
||||
.service
|
||||
.attach_hook(attach_req)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?,
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let inspect_req = json_request::<InspectRequest>(&mut req).await?;
|
||||
|
||||
let state = get_state(&req);
|
||||
|
||||
json_response(StatusCode::OK, state.service.inspect(inspect_req))
|
||||
}
|
||||
|
||||
async fn handle_tenant_create(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
|
||||
json_response(StatusCode::OK, service.tenant_create(create_req).await?)
|
||||
}
|
||||
|
||||
// For tenant and timeline deletions, which both implement an "initially return 202, then 404 once
|
||||
// we're done" semantic, we wrap with a retry loop to expose a simpler API upstream. This avoids
|
||||
// needing to track a "deleting" state for tenants.
|
||||
async fn deletion_wrapper<R, F>(service: Arc<Service>, f: F) -> Result<Response<Body>, ApiError>
|
||||
where
|
||||
R: std::future::Future<Output = Result<StatusCode, ApiError>> + Send + 'static,
|
||||
F: Fn(Arc<Service>) -> R + Send + Sync + 'static,
|
||||
{
|
||||
let started_at = Instant::now();
|
||||
// To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion
|
||||
// completed.
|
||||
let mut retry_period = Duration::from_secs(1);
|
||||
// On subsequent retries, wait longer.
|
||||
let max_retry_period = Duration::from_secs(5);
|
||||
// Enable callers with a 30 second request timeout to reliably get a response
|
||||
let max_wait = Duration::from_secs(25);
|
||||
|
||||
loop {
|
||||
let status = f(service.clone()).await?;
|
||||
match status {
|
||||
StatusCode::ACCEPTED => {
|
||||
tracing::info!("Deletion accepted, waiting to try again...");
|
||||
tokio::time::sleep(retry_period).await;
|
||||
retry_period = max_retry_period;
|
||||
}
|
||||
StatusCode::NOT_FOUND => {
|
||||
tracing::info!("Deletion complete");
|
||||
return json_response(StatusCode::OK, ());
|
||||
}
|
||||
_ => {
|
||||
tracing::warn!("Unexpected status {status}");
|
||||
return json_response(status, ());
|
||||
}
|
||||
}
|
||||
|
||||
let now = Instant::now();
|
||||
if now + retry_period > started_at + max_wait {
|
||||
tracing::info!("Deletion timed out waiting for 404");
|
||||
// REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of
|
||||
// the pageserver's swagger definition for this endpoint, and has the same desired
|
||||
// effect of causing the control plane to retry later.
|
||||
return json_response(StatusCode::CONFLICT, ());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_tenant_location_config(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
service
|
||||
.tenant_location_config(tenant_id, config_req)
|
||||
.await?,
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_delete(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
|
||||
deletion_wrapper(service, move |service| async move {
|
||||
service.tenant_delete(tenant_id).await
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
async fn handle_tenant_timeline_create(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
service
|
||||
.tenant_timeline_create(tenant_id, create_req)
|
||||
.await?,
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_timeline_delete(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
|
||||
|
||||
deletion_wrapper(service, move |service| async move {
|
||||
service.tenant_timeline_delete(tenant_id, timeline_id).await
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
async fn handle_tenant_timeline_passthrough(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
|
||||
let Some(path) = req.uri().path_and_query() else {
|
||||
// This should never happen, our request router only calls us if there is a path
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path")));
|
||||
};
|
||||
|
||||
tracing::info!("Proxying request for tenant {} ({})", tenant_id, path);
|
||||
|
||||
// Find the node that holds shard zero
|
||||
let (base_url, tenant_shard_id) = service.tenant_shard0_baseurl(tenant_id)?;
|
||||
|
||||
// Callers will always pass an unsharded tenant ID. Before proxying, we must
|
||||
// rewrite this to a shard-aware shard zero ID.
|
||||
let path = format!("{}", path);
|
||||
let tenant_str = tenant_id.to_string();
|
||||
let tenant_shard_str = format!("{}", tenant_shard_id);
|
||||
let path = path.replace(&tenant_str, &tenant_shard_str);
|
||||
|
||||
let client = mgmt_api::Client::new(base_url, service.get_config().jwt_token.as_deref());
|
||||
let resp = client.get_raw(path).await.map_err(|_e|
|
||||
// FIXME: give APiError a proper Unavailable variant. We return 503 here because
|
||||
// if we can't successfully send a request to the pageserver, we aren't available.
|
||||
ApiError::ShuttingDown)?;
|
||||
|
||||
// We have a reqest::Response, would like a http::Response
|
||||
let mut builder = hyper::Response::builder()
|
||||
.status(resp.status())
|
||||
.version(resp.version());
|
||||
for (k, v) in resp.headers() {
|
||||
builder = builder.header(k, v);
|
||||
}
|
||||
|
||||
let response = builder
|
||||
.body(Body::wrap_stream(resp.bytes_stream()))
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
async fn handle_tenant_locate(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
|
||||
}
|
||||
|
||||
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let register_req = json_request::<NodeRegisterRequest>(&mut req).await?;
|
||||
let state = get_state(&req);
|
||||
state.service.node_register(register_req).await?;
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let state = get_state(&req);
|
||||
json_response(StatusCode::OK, state.service.node_list().await?)
|
||||
}
|
||||
|
||||
async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let node_id: NodeId = parse_request_param(&req, "node_id")?;
|
||||
let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
|
||||
if node_id != config_req.node_id {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Path and body node_id differ"
|
||||
)));
|
||||
}
|
||||
let state = get_state(&req);
|
||||
|
||||
json_response(StatusCode::OK, state.service.node_configure(config_req)?)
|
||||
}
|
||||
|
||||
async fn handle_tenant_shard_migrate(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
|
||||
let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
service
|
||||
.tenant_shard_migrate(tenant_shard_id, migrate_req)
|
||||
.await?,
|
||||
)
|
||||
}
|
||||
|
||||
/// Status endpoint is just used for checking that our HTTP listener is up
|
||||
async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
impl From<ReconcileError> for ApiError {
|
||||
fn from(value: ReconcileError) -> Self {
|
||||
ApiError::Conflict(format!("Reconciliation error: {}", value))
|
||||
}
|
||||
}
|
||||
|
||||
/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
|
||||
/// be allowed to run if Service has finished its initial reconciliation.
|
||||
async fn tenant_service_handler<R, H>(request: Request<Body>, handler: H) -> R::Output
|
||||
where
|
||||
R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
|
||||
H: FnOnce(Arc<Service>, Request<Body>) -> R + Send + Sync + 'static,
|
||||
{
|
||||
let state = get_state(&request);
|
||||
let service = state.service.clone();
|
||||
|
||||
let startup_complete = service.startup_complete.clone();
|
||||
if tokio::time::timeout(STARTUP_RECONCILE_TIMEOUT, startup_complete.wait())
|
||||
.await
|
||||
.is_err()
|
||||
{
|
||||
// This shouldn't happen: it is the responsibilty of [`Service::startup_reconcile`] to use appropriate
|
||||
// timeouts around its remote calls, to bound its runtime.
|
||||
return Err(ApiError::Timeout(
|
||||
"Timed out waiting for service readiness".into(),
|
||||
));
|
||||
}
|
||||
|
||||
request_span(
|
||||
request,
|
||||
|request| async move { handler(service, request).await },
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
pub fn make_router(
|
||||
service: Arc<Service>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
let mut router = endpoint::make_router();
|
||||
if auth.is_some() {
|
||||
router = router.middleware(auth_middleware(|request| {
|
||||
let state = get_state(request);
|
||||
if state.allowlist_routes.contains(request.uri()) {
|
||||
None
|
||||
} else {
|
||||
state.auth.as_deref()
|
||||
}
|
||||
}))
|
||||
}
|
||||
|
||||
router
|
||||
.data(Arc::new(HttpState::new(service, auth)))
|
||||
// Non-prefixed generic endpoints (status, metrics)
|
||||
.get("/status", |r| request_span(r, handle_status))
|
||||
// Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
|
||||
.post("/upcall/v1/re-attach", |r| {
|
||||
request_span(r, handle_re_attach)
|
||||
})
|
||||
.post("/upcall/v1/validate", |r| request_span(r, handle_validate))
|
||||
// Test/dev/debug endpoints
|
||||
.post("/debug/v1/attach-hook", |r| {
|
||||
request_span(r, handle_attach_hook)
|
||||
})
|
||||
.post("/debug/v1/inspect", |r| request_span(r, handle_inspect))
|
||||
.get("/control/v1/tenant/:tenant_id/locate", |r| {
|
||||
tenant_service_handler(r, handle_tenant_locate)
|
||||
})
|
||||
// Node operations
|
||||
.post("/control/v1/node", |r| {
|
||||
request_span(r, handle_node_register)
|
||||
})
|
||||
.get("/control/v1/node", |r| request_span(r, handle_node_list))
|
||||
.put("/control/v1/node/:node_id/config", |r| {
|
||||
request_span(r, handle_node_configure)
|
||||
})
|
||||
// Tenant Shard operations
|
||||
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
|
||||
tenant_service_handler(r, handle_tenant_shard_migrate)
|
||||
})
|
||||
// Tenant operations
|
||||
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
|
||||
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
|
||||
.post("/v1/tenant", |r| {
|
||||
tenant_service_handler(r, handle_tenant_create)
|
||||
})
|
||||
.delete("/v1/tenant/:tenant_id", |r| {
|
||||
tenant_service_handler(r, handle_tenant_delete)
|
||||
})
|
||||
.put("/v1/tenant/:tenant_id/location_config", |r| {
|
||||
tenant_service_handler(r, handle_tenant_location_config)
|
||||
})
|
||||
// Tenant Shard operations (low level/maintenance)
|
||||
.put("/tenant/:tenant_shard_id/migrate", |r| {
|
||||
tenant_service_handler(r, handle_tenant_shard_migrate)
|
||||
})
|
||||
// Timeline operations
|
||||
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
|
||||
tenant_service_handler(r, handle_tenant_timeline_delete)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/timeline", |r| {
|
||||
tenant_service_handler(r, handle_tenant_timeline_create)
|
||||
})
|
||||
// Tenant detail GET passthrough to shard zero
|
||||
.get("/v1/tenant/:tenant_id*", |r| {
|
||||
tenant_service_handler(r, handle_tenant_timeline_passthrough)
|
||||
})
|
||||
// Timeline GET passthrough to shard zero. Note that the `*` in the URL is a wildcard: any future
|
||||
// timeline GET APIs will be implicitly included.
|
||||
.get("/v1/tenant/:tenant_id/timeline*", |r| {
|
||||
tenant_service_handler(r, handle_tenant_timeline_passthrough)
|
||||
})
|
||||
// Path aliases for tests_forward_compatibility
|
||||
// TODO: remove these in future PR
|
||||
.post("/re-attach", |r| request_span(r, handle_re_attach))
|
||||
.post("/validate", |r| request_span(r, handle_validate))
|
||||
}
|
||||
@@ -1,60 +0,0 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::seqwait::MonotonicCounter;
|
||||
|
||||
mod compute_hook;
|
||||
pub mod http;
|
||||
mod node;
|
||||
pub mod persistence;
|
||||
mod reconciler;
|
||||
mod scheduler;
|
||||
mod schema;
|
||||
pub mod service;
|
||||
mod tenant_state;
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
enum PlacementPolicy {
|
||||
/// Cheapest way to attach a tenant: just one pageserver, no secondary
|
||||
Single,
|
||||
/// Production-ready way to attach a tenant: one attached pageserver and
|
||||
/// some number of secondaries.
|
||||
Double(usize),
|
||||
/// Do not attach to any pageservers
|
||||
Detached,
|
||||
}
|
||||
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone)]
|
||||
struct Sequence(u64);
|
||||
|
||||
impl Sequence {
|
||||
fn initial() -> Self {
|
||||
Self(0)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Sequence {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl MonotonicCounter<Sequence> for Sequence {
|
||||
fn cnt_advance(&mut self, v: Sequence) {
|
||||
assert!(*self <= v);
|
||||
*self = v;
|
||||
}
|
||||
fn cnt_value(&self) -> Sequence {
|
||||
*self
|
||||
}
|
||||
}
|
||||
|
||||
impl Sequence {
|
||||
fn next(&self) -> Sequence {
|
||||
Sequence(self.0 + 1)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for PlacementPolicy {
|
||||
fn default() -> Self {
|
||||
PlacementPolicy::Double(1)
|
||||
}
|
||||
}
|
||||
@@ -1,116 +0,0 @@
|
||||
/// The attachment service mimics the aspects of the control plane API
|
||||
/// that are required for a pageserver to operate.
|
||||
///
|
||||
/// This enables running & testing pageservers without a full-blown
|
||||
/// deployment of the Neon cloud platform.
|
||||
///
|
||||
use anyhow::anyhow;
|
||||
use attachment_service::http::make_router;
|
||||
use attachment_service::persistence::Persistence;
|
||||
use attachment_service::service::{Config, Service};
|
||||
use camino::Utf8PathBuf;
|
||||
use clap::Parser;
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use std::sync::Arc;
|
||||
use tokio::signal::unix::SignalKind;
|
||||
use utils::auth::{JwtAuth, SwappableJwtAuth};
|
||||
use utils::logging::{self, LogFormat};
|
||||
|
||||
use utils::{project_build_tag, project_git_version, tcp_listener};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
project_build_tag!(BUILD_TAG);
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(author, version, about, long_about = None)]
|
||||
#[command(arg_required_else_help(true))]
|
||||
struct Cli {
|
||||
/// Host and port to listen on, like `127.0.0.1:1234`
|
||||
#[arg(short, long)]
|
||||
listen: std::net::SocketAddr,
|
||||
|
||||
/// Path to public key for JWT authentication of clients
|
||||
#[arg(long)]
|
||||
public_key: Option<camino::Utf8PathBuf>,
|
||||
|
||||
/// Token for authenticating this service with the pageservers it controls
|
||||
#[arg(short, long)]
|
||||
jwt_token: Option<String>,
|
||||
|
||||
/// Path to the .json file to store state (will be created if it doesn't exist)
|
||||
#[arg(short, long)]
|
||||
path: Option<Utf8PathBuf>,
|
||||
|
||||
/// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
|
||||
#[arg(long)]
|
||||
database_url: String,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let launch_ts = Box::leak(Box::new(LaunchTimestamp::generate()));
|
||||
|
||||
logging::init(
|
||||
LogFormat::Plain,
|
||||
logging::TracingErrorLayerEnablement::Disabled,
|
||||
logging::Output::Stdout,
|
||||
)?;
|
||||
|
||||
let args = Cli::parse();
|
||||
tracing::info!(
|
||||
"version: {}, launch_timestamp: {}, build_tag {}, state at {}, listening on {}",
|
||||
GIT_VERSION,
|
||||
launch_ts.to_string(),
|
||||
BUILD_TAG,
|
||||
args.path.as_ref().unwrap_or(&Utf8PathBuf::from("<none>")),
|
||||
args.listen
|
||||
);
|
||||
|
||||
let config = Config {
|
||||
jwt_token: args.jwt_token,
|
||||
};
|
||||
|
||||
let json_path = args.path;
|
||||
let persistence = Arc::new(Persistence::new(args.database_url, json_path.clone()));
|
||||
|
||||
let service = Service::spawn(config, persistence.clone()).await?;
|
||||
|
||||
let http_listener = tcp_listener::bind(args.listen)?;
|
||||
|
||||
let auth = if let Some(public_key_path) = &args.public_key {
|
||||
let jwt_auth = JwtAuth::from_key_path(public_key_path)?;
|
||||
Some(Arc::new(SwappableJwtAuth::new(jwt_auth)))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let router = make_router(service, auth)
|
||||
.build()
|
||||
.map_err(|err| anyhow!(err))?;
|
||||
let router_service = utils::http::RouterService::new(router).unwrap();
|
||||
let server = hyper::Server::from_tcp(http_listener)?.serve(router_service);
|
||||
|
||||
tracing::info!("Serving on {0}", args.listen);
|
||||
|
||||
tokio::task::spawn(server);
|
||||
|
||||
// Wait until we receive a signal
|
||||
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?;
|
||||
let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?;
|
||||
let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate())?;
|
||||
tokio::select! {
|
||||
_ = sigint.recv() => {},
|
||||
_ = sigterm.recv() => {},
|
||||
_ = sigquit.recv() => {},
|
||||
}
|
||||
tracing::info!("Terminating on signal");
|
||||
|
||||
if json_path.is_some() {
|
||||
// Write out a JSON dump on shutdown: this is used in compat tests to avoid passing
|
||||
// full postgres dumps around.
|
||||
if let Err(e) = persistence.write_tenants_json().await {
|
||||
tracing::error!("Failed to write JSON on shutdown: {e}")
|
||||
}
|
||||
}
|
||||
|
||||
std::process::exit(0);
|
||||
}
|
||||
@@ -1,50 +0,0 @@
|
||||
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
|
||||
use utils::id::NodeId;
|
||||
|
||||
use crate::persistence::NodePersistence;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct Node {
|
||||
pub(crate) id: NodeId,
|
||||
|
||||
pub(crate) availability: NodeAvailability,
|
||||
pub(crate) scheduling: NodeSchedulingPolicy,
|
||||
|
||||
pub(crate) listen_http_addr: String,
|
||||
pub(crate) listen_http_port: u16,
|
||||
|
||||
pub(crate) listen_pg_addr: String,
|
||||
pub(crate) listen_pg_port: u16,
|
||||
}
|
||||
|
||||
impl Node {
|
||||
pub(crate) fn base_url(&self) -> String {
|
||||
format!("http://{}:{}", self.listen_http_addr, self.listen_http_port)
|
||||
}
|
||||
|
||||
/// Is this node elegible to have work scheduled onto it?
|
||||
pub(crate) fn may_schedule(&self) -> bool {
|
||||
match self.availability {
|
||||
NodeAvailability::Active => {}
|
||||
NodeAvailability::Offline => return false,
|
||||
}
|
||||
|
||||
match self.scheduling {
|
||||
NodeSchedulingPolicy::Active => true,
|
||||
NodeSchedulingPolicy::Draining => false,
|
||||
NodeSchedulingPolicy::Filling => true,
|
||||
NodeSchedulingPolicy::Pause => false,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn to_persistent(&self) -> NodePersistence {
|
||||
NodePersistence {
|
||||
node_id: self.id.0 as i64,
|
||||
scheduling_policy: self.scheduling.into(),
|
||||
listen_http_addr: self.listen_http_addr.clone(),
|
||||
listen_http_port: self.listen_http_port as i32,
|
||||
listen_pg_addr: self.listen_pg_addr.clone(),
|
||||
listen_pg_port: self.listen_pg_port as i32,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,398 +0,0 @@
|
||||
use std::collections::HashMap;
|
||||
use std::str::FromStr;
|
||||
|
||||
use camino::Utf8Path;
|
||||
use camino::Utf8PathBuf;
|
||||
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
|
||||
use diesel::pg::PgConnection;
|
||||
use diesel::prelude::*;
|
||||
use diesel::Connection;
|
||||
use pageserver_api::models::TenantConfig;
|
||||
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
use crate::node::Node;
|
||||
use crate::PlacementPolicy;
|
||||
|
||||
/// ## What do we store?
|
||||
///
|
||||
/// The attachment service does not store most of its state durably.
|
||||
///
|
||||
/// The essential things to store durably are:
|
||||
/// - generation numbers, as these must always advance monotonically to ensure data safety.
|
||||
/// - Tenant's PlacementPolicy and TenantConfig, as the source of truth for these is something external.
|
||||
/// - Node's scheduling policies, as the source of truth for these is something external.
|
||||
///
|
||||
/// Other things we store durably as an implementation detail:
|
||||
/// - Node's host/port: this could be avoided it we made nodes emit a self-registering heartbeat,
|
||||
/// but it is operationally simpler to make this service the authority for which nodes
|
||||
/// it talks to.
|
||||
///
|
||||
/// ## Performance/efficiency
|
||||
///
|
||||
/// The attachment service does not go via the database for most things: there are
|
||||
/// a couple of places where we must, and where efficiency matters:
|
||||
/// - Incrementing generation numbers: the Reconciler has to wait for this to complete
|
||||
/// before it can attach a tenant, so this acts as a bound on how fast things like
|
||||
/// failover can happen.
|
||||
/// - Pageserver re-attach: we will increment many shards' generations when this happens,
|
||||
/// so it is important to avoid e.g. issuing O(N) queries.
|
||||
///
|
||||
/// Database calls relating to nodes have low performance requirements, as they are very rarely
|
||||
/// updated, and reads of nodes are always from memory, not the database. We only require that
|
||||
/// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline.
|
||||
pub struct Persistence {
|
||||
database_url: String,
|
||||
|
||||
// In test environments, we support loading+saving a JSON file. This is temporary, for the benefit of
|
||||
// test_compatibility.py, so that we don't have to commit to making the database contents fully backward/forward
|
||||
// compatible just yet.
|
||||
json_path: Option<Utf8PathBuf>,
|
||||
}
|
||||
|
||||
/// Legacy format, for use in JSON compat objects in test environment
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct JsonPersistence {
|
||||
tenants: HashMap<TenantShardId, TenantShardPersistence>,
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum DatabaseError {
|
||||
#[error(transparent)]
|
||||
Query(#[from] diesel::result::Error),
|
||||
#[error(transparent)]
|
||||
Connection(#[from] diesel::result::ConnectionError),
|
||||
#[error("Logical error: {0}")]
|
||||
Logical(String),
|
||||
}
|
||||
|
||||
pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
|
||||
|
||||
impl Persistence {
|
||||
pub fn new(database_url: String, json_path: Option<Utf8PathBuf>) -> Self {
|
||||
Self {
|
||||
database_url,
|
||||
json_path,
|
||||
}
|
||||
}
|
||||
|
||||
/// Call the provided function in a tokio blocking thread, with a Diesel database connection.
|
||||
async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
|
||||
where
|
||||
F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
|
||||
R: Send + 'static,
|
||||
{
|
||||
let database_url = self.database_url.clone();
|
||||
tokio::task::spawn_blocking(move || -> DatabaseResult<R> {
|
||||
// TODO: connection pooling, such as via diesel::r2d2
|
||||
let mut conn = PgConnection::establish(&database_url)?;
|
||||
func(&mut conn)
|
||||
})
|
||||
.await
|
||||
.expect("Task panic")
|
||||
}
|
||||
|
||||
/// When a node is first registered, persist it before using it for anything
|
||||
pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> {
|
||||
let np = node.to_persistent();
|
||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||
diesel::insert_into(crate::schema::nodes::table)
|
||||
.values(&np)
|
||||
.execute(conn)?;
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
/// At startup, populate the list of nodes which our shards may be placed on
|
||||
pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<Node>> {
|
||||
let nodes: Vec<Node> = self
|
||||
.with_conn(move |conn| -> DatabaseResult<_> {
|
||||
Ok(crate::schema::nodes::table
|
||||
.load::<NodePersistence>(conn)?
|
||||
.into_iter()
|
||||
.map(|n| Node {
|
||||
id: NodeId(n.node_id as u64),
|
||||
// At startup we consider a node offline until proven otherwise.
|
||||
availability: NodeAvailability::Offline,
|
||||
scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy)
|
||||
.expect("Bad scheduling policy in DB"),
|
||||
listen_http_addr: n.listen_http_addr,
|
||||
listen_http_port: n.listen_http_port as u16,
|
||||
listen_pg_addr: n.listen_pg_addr,
|
||||
listen_pg_port: n.listen_pg_port as u16,
|
||||
})
|
||||
.collect::<Vec<Node>>())
|
||||
})
|
||||
.await?;
|
||||
|
||||
tracing::info!("list_nodes: loaded {} nodes", nodes.len());
|
||||
|
||||
Ok(nodes)
|
||||
}
|
||||
|
||||
/// At startup, load the high level state for shards, such as their config + policy. This will
|
||||
/// be enriched at runtime with state discovered on pageservers.
|
||||
pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
|
||||
let loaded = self
|
||||
.with_conn(move |conn| -> DatabaseResult<_> {
|
||||
Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
|
||||
})
|
||||
.await?;
|
||||
|
||||
if loaded.is_empty() {
|
||||
if let Some(path) = &self.json_path {
|
||||
if tokio::fs::try_exists(path)
|
||||
.await
|
||||
.map_err(|e| DatabaseError::Logical(format!("Error stat'ing JSON file: {e}")))?
|
||||
{
|
||||
tracing::info!("Importing from legacy JSON format at {path}");
|
||||
return self.list_tenant_shards_json(path).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(loaded)
|
||||
}
|
||||
|
||||
/// Shim for automated compatibility tests: load tenants from a JSON file instead of database
|
||||
pub(crate) async fn list_tenant_shards_json(
|
||||
&self,
|
||||
path: &Utf8Path,
|
||||
) -> DatabaseResult<Vec<TenantShardPersistence>> {
|
||||
let bytes = tokio::fs::read(path)
|
||||
.await
|
||||
.map_err(|e| DatabaseError::Logical(format!("Failed to load JSON: {e}")))?;
|
||||
|
||||
let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
|
||||
.map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
|
||||
for (tenant_id, tenant) in &mut decoded.tenants {
|
||||
// Backward compat: an old attachments.json from before PR #6251, replace
|
||||
// empty strings with proper defaults.
|
||||
if tenant.tenant_id.is_empty() {
|
||||
tenant.tenant_id = tenant_id.to_string();
|
||||
tenant.config = serde_json::to_string(&TenantConfig::default())
|
||||
.map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
|
||||
tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())
|
||||
.map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
|
||||
}
|
||||
}
|
||||
|
||||
let tenants: Vec<TenantShardPersistence> = decoded.tenants.into_values().collect();
|
||||
|
||||
// Synchronize database with what is in the JSON file
|
||||
self.insert_tenant_shards(tenants.clone()).await?;
|
||||
|
||||
Ok(tenants)
|
||||
}
|
||||
|
||||
/// For use in testing environments, where we dump out JSON on shutdown.
|
||||
pub async fn write_tenants_json(&self) -> anyhow::Result<()> {
|
||||
let Some(path) = &self.json_path else {
|
||||
anyhow::bail!("Cannot write JSON if path isn't set (test environment bug)");
|
||||
};
|
||||
tracing::info!("Writing state to {path}...");
|
||||
let tenants = self.list_tenant_shards().await?;
|
||||
let mut tenants_map = HashMap::new();
|
||||
for tsp in tenants {
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
|
||||
shard_number: ShardNumber(tsp.shard_number as u8),
|
||||
shard_count: ShardCount(tsp.shard_count as u8),
|
||||
};
|
||||
|
||||
tenants_map.insert(tenant_shard_id, tsp);
|
||||
}
|
||||
let json = serde_json::to_string(&JsonPersistence {
|
||||
tenants: tenants_map,
|
||||
})?;
|
||||
|
||||
tokio::fs::write(path, &json).await?;
|
||||
tracing::info!("Wrote {} bytes to {path}...", json.len());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Tenants must be persisted before we schedule them for the first time. This enables us
|
||||
/// to correctly retain generation monotonicity, and the externally provided placement policy & config.
|
||||
pub(crate) async fn insert_tenant_shards(
|
||||
&self,
|
||||
shards: Vec<TenantShardPersistence>,
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||
conn.transaction(|conn| -> QueryResult<()> {
|
||||
for tenant in &shards {
|
||||
diesel::insert_into(tenant_shards)
|
||||
.values(tenant)
|
||||
.execute(conn)?;
|
||||
}
|
||||
Ok(())
|
||||
})?;
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
/// Ordering: call this _after_ deleting the tenant on pageservers, but _before_ dropping state for
|
||||
/// the tenant from memory on this server.
|
||||
#[allow(unused)]
|
||||
pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||
diesel::delete(tenant_shards)
|
||||
.filter(tenant_id.eq(del_tenant_id.to_string()))
|
||||
.execute(conn)?;
|
||||
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
/// When a tenant invokes the /re-attach API, this function is responsible for doing an efficient
|
||||
/// batched increment of the generations of all tenants whose generation_pageserver is equal to
|
||||
/// the node that called /re-attach.
|
||||
#[tracing::instrument(skip_all, fields(node_id))]
|
||||
pub(crate) async fn re_attach(
|
||||
&self,
|
||||
node_id: NodeId,
|
||||
) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
let updated = self
|
||||
.with_conn(move |conn| {
|
||||
let rows_updated = diesel::update(tenant_shards)
|
||||
.filter(generation_pageserver.eq(node_id.0 as i64))
|
||||
.set(generation.eq(generation + 1))
|
||||
.execute(conn)?;
|
||||
|
||||
tracing::info!("Incremented {} tenants' generations", rows_updated);
|
||||
|
||||
// TODO: UPDATE+SELECT in one query
|
||||
|
||||
let updated = tenant_shards
|
||||
.filter(generation_pageserver.eq(node_id.0 as i64))
|
||||
.select(TenantShardPersistence::as_select())
|
||||
.load(conn)?;
|
||||
Ok(updated)
|
||||
})
|
||||
.await?;
|
||||
|
||||
let mut result = HashMap::new();
|
||||
for tsp in updated {
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())
|
||||
.map_err(|e| DatabaseError::Logical(format!("Malformed tenant id: {e}")))?,
|
||||
shard_number: ShardNumber(tsp.shard_number as u8),
|
||||
shard_count: ShardCount(tsp.shard_count as u8),
|
||||
};
|
||||
result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Reconciler calls this immediately before attaching to a new pageserver, to acquire a unique, monotonically
|
||||
/// advancing generation number. We also store the NodeId for which the generation was issued, so that in
|
||||
/// [`Self::re_attach`] we can do a bulk UPDATE on the generations for that node.
|
||||
pub(crate) async fn increment_generation(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
node_id: NodeId,
|
||||
) -> anyhow::Result<Generation> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
let updated = self
|
||||
.with_conn(move |conn| {
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
|
||||
.set((
|
||||
generation.eq(generation + 1),
|
||||
generation_pageserver.eq(node_id.0 as i64),
|
||||
))
|
||||
// TODO: only returning() the generation column
|
||||
.returning(TenantShardPersistence::as_returning())
|
||||
.get_result(conn)?;
|
||||
|
||||
Ok(updated)
|
||||
})
|
||||
.await?;
|
||||
|
||||
Ok(Generation::new(updated.generation as u32))
|
||||
}
|
||||
|
||||
pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
self.with_conn(move |conn| {
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
|
||||
.set((
|
||||
generation_pageserver.eq(i64::MAX),
|
||||
placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
|
||||
))
|
||||
.execute(conn)?;
|
||||
|
||||
Ok(updated)
|
||||
})
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// TODO: when we start shard splitting, we must durably mark the tenant so that
|
||||
// on restart, we know that we must go through recovery (list shards that exist
|
||||
// and pick up where we left off and/or revert to parent shards).
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn begin_shard_split(&self, _tenant_id: TenantId) -> anyhow::Result<()> {
|
||||
todo!();
|
||||
}
|
||||
|
||||
// TODO: when we finish shard splitting, we must atomically clean up the old shards
|
||||
// and insert the new shards, and clear the splitting marker.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn complete_shard_split(&self, _tenant_id: TenantId) -> anyhow::Result<()> {
|
||||
todo!();
|
||||
}
|
||||
}
|
||||
|
||||
/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
|
||||
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone)]
|
||||
#[diesel(table_name = crate::schema::tenant_shards)]
|
||||
pub(crate) struct TenantShardPersistence {
|
||||
#[serde(default)]
|
||||
pub(crate) tenant_id: String,
|
||||
#[serde(default)]
|
||||
pub(crate) shard_number: i32,
|
||||
#[serde(default)]
|
||||
pub(crate) shard_count: i32,
|
||||
#[serde(default)]
|
||||
pub(crate) shard_stripe_size: i32,
|
||||
|
||||
// Latest generation number: next time we attach, increment this
|
||||
// and use the incremented number when attaching
|
||||
pub(crate) generation: i32,
|
||||
|
||||
// Currently attached pageserver
|
||||
#[serde(rename = "pageserver")]
|
||||
pub(crate) generation_pageserver: i64,
|
||||
|
||||
#[serde(default)]
|
||||
pub(crate) placement_policy: String,
|
||||
#[serde(default)]
|
||||
pub(crate) config: String,
|
||||
}
|
||||
|
||||
/// Parts of [`crate::node::Node`] that are stored durably
|
||||
#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable)]
|
||||
#[diesel(table_name = crate::schema::nodes)]
|
||||
pub(crate) struct NodePersistence {
|
||||
pub(crate) node_id: i64,
|
||||
pub(crate) scheduling_policy: String,
|
||||
pub(crate) listen_http_addr: String,
|
||||
pub(crate) listen_http_port: i32,
|
||||
pub(crate) listen_pg_addr: String,
|
||||
pub(crate) listen_pg_port: i32,
|
||||
}
|
||||
@@ -1,495 +0,0 @@
|
||||
use crate::persistence::Persistence;
|
||||
use crate::service;
|
||||
use control_plane::attachment_service::NodeAvailability;
|
||||
use pageserver_api::models::{
|
||||
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
|
||||
};
|
||||
use pageserver_api::shard::{ShardIdentity, TenantShardId};
|
||||
use pageserver_client::mgmt_api;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{NodeId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::compute_hook::ComputeHook;
|
||||
use crate::node::Node;
|
||||
use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation};
|
||||
|
||||
/// Object with the lifetime of the background reconcile task that is created
|
||||
/// for tenants which have a difference between their intent and observed states.
|
||||
pub(super) struct Reconciler {
|
||||
/// See [`crate::tenant_state::TenantState`] for the meanings of these fields: they are a snapshot
|
||||
/// of a tenant's state from when we spawned a reconcile task.
|
||||
pub(super) tenant_shard_id: TenantShardId,
|
||||
pub(crate) shard: ShardIdentity,
|
||||
pub(crate) generation: Generation,
|
||||
pub(crate) intent: IntentState,
|
||||
pub(crate) config: TenantConfig,
|
||||
pub(crate) observed: ObservedState,
|
||||
|
||||
pub(crate) service_config: service::Config,
|
||||
|
||||
/// A snapshot of the pageservers as they were when we were asked
|
||||
/// to reconcile.
|
||||
pub(crate) pageservers: Arc<HashMap<NodeId, Node>>,
|
||||
|
||||
/// A hook to notify the running postgres instances when we change the location
|
||||
/// of a tenant
|
||||
pub(crate) compute_hook: Arc<ComputeHook>,
|
||||
|
||||
/// A means to abort background reconciliation: it is essential to
|
||||
/// call this when something changes in the original TenantState that
|
||||
/// will make this reconciliation impossible or unnecessary, for
|
||||
/// example when a pageserver node goes offline, or the PlacementPolicy for
|
||||
/// the tenant is changed.
|
||||
pub(crate) cancel: CancellationToken,
|
||||
|
||||
/// Access to persistent storage for updating generation numbers
|
||||
pub(crate) persistence: Arc<Persistence>,
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum ReconcileError {
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
impl Reconciler {
|
||||
async fn location_config(
|
||||
&mut self,
|
||||
node_id: NodeId,
|
||||
config: LocationConfig,
|
||||
flush_ms: Option<Duration>,
|
||||
) -> anyhow::Result<()> {
|
||||
let node = self
|
||||
.pageservers
|
||||
.get(&node_id)
|
||||
.expect("Pageserver may not be removed while referenced");
|
||||
|
||||
self.observed
|
||||
.locations
|
||||
.insert(node.id, ObservedStateLocation { conf: None });
|
||||
|
||||
tracing::info!("location_config({}) calling: {:?}", node_id, config);
|
||||
let client =
|
||||
mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
|
||||
client
|
||||
.location_config(self.tenant_shard_id, config.clone(), flush_ms)
|
||||
.await?;
|
||||
tracing::info!("location_config({}) complete: {:?}", node_id, config);
|
||||
|
||||
self.observed
|
||||
.locations
|
||||
.insert(node.id, ObservedStateLocation { conf: Some(config) });
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn maybe_live_migrate(&mut self) -> Result<(), ReconcileError> {
|
||||
let destination = if let Some(node_id) = self.intent.attached {
|
||||
match self.observed.locations.get(&node_id) {
|
||||
Some(conf) => {
|
||||
// We will do a live migration only if the intended destination is not
|
||||
// currently in an attached state.
|
||||
match &conf.conf {
|
||||
Some(conf) if conf.mode == LocationConfigMode::Secondary => {
|
||||
// Fall through to do a live migration
|
||||
node_id
|
||||
}
|
||||
None | Some(_) => {
|
||||
// Attached or uncertain: don't do a live migration, proceed
|
||||
// with a general-case reconciliation
|
||||
tracing::info!("maybe_live_migrate: destination is None or attached");
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
None => {
|
||||
// Our destination is not attached: maybe live migrate if some other
|
||||
// node is currently attached. Fall through.
|
||||
node_id
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// No intent to be attached
|
||||
tracing::info!("maybe_live_migrate: no attached intent");
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
let mut origin = None;
|
||||
for (node_id, state) in &self.observed.locations {
|
||||
if let Some(observed_conf) = &state.conf {
|
||||
if observed_conf.mode == LocationConfigMode::AttachedSingle {
|
||||
let node = self
|
||||
.pageservers
|
||||
.get(node_id)
|
||||
.expect("Nodes may not be removed while referenced");
|
||||
// We will only attempt live migration if the origin is not offline: this
|
||||
// avoids trying to do it while reconciling after responding to an HA failover.
|
||||
if !matches!(node.availability, NodeAvailability::Offline) {
|
||||
origin = Some(*node_id);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let Some(origin) = origin else {
|
||||
tracing::info!("maybe_live_migrate: no origin found");
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
// We have an origin and a destination: proceed to do the live migration
|
||||
tracing::info!("Live migrating {}->{}", origin, destination);
|
||||
self.live_migrate(origin, destination).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn get_lsns(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
node_id: &NodeId,
|
||||
) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
|
||||
let node = self
|
||||
.pageservers
|
||||
.get(node_id)
|
||||
.expect("Pageserver may not be removed while referenced");
|
||||
|
||||
let client =
|
||||
mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
|
||||
|
||||
let timelines = client.timeline_list(&tenant_shard_id).await?;
|
||||
Ok(timelines
|
||||
.into_iter()
|
||||
.map(|t| (t.timeline_id, t.last_record_lsn))
|
||||
.collect())
|
||||
}
|
||||
|
||||
async fn secondary_download(&self, tenant_shard_id: TenantShardId, node_id: &NodeId) {
|
||||
let node = self
|
||||
.pageservers
|
||||
.get(node_id)
|
||||
.expect("Pageserver may not be removed while referenced");
|
||||
|
||||
let client =
|
||||
mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
|
||||
|
||||
match client.tenant_secondary_download(tenant_shard_id).await {
|
||||
Ok(()) => {}
|
||||
Err(_) => {
|
||||
tracing::info!(" (skipping, destination wasn't in secondary mode)")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn await_lsn(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
pageserver_id: &NodeId,
|
||||
baseline: HashMap<TimelineId, Lsn>,
|
||||
) -> anyhow::Result<()> {
|
||||
loop {
|
||||
let latest = match self.get_lsns(tenant_shard_id, pageserver_id).await {
|
||||
Ok(l) => l,
|
||||
Err(e) => {
|
||||
println!(
|
||||
"🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
|
||||
pageserver_id
|
||||
);
|
||||
std::thread::sleep(Duration::from_millis(500));
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let mut any_behind: bool = false;
|
||||
for (timeline_id, baseline_lsn) in &baseline {
|
||||
match latest.get(timeline_id) {
|
||||
Some(latest_lsn) => {
|
||||
println!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}");
|
||||
if latest_lsn < baseline_lsn {
|
||||
any_behind = true;
|
||||
}
|
||||
}
|
||||
None => {
|
||||
// Expected timeline isn't yet visible on migration destination.
|
||||
// (IRL we would have to account for timeline deletion, but this
|
||||
// is just test helper)
|
||||
any_behind = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !any_behind {
|
||||
println!("✅ LSN caught up. Proceeding...");
|
||||
break;
|
||||
} else {
|
||||
std::thread::sleep(Duration::from_millis(500));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn live_migrate(
|
||||
&mut self,
|
||||
origin_ps_id: NodeId,
|
||||
dest_ps_id: NodeId,
|
||||
) -> anyhow::Result<()> {
|
||||
// `maybe_live_migrate` is responsibble for sanity of inputs
|
||||
assert!(origin_ps_id != dest_ps_id);
|
||||
|
||||
fn build_location_config(
|
||||
shard: &ShardIdentity,
|
||||
config: &TenantConfig,
|
||||
mode: LocationConfigMode,
|
||||
generation: Option<Generation>,
|
||||
secondary_conf: Option<LocationConfigSecondary>,
|
||||
) -> LocationConfig {
|
||||
LocationConfig {
|
||||
mode,
|
||||
generation: generation.map(|g| g.into().unwrap()),
|
||||
secondary_conf,
|
||||
tenant_conf: config.clone(),
|
||||
shard_number: shard.number.0,
|
||||
shard_count: shard.count.0,
|
||||
shard_stripe_size: shard.stripe_size.0,
|
||||
}
|
||||
}
|
||||
|
||||
tracing::info!(
|
||||
"🔁 Switching origin pageserver {} to stale mode",
|
||||
origin_ps_id
|
||||
);
|
||||
|
||||
// FIXME: it is incorrect to use self.generation here, we should use the generation
|
||||
// from the ObservedState of the origin pageserver (it might be older than self.generation)
|
||||
let stale_conf = build_location_config(
|
||||
&self.shard,
|
||||
&self.config,
|
||||
LocationConfigMode::AttachedStale,
|
||||
Some(self.generation),
|
||||
None,
|
||||
);
|
||||
self.location_config(origin_ps_id, stale_conf, Some(Duration::from_secs(10)))
|
||||
.await?;
|
||||
|
||||
let baseline_lsns = Some(self.get_lsns(self.tenant_shard_id, &origin_ps_id).await?);
|
||||
|
||||
// If we are migrating to a destination that has a secondary location, warm it up first
|
||||
if let Some(destination_conf) = self.observed.locations.get(&dest_ps_id) {
|
||||
if let Some(destination_conf) = &destination_conf.conf {
|
||||
if destination_conf.mode == LocationConfigMode::Secondary {
|
||||
tracing::info!(
|
||||
"🔁 Downloading latest layers to destination pageserver {}",
|
||||
dest_ps_id,
|
||||
);
|
||||
self.secondary_download(self.tenant_shard_id, &dest_ps_id)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Increment generation before attaching to new pageserver
|
||||
self.generation = self
|
||||
.persistence
|
||||
.increment_generation(self.tenant_shard_id, dest_ps_id)
|
||||
.await?;
|
||||
|
||||
let dest_conf = build_location_config(
|
||||
&self.shard,
|
||||
&self.config,
|
||||
LocationConfigMode::AttachedMulti,
|
||||
Some(self.generation),
|
||||
None,
|
||||
);
|
||||
|
||||
tracing::info!("🔁 Attaching to pageserver {}", dest_ps_id);
|
||||
self.location_config(dest_ps_id, dest_conf, None).await?;
|
||||
|
||||
if let Some(baseline) = baseline_lsns {
|
||||
tracing::info!("🕑 Waiting for LSN to catch up...");
|
||||
self.await_lsn(self.tenant_shard_id, &dest_ps_id, baseline)
|
||||
.await?;
|
||||
}
|
||||
|
||||
tracing::info!("🔁 Notifying compute to use pageserver {}", dest_ps_id);
|
||||
self.compute_hook
|
||||
.notify(self.tenant_shard_id, dest_ps_id)
|
||||
.await?;
|
||||
|
||||
// Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Single, then
|
||||
// this location will be deleted in the general case reconciliation that runs after this.
|
||||
let origin_secondary_conf = build_location_config(
|
||||
&self.shard,
|
||||
&self.config,
|
||||
LocationConfigMode::Secondary,
|
||||
None,
|
||||
Some(LocationConfigSecondary { warm: true }),
|
||||
);
|
||||
self.location_config(origin_ps_id, origin_secondary_conf.clone(), None)
|
||||
.await?;
|
||||
// TODO: we should also be setting the ObservedState on earlier API calls, in case we fail
|
||||
// partway through. In fact, all location conf API calls should be in a wrapper that sets
|
||||
// the observed state to None, then runs, then sets it to what we wrote.
|
||||
self.observed.locations.insert(
|
||||
origin_ps_id,
|
||||
ObservedStateLocation {
|
||||
conf: Some(origin_secondary_conf),
|
||||
},
|
||||
);
|
||||
|
||||
println!(
|
||||
"🔁 Switching to AttachedSingle mode on pageserver {}",
|
||||
dest_ps_id
|
||||
);
|
||||
let dest_final_conf = build_location_config(
|
||||
&self.shard,
|
||||
&self.config,
|
||||
LocationConfigMode::AttachedSingle,
|
||||
Some(self.generation),
|
||||
None,
|
||||
);
|
||||
self.location_config(dest_ps_id, dest_final_conf.clone(), None)
|
||||
.await?;
|
||||
self.observed.locations.insert(
|
||||
dest_ps_id,
|
||||
ObservedStateLocation {
|
||||
conf: Some(dest_final_conf),
|
||||
},
|
||||
);
|
||||
|
||||
println!("✅ Migration complete");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Reconciling a tenant makes API calls to pageservers until the observed state
|
||||
/// matches the intended state.
|
||||
///
|
||||
/// First we apply special case handling (e.g. for live migrations), and then a
|
||||
/// general case reconciliation where we walk through the intent by pageserver
|
||||
/// and call out to the pageserver to apply the desired state.
|
||||
pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> {
|
||||
// TODO: if any of self.observed is None, call to remote pageservers
|
||||
// to learn correct state.
|
||||
|
||||
// Special case: live migration
|
||||
self.maybe_live_migrate().await?;
|
||||
|
||||
// If the attached pageserver is not attached, do so now.
|
||||
if let Some(node_id) = self.intent.attached {
|
||||
let mut wanted_conf =
|
||||
attached_location_conf(self.generation, &self.shard, &self.config);
|
||||
match self.observed.locations.get(&node_id) {
|
||||
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
|
||||
// Nothing to do
|
||||
tracing::info!("Observed configuration already correct.")
|
||||
}
|
||||
_ => {
|
||||
// In all cases other than a matching observed configuration, we will
|
||||
// reconcile this location. This includes locations with different configurations, as well
|
||||
// as locations with unknown (None) observed state.
|
||||
self.generation = self
|
||||
.persistence
|
||||
.increment_generation(self.tenant_shard_id, node_id)
|
||||
.await?;
|
||||
wanted_conf.generation = self.generation.into();
|
||||
tracing::info!("Observed configuration requires update.");
|
||||
self.location_config(node_id, wanted_conf, None).await?;
|
||||
if let Err(e) = self
|
||||
.compute_hook
|
||||
.notify(self.tenant_shard_id, node_id)
|
||||
.await
|
||||
{
|
||||
tracing::warn!(
|
||||
"Failed to notify compute of newly attached pageserver {node_id}: {e}"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Configure secondary locations: if these were previously attached this
|
||||
// implicitly downgrades them from attached to secondary.
|
||||
let mut changes = Vec::new();
|
||||
for node_id in &self.intent.secondary {
|
||||
let wanted_conf = secondary_location_conf(&self.shard, &self.config);
|
||||
match self.observed.locations.get(node_id) {
|
||||
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
|
||||
// Nothing to do
|
||||
tracing::info!(%node_id, "Observed configuration already correct.")
|
||||
}
|
||||
_ => {
|
||||
// In all cases other than a matching observed configuration, we will
|
||||
// reconcile this location.
|
||||
tracing::info!(%node_id, "Observed configuration requires update.");
|
||||
changes.push((*node_id, wanted_conf))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Detach any extraneous pageservers that are no longer referenced
|
||||
// by our intent.
|
||||
let all_pageservers = self.intent.all_pageservers();
|
||||
for node_id in self.observed.locations.keys() {
|
||||
if all_pageservers.contains(node_id) {
|
||||
// We are only detaching pageservers that aren't used at all.
|
||||
continue;
|
||||
}
|
||||
|
||||
changes.push((
|
||||
*node_id,
|
||||
LocationConfig {
|
||||
mode: LocationConfigMode::Detached,
|
||||
generation: None,
|
||||
secondary_conf: None,
|
||||
shard_number: self.shard.number.0,
|
||||
shard_count: self.shard.count.0,
|
||||
shard_stripe_size: self.shard.stripe_size.0,
|
||||
tenant_conf: self.config.clone(),
|
||||
},
|
||||
));
|
||||
}
|
||||
|
||||
for (node_id, conf) in changes {
|
||||
self.location_config(node_id, conf, None).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn attached_location_conf(
|
||||
generation: Generation,
|
||||
shard: &ShardIdentity,
|
||||
config: &TenantConfig,
|
||||
) -> LocationConfig {
|
||||
LocationConfig {
|
||||
mode: LocationConfigMode::AttachedSingle,
|
||||
generation: generation.into(),
|
||||
secondary_conf: None,
|
||||
shard_number: shard.number.0,
|
||||
shard_count: shard.count.0,
|
||||
shard_stripe_size: shard.stripe_size.0,
|
||||
tenant_conf: config.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn secondary_location_conf(
|
||||
shard: &ShardIdentity,
|
||||
config: &TenantConfig,
|
||||
) -> LocationConfig {
|
||||
LocationConfig {
|
||||
mode: LocationConfigMode::Secondary,
|
||||
generation: None,
|
||||
secondary_conf: Some(LocationConfigSecondary { warm: true }),
|
||||
shard_number: shard.number.0,
|
||||
shard_count: shard.count.0,
|
||||
shard_stripe_size: shard.stripe_size.0,
|
||||
tenant_conf: config.clone(),
|
||||
}
|
||||
}
|
||||
@@ -1,89 +0,0 @@
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use utils::{http::error::ApiError, id::NodeId};
|
||||
|
||||
use crate::{node::Node, tenant_state::TenantState};
|
||||
|
||||
/// Scenarios in which we cannot find a suitable location for a tenant shard
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum ScheduleError {
|
||||
#[error("No pageservers found")]
|
||||
NoPageservers,
|
||||
#[error("No pageserver found matching constraint")]
|
||||
ImpossibleConstraint,
|
||||
}
|
||||
|
||||
impl From<ScheduleError> for ApiError {
|
||||
fn from(value: ScheduleError) -> Self {
|
||||
ApiError::Conflict(format!("Scheduling error: {}", value))
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct Scheduler {
|
||||
tenant_counts: HashMap<NodeId, usize>,
|
||||
}
|
||||
|
||||
impl Scheduler {
|
||||
pub(crate) fn new(
|
||||
tenants: &BTreeMap<TenantShardId, TenantState>,
|
||||
nodes: &HashMap<NodeId, Node>,
|
||||
) -> Self {
|
||||
let mut tenant_counts = HashMap::new();
|
||||
for node_id in nodes.keys() {
|
||||
tenant_counts.insert(*node_id, 0);
|
||||
}
|
||||
|
||||
for tenant in tenants.values() {
|
||||
if let Some(ps) = tenant.intent.attached {
|
||||
let entry = tenant_counts.entry(ps).or_insert(0);
|
||||
*entry += 1;
|
||||
}
|
||||
}
|
||||
|
||||
for (node_id, node) in nodes {
|
||||
if !node.may_schedule() {
|
||||
tenant_counts.remove(node_id);
|
||||
}
|
||||
}
|
||||
|
||||
Self { tenant_counts }
|
||||
}
|
||||
|
||||
pub(crate) fn schedule_shard(
|
||||
&mut self,
|
||||
hard_exclude: &[NodeId],
|
||||
) -> Result<NodeId, ScheduleError> {
|
||||
if self.tenant_counts.is_empty() {
|
||||
return Err(ScheduleError::NoPageservers);
|
||||
}
|
||||
|
||||
let mut tenant_counts: Vec<(NodeId, usize)> = self
|
||||
.tenant_counts
|
||||
.iter()
|
||||
.filter_map(|(k, v)| {
|
||||
if hard_exclude.contains(k) {
|
||||
None
|
||||
} else {
|
||||
Some((*k, *v))
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Sort by tenant count. Nodes with the same tenant count are sorted by ID.
|
||||
tenant_counts.sort_by_key(|i| (i.1, i.0));
|
||||
|
||||
if tenant_counts.is_empty() {
|
||||
// After applying constraints, no pageservers were left
|
||||
return Err(ScheduleError::ImpossibleConstraint);
|
||||
}
|
||||
|
||||
for (node_id, count) in &tenant_counts {
|
||||
tracing::info!("tenant_counts[{node_id}]={count}");
|
||||
}
|
||||
|
||||
let node_id = tenant_counts.first().unwrap().0;
|
||||
tracing::info!("scheduler selected node {node_id}");
|
||||
*self.tenant_counts.get_mut(&node_id).unwrap() += 1;
|
||||
Ok(node_id)
|
||||
}
|
||||
}
|
||||
@@ -1,27 +0,0 @@
|
||||
// @generated automatically by Diesel CLI.
|
||||
|
||||
diesel::table! {
|
||||
nodes (node_id) {
|
||||
node_id -> Int8,
|
||||
scheduling_policy -> Varchar,
|
||||
listen_http_addr -> Varchar,
|
||||
listen_http_port -> Int4,
|
||||
listen_pg_addr -> Varchar,
|
||||
listen_pg_port -> Int4,
|
||||
}
|
||||
}
|
||||
|
||||
diesel::table! {
|
||||
tenant_shards (tenant_id, shard_number, shard_count) {
|
||||
tenant_id -> Varchar,
|
||||
shard_number -> Int4,
|
||||
shard_count -> Int4,
|
||||
shard_stripe_size -> Int4,
|
||||
generation -> Int4,
|
||||
generation_pageserver -> Int8,
|
||||
placement_policy -> Varchar,
|
||||
config -> Text,
|
||||
}
|
||||
}
|
||||
|
||||
diesel::allow_tables_to_appear_in_same_query!(nodes, tenant_shards,);
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,467 +0,0 @@
|
||||
use std::{collections::HashMap, sync::Arc, time::Duration};
|
||||
|
||||
use control_plane::attachment_service::NodeAvailability;
|
||||
use pageserver_api::{
|
||||
models::{LocationConfig, LocationConfigMode, TenantConfig},
|
||||
shard::{ShardIdentity, TenantShardId},
|
||||
};
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{
|
||||
generation::Generation,
|
||||
id::NodeId,
|
||||
seqwait::{SeqWait, SeqWaitError},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
compute_hook::ComputeHook,
|
||||
node::Node,
|
||||
persistence::Persistence,
|
||||
reconciler::{attached_location_conf, secondary_location_conf, ReconcileError, Reconciler},
|
||||
scheduler::{ScheduleError, Scheduler},
|
||||
service, PlacementPolicy, Sequence,
|
||||
};
|
||||
|
||||
pub(crate) struct TenantState {
|
||||
pub(crate) tenant_shard_id: TenantShardId,
|
||||
|
||||
pub(crate) shard: ShardIdentity,
|
||||
|
||||
// Runtime only: sequence used to coordinate when updating this object while
|
||||
// with background reconcilers may be running. A reconciler runs to a particular
|
||||
// sequence.
|
||||
pub(crate) sequence: Sequence,
|
||||
|
||||
// Latest generation number: next time we attach, increment this
|
||||
// and use the incremented number when attaching
|
||||
pub(crate) generation: Generation,
|
||||
|
||||
// High level description of how the tenant should be set up. Provided
|
||||
// externally.
|
||||
pub(crate) policy: PlacementPolicy,
|
||||
|
||||
// Low level description of exactly which pageservers should fulfil
|
||||
// which role. Generated by `Self::schedule`.
|
||||
pub(crate) intent: IntentState,
|
||||
|
||||
// Low level description of how the tenant is configured on pageservers:
|
||||
// if this does not match `Self::intent` then the tenant needs reconciliation
|
||||
// with `Self::reconcile`.
|
||||
pub(crate) observed: ObservedState,
|
||||
|
||||
// Tenant configuration, passed through opaquely to the pageserver. Identical
|
||||
// for all shards in a tenant.
|
||||
pub(crate) config: TenantConfig,
|
||||
|
||||
/// If a reconcile task is currently in flight, it may be joined here (it is
|
||||
/// only safe to join if either the result has been received or the reconciler's
|
||||
/// cancellation token has been fired)
|
||||
pub(crate) reconciler: Option<ReconcilerHandle>,
|
||||
|
||||
/// Optionally wait for reconciliation to complete up to a particular
|
||||
/// sequence number.
|
||||
pub(crate) waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
|
||||
|
||||
/// Indicates sequence number for which we have encountered an error reconciling. If
|
||||
/// this advances ahead of [`Self::waiter`] then a reconciliation error has occurred,
|
||||
/// and callers should stop waiting for `waiter` and propagate the error.
|
||||
pub(crate) error_waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
|
||||
|
||||
/// The most recent error from a reconcile on this tenant
|
||||
/// TODO: generalize to an array of recent events
|
||||
/// TOOD: use a ArcSwap instead of mutex for faster reads?
|
||||
pub(crate) last_error: std::sync::Arc<std::sync::Mutex<String>>,
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Debug)]
|
||||
pub(crate) struct IntentState {
|
||||
pub(crate) attached: Option<NodeId>,
|
||||
pub(crate) secondary: Vec<NodeId>,
|
||||
}
|
||||
|
||||
#[derive(Default, Clone)]
|
||||
pub(crate) struct ObservedState {
|
||||
pub(crate) locations: HashMap<NodeId, ObservedStateLocation>,
|
||||
}
|
||||
|
||||
/// Our latest knowledge of how this tenant is configured in the outside world.
|
||||
///
|
||||
/// Meaning:
|
||||
/// * No instance of this type exists for a node: we are certain that we have nothing configured on that
|
||||
/// node for this shard.
|
||||
/// * Instance exists with conf==None: we *might* have some state on that node, but we don't know
|
||||
/// what it is (e.g. we failed partway through configuring it)
|
||||
/// * Instance exists with conf==Some: this tells us what we last successfully configured on this node,
|
||||
/// and that configuration will still be present unless something external interfered.
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct ObservedStateLocation {
|
||||
/// If None, it means we do not know the status of this shard's location on this node, but
|
||||
/// we know that we might have some state on this node.
|
||||
pub(crate) conf: Option<LocationConfig>,
|
||||
}
|
||||
pub(crate) struct ReconcilerWaiter {
|
||||
// For observability purposes, remember the ID of the shard we're
|
||||
// waiting for.
|
||||
pub(crate) tenant_shard_id: TenantShardId,
|
||||
|
||||
seq_wait: std::sync::Arc<SeqWait<Sequence, Sequence>>,
|
||||
error_seq_wait: std::sync::Arc<SeqWait<Sequence, Sequence>>,
|
||||
error: std::sync::Arc<std::sync::Mutex<String>>,
|
||||
seq: Sequence,
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum ReconcileWaitError {
|
||||
#[error("Timeout waiting for shard {0}")]
|
||||
Timeout(TenantShardId),
|
||||
#[error("shutting down")]
|
||||
Shutdown,
|
||||
#[error("Reconcile error on shard {0}: {1}")]
|
||||
Failed(TenantShardId, String),
|
||||
}
|
||||
|
||||
impl ReconcilerWaiter {
|
||||
pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> {
|
||||
tokio::select! {
|
||||
result = self.seq_wait.wait_for_timeout(self.seq, timeout)=> {
|
||||
result.map_err(|e| match e {
|
||||
SeqWaitError::Timeout => ReconcileWaitError::Timeout(self.tenant_shard_id),
|
||||
SeqWaitError::Shutdown => ReconcileWaitError::Shutdown
|
||||
})?;
|
||||
},
|
||||
result = self.error_seq_wait.wait_for(self.seq) => {
|
||||
result.map_err(|e| match e {
|
||||
SeqWaitError::Shutdown => ReconcileWaitError::Shutdown,
|
||||
SeqWaitError::Timeout => unreachable!()
|
||||
})?;
|
||||
|
||||
return Err(ReconcileWaitError::Failed(self.tenant_shard_id, self.error.lock().unwrap().clone()))
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Having spawned a reconciler task, the tenant shard's state will carry enough
|
||||
/// information to optionally cancel & await it later.
|
||||
pub(crate) struct ReconcilerHandle {
|
||||
sequence: Sequence,
|
||||
handle: JoinHandle<()>,
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
/// When a reconcile task completes, it sends this result object
|
||||
/// to be applied to the primary TenantState.
|
||||
pub(crate) struct ReconcileResult {
|
||||
pub(crate) sequence: Sequence,
|
||||
/// On errors, `observed` should be treated as an incompleted description
|
||||
/// of state (i.e. any nodes present in the result should override nodes
|
||||
/// present in the parent tenant state, but any unmentioned nodes should
|
||||
/// not be removed from parent tenant state)
|
||||
pub(crate) result: Result<(), ReconcileError>,
|
||||
|
||||
pub(crate) tenant_shard_id: TenantShardId,
|
||||
pub(crate) generation: Generation,
|
||||
pub(crate) observed: ObservedState,
|
||||
}
|
||||
|
||||
impl IntentState {
|
||||
pub(crate) fn new() -> Self {
|
||||
Self {
|
||||
attached: None,
|
||||
secondary: vec![],
|
||||
}
|
||||
}
|
||||
pub(crate) fn all_pageservers(&self) -> Vec<NodeId> {
|
||||
let mut result = Vec::new();
|
||||
if let Some(p) = self.attached {
|
||||
result.push(p)
|
||||
}
|
||||
|
||||
result.extend(self.secondary.iter().copied());
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
/// When a node goes offline, we update intents to avoid using it
|
||||
/// as their attached pageserver.
|
||||
///
|
||||
/// Returns true if a change was made
|
||||
pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
|
||||
if self.attached == Some(node_id) {
|
||||
self.attached = None;
|
||||
self.secondary.push(node_id);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ObservedState {
|
||||
pub(crate) fn new() -> Self {
|
||||
Self {
|
||||
locations: HashMap::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TenantState {
|
||||
pub(crate) fn new(
|
||||
tenant_shard_id: TenantShardId,
|
||||
shard: ShardIdentity,
|
||||
policy: PlacementPolicy,
|
||||
) -> Self {
|
||||
Self {
|
||||
tenant_shard_id,
|
||||
policy,
|
||||
intent: IntentState::default(),
|
||||
generation: Generation::new(0),
|
||||
shard,
|
||||
observed: ObservedState::default(),
|
||||
config: TenantConfig::default(),
|
||||
reconciler: None,
|
||||
sequence: Sequence(1),
|
||||
waiter: Arc::new(SeqWait::new(Sequence(0))),
|
||||
error_waiter: Arc::new(SeqWait::new(Sequence(0))),
|
||||
last_error: Arc::default(),
|
||||
}
|
||||
}
|
||||
|
||||
/// For use on startup when learning state from pageservers: generate my [`IntentState`] from my
|
||||
/// [`ObservedState`], even if it violates my [`PlacementPolicy`]. Call [`Self::schedule`] next,
|
||||
/// to get an intent state that complies with placement policy. The overall goal is to do scheduling
|
||||
/// in a way that makes use of any configured locations that already exist in the outside world.
|
||||
pub(crate) fn intent_from_observed(&mut self) {
|
||||
// Choose an attached location by filtering observed locations, and then sorting to get the highest
|
||||
// generation
|
||||
let mut attached_locs = self
|
||||
.observed
|
||||
.locations
|
||||
.iter()
|
||||
.filter_map(|(node_id, l)| {
|
||||
if let Some(conf) = &l.conf {
|
||||
if conf.mode == LocationConfigMode::AttachedMulti
|
||||
|| conf.mode == LocationConfigMode::AttachedSingle
|
||||
|| conf.mode == LocationConfigMode::AttachedStale
|
||||
{
|
||||
Some((node_id, conf.generation))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
attached_locs.sort_by_key(|i| i.1);
|
||||
if let Some((node_id, _gen)) = attached_locs.into_iter().last() {
|
||||
self.intent.attached = Some(*node_id);
|
||||
}
|
||||
|
||||
// All remaining observed locations generate secondary intents. This includes None
|
||||
// observations, as these may well have some local content on disk that is usable (this
|
||||
// is an edge case that might occur if we restarted during a migration or other change)
|
||||
self.observed.locations.keys().for_each(|node_id| {
|
||||
if Some(*node_id) != self.intent.attached {
|
||||
self.intent.secondary.push(*node_id);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> {
|
||||
// TODO: before scheduling new nodes, check if any existing content in
|
||||
// self.intent refers to pageservers that are offline, and pick other
|
||||
// pageservers if so.
|
||||
|
||||
// Build the set of pageservers already in use by this tenant, to avoid scheduling
|
||||
// more work on the same pageservers we're already using.
|
||||
let mut used_pageservers = self.intent.all_pageservers();
|
||||
let mut modified = false;
|
||||
|
||||
use PlacementPolicy::*;
|
||||
match self.policy {
|
||||
Single => {
|
||||
// Should have exactly one attached, and zero secondaries
|
||||
if self.intent.attached.is_none() {
|
||||
let node_id = scheduler.schedule_shard(&used_pageservers)?;
|
||||
self.intent.attached = Some(node_id);
|
||||
used_pageservers.push(node_id);
|
||||
modified = true;
|
||||
}
|
||||
if !self.intent.secondary.is_empty() {
|
||||
self.intent.secondary.clear();
|
||||
modified = true;
|
||||
}
|
||||
}
|
||||
Double(secondary_count) => {
|
||||
// Should have exactly one attached, and N secondaries
|
||||
if self.intent.attached.is_none() {
|
||||
let node_id = scheduler.schedule_shard(&used_pageservers)?;
|
||||
self.intent.attached = Some(node_id);
|
||||
used_pageservers.push(node_id);
|
||||
modified = true;
|
||||
}
|
||||
|
||||
while self.intent.secondary.len() < secondary_count {
|
||||
let node_id = scheduler.schedule_shard(&used_pageservers)?;
|
||||
self.intent.secondary.push(node_id);
|
||||
used_pageservers.push(node_id);
|
||||
modified = true;
|
||||
}
|
||||
}
|
||||
Detached => {
|
||||
// Should have no attached or secondary pageservers
|
||||
if self.intent.attached.is_some() {
|
||||
self.intent.attached = None;
|
||||
modified = true;
|
||||
}
|
||||
|
||||
if !self.intent.secondary.is_empty() {
|
||||
self.intent.secondary.clear();
|
||||
modified = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if modified {
|
||||
self.sequence.0 += 1;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn dirty(&self) -> bool {
|
||||
if let Some(node_id) = self.intent.attached {
|
||||
let wanted_conf = attached_location_conf(self.generation, &self.shard, &self.config);
|
||||
match self.observed.locations.get(&node_id) {
|
||||
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
|
||||
Some(_) | None => {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for node_id in &self.intent.secondary {
|
||||
let wanted_conf = secondary_location_conf(&self.shard, &self.config);
|
||||
match self.observed.locations.get(node_id) {
|
||||
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
|
||||
Some(_) | None => {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
pub(crate) fn maybe_reconcile(
|
||||
&mut self,
|
||||
result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
|
||||
pageservers: &Arc<HashMap<NodeId, Node>>,
|
||||
compute_hook: &Arc<ComputeHook>,
|
||||
service_config: &service::Config,
|
||||
persistence: &Arc<Persistence>,
|
||||
) -> Option<ReconcilerWaiter> {
|
||||
// If there are any ambiguous observed states, and the nodes they refer to are available,
|
||||
// we should reconcile to clean them up.
|
||||
let mut dirty_observed = false;
|
||||
for (node_id, observed_loc) in &self.observed.locations {
|
||||
let node = pageservers
|
||||
.get(node_id)
|
||||
.expect("Nodes may not be removed while referenced");
|
||||
if observed_loc.conf.is_none()
|
||||
&& !matches!(node.availability, NodeAvailability::Offline)
|
||||
{
|
||||
dirty_observed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if !self.dirty() && !dirty_observed {
|
||||
tracing::info!("Not dirty, no reconciliation needed.");
|
||||
return None;
|
||||
}
|
||||
|
||||
// Reconcile already in flight for the current sequence?
|
||||
if let Some(handle) = &self.reconciler {
|
||||
if handle.sequence == self.sequence {
|
||||
return Some(ReconcilerWaiter {
|
||||
tenant_shard_id: self.tenant_shard_id,
|
||||
seq_wait: self.waiter.clone(),
|
||||
error_seq_wait: self.error_waiter.clone(),
|
||||
error: self.last_error.clone(),
|
||||
seq: self.sequence,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Reconcile in flight for a stale sequence? Our sequence's task will wait for it before
|
||||
// doing our sequence's work.
|
||||
let old_handle = self.reconciler.take();
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
let mut reconciler = Reconciler {
|
||||
tenant_shard_id: self.tenant_shard_id,
|
||||
shard: self.shard,
|
||||
generation: self.generation,
|
||||
intent: self.intent.clone(),
|
||||
config: self.config.clone(),
|
||||
observed: self.observed.clone(),
|
||||
pageservers: pageservers.clone(),
|
||||
compute_hook: compute_hook.clone(),
|
||||
service_config: service_config.clone(),
|
||||
cancel: cancel.clone(),
|
||||
persistence: persistence.clone(),
|
||||
};
|
||||
|
||||
let reconcile_seq = self.sequence;
|
||||
|
||||
tracing::info!("Spawning Reconciler for sequence {}", self.sequence);
|
||||
let join_handle = tokio::task::spawn(async move {
|
||||
// Wait for any previous reconcile task to complete before we start
|
||||
if let Some(old_handle) = old_handle {
|
||||
old_handle.cancel.cancel();
|
||||
if let Err(e) = old_handle.handle.await {
|
||||
// We can't do much with this other than log it: the task is done, so
|
||||
// we may proceed with our work.
|
||||
tracing::error!("Unexpected join error waiting for reconcile task: {e}");
|
||||
}
|
||||
}
|
||||
|
||||
// Early check for cancellation before doing any work
|
||||
// TODO: wrap all remote API operations in cancellation check
|
||||
// as well.
|
||||
if reconciler.cancel.is_cancelled() {
|
||||
return;
|
||||
}
|
||||
|
||||
let result = reconciler.reconcile().await;
|
||||
result_tx
|
||||
.send(ReconcileResult {
|
||||
sequence: reconcile_seq,
|
||||
result,
|
||||
tenant_shard_id: reconciler.tenant_shard_id,
|
||||
generation: reconciler.generation,
|
||||
observed: reconciler.observed,
|
||||
})
|
||||
.ok();
|
||||
});
|
||||
|
||||
self.reconciler = Some(ReconcilerHandle {
|
||||
sequence: self.sequence,
|
||||
handle: join_handle,
|
||||
cancel,
|
||||
});
|
||||
|
||||
Some(ReconcilerWaiter {
|
||||
tenant_shard_id: self.tenant_shard_id,
|
||||
seq_wait: self.waiter.clone(),
|
||||
error_seq_wait: self.error_waiter.clone(),
|
||||
error: self.last_error.clone(),
|
||||
seq: self.sequence,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -1,45 +1,22 @@
|
||||
use crate::{background_process, local_env::LocalEnv};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use diesel::{
|
||||
backend::Backend,
|
||||
query_builder::{AstPass, QueryFragment, QueryId},
|
||||
Connection, PgConnection, QueryResult, RunQueryDsl,
|
||||
};
|
||||
use diesel_migrations::{HarnessWithOutput, MigrationHarness};
|
||||
use hyper::Method;
|
||||
use pageserver_api::{
|
||||
models::{ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo},
|
||||
shard::TenantShardId,
|
||||
};
|
||||
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
|
||||
use postgres_backend::AuthType;
|
||||
use serde::{de::DeserializeOwned, Deserialize, Serialize};
|
||||
use std::{env, str::FromStr};
|
||||
use tokio::process::Command;
|
||||
use tracing::instrument;
|
||||
use url::Url;
|
||||
use utils::{
|
||||
auth::{Claims, Scope},
|
||||
id::{NodeId, TenantId},
|
||||
};
|
||||
use anyhow::anyhow;
|
||||
use camino::Utf8PathBuf;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{path::PathBuf, process::Child};
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
pub struct AttachmentService {
|
||||
env: LocalEnv,
|
||||
listen: String,
|
||||
path: Utf8PathBuf,
|
||||
jwt_token: Option<String>,
|
||||
public_key_path: Option<Utf8PathBuf>,
|
||||
postgres_port: u16,
|
||||
client: reqwest::Client,
|
||||
path: PathBuf,
|
||||
client: reqwest::blocking::Client,
|
||||
}
|
||||
|
||||
const COMMAND: &str = "attachment_service";
|
||||
|
||||
const ATTACHMENT_SERVICE_POSTGRES_VERSION: u32 = 16;
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct AttachHookRequest {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
pub tenant_id: TenantId,
|
||||
pub node_id: Option<NodeId>,
|
||||
}
|
||||
|
||||
@@ -50,7 +27,7 @@ pub struct AttachHookResponse {
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct InspectRequest {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
pub tenant_id: TenantId,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
@@ -58,131 +35,9 @@ pub struct InspectResponse {
|
||||
pub attachment: Option<(u32, NodeId)>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantCreateResponseShard {
|
||||
pub shard_id: TenantShardId,
|
||||
pub node_id: NodeId,
|
||||
pub generation: u32,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantCreateResponse {
|
||||
pub shards: Vec<TenantCreateResponseShard>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct NodeRegisterRequest {
|
||||
pub node_id: NodeId,
|
||||
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_pg_port: u16,
|
||||
|
||||
pub listen_http_addr: String,
|
||||
pub listen_http_port: u16,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct NodeConfigureRequest {
|
||||
pub node_id: NodeId,
|
||||
|
||||
pub availability: Option<NodeAvailability>,
|
||||
pub scheduling: Option<NodeSchedulingPolicy>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantLocateResponseShard {
|
||||
pub shard_id: TenantShardId,
|
||||
pub node_id: NodeId,
|
||||
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_pg_port: u16,
|
||||
|
||||
pub listen_http_addr: String,
|
||||
pub listen_http_port: u16,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantLocateResponse {
|
||||
pub shards: Vec<TenantLocateResponseShard>,
|
||||
pub shard_params: ShardParameters,
|
||||
}
|
||||
|
||||
/// Explicitly migrating a particular shard is a low level operation
|
||||
/// TODO: higher level "Reschedule tenant" operation where the request
|
||||
/// specifies some constraints, e.g. asking it to get off particular node(s)
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantShardMigrateRequest {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
pub node_id: NodeId,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Copy)]
|
||||
pub enum NodeAvailability {
|
||||
// Normal, happy state
|
||||
Active,
|
||||
// Offline: Tenants shouldn't try to attach here, but they may assume that their
|
||||
// secondary locations on this node still exist. Newly added nodes are in this
|
||||
// state until we successfully contact them.
|
||||
Offline,
|
||||
}
|
||||
|
||||
impl FromStr for NodeAvailability {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"active" => Ok(Self::Active),
|
||||
"offline" => Ok(Self::Offline),
|
||||
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
|
||||
/// type needs to be defined with diesel traits in there.
|
||||
#[derive(Serialize, Deserialize, Clone, Copy)]
|
||||
pub enum NodeSchedulingPolicy {
|
||||
Active,
|
||||
Filling,
|
||||
Pause,
|
||||
Draining,
|
||||
}
|
||||
|
||||
impl FromStr for NodeSchedulingPolicy {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"active" => Ok(Self::Active),
|
||||
"filling" => Ok(Self::Filling),
|
||||
"pause" => Ok(Self::Pause),
|
||||
"draining" => Ok(Self::Draining),
|
||||
_ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<NodeSchedulingPolicy> for String {
|
||||
fn from(value: NodeSchedulingPolicy) -> String {
|
||||
use NodeSchedulingPolicy::*;
|
||||
match value {
|
||||
Active => "active",
|
||||
Filling => "filling",
|
||||
Pause => "pause",
|
||||
Draining => "draining",
|
||||
}
|
||||
.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantShardMigrateResponse {}
|
||||
|
||||
impl AttachmentService {
|
||||
pub fn from_env(env: &LocalEnv) -> Self {
|
||||
let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
|
||||
.unwrap()
|
||||
.join("attachments.json");
|
||||
let path = env.base_data_dir.join("attachments.json");
|
||||
|
||||
// Makes no sense to construct this if pageservers aren't going to use it: assume
|
||||
// pageservers have control plane API set
|
||||
@@ -194,43 +49,11 @@ impl AttachmentService {
|
||||
listen_url.port().unwrap()
|
||||
);
|
||||
|
||||
// Convention: NeonEnv in python tests reserves the next port after the control_plane_api
|
||||
// port, for use by our captive postgres.
|
||||
let postgres_port = listen_url
|
||||
.port()
|
||||
.expect("Control plane API setting should always have a port")
|
||||
+ 1;
|
||||
|
||||
// Assume all pageservers have symmetric auth configuration: this service
|
||||
// expects to use one JWT token to talk to all of them.
|
||||
let ps_conf = env
|
||||
.pageservers
|
||||
.first()
|
||||
.expect("Config is validated to contain at least one pageserver");
|
||||
let (jwt_token, public_key_path) = match ps_conf.http_auth_type {
|
||||
AuthType::Trust => (None, None),
|
||||
AuthType::NeonJWT => {
|
||||
let jwt_token = env
|
||||
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))
|
||||
.unwrap();
|
||||
|
||||
// If pageserver auth is enabled, this implicitly enables auth for this service,
|
||||
// using the same credentials.
|
||||
let public_key_path =
|
||||
camino::Utf8PathBuf::try_from(env.base_data_dir.join("auth_public_key.pem"))
|
||||
.unwrap();
|
||||
(Some(jwt_token), Some(public_key_path))
|
||||
}
|
||||
};
|
||||
|
||||
Self {
|
||||
env: env.clone(),
|
||||
path,
|
||||
listen,
|
||||
jwt_token,
|
||||
public_key_path,
|
||||
postgres_port,
|
||||
client: reqwest::ClientBuilder::new()
|
||||
client: reqwest::blocking::ClientBuilder::new()
|
||||
.build()
|
||||
.expect("Failed to construct http client"),
|
||||
}
|
||||
@@ -241,430 +64,74 @@ impl AttachmentService {
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
/// PIDFile for the postgres instance used to store attachment service state
|
||||
fn postgres_pid_file(&self) -> Utf8PathBuf {
|
||||
Utf8PathBuf::from_path_buf(
|
||||
self.env
|
||||
.base_data_dir
|
||||
.join("attachment_service_postgres.pid"),
|
||||
)
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
/// In order to access database migrations, we need to find the Neon source tree
|
||||
async fn find_source_root(&self) -> anyhow::Result<Utf8PathBuf> {
|
||||
// We assume that either prd or our binary is in the source tree. The former is usually
|
||||
// true for automated test runners, the latter is usually true for developer workstations. Often
|
||||
// both are true, which is fine.
|
||||
let candidate_start_points = [
|
||||
// Current working directory
|
||||
Utf8PathBuf::from_path_buf(std::env::current_dir()?).unwrap(),
|
||||
// Directory containing the binary we're running inside
|
||||
Utf8PathBuf::from_path_buf(env::current_exe()?.parent().unwrap().to_owned()).unwrap(),
|
||||
];
|
||||
|
||||
// For each candidate start point, search through ancestors looking for a neon.git source tree root
|
||||
for start_point in &candidate_start_points {
|
||||
// Start from the build dir: assumes we are running out of a built neon source tree
|
||||
for path in start_point.ancestors() {
|
||||
// A crude approximation: the root of the source tree is whatever contains a "control_plane"
|
||||
// subdirectory.
|
||||
let control_plane = path.join("control_plane");
|
||||
if tokio::fs::try_exists(&control_plane).await? {
|
||||
return Ok(path.to_owned());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fall-through
|
||||
Err(anyhow::anyhow!(
|
||||
"Could not find control_plane src dir, after searching ancestors of {candidate_start_points:?}"
|
||||
))
|
||||
}
|
||||
|
||||
/// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
|
||||
///
|
||||
/// This usually uses ATTACHMENT_SERVICE_POSTGRES_VERSION of postgres, but will fall back
|
||||
/// to other versions if that one isn't found. Some automated tests create circumstances
|
||||
/// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
|
||||
pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
|
||||
let prefer_versions = [ATTACHMENT_SERVICE_POSTGRES_VERSION, 15, 14];
|
||||
|
||||
for v in prefer_versions {
|
||||
let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap();
|
||||
if tokio::fs::try_exists(&path).await? {
|
||||
return Ok(path);
|
||||
}
|
||||
}
|
||||
|
||||
// Fall through
|
||||
anyhow::bail!(
|
||||
"Postgres binaries not found in {}",
|
||||
self.env.pg_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
|
||||
/// Readiness check for our postgres process
|
||||
async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
|
||||
let bin_path = pg_bin_dir.join("pg_isready");
|
||||
let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)];
|
||||
let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;
|
||||
|
||||
Ok(exitcode.success())
|
||||
}
|
||||
|
||||
/// Create our database if it doesn't exist, and run migrations.
|
||||
///
|
||||
/// This function is equivalent to the `diesel setup` command in the diesel CLI. We implement
|
||||
/// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers
|
||||
/// who just want to run `cargo neon_local` without knowing about diesel.
|
||||
///
|
||||
/// Returns the database url
|
||||
pub async fn setup_database(&self) -> anyhow::Result<String> {
|
||||
let database_url = format!(
|
||||
"postgresql://localhost:{}/attachment_service",
|
||||
self.postgres_port
|
||||
);
|
||||
println!("Running attachment service database setup...");
|
||||
fn change_database_of_url(database_url: &str, default_database: &str) -> (String, String) {
|
||||
let base = ::url::Url::parse(database_url).unwrap();
|
||||
let database = base.path_segments().unwrap().last().unwrap().to_owned();
|
||||
let mut new_url = base.join(default_database).unwrap();
|
||||
new_url.set_query(base.query());
|
||||
(database, new_url.into())
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CreateDatabaseStatement {
|
||||
db_name: String,
|
||||
}
|
||||
|
||||
impl CreateDatabaseStatement {
|
||||
pub fn new(db_name: &str) -> Self {
|
||||
CreateDatabaseStatement {
|
||||
db_name: db_name.to_owned(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<DB: Backend> QueryFragment<DB> for CreateDatabaseStatement {
|
||||
fn walk_ast<'b>(&'b self, mut out: AstPass<'_, 'b, DB>) -> QueryResult<()> {
|
||||
out.push_sql("CREATE DATABASE ");
|
||||
out.push_identifier(&self.db_name)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<Conn> RunQueryDsl<Conn> for CreateDatabaseStatement {}
|
||||
|
||||
impl QueryId for CreateDatabaseStatement {
|
||||
type QueryId = ();
|
||||
|
||||
const HAS_STATIC_QUERY_ID: bool = false;
|
||||
}
|
||||
if PgConnection::establish(&database_url).is_err() {
|
||||
let (database, postgres_url) = change_database_of_url(&database_url, "postgres");
|
||||
println!("Creating database: {database}");
|
||||
let mut conn = PgConnection::establish(&postgres_url)?;
|
||||
CreateDatabaseStatement::new(&database).execute(&mut conn)?;
|
||||
}
|
||||
let mut conn = PgConnection::establish(&database_url)?;
|
||||
|
||||
let migrations_dir = self
|
||||
.find_source_root()
|
||||
.await?
|
||||
.join("control_plane/attachment_service/migrations");
|
||||
|
||||
let migrations = diesel_migrations::FileBasedMigrations::from_path(migrations_dir)?;
|
||||
println!("Running migrations in {}", migrations.path().display());
|
||||
HarnessWithOutput::write_to_stdout(&mut conn)
|
||||
.run_pending_migrations(migrations)
|
||||
.map(|_| ())
|
||||
.map_err(|e| anyhow::anyhow!(e))?;
|
||||
|
||||
println!("Migrations complete");
|
||||
|
||||
Ok(database_url)
|
||||
}
|
||||
|
||||
pub async fn start(&self) -> anyhow::Result<()> {
|
||||
// Start a vanilla Postgres process used by the attachment service for persistence.
|
||||
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
|
||||
.unwrap()
|
||||
.join("attachment_service_db");
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
let pg_log_path = pg_data_path.join("postgres.log");
|
||||
|
||||
if !tokio::fs::try_exists(&pg_data_path).await? {
|
||||
// Initialize empty database
|
||||
let initdb_path = pg_bin_dir.join("initdb");
|
||||
let mut child = Command::new(&initdb_path)
|
||||
.args(["-D", pg_data_path.as_ref()])
|
||||
.spawn()
|
||||
.expect("Failed to spawn initdb");
|
||||
let status = child.wait().await?;
|
||||
if !status.success() {
|
||||
anyhow::bail!("initdb failed with status {status}");
|
||||
}
|
||||
|
||||
tokio::fs::write(
|
||||
&pg_data_path.join("postgresql.conf"),
|
||||
format!("port = {}", self.postgres_port),
|
||||
)
|
||||
.await?;
|
||||
};
|
||||
|
||||
println!("Starting attachment service database...");
|
||||
let db_start_args = [
|
||||
"-w",
|
||||
"-D",
|
||||
pg_data_path.as_ref(),
|
||||
"-l",
|
||||
pg_log_path.as_ref(),
|
||||
"start",
|
||||
];
|
||||
|
||||
background_process::start_process(
|
||||
"attachment_service_db",
|
||||
&self.env.base_data_dir,
|
||||
pg_bin_dir.join("pg_ctl").as_std_path(),
|
||||
db_start_args,
|
||||
[],
|
||||
background_process::InitialPidFile::Create(self.postgres_pid_file()),
|
||||
|| self.pg_isready(&pg_bin_dir),
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Run migrations on every startup, in case something changed.
|
||||
let database_url = self.setup_database().await?;
|
||||
|
||||
let mut args = vec![
|
||||
"-l",
|
||||
&self.listen,
|
||||
"-p",
|
||||
self.path.as_ref(),
|
||||
"--database-url",
|
||||
&database_url,
|
||||
]
|
||||
.into_iter()
|
||||
.map(|s| s.to_string())
|
||||
.collect::<Vec<_>>();
|
||||
if let Some(jwt_token) = &self.jwt_token {
|
||||
args.push(format!("--jwt-token={jwt_token}"));
|
||||
}
|
||||
|
||||
if let Some(public_key_path) = &self.public_key_path {
|
||||
args.push(format!("--public-key={public_key_path}"));
|
||||
}
|
||||
pub fn start(&self) -> anyhow::Result<Child> {
|
||||
let path_str = self.path.to_string_lossy();
|
||||
|
||||
background_process::start_process(
|
||||
COMMAND,
|
||||
&self.env.base_data_dir,
|
||||
&self.env.attachment_service_bin(),
|
||||
args,
|
||||
[(
|
||||
"NEON_REPO_DIR".to_string(),
|
||||
self.env.base_data_dir.to_string_lossy().to_string(),
|
||||
)],
|
||||
background_process::InitialPidFile::Create(self.pid_file()),
|
||||
|| async {
|
||||
match self.status().await {
|
||||
Ok(_) => Ok(true),
|
||||
Err(_) => Ok(false),
|
||||
}
|
||||
},
|
||||
["-l", &self.listen, "-p", &path_str],
|
||||
[],
|
||||
background_process::InitialPidFile::Create(&self.pid_file()),
|
||||
// TODO: a real status check
|
||||
|| Ok(true),
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||
background_process::stop_process(immediate, COMMAND, &self.pid_file())?;
|
||||
|
||||
let pg_data_path = self.env.base_data_dir.join("attachment_service_db");
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
|
||||
println!("Stopping attachment service database...");
|
||||
let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"];
|
||||
let stop_status = Command::new(pg_bin_dir.join("pg_ctl"))
|
||||
.args(pg_stop_args)
|
||||
.spawn()?
|
||||
.wait()
|
||||
.await?;
|
||||
if !stop_status.success() {
|
||||
let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
|
||||
let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
|
||||
.args(pg_status_args)
|
||||
.spawn()?
|
||||
.wait()
|
||||
.await?;
|
||||
|
||||
// pg_ctl status returns this exit code if postgres is not running: in this case it is
|
||||
// fine that stop failed. Otherwise it is an error that stop failed.
|
||||
const PG_STATUS_NOT_RUNNING: i32 = 3;
|
||||
if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
|
||||
println!("Attachment service data base is already stopped");
|
||||
return Ok(());
|
||||
} else {
|
||||
anyhow::bail!("Failed to stop attachment service database: {stop_status}")
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Simple HTTP request wrapper for calling into attachment service
|
||||
async fn dispatch<RQ, RS>(
|
||||
&self,
|
||||
method: hyper::Method,
|
||||
path: String,
|
||||
body: Option<RQ>,
|
||||
) -> anyhow::Result<RS>
|
||||
where
|
||||
RQ: Serialize + Sized,
|
||||
RS: DeserializeOwned + Sized,
|
||||
{
|
||||
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
|
||||
// for general purpose API access.
|
||||
let listen_url = self.env.control_plane_api.clone().unwrap();
|
||||
let url = Url::from_str(&format!(
|
||||
"http://{}:{}/{path}",
|
||||
listen_url.host_str().unwrap(),
|
||||
listen_url.port().unwrap()
|
||||
))
|
||||
.unwrap();
|
||||
|
||||
let mut builder = self.client.request(method, url);
|
||||
if let Some(body) = body {
|
||||
builder = builder.json(&body)
|
||||
}
|
||||
if let Some(jwt_token) = &self.jwt_token {
|
||||
builder = builder.header(
|
||||
reqwest::header::AUTHORIZATION,
|
||||
format!("Bearer {jwt_token}"),
|
||||
);
|
||||
}
|
||||
|
||||
let response = builder.send().await?;
|
||||
let response = response.error_from_body().await?;
|
||||
|
||||
Ok(response
|
||||
.json()
|
||||
.await
|
||||
.map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?)
|
||||
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||
background_process::stop_process(immediate, COMMAND, &self.pid_file())
|
||||
}
|
||||
|
||||
/// Call into the attach_hook API, for use before handing out attachments to pageservers
|
||||
#[instrument(skip(self))]
|
||||
pub async fn attach_hook(
|
||||
pub fn attach_hook(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
tenant_id: TenantId,
|
||||
pageserver_id: NodeId,
|
||||
) -> anyhow::Result<Option<u32>> {
|
||||
use hyper::StatusCode;
|
||||
|
||||
let url = self
|
||||
.env
|
||||
.control_plane_api
|
||||
.clone()
|
||||
.unwrap()
|
||||
.join("attach-hook")
|
||||
.unwrap();
|
||||
|
||||
let request = AttachHookRequest {
|
||||
tenant_shard_id,
|
||||
tenant_id,
|
||||
node_id: Some(pageserver_id),
|
||||
};
|
||||
|
||||
let response = self
|
||||
.dispatch::<_, AttachHookResponse>(
|
||||
Method::POST,
|
||||
"debug/v1/attach-hook".to_string(),
|
||||
Some(request),
|
||||
)
|
||||
.await?;
|
||||
let response = self.client.post(url).json(&request).send()?;
|
||||
if response.status() != StatusCode::OK {
|
||||
return Err(anyhow!("Unexpected status {}", response.status()));
|
||||
}
|
||||
|
||||
let response = response.json::<AttachHookResponse>()?;
|
||||
Ok(response.gen)
|
||||
}
|
||||
|
||||
#[instrument(skip(self))]
|
||||
pub async fn inspect(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
) -> anyhow::Result<Option<(u32, NodeId)>> {
|
||||
let request = InspectRequest { tenant_shard_id };
|
||||
pub fn inspect(&self, tenant_id: TenantId) -> anyhow::Result<Option<(u32, NodeId)>> {
|
||||
use hyper::StatusCode;
|
||||
|
||||
let response = self
|
||||
.dispatch::<_, InspectResponse>(
|
||||
Method::POST,
|
||||
"debug/v1/inspect".to_string(),
|
||||
Some(request),
|
||||
)
|
||||
.await?;
|
||||
let url = self
|
||||
.env
|
||||
.control_plane_api
|
||||
.clone()
|
||||
.unwrap()
|
||||
.join("inspect")
|
||||
.unwrap();
|
||||
|
||||
let request = InspectRequest { tenant_id };
|
||||
|
||||
let response = self.client.post(url).json(&request).send()?;
|
||||
if response.status() != StatusCode::OK {
|
||||
return Err(anyhow!("Unexpected status {}", response.status()));
|
||||
}
|
||||
|
||||
let response = response.json::<InspectResponse>()?;
|
||||
Ok(response.attachment)
|
||||
}
|
||||
|
||||
#[instrument(skip(self))]
|
||||
pub async fn tenant_create(
|
||||
&self,
|
||||
req: TenantCreateRequest,
|
||||
) -> anyhow::Result<TenantCreateResponse> {
|
||||
self.dispatch(Method::POST, "v1/tenant".to_string(), Some(req))
|
||||
.await
|
||||
}
|
||||
|
||||
#[instrument(skip(self))]
|
||||
pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
|
||||
self.dispatch::<(), _>(
|
||||
Method::GET,
|
||||
format!("control/v1/tenant/{tenant_id}/locate"),
|
||||
None,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
#[instrument(skip(self))]
|
||||
pub async fn tenant_migrate(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
node_id: NodeId,
|
||||
) -> anyhow::Result<TenantShardMigrateResponse> {
|
||||
self.dispatch(
|
||||
Method::PUT,
|
||||
format!("tenant/{tenant_shard_id}/migrate"),
|
||||
Some(TenantShardMigrateRequest {
|
||||
tenant_shard_id,
|
||||
node_id,
|
||||
}),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(node_id=%req.node_id))]
|
||||
pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
|
||||
self.dispatch::<_, ()>(Method::POST, "control/v1/node".to_string(), Some(req))
|
||||
.await
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(node_id=%req.node_id))]
|
||||
pub async fn node_configure(&self, req: NodeConfigureRequest) -> anyhow::Result<()> {
|
||||
self.dispatch::<_, ()>(
|
||||
Method::PUT,
|
||||
format!("control/v1/node/{}/config", req.node_id),
|
||||
Some(req),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
#[instrument(skip(self))]
|
||||
pub async fn status(&self) -> anyhow::Result<()> {
|
||||
self.dispatch::<(), ()>(Method::GET, "status".to_string(), None)
|
||||
.await
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(%tenant_id, timeline_id=%req.new_timeline_id))]
|
||||
pub async fn tenant_timeline_create(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
req: TimelineCreateRequest,
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
self.dispatch(
|
||||
Method::POST,
|
||||
format!("v1/tenant/{tenant_id}/timeline"),
|
||||
Some(req),
|
||||
)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,7 +17,7 @@ use std::io::Write;
|
||||
use std::os::unix::prelude::AsRawFd;
|
||||
use std::os::unix::process::CommandExt;
|
||||
use std::path::Path;
|
||||
use std::process::Command;
|
||||
use std::process::{Child, Command};
|
||||
use std::time::Duration;
|
||||
use std::{fs, io, thread};
|
||||
|
||||
@@ -44,15 +44,15 @@ const NOTICE_AFTER_RETRIES: u64 = 50;
|
||||
|
||||
/// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates
|
||||
/// it itself.
|
||||
pub enum InitialPidFile {
|
||||
pub enum InitialPidFile<'t> {
|
||||
/// Create a pidfile, to allow future CLI invocations to manipulate the process.
|
||||
Create(Utf8PathBuf),
|
||||
Create(&'t Utf8Path),
|
||||
/// The process will create the pidfile itself, need to wait for that event.
|
||||
Expect(Utf8PathBuf),
|
||||
Expect(&'t Utf8Path),
|
||||
}
|
||||
|
||||
/// Start a background child process using the parameters given.
|
||||
pub async fn start_process<F, Fut, AI, A, EI>(
|
||||
pub fn start_process<F, AI, A, EI>(
|
||||
process_name: &str,
|
||||
datadir: &Path,
|
||||
command: &Path,
|
||||
@@ -60,10 +60,9 @@ pub async fn start_process<F, Fut, AI, A, EI>(
|
||||
envs: EI,
|
||||
initial_pid_file: InitialPidFile,
|
||||
process_status_check: F,
|
||||
) -> anyhow::Result<()>
|
||||
) -> anyhow::Result<Child>
|
||||
where
|
||||
F: Fn() -> Fut,
|
||||
Fut: std::future::Future<Output = anyhow::Result<bool>>,
|
||||
F: Fn() -> anyhow::Result<bool>,
|
||||
AI: IntoIterator<Item = A>,
|
||||
A: AsRef<OsStr>,
|
||||
// Not generic AsRef<OsStr>, otherwise empty `envs` prevents type inference
|
||||
@@ -90,7 +89,7 @@ where
|
||||
let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command));
|
||||
filled_cmd.envs(envs);
|
||||
|
||||
let pid_file_to_check = match &initial_pid_file {
|
||||
let pid_file_to_check = match initial_pid_file {
|
||||
InitialPidFile::Create(path) => {
|
||||
pre_exec_create_pidfile(filled_cmd, path);
|
||||
path
|
||||
@@ -98,7 +97,7 @@ where
|
||||
InitialPidFile::Expect(path) => path,
|
||||
};
|
||||
|
||||
let spawned_process = filled_cmd.spawn().with_context(|| {
|
||||
let mut spawned_process = filled_cmd.spawn().with_context(|| {
|
||||
format!("Could not spawn {process_name}, see console output and log files for details.")
|
||||
})?;
|
||||
let pid = spawned_process.id();
|
||||
@@ -106,26 +105,12 @@ where
|
||||
i32::try_from(pid)
|
||||
.with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?,
|
||||
);
|
||||
// set up a scopeguard to kill & wait for the child in case we panic or bail below
|
||||
let spawned_process = scopeguard::guard(spawned_process, |mut spawned_process| {
|
||||
println!("SIGKILL & wait the started process");
|
||||
(|| {
|
||||
// TODO: use another signal that can be caught by the child so it can clean up any children it spawned (e..g, walredo).
|
||||
spawned_process.kill().context("SIGKILL child")?;
|
||||
spawned_process.wait().context("wait() for child process")?;
|
||||
anyhow::Ok(())
|
||||
})()
|
||||
.with_context(|| format!("scopeguard kill&wait child {process_name:?}"))
|
||||
.unwrap();
|
||||
});
|
||||
|
||||
for retries in 0..RETRIES {
|
||||
match process_started(pid, pid_file_to_check, &process_status_check).await {
|
||||
match process_started(pid, Some(pid_file_to_check), &process_status_check) {
|
||||
Ok(true) => {
|
||||
println!("\n{process_name} started and passed status check, pid: {pid}");
|
||||
// leak the child process, it'll outlive this neon_local invocation
|
||||
drop(scopeguard::ScopeGuard::into_inner(spawned_process));
|
||||
return Ok(());
|
||||
println!("\n{process_name} started, pid: {pid}");
|
||||
return Ok(spawned_process);
|
||||
}
|
||||
Ok(false) => {
|
||||
if retries == NOTICE_AFTER_RETRIES {
|
||||
@@ -140,15 +125,16 @@ where
|
||||
thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
|
||||
}
|
||||
Err(e) => {
|
||||
println!("error starting process {process_name:?}: {e:#}");
|
||||
println!("{process_name} failed to start: {e:#}");
|
||||
if let Err(e) = spawned_process.kill() {
|
||||
println!("Could not stop {process_name} subprocess: {e:#}")
|
||||
};
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
println!();
|
||||
anyhow::bail!(
|
||||
"{process_name} did not start+pass status checks within {RETRY_UNTIL_SECS} seconds"
|
||||
);
|
||||
anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds");
|
||||
}
|
||||
|
||||
/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
|
||||
@@ -256,9 +242,7 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
|
||||
for env_key in [
|
||||
"AWS_ACCESS_KEY_ID",
|
||||
"AWS_SECRET_ACCESS_KEY",
|
||||
"AWS_PROFILE",
|
||||
// HOME is needed in combination with `AWS_PROFILE` to pick up the SSO sessions.
|
||||
"HOME",
|
||||
"AWS_SESSION_TOKEN",
|
||||
"AZURE_STORAGE_ACCOUNT",
|
||||
"AZURE_STORAGE_ACCESS_KEY",
|
||||
] {
|
||||
@@ -332,20 +316,22 @@ where
|
||||
cmd
|
||||
}
|
||||
|
||||
async fn process_started<F, Fut>(
|
||||
fn process_started<F>(
|
||||
pid: Pid,
|
||||
pid_file_to_check: &Utf8Path,
|
||||
pid_file_to_check: Option<&Utf8Path>,
|
||||
status_check: &F,
|
||||
) -> anyhow::Result<bool>
|
||||
where
|
||||
F: Fn() -> Fut,
|
||||
Fut: std::future::Future<Output = anyhow::Result<bool>>,
|
||||
F: Fn() -> anyhow::Result<bool>,
|
||||
{
|
||||
match status_check().await {
|
||||
Ok(true) => match pid_file::read(pid_file_to_check)? {
|
||||
PidFileRead::NotExist => Ok(false),
|
||||
PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid),
|
||||
PidFileRead::NotHeldByAnyProcess(_) => Ok(false),
|
||||
match status_check() {
|
||||
Ok(true) => match pid_file_to_check {
|
||||
Some(pid_file_path) => match pid_file::read(pid_file_path)? {
|
||||
PidFileRead::NotExist => Ok(false),
|
||||
PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid),
|
||||
PidFileRead::NotHeldByAnyProcess(_) => Ok(false),
|
||||
},
|
||||
None => Ok(true),
|
||||
},
|
||||
Ok(false) => Ok(false),
|
||||
Err(e) => anyhow::bail!("process failed to start: {e}"),
|
||||
|
||||
320
control_plane/src/bin/attachment_service.rs
Normal file
320
control_plane/src/bin/attachment_service.rs
Normal file
@@ -0,0 +1,320 @@
|
||||
/// The attachment service mimics the aspects of the control plane API
|
||||
/// that are required for a pageserver to operate.
|
||||
///
|
||||
/// This enables running & testing pageservers without a full-blown
|
||||
/// deployment of the Neon cloud platform.
|
||||
///
|
||||
use anyhow::anyhow;
|
||||
use clap::Parser;
|
||||
use hex::FromHex;
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
use utils::http::endpoint::request_span;
|
||||
use utils::logging::{self, LogFormat};
|
||||
use utils::signals::{ShutdownSignals, Signal};
|
||||
|
||||
use utils::{
|
||||
http::{
|
||||
endpoint::{self},
|
||||
error::ApiError,
|
||||
json::{json_request, json_response},
|
||||
RequestExt, RouterBuilder,
|
||||
},
|
||||
id::{NodeId, TenantId},
|
||||
tcp_listener,
|
||||
};
|
||||
|
||||
use pageserver_api::control_api::{
|
||||
ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse,
|
||||
ValidateResponseTenant,
|
||||
};
|
||||
|
||||
use control_plane::attachment_service::{
|
||||
AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
|
||||
};
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(author, version, about, long_about = None)]
|
||||
#[command(arg_required_else_help(true))]
|
||||
struct Cli {
|
||||
/// Host and port to listen on, like `127.0.0.1:1234`
|
||||
#[arg(short, long)]
|
||||
listen: std::net::SocketAddr,
|
||||
|
||||
/// Path to the .json file to store state (will be created if it doesn't exist)
|
||||
#[arg(short, long)]
|
||||
path: PathBuf,
|
||||
}
|
||||
|
||||
// The persistent state of each Tenant
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
struct TenantState {
|
||||
// Currently attached pageserver
|
||||
pageserver: Option<NodeId>,
|
||||
|
||||
// Latest generation number: next time we attach, increment this
|
||||
// and use the incremented number when attaching
|
||||
generation: u32,
|
||||
}
|
||||
|
||||
fn to_hex_map<S, V>(input: &HashMap<TenantId, V>, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
V: Clone + Serialize,
|
||||
{
|
||||
let transformed = input.iter().map(|(k, v)| (hex::encode(k), v.clone()));
|
||||
|
||||
transformed
|
||||
.collect::<HashMap<String, V>>()
|
||||
.serialize(serializer)
|
||||
}
|
||||
|
||||
fn from_hex_map<'de, D, V>(deserializer: D) -> Result<HashMap<TenantId, V>, D::Error>
|
||||
where
|
||||
D: serde::de::Deserializer<'de>,
|
||||
V: Deserialize<'de>,
|
||||
{
|
||||
let hex_map = HashMap::<String, V>::deserialize(deserializer)?;
|
||||
hex_map
|
||||
.into_iter()
|
||||
.map(|(k, v)| {
|
||||
TenantId::from_hex(k)
|
||||
.map(|k| (k, v))
|
||||
.map_err(serde::de::Error::custom)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
// Top level state available to all HTTP handlers
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct PersistentState {
|
||||
#[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
|
||||
tenants: HashMap<TenantId, TenantState>,
|
||||
|
||||
#[serde(skip)]
|
||||
path: PathBuf,
|
||||
}
|
||||
|
||||
impl PersistentState {
|
||||
async fn save(&self) -> anyhow::Result<()> {
|
||||
let bytes = serde_json::to_vec(self)?;
|
||||
tokio::fs::write(&self.path, &bytes).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn load(path: &Path) -> anyhow::Result<Self> {
|
||||
let bytes = tokio::fs::read(path).await?;
|
||||
let mut decoded = serde_json::from_slice::<Self>(&bytes)?;
|
||||
decoded.path = path.to_owned();
|
||||
Ok(decoded)
|
||||
}
|
||||
|
||||
async fn load_or_new(path: &Path) -> Self {
|
||||
match Self::load(path).await {
|
||||
Ok(s) => {
|
||||
tracing::info!("Loaded state file at {}", path.display());
|
||||
s
|
||||
}
|
||||
Err(e)
|
||||
if e.downcast_ref::<std::io::Error>()
|
||||
.map(|e| e.kind() == std::io::ErrorKind::NotFound)
|
||||
.unwrap_or(false) =>
|
||||
{
|
||||
tracing::info!("Will create state file at {}", path.display());
|
||||
Self {
|
||||
tenants: HashMap::new(),
|
||||
path: path.to_owned(),
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path.display())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// State available to HTTP request handlers
|
||||
#[derive(Clone)]
|
||||
struct State {
|
||||
inner: Arc<tokio::sync::RwLock<PersistentState>>,
|
||||
}
|
||||
|
||||
impl State {
|
||||
fn new(persistent_state: PersistentState) -> State {
|
||||
Self {
|
||||
inner: Arc::new(tokio::sync::RwLock::new(persistent_state)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn get_state(request: &Request<Body>) -> &State {
|
||||
request
|
||||
.data::<Arc<State>>()
|
||||
.expect("unknown state type")
|
||||
.as_ref()
|
||||
}
|
||||
|
||||
/// Pageserver calls into this on startup, to learn which tenants it should attach
|
||||
async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
|
||||
|
||||
let state = get_state(&req).inner.clone();
|
||||
let mut locked = state.write().await;
|
||||
|
||||
let mut response = ReAttachResponse {
|
||||
tenants: Vec::new(),
|
||||
};
|
||||
for (t, state) in &mut locked.tenants {
|
||||
if state.pageserver == Some(reattach_req.node_id) {
|
||||
state.generation += 1;
|
||||
response.tenants.push(ReAttachResponseTenant {
|
||||
id: *t,
|
||||
gen: state.generation,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
locked.save().await.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, response)
|
||||
}
|
||||
|
||||
/// Pageserver calls into this before doing deletions, to confirm that it still
|
||||
/// holds the latest generation for the tenants with deletions enqueued
|
||||
async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let validate_req = json_request::<ValidateRequest>(&mut req).await?;
|
||||
|
||||
let locked = get_state(&req).inner.read().await;
|
||||
|
||||
let mut response = ValidateResponse {
|
||||
tenants: Vec::new(),
|
||||
};
|
||||
|
||||
for req_tenant in validate_req.tenants {
|
||||
if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
|
||||
let valid = tenant_state.generation == req_tenant.gen;
|
||||
response.tenants.push(ValidateResponseTenant {
|
||||
id: req_tenant.id,
|
||||
valid,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
json_response(StatusCode::OK, response)
|
||||
}
|
||||
/// Call into this before attaching a tenant to a pageserver, to acquire a generation number
|
||||
/// (in the real control plane this is unnecessary, because the same program is managing
|
||||
/// generation numbers and doing attachments).
|
||||
async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
|
||||
|
||||
let state = get_state(&req).inner.clone();
|
||||
let mut locked = state.write().await;
|
||||
|
||||
let tenant_state = locked
|
||||
.tenants
|
||||
.entry(attach_req.tenant_id)
|
||||
.or_insert_with(|| TenantState {
|
||||
pageserver: attach_req.node_id,
|
||||
generation: 0,
|
||||
});
|
||||
|
||||
if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
|
||||
tenant_state.generation += 1;
|
||||
tracing::info!(
|
||||
tenant_id = %attach_req.tenant_id,
|
||||
ps_id = %attaching_pageserver,
|
||||
generation = %tenant_state.generation,
|
||||
"issuing",
|
||||
);
|
||||
} else if let Some(ps_id) = tenant_state.pageserver {
|
||||
tracing::info!(
|
||||
tenant_id = %attach_req.tenant_id,
|
||||
%ps_id,
|
||||
generation = %tenant_state.generation,
|
||||
"dropping",
|
||||
);
|
||||
} else {
|
||||
tracing::info!(
|
||||
tenant_id = %attach_req.tenant_id,
|
||||
"no-op: tenant already has no pageserver");
|
||||
}
|
||||
tenant_state.pageserver = attach_req.node_id;
|
||||
let generation = tenant_state.generation;
|
||||
|
||||
locked.save().await.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
AttachHookResponse {
|
||||
gen: attach_req.node_id.map(|_| generation),
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let inspect_req = json_request::<InspectRequest>(&mut req).await?;
|
||||
|
||||
let state = get_state(&req).inner.clone();
|
||||
let locked = state.write().await;
|
||||
let tenant_state = locked.tenants.get(&inspect_req.tenant_id);
|
||||
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
InspectResponse {
|
||||
attachment: tenant_state.and_then(|s| s.pageserver.map(|ps| (s.generation, ps))),
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
endpoint::make_router()
|
||||
.data(Arc::new(State::new(persistent_state)))
|
||||
.post("/re-attach", |r| request_span(r, handle_re_attach))
|
||||
.post("/validate", |r| request_span(r, handle_validate))
|
||||
.post("/attach-hook", |r| request_span(r, handle_attach_hook))
|
||||
.post("/inspect", |r| request_span(r, handle_inspect))
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
logging::init(
|
||||
LogFormat::Plain,
|
||||
logging::TracingErrorLayerEnablement::Disabled,
|
||||
)?;
|
||||
|
||||
let args = Cli::parse();
|
||||
tracing::info!(
|
||||
"Starting, state at {}, listening on {}",
|
||||
args.path.to_string_lossy(),
|
||||
args.listen
|
||||
);
|
||||
|
||||
let persistent_state = PersistentState::load_or_new(&args.path).await;
|
||||
|
||||
let http_listener = tcp_listener::bind(args.listen)?;
|
||||
let router = make_router(persistent_state)
|
||||
.build()
|
||||
.map_err(|err| anyhow!(err))?;
|
||||
let service = utils::http::RouterService::new(router).unwrap();
|
||||
let server = hyper::Server::from_tcp(http_listener)?.serve(service);
|
||||
|
||||
tracing::info!("Serving on {0}", args.listen);
|
||||
|
||||
tokio::task::spawn(server);
|
||||
|
||||
ShutdownSignals::handle(|signal| match signal {
|
||||
Signal::Interrupt | Signal::Terminate | Signal::Quit => {
|
||||
tracing::info!("Got {}. Terminating", signal.name());
|
||||
// We're just a test helper: no graceful shutdown.
|
||||
std::process::exit(0);
|
||||
}
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -6,26 +6,21 @@
|
||||
//! rely on `neon_local` to set up the environment for each test.
|
||||
//!
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
|
||||
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
|
||||
use compute_api::spec::ComputeMode;
|
||||
use control_plane::attachment_service::{
|
||||
AttachmentService, NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy,
|
||||
};
|
||||
use control_plane::attachment_service::AttachmentService;
|
||||
use control_plane::endpoint::ComputeControlPlane;
|
||||
use control_plane::local_env::{InitForceMode, LocalEnv};
|
||||
use control_plane::local_env::LocalEnv;
|
||||
use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
|
||||
use control_plane::safekeeper::SafekeeperNode;
|
||||
use control_plane::tenant_migration::migrate_tenant;
|
||||
use control_plane::{broker, local_env};
|
||||
use pageserver_api::models::{
|
||||
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
|
||||
};
|
||||
use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
|
||||
use pageserver_api::models::TimelineInfo;
|
||||
use pageserver_api::{
|
||||
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
|
||||
DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
|
||||
};
|
||||
use postgres_backend::AuthType;
|
||||
use postgres_connection::parse_host_port;
|
||||
use safekeeper_api::{
|
||||
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
|
||||
DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
|
||||
@@ -35,7 +30,6 @@ use std::path::PathBuf;
|
||||
use std::process::exit;
|
||||
use std::str::FromStr;
|
||||
use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
|
||||
use url::Host;
|
||||
use utils::{
|
||||
auth::{Claims, Scope},
|
||||
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
||||
@@ -51,7 +45,7 @@ project_git_version!(GIT_VERSION);
|
||||
|
||||
const DEFAULT_PG_VERSION: &str = "15";
|
||||
|
||||
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
|
||||
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/";
|
||||
|
||||
fn default_conf(num_pageservers: u16) -> String {
|
||||
let mut template = format!(
|
||||
@@ -126,20 +120,15 @@ fn main() -> Result<()> {
|
||||
let mut env = LocalEnv::load_config().context("Error loading config")?;
|
||||
let original_env = env.clone();
|
||||
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let subcommand_result = match sub_name {
|
||||
"tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
|
||||
"timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
|
||||
"start" => rt.block_on(handle_start_all(sub_args, &env)),
|
||||
"stop" => rt.block_on(handle_stop_all(sub_args, &env)),
|
||||
"pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
|
||||
"attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)),
|
||||
"safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
|
||||
"endpoint" => rt.block_on(handle_endpoint(sub_args, &env)),
|
||||
"tenant" => handle_tenant(sub_args, &mut env),
|
||||
"timeline" => handle_timeline(sub_args, &mut env),
|
||||
"start" => handle_start_all(sub_args, &env),
|
||||
"stop" => handle_stop_all(sub_args, &env),
|
||||
"pageserver" => handle_pageserver(sub_args, &env),
|
||||
"attachment_service" => handle_attachment_service(sub_args, &env),
|
||||
"safekeeper" => handle_safekeeper(sub_args, &env),
|
||||
"endpoint" => handle_endpoint(sub_args, &env),
|
||||
"mappings" => handle_mappings(sub_args, &mut env),
|
||||
"pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
|
||||
_ => bail!("unexpected subcommand {sub_name}"),
|
||||
@@ -179,7 +168,7 @@ fn print_timelines_tree(
|
||||
info: t.clone(),
|
||||
children: BTreeSet::new(),
|
||||
name: timeline_name_mappings
|
||||
.remove(&TenantTimelineId::new(t.tenant_id.tenant_id, t.timeline_id)),
|
||||
.remove(&TenantTimelineId::new(t.tenant_id, t.timeline_id)),
|
||||
},
|
||||
)
|
||||
})
|
||||
@@ -280,13 +269,12 @@ fn print_timeline(
|
||||
|
||||
/// Returns a map of timeline IDs to timeline_id@lsn strings.
|
||||
/// Connects to the pageserver to query this information.
|
||||
async fn get_timeline_infos(
|
||||
fn get_timeline_infos(
|
||||
env: &local_env::LocalEnv,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
tenant_id: &TenantId,
|
||||
) -> Result<HashMap<TimelineId, TimelineInfo>> {
|
||||
Ok(get_default_pageserver(env)
|
||||
.timeline_list(tenant_shard_id)
|
||||
.await?
|
||||
.timeline_list(tenant_id)?
|
||||
.into_iter()
|
||||
.map(|timeline_info| (timeline_info.timeline_id, timeline_info))
|
||||
.collect())
|
||||
@@ -303,20 +291,6 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to parse --tenant_id option, for commands that accept a shard suffix
|
||||
fn get_tenant_shard_id(
|
||||
sub_match: &ArgMatches,
|
||||
env: &local_env::LocalEnv,
|
||||
) -> anyhow::Result<TenantShardId> {
|
||||
if let Some(tenant_id_from_arguments) = parse_tenant_shard_id(sub_match).transpose() {
|
||||
tenant_id_from_arguments
|
||||
} else if let Some(default_id) = env.default_tenant_id {
|
||||
Ok(TenantShardId::unsharded(default_id))
|
||||
} else {
|
||||
anyhow::bail!("No tenant shard id. Use --tenant-id, or set a default tenant");
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TenantId>> {
|
||||
sub_match
|
||||
.get_one::<String>("tenant-id")
|
||||
@@ -325,14 +299,6 @@ fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TenantId>> {
|
||||
.context("Failed to parse tenant id from the argument string")
|
||||
}
|
||||
|
||||
fn parse_tenant_shard_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TenantShardId>> {
|
||||
sub_match
|
||||
.get_one::<String>("tenant-id")
|
||||
.map(|id_str| TenantShardId::from_str(id_str))
|
||||
.transpose()
|
||||
.context("Failed to parse tenant shard id from the argument string")
|
||||
}
|
||||
|
||||
fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId>> {
|
||||
sub_match
|
||||
.get_one::<String>("timeline-id")
|
||||
@@ -366,7 +332,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
|
||||
|
||||
let mut env =
|
||||
LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
|
||||
let force = init_match.get_one("force").expect("we set a default value");
|
||||
let force = init_match.get_flag("force");
|
||||
env.init(pg_version, force)
|
||||
.context("Failed to initialize neon repository")?;
|
||||
|
||||
@@ -407,82 +373,51 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
|
||||
.collect()
|
||||
}
|
||||
|
||||
async fn handle_tenant(
|
||||
tenant_match: &ArgMatches,
|
||||
env: &mut local_env::LocalEnv,
|
||||
) -> anyhow::Result<()> {
|
||||
fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
let pageserver = get_default_pageserver(env);
|
||||
match tenant_match.subcommand() {
|
||||
Some(("list", _)) => {
|
||||
for t in pageserver.tenant_list().await? {
|
||||
for t in pageserver.tenant_list()? {
|
||||
println!("{} {:?}", t.id, t.state);
|
||||
}
|
||||
}
|
||||
Some(("create", create_match)) => {
|
||||
let tenant_conf: HashMap<_, _> = create_match
|
||||
.get_many::<String>("config")
|
||||
.map(|vals: clap::parser::ValuesRef<'_, String>| {
|
||||
vals.flat_map(|c| c.split_once(':')).collect()
|
||||
})
|
||||
.map(|vals| vals.flat_map(|c| c.split_once(':')).collect())
|
||||
.unwrap_or_default();
|
||||
|
||||
let shard_count: u8 = create_match
|
||||
.get_one::<u8>("shard-count")
|
||||
.cloned()
|
||||
.unwrap_or(0);
|
||||
|
||||
let shard_stripe_size: Option<u32> =
|
||||
create_match.get_one::<u32>("shard-stripe-size").cloned();
|
||||
|
||||
let tenant_conf = PageServerNode::parse_config(tenant_conf)?;
|
||||
|
||||
// If tenant ID was not specified, generate one
|
||||
let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate);
|
||||
|
||||
// We must register the tenant with the attachment service, so
|
||||
// that when the pageserver restarts, it will be re-attached.
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
attachment_service
|
||||
.tenant_create(TenantCreateRequest {
|
||||
// Note that ::unsharded here isn't actually because the tenant is unsharded, its because the
|
||||
// attachment service expecfs a shard-naive tenant_id in this attribute, and the TenantCreateRequest
|
||||
// type is used both in attachment service (for creating tenants) and in pageserver (for creating shards)
|
||||
new_tenant_id: TenantShardId::unsharded(tenant_id),
|
||||
generation: None,
|
||||
shard_parameters: ShardParameters {
|
||||
count: ShardCount(shard_count),
|
||||
stripe_size: shard_stripe_size
|
||||
.map(ShardStripeSize)
|
||||
.unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE),
|
||||
},
|
||||
config: tenant_conf,
|
||||
})
|
||||
.await?;
|
||||
let generation = if env.control_plane_api.is_some() {
|
||||
// We must register the tenant with the attachment service, so
|
||||
// that when the pageserver restarts, it will be re-attached.
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
attachment_service.attach_hook(tenant_id, pageserver.conf.id)?
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
pageserver.tenant_create(tenant_id, generation, tenant_conf)?;
|
||||
println!("tenant {tenant_id} successfully created on the pageserver");
|
||||
|
||||
// Create an initial timeline for the new tenant
|
||||
let new_timeline_id =
|
||||
parse_timeline_id(create_match)?.unwrap_or(TimelineId::generate());
|
||||
let new_timeline_id = parse_timeline_id(create_match)?;
|
||||
let pg_version = create_match
|
||||
.get_one::<u32>("pg-version")
|
||||
.copied()
|
||||
.context("Failed to parse postgres version from the argument string")?;
|
||||
|
||||
// FIXME: passing None for ancestor_start_lsn is not kosher in a sharded world: we can't have
|
||||
// different shards picking different start lsns. Maybe we have to teach attachment service
|
||||
// to let shard 0 branch first and then propagate the chosen LSN to other shards.
|
||||
attachment_service
|
||||
.tenant_timeline_create(
|
||||
tenant_id,
|
||||
TimelineCreateRequest {
|
||||
new_timeline_id,
|
||||
ancestor_timeline_id: None,
|
||||
ancestor_start_lsn: None,
|
||||
existing_initdb_timeline_id: None,
|
||||
pg_version: Some(pg_version),
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
let timeline_info = pageserver.timeline_create(
|
||||
tenant_id,
|
||||
new_timeline_id,
|
||||
None,
|
||||
None,
|
||||
Some(pg_version),
|
||||
)?;
|
||||
let new_timeline_id = timeline_info.timeline_id;
|
||||
let last_record_lsn = timeline_info.last_record_lsn;
|
||||
|
||||
env.register_branch_mapping(
|
||||
DEFAULT_BRANCH_NAME.to_string(),
|
||||
@@ -490,7 +425,9 @@ async fn handle_tenant(
|
||||
new_timeline_id,
|
||||
)?;
|
||||
|
||||
println!("Created an initial timeline '{new_timeline_id}' for tenant: {tenant_id}",);
|
||||
println!(
|
||||
"Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {tenant_id}",
|
||||
);
|
||||
|
||||
if create_match.get_flag("set-default") {
|
||||
println!("Setting tenant {tenant_id} as a default one");
|
||||
@@ -512,84 +449,31 @@ async fn handle_tenant(
|
||||
|
||||
pageserver
|
||||
.tenant_config(tenant_id, tenant_conf)
|
||||
.await
|
||||
.with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
|
||||
println!("tenant {tenant_id} successfully configured on the pageserver");
|
||||
}
|
||||
Some(("migrate", matches)) => {
|
||||
let tenant_shard_id = get_tenant_shard_id(matches, env)?;
|
||||
let tenant_id = get_tenant_id(matches, env)?;
|
||||
let new_pageserver = get_pageserver(env, matches)?;
|
||||
let new_pageserver_id = new_pageserver.conf.id;
|
||||
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
attachment_service
|
||||
.tenant_migrate(tenant_shard_id, new_pageserver_id)
|
||||
.await?;
|
||||
|
||||
println!("tenant {tenant_shard_id} migrated to {}", new_pageserver_id);
|
||||
migrate_tenant(env, tenant_id, new_pageserver)?;
|
||||
println!("tenant {tenant_id} migrated to {}", new_pageserver_id);
|
||||
}
|
||||
Some(("status", matches)) => {
|
||||
let tenant_id = get_tenant_id(matches, env)?;
|
||||
|
||||
let mut shard_table = comfy_table::Table::new();
|
||||
shard_table.set_header(["Shard", "Pageserver", "Physical Size"]);
|
||||
|
||||
let mut tenant_synthetic_size = None;
|
||||
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
for shard in attachment_service.tenant_locate(tenant_id).await?.shards {
|
||||
let pageserver =
|
||||
PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?);
|
||||
|
||||
let size = pageserver
|
||||
.http_client
|
||||
.tenant_details(shard.shard_id)
|
||||
.await?
|
||||
.tenant_info
|
||||
.current_physical_size
|
||||
.unwrap();
|
||||
|
||||
shard_table.add_row([
|
||||
format!("{}", shard.shard_id.shard_slug()),
|
||||
format!("{}", shard.node_id.0),
|
||||
format!("{} MiB", size / (1024 * 1024)),
|
||||
]);
|
||||
|
||||
if shard.shard_id.is_zero() {
|
||||
tenant_synthetic_size =
|
||||
Some(pageserver.tenant_synthetic_size(shard.shard_id).await?);
|
||||
}
|
||||
}
|
||||
|
||||
let Some(synthetic_size) = tenant_synthetic_size else {
|
||||
bail!("Shard 0 not found")
|
||||
};
|
||||
|
||||
let mut tenant_table = comfy_table::Table::new();
|
||||
tenant_table.add_row(["Tenant ID".to_string(), tenant_id.to_string()]);
|
||||
tenant_table.add_row([
|
||||
"Synthetic size".to_string(),
|
||||
format!("{} MiB", synthetic_size.size.unwrap_or(0) / (1024 * 1024)),
|
||||
]);
|
||||
|
||||
println!("{tenant_table}");
|
||||
println!("{shard_table}");
|
||||
}
|
||||
Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
|
||||
None => bail!("no tenant subcommand provided"),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
|
||||
fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
|
||||
let pageserver = get_default_pageserver(env);
|
||||
|
||||
match timeline_match.subcommand() {
|
||||
Some(("list", list_match)) => {
|
||||
// TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service
|
||||
// where shard 0 is attached, and query there.
|
||||
let tenant_shard_id = get_tenant_shard_id(list_match, env)?;
|
||||
let timelines = pageserver.timeline_list(&tenant_shard_id).await?;
|
||||
let tenant_id = get_tenant_id(list_match, env)?;
|
||||
let timelines = pageserver.timeline_list(&tenant_id)?;
|
||||
print_timelines_tree(timelines, env.timeline_name_mappings())?;
|
||||
}
|
||||
Some(("create", create_match)) => {
|
||||
@@ -603,20 +487,9 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
.copied()
|
||||
.context("Failed to parse postgres version from the argument string")?;
|
||||
|
||||
let new_timeline_id_opt = parse_timeline_id(create_match)?;
|
||||
let new_timeline_id = new_timeline_id_opt.unwrap_or(TimelineId::generate());
|
||||
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
let create_req = TimelineCreateRequest {
|
||||
new_timeline_id,
|
||||
ancestor_timeline_id: None,
|
||||
existing_initdb_timeline_id: None,
|
||||
ancestor_start_lsn: None,
|
||||
pg_version: Some(pg_version),
|
||||
};
|
||||
let timeline_info = attachment_service
|
||||
.tenant_timeline_create(tenant_id, create_req)
|
||||
.await?;
|
||||
let timeline_info =
|
||||
pageserver.timeline_create(tenant_id, None, None, None, Some(pg_version))?;
|
||||
let new_timeline_id = timeline_info.timeline_id;
|
||||
|
||||
let last_record_lsn = timeline_info.last_record_lsn;
|
||||
env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?;
|
||||
@@ -660,9 +533,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
|
||||
let mut cplane = ComputeControlPlane::load(env.clone())?;
|
||||
println!("Importing timeline into pageserver ...");
|
||||
pageserver
|
||||
.timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)
|
||||
.await?;
|
||||
pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)?;
|
||||
env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?;
|
||||
|
||||
println!("Creating endpoint for imported timeline ...");
|
||||
@@ -674,6 +545,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
None,
|
||||
pg_version,
|
||||
ComputeMode::Primary,
|
||||
DEFAULT_PAGESERVER_ID,
|
||||
)?;
|
||||
println!("Done");
|
||||
}
|
||||
@@ -697,18 +569,14 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
.map(|lsn_str| Lsn::from_str(lsn_str))
|
||||
.transpose()
|
||||
.context("Failed to parse ancestor start Lsn from the request")?;
|
||||
let new_timeline_id = TimelineId::generate();
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
let create_req = TimelineCreateRequest {
|
||||
new_timeline_id,
|
||||
ancestor_timeline_id: Some(ancestor_timeline_id),
|
||||
existing_initdb_timeline_id: None,
|
||||
ancestor_start_lsn: start_lsn,
|
||||
pg_version: None,
|
||||
};
|
||||
let timeline_info = attachment_service
|
||||
.tenant_timeline_create(tenant_id, create_req)
|
||||
.await?;
|
||||
let timeline_info = pageserver.timeline_create(
|
||||
tenant_id,
|
||||
None,
|
||||
start_lsn,
|
||||
Some(ancestor_timeline_id),
|
||||
None,
|
||||
)?;
|
||||
let new_timeline_id = timeline_info.timeline_id;
|
||||
|
||||
let last_record_lsn = timeline_info.last_record_lsn;
|
||||
|
||||
@@ -726,24 +594,22 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
let (sub_name, sub_args) = match ep_match.subcommand() {
|
||||
Some(ep_subcommand_data) => ep_subcommand_data,
|
||||
None => bail!("no endpoint subcommand provided"),
|
||||
};
|
||||
let mut cplane = ComputeControlPlane::load(env.clone())?;
|
||||
|
||||
// All subcommands take an optional --tenant-id option
|
||||
let tenant_id = get_tenant_id(sub_args, env)?;
|
||||
|
||||
match sub_name {
|
||||
"list" => {
|
||||
// TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service
|
||||
// where shard 0 is attached, and query there.
|
||||
let tenant_shard_id = get_tenant_shard_id(sub_args, env)?;
|
||||
let timeline_infos = get_timeline_infos(env, &tenant_shard_id)
|
||||
.await
|
||||
.unwrap_or_else(|e| {
|
||||
eprintln!("Failed to load timeline info: {}", e);
|
||||
HashMap::new()
|
||||
});
|
||||
let timeline_infos = get_timeline_infos(env, &tenant_id).unwrap_or_else(|e| {
|
||||
eprintln!("Failed to load timeline info: {}", e);
|
||||
HashMap::new()
|
||||
});
|
||||
|
||||
let timeline_name_mappings = env.timeline_name_mappings();
|
||||
|
||||
@@ -763,7 +629,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
for (endpoint_id, endpoint) in cplane
|
||||
.endpoints
|
||||
.iter()
|
||||
.filter(|(_, endpoint)| endpoint.tenant_id == tenant_shard_id.tenant_id)
|
||||
.filter(|(_, endpoint)| endpoint.tenant_id == tenant_id)
|
||||
{
|
||||
let lsn_str = match endpoint.mode {
|
||||
ComputeMode::Static(lsn) => {
|
||||
@@ -782,10 +648,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
};
|
||||
|
||||
let branch_name = timeline_name_mappings
|
||||
.get(&TenantTimelineId::new(
|
||||
tenant_shard_id.tenant_id,
|
||||
endpoint.timeline_id,
|
||||
))
|
||||
.get(&TenantTimelineId::new(tenant_id, endpoint.timeline_id))
|
||||
.map(|name| name.as_str())
|
||||
.unwrap_or("?");
|
||||
|
||||
@@ -802,7 +665,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
println!("{table}");
|
||||
}
|
||||
"create" => {
|
||||
let tenant_id = get_tenant_id(sub_args, env)?;
|
||||
let branch_name = sub_args
|
||||
.get_one::<String>("branch-name")
|
||||
.map(|s| s.as_str())
|
||||
@@ -833,6 +695,13 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
.copied()
|
||||
.unwrap_or(false);
|
||||
|
||||
let pageserver_id =
|
||||
if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
|
||||
NodeId(id_str.parse().context("while parsing pageserver id")?)
|
||||
} else {
|
||||
DEFAULT_PAGESERVER_ID
|
||||
};
|
||||
|
||||
let mode = match (lsn, hot_standby) {
|
||||
(Some(lsn), false) => ComputeMode::Static(lsn),
|
||||
(None, true) => ComputeMode::Replica,
|
||||
@@ -840,18 +709,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
(Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
|
||||
};
|
||||
|
||||
match (mode, hot_standby) {
|
||||
(ComputeMode::Static(_), true) => {
|
||||
bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
|
||||
}
|
||||
(ComputeMode::Primary, true) => {
|
||||
bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?;
|
||||
|
||||
cplane.new_endpoint(
|
||||
&endpoint_id,
|
||||
tenant_id,
|
||||
@@ -860,20 +717,21 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
http_port,
|
||||
pg_version,
|
||||
mode,
|
||||
pageserver_id,
|
||||
)?;
|
||||
}
|
||||
"start" => {
|
||||
let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
|
||||
let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
|
||||
let endpoint_id = sub_args
|
||||
.get_one::<String>("endpoint_id")
|
||||
.ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;
|
||||
|
||||
let pageserver_id =
|
||||
if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
|
||||
Some(NodeId(
|
||||
id_str.parse().context("while parsing pageserver id")?,
|
||||
))
|
||||
NodeId(id_str.parse().context("while parsing pageserver id")?)
|
||||
} else {
|
||||
None
|
||||
DEFAULT_PAGESERVER_ID
|
||||
};
|
||||
|
||||
let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
|
||||
@@ -893,67 +751,80 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
env.safekeepers.iter().map(|sk| sk.id).collect()
|
||||
};
|
||||
|
||||
let endpoint = cplane
|
||||
.endpoints
|
||||
.get(endpoint_id.as_str())
|
||||
.ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?;
|
||||
let endpoint = cplane.endpoints.get(endpoint_id.as_str());
|
||||
|
||||
cplane.check_conflicting_endpoints(
|
||||
endpoint.mode,
|
||||
endpoint.tenant_id,
|
||||
endpoint.timeline_id,
|
||||
)?;
|
||||
|
||||
let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
|
||||
let conf = env.get_pageserver_conf(pageserver_id).unwrap();
|
||||
let parsed = parse_host_port(&conf.listen_pg_addr).expect("Bad config");
|
||||
(
|
||||
vec![(parsed.0, parsed.1.unwrap_or(5432))],
|
||||
// If caller is telling us what pageserver to use, this is not a tenant which is
|
||||
// full managed by attachment service, therefore not sharded.
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
} else {
|
||||
// Look up the currently attached location of the tenant, and its striping metadata,
|
||||
// to pass these on to postgres.
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
let locate_result = attachment_service.tenant_locate(endpoint.tenant_id).await?;
|
||||
let pageservers = locate_result
|
||||
.shards
|
||||
.into_iter()
|
||||
.map(|shard| {
|
||||
(
|
||||
Host::parse(&shard.listen_pg_addr)
|
||||
.expect("Attachment service reported bad hostname"),
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
let stripe_size = locate_result.shard_params.stripe_size;
|
||||
|
||||
(pageservers, stripe_size)
|
||||
};
|
||||
assert!(!pageservers.is_empty());
|
||||
|
||||
let ps_conf = env.get_pageserver_conf(DEFAULT_PAGESERVER_ID)?;
|
||||
let ps_conf = env.get_pageserver_conf(pageserver_id)?;
|
||||
let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
|
||||
let claims = Claims::new(Some(endpoint.tenant_id), Scope::Tenant);
|
||||
let claims = Claims::new(Some(tenant_id), Scope::Tenant);
|
||||
|
||||
Some(env.generate_auth_token(&claims)?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
println!("Starting existing endpoint {endpoint_id}...");
|
||||
endpoint
|
||||
.start(
|
||||
&auth_token,
|
||||
safekeepers,
|
||||
pageservers,
|
||||
remote_ext_config,
|
||||
stripe_size.0 as usize,
|
||||
)
|
||||
.await?;
|
||||
let hot_standby = sub_args
|
||||
.get_one::<bool>("hot-standby")
|
||||
.copied()
|
||||
.unwrap_or(false);
|
||||
|
||||
if let Some(endpoint) = endpoint {
|
||||
match (&endpoint.mode, hot_standby) {
|
||||
(ComputeMode::Static(_), true) => {
|
||||
bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
|
||||
}
|
||||
(ComputeMode::Primary, true) => {
|
||||
bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
println!("Starting existing endpoint {endpoint_id}...");
|
||||
endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
|
||||
} else {
|
||||
let branch_name = sub_args
|
||||
.get_one::<String>("branch-name")
|
||||
.map(|s| s.as_str())
|
||||
.unwrap_or(DEFAULT_BRANCH_NAME);
|
||||
let timeline_id = env
|
||||
.get_branch_timeline_id(branch_name, tenant_id)
|
||||
.ok_or_else(|| {
|
||||
anyhow!("Found no timeline id for branch name '{branch_name}'")
|
||||
})?;
|
||||
let lsn = sub_args
|
||||
.get_one::<String>("lsn")
|
||||
.map(|lsn_str| Lsn::from_str(lsn_str))
|
||||
.transpose()
|
||||
.context("Failed to parse Lsn from the request")?;
|
||||
let pg_version = sub_args
|
||||
.get_one::<u32>("pg-version")
|
||||
.copied()
|
||||
.context("Failed to `pg-version` from the argument string")?;
|
||||
|
||||
let mode = match (lsn, hot_standby) {
|
||||
(Some(lsn), false) => ComputeMode::Static(lsn),
|
||||
(None, true) => ComputeMode::Replica,
|
||||
(None, false) => ComputeMode::Primary,
|
||||
(Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
|
||||
};
|
||||
|
||||
// when used with custom port this results in non obvious behaviour
|
||||
// port is remembered from first start command, i e
|
||||
// start --port X
|
||||
// stop
|
||||
// start <-- will also use port X even without explicit port argument
|
||||
println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ...");
|
||||
|
||||
let ep = cplane.new_endpoint(
|
||||
endpoint_id,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
pg_port,
|
||||
http_port,
|
||||
pg_version,
|
||||
mode,
|
||||
pageserver_id,
|
||||
)?;
|
||||
ep.start(&auth_token, safekeepers, remote_ext_config)?;
|
||||
}
|
||||
}
|
||||
"reconfigure" => {
|
||||
let endpoint_id = sub_args
|
||||
@@ -963,31 +834,15 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
.endpoints
|
||||
.get(endpoint_id.as_str())
|
||||
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
|
||||
let pageservers =
|
||||
let pageserver_id =
|
||||
if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
|
||||
let ps_id = NodeId(id_str.parse().context("while parsing pageserver id")?);
|
||||
let pageserver = PageServerNode::from_env(env, env.get_pageserver_conf(ps_id)?);
|
||||
vec![(
|
||||
pageserver.pg_connection_config.host().clone(),
|
||||
pageserver.pg_connection_config.port(),
|
||||
)]
|
||||
Some(NodeId(
|
||||
id_str.parse().context("while parsing pageserver id")?,
|
||||
))
|
||||
} else {
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
attachment_service
|
||||
.tenant_locate(endpoint.tenant_id)
|
||||
.await?
|
||||
.shards
|
||||
.into_iter()
|
||||
.map(|shard| {
|
||||
(
|
||||
Host::parse(&shard.listen_pg_addr)
|
||||
.expect("Attachment service reported malformed host"),
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
None
|
||||
};
|
||||
endpoint.reconfigure(pageservers).await?;
|
||||
endpoint.reconfigure(pageserver_id)?;
|
||||
}
|
||||
"stop" => {
|
||||
let endpoint_id = sub_args
|
||||
@@ -1053,13 +908,11 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
|
||||
))
|
||||
}
|
||||
|
||||
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
match sub_match.subcommand() {
|
||||
Some(("start", subcommand_args)) => {
|
||||
let register = subcommand_args.get_one::<bool>("register").unwrap_or(&true);
|
||||
if let Err(e) = get_pageserver(env, subcommand_args)?
|
||||
.start(&pageserver_config_overrides(subcommand_args), *register)
|
||||
.await
|
||||
.start(&pageserver_config_overrides(subcommand_args))
|
||||
{
|
||||
eprintln!("pageserver start failed: {e}");
|
||||
exit(1);
|
||||
@@ -1086,32 +939,28 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if let Err(e) = pageserver
|
||||
.start(&pageserver_config_overrides(subcommand_args), false)
|
||||
.await
|
||||
{
|
||||
if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) {
|
||||
eprintln!("pageserver start failed: {e}");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
Some(("set-state", subcommand_args)) => {
|
||||
Some(("migrate", subcommand_args)) => {
|
||||
let pageserver = get_pageserver(env, subcommand_args)?;
|
||||
let scheduling = subcommand_args.get_one("scheduling");
|
||||
let availability = subcommand_args.get_one("availability");
|
||||
//TODO what shutdown strategy should we use here?
|
||||
if let Err(e) = pageserver.stop(false) {
|
||||
eprintln!("pageserver stop failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
attachment_service
|
||||
.node_configure(NodeConfigureRequest {
|
||||
node_id: pageserver.conf.id,
|
||||
scheduling: scheduling.cloned(),
|
||||
availability: availability.cloned(),
|
||||
})
|
||||
.await?;
|
||||
if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) {
|
||||
eprintln!("pageserver start failed: {e}");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
Some(("status", subcommand_args)) => {
|
||||
match get_pageserver(env, subcommand_args)?.check_status().await {
|
||||
match get_pageserver(env, subcommand_args)?.check_status() {
|
||||
Ok(_) => println!("Page server is up and running"),
|
||||
Err(err) => {
|
||||
eprintln!("Page server is not available: {}", err);
|
||||
@@ -1126,14 +975,11 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_attachment_service(
|
||||
sub_match: &ArgMatches,
|
||||
env: &local_env::LocalEnv,
|
||||
) -> Result<()> {
|
||||
fn handle_attachment_service(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
let svc = AttachmentService::from_env(env);
|
||||
match sub_match.subcommand() {
|
||||
Some(("start", _start_match)) => {
|
||||
if let Err(e) = svc.start().await {
|
||||
if let Err(e) = svc.start() {
|
||||
eprintln!("start failed: {e}");
|
||||
exit(1);
|
||||
}
|
||||
@@ -1145,7 +991,7 @@ async fn handle_attachment_service(
|
||||
.map(|s| s.as_str())
|
||||
== Some("immediate");
|
||||
|
||||
if let Err(e) = svc.stop(immediate).await {
|
||||
if let Err(e) = svc.stop(immediate) {
|
||||
eprintln!("stop failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
@@ -1174,7 +1020,7 @@ fn safekeeper_extra_opts(init_match: &ArgMatches) -> Vec<String> {
|
||||
.collect()
|
||||
}
|
||||
|
||||
async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
let (sub_name, sub_args) = match sub_match.subcommand() {
|
||||
Some(safekeeper_command_data) => safekeeper_command_data,
|
||||
None => bail!("no safekeeper subcommand provided"),
|
||||
@@ -1192,7 +1038,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
"start" => {
|
||||
let extra_opts = safekeeper_extra_opts(sub_args);
|
||||
|
||||
if let Err(e) = safekeeper.start(extra_opts).await {
|
||||
if let Err(e) = safekeeper.start(extra_opts) {
|
||||
eprintln!("safekeeper start failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
@@ -1218,7 +1064,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
}
|
||||
|
||||
let extra_opts = safekeeper_extra_opts(sub_args);
|
||||
if let Err(e) = safekeeper.start(extra_opts).await {
|
||||
if let Err(e) = safekeeper.start(extra_opts) {
|
||||
eprintln!("safekeeper start failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
@@ -1231,54 +1077,51 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
// Endpoints are not started automatically
|
||||
|
||||
broker::start_broker_process(env).await?;
|
||||
broker::start_broker_process(env)?;
|
||||
|
||||
// Only start the attachment service if the pageserver is configured to need it
|
||||
if env.control_plane_api.is_some() {
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
if let Err(e) = attachment_service.start().await {
|
||||
if let Err(e) = attachment_service.start() {
|
||||
eprintln!("attachment_service start failed: {:#}", e);
|
||||
try_stop_all(env, true).await;
|
||||
try_stop_all(env, true);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
for ps_conf in &env.pageservers {
|
||||
let pageserver = PageServerNode::from_env(env, ps_conf);
|
||||
if let Err(e) = pageserver
|
||||
.start(&pageserver_config_overrides(sub_match), true)
|
||||
.await
|
||||
{
|
||||
if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
|
||||
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
|
||||
try_stop_all(env, true).await;
|
||||
try_stop_all(env, true);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
for node in env.safekeepers.iter() {
|
||||
let safekeeper = SafekeeperNode::from_env(env, node);
|
||||
if let Err(e) = safekeeper.start(vec![]).await {
|
||||
if let Err(e) = safekeeper.start(vec![]) {
|
||||
eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
|
||||
try_stop_all(env, false).await;
|
||||
try_stop_all(env, false);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
let immediate =
|
||||
sub_match.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
|
||||
|
||||
try_stop_all(env, immediate).await;
|
||||
try_stop_all(env, immediate);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
|
||||
fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
|
||||
// Stop all endpoints
|
||||
match ComputeControlPlane::load(env.clone()) {
|
||||
Ok(cplane) => {
|
||||
@@ -1313,7 +1156,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
|
||||
|
||||
if env.control_plane_api.is_some() {
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
if let Err(e) = attachment_service.stop(immediate).await {
|
||||
if let Err(e) = attachment_service.stop(immediate) {
|
||||
eprintln!("attachment service stop failed: {e:#}");
|
||||
}
|
||||
}
|
||||
@@ -1402,7 +1245,7 @@ fn cli() -> Command {
|
||||
let remote_ext_config_args = Arg::new("remote-ext-config")
|
||||
.long("remote-ext-config")
|
||||
.num_args(1)
|
||||
.help("Configure the remote extensions storage proxy gateway to request for extensions.")
|
||||
.help("Configure the S3 bucket that we search for extensions in.")
|
||||
.required(false);
|
||||
|
||||
let lsn_arg = Arg::new("lsn")
|
||||
@@ -1417,15 +1260,9 @@ fn cli() -> Command {
|
||||
.required(false);
|
||||
|
||||
let force_arg = Arg::new("force")
|
||||
.value_parser(value_parser!(InitForceMode))
|
||||
.value_parser(value_parser!(bool))
|
||||
.long("force")
|
||||
.default_value(
|
||||
InitForceMode::MustNotExist
|
||||
.to_possible_value()
|
||||
.unwrap()
|
||||
.get_name()
|
||||
.to_owned(),
|
||||
)
|
||||
.action(ArgAction::SetTrue)
|
||||
.help("Force initialization even if the repository is not empty")
|
||||
.required(false);
|
||||
|
||||
@@ -1471,7 +1308,6 @@ fn cli() -> Command {
|
||||
.subcommand(Command::new("create")
|
||||
.about("Create a new blank timeline")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(timeline_id_arg.clone())
|
||||
.arg(branch_name_arg.clone())
|
||||
.arg(pg_version_arg.clone())
|
||||
)
|
||||
@@ -1509,8 +1345,6 @@ fn cli() -> Command {
|
||||
.arg(pg_version_arg.clone())
|
||||
.arg(Arg::new("set-default").long("set-default").action(ArgAction::SetTrue).required(false)
|
||||
.help("Use this tenant in future CLI commands where tenant_id is needed, but not specified"))
|
||||
.arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
|
||||
.arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages"))
|
||||
)
|
||||
.subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true))
|
||||
.about("Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified"))
|
||||
@@ -1521,9 +1355,6 @@ fn cli() -> Command {
|
||||
.about("Migrate a tenant from one pageserver to another")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(pageserver_id_arg.clone()))
|
||||
.subcommand(Command::new("status")
|
||||
.about("Human readable summary of the tenant's shards and attachment locations")
|
||||
.arg(tenant_id_arg.clone()))
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("pageserver")
|
||||
@@ -1533,11 +1364,7 @@ fn cli() -> Command {
|
||||
.subcommand(Command::new("status"))
|
||||
.subcommand(Command::new("start")
|
||||
.about("Start local pageserver")
|
||||
.arg(pageserver_config_args.clone()).arg(Arg::new("register")
|
||||
.long("register")
|
||||
.default_value("true").required(false)
|
||||
.value_parser(value_parser!(bool))
|
||||
.value_name("register"))
|
||||
.arg(pageserver_config_args.clone())
|
||||
)
|
||||
.subcommand(Command::new("stop")
|
||||
.about("Stop local pageserver")
|
||||
@@ -1547,12 +1374,6 @@ fn cli() -> Command {
|
||||
.about("Restart local pageserver")
|
||||
.arg(pageserver_config_args.clone())
|
||||
)
|
||||
.subcommand(Command::new("set-state")
|
||||
.arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active"))
|
||||
.arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active"))
|
||||
.about("Set scheduling or availability state of pageserver node")
|
||||
.arg(pageserver_config_args.clone())
|
||||
)
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("attachment_service")
|
||||
@@ -1608,7 +1429,15 @@ fn cli() -> Command {
|
||||
.subcommand(Command::new("start")
|
||||
.about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
|
||||
.arg(endpoint_id_arg.clone())
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(branch_name_arg.clone())
|
||||
.arg(timeline_id_arg.clone())
|
||||
.arg(lsn_arg)
|
||||
.arg(pg_port_arg)
|
||||
.arg(http_port_arg)
|
||||
.arg(endpoint_pageserver_id_arg.clone())
|
||||
.arg(pg_version_arg)
|
||||
.arg(hot_standby_arg)
|
||||
.arg(safekeepers_arg)
|
||||
.arg(remote_ext_config_args)
|
||||
)
|
||||
@@ -1621,6 +1450,7 @@ fn cli() -> Command {
|
||||
.subcommand(
|
||||
Command::new("stop")
|
||||
.arg(endpoint_id_arg)
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(
|
||||
Arg::new("destroy")
|
||||
.help("Also delete data directory (now optional, should be default in future)")
|
||||
|
||||
@@ -11,7 +11,7 @@ use camino::Utf8PathBuf;
|
||||
|
||||
use crate::{background_process, local_env};
|
||||
|
||||
pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
let broker = &env.broker;
|
||||
let listen_addr = &broker.listen_addr;
|
||||
|
||||
@@ -19,15 +19,15 @@ pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<(
|
||||
|
||||
let args = [format!("--listen-addr={listen_addr}")];
|
||||
|
||||
let client = reqwest::Client::new();
|
||||
let client = reqwest::blocking::Client::new();
|
||||
background_process::start_process(
|
||||
"storage_broker",
|
||||
&env.base_data_dir,
|
||||
&env.storage_broker_bin(),
|
||||
args,
|
||||
[],
|
||||
background_process::InitialPidFile::Create(storage_broker_pid_file_path(env)),
|
||||
|| async {
|
||||
background_process::InitialPidFile::Create(&storage_broker_pid_file_path(env)),
|
||||
|| {
|
||||
let url = broker.client_url();
|
||||
let status_url = url.join("status").with_context(|| {
|
||||
format!("Failed to append /status path to broker endpoint {url}")
|
||||
@@ -36,13 +36,12 @@ pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<(
|
||||
.get(status_url)
|
||||
.build()
|
||||
.with_context(|| format!("Failed to construct request to broker endpoint {url}"))?;
|
||||
match client.execute(request).await {
|
||||
match client.execute(request) {
|
||||
Ok(resp) => Ok(resp.status().is_success()),
|
||||
Err(_) => Ok(false),
|
||||
}
|
||||
},
|
||||
)
|
||||
.await
|
||||
.context("Failed to spawn storage_broker subprocess")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -45,19 +45,15 @@ use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use compute_api::spec::RemoteExtSpec;
|
||||
use nix::sys::signal::kill;
|
||||
use nix::sys::signal::Signal;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use url::Host;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
|
||||
use crate::attachment_service::AttachmentService;
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::pageserver::PageServerNode;
|
||||
use crate::postgresql_conf::PostgresConf;
|
||||
|
||||
use compute_api::responses::{ComputeState, ComputeStatus};
|
||||
use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec};
|
||||
use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};
|
||||
|
||||
// contents of a endpoint.json file
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
@@ -70,7 +66,7 @@ pub struct EndpointConf {
|
||||
http_port: u16,
|
||||
pg_version: u32,
|
||||
skip_pg_catalog_updates: bool,
|
||||
features: Vec<ComputeFeature>,
|
||||
pageserver_id: NodeId,
|
||||
}
|
||||
|
||||
//
|
||||
@@ -122,14 +118,18 @@ impl ComputeControlPlane {
|
||||
http_port: Option<u16>,
|
||||
pg_version: u32,
|
||||
mode: ComputeMode,
|
||||
pageserver_id: NodeId,
|
||||
) -> Result<Arc<Endpoint>> {
|
||||
let pg_port = pg_port.unwrap_or_else(|| self.get_port());
|
||||
let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
|
||||
let pageserver =
|
||||
PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
|
||||
let ep = Arc::new(Endpoint {
|
||||
endpoint_id: endpoint_id.to_owned(),
|
||||
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
|
||||
http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port),
|
||||
env: self.env.clone(),
|
||||
pageserver,
|
||||
timeline_id,
|
||||
mode,
|
||||
tenant_id,
|
||||
@@ -141,7 +141,6 @@ impl ComputeControlPlane {
|
||||
// with this we basically test a case of waking up an idle compute, where
|
||||
// we also skip catalog updates in the cloud.
|
||||
skip_pg_catalog_updates: true,
|
||||
features: vec![],
|
||||
});
|
||||
|
||||
ep.create_endpoint_dir()?;
|
||||
@@ -156,7 +155,7 @@ impl ComputeControlPlane {
|
||||
pg_port,
|
||||
pg_version,
|
||||
skip_pg_catalog_updates: true,
|
||||
features: vec![],
|
||||
pageserver_id,
|
||||
})?,
|
||||
)?;
|
||||
std::fs::write(
|
||||
@@ -169,30 +168,6 @@ impl ComputeControlPlane {
|
||||
|
||||
Ok(ep)
|
||||
}
|
||||
|
||||
pub fn check_conflicting_endpoints(
|
||||
&self,
|
||||
mode: ComputeMode,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<()> {
|
||||
if matches!(mode, ComputeMode::Primary) {
|
||||
// this check is not complete, as you could have a concurrent attempt at
|
||||
// creating another primary, both reading the state before checking it here,
|
||||
// but it's better than nothing.
|
||||
let mut duplicates = self.endpoints.iter().filter(|(_k, v)| {
|
||||
v.tenant_id == tenant_id
|
||||
&& v.timeline_id == timeline_id
|
||||
&& v.mode == mode
|
||||
&& v.status() != "stopped"
|
||||
});
|
||||
|
||||
if let Some((key, _)) = duplicates.next() {
|
||||
bail!("attempting to create a duplicate primary endpoint on tenant {tenant_id}, timeline {timeline_id}: endpoint {key:?} exists already. please don't do this, it is not supported.");
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
@@ -215,12 +190,10 @@ pub struct Endpoint {
|
||||
// These are not part of the endpoint as such, but the environment
|
||||
// the endpoint runs in.
|
||||
pub env: LocalEnv,
|
||||
pageserver: PageServerNode,
|
||||
|
||||
// Optimizations
|
||||
skip_pg_catalog_updates: bool,
|
||||
|
||||
// Feature flags
|
||||
features: Vec<ComputeFeature>,
|
||||
}
|
||||
|
||||
impl Endpoint {
|
||||
@@ -240,17 +213,20 @@ impl Endpoint {
|
||||
let conf: EndpointConf =
|
||||
serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;
|
||||
|
||||
let pageserver =
|
||||
PageServerNode::from_env(env, env.get_pageserver_conf(conf.pageserver_id)?);
|
||||
|
||||
Ok(Endpoint {
|
||||
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port),
|
||||
http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port),
|
||||
endpoint_id,
|
||||
env: env.clone(),
|
||||
pageserver,
|
||||
timeline_id: conf.timeline_id,
|
||||
mode: conf.mode,
|
||||
tenant_id: conf.tenant_id,
|
||||
pg_version: conf.pg_version,
|
||||
skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
|
||||
features: conf.features,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -437,14 +413,11 @@ impl Endpoint {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> {
|
||||
// TODO use background_process::stop_process instead: https://github.com/neondatabase/neon/pull/6482
|
||||
fn wait_for_compute_ctl_to_exit(&self) -> Result<()> {
|
||||
// TODO use background_process::stop_process instead
|
||||
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
|
||||
let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
|
||||
let pid = nix::unistd::Pid::from_raw(pid as i32);
|
||||
if send_sigterm {
|
||||
kill(pid, Signal::SIGTERM).ok();
|
||||
}
|
||||
crate::background_process::wait_until_stopped("compute_ctl", pid)?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -465,21 +438,11 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
|
||||
fn build_pageserver_connstr(pageservers: &[(Host, u16)]) -> String {
|
||||
pageservers
|
||||
.iter()
|
||||
.map(|(host, port)| format!("postgresql://no_user@{host}:{port}"))
|
||||
.collect::<Vec<_>>()
|
||||
.join(",")
|
||||
}
|
||||
|
||||
pub async fn start(
|
||||
pub fn start(
|
||||
&self,
|
||||
auth_token: &Option<String>,
|
||||
safekeepers: Vec<NodeId>,
|
||||
pageservers: Vec<(Host, u16)>,
|
||||
remote_ext_config: Option<&String>,
|
||||
shard_stripe_size: usize,
|
||||
) -> Result<()> {
|
||||
if self.status() == "running" {
|
||||
anyhow::bail!("The endpoint is already running");
|
||||
@@ -493,9 +456,13 @@ impl Endpoint {
|
||||
std::fs::remove_dir_all(self.pgdata())?;
|
||||
}
|
||||
|
||||
let pageserver_connstring = Self::build_pageserver_connstr(&pageservers);
|
||||
assert!(!pageserver_connstring.is_empty());
|
||||
let pageserver_connstring = {
|
||||
let config = &self.pageserver.pg_connection_config;
|
||||
let (host, port) = (config.host(), config.port());
|
||||
|
||||
// NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
|
||||
format!("postgresql://no_user@{host}:{port}")
|
||||
};
|
||||
let mut safekeeper_connstrings = Vec::new();
|
||||
if self.mode == ComputeMode::Primary {
|
||||
for sk_id in safekeepers {
|
||||
@@ -509,24 +476,11 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
|
||||
// check for file remote_extensions_spec.json
|
||||
// if it is present, read it and pass to compute_ctl
|
||||
let remote_extensions_spec_path = self.endpoint_path().join("remote_extensions_spec.json");
|
||||
let remote_extensions_spec = std::fs::File::open(remote_extensions_spec_path);
|
||||
let remote_extensions: Option<RemoteExtSpec>;
|
||||
|
||||
if let Ok(spec_file) = remote_extensions_spec {
|
||||
remote_extensions = serde_json::from_reader(spec_file).ok();
|
||||
} else {
|
||||
remote_extensions = None;
|
||||
};
|
||||
|
||||
// Create spec file
|
||||
let spec = ComputeSpec {
|
||||
skip_pg_catalog_updates: self.skip_pg_catalog_updates,
|
||||
format_version: 1.0,
|
||||
operation_uuid: None,
|
||||
features: self.features.clone(),
|
||||
cluster: Cluster {
|
||||
cluster_id: None, // project ID: not used
|
||||
name: None, // project name: not used
|
||||
@@ -543,9 +497,7 @@ impl Endpoint {
|
||||
pageserver_connstring: Some(pageserver_connstring),
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token: auth_token.clone(),
|
||||
remote_extensions,
|
||||
pgbouncer_settings: None,
|
||||
shard_stripe_size: Some(shard_stripe_size),
|
||||
remote_extensions: None,
|
||||
};
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
|
||||
@@ -583,21 +535,9 @@ impl Endpoint {
|
||||
}
|
||||
|
||||
let child = cmd.spawn()?;
|
||||
// set up a scopeguard to kill & wait for the child in case we panic or bail below
|
||||
let child = scopeguard::guard(child, |mut child| {
|
||||
println!("SIGKILL & wait the started process");
|
||||
(|| {
|
||||
// TODO: use another signal that can be caught by the child so it can clean up any children it spawned
|
||||
child.kill().context("SIGKILL child")?;
|
||||
child.wait().context("wait() for child process")?;
|
||||
anyhow::Ok(())
|
||||
})()
|
||||
.with_context(|| format!("scopeguard kill&wait child {child:?}"))
|
||||
.unwrap();
|
||||
});
|
||||
|
||||
// Write down the pid so we can wait for it when we want to stop
|
||||
// TODO use background_process::start_process instead: https://github.com/neondatabase/neon/pull/6482
|
||||
// TODO use background_process::start_process instead
|
||||
let pid = child.id();
|
||||
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
|
||||
std::fs::write(pidfile_path, pid.to_string())?;
|
||||
@@ -608,7 +548,7 @@ impl Endpoint {
|
||||
const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s
|
||||
loop {
|
||||
attempt += 1;
|
||||
match self.get_status().await {
|
||||
match self.get_status() {
|
||||
Ok(state) => {
|
||||
match state.status {
|
||||
ComputeStatus::Init => {
|
||||
@@ -646,15 +586,12 @@ impl Endpoint {
|
||||
std::thread::sleep(ATTEMPT_INTERVAL);
|
||||
}
|
||||
|
||||
// disarm the scopeguard, let the child outlive this function (and neon_local invoction)
|
||||
drop(scopeguard::ScopeGuard::into_inner(child));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Call the /status HTTP API
|
||||
pub async fn get_status(&self) -> Result<ComputeState> {
|
||||
let client = reqwest::Client::new();
|
||||
pub fn get_status(&self) -> Result<ComputeState> {
|
||||
let client = reqwest::blocking::Client::new();
|
||||
|
||||
let response = client
|
||||
.request(
|
||||
@@ -665,17 +602,16 @@ impl Endpoint {
|
||||
self.http_address.port()
|
||||
),
|
||||
)
|
||||
.send()
|
||||
.await?;
|
||||
.send()?;
|
||||
|
||||
// Interpret the response
|
||||
let status = response.status();
|
||||
if !(status.is_client_error() || status.is_server_error()) {
|
||||
Ok(response.json().await?)
|
||||
Ok(response.json()?)
|
||||
} else {
|
||||
// reqwest does not export its error construction utility functions, so let's craft the message ourselves
|
||||
let url = response.url().to_owned();
|
||||
let msg = match response.text().await {
|
||||
let msg = match response.text() {
|
||||
Ok(err_body) => format!("Error: {}", err_body),
|
||||
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
|
||||
};
|
||||
@@ -683,7 +619,7 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn reconfigure(&self, mut pageservers: Vec<(Host, u16)>) -> Result<()> {
|
||||
pub fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
|
||||
let mut spec: ComputeSpec = {
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
let file = std::fs::File::open(spec_path)?;
|
||||
@@ -693,28 +629,26 @@ impl Endpoint {
|
||||
let postgresql_conf = self.read_postgresql_conf()?;
|
||||
spec.cluster.postgresql_conf = Some(postgresql_conf);
|
||||
|
||||
// If we weren't given explicit pageservers, query the attachment service
|
||||
if pageservers.is_empty() {
|
||||
let attachment_service = AttachmentService::from_env(&self.env);
|
||||
let locate_result = attachment_service.tenant_locate(self.tenant_id).await?;
|
||||
pageservers = locate_result
|
||||
.shards
|
||||
.into_iter()
|
||||
.map(|shard| {
|
||||
(
|
||||
Host::parse(&shard.listen_pg_addr)
|
||||
.expect("Attachment service reported bad hostname"),
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
if let Some(pageserver_id) = pageserver_id {
|
||||
let endpoint_config_path = self.endpoint_path().join("endpoint.json");
|
||||
let mut endpoint_conf: EndpointConf = {
|
||||
let file = std::fs::File::open(&endpoint_config_path)?;
|
||||
serde_json::from_reader(file)?
|
||||
};
|
||||
endpoint_conf.pageserver_id = pageserver_id;
|
||||
std::fs::write(
|
||||
endpoint_config_path,
|
||||
serde_json::to_string_pretty(&endpoint_conf)?,
|
||||
)?;
|
||||
|
||||
let pageserver =
|
||||
PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
|
||||
let ps_http_conf = &pageserver.pg_connection_config;
|
||||
let (host, port) = (ps_http_conf.host(), ps_http_conf.port());
|
||||
spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}"));
|
||||
}
|
||||
|
||||
let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
|
||||
assert!(!pageserver_connstr.is_empty());
|
||||
spec.pageserver_connstring = Some(pageserver_connstr);
|
||||
|
||||
let client = reqwest::Client::new();
|
||||
let client = reqwest::blocking::Client::new();
|
||||
let response = client
|
||||
.post(format!(
|
||||
"http://{}:{}/configure",
|
||||
@@ -725,15 +659,14 @@ impl Endpoint {
|
||||
"{{\"spec\":{}}}",
|
||||
serde_json::to_string_pretty(&spec)?
|
||||
))
|
||||
.send()
|
||||
.await?;
|
||||
.send()?;
|
||||
|
||||
let status = response.status();
|
||||
if !(status.is_client_error() || status.is_server_error()) {
|
||||
Ok(())
|
||||
} else {
|
||||
let url = response.url().to_owned();
|
||||
let msg = match response.text().await {
|
||||
let msg = match response.text() {
|
||||
Ok(err_body) => format!("Error: {}", err_body),
|
||||
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
|
||||
};
|
||||
@@ -758,15 +691,10 @@ impl Endpoint {
|
||||
&None,
|
||||
)?;
|
||||
|
||||
// Also wait for the compute_ctl process to die. It might have some
|
||||
// cleanup work to do after postgres stops, like syncing safekeepers,
|
||||
// etc.
|
||||
// Also wait for the compute_ctl process to die. It might have some cleanup
|
||||
// work to do after postgres stops, like syncing safekeepers, etc.
|
||||
//
|
||||
// If destroying, send it SIGTERM before waiting. Sometimes we do *not*
|
||||
// want this cleanup: tests intentionally do stop when majority of
|
||||
// safekeepers is down, so sync-safekeepers would hang otherwise. This
|
||||
// could be a separate flag though.
|
||||
self.wait_for_compute_ctl_to_exit(destroy)?;
|
||||
self.wait_for_compute_ctl_to_exit()?;
|
||||
if destroy {
|
||||
println!(
|
||||
"Destroying postgres data directory '{}'",
|
||||
|
||||
@@ -14,3 +14,4 @@ pub mod local_env;
|
||||
pub mod pageserver;
|
||||
pub mod postgresql_conf;
|
||||
pub mod safekeeper;
|
||||
pub mod tenant_migration;
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
|
||||
use clap::ValueEnum;
|
||||
use postgres_backend::AuthType;
|
||||
use reqwest::Url;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -163,31 +162,6 @@ impl Default for SafekeeperConf {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum InitForceMode {
|
||||
MustNotExist,
|
||||
EmptyDirOk,
|
||||
RemoveAllContents,
|
||||
}
|
||||
|
||||
impl ValueEnum for InitForceMode {
|
||||
fn value_variants<'a>() -> &'a [Self] {
|
||||
&[
|
||||
Self::MustNotExist,
|
||||
Self::EmptyDirOk,
|
||||
Self::RemoveAllContents,
|
||||
]
|
||||
}
|
||||
|
||||
fn to_possible_value(&self) -> Option<clap::builder::PossibleValue> {
|
||||
Some(clap::builder::PossibleValue::new(match self {
|
||||
InitForceMode::MustNotExist => "must-not-exist",
|
||||
InitForceMode::EmptyDirOk => "empty-dir-ok",
|
||||
InitForceMode::RemoveAllContents => "remove-all-contents",
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
impl SafekeeperConf {
|
||||
/// Compute is served by port on which only tenant scoped tokens allowed, if
|
||||
/// it is configured.
|
||||
@@ -223,11 +197,7 @@ impl LocalEnv {
|
||||
}
|
||||
|
||||
pub fn attachment_service_bin(&self) -> PathBuf {
|
||||
// Irrespective of configuration, attachment service binary is always
|
||||
// run from the same location as neon_local. This means that for compatibility
|
||||
// tests that run old pageserver/safekeeper, they still run latest attachment service.
|
||||
let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned();
|
||||
neon_local_bin_dir.join("attachment_service")
|
||||
self.neon_distrib_dir.join("attachment_service")
|
||||
}
|
||||
|
||||
pub fn safekeeper_bin(&self) -> PathBuf {
|
||||
@@ -255,13 +225,7 @@ impl LocalEnv {
|
||||
if let Some(conf) = self.pageservers.iter().find(|node| node.id == id) {
|
||||
Ok(conf)
|
||||
} else {
|
||||
let have_ids = self
|
||||
.pageservers
|
||||
.iter()
|
||||
.map(|node| format!("{}:{}", node.id, node.listen_http_addr))
|
||||
.collect::<Vec<_>>();
|
||||
let joined = have_ids.join(",");
|
||||
bail!("could not find pageserver {id}, have ids {joined}")
|
||||
bail!("could not find pageserver {id}")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -420,7 +384,7 @@ impl LocalEnv {
|
||||
//
|
||||
// Initialize a new Neon repository
|
||||
//
|
||||
pub fn init(&mut self, pg_version: u32, force: &InitForceMode) -> anyhow::Result<()> {
|
||||
pub fn init(&mut self, pg_version: u32, force: bool) -> anyhow::Result<()> {
|
||||
// check if config already exists
|
||||
let base_path = &self.base_data_dir;
|
||||
ensure!(
|
||||
@@ -429,34 +393,25 @@ impl LocalEnv {
|
||||
);
|
||||
|
||||
if base_path.exists() {
|
||||
match force {
|
||||
InitForceMode::MustNotExist => {
|
||||
bail!(
|
||||
"directory '{}' already exists. Perhaps already initialized?",
|
||||
base_path.display()
|
||||
);
|
||||
}
|
||||
InitForceMode::EmptyDirOk => {
|
||||
if let Some(res) = std::fs::read_dir(base_path)?.next() {
|
||||
res.context("check if directory is empty")?;
|
||||
anyhow::bail!("directory not empty: {base_path:?}");
|
||||
}
|
||||
}
|
||||
InitForceMode::RemoveAllContents => {
|
||||
println!("removing all contents of '{}'", base_path.display());
|
||||
// instead of directly calling `remove_dir_all`, we keep the original dir but removing
|
||||
// all contents inside. This helps if the developer symbol links another directory (i.e.,
|
||||
// S3 local SSD) to the `.neon` base directory.
|
||||
for entry in std::fs::read_dir(base_path)? {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
if path.is_dir() {
|
||||
fs::remove_dir_all(&path)?;
|
||||
} else {
|
||||
fs::remove_file(&path)?;
|
||||
}
|
||||
if force {
|
||||
println!("removing all contents of '{}'", base_path.display());
|
||||
// instead of directly calling `remove_dir_all`, we keep the original dir but removing
|
||||
// all contents inside. This helps if the developer symbol links another directory (i.e.,
|
||||
// S3 local SSD) to the `.neon` base directory.
|
||||
for entry in std::fs::read_dir(base_path)? {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
if path.is_dir() {
|
||||
fs::remove_dir_all(&path)?;
|
||||
} else {
|
||||
fs::remove_file(&path)?;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
bail!(
|
||||
"directory '{}' already exists. Perhaps already initialized? (Hint: use --force to remove all contents)",
|
||||
base_path.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -6,37 +6,76 @@
|
||||
//!
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::fs::File;
|
||||
use std::io::{BufReader, Write};
|
||||
use std::num::NonZeroU64;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
use std::time::Duration;
|
||||
use std::process::{Child, Command};
|
||||
use std::{io, result};
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use camino::Utf8PathBuf;
|
||||
use futures::SinkExt;
|
||||
use pageserver_api::models::{
|
||||
self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
|
||||
self, LocationConfig, TenantInfo, TenantLocationConfigRequest, TimelineInfo,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api;
|
||||
use postgres_backend::AuthType;
|
||||
use postgres_connection::{parse_host_port, PgConnectionConfig};
|
||||
use reqwest::blocking::{Client, RequestBuilder, Response};
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use thiserror::Error;
|
||||
use utils::auth::{Claims, Scope};
|
||||
use utils::{
|
||||
http::error::HttpErrorBody,
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::attachment_service::{AttachmentService, NodeRegisterRequest};
|
||||
use crate::local_env::PageServerConf;
|
||||
use crate::{background_process, local_env::LocalEnv};
|
||||
|
||||
/// Directory within .neon which will be used by default for LocalFs remote storage.
|
||||
pub const PAGESERVER_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/pageserver";
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum PageserverHttpError {
|
||||
#[error("Reqwest error: {0}")]
|
||||
Transport(#[from] reqwest::Error),
|
||||
|
||||
#[error("Error: {0}")]
|
||||
Response(String),
|
||||
}
|
||||
|
||||
impl From<anyhow::Error> for PageserverHttpError {
|
||||
fn from(e: anyhow::Error) -> Self {
|
||||
Self::Response(e.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
type Result<T> = result::Result<T, PageserverHttpError>;
|
||||
|
||||
pub trait ResponseErrorMessageExt: Sized {
|
||||
fn error_from_body(self) -> Result<Self>;
|
||||
}
|
||||
|
||||
impl ResponseErrorMessageExt for Response {
|
||||
fn error_from_body(self) -> Result<Self> {
|
||||
let status = self.status();
|
||||
if !(status.is_client_error() || status.is_server_error()) {
|
||||
return Ok(self);
|
||||
}
|
||||
|
||||
// reqwest does not export its error construction utility functions, so let's craft the message ourselves
|
||||
let url = self.url().to_owned();
|
||||
Err(PageserverHttpError::Response(
|
||||
match self.json::<HttpErrorBody>() {
|
||||
Ok(err_body) => format!("Error: {}", err_body.msg),
|
||||
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
|
||||
},
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Control routines for pageserver.
|
||||
//
|
||||
@@ -47,7 +86,8 @@ pub struct PageServerNode {
|
||||
pub pg_connection_config: PgConnectionConfig,
|
||||
pub conf: PageServerConf,
|
||||
pub env: LocalEnv,
|
||||
pub http_client: mgmt_api::Client,
|
||||
pub http_client: Client,
|
||||
pub http_base_url: String,
|
||||
}
|
||||
|
||||
impl PageServerNode {
|
||||
@@ -59,19 +99,8 @@ impl PageServerNode {
|
||||
pg_connection_config: PgConnectionConfig::new_host_port(host, port),
|
||||
conf: conf.clone(),
|
||||
env: env.clone(),
|
||||
http_client: mgmt_api::Client::new(
|
||||
format!("http://{}", conf.listen_http_addr),
|
||||
{
|
||||
match conf.http_auth_type {
|
||||
AuthType::Trust => None,
|
||||
AuthType::NeonJWT => Some(
|
||||
env.generate_auth_token(&Claims::new(None, Scope::PageServerApi))
|
||||
.unwrap(),
|
||||
),
|
||||
}
|
||||
}
|
||||
.as_deref(),
|
||||
),
|
||||
http_client: Client::new(),
|
||||
http_base_url: format!("http://{}/v1", conf.listen_http_addr),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -109,16 +138,6 @@ impl PageServerNode {
|
||||
"control_plane_api='{}'",
|
||||
control_plane_api.as_str()
|
||||
));
|
||||
|
||||
// Attachment service uses the same auth as pageserver: if JWT is enabled
|
||||
// for us, we will also need it to talk to them.
|
||||
if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
|
||||
let jwt_token = self
|
||||
.env
|
||||
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))
|
||||
.unwrap();
|
||||
overrides.push(format!("control_plane_api_token='{}'", jwt_token));
|
||||
}
|
||||
}
|
||||
|
||||
if !cli_overrides
|
||||
@@ -162,8 +181,8 @@ impl PageServerNode {
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
pub async fn start(&self, config_overrides: &[&str], register: bool) -> anyhow::Result<()> {
|
||||
self.start_node(config_overrides, false, register).await
|
||||
pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
|
||||
self.start_node(config_overrides, false)
|
||||
}
|
||||
|
||||
fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
|
||||
@@ -204,13 +223,7 @@ impl PageServerNode {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn start_node(
|
||||
&self,
|
||||
config_overrides: &[&str],
|
||||
update_config: bool,
|
||||
register: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
// TODO: using a thread here because start_process() is not async but we need to call check_status()
|
||||
fn start_node(&self, config_overrides: &[&str], update_config: bool) -> anyhow::Result<Child> {
|
||||
let datadir = self.repo_path();
|
||||
print!(
|
||||
"Starting pageserver node {} at '{}' in {:?}",
|
||||
@@ -218,7 +231,7 @@ impl PageServerNode {
|
||||
self.pg_connection_config.raw_address(),
|
||||
datadir
|
||||
);
|
||||
io::stdout().flush().context("flush stdout")?;
|
||||
io::stdout().flush()?;
|
||||
|
||||
let datadir_path_str = datadir.to_str().with_context(|| {
|
||||
format!(
|
||||
@@ -230,42 +243,20 @@ impl PageServerNode {
|
||||
if update_config {
|
||||
args.push(Cow::Borrowed("--update-config"));
|
||||
}
|
||||
|
||||
background_process::start_process(
|
||||
"pageserver",
|
||||
&datadir,
|
||||
&self.env.pageserver_bin(),
|
||||
args.iter().map(Cow::as_ref),
|
||||
self.pageserver_env_variables()?,
|
||||
background_process::InitialPidFile::Expect(self.pid_file()),
|
||||
|| async {
|
||||
let st = self.check_status().await;
|
||||
match st {
|
||||
Ok(()) => Ok(true),
|
||||
Err(mgmt_api::Error::ReceiveBody(_)) => Ok(false),
|
||||
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
|
||||
}
|
||||
background_process::InitialPidFile::Expect(&self.pid_file()),
|
||||
|| match self.check_status() {
|
||||
Ok(()) => Ok(true),
|
||||
Err(PageserverHttpError::Transport(_)) => Ok(false),
|
||||
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
if register {
|
||||
let attachment_service = AttachmentService::from_env(&self.env);
|
||||
let (pg_host, pg_port) =
|
||||
parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
|
||||
let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
|
||||
.expect("Unable to parse listen_http_addr");
|
||||
attachment_service
|
||||
.node_register(NodeRegisterRequest {
|
||||
node_id: self.conf.id,
|
||||
listen_pg_addr: pg_host.to_string(),
|
||||
listen_pg_port: pg_port.unwrap_or(5432),
|
||||
listen_http_addr: http_host.to_string(),
|
||||
listen_http_port: http_port.unwrap_or(80),
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn pageserver_basic_args<'a>(
|
||||
@@ -311,12 +302,7 @@ impl PageServerNode {
|
||||
background_process::stop_process(immediate, "pageserver", &self.pid_file())
|
||||
}
|
||||
|
||||
pub async fn page_server_psql_client(
|
||||
&self,
|
||||
) -> anyhow::Result<(
|
||||
tokio_postgres::Client,
|
||||
tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
|
||||
)> {
|
||||
pub fn page_server_psql_client(&self) -> anyhow::Result<postgres::Client> {
|
||||
let mut config = self.pg_connection_config.clone();
|
||||
if self.conf.pg_auth_type == AuthType::NeonJWT {
|
||||
let token = self
|
||||
@@ -324,18 +310,44 @@ impl PageServerNode {
|
||||
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
|
||||
config = config.set_password(Some(token));
|
||||
}
|
||||
Ok(config.connect_no_tls().await?)
|
||||
Ok(config.connect_no_tls()?)
|
||||
}
|
||||
|
||||
pub async fn check_status(&self) -> mgmt_api::Result<()> {
|
||||
self.http_client.status().await
|
||||
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> anyhow::Result<RequestBuilder> {
|
||||
let mut builder = self.http_client.request(method, url);
|
||||
if self.conf.http_auth_type == AuthType::NeonJWT {
|
||||
let token = self
|
||||
.env
|
||||
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
|
||||
builder = builder.bearer_auth(token)
|
||||
}
|
||||
Ok(builder)
|
||||
}
|
||||
|
||||
pub async fn tenant_list(&self) -> mgmt_api::Result<Vec<TenantInfo>> {
|
||||
self.http_client.list_tenants().await
|
||||
pub fn check_status(&self) -> Result<()> {
|
||||
self.http_request(Method::GET, format!("{}/status", self.http_base_url))?
|
||||
.send()?
|
||||
.error_from_body()?;
|
||||
Ok(())
|
||||
}
|
||||
pub fn parse_config(mut settings: HashMap<&str, &str>) -> anyhow::Result<models::TenantConfig> {
|
||||
let result = models::TenantConfig {
|
||||
|
||||
pub fn tenant_list(&self) -> Result<Vec<TenantInfo>> {
|
||||
Ok(self
|
||||
.http_request(Method::GET, format!("{}/tenant", self.http_base_url))?
|
||||
.send()?
|
||||
.error_from_body()?
|
||||
.json()?)
|
||||
}
|
||||
|
||||
pub fn tenant_create(
|
||||
&self,
|
||||
new_tenant_id: TenantId,
|
||||
generation: Option<u32>,
|
||||
settings: HashMap<&str, &str>,
|
||||
) -> anyhow::Result<TenantId> {
|
||||
let mut settings = settings.clone();
|
||||
|
||||
let config = models::TenantConfig {
|
||||
checkpoint_distance: settings
|
||||
.remove("checkpoint_distance")
|
||||
.map(|x| x.parse::<u64>())
|
||||
@@ -394,41 +406,33 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'gc_feedback' as bool")?,
|
||||
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
|
||||
lazy_slru_download: settings
|
||||
.remove("lazy_slru_download")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'lazy_slru_download' as bool")?,
|
||||
};
|
||||
if !settings.is_empty() {
|
||||
bail!("Unrecognized tenant settings: {settings:?}")
|
||||
} else {
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn tenant_create(
|
||||
&self,
|
||||
new_tenant_id: TenantId,
|
||||
generation: Option<u32>,
|
||||
settings: HashMap<&str, &str>,
|
||||
) -> anyhow::Result<TenantId> {
|
||||
let config = Self::parse_config(settings.clone())?;
|
||||
|
||||
let request = models::TenantCreateRequest {
|
||||
new_tenant_id: TenantShardId::unsharded(new_tenant_id),
|
||||
generation,
|
||||
config,
|
||||
shard_parameters: ShardParameters::default(),
|
||||
};
|
||||
if !settings.is_empty() {
|
||||
bail!("Unrecognized tenant settings: {settings:?}")
|
||||
}
|
||||
Ok(self.http_client.tenant_create(&request).await?)
|
||||
self.http_request(Method::POST, format!("{}/tenant", self.http_base_url))?
|
||||
.json(&request)
|
||||
.send()?
|
||||
.error_from_body()?
|
||||
.json::<Option<String>>()
|
||||
.with_context(|| {
|
||||
format!("Failed to parse tenant creation response for tenant id: {new_tenant_id:?}")
|
||||
})?
|
||||
.context("No tenant id was found in the tenant creation response")
|
||||
.and_then(|tenant_id_string| {
|
||||
tenant_id_string.parse().with_context(|| {
|
||||
format!("Failed to parse response string as tenant id: '{tenant_id_string}'")
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn tenant_config(
|
||||
pub fn tenant_config(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
mut settings: HashMap<&str, &str>,
|
||||
@@ -499,12 +503,6 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'gc_feedback' as bool")?,
|
||||
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
|
||||
lazy_slru_download: settings
|
||||
.remove("lazy_slru_download")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'lazy_slru_download' as bool")?,
|
||||
}
|
||||
};
|
||||
|
||||
@@ -512,59 +510,80 @@ impl PageServerNode {
|
||||
bail!("Unrecognized tenant settings: {settings:?}")
|
||||
}
|
||||
|
||||
self.http_client
|
||||
.tenant_config(&models::TenantConfigRequest { tenant_id, config })
|
||||
.await?;
|
||||
self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))?
|
||||
.json(&models::TenantConfigRequest { tenant_id, config })
|
||||
.send()?
|
||||
.error_from_body()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn location_config(
|
||||
pub fn location_config(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
tenant_id: TenantId,
|
||||
config: LocationConfig,
|
||||
flush_ms: Option<Duration>,
|
||||
) -> anyhow::Result<()> {
|
||||
Ok(self
|
||||
.http_client
|
||||
.location_config(tenant_shard_id, config, flush_ms)
|
||||
.await?)
|
||||
let req_body = TenantLocationConfigRequest { tenant_id, config };
|
||||
|
||||
self.http_request(
|
||||
Method::PUT,
|
||||
format!(
|
||||
"{}/tenant/{}/location_config",
|
||||
self.http_base_url, tenant_id
|
||||
),
|
||||
)?
|
||||
.json(&req_body)
|
||||
.send()?
|
||||
.error_from_body()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn timeline_list(
|
||||
pub fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result<Vec<TimelineInfo>> {
|
||||
let timeline_infos: Vec<TimelineInfo> = self
|
||||
.http_request(
|
||||
Method::GET,
|
||||
format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
|
||||
)?
|
||||
.send()?
|
||||
.error_from_body()?
|
||||
.json()?;
|
||||
|
||||
Ok(timeline_infos)
|
||||
}
|
||||
|
||||
pub fn timeline_create(
|
||||
&self,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
) -> anyhow::Result<Vec<TimelineInfo>> {
|
||||
Ok(self.http_client.list_timelines(*tenant_shard_id).await?)
|
||||
}
|
||||
|
||||
pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> {
|
||||
Ok(self
|
||||
.http_client
|
||||
.tenant_secondary_download(*tenant_id)
|
||||
.await?)
|
||||
}
|
||||
|
||||
pub async fn timeline_create(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
new_timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
new_timeline_id: Option<TimelineId>,
|
||||
ancestor_start_lsn: Option<Lsn>,
|
||||
ancestor_timeline_id: Option<TimelineId>,
|
||||
pg_version: Option<u32>,
|
||||
existing_initdb_timeline_id: Option<TimelineId>,
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
let req = models::TimelineCreateRequest {
|
||||
// If timeline ID was not specified, generate one
|
||||
let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
|
||||
|
||||
self.http_request(
|
||||
Method::POST,
|
||||
format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
|
||||
)?
|
||||
.json(&models::TimelineCreateRequest {
|
||||
new_timeline_id,
|
||||
ancestor_start_lsn,
|
||||
ancestor_timeline_id,
|
||||
pg_version,
|
||||
existing_initdb_timeline_id,
|
||||
};
|
||||
Ok(self
|
||||
.http_client
|
||||
.timeline_create(tenant_shard_id, &req)
|
||||
.await?)
|
||||
})
|
||||
.send()?
|
||||
.error_from_body()?
|
||||
.json::<Option<TimelineInfo>>()
|
||||
.with_context(|| {
|
||||
format!("Failed to parse timeline creation response for tenant id: {tenant_id}")
|
||||
})?
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"No timeline id was found in the timeline creation response for tenant {tenant_id}"
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
/// Import a basebackup prepared using either:
|
||||
@@ -576,7 +595,7 @@ impl PageServerNode {
|
||||
/// * `timeline_id` - id to assign to imported timeline
|
||||
/// * `base` - (start lsn of basebackup, path to `base.tar` file)
|
||||
/// * `pg_wal` - if there's any wal to import: (end lsn, path to `pg_wal.tar`)
|
||||
pub async fn timeline_import(
|
||||
pub fn timeline_import(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
@@ -584,72 +603,38 @@ impl PageServerNode {
|
||||
pg_wal: Option<(Lsn, PathBuf)>,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<()> {
|
||||
let (client, conn) = self.page_server_psql_client().await?;
|
||||
// The connection object performs the actual communication with the database,
|
||||
// so spawn it off to run on its own.
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = conn.await {
|
||||
eprintln!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
tokio::pin!(client);
|
||||
let mut client = self.page_server_psql_client()?;
|
||||
|
||||
// Init base reader
|
||||
let (start_lsn, base_tarfile_path) = base;
|
||||
let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?;
|
||||
let base_tarfile = tokio_util::io::ReaderStream::new(base_tarfile);
|
||||
let base_tarfile = File::open(base_tarfile_path)?;
|
||||
let mut base_reader = BufReader::new(base_tarfile);
|
||||
|
||||
// Init wal reader if necessary
|
||||
let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal {
|
||||
let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?;
|
||||
let wal_reader = tokio_util::io::ReaderStream::new(wal_tarfile);
|
||||
let wal_tarfile = File::open(wal_tarfile_path)?;
|
||||
let wal_reader = BufReader::new(wal_tarfile);
|
||||
(end_lsn, Some(wal_reader))
|
||||
} else {
|
||||
(start_lsn, None)
|
||||
};
|
||||
|
||||
let copy_in = |reader, cmd| {
|
||||
let client = &client;
|
||||
async move {
|
||||
let writer = client.copy_in(&cmd).await?;
|
||||
let writer = std::pin::pin!(writer);
|
||||
let mut writer = writer.sink_map_err(|e| {
|
||||
std::io::Error::new(std::io::ErrorKind::Other, format!("{e}"))
|
||||
});
|
||||
let mut reader = std::pin::pin!(reader);
|
||||
writer.send_all(&mut reader).await?;
|
||||
writer.into_inner().finish().await?;
|
||||
anyhow::Ok(())
|
||||
}
|
||||
};
|
||||
|
||||
// Import base
|
||||
copy_in(
|
||||
base_tarfile,
|
||||
format!(
|
||||
"import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}"
|
||||
),
|
||||
)
|
||||
.await?;
|
||||
let import_cmd = format!(
|
||||
"import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}"
|
||||
);
|
||||
let mut writer = client.copy_in(&import_cmd)?;
|
||||
io::copy(&mut base_reader, &mut writer)?;
|
||||
writer.finish()?;
|
||||
|
||||
// Import wal if necessary
|
||||
if let Some(wal_reader) = wal_reader {
|
||||
copy_in(
|
||||
wal_reader,
|
||||
format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"),
|
||||
)
|
||||
.await?;
|
||||
if let Some(mut wal_reader) = wal_reader {
|
||||
let import_cmd = format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}");
|
||||
let mut writer = client.copy_in(&import_cmd)?;
|
||||
io::copy(&mut wal_reader, &mut writer)?;
|
||||
writer.finish()?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn tenant_synthetic_size(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
) -> anyhow::Result<TenantHistorySize> {
|
||||
Ok(self
|
||||
.http_client
|
||||
.tenant_synthetic_size(tenant_shard_id)
|
||||
.await?)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,11 +7,13 @@
|
||||
//! ```
|
||||
use std::io::Write;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Child;
|
||||
use std::{io, result};
|
||||
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use reqwest::blocking::{Client, RequestBuilder, Response};
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use thiserror::Error;
|
||||
use utils::{http::error::HttpErrorBody, id::NodeId};
|
||||
@@ -32,14 +34,12 @@ pub enum SafekeeperHttpError {
|
||||
|
||||
type Result<T> = result::Result<T, SafekeeperHttpError>;
|
||||
|
||||
#[async_trait::async_trait]
|
||||
pub trait ResponseErrorMessageExt: Sized {
|
||||
async fn error_from_body(self) -> Result<Self>;
|
||||
fn error_from_body(self) -> Result<Self>;
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ResponseErrorMessageExt for reqwest::Response {
|
||||
async fn error_from_body(self) -> Result<Self> {
|
||||
impl ResponseErrorMessageExt for Response {
|
||||
fn error_from_body(self) -> Result<Self> {
|
||||
let status = self.status();
|
||||
if !(status.is_client_error() || status.is_server_error()) {
|
||||
return Ok(self);
|
||||
@@ -48,7 +48,7 @@ impl ResponseErrorMessageExt for reqwest::Response {
|
||||
// reqwest does not export its error construction utility functions, so let's craft the message ourselves
|
||||
let url = self.url().to_owned();
|
||||
Err(SafekeeperHttpError::Response(
|
||||
match self.json::<HttpErrorBody>().await {
|
||||
match self.json::<HttpErrorBody>() {
|
||||
Ok(err_body) => format!("Error: {}", err_body.msg),
|
||||
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
|
||||
},
|
||||
@@ -69,7 +69,7 @@ pub struct SafekeeperNode {
|
||||
|
||||
pub pg_connection_config: PgConnectionConfig,
|
||||
pub env: LocalEnv,
|
||||
pub http_client: reqwest::Client,
|
||||
pub http_client: Client,
|
||||
pub http_base_url: String,
|
||||
}
|
||||
|
||||
@@ -80,7 +80,7 @@ impl SafekeeperNode {
|
||||
conf: conf.clone(),
|
||||
pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
|
||||
env: env.clone(),
|
||||
http_client: reqwest::Client::new(),
|
||||
http_client: Client::new(),
|
||||
http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
|
||||
}
|
||||
}
|
||||
@@ -103,7 +103,7 @@ impl SafekeeperNode {
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<()> {
|
||||
pub fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
|
||||
print!(
|
||||
"Starting safekeeper at '{}' in '{}'",
|
||||
self.pg_connection_config.raw_address(),
|
||||
@@ -191,16 +191,13 @@ impl SafekeeperNode {
|
||||
&self.env.safekeeper_bin(),
|
||||
&args,
|
||||
[],
|
||||
background_process::InitialPidFile::Expect(self.pid_file()),
|
||||
|| async {
|
||||
match self.check_status().await {
|
||||
Ok(()) => Ok(true),
|
||||
Err(SafekeeperHttpError::Transport(_)) => Ok(false),
|
||||
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
|
||||
}
|
||||
background_process::InitialPidFile::Expect(&self.pid_file()),
|
||||
|| match self.check_status() {
|
||||
Ok(()) => Ok(true),
|
||||
Err(SafekeeperHttpError::Transport(_)) => Ok(false),
|
||||
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
|
||||
},
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
///
|
||||
@@ -219,7 +216,7 @@ impl SafekeeperNode {
|
||||
)
|
||||
}
|
||||
|
||||
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> reqwest::RequestBuilder {
|
||||
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
|
||||
// TODO: authentication
|
||||
//if self.env.auth_type == AuthType::NeonJWT {
|
||||
// builder = builder.bearer_auth(&self.env.safekeeper_auth_token)
|
||||
@@ -227,12 +224,10 @@ impl SafekeeperNode {
|
||||
self.http_client.request(method, url)
|
||||
}
|
||||
|
||||
pub async fn check_status(&self) -> Result<()> {
|
||||
pub fn check_status(&self) -> Result<()> {
|
||||
self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status"))
|
||||
.send()
|
||||
.await?
|
||||
.error_from_body()
|
||||
.await?;
|
||||
.send()?
|
||||
.error_from_body()?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
202
control_plane/src/tenant_migration.rs
Normal file
202
control_plane/src/tenant_migration.rs
Normal file
@@ -0,0 +1,202 @@
|
||||
//!
|
||||
//! Functionality for migrating tenants across pageservers: unlike most of neon_local, this code
|
||||
//! isn't scoped to a particular physical service, as it needs to update compute endpoints to
|
||||
//! point to the new pageserver.
|
||||
//!
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::{
|
||||
attachment_service::AttachmentService, endpoint::ComputeControlPlane,
|
||||
pageserver::PageServerNode,
|
||||
};
|
||||
use pageserver_api::models::{
|
||||
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
|
||||
};
|
||||
use std::collections::HashMap;
|
||||
use std::time::Duration;
|
||||
use utils::{
|
||||
generation::Generation,
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
/// Given an attached pageserver, retrieve the LSN for all timelines
|
||||
fn get_lsns(
|
||||
tenant_id: TenantId,
|
||||
pageserver: &PageServerNode,
|
||||
) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
|
||||
let timelines = pageserver.timeline_list(&tenant_id)?;
|
||||
Ok(timelines
|
||||
.into_iter()
|
||||
.map(|t| (t.timeline_id, t.last_record_lsn))
|
||||
.collect())
|
||||
}
|
||||
|
||||
/// Wait for the timeline LSNs on `pageserver` to catch up with or overtake
|
||||
/// `baseline`.
|
||||
fn await_lsn(
|
||||
tenant_id: TenantId,
|
||||
pageserver: &PageServerNode,
|
||||
baseline: HashMap<TimelineId, Lsn>,
|
||||
) -> anyhow::Result<()> {
|
||||
loop {
|
||||
let latest = match get_lsns(tenant_id, pageserver) {
|
||||
Ok(l) => l,
|
||||
Err(e) => {
|
||||
println!(
|
||||
"🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
|
||||
pageserver.conf.id
|
||||
);
|
||||
std::thread::sleep(Duration::from_millis(500));
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let mut any_behind: bool = false;
|
||||
for (timeline_id, baseline_lsn) in &baseline {
|
||||
match latest.get(timeline_id) {
|
||||
Some(latest_lsn) => {
|
||||
println!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}");
|
||||
if latest_lsn < baseline_lsn {
|
||||
any_behind = true;
|
||||
}
|
||||
}
|
||||
None => {
|
||||
// Expected timeline isn't yet visible on migration destination.
|
||||
// (IRL we would have to account for timeline deletion, but this
|
||||
// is just test helper)
|
||||
any_behind = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !any_behind {
|
||||
println!("✅ LSN caught up. Proceeding...");
|
||||
break;
|
||||
} else {
|
||||
std::thread::sleep(Duration::from_millis(500));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// This function spans multiple services, to demonstrate live migration of a tenant
|
||||
/// between pageservers:
|
||||
/// - Coordinate attach/secondary/detach on pageservers
|
||||
/// - call into attachment_service for generations
|
||||
/// - reconfigure compute endpoints to point to new attached pageserver
|
||||
pub fn migrate_tenant(
|
||||
env: &LocalEnv,
|
||||
tenant_id: TenantId,
|
||||
dest_ps: PageServerNode,
|
||||
) -> anyhow::Result<()> {
|
||||
// Get a new generation
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
|
||||
let previous = attachment_service.inspect(tenant_id)?;
|
||||
let mut baseline_lsns = None;
|
||||
if let Some((generation, origin_ps_id)) = &previous {
|
||||
let origin_ps = PageServerNode::from_env(env, env.get_pageserver_conf(*origin_ps_id)?);
|
||||
|
||||
if origin_ps_id == &dest_ps.conf.id {
|
||||
println!("🔁 Already attached to {origin_ps_id}, freshening...");
|
||||
let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
|
||||
let dest_conf = LocationConfig {
|
||||
mode: LocationConfigMode::AttachedSingle,
|
||||
generation: gen.map(Generation::new),
|
||||
secondary_conf: None,
|
||||
tenant_conf: TenantConfig::default(),
|
||||
};
|
||||
dest_ps.location_config(tenant_id, dest_conf)?;
|
||||
println!("✅ Migration complete");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
println!("🔁 Switching origin pageserver {origin_ps_id} to stale mode");
|
||||
|
||||
let stale_conf = LocationConfig {
|
||||
mode: LocationConfigMode::AttachedStale,
|
||||
generation: Some(Generation::new(*generation)),
|
||||
secondary_conf: None,
|
||||
tenant_conf: TenantConfig::default(),
|
||||
};
|
||||
origin_ps.location_config(tenant_id, stale_conf)?;
|
||||
|
||||
baseline_lsns = Some(get_lsns(tenant_id, &origin_ps)?);
|
||||
}
|
||||
|
||||
let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
|
||||
let dest_conf = LocationConfig {
|
||||
mode: LocationConfigMode::AttachedMulti,
|
||||
generation: gen.map(Generation::new),
|
||||
secondary_conf: None,
|
||||
tenant_conf: TenantConfig::default(),
|
||||
};
|
||||
|
||||
println!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
|
||||
dest_ps.location_config(tenant_id, dest_conf)?;
|
||||
|
||||
if let Some(baseline) = baseline_lsns {
|
||||
println!("🕑 Waiting for LSN to catch up...");
|
||||
await_lsn(tenant_id, &dest_ps, baseline)?;
|
||||
}
|
||||
|
||||
let cplane = ComputeControlPlane::load(env.clone())?;
|
||||
for (endpoint_name, endpoint) in &cplane.endpoints {
|
||||
if endpoint.tenant_id == tenant_id {
|
||||
println!(
|
||||
"🔁 Reconfiguring endpoint {} to use pageserver {}",
|
||||
endpoint_name, dest_ps.conf.id
|
||||
);
|
||||
endpoint.reconfigure(Some(dest_ps.conf.id))?;
|
||||
}
|
||||
}
|
||||
|
||||
for other_ps_conf in &env.pageservers {
|
||||
if other_ps_conf.id == dest_ps.conf.id {
|
||||
continue;
|
||||
}
|
||||
|
||||
let other_ps = PageServerNode::from_env(env, other_ps_conf);
|
||||
let other_ps_tenants = other_ps.tenant_list()?;
|
||||
|
||||
// Check if this tenant is attached
|
||||
let found = other_ps_tenants
|
||||
.into_iter()
|
||||
.map(|t| t.id)
|
||||
.any(|i| i == tenant_id);
|
||||
if !found {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Downgrade to a secondary location
|
||||
let secondary_conf = LocationConfig {
|
||||
mode: LocationConfigMode::Secondary,
|
||||
generation: None,
|
||||
secondary_conf: Some(LocationConfigSecondary { warm: true }),
|
||||
tenant_conf: TenantConfig::default(),
|
||||
};
|
||||
|
||||
println!(
|
||||
"💤 Switching to secondary mode on pageserver {}",
|
||||
other_ps.conf.id
|
||||
);
|
||||
other_ps.location_config(tenant_id, secondary_conf)?;
|
||||
}
|
||||
|
||||
println!(
|
||||
"🔁 Switching to AttachedSingle mode on pageserver {}",
|
||||
dest_ps.conf.id
|
||||
);
|
||||
let dest_conf = LocationConfig {
|
||||
mode: LocationConfigMode::AttachedSingle,
|
||||
generation: gen.map(Generation::new),
|
||||
secondary_conf: None,
|
||||
tenant_conf: TenantConfig::default(),
|
||||
};
|
||||
dest_ps.location_config(tenant_id, dest_conf)?;
|
||||
|
||||
println!("✅ Migration complete");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -35,7 +35,6 @@ allow = [
|
||||
"Artistic-2.0",
|
||||
"BSD-2-Clause",
|
||||
"BSD-3-Clause",
|
||||
"CC0-1.0",
|
||||
"ISC",
|
||||
"MIT",
|
||||
"MPL-2.0",
|
||||
|
||||
@@ -1,9 +0,0 @@
|
||||
# For documentation on how to configure this file,
|
||||
# see https://diesel.rs/guides/configuring-diesel-cli
|
||||
|
||||
[print_schema]
|
||||
file = "control_plane/attachment_service/src/schema.rs"
|
||||
custom_type_derives = ["diesel::query_builder::QueryId"]
|
||||
|
||||
[migrations_directory]
|
||||
dir = "control_plane/attachment_service/migrations"
|
||||
@@ -1,197 +0,0 @@
|
||||
# Per-Tenant GetPage@LSN Throttling
|
||||
|
||||
Author: Christian Schwarz
|
||||
Date: Oct 24, 2023
|
||||
|
||||
## Summary
|
||||
|
||||
This RFC proposes per-tenant throttling of GetPage@LSN requests inside Pageserver
|
||||
and the interactions with its client, i.e., the neon_smgr component in Compute.
|
||||
|
||||
The result of implementing & executing this RFC will be a fleet-wide upper limit for
|
||||
**"the highest GetPage/second that Pageserver can support for a single tenant/shard"**.
|
||||
|
||||
## Background
|
||||
|
||||
### GetPage@LSN Request Flow
|
||||
|
||||
Pageserver exposes its `page_service.rs` as a libpq listener.
|
||||
The Computes' `neon_smgr` module connects to that libpq listener.
|
||||
Once a connection is established, the protocol allows Compute to request page images at a given LSN.
|
||||
We call these requests GetPage@LSN requests, or GetPage requests for short.
|
||||
Other request types can be sent, but these are low traffic compared to GetPage requests
|
||||
and are not the concern of this RFC.
|
||||
|
||||
Pageserver associates one libpq connection with one tokio task.
|
||||
|
||||
Per connection/task, the pq protocol is handled by the common `postgres_backend` crate.
|
||||
Its `run_message_loop` function invokes the `page_service` specific `impl<IO> postgres_backend::Handler<IO> for PageServerHandler`.
|
||||
Requests are processed in the order in which they arrive via the TCP-based pq protocol.
|
||||
So, there is no concurrent request processing within one connection/task.
|
||||
|
||||
There is a degree of natural pipelining:
|
||||
Compute can "fill the pipe" by sending more than one GetPage request into the libpq TCP stream.
|
||||
And Pageserver can fill the pipe with responses in the other direction.
|
||||
Both directions are subject to the limit of tx/rx buffers, nodelay, TCP flow control, etc.
|
||||
|
||||
### GetPage@LSN Access Pattern
|
||||
|
||||
The Compute has its own hierarchy of caches, specifically `shared_buffers` and the `local file cache` (LFC).
|
||||
Compute only issues GetPage requests to Pageserver if it encounters a miss in these caches.
|
||||
|
||||
If the working set stops fitting into Compute's caches, requests to Pageserver increase sharply -- the Compute starts *thrashing*.
|
||||
|
||||
## Motivation
|
||||
|
||||
In INC-69, a tenant issued 155k GetPage/second for a period of 10 minutes and 60k GetPage/second for a period of 3h,
|
||||
then dropping to ca 18k GetPage/second for a period of 9h.
|
||||
|
||||
We noticed this because of an internal GetPage latency SLO burn rate alert, i.e.,
|
||||
the request latency profile during this period significantly exceeded what was acceptable according to the internal SLO.
|
||||
|
||||
Sadly, we do not have the observability data to determine the impact of this tenant on other tenants on the same tenants.
|
||||
|
||||
However, here are some illustrative data points for the 155k period:
|
||||
The tenant was responsible for >= 99% of the GetPage traffic and, frankly, the overall activity on this Pageserver instance.
|
||||
We were serving pages at 10 Gb/s (`155k x 8 kbyte (PAGE_SZ) per second is 1.12GiB/s = 9.4Gb/s.`)
|
||||
The CPU utilization of the instance was 75% user+system.
|
||||
Pageserver page cache served 1.75M accesses/second at a hit rate of ca 90%.
|
||||
The hit rate for materialized pages was ca. 40%.
|
||||
Curiously, IOPS to the Instance Store NVMe were very low, rarely exceeding 100.
|
||||
|
||||
The fact that the IOPS were so low / the materialized page cache hit rate was so high suggests that **this tenant's compute's caches were thrashing**.
|
||||
The compute was of type `k8s-pod`; hence, auto-scaling could/would not have helped remediate the thrashing by provisioning more RAM.
|
||||
The consequence was that the **thrashing translated into excessive GetPage requests against Pageserver**.
|
||||
|
||||
My claim is that it was **unhealthy to serve this workload at the pace we did**:
|
||||
* it is likely that other tenants were/would have experienced high latencies (again, we sadly don't have per-tenant latency data to confirm this)
|
||||
* more importantly, it was **unsustainable** to serve traffic at this pace for multiple reasons:
|
||||
* **predictability of performance**: when the working set grows, the pageserver materialized page cache hit rate drops.
|
||||
At some point, we're bound by the EC2 Instance Store NVMe drive's IOPS limit.
|
||||
The result is an **uneven** performance profile from the Compute perspective.
|
||||
|
||||
* **economics**: Neon currently does not charge for IOPS, only capacity.
|
||||
**We cannot afford to undercut the market in IOPS/$ this drastically; it leads to adverse selection and perverse incentives.**
|
||||
For example, the 155k IOPS, which we served for 10min, would cost ca. 6.5k$/month when provisioned as an io2 EBS volume.
|
||||
Even the 18k IOPS, which we served for 9h, would cost ca. 1.1k$/month when provisioned as an io2 EBS volume.
|
||||
We charge 0$.
|
||||
It could be economically advantageous to keep using a low-DRAM compute because Pageserver IOPS are fast enough and free.
|
||||
|
||||
|
||||
Note: It is helpful to think of Pageserver as a disk, because it's precisely where `neon_smgr` sits:
|
||||
vanilla Postgres gets its pages from disk, Neon Postgres gets them from Pageserver.
|
||||
So, regarding the above performance & economic arguments, it is fair to say that we currently provide an "as-fast-as-possible-IOPS" disk that we charge for only by capacity.
|
||||
|
||||
## Solution: Throttling GetPage Requests
|
||||
|
||||
**The consequence of the above analysis must be that Pageserver throttles GetPage@LSN requests**.
|
||||
That is, unless we want to start charging for provisioned GetPage@LSN/second.
|
||||
Throttling sets the correct incentive for a thrashing Compute to scale up its DRAM to the working set size.
|
||||
Neon Autoscaling will make this easy, [eventually](https://github.com/neondatabase/neon/pull/3913).
|
||||
|
||||
## The Design Space
|
||||
|
||||
What that remains is the question about *policy* and *mechanism*:
|
||||
|
||||
**Policy** concerns itself with the question of what limit applies to a given connection|timeline|tenant.
|
||||
Candidates are:
|
||||
|
||||
* hard limit, same limit value per connection|timeline|tenant
|
||||
* Per-tenant will provide an upper bound for the impact of a tenant on a given Pageserver instance.
|
||||
This is a major operational pain point / risk right now.
|
||||
* hard limit, configurable per connection|timeline|tenant
|
||||
* This outsources policy to console/control plane, with obvious advantages for flexible structuring of what service we offer to customers.
|
||||
* Note that this is not a mechanism to guarantee a minium provisioned rate, i.e., this is not a mechanism to guarantee a certain QoS for a tenant.
|
||||
* fair share among active connections|timelines|tenants per instance
|
||||
* example: each connection|timeline|tenant gets a fair fraction of the machine's GetPage/second capacity
|
||||
* NB: needs definition of "active", and knowledge of available GetPage/second capacity in advance
|
||||
* ...
|
||||
|
||||
|
||||
Regarding **mechanism**, it's clear that **backpressure** is the way to go.
|
||||
However, we must choose between
|
||||
* **implicit** backpressure through pq/TCP and
|
||||
* **explicit** rejection of requests + retries with exponential backoff
|
||||
|
||||
Further, there is the question of how throttling GetPage@LSN will affect the **internal GetPage latency SLO**:
|
||||
where do we measure the SLI for Pageserver's internal getpage latency SLO? Before or after the throttling?
|
||||
|
||||
And when we eventually move the measurement point into the Computes (to avoid coordinated omission),
|
||||
how do we avoid counting throttling-induced latency toward the internal getpage latency SLI/SLO?
|
||||
|
||||
## Scope Of This RFC
|
||||
|
||||
**This RFC proposes introducing a hard GetPage@LSN/second limit per tenant, with the same value applying to each tenant on a Pageserver**.
|
||||
|
||||
This proposal is easy to implement and significantly de-risks operating large Pageservers,
|
||||
based on the assumption that extremely-high-GetPage-rate-episodes like the one from the "Motivation" section are uncorrelated between tenants.
|
||||
|
||||
For example, suppose we pick a limit that allows up to 10 tenants to go at limit rate.
|
||||
Suppose our Pageserver can serve 100k GetPage/second total at a 100% page cache miss rate.
|
||||
If each tenant gets a hard limit of 10k GetPage/second, we can serve up to 10 tenants at limit speed without latency degradation.
|
||||
|
||||
The mechanism for backpressure will be TCP-based implicit backpressure.
|
||||
The compute team isn't concerned about prefetch queue depth.
|
||||
Pageserver will implement it by delaying the reading of requests from the libpq connection(s).
|
||||
|
||||
The rate limit will be implemented using a per-tenant token bucket.
|
||||
The bucket will be be shared among all connections to the tenant.
|
||||
The bucket implementation supports starvation-preventing `await`ing.
|
||||
The current candidate for the implementation is [`leaky_bucket`](https://docs.rs/leaky-bucket/).
|
||||
The getpage@lsn benchmark that's being added in https://github.com/neondatabase/neon/issues/5771
|
||||
can be used to evaluate the overhead of sharing the bucket among connections of a tenant.
|
||||
A possible technique to mitigate the impact of sharing the bucket would be to maintain a buffer of a few tokens per connection handler.
|
||||
|
||||
Regarding metrics / the internal GetPage latency SLO:
|
||||
we will measure the GetPage latency SLO _after_ the throttler and introduce a new metric to measure the amount of throttling, quantified by:
|
||||
- histogram that records the tenants' observations of queue depth before they start waiting (one such histogram per pageserver)
|
||||
- histogram that records the tenants' observations of time spent waiting (one such histogram per pageserver)
|
||||
|
||||
Further observability measures:
|
||||
- an INFO log message at frequency 1/min if the tenant/timeline/connection was throttled in that last minute.
|
||||
The message will identify the tenant/timeline/connection to allow correlation with compute logs/stats.
|
||||
|
||||
Rollout will happen as follows:
|
||||
- deploy 1: implementation + config: disabled by default, ability to enable it per tenant through tenant_conf
|
||||
- experimentation in staging and later production to study impact & interaction with auto-scaling
|
||||
- determination of a sensible global default value
|
||||
- the value will be chosen as high as possible ...
|
||||
- ... but low enough to work towards this RFC's goal that one tenant should not be able to dominate a pageserver instance.
|
||||
- deploy 2: implementation fixes if any + config: enabled by default with the aforementioned global default
|
||||
- reset of the experimental per-tenant overrides
|
||||
- gain experience & lower the limit over time
|
||||
- we stop lowering the limit as soon as this RFC's goal is achieved, i.e.,
|
||||
once we decide that in practice the chosen value sufficiently de-risks operating large pageservers
|
||||
|
||||
The per-tenant override will remain for emergencies and testing.
|
||||
But since Console doesn't preserve it during tenant migrations, it isn't durably configurable for the tenant.
|
||||
|
||||
Toward the upper layers of the Neon stack, the resulting limit will be
|
||||
**"the highest GetPage/second that Pageserver can support for a single tenant"**.
|
||||
|
||||
### Rationale
|
||||
|
||||
We decided against error + retry because of worries about starvation.
|
||||
|
||||
## Future Work
|
||||
|
||||
Enable per-tenant emergency override of the limit via Console.
|
||||
Should be part of a more general framework to specify tenant config overrides.
|
||||
**NB:** this is **not** the right mechanism to _sell_ different max GetPage/second levels to users,
|
||||
or _auto-scale_ the GetPage/second levels. Such functionality will require a separate RFC that
|
||||
concerns itself with GetPage/second capacity planning.
|
||||
|
||||
Compute-side metrics for GetPage latency.
|
||||
|
||||
Back-channel to inform Compute/Autoscaling/ControlPlane that the project is being throttled.
|
||||
|
||||
Compute-side neon_smgr improvements to avoid sending the same GetPage request multiple times if multiple backends experience a cache miss.
|
||||
|
||||
Dealing with read-only endpoints: users use read-only endpoints to scale reads for a single tenant.
|
||||
Possibly there are also assumptions around read-only endpoints not affecting the primary read-write endpoint's performance.
|
||||
With per-tenant rate limiting, we will not meet that expectation.
|
||||
However, we can currently only scale per tenant.
|
||||
Soon, we will have sharding (#5505), which will apply the throttling on a per-shard basis.
|
||||
But, that's orthogonal to scaling reads: if many endpoints hit one shard, they share the same throttling limit.
|
||||
To solve this properly, I think we'll need replicas for tenants / shard.
|
||||
To performance-isolate a tenant's endpoints from each other, we'd then route them to different replicas.
|
||||
@@ -1,205 +0,0 @@
|
||||
# Name
|
||||
|
||||
Created on: 2023-09-08
|
||||
Author: Arpad Müller
|
||||
|
||||
## Summary
|
||||
|
||||
Enable the pageserver to recover from data corruption events by implementing
|
||||
a feature to re-apply historic WAL records in parallel to the already occurring
|
||||
WAL replay.
|
||||
|
||||
The feature is outside of the user-visible backup and history story, and only
|
||||
serves as a second-level backup for the case that there is a bug in the
|
||||
pageservers that corrupted the served pages.
|
||||
|
||||
The RFC proposes the addition of two new features:
|
||||
* recover a broken branch from WAL (downtime is allowed)
|
||||
* a test recovery system to recover random branches to make sure recovery works
|
||||
|
||||
## Motivation
|
||||
|
||||
The historic WAL is currently stored in S3 even after it has been replayed by
|
||||
the pageserver and thus been integrated into the pageserver's storage system.
|
||||
This is done to defend from data corruption failures inside the pageservers.
|
||||
|
||||
However, application of this WAL in the disaster recovery setting is currently
|
||||
very manual and we want to automate this to make it easier.
|
||||
|
||||
### Use cases
|
||||
|
||||
There are various use cases for this feature, like:
|
||||
|
||||
* The main motivation is replaying in the instance of pageservers corrupting
|
||||
data.
|
||||
* We might want to, beyond the user-visible history features, through our
|
||||
support channels and upon customer request, in select instances, recover
|
||||
historic versions beyond the range of history that we officially support.
|
||||
* Running the recovery process in the background for random tenant timelines
|
||||
to figure out if there was a corruption of data (we would compare with what
|
||||
the pageserver stores for the "official" timeline).
|
||||
* Using the WAL to arrive at historic pages we can then back up to S3 so that
|
||||
WAL itself can be discarded, or at least not used for future replays.
|
||||
Again, this sounds a lot like what the pageserver is already doing, but the
|
||||
point is to provide a fallback to the service provided by the pageserver.
|
||||
|
||||
## Design
|
||||
|
||||
### Design constraints
|
||||
|
||||
The main design constraint is that the feature needs to be *simple* enough that
|
||||
the number of bugs are as low, and reliability as high as possible: the main
|
||||
goal of this endeavour is to achieve higher correctness than the pageserver.
|
||||
|
||||
For the background process, we cannot afford a downtime of the timeline that is
|
||||
being cloned, as we don't want to restrict ourselves to offline tenants only.
|
||||
In the scenario where we want to recover from disasters or roll back to a
|
||||
historic lsn through support staff, downtimes are more affordable, and
|
||||
inevitable if the original had been subject to the corruption. Ideally, the
|
||||
two code paths would share code, so the solution would be designed for not
|
||||
requiring downtimes.
|
||||
|
||||
### API endpoint changes
|
||||
|
||||
This RFC proposes two API endpoint changes in the safekeeper and the
|
||||
pageserver.
|
||||
|
||||
Remember, the pageserver timeline API creation endpoint is to this URL:
|
||||
|
||||
```
|
||||
/v1/tenant/{tenant_id}/timeline/
|
||||
```
|
||||
|
||||
Where `{tenant_id}` is the ID of the tenant the timeline is created for,
|
||||
and specified as part of the URL. The timeline ID is passed via the POST
|
||||
request body as the only required parameter `new_timeline_id`.
|
||||
|
||||
This proposal adds one optional parameter called
|
||||
`existing_initdb_timeline_id` to the request's json body. If the parameter
|
||||
is not specified, behaviour should be as existing, so the pageserver runs
|
||||
initdb.
|
||||
If the parameter is specified, it is expected to point to a timeline ID.
|
||||
In fact that ID might match `new_timeline_id`, what's important is that
|
||||
S3 storage contains a matching initdb under the URL matching the given
|
||||
tenant and timeline.
|
||||
|
||||
Having both `ancestor_timeline_id` and `existing_initdb_timeline_id`
|
||||
specified is illegal and will yield in an HTTP error. This feature is
|
||||
only meant for the "main" branch that doesn't have any ancestors
|
||||
of its own, as only here initdb is relevant.
|
||||
|
||||
For the safekeeper, we propose the addition of the following copy endpoint:
|
||||
|
||||
```
|
||||
/v1/tenant/{tenant_id}/timeline/{source_timeline_id}/copy
|
||||
```
|
||||
it is meant for POST requests with json, and the two URL parameters
|
||||
`tenant_id` and `source_timeline_id`. The json request body contains
|
||||
the two required parameters `target_timeline_id` and `until_lsn`.
|
||||
|
||||
After invoking, the copy endpoint starts a copy process of the WAL from
|
||||
the source ID to the target ID. The lsn is updated according to the
|
||||
progress of the API call.
|
||||
|
||||
### Higher level features
|
||||
|
||||
We want the API changes to support the following higher level features:
|
||||
|
||||
* recovery-after-corruption DR of the main timeline of a tenant. This
|
||||
feature allows for downtime.
|
||||
* test DR of the main timeline into a special copy timeline. this feature
|
||||
is meant to run against selected production tenants in the background,
|
||||
without the user noticing, so it does not allow for downtime.
|
||||
|
||||
The recovery-after-corruption DR only needs the pageserver changes.
|
||||
It works as follows:
|
||||
|
||||
* delete the timeline from the pageservers via timeline deletion API
|
||||
* re-create it via timeline creation API (same ID as before) and set
|
||||
`existing_initdb_timeline_id` to the same timeline ID
|
||||
|
||||
The test DR requires also the copy primitive and works as follows:
|
||||
|
||||
* copy the WAL of the timeline to a new place
|
||||
* create a new timeline for the tenant
|
||||
|
||||
## Non Goals
|
||||
|
||||
At the danger of being repetitive, the main goal of this feature is to be a
|
||||
backup method, so reliability is very important. This implies that other
|
||||
aspects like performance or space reduction are less important.
|
||||
|
||||
### Corrupt WAL
|
||||
|
||||
The process suggested by this RFC assumes that the WAL is free of corruption.
|
||||
In some instances, corruption can make it into WAL, like for example when
|
||||
higher level components like postgres or the application first read corrupt
|
||||
data, and then execute a write with data derived from that earlier read. That
|
||||
written data might then contain the corruption.
|
||||
|
||||
Common use cases can hit this quite easily. For example, an application reads
|
||||
some counter, increments it, and then writes the new counter value to the
|
||||
database.
|
||||
On a lower level, the compute might put FPIs (Full Page Images) into the WAL,
|
||||
which have corrupt data for rows unrelated to the write operation at hand.
|
||||
|
||||
Separating corrupt writes from non-corrupt ones is a hard problem in general,
|
||||
and if the application was involved in making the corrupt write, a recovery
|
||||
would also involve the application. Therefore, corruption that has made it into
|
||||
the WAL is outside of the scope of this feature. However, the WAL replay can be
|
||||
issued to right before the point in time where the corruption occured. Then the
|
||||
data loss is isolated to post-corruption writes only.
|
||||
|
||||
## Impacted components (e.g. pageserver, safekeeper, console, etc)
|
||||
|
||||
Most changes would happen to the pageservers.
|
||||
For the higher level features, maybe other components like the console would
|
||||
be involved.
|
||||
|
||||
We need to make sure that the shadow timelines are not subject to the usual
|
||||
limits and billing we apply to existing timelines.
|
||||
|
||||
## Proposed implementation
|
||||
|
||||
The first problem to keep in mind is the reproducability of `initdb`.
|
||||
So an initial step would be to upload `initdb` snapshots to S3.
|
||||
|
||||
After that, we'd have the endpoint spawn a background process which
|
||||
performs the replay of the WAL to that new timeline. This process should
|
||||
follow the existing workflows as closely as possible, just using the
|
||||
WAL records of a different timeline.
|
||||
|
||||
The timeline created will be in a special state that solely looks for WAL
|
||||
entries of the timeline it is trying to copy. Once the target LSN is reached,
|
||||
it turns into a normal timeline that also accepts writes to its own
|
||||
timeline ID.
|
||||
|
||||
### Scalability
|
||||
|
||||
For now we want to run this entire process on a single node, and as
|
||||
it is by nature linear, it's hard to parallelize. However, for the
|
||||
verification workloads, we can easily start the WAL replay in parallel
|
||||
for different points in time. This is valuable especially for tenants
|
||||
with large WAL records.
|
||||
|
||||
Compare this with the tricks to make addition circuits execute with
|
||||
lower latency by making them perform the addition for both possible
|
||||
values of the carry bit, and then, in a second step, taking the
|
||||
result for the carry bit that was actually obtained.
|
||||
|
||||
The other scalability dimension to consider is the WAL length, which
|
||||
is a growing question as tenants accumulate changes. There are
|
||||
possible approaches to this, including creating snapshots of the
|
||||
page files and uploading them to S3, but if we do this for every single
|
||||
branch, we lose the cheap branching property.
|
||||
|
||||
### Implementation by component
|
||||
|
||||
The proposed changes for the various components of the neon architecture
|
||||
are written up in this notion page:
|
||||
|
||||
https://www.notion.so/neondatabase/Pageserver-disaster-recovery-one-pager-4ecfb5df16ce4f6bbfc3817ed1a6cbb2
|
||||
|
||||
### Unresolved questions
|
||||
|
||||
none known (outside of the mentioned ones).
|
||||
@@ -1,142 +0,0 @@
|
||||
# Vectored Timeline Get
|
||||
|
||||
Created on: 2024-01-02
|
||||
Author: Christian Schwarz
|
||||
|
||||
# Summary
|
||||
|
||||
A brief RFC / GitHub Epic describing a vectored version of the `Timeline::get` method that is at the heart of Pageserver.
|
||||
|
||||
# Motivation
|
||||
|
||||
During basebackup, we issue many `Timeline::get` calls for SLRU pages that are *adjacent* in key space.
|
||||
For an example, see
|
||||
https://github.com/neondatabase/neon/blob/5c88213eaf1b1e29c610a078d0b380f69ed49a7e/pageserver/src/basebackup.rs#L281-L302.
|
||||
|
||||
Each of these `Timeline::get` calls must traverse the layer map to gather reconstruct data (`Timeline::get_reconstruct_data`) for the requested page number (`blknum` in the example).
|
||||
For each layer visited by layer map traversal, we do a `DiskBtree` point lookup.
|
||||
If it's negative (no entry), we resume layer map traversal.
|
||||
If it's positive, we collect the result in our reconstruct data bag.
|
||||
If the reconstruct data bag contents suffice to reconstruct the page, we're done with `get_reconstruct_data` and move on to walredo.
|
||||
Otherwise, we resume layer map traversal.
|
||||
|
||||
Doing this many `Timeline::get` calls is quite inefficient because:
|
||||
|
||||
1. We do the layer map traversal repeatedly, even if, e.g., all the data sits in the same image layer at the bottom of the stack.
|
||||
2. We may visit many DiskBtree inner pages multiple times for point lookup of different keys.
|
||||
This is likely particularly bad for L0s which span the whole key space and hence must be visited by layer map traversal, but
|
||||
may not contain the data we're looking for.
|
||||
3. Anecdotally, keys adjacent in keyspace and written simultaneously also end up physically adjacent in the layer files [^1].
|
||||
So, to provide the reconstruct data for N adjacent keys, we would actually only _need_ to issue a single large read to the filesystem, instead of the N reads we currently do.
|
||||
The filesystem, in turn, ideally stores the layer file physically contiguously, so our large read will turn into one IOP toward the disk.
|
||||
|
||||
[^1]: https://www.notion.so/neondatabase/Christian-Investigation-Slow-Basebackups-Early-2023-12-34ea5c7dcdc1485d9ac3731da4d2a6fc?pvs=4#15ee4e143392461fa64590679c8f54c9
|
||||
|
||||
# Solution
|
||||
|
||||
We should have a vectored aka batched aka scatter-gather style alternative API for `Timeline::get`. Having such an API unlocks:
|
||||
|
||||
* more efficient basebackup
|
||||
* batched IO during compaction (useful for strides of unchanged pages)
|
||||
* page_service: expose vectored get_page_at_lsn for compute (=> good for seqscan / prefetch)
|
||||
* if [on-demand SLRU downloads](https://github.com/neondatabase/neon/pull/6151) land before vectored Timeline::get, on-demand SLRU downloads will still benefit from this API
|
||||
|
||||
# DoD
|
||||
|
||||
There is a new variant of `Timeline::get`, called `Timeline::get_vectored`.
|
||||
It takes as arguments an `lsn: Lsn` and a `src: &[KeyVec]` where `struct KeyVec { base: Key, count: usize }`.
|
||||
|
||||
It is up to the implementor to figure out a suitable and efficient way to return the reconstructed page images.
|
||||
It is sufficient to simply return a `Vec<Bytes>`, but, likely more efficient solutions can be found after studying all the callers of `Timeline::get`.
|
||||
|
||||
Functionally, the behavior of `Timeline::get_vectored` is equivalent to
|
||||
|
||||
```rust
|
||||
let mut keys_iter: impl Iterator<Item=Key>
|
||||
= src.map(|KeyVec{ base, count }| (base..base+count)).flatten();
|
||||
let mut out = Vec::new();
|
||||
for key in keys_iter {
|
||||
let data = Timeline::get(key, lsn)?;
|
||||
out.push(data);
|
||||
}
|
||||
return out;
|
||||
```
|
||||
|
||||
However, unlike above, an ideal solution will
|
||||
|
||||
* Visit each `struct Layer` at most once.
|
||||
* For each visited layer, call `Layer::get_value_reconstruct_data` at most once.
|
||||
* This means, read each `DiskBtree` page at most once.
|
||||
* Facilitate merging of the reads we issue to the OS and eventually NVMe.
|
||||
|
||||
Each of these items above represents a signficant amount of work.
|
||||
|
||||
## Performance
|
||||
|
||||
Ideally, the **base performance** of a vectored get of a single page should be identical to the current `Timeline::get`.
|
||||
A reasonable constant overhead over current `Timeline::get` is acceptable.
|
||||
|
||||
The performance improvement for the vectored use case is demonstrated in some way, e.g., using the `pagebench` basebackup benchmark against a tenant with a lot of SLRU segments.
|
||||
|
||||
# Implementation
|
||||
|
||||
High-level set of tasks / changes to be made:
|
||||
|
||||
- **Get clarity on API**:
|
||||
- Define naive `Timeline::get_vectored` implementation & adopt it across pageserver.
|
||||
- The tricky thing here will be the return type (e.g. `Vec<Bytes>` vs `impl Stream`).
|
||||
- Start with something simple to explore the different usages of the API.
|
||||
Then iterate with peers until we have something that is good enough.
|
||||
- **Vectored Layer Map traversal**
|
||||
- Vectored `LayerMap::search` (take 1 LSN and N `Key`s instead of just 1 LSN and 1 `Key`)
|
||||
- Refactor `Timeline::get_reconstruct_data` to hold & return state for N `Key`s instead of 1
|
||||
- The slightly tricky part here is what to do about `cont_lsn` [after we've found some reconstruct data for some keys](https://github.com/neondatabase/neon/blob/d066dad84b076daf3781cdf9a692098889d3974e/pageserver/src/tenant/timeline.rs#L2378-L2385)
|
||||
but need more.
|
||||
Likely we'll need to keep track of `cont_lsn` per key and continue next iteration at `max(cont_lsn)` of all keys that still need data.
|
||||
- **Vectored `Layer::get_value_reconstruct_data` / `DiskBtree`**
|
||||
- Current code calls it [here](https://github.com/neondatabase/neon/blob/d066dad84b076daf3781cdf9a692098889d3974e/pageserver/src/tenant/timeline.rs#L2378-L2384).
|
||||
- Delta layers use `DiskBtreeReader::visit()` to collect the `(offset,len)` pairs for delta record blobs to load.
|
||||
- Image layers use `DiskBtreeReader::get` to get the offset of the image blob to load. Underneath, that's just a `::visit()` call.
|
||||
- What needs to happen to `DiskBtree::visit()`?
|
||||
* Minimally
|
||||
* take a single `KeyVec` instead of a single `Key` as argument, i.e., take a single contiguous key range to visit.
|
||||
* Change the visit code to to invoke the callback for all values in the `KeyVec`'s key range
|
||||
* This should be good enough for what we've seen when investigating basebackup slowness, because there, the key ranges are contiguous.
|
||||
* Ideally:
|
||||
* Take a `&[KeyVec]`, sort it;
|
||||
* during Btree traversal, peek at the next `KeyVec` range to determine whether we need to descend or back out.
|
||||
* NB: this should be a straight-forward extension of the minimal solution above, as we'll already be checking for "is there more key range in the requested `KeyVec`".
|
||||
- **Facilitate merging of the reads we issue to the OS and eventually NVMe.**
|
||||
- The `DiskBtree::visit` produces a set of offsets which we then read from a `VirtualFile` [here](https://github.com/neondatabase/neon/blob/292281c9dfb24152b728b1a846cc45105dac7fe0/pageserver/src/tenant/storage_layer/delta_layer.rs#L772-L804)
|
||||
- [Delta layer reads](https://github.com/neondatabase/neon/blob/292281c9dfb24152b728b1a846cc45105dac7fe0/pageserver/src/tenant/storage_layer/delta_layer.rs#L772-L804)
|
||||
- We hit (and rely) on `PageCache` and `VirtualFile here (not great under pressure)
|
||||
- [Image layer reads](https://github.com/neondatabase/neon/blob/292281c9dfb24152b728b1a846cc45105dac7fe0/pageserver/src/tenant/storage_layer/image_layer.rs#L429-L435)
|
||||
- What needs to happen is the **vectorization of the `blob_io` interface and then the `VirtualFile` API**.
|
||||
- That is tricky because
|
||||
- the `VirtualFile` API, which sits underneath `blob_io`, is being touched by ongoing [io_uring work](https://github.com/neondatabase/neon/pull/5824)
|
||||
- there's the question how IO buffers will be managed; currently this area relies heavily on `PageCache`, but there's controversy around the future of `PageCache`.
|
||||
- The guiding principle here should be to avoid coupling this work to the `PageCache`.
|
||||
- I.e., treat `PageCache` as an extra hop in the I/O chain, rather than as an integral part of buffer management.
|
||||
|
||||
|
||||
Let's see how we can improve by doing the first three items in above list first, then revisit.
|
||||
|
||||
## Rollout / Feature Flags
|
||||
|
||||
No feature flags are required for this epic.
|
||||
|
||||
At the end of this epic, `Timeline::get` forwards to `Timeline::get_vectored`, i.e., it's an all-or-nothing type of change.
|
||||
|
||||
It is encouraged to deliver this feature incrementally, i.e., do many small PRs over multiple weeks.
|
||||
That will help isolate performance regressions across weekly releases.
|
||||
|
||||
# Interaction With Sharding
|
||||
|
||||
[Sharding](https://github.com/neondatabase/neon/pull/5432) splits up the key space, see functions `is_key_local` / `key_to_shard_number`.
|
||||
|
||||
Just as with `Timeline::get`, callers of `Timeline::get_vectored` are responsible for ensuring that they only ask for blocks of the given `struct Timeline`'s shard.
|
||||
|
||||
Given that this is already the case, there shouldn't be significant interaction/interference with sharding.
|
||||
|
||||
However, let's have a safety check for this constraint (error or assertion) because there are currently few affordances at the higher layers of Pageserver for sharding<=>keyspace interaction.
|
||||
For example, `KeySpace` is not broken up by shard stripe, so if someone naively converted the compaction code to issue a vectored get for a keyspace range it would violate this constraint.
|
||||
@@ -129,13 +129,13 @@ Run `poetry shell` to activate the virtual environment.
|
||||
Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`.
|
||||
|
||||
### Obligatory checks
|
||||
We force code formatting via `ruff`, and type hints via `mypy`.
|
||||
We force code formatting via `black`, `ruff`, and type hints via `mypy`.
|
||||
Run the following commands in the repository's root (next to `pyproject.toml`):
|
||||
|
||||
```bash
|
||||
poetry run ruff format . # All code is reformatted
|
||||
poetry run ruff check . # Python linter
|
||||
poetry run mypy . # Ensure there are no typing errors
|
||||
poetry run black . # All code is reformatted
|
||||
poetry run ruff . # Python linter
|
||||
poetry run mypy . # Ensure there are no typing errors
|
||||
```
|
||||
|
||||
**WARNING**: do not run `mypy` from a directory other than the root of the repository.
|
||||
|
||||
@@ -26,13 +26,6 @@ pub struct ComputeSpec {
|
||||
// but we don't use it for anything. Serde will ignore missing fields when
|
||||
// deserializing it.
|
||||
pub operation_uuid: Option<String>,
|
||||
|
||||
/// Compute features to enable. These feature flags are provided, when we
|
||||
/// know all the details about client's compute, so they cannot be used
|
||||
/// to change `Empty` compute behavior.
|
||||
#[serde(default)]
|
||||
pub features: Vec<ComputeFeature>,
|
||||
|
||||
/// Expected cluster state at the end of transition process.
|
||||
pub cluster: Cluster,
|
||||
pub delta_operations: Option<Vec<DeltaOp>>,
|
||||
@@ -73,31 +66,6 @@ pub struct ComputeSpec {
|
||||
|
||||
// information about available remote extensions
|
||||
pub remote_extensions: Option<RemoteExtSpec>,
|
||||
|
||||
pub pgbouncer_settings: Option<HashMap<String, String>>,
|
||||
|
||||
// Stripe size for pageserver sharding, in pages
|
||||
#[serde(default)]
|
||||
pub shard_stripe_size: Option<usize>,
|
||||
}
|
||||
|
||||
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
|
||||
#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum ComputeFeature {
|
||||
// XXX: Add more feature flags here.
|
||||
/// Enable the experimental activity monitor logic, which uses `pg_stat_database` to
|
||||
/// track short-lived connections as user activity.
|
||||
ActivityMonitorExperimental,
|
||||
|
||||
/// Enable running migrations
|
||||
Migrations,
|
||||
|
||||
/// This is a special feature flag that is used to represent unknown feature flags.
|
||||
/// Basically all unknown to enum flags are represented as this one. See unit test
|
||||
/// `parse_unknown_features()` for more details.
|
||||
#[serde(other)]
|
||||
UnknownFeature,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
|
||||
@@ -219,6 +187,8 @@ pub struct DeltaOp {
|
||||
pub struct Role {
|
||||
pub name: PgIdent,
|
||||
pub encrypted_password: Option<String>,
|
||||
pub replication: Option<bool>,
|
||||
pub bypassrls: Option<bool>,
|
||||
pub options: GenericOptions,
|
||||
}
|
||||
|
||||
@@ -259,10 +229,7 @@ mod tests {
|
||||
#[test]
|
||||
fn parse_spec_file() {
|
||||
let file = File::open("tests/cluster_spec.json").unwrap();
|
||||
let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
|
||||
|
||||
// Features list defaults to empty vector.
|
||||
assert!(spec.features.is_empty());
|
||||
let _spec: ComputeSpec = serde_json::from_reader(file).unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -274,41 +241,4 @@ mod tests {
|
||||
ob.insert("unknown_field_123123123".into(), "hello".into());
|
||||
let _spec: ComputeSpec = serde_json::from_value(json).unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_unknown_features() {
|
||||
// Test that unknown feature flags do not cause any errors.
|
||||
let file = File::open("tests/cluster_spec.json").unwrap();
|
||||
let mut json: serde_json::Value = serde_json::from_reader(file).unwrap();
|
||||
let ob = json.as_object_mut().unwrap();
|
||||
|
||||
// Add unknown feature flags.
|
||||
let features = vec!["foo_bar_feature", "baz_feature"];
|
||||
ob.insert("features".into(), features.into());
|
||||
|
||||
let spec: ComputeSpec = serde_json::from_value(json).unwrap();
|
||||
|
||||
assert!(spec.features.len() == 2);
|
||||
assert!(spec.features.contains(&ComputeFeature::UnknownFeature));
|
||||
assert_eq!(spec.features, vec![ComputeFeature::UnknownFeature; 2]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_known_features() {
|
||||
// Test that we can properly parse known feature flags.
|
||||
let file = File::open("tests/cluster_spec.json").unwrap();
|
||||
let mut json: serde_json::Value = serde_json::from_reader(file).unwrap();
|
||||
let ob = json.as_object_mut().unwrap();
|
||||
|
||||
// Add known feature flags.
|
||||
let features = vec!["activity_monitor_experimental"];
|
||||
ob.insert("features".into(), features.into());
|
||||
|
||||
let spec: ComputeSpec = serde_json::from_value(json).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
spec.features,
|
||||
vec![ComputeFeature::ActivityMonitorExperimental]
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -243,9 +243,5 @@
|
||||
"public_extensions": [
|
||||
"postgis"
|
||||
]
|
||||
},
|
||||
"pgbouncer_settings": {
|
||||
"default_pool_size": "42",
|
||||
"pool_mode": "session"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,10 +9,5 @@ prometheus.workspace = true
|
||||
libc.workspace = true
|
||||
once_cell.workspace = true
|
||||
chrono.workspace = true
|
||||
twox-hash.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.8"
|
||||
rand_distr = "0.4.3"
|
||||
|
||||
@@ -1,523 +0,0 @@
|
||||
//! HyperLogLog is an algorithm for the count-distinct problem,
|
||||
//! approximating the number of distinct elements in a multiset.
|
||||
//! Calculating the exact cardinality of the distinct elements
|
||||
//! of a multiset requires an amount of memory proportional to
|
||||
//! the cardinality, which is impractical for very large data sets.
|
||||
//! Probabilistic cardinality estimators, such as the HyperLogLog algorithm,
|
||||
//! use significantly less memory than this, but can only approximate the cardinality.
|
||||
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
hash::{BuildHasher, BuildHasherDefault, Hash, Hasher},
|
||||
sync::{atomic::AtomicU8, Arc, RwLock},
|
||||
};
|
||||
|
||||
use prometheus::{
|
||||
core::{self, Describer},
|
||||
proto, Opts,
|
||||
};
|
||||
use twox_hash::xxh3;
|
||||
|
||||
/// Create an [`HyperLogLogVec`] and registers to default registry.
|
||||
#[macro_export(local_inner_macros)]
|
||||
macro_rules! register_hll_vec {
|
||||
($N:literal, $OPTS:expr, $LABELS_NAMES:expr $(,)?) => {{
|
||||
let hll_vec = $crate::HyperLogLogVec::<$N>::new($OPTS, $LABELS_NAMES).unwrap();
|
||||
$crate::register(Box::new(hll_vec.clone())).map(|_| hll_vec)
|
||||
}};
|
||||
|
||||
($N:literal, $NAME:expr, $HELP:expr, $LABELS_NAMES:expr $(,)?) => {{
|
||||
$crate::register_hll_vec!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
|
||||
}};
|
||||
}
|
||||
|
||||
/// Create an [`HyperLogLog`] and registers to default registry.
|
||||
#[macro_export(local_inner_macros)]
|
||||
macro_rules! register_hll {
|
||||
($N:literal, $OPTS:expr $(,)?) => {{
|
||||
let hll = $crate::HyperLogLog::<$N>::with_opts($OPTS).unwrap();
|
||||
$crate::register(Box::new(hll.clone())).map(|_| hll)
|
||||
}};
|
||||
|
||||
($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{
|
||||
$crate::register_hll!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
|
||||
}};
|
||||
}
|
||||
|
||||
/// HLL is a probabilistic cardinality measure.
|
||||
///
|
||||
/// How to use this time-series for a metric name `my_metrics_total_hll`:
|
||||
///
|
||||
/// ```promql
|
||||
/// # harmonic mean
|
||||
/// 1 / (
|
||||
/// sum (
|
||||
/// 2 ^ -(
|
||||
/// # HLL merge operation
|
||||
/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
|
||||
/// )
|
||||
/// ) without (hll_shard)
|
||||
/// )
|
||||
/// * alpha
|
||||
/// * shards_count
|
||||
/// * shards_count
|
||||
/// ```
|
||||
///
|
||||
/// If you want an estimate over time, you can use the following query:
|
||||
///
|
||||
/// ```promql
|
||||
/// # harmonic mean
|
||||
/// 1 / (
|
||||
/// sum (
|
||||
/// 2 ^ -(
|
||||
/// # HLL merge operation
|
||||
/// max (
|
||||
/// max_over_time(my_metrics_total_hll{}[$__rate_interval])
|
||||
/// ) by (hll_shard, other_labels...)
|
||||
/// )
|
||||
/// ) without (hll_shard)
|
||||
/// )
|
||||
/// * alpha
|
||||
/// * shards_count
|
||||
/// * shards_count
|
||||
/// ```
|
||||
///
|
||||
/// In the case of low cardinality, you might want to use the linear counting approximation:
|
||||
///
|
||||
/// ```promql
|
||||
/// # LinearCounting(m, V) = m log (m / V)
|
||||
/// shards_count * ln(shards_count /
|
||||
/// # calculate V = how many shards contain a 0
|
||||
/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
|
||||
/// )
|
||||
/// ```
|
||||
///
|
||||
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
|
||||
#[derive(Clone)]
|
||||
pub struct HyperLogLogVec<const N: usize> {
|
||||
core: Arc<HyperLogLogVecCore<N>>,
|
||||
}
|
||||
|
||||
struct HyperLogLogVecCore<const N: usize> {
|
||||
pub children: RwLock<HashMap<u64, HyperLogLog<N>, BuildHasherDefault<xxh3::Hash64>>>,
|
||||
pub desc: core::Desc,
|
||||
pub opts: Opts,
|
||||
}
|
||||
|
||||
impl<const N: usize> core::Collector for HyperLogLogVec<N> {
|
||||
fn desc(&self) -> Vec<&core::Desc> {
|
||||
vec![&self.core.desc]
|
||||
}
|
||||
|
||||
fn collect(&self) -> Vec<proto::MetricFamily> {
|
||||
let mut m = proto::MetricFamily::default();
|
||||
m.set_name(self.core.desc.fq_name.clone());
|
||||
m.set_help(self.core.desc.help.clone());
|
||||
m.set_field_type(proto::MetricType::GAUGE);
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
for child in self.core.children.read().unwrap().values() {
|
||||
child.core.collect_into(&mut metrics);
|
||||
}
|
||||
m.set_metric(metrics);
|
||||
|
||||
vec![m]
|
||||
}
|
||||
}
|
||||
|
||||
impl<const N: usize> HyperLogLogVec<N> {
|
||||
/// Create a new [`HyperLogLogVec`] based on the provided
|
||||
/// [`Opts`] and partitioned by the given label names. At least one label name must be
|
||||
/// provided.
|
||||
pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result<Self> {
|
||||
assert!(N.is_power_of_two());
|
||||
let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect();
|
||||
let opts = opts.variable_labels(variable_names);
|
||||
|
||||
let desc = opts.describe()?;
|
||||
let v = HyperLogLogVecCore {
|
||||
children: RwLock::new(HashMap::default()),
|
||||
desc,
|
||||
opts,
|
||||
};
|
||||
|
||||
Ok(Self { core: Arc::new(v) })
|
||||
}
|
||||
|
||||
/// `get_metric_with_label_values` returns the [`HyperLogLog<P>`] for the given slice
|
||||
/// of label values (same order as the VariableLabels in Desc). If that combination of
|
||||
/// label values is accessed for the first time, a new [`HyperLogLog<P>`] is created.
|
||||
///
|
||||
/// An error is returned if the number of label values is not the same as the
|
||||
/// number of VariableLabels in Desc.
|
||||
pub fn get_metric_with_label_values(
|
||||
&self,
|
||||
vals: &[&str],
|
||||
) -> prometheus::Result<HyperLogLog<N>> {
|
||||
self.core.get_metric_with_label_values(vals)
|
||||
}
|
||||
|
||||
/// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
|
||||
/// occurs.
|
||||
pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog<N> {
|
||||
self.get_metric_with_label_values(vals).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
impl<const N: usize> HyperLogLogVecCore<N> {
|
||||
pub fn get_metric_with_label_values(
|
||||
&self,
|
||||
vals: &[&str],
|
||||
) -> prometheus::Result<HyperLogLog<N>> {
|
||||
let h = self.hash_label_values(vals)?;
|
||||
|
||||
if let Some(metric) = self.children.read().unwrap().get(&h).cloned() {
|
||||
return Ok(metric);
|
||||
}
|
||||
|
||||
self.get_or_create_metric(h, vals)
|
||||
}
|
||||
|
||||
pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result<u64> {
|
||||
if vals.len() != self.desc.variable_labels.len() {
|
||||
return Err(prometheus::Error::InconsistentCardinality {
|
||||
expect: self.desc.variable_labels.len(),
|
||||
got: vals.len(),
|
||||
});
|
||||
}
|
||||
|
||||
let mut h = xxh3::Hash64::default();
|
||||
for val in vals {
|
||||
h.write(val.as_bytes());
|
||||
}
|
||||
|
||||
Ok(h.finish())
|
||||
}
|
||||
|
||||
fn get_or_create_metric(
|
||||
&self,
|
||||
hash: u64,
|
||||
label_values: &[&str],
|
||||
) -> prometheus::Result<HyperLogLog<N>> {
|
||||
let mut children = self.children.write().unwrap();
|
||||
// Check exist first.
|
||||
if let Some(metric) = children.get(&hash).cloned() {
|
||||
return Ok(metric);
|
||||
}
|
||||
|
||||
let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?;
|
||||
children.insert(hash, metric.clone());
|
||||
Ok(metric)
|
||||
}
|
||||
}
|
||||
|
||||
/// HLL is a probabilistic cardinality measure.
|
||||
///
|
||||
/// How to use this time-series for a metric name `my_metrics_total_hll`:
|
||||
///
|
||||
/// ```promql
|
||||
/// # harmonic mean
|
||||
/// 1 / (
|
||||
/// sum (
|
||||
/// 2 ^ -(
|
||||
/// # HLL merge operation
|
||||
/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
|
||||
/// )
|
||||
/// ) without (hll_shard)
|
||||
/// )
|
||||
/// * alpha
|
||||
/// * shards_count
|
||||
/// * shards_count
|
||||
/// ```
|
||||
///
|
||||
/// If you want an estimate over time, you can use the following query:
|
||||
///
|
||||
/// ```promql
|
||||
/// # harmonic mean
|
||||
/// 1 / (
|
||||
/// sum (
|
||||
/// 2 ^ -(
|
||||
/// # HLL merge operation
|
||||
/// max (
|
||||
/// max_over_time(my_metrics_total_hll{}[$__rate_interval])
|
||||
/// ) by (hll_shard, other_labels...)
|
||||
/// )
|
||||
/// ) without (hll_shard)
|
||||
/// )
|
||||
/// * alpha
|
||||
/// * shards_count
|
||||
/// * shards_count
|
||||
/// ```
|
||||
///
|
||||
/// In the case of low cardinality, you might want to use the linear counting approximation:
|
||||
///
|
||||
/// ```promql
|
||||
/// # LinearCounting(m, V) = m log (m / V)
|
||||
/// shards_count * ln(shards_count /
|
||||
/// # calculate V = how many shards contain a 0
|
||||
/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
|
||||
/// )
|
||||
/// ```
|
||||
///
|
||||
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
|
||||
#[derive(Clone)]
|
||||
pub struct HyperLogLog<const N: usize> {
|
||||
core: Arc<HyperLogLogCore<N>>,
|
||||
}
|
||||
|
||||
impl<const N: usize> HyperLogLog<N> {
|
||||
/// Create a [`HyperLogLog`] with the `name` and `help` arguments.
|
||||
pub fn new<S1: Into<String>, S2: Into<String>>(name: S1, help: S2) -> prometheus::Result<Self> {
|
||||
assert!(N.is_power_of_two());
|
||||
let opts = Opts::new(name, help);
|
||||
Self::with_opts(opts)
|
||||
}
|
||||
|
||||
/// Create a [`HyperLogLog`] with the `opts` options.
|
||||
pub fn with_opts(opts: Opts) -> prometheus::Result<Self> {
|
||||
Self::with_opts_and_label_values(&opts, &[])
|
||||
}
|
||||
|
||||
fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result<Self> {
|
||||
let desc = opts.describe()?;
|
||||
let labels = make_label_pairs(&desc, label_values)?;
|
||||
|
||||
let v = HyperLogLogCore {
|
||||
shards: [0; N].map(AtomicU8::new),
|
||||
desc,
|
||||
labels,
|
||||
};
|
||||
Ok(Self { core: Arc::new(v) })
|
||||
}
|
||||
|
||||
pub fn measure(&self, item: &impl Hash) {
|
||||
// changing the hasher will break compatibility with previous measurements.
|
||||
self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
|
||||
}
|
||||
|
||||
fn record(&self, hash: u64) {
|
||||
let p = N.ilog2() as u8;
|
||||
let j = hash & (N as u64 - 1);
|
||||
let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
|
||||
self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
struct HyperLogLogCore<const N: usize> {
|
||||
shards: [AtomicU8; N],
|
||||
desc: core::Desc,
|
||||
labels: Vec<proto::LabelPair>,
|
||||
}
|
||||
|
||||
impl<const N: usize> core::Collector for HyperLogLog<N> {
|
||||
fn desc(&self) -> Vec<&core::Desc> {
|
||||
vec![&self.core.desc]
|
||||
}
|
||||
|
||||
fn collect(&self) -> Vec<proto::MetricFamily> {
|
||||
let mut m = proto::MetricFamily::default();
|
||||
m.set_name(self.core.desc.fq_name.clone());
|
||||
m.set_help(self.core.desc.help.clone());
|
||||
m.set_field_type(proto::MetricType::GAUGE);
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
self.core.collect_into(&mut metrics);
|
||||
m.set_metric(metrics);
|
||||
|
||||
vec![m]
|
||||
}
|
||||
}
|
||||
|
||||
impl<const N: usize> HyperLogLogCore<N> {
|
||||
fn collect_into(&self, metrics: &mut Vec<proto::Metric>) {
|
||||
self.shards.iter().enumerate().for_each(|(i, x)| {
|
||||
let mut shard_label = proto::LabelPair::default();
|
||||
shard_label.set_name("hll_shard".to_owned());
|
||||
shard_label.set_value(format!("{i}"));
|
||||
|
||||
// We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus.
|
||||
|
||||
// This seems like it would be a race condition,
|
||||
// but HLL is not impacted by a write in one shard happening in between.
|
||||
// This is because in PromQL we will be implementing a harmonic mean of all buckets.
|
||||
// we will also merge samples in a time series using `max by (hll_shard)`.
|
||||
|
||||
// TODO: maybe we shouldn't reset this on every collect, instead, only after a time window.
|
||||
// this would mean that a dev port-forwarding the metrics url won't break the sampling.
|
||||
let v = x.swap(0, std::sync::atomic::Ordering::Relaxed);
|
||||
|
||||
let mut m = proto::Metric::default();
|
||||
let mut c = proto::Gauge::default();
|
||||
c.set_value(v as f64);
|
||||
m.set_gauge(c);
|
||||
|
||||
let mut labels = Vec::with_capacity(self.labels.len() + 1);
|
||||
labels.extend_from_slice(&self.labels);
|
||||
labels.push(shard_label);
|
||||
|
||||
m.set_label(labels);
|
||||
metrics.push(m);
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn make_label_pairs(
|
||||
desc: &core::Desc,
|
||||
label_values: &[&str],
|
||||
) -> prometheus::Result<Vec<proto::LabelPair>> {
|
||||
if desc.variable_labels.len() != label_values.len() {
|
||||
return Err(prometheus::Error::InconsistentCardinality {
|
||||
expect: desc.variable_labels.len(),
|
||||
got: label_values.len(),
|
||||
});
|
||||
}
|
||||
|
||||
let total_len = desc.variable_labels.len() + desc.const_label_pairs.len();
|
||||
if total_len == 0 {
|
||||
return Ok(vec![]);
|
||||
}
|
||||
|
||||
if desc.variable_labels.is_empty() {
|
||||
return Ok(desc.const_label_pairs.clone());
|
||||
}
|
||||
|
||||
let mut label_pairs = Vec::with_capacity(total_len);
|
||||
for (i, n) in desc.variable_labels.iter().enumerate() {
|
||||
let mut label_pair = proto::LabelPair::default();
|
||||
label_pair.set_name(n.clone());
|
||||
label_pair.set_value(label_values[i].to_owned());
|
||||
label_pairs.push(label_pair);
|
||||
}
|
||||
|
||||
for label_pair in &desc.const_label_pairs {
|
||||
label_pairs.push(label_pair.clone());
|
||||
}
|
||||
label_pairs.sort();
|
||||
Ok(label_pairs)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::HashSet;
|
||||
|
||||
use prometheus::{proto, Opts};
|
||||
use rand::{rngs::StdRng, Rng, SeedableRng};
|
||||
use rand_distr::{Distribution, Zipf};
|
||||
|
||||
use crate::HyperLogLogVec;
|
||||
|
||||
fn collect(hll: &HyperLogLogVec<32>) -> Vec<proto::Metric> {
|
||||
let mut metrics = vec![];
|
||||
hll.core
|
||||
.children
|
||||
.read()
|
||||
.unwrap()
|
||||
.values()
|
||||
.for_each(|c| c.core.collect_into(&mut metrics));
|
||||
metrics
|
||||
}
|
||||
fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 {
|
||||
let mut buckets = [0.0; 32];
|
||||
for metric in metrics.chunks_exact(32) {
|
||||
if filter(&metric[0]) {
|
||||
for (i, m) in metric.iter().enumerate() {
|
||||
buckets[i] = f64::max(buckets[i], m.get_gauge().get_value());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
buckets
|
||||
.into_iter()
|
||||
.map(|f| 2.0f64.powf(-f))
|
||||
.sum::<f64>()
|
||||
.recip()
|
||||
* 0.697
|
||||
* 32.0
|
||||
* 32.0
|
||||
}
|
||||
|
||||
fn test_cardinality(n: usize, dist: impl Distribution<f64>) -> ([usize; 3], [f64; 3]) {
|
||||
let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap();
|
||||
|
||||
let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist);
|
||||
let mut set_a = HashSet::new();
|
||||
let mut set_b = HashSet::new();
|
||||
|
||||
for x in iter.by_ref().take(n) {
|
||||
set_a.insert(x.to_bits());
|
||||
hll.with_label_values(&["a"]).measure(&x.to_bits());
|
||||
}
|
||||
for x in iter.by_ref().take(n) {
|
||||
set_b.insert(x.to_bits());
|
||||
hll.with_label_values(&["b"]).measure(&x.to_bits());
|
||||
}
|
||||
let merge = &set_a | &set_b;
|
||||
|
||||
let metrics = collect(&hll);
|
||||
let len = get_cardinality(&metrics, |_| true);
|
||||
let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a");
|
||||
let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b");
|
||||
|
||||
([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b])
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cardinality_small() {
|
||||
let (actual, estimate) = test_cardinality(100, Zipf::new(100, 1.2f64).unwrap());
|
||||
|
||||
assert_eq!(actual, [46, 30, 32]);
|
||||
assert!(51.3 < estimate[0] && estimate[0] < 51.4);
|
||||
assert!(44.0 < estimate[1] && estimate[1] < 44.1);
|
||||
assert!(39.0 < estimate[2] && estimate[2] < 39.1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cardinality_medium() {
|
||||
let (actual, estimate) = test_cardinality(10000, Zipf::new(10000, 1.2f64).unwrap());
|
||||
|
||||
assert_eq!(actual, [2529, 1618, 1629]);
|
||||
assert!(2309.1 < estimate[0] && estimate[0] < 2309.2);
|
||||
assert!(1566.6 < estimate[1] && estimate[1] < 1566.7);
|
||||
assert!(1629.5 < estimate[2] && estimate[2] < 1629.6);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cardinality_large() {
|
||||
let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(1_000_000, 1.2f64).unwrap());
|
||||
|
||||
assert_eq!(actual, [129077, 79579, 79630]);
|
||||
assert!(126067.2 < estimate[0] && estimate[0] < 126067.3);
|
||||
assert!(83076.8 < estimate[1] && estimate[1] < 83076.9);
|
||||
assert!(64251.2 < estimate[2] && estimate[2] < 64251.3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cardinality_small2() {
|
||||
let (actual, estimate) = test_cardinality(100, Zipf::new(200, 0.8f64).unwrap());
|
||||
|
||||
assert_eq!(actual, [92, 58, 60]);
|
||||
assert!(116.1 < estimate[0] && estimate[0] < 116.2);
|
||||
assert!(81.7 < estimate[1] && estimate[1] < 81.8);
|
||||
assert!(69.3 < estimate[2] && estimate[2] < 69.4);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cardinality_medium2() {
|
||||
let (actual, estimate) = test_cardinality(10000, Zipf::new(20000, 0.8f64).unwrap());
|
||||
|
||||
assert_eq!(actual, [8201, 5131, 5051]);
|
||||
assert!(6846.4 < estimate[0] && estimate[0] < 6846.5);
|
||||
assert!(5239.1 < estimate[1] && estimate[1] < 5239.2);
|
||||
assert!(4292.8 < estimate[2] && estimate[2] < 4292.9);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cardinality_large2() {
|
||||
let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(2_000_000, 0.8f64).unwrap());
|
||||
|
||||
assert_eq!(actual, [777847, 482069, 482246]);
|
||||
assert!(699437.4 < estimate[0] && estimate[0] < 699437.5);
|
||||
assert!(374948.9 < estimate[1] && estimate[1] < 374949.0);
|
||||
assert!(434609.7 < estimate[2] && estimate[2] < 434609.8);
|
||||
}
|
||||
}
|
||||
@@ -3,11 +3,8 @@
|
||||
//! Otherwise, we might not see all metrics registered via
|
||||
//! a default registry.
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use prometheus::core::{
|
||||
Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
|
||||
};
|
||||
use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
|
||||
pub use prometheus::opts;
|
||||
pub use prometheus::register;
|
||||
pub use prometheus::Error;
|
||||
@@ -28,9 +25,7 @@ use prometheus::{Registry, Result};
|
||||
pub mod launch_timestamp;
|
||||
mod wrappers;
|
||||
pub use wrappers::{CountedReader, CountedWriter};
|
||||
mod hll;
|
||||
pub mod metric_vec_duration;
|
||||
pub use hll::{HyperLogLog, HyperLogLogVec};
|
||||
|
||||
pub type UIntGauge = GenericGauge<AtomicU64>;
|
||||
pub type UIntGaugeVec = GenericGaugeVec<AtomicU64>;
|
||||
@@ -137,137 +132,3 @@ fn get_rusage_stats() -> libc::rusage {
|
||||
rusage.assume_init()
|
||||
}
|
||||
}
|
||||
|
||||
/// Create an [`IntCounterPairVec`] and registers to default registry.
|
||||
#[macro_export(local_inner_macros)]
|
||||
macro_rules! register_int_counter_pair_vec {
|
||||
($NAME1:expr, $HELP1:expr, $NAME2:expr, $HELP2:expr, $LABELS_NAMES:expr $(,)?) => {{
|
||||
match (
|
||||
$crate::register_int_counter_vec!($NAME1, $HELP1, $LABELS_NAMES),
|
||||
$crate::register_int_counter_vec!($NAME2, $HELP2, $LABELS_NAMES),
|
||||
) {
|
||||
(Ok(inc), Ok(dec)) => Ok($crate::IntCounterPairVec::new(inc, dec)),
|
||||
(Err(e), _) | (_, Err(e)) => Err(e),
|
||||
}
|
||||
}};
|
||||
}
|
||||
/// Create an [`IntCounterPair`] and registers to default registry.
|
||||
#[macro_export(local_inner_macros)]
|
||||
macro_rules! register_int_counter_pair {
|
||||
($NAME1:expr, $HELP1:expr, $NAME2:expr, $HELP2:expr $(,)?) => {{
|
||||
match (
|
||||
$crate::register_int_counter!($NAME1, $HELP1),
|
||||
$crate::register_int_counter!($NAME2, $HELP2),
|
||||
) {
|
||||
(Ok(inc), Ok(dec)) => Ok($crate::IntCounterPair::new(inc, dec)),
|
||||
(Err(e), _) | (_, Err(e)) => Err(e),
|
||||
}
|
||||
}};
|
||||
}
|
||||
|
||||
/// A Pair of [`GenericCounterVec`]s. Like an [`GenericGaugeVec`] but will always observe changes
|
||||
pub struct GenericCounterPairVec<P: Atomic> {
|
||||
inc: GenericCounterVec<P>,
|
||||
dec: GenericCounterVec<P>,
|
||||
}
|
||||
|
||||
/// A Pair of [`GenericCounter`]s. Like an [`GenericGauge`] but will always observe changes
|
||||
pub struct GenericCounterPair<P: Atomic> {
|
||||
inc: GenericCounter<P>,
|
||||
dec: GenericCounter<P>,
|
||||
}
|
||||
|
||||
impl<P: Atomic> GenericCounterPairVec<P> {
|
||||
pub fn new(inc: GenericCounterVec<P>, dec: GenericCounterVec<P>) -> Self {
|
||||
Self { inc, dec }
|
||||
}
|
||||
|
||||
/// `get_metric_with_label_values` returns the [`GenericCounterPair<P>`] for the given slice
|
||||
/// of label values (same order as the VariableLabels in Desc). If that combination of
|
||||
/// label values is accessed for the first time, a new [`GenericCounterPair<P>`] is created.
|
||||
///
|
||||
/// An error is returned if the number of label values is not the same as the
|
||||
/// number of VariableLabels in Desc.
|
||||
pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result<GenericCounterPair<P>> {
|
||||
Ok(GenericCounterPair {
|
||||
inc: self.inc.get_metric_with_label_values(vals)?,
|
||||
dec: self.dec.get_metric_with_label_values(vals)?,
|
||||
})
|
||||
}
|
||||
|
||||
/// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
|
||||
/// occurs.
|
||||
pub fn with_label_values(&self, vals: &[&str]) -> GenericCounterPair<P> {
|
||||
self.get_metric_with_label_values(vals).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
impl<P: Atomic> GenericCounterPair<P> {
|
||||
pub fn new(inc: GenericCounter<P>, dec: GenericCounter<P>) -> Self {
|
||||
Self { inc, dec }
|
||||
}
|
||||
|
||||
/// Increment the gauge by 1, returning a guard that decrements by 1 on drop.
|
||||
pub fn guard(&self) -> GenericCounterPairGuard<P> {
|
||||
self.inc.inc();
|
||||
GenericCounterPairGuard(self.dec.clone())
|
||||
}
|
||||
|
||||
/// Increment the gauge by n, returning a guard that decrements by n on drop.
|
||||
pub fn guard_by(&self, n: P::T) -> GenericCounterPairGuardBy<P> {
|
||||
self.inc.inc_by(n);
|
||||
GenericCounterPairGuardBy(self.dec.clone(), n)
|
||||
}
|
||||
|
||||
/// Increase the gauge by 1.
|
||||
#[inline]
|
||||
pub fn inc(&self) {
|
||||
self.inc.inc();
|
||||
}
|
||||
|
||||
/// Decrease the gauge by 1.
|
||||
#[inline]
|
||||
pub fn dec(&self) {
|
||||
self.dec.inc();
|
||||
}
|
||||
|
||||
/// Add the given value to the gauge. (The value can be
|
||||
/// negative, resulting in a decrement of the gauge.)
|
||||
#[inline]
|
||||
pub fn inc_by(&self, v: P::T) {
|
||||
self.inc.inc_by(v);
|
||||
}
|
||||
|
||||
/// Subtract the given value from the gauge. (The value can be
|
||||
/// negative, resulting in an increment of the gauge.)
|
||||
#[inline]
|
||||
pub fn dec_by(&self, v: P::T) {
|
||||
self.dec.inc_by(v);
|
||||
}
|
||||
}
|
||||
|
||||
/// Guard returned by [`GenericCounterPair::guard`]
|
||||
pub struct GenericCounterPairGuard<P: Atomic>(GenericCounter<P>);
|
||||
|
||||
impl<P: Atomic> Drop for GenericCounterPairGuard<P> {
|
||||
fn drop(&mut self) {
|
||||
self.0.inc();
|
||||
}
|
||||
}
|
||||
/// Guard returned by [`GenericCounterPair::guard_by`]
|
||||
pub struct GenericCounterPairGuardBy<P: Atomic>(GenericCounter<P>, P::T);
|
||||
|
||||
impl<P: Atomic> Drop for GenericCounterPairGuardBy<P> {
|
||||
fn drop(&mut self) {
|
||||
self.0.inc_by(self.1);
|
||||
}
|
||||
}
|
||||
|
||||
/// A Pair of [`IntCounterVec`]s. Like an [`IntGaugeVec`] but will always observe changes
|
||||
pub type IntCounterPairVec = GenericCounterPairVec<AtomicU64>;
|
||||
|
||||
/// A Pair of [`IntCounter`]s. Like an [`IntGauge`] but will always observe changes
|
||||
pub type IntCounterPair = GenericCounterPair<AtomicU64>;
|
||||
|
||||
/// A guard for [`IntCounterPair`] that will decrement the gauge on drop
|
||||
pub type IntCounterPairGuard = GenericCounterPairGuard<AtomicU64>;
|
||||
|
||||
@@ -18,12 +18,8 @@ enum-map.workspace = true
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
hex.workspace = true
|
||||
thiserror.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
chrono.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
bincode.workspace = true
|
||||
rand.workspace = true
|
||||
|
||||
@@ -4,9 +4,7 @@
|
||||
//! See docs/rfcs/025-generation-numbers.md
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::NodeId;
|
||||
|
||||
use crate::shard::TenantShardId;
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ReAttachRequest {
|
||||
@@ -15,7 +13,7 @@ pub struct ReAttachRequest {
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ReAttachResponseTenant {
|
||||
pub id: TenantShardId,
|
||||
pub id: TenantId,
|
||||
pub gen: u32,
|
||||
}
|
||||
|
||||
@@ -26,7 +24,7 @@ pub struct ReAttachResponse {
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ValidateRequestTenant {
|
||||
pub id: TenantShardId,
|
||||
pub id: TenantId,
|
||||
pub gen: u32,
|
||||
}
|
||||
|
||||
@@ -42,6 +40,6 @@ pub struct ValidateResponse {
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ValidateResponseTenant {
|
||||
pub id: TenantShardId,
|
||||
pub id: TenantId,
|
||||
pub valid: bool,
|
||||
}
|
||||
|
||||
@@ -1,11 +1,7 @@
|
||||
use anyhow::{bail, Result};
|
||||
use byteorder::{ByteOrder, BE};
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi::{Oid, TransactionId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{fmt, ops::Range};
|
||||
|
||||
use crate::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
use std::fmt;
|
||||
|
||||
/// Key used in the Repository kv-store.
|
||||
///
|
||||
@@ -144,436 +140,3 @@ impl Key {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Layout of the Key address space
|
||||
//
|
||||
// The Key struct, used to address the underlying key-value store, consists of
|
||||
// 18 bytes, split into six fields. See 'Key' in repository.rs. We need to map
|
||||
// all the data and metadata keys into those 18 bytes.
|
||||
//
|
||||
// Principles for the mapping:
|
||||
//
|
||||
// - Things that are often accessed or modified together, should be close to
|
||||
// each other in the key space. For example, if a relation is extended by one
|
||||
// block, we create a new key-value pair for the block data, and update the
|
||||
// relation size entry. Because of that, the RelSize key comes after all the
|
||||
// RelBlocks of a relation: the RelSize and the last RelBlock are always next
|
||||
// to each other.
|
||||
//
|
||||
// The key space is divided into four major sections, identified by the first
|
||||
// byte, and the form a hierarchy:
|
||||
//
|
||||
// 00 Relation data and metadata
|
||||
//
|
||||
// DbDir () -> (dbnode, spcnode)
|
||||
// Filenodemap
|
||||
// RelDir -> relnode forknum
|
||||
// RelBlocks
|
||||
// RelSize
|
||||
//
|
||||
// 01 SLRUs
|
||||
//
|
||||
// SlruDir kind
|
||||
// SlruSegBlocks segno
|
||||
// SlruSegSize
|
||||
//
|
||||
// 02 pg_twophase
|
||||
//
|
||||
// 03 misc
|
||||
// Controlfile
|
||||
// checkpoint
|
||||
// pg_version
|
||||
//
|
||||
// 04 aux files
|
||||
//
|
||||
// Below is a full list of the keyspace allocation:
|
||||
//
|
||||
// DbDir:
|
||||
// 00 00000000 00000000 00000000 00 00000000
|
||||
//
|
||||
// Filenodemap:
|
||||
// 00 SPCNODE DBNODE 00000000 00 00000000
|
||||
//
|
||||
// RelDir:
|
||||
// 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0)
|
||||
//
|
||||
// RelBlock:
|
||||
// 00 SPCNODE DBNODE RELNODE FORK BLKNUM
|
||||
//
|
||||
// RelSize:
|
||||
// 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF
|
||||
//
|
||||
// SlruDir:
|
||||
// 01 kind 00000000 00000000 00 00000000
|
||||
//
|
||||
// SlruSegBlock:
|
||||
// 01 kind 00000001 SEGNO 00 BLKNUM
|
||||
//
|
||||
// SlruSegSize:
|
||||
// 01 kind 00000001 SEGNO 00 FFFFFFFF
|
||||
//
|
||||
// TwoPhaseDir:
|
||||
// 02 00000000 00000000 00000000 00 00000000
|
||||
//
|
||||
// TwoPhaseFile:
|
||||
// 02 00000000 00000000 00000000 00 XID
|
||||
//
|
||||
// ControlFile:
|
||||
// 03 00000000 00000000 00000000 00 00000000
|
||||
//
|
||||
// Checkpoint:
|
||||
// 03 00000000 00000000 00000000 00 00000001
|
||||
//
|
||||
// AuxFiles:
|
||||
// 03 00000000 00000000 00000000 00 00000002
|
||||
//
|
||||
|
||||
//-- Section 01: relation data and metadata
|
||||
|
||||
pub const DBDIR_KEY: Key = Key {
|
||||
field1: 0x00,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
};
|
||||
|
||||
#[inline(always)]
|
||||
pub fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range<Key> {
|
||||
Key {
|
||||
field1: 0x00,
|
||||
field2: spcnode,
|
||||
field3: dbnode,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
}..Key {
|
||||
field1: 0x00,
|
||||
field2: spcnode,
|
||||
field3: dbnode,
|
||||
field4: 0xffffffff,
|
||||
field5: 0xff,
|
||||
field6: 0xffffffff,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key {
|
||||
Key {
|
||||
field1: 0x00,
|
||||
field2: spcnode,
|
||||
field3: dbnode,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
|
||||
Key {
|
||||
field1: 0x00,
|
||||
field2: spcnode,
|
||||
field3: dbnode,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 1,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
|
||||
Key {
|
||||
field1: 0x00,
|
||||
field2: rel.spcnode,
|
||||
field3: rel.dbnode,
|
||||
field4: rel.relnode,
|
||||
field5: rel.forknum,
|
||||
field6: blknum,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn rel_size_to_key(rel: RelTag) -> Key {
|
||||
Key {
|
||||
field1: 0x00,
|
||||
field2: rel.spcnode,
|
||||
field3: rel.dbnode,
|
||||
field4: rel.relnode,
|
||||
field5: rel.forknum,
|
||||
field6: 0xffffffff,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn rel_key_range(rel: RelTag) -> Range<Key> {
|
||||
Key {
|
||||
field1: 0x00,
|
||||
field2: rel.spcnode,
|
||||
field3: rel.dbnode,
|
||||
field4: rel.relnode,
|
||||
field5: rel.forknum,
|
||||
field6: 0,
|
||||
}..Key {
|
||||
field1: 0x00,
|
||||
field2: rel.spcnode,
|
||||
field3: rel.dbnode,
|
||||
field4: rel.relnode,
|
||||
field5: rel.forknum + 1,
|
||||
field6: 0,
|
||||
}
|
||||
}
|
||||
|
||||
//-- Section 02: SLRUs
|
||||
|
||||
#[inline(always)]
|
||||
pub fn slru_dir_to_key(kind: SlruKind) -> Key {
|
||||
Key {
|
||||
field1: 0x01,
|
||||
field2: match kind {
|
||||
SlruKind::Clog => 0x00,
|
||||
SlruKind::MultiXactMembers => 0x01,
|
||||
SlruKind::MultiXactOffsets => 0x02,
|
||||
},
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
|
||||
Key {
|
||||
field1: 0x01,
|
||||
field2: match kind {
|
||||
SlruKind::Clog => 0x00,
|
||||
SlruKind::MultiXactMembers => 0x01,
|
||||
SlruKind::MultiXactOffsets => 0x02,
|
||||
},
|
||||
field3: 1,
|
||||
field4: segno,
|
||||
field5: 0,
|
||||
field6: blknum,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
|
||||
Key {
|
||||
field1: 0x01,
|
||||
field2: match kind {
|
||||
SlruKind::Clog => 0x00,
|
||||
SlruKind::MultiXactMembers => 0x01,
|
||||
SlruKind::MultiXactOffsets => 0x02,
|
||||
},
|
||||
field3: 1,
|
||||
field4: segno,
|
||||
field5: 0,
|
||||
field6: 0xffffffff,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range<Key> {
|
||||
let field2 = match kind {
|
||||
SlruKind::Clog => 0x00,
|
||||
SlruKind::MultiXactMembers => 0x01,
|
||||
SlruKind::MultiXactOffsets => 0x02,
|
||||
};
|
||||
|
||||
Key {
|
||||
field1: 0x01,
|
||||
field2,
|
||||
field3: 1,
|
||||
field4: segno,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
}..Key {
|
||||
field1: 0x01,
|
||||
field2,
|
||||
field3: 1,
|
||||
field4: segno,
|
||||
field5: 1,
|
||||
field6: 0,
|
||||
}
|
||||
}
|
||||
|
||||
//-- Section 03: pg_twophase
|
||||
|
||||
pub const TWOPHASEDIR_KEY: Key = Key {
|
||||
field1: 0x02,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
};
|
||||
|
||||
#[inline(always)]
|
||||
pub fn twophase_file_key(xid: TransactionId) -> Key {
|
||||
Key {
|
||||
field1: 0x02,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: xid,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn twophase_key_range(xid: TransactionId) -> Range<Key> {
|
||||
let (next_xid, overflowed) = xid.overflowing_add(1);
|
||||
|
||||
Key {
|
||||
field1: 0x02,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: xid,
|
||||
}..Key {
|
||||
field1: 0x02,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: u8::from(overflowed),
|
||||
field6: next_xid,
|
||||
}
|
||||
}
|
||||
|
||||
//-- Section 03: Control file
|
||||
pub const CONTROLFILE_KEY: Key = Key {
|
||||
field1: 0x03,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
};
|
||||
|
||||
pub const CHECKPOINT_KEY: Key = Key {
|
||||
field1: 0x03,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 1,
|
||||
};
|
||||
|
||||
pub const AUX_FILES_KEY: Key = Key {
|
||||
field1: 0x03,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 2,
|
||||
};
|
||||
|
||||
// Reverse mappings for a few Keys.
|
||||
// These are needed by WAL redo manager.
|
||||
|
||||
// AUX_FILES currently stores only data for logical replication (slots etc), and
|
||||
// we don't preserve these on a branch because safekeepers can't follow timeline
|
||||
// switch (and generally it likely should be optional), so ignore these.
|
||||
#[inline(always)]
|
||||
pub fn is_inherited_key(key: Key) -> bool {
|
||||
key != AUX_FILES_KEY
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn is_rel_fsm_block_key(key: Key) -> bool {
|
||||
key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn is_rel_vm_block_key(key: Key) -> bool {
|
||||
key.field1 == 0x00
|
||||
&& key.field4 != 0
|
||||
&& key.field5 == VISIBILITYMAP_FORKNUM
|
||||
&& key.field6 != 0xffffffff
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
|
||||
Ok(match key.field1 {
|
||||
0x01 => {
|
||||
let kind = match key.field2 {
|
||||
0x00 => SlruKind::Clog,
|
||||
0x01 => SlruKind::MultiXactMembers,
|
||||
0x02 => SlruKind::MultiXactOffsets,
|
||||
_ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
|
||||
};
|
||||
let segno = key.field4;
|
||||
let blknum = key.field6;
|
||||
|
||||
(kind, segno, blknum)
|
||||
}
|
||||
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
|
||||
})
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn is_slru_block_key(key: Key) -> bool {
|
||||
key.field1 == 0x01 // SLRU-related
|
||||
&& key.field3 == 0x00000001 // but not SlruDir
|
||||
&& key.field6 != 0xffffffff // and not SlruSegSize
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn is_rel_block_key(key: &Key) -> bool {
|
||||
key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff
|
||||
}
|
||||
|
||||
/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`.
|
||||
#[inline(always)]
|
||||
pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
|
||||
Ok(match key.field1 {
|
||||
0x00 => (
|
||||
RelTag {
|
||||
spcnode: key.field2,
|
||||
dbnode: key.field3,
|
||||
relnode: key.field4,
|
||||
forknum: key.field5,
|
||||
},
|
||||
key.field6,
|
||||
),
|
||||
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
|
||||
})
|
||||
}
|
||||
|
||||
impl std::str::FromStr for Key {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
|
||||
Self::from_hex(s)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::str::FromStr;
|
||||
|
||||
use crate::key::Key;
|
||||
|
||||
use rand::Rng;
|
||||
use rand::SeedableRng;
|
||||
|
||||
#[test]
|
||||
fn display_fromstr_bijection() {
|
||||
let mut rng = rand::rngs::StdRng::seed_from_u64(42);
|
||||
|
||||
let key = Key {
|
||||
field1: rng.gen(),
|
||||
field2: rng.gen(),
|
||||
field3: rng.gen(),
|
||||
field4: rng.gen(),
|
||||
field5: rng.gen(),
|
||||
field6: rng.gen(),
|
||||
};
|
||||
|
||||
assert_eq!(key, Key::from_str(&format!("{key}")).unwrap());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,7 +5,6 @@ use const_format::formatcp;
|
||||
/// Public API types
|
||||
pub mod control_api;
|
||||
pub mod key;
|
||||
pub mod keyspace;
|
||||
pub mod models;
|
||||
pub mod reltag;
|
||||
pub mod shard;
|
||||
|
||||
@@ -1,30 +1,24 @@
|
||||
pub mod partitioning;
|
||||
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
io::{BufRead, Read},
|
||||
num::{NonZeroU64, NonZeroUsize},
|
||||
time::{Duration, SystemTime},
|
||||
time::SystemTime,
|
||||
};
|
||||
|
||||
use byteorder::{BigEndian, ReadBytesExt};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::serde_as;
|
||||
use strum_macros;
|
||||
use utils::{
|
||||
completion,
|
||||
generation::Generation,
|
||||
history_buffer::HistoryBufferWithDropCounter,
|
||||
id::{NodeId, TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::{
|
||||
reltag::RelTag,
|
||||
shard::{ShardCount, ShardStripeSize, TenantShardId},
|
||||
};
|
||||
use crate::{reltag::RelTag, shard::TenantShardId};
|
||||
use anyhow::bail;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
|
||||
/// The state of a tenant in this pageserver.
|
||||
///
|
||||
@@ -186,37 +180,10 @@ pub struct TimelineCreateRequest {
|
||||
#[serde(default)]
|
||||
pub ancestor_timeline_id: Option<TimelineId>,
|
||||
#[serde(default)]
|
||||
pub existing_initdb_timeline_id: Option<TimelineId>,
|
||||
#[serde(default)]
|
||||
pub ancestor_start_lsn: Option<Lsn>,
|
||||
pub pg_version: Option<u32>,
|
||||
}
|
||||
|
||||
/// Parameters that apply to all shards in a tenant. Used during tenant creation.
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct ShardParameters {
|
||||
pub count: ShardCount,
|
||||
pub stripe_size: ShardStripeSize,
|
||||
}
|
||||
|
||||
impl ShardParameters {
|
||||
pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
|
||||
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.count == ShardCount(0)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ShardParameters {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
count: ShardCount(0),
|
||||
stripe_size: Self::DEFAULT_STRIPE_SIZE,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantCreateRequest {
|
||||
@@ -224,12 +191,6 @@ pub struct TenantCreateRequest {
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub generation: Option<u32>,
|
||||
|
||||
// If omitted, create a single shard with TenantShardId::unsharded()
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "ShardParameters::is_unsharded")]
|
||||
pub shard_parameters: ShardParameters,
|
||||
|
||||
#[serde(flatten)]
|
||||
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
}
|
||||
@@ -252,7 +213,7 @@ impl std::ops::Deref for TenantCreateRequest {
|
||||
|
||||
/// An alternative representation of `pageserver::tenant::TenantConf` with
|
||||
/// simpler types.
|
||||
#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
|
||||
#[derive(Serialize, Deserialize, Debug, Default)]
|
||||
pub struct TenantConfig {
|
||||
pub checkpoint_distance: Option<u64>,
|
||||
pub checkpoint_timeout: Option<String>,
|
||||
@@ -267,42 +228,20 @@ pub struct TenantConfig {
|
||||
pub lagging_wal_timeout: Option<String>,
|
||||
pub max_lsn_wal_lag: Option<NonZeroU64>,
|
||||
pub trace_read_requests: Option<bool>,
|
||||
pub eviction_policy: Option<EvictionPolicy>,
|
||||
// We defer the parsing of the eviction_policy field to the request handler.
|
||||
// Otherwise we'd have to move the types for eviction policy into this package.
|
||||
// We might do that once the eviction feature has stabilizied.
|
||||
// For now, this field is not even documented in the openapi_spec.yml.
|
||||
pub eviction_policy: Option<serde_json::Value>,
|
||||
pub min_resident_size_override: Option<u64>,
|
||||
pub evictions_low_residence_duration_metric_threshold: Option<String>,
|
||||
pub gc_feedback: Option<bool>,
|
||||
pub heatmap_period: Option<String>,
|
||||
pub lazy_slru_download: Option<bool>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(tag = "kind")]
|
||||
pub enum EvictionPolicy {
|
||||
NoEviction,
|
||||
LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
|
||||
}
|
||||
|
||||
impl EvictionPolicy {
|
||||
pub fn discriminant_str(&self) -> &'static str {
|
||||
match self {
|
||||
EvictionPolicy::NoEviction => "NoEviction",
|
||||
EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct EvictionPolicyLayerAccessThreshold {
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub period: Duration,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub threshold: Duration,
|
||||
}
|
||||
|
||||
/// A flattened analog of a `pagesever::tenant::LocationMode`, which
|
||||
/// lists out all possible states (and the virtual "Detached" state)
|
||||
/// in a flat form rather than using rust-style enums.
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub enum LocationConfigMode {
|
||||
AttachedSingle,
|
||||
AttachedMulti,
|
||||
@@ -311,44 +250,27 @@ pub enum LocationConfigMode {
|
||||
Detached,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct LocationConfigSecondary {
|
||||
pub warm: bool,
|
||||
}
|
||||
|
||||
/// An alternative representation of `pageserver::tenant::LocationConf`,
|
||||
/// for use in external-facing APIs.
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct LocationConfig {
|
||||
pub mode: LocationConfigMode,
|
||||
/// If attaching, in what generation?
|
||||
#[serde(default)]
|
||||
pub generation: Option<u32>,
|
||||
|
||||
// If requesting mode `Secondary`, configuration for that.
|
||||
pub generation: Option<Generation>,
|
||||
#[serde(default)]
|
||||
pub secondary_conf: Option<LocationConfigSecondary>,
|
||||
|
||||
// Shard parameters: if shard_count is nonzero, then other shard_* fields
|
||||
// must be set accurately.
|
||||
#[serde(default)]
|
||||
pub shard_number: u8,
|
||||
#[serde(default)]
|
||||
pub shard_count: u8,
|
||||
#[serde(default)]
|
||||
pub shard_stripe_size: u32,
|
||||
|
||||
// This configuration only affects attached mode, but should be provided irrespective
|
||||
// of the mode, as a secondary location might transition on startup if the response
|
||||
// to the `/re-attach` control plane API requests it.
|
||||
// If requesting mode `Secondary`, configuration for that.
|
||||
// Custom storage configuration for the tenant, if any
|
||||
pub tenant_conf: TenantConfig,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct LocationConfigListResponse {
|
||||
pub tenant_shards: Vec<(TenantShardId, Option<LocationConfig>)>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct TenantCreateResponse(pub TenantId);
|
||||
@@ -361,24 +283,11 @@ pub struct StatusResponse {
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantLocationConfigRequest {
|
||||
pub tenant_id: TenantShardId,
|
||||
pub tenant_id: TenantId,
|
||||
#[serde(flatten)]
|
||||
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantShardLocation {
|
||||
pub shard_id: TenantShardId,
|
||||
pub node_id: NodeId,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantLocationConfigResponse {
|
||||
pub shards: Vec<TenantShardLocation>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantConfigRequest {
|
||||
@@ -397,14 +306,31 @@ impl std::ops::Deref for TenantConfigRequest {
|
||||
|
||||
impl TenantConfigRequest {
|
||||
pub fn new(tenant_id: TenantId) -> TenantConfigRequest {
|
||||
let config = TenantConfig::default();
|
||||
let config = TenantConfig {
|
||||
checkpoint_distance: None,
|
||||
checkpoint_timeout: None,
|
||||
compaction_target_size: None,
|
||||
compaction_period: None,
|
||||
compaction_threshold: None,
|
||||
gc_horizon: None,
|
||||
gc_period: None,
|
||||
image_creation_threshold: None,
|
||||
pitr_interval: None,
|
||||
walreceiver_connect_timeout: None,
|
||||
lagging_wal_timeout: None,
|
||||
max_lsn_wal_lag: None,
|
||||
trace_read_requests: None,
|
||||
eviction_policy: None,
|
||||
min_resident_size_override: None,
|
||||
evictions_low_residence_duration_metric_threshold: None,
|
||||
gc_feedback: None,
|
||||
};
|
||||
TenantConfigRequest { tenant_id, config }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct TenantAttachRequest {
|
||||
#[serde(default)]
|
||||
pub config: TenantAttachConfig,
|
||||
#[serde(default)]
|
||||
pub generation: Option<u32>,
|
||||
@@ -412,7 +338,7 @@ pub struct TenantAttachRequest {
|
||||
|
||||
/// Newtype to enforce deny_unknown_fields on TenantConfig for
|
||||
/// its usage inside `TenantAttachRequest`.
|
||||
#[derive(Debug, Serialize, Deserialize, Default)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantAttachConfig {
|
||||
#[serde(flatten)]
|
||||
@@ -438,31 +364,19 @@ pub enum TenantAttachmentStatus {
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct TenantInfo {
|
||||
pub id: TenantShardId,
|
||||
pub id: TenantId,
|
||||
// NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
|
||||
pub state: TenantState,
|
||||
/// Sum of the size of all layer files.
|
||||
/// If a layer is present in both local FS and S3, it counts only once.
|
||||
pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
|
||||
pub attachment_status: TenantAttachmentStatus,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub generation: Option<u32>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct TenantDetails {
|
||||
#[serde(flatten)]
|
||||
pub tenant_info: TenantInfo,
|
||||
|
||||
pub walredo: Option<WalRedoManagerStatus>,
|
||||
|
||||
pub timelines: Vec<TimelineId>,
|
||||
}
|
||||
|
||||
/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct TimelineInfo {
|
||||
pub tenant_id: TenantShardId,
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
|
||||
pub ancestor_timeline_id: Option<TimelineId>,
|
||||
@@ -478,12 +392,7 @@ pub struct TimelineInfo {
|
||||
/// The LSN that we are advertizing to safekeepers
|
||||
pub remote_consistent_lsn_visible: Lsn,
|
||||
|
||||
/// The LSN from the start of the root timeline (never changes)
|
||||
pub initdb_lsn: Lsn,
|
||||
|
||||
pub current_logical_size: u64,
|
||||
pub current_logical_size_is_accurate: bool,
|
||||
|
||||
pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
|
||||
/// Sum of the size of all layer files.
|
||||
/// If a layer is present in both local FS and S3, it counts only once.
|
||||
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
|
||||
@@ -638,17 +547,24 @@ pub enum DownloadRemoteLayersTaskState {
|
||||
ShutDown,
|
||||
}
|
||||
|
||||
pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
|
||||
|
||||
/// Information for configuring a single fail point
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct FailpointConfig {
|
||||
/// Name of the fail point
|
||||
pub name: String,
|
||||
/// List of actions to take, using the format described in `fail::cfg`
|
||||
///
|
||||
/// We also support `actions = "exit"` to cause the fail point to immediately exit.
|
||||
pub actions: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct TimelineGcRequest {
|
||||
pub gc_horizon: Option<u64>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct WalRedoManagerStatus {
|
||||
pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
|
||||
pub pid: Option<u32>,
|
||||
}
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
#[derive(PartialEq, Eq, Debug)]
|
||||
pub enum PagestreamFeMessage {
|
||||
@@ -656,43 +572,15 @@ pub enum PagestreamFeMessage {
|
||||
Nblocks(PagestreamNblocksRequest),
|
||||
GetPage(PagestreamGetPageRequest),
|
||||
DbSize(PagestreamDbSizeRequest),
|
||||
GetSlruSegment(PagestreamGetSlruSegmentRequest),
|
||||
}
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
#[derive(strum_macros::EnumProperty)]
|
||||
pub enum PagestreamBeMessage {
|
||||
Exists(PagestreamExistsResponse),
|
||||
Nblocks(PagestreamNblocksResponse),
|
||||
GetPage(PagestreamGetPageResponse),
|
||||
Error(PagestreamErrorResponse),
|
||||
DbSize(PagestreamDbSizeResponse),
|
||||
GetSlruSegment(PagestreamGetSlruSegmentResponse),
|
||||
}
|
||||
|
||||
// Keep in sync with `pagestore_client.h`
|
||||
#[repr(u8)]
|
||||
enum PagestreamBeMessageTag {
|
||||
Exists = 100,
|
||||
Nblocks = 101,
|
||||
GetPage = 102,
|
||||
Error = 103,
|
||||
DbSize = 104,
|
||||
GetSlruSegment = 105,
|
||||
}
|
||||
impl TryFrom<u8> for PagestreamBeMessageTag {
|
||||
type Error = u8;
|
||||
fn try_from(value: u8) -> Result<Self, u8> {
|
||||
match value {
|
||||
100 => Ok(PagestreamBeMessageTag::Exists),
|
||||
101 => Ok(PagestreamBeMessageTag::Nblocks),
|
||||
102 => Ok(PagestreamBeMessageTag::GetPage),
|
||||
103 => Ok(PagestreamBeMessageTag::Error),
|
||||
104 => Ok(PagestreamBeMessageTag::DbSize),
|
||||
105 => Ok(PagestreamBeMessageTag::GetSlruSegment),
|
||||
_ => Err(value),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
@@ -724,14 +612,6 @@ pub struct PagestreamDbSizeRequest {
|
||||
pub dbnode: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub struct PagestreamGetSlruSegmentRequest {
|
||||
pub latest: bool,
|
||||
pub lsn: Lsn,
|
||||
pub kind: u8,
|
||||
pub segno: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamExistsResponse {
|
||||
pub exists: bool,
|
||||
@@ -747,11 +627,6 @@ pub struct PagestreamGetPageResponse {
|
||||
pub page: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamGetSlruSegmentResponse {
|
||||
pub segment: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamErrorResponse {
|
||||
pub message: String,
|
||||
@@ -762,17 +637,6 @@ pub struct PagestreamDbSizeResponse {
|
||||
pub db_size: i64,
|
||||
}
|
||||
|
||||
// This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields
|
||||
// that require pageserver-internal types. It is sufficient to get the total size.
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantHistorySize {
|
||||
pub id: TenantId,
|
||||
/// Size is a mixture of WAL and logical size, so the unit is bytes.
|
||||
///
|
||||
/// Will be none if `?inputs_only=true` was given.
|
||||
pub size: Option<u64>,
|
||||
}
|
||||
|
||||
impl PagestreamFeMessage {
|
||||
pub fn serialize(&self) -> Bytes {
|
||||
let mut bytes = BytesMut::new();
|
||||
@@ -815,14 +679,6 @@ impl PagestreamFeMessage {
|
||||
bytes.put_u64(req.lsn.0);
|
||||
bytes.put_u32(req.dbnode);
|
||||
}
|
||||
|
||||
Self::GetSlruSegment(req) => {
|
||||
bytes.put_u8(4);
|
||||
bytes.put_u8(u8::from(req.latest));
|
||||
bytes.put_u64(req.lsn.0);
|
||||
bytes.put_u8(req.kind);
|
||||
bytes.put_u32(req.segno);
|
||||
}
|
||||
}
|
||||
|
||||
bytes.into()
|
||||
@@ -873,14 +729,6 @@ impl PagestreamFeMessage {
|
||||
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
})),
|
||||
4 => Ok(PagestreamFeMessage::GetSlruSegment(
|
||||
PagestreamGetSlruSegmentRequest {
|
||||
latest: body.read_u8()? != 0,
|
||||
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
kind: body.read_u8()?,
|
||||
segno: body.read_u32::<BigEndian>()?,
|
||||
},
|
||||
)),
|
||||
_ => bail!("unknown smgr message tag: {:?}", msg_tag),
|
||||
}
|
||||
}
|
||||
@@ -890,107 +738,35 @@ impl PagestreamBeMessage {
|
||||
pub fn serialize(&self) -> Bytes {
|
||||
let mut bytes = BytesMut::new();
|
||||
|
||||
use PagestreamBeMessageTag as Tag;
|
||||
match self {
|
||||
Self::Exists(resp) => {
|
||||
bytes.put_u8(Tag::Exists as u8);
|
||||
bytes.put_u8(100); /* tag from pagestore_client.h */
|
||||
bytes.put_u8(resp.exists as u8);
|
||||
}
|
||||
|
||||
Self::Nblocks(resp) => {
|
||||
bytes.put_u8(Tag::Nblocks as u8);
|
||||
bytes.put_u8(101); /* tag from pagestore_client.h */
|
||||
bytes.put_u32(resp.n_blocks);
|
||||
}
|
||||
|
||||
Self::GetPage(resp) => {
|
||||
bytes.put_u8(Tag::GetPage as u8);
|
||||
bytes.put_u8(102); /* tag from pagestore_client.h */
|
||||
bytes.put(&resp.page[..]);
|
||||
}
|
||||
|
||||
Self::Error(resp) => {
|
||||
bytes.put_u8(Tag::Error as u8);
|
||||
bytes.put_u8(103); /* tag from pagestore_client.h */
|
||||
bytes.put(resp.message.as_bytes());
|
||||
bytes.put_u8(0); // null terminator
|
||||
}
|
||||
Self::DbSize(resp) => {
|
||||
bytes.put_u8(Tag::DbSize as u8);
|
||||
bytes.put_u8(104); /* tag from pagestore_client.h */
|
||||
bytes.put_i64(resp.db_size);
|
||||
}
|
||||
|
||||
Self::GetSlruSegment(resp) => {
|
||||
bytes.put_u8(Tag::GetSlruSegment as u8);
|
||||
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
|
||||
bytes.put(&resp.segment[..]);
|
||||
}
|
||||
}
|
||||
|
||||
bytes.into()
|
||||
}
|
||||
|
||||
pub fn deserialize(buf: Bytes) -> anyhow::Result<Self> {
|
||||
let mut buf = buf.reader();
|
||||
let msg_tag = buf.read_u8()?;
|
||||
|
||||
use PagestreamBeMessageTag as Tag;
|
||||
let ok =
|
||||
match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? {
|
||||
Tag::Exists => {
|
||||
let exists = buf.read_u8()?;
|
||||
Self::Exists(PagestreamExistsResponse {
|
||||
exists: exists != 0,
|
||||
})
|
||||
}
|
||||
Tag::Nblocks => {
|
||||
let n_blocks = buf.read_u32::<BigEndian>()?;
|
||||
Self::Nblocks(PagestreamNblocksResponse { n_blocks })
|
||||
}
|
||||
Tag::GetPage => {
|
||||
let mut page = vec![0; 8192]; // TODO: use MaybeUninit
|
||||
buf.read_exact(&mut page)?;
|
||||
PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() })
|
||||
}
|
||||
Tag::Error => {
|
||||
let mut msg = Vec::new();
|
||||
buf.read_until(0, &mut msg)?;
|
||||
let cstring = std::ffi::CString::from_vec_with_nul(msg)?;
|
||||
let rust_str = cstring.to_str()?;
|
||||
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||
message: rust_str.to_owned(),
|
||||
})
|
||||
}
|
||||
Tag::DbSize => {
|
||||
let db_size = buf.read_i64::<BigEndian>()?;
|
||||
Self::DbSize(PagestreamDbSizeResponse { db_size })
|
||||
}
|
||||
Tag::GetSlruSegment => {
|
||||
let n_blocks = buf.read_u32::<BigEndian>()?;
|
||||
let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize];
|
||||
buf.read_exact(&mut segment)?;
|
||||
Self::GetSlruSegment(PagestreamGetSlruSegmentResponse {
|
||||
segment: segment.into(),
|
||||
})
|
||||
}
|
||||
};
|
||||
let remaining = buf.into_inner();
|
||||
if !remaining.is_empty() {
|
||||
anyhow::bail!(
|
||||
"remaining bytes in msg with tag={msg_tag}: {}",
|
||||
remaining.len()
|
||||
);
|
||||
}
|
||||
Ok(ok)
|
||||
}
|
||||
|
||||
pub fn kind(&self) -> &'static str {
|
||||
match self {
|
||||
Self::Exists(_) => "Exists",
|
||||
Self::Nblocks(_) => "Nblocks",
|
||||
Self::GetPage(_) => "GetPage",
|
||||
Self::Error(_) => "Error",
|
||||
Self::DbSize(_) => "DbSize",
|
||||
Self::GetSlruSegment(_) => "GetSlruSegment",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -1052,11 +828,10 @@ mod tests {
|
||||
fn test_tenantinfo_serde() {
|
||||
// Test serialization/deserialization of TenantInfo
|
||||
let original_active = TenantInfo {
|
||||
id: TenantShardId::unsharded(TenantId::generate()),
|
||||
id: TenantId::generate(),
|
||||
state: TenantState::Active,
|
||||
current_physical_size: Some(42),
|
||||
attachment_status: TenantAttachmentStatus::Attached,
|
||||
generation: None,
|
||||
};
|
||||
let expected_active = json!({
|
||||
"id": original_active.id.to_string(),
|
||||
@@ -1070,14 +845,13 @@ mod tests {
|
||||
});
|
||||
|
||||
let original_broken = TenantInfo {
|
||||
id: TenantShardId::unsharded(TenantId::generate()),
|
||||
id: TenantId::generate(),
|
||||
state: TenantState::Broken {
|
||||
reason: "reason".into(),
|
||||
backtrace: "backtrace info".into(),
|
||||
},
|
||||
current_physical_size: Some(42),
|
||||
attachment_status: TenantAttachmentStatus::Attached,
|
||||
generation: None,
|
||||
};
|
||||
let expected_broken = json!({
|
||||
"id": original_broken.id.to_string(),
|
||||
|
||||
@@ -1,151 +0,0 @@
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub struct Partitioning {
|
||||
pub keys: crate::keyspace::KeySpace,
|
||||
|
||||
pub at_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl serde::Serialize for Partitioning {
|
||||
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
pub struct KeySpace<'a>(&'a crate::keyspace::KeySpace);
|
||||
|
||||
impl<'a> serde::Serialize for KeySpace<'a> {
|
||||
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
use serde::ser::SerializeSeq;
|
||||
let mut seq = serializer.serialize_seq(Some(self.0.ranges.len()))?;
|
||||
for kr in &self.0.ranges {
|
||||
seq.serialize_element(&KeyRange(kr))?;
|
||||
}
|
||||
seq.end()
|
||||
}
|
||||
}
|
||||
|
||||
use serde::ser::SerializeMap;
|
||||
let mut map = serializer.serialize_map(Some(2))?;
|
||||
map.serialize_key("keys")?;
|
||||
map.serialize_value(&KeySpace(&self.keys))?;
|
||||
map.serialize_key("at_lsn")?;
|
||||
map.serialize_value(&WithDisplay(&self.at_lsn))?;
|
||||
map.end()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct WithDisplay<'a, T>(&'a T);
|
||||
|
||||
impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> {
|
||||
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
serializer.collect_str(&self.0)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct KeyRange<'a>(&'a std::ops::Range<crate::key::Key>);
|
||||
|
||||
impl<'a> serde::Serialize for KeyRange<'a> {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
use serde::ser::SerializeTuple;
|
||||
let mut t = serializer.serialize_tuple(2)?;
|
||||
t.serialize_element(&WithDisplay(&self.0.start))?;
|
||||
t.serialize_element(&WithDisplay(&self.0.end))?;
|
||||
t.end()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> serde::Deserialize<'a> for Partitioning {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'a>,
|
||||
{
|
||||
pub struct KeySpace(crate::keyspace::KeySpace);
|
||||
|
||||
impl<'de> serde::Deserialize<'de> for KeySpace {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
#[serde_with::serde_as]
|
||||
#[derive(serde::Deserialize)]
|
||||
#[serde(transparent)]
|
||||
struct Key(#[serde_as(as = "serde_with::DisplayFromStr")] crate::key::Key);
|
||||
|
||||
#[serde_with::serde_as]
|
||||
#[derive(serde::Deserialize)]
|
||||
struct Range(Key, Key);
|
||||
|
||||
let ranges: Vec<Range> = serde::Deserialize::deserialize(deserializer)?;
|
||||
Ok(Self(crate::keyspace::KeySpace {
|
||||
ranges: ranges
|
||||
.into_iter()
|
||||
.map(|Range(start, end)| (start.0..end.0))
|
||||
.collect(),
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
#[serde_with::serde_as]
|
||||
#[derive(serde::Deserialize)]
|
||||
struct De {
|
||||
keys: KeySpace,
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
at_lsn: Lsn,
|
||||
}
|
||||
|
||||
let de: De = serde::Deserialize::deserialize(deserializer)?;
|
||||
Ok(Self {
|
||||
at_lsn: de.at_lsn,
|
||||
keys: de.keys.0,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_serialization_roundtrip() {
|
||||
let reference = r#"
|
||||
{
|
||||
"keys": [
|
||||
[
|
||||
"000000000000000000000000000000000000",
|
||||
"000000000000000000000000000000000001"
|
||||
],
|
||||
[
|
||||
"000000067F00000001000000000000000000",
|
||||
"000000067F00000001000000000000000002"
|
||||
],
|
||||
[
|
||||
"030000000000000000000000000000000000",
|
||||
"030000000000000000000000000000000003"
|
||||
]
|
||||
],
|
||||
"at_lsn": "0/2240160"
|
||||
}
|
||||
"#;
|
||||
|
||||
let de: Partitioning = serde_json::from_str(reference).unwrap();
|
||||
|
||||
let ser = serde_json::to_string(&de).unwrap();
|
||||
|
||||
let ser_de: serde_json::Value = serde_json::from_str(&ser).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
ser_de,
|
||||
serde_json::from_str::<'_, serde_json::Value>(reference).unwrap()
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -32,9 +32,6 @@ pub struct RelTag {
|
||||
pub relnode: Oid,
|
||||
}
|
||||
|
||||
/// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type.
|
||||
pub type BlockNumber = u32;
|
||||
|
||||
impl PartialOrd for RelTag {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
@@ -111,23 +108,9 @@ impl RelTag {
|
||||
/// These files are divided into segments, which are divided into
|
||||
/// pages of the same BLCKSZ as used for relation files.
|
||||
///
|
||||
#[derive(
|
||||
Debug,
|
||||
Clone,
|
||||
Copy,
|
||||
Hash,
|
||||
Serialize,
|
||||
Deserialize,
|
||||
PartialEq,
|
||||
Eq,
|
||||
PartialOrd,
|
||||
Ord,
|
||||
strum_macros::EnumIter,
|
||||
strum_macros::FromRepr,
|
||||
)]
|
||||
#[repr(u8)]
|
||||
#[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub enum SlruKind {
|
||||
Clog = 0,
|
||||
Clog,
|
||||
MultiXactMembers,
|
||||
MultiXactOffsets,
|
||||
}
|
||||
|
||||
@@ -1,18 +1,13 @@
|
||||
use std::{ops::RangeInclusive, str::FromStr};
|
||||
|
||||
use crate::{
|
||||
key::{is_rel_block_key, Key},
|
||||
models::ShardParameters,
|
||||
};
|
||||
use hex::FromHex;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use thiserror;
|
||||
use utils::id::TenantId;
|
||||
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug)]
|
||||
pub struct ShardNumber(pub u8);
|
||||
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug)]
|
||||
pub struct ShardCount(pub u8);
|
||||
|
||||
impl ShardCount {
|
||||
@@ -43,7 +38,7 @@ impl ShardNumber {
|
||||
/// Note that the binary encoding is _not_ backward compatible, because
|
||||
/// at the time sharding is introduced, there are no existing binary structures
|
||||
/// containing TenantId that we need to handle.
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)]
|
||||
pub struct TenantShardId {
|
||||
pub tenant_id: TenantId,
|
||||
pub shard_number: ShardNumber,
|
||||
@@ -76,43 +71,19 @@ impl TenantShardId {
|
||||
)
|
||||
}
|
||||
|
||||
pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
|
||||
ShardSlug(self)
|
||||
}
|
||||
|
||||
/// Convenience for code that has special behavior on the 0th shard.
|
||||
pub fn is_zero(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0)
|
||||
}
|
||||
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
|
||||
}
|
||||
pub fn to_index(&self) -> ShardIndex {
|
||||
ShardIndex {
|
||||
shard_number: self.shard_number,
|
||||
shard_count: self.shard_count,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Formatting helper
|
||||
struct ShardSlug<'a>(&'a TenantShardId);
|
||||
|
||||
impl<'a> std::fmt::Display for ShardSlug<'a> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"{:02x}{:02x}",
|
||||
self.0.shard_number.0, self.0.shard_count.0
|
||||
)
|
||||
pub fn shard_slug(&self) -> String {
|
||||
format!("{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for TenantShardId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
if self.shard_count != ShardCount(0) {
|
||||
write!(f, "{}-{}", self.tenant_id, self.shard_slug())
|
||||
write!(
|
||||
f,
|
||||
"{}-{:02x}{:02x}",
|
||||
self.tenant_id, self.shard_number.0, self.shard_count.0
|
||||
)
|
||||
} else {
|
||||
// Legacy case (shard_count == 0) -- format as just the tenant id. Note that this
|
||||
// is distinct from the normal single shard case (shard count == 1).
|
||||
@@ -168,89 +139,6 @@ impl From<[u8; 18]> for TenantShardId {
|
||||
}
|
||||
}
|
||||
|
||||
/// For use within the context of a particular tenant, when we need to know which
|
||||
/// shard we're dealing with, but do not need to know the full ShardIdentity (because
|
||||
/// we won't be doing any page->shard mapping), and do not need to know the fully qualified
|
||||
/// TenantShardId.
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
pub struct ShardIndex {
|
||||
pub shard_number: ShardNumber,
|
||||
pub shard_count: ShardCount,
|
||||
}
|
||||
|
||||
impl ShardIndex {
|
||||
pub fn new(number: ShardNumber, count: ShardCount) -> Self {
|
||||
Self {
|
||||
shard_number: number,
|
||||
shard_count: count,
|
||||
}
|
||||
}
|
||||
pub fn unsharded() -> Self {
|
||||
Self {
|
||||
shard_number: ShardNumber(0),
|
||||
shard_count: ShardCount(0),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
|
||||
}
|
||||
|
||||
/// For use in constructing remote storage paths: concatenate this with a TenantId
|
||||
/// to get a fully qualified TenantShardId.
|
||||
///
|
||||
/// Backward compat: this function returns an empty string if Self::is_unsharded, such
|
||||
/// that the legacy pre-sharding remote key format is preserved.
|
||||
pub fn get_suffix(&self) -> String {
|
||||
if self.is_unsharded() {
|
||||
"".to_string()
|
||||
} else {
|
||||
format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ShardIndex {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for ShardIndex {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
// Debug is the same as Display: the compact hex representation
|
||||
write!(f, "{}", self)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::str::FromStr for ShardIndex {
|
||||
type Err = hex::FromHexError;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
// Expect format: 1 byte shard number, 1 byte shard count
|
||||
if s.len() == 4 {
|
||||
let bytes = s.as_bytes();
|
||||
let mut shard_parts: [u8; 2] = [0u8; 2];
|
||||
hex::decode_to_slice(bytes, &mut shard_parts)?;
|
||||
Ok(Self {
|
||||
shard_number: ShardNumber(shard_parts[0]),
|
||||
shard_count: ShardCount(shard_parts[1]),
|
||||
})
|
||||
} else {
|
||||
Err(hex::FromHexError::InvalidStringLength)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<[u8; 2]> for ShardIndex {
|
||||
fn from(b: [u8; 2]) -> Self {
|
||||
Self {
|
||||
shard_number: ShardNumber(b[0]),
|
||||
shard_count: ShardCount(b[1]),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for TenantShardId {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
@@ -321,282 +209,6 @@ impl<'de> Deserialize<'de> for TenantShardId {
|
||||
}
|
||||
}
|
||||
|
||||
/// Stripe size in number of pages
|
||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
||||
pub struct ShardStripeSize(pub u32);
|
||||
|
||||
/// Layout version: for future upgrades where we might change how the key->shard mapping works
|
||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
||||
pub struct ShardLayout(u8);
|
||||
|
||||
const LAYOUT_V1: ShardLayout = ShardLayout(1);
|
||||
/// ShardIdentity uses a magic layout value to indicate if it is unusable
|
||||
const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);
|
||||
|
||||
/// Default stripe size in pages: 256MiB divided by 8kiB page size.
|
||||
const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
|
||||
|
||||
/// The ShardIdentity contains the information needed for one member of map
|
||||
/// to resolve a key to a shard, and then check whether that shard is ==self.
|
||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
||||
pub struct ShardIdentity {
|
||||
pub number: ShardNumber,
|
||||
pub count: ShardCount,
|
||||
pub stripe_size: ShardStripeSize,
|
||||
layout: ShardLayout,
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug, PartialEq, Eq)]
|
||||
pub enum ShardConfigError {
|
||||
#[error("Invalid shard count")]
|
||||
InvalidCount,
|
||||
#[error("Invalid shard number")]
|
||||
InvalidNumber,
|
||||
#[error("Invalid stripe size")]
|
||||
InvalidStripeSize,
|
||||
}
|
||||
|
||||
impl ShardIdentity {
|
||||
/// An identity with number=0 count=0 is a "none" identity, which represents legacy
|
||||
/// tenants. Modern single-shard tenants should not use this: they should
|
||||
/// have number=0 count=1.
|
||||
pub fn unsharded() -> Self {
|
||||
Self {
|
||||
number: ShardNumber(0),
|
||||
count: ShardCount(0),
|
||||
layout: LAYOUT_V1,
|
||||
stripe_size: DEFAULT_STRIPE_SIZE,
|
||||
}
|
||||
}
|
||||
|
||||
/// A broken instance of this type is only used for `TenantState::Broken` tenants,
|
||||
/// which are constructed in code paths that don't have access to proper configuration.
|
||||
///
|
||||
/// A ShardIdentity in this state may not be used for anything, and should not be persisted.
|
||||
/// Enforcement is via assertions, to avoid making our interface fallible for this
|
||||
/// edge case: it is the Tenant's responsibility to avoid trying to do any I/O when in a broken
|
||||
/// state, and by extension to avoid trying to do any page->shard resolution.
|
||||
pub fn broken(number: ShardNumber, count: ShardCount) -> Self {
|
||||
Self {
|
||||
number,
|
||||
count,
|
||||
layout: LAYOUT_BROKEN,
|
||||
stripe_size: DEFAULT_STRIPE_SIZE,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.number == ShardNumber(0) && self.count == ShardCount(0)
|
||||
}
|
||||
|
||||
/// Count must be nonzero, and number must be < count. To construct
|
||||
/// the legacy case (count==0), use Self::unsharded instead.
|
||||
pub fn new(
|
||||
number: ShardNumber,
|
||||
count: ShardCount,
|
||||
stripe_size: ShardStripeSize,
|
||||
) -> Result<Self, ShardConfigError> {
|
||||
if count.0 == 0 {
|
||||
Err(ShardConfigError::InvalidCount)
|
||||
} else if number.0 > count.0 - 1 {
|
||||
Err(ShardConfigError::InvalidNumber)
|
||||
} else if stripe_size.0 == 0 {
|
||||
Err(ShardConfigError::InvalidStripeSize)
|
||||
} else {
|
||||
Ok(Self {
|
||||
number,
|
||||
count,
|
||||
layout: LAYOUT_V1,
|
||||
stripe_size,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// For use when creating ShardIdentity instances for new shards, where a creation request
|
||||
/// specifies the ShardParameters that apply to all shards.
|
||||
pub fn from_params(number: ShardNumber, params: &ShardParameters) -> Self {
|
||||
Self {
|
||||
number,
|
||||
count: params.count,
|
||||
layout: LAYOUT_V1,
|
||||
stripe_size: params.stripe_size,
|
||||
}
|
||||
}
|
||||
|
||||
fn is_broken(&self) -> bool {
|
||||
self.layout == LAYOUT_BROKEN
|
||||
}
|
||||
|
||||
pub fn get_shard_number(&self, key: &Key) -> ShardNumber {
|
||||
assert!(!self.is_broken());
|
||||
key_to_shard_number(self.count, self.stripe_size, key)
|
||||
}
|
||||
|
||||
/// Return true if the key should be ingested by this shard
|
||||
pub fn is_key_local(&self, key: &Key) -> bool {
|
||||
assert!(!self.is_broken());
|
||||
if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
|
||||
true
|
||||
} else {
|
||||
key_to_shard_number(self.count, self.stripe_size, key) == self.number
|
||||
}
|
||||
}
|
||||
|
||||
/// Return true if the key should be discarded if found in this shard's
|
||||
/// data store, e.g. during compaction after a split
|
||||
pub fn is_key_disposable(&self, key: &Key) -> bool {
|
||||
if key_is_shard0(key) {
|
||||
// Q: Why can't we dispose of shard0 content if we're not shard 0?
|
||||
// A: because the WAL ingestion logic currently ingests some shard 0
|
||||
// content on all shards, even though it's only read on shard 0. If we
|
||||
// dropped it, then subsequent WAL ingest to these keys would encounter
|
||||
// an error.
|
||||
false
|
||||
} else {
|
||||
!self.is_key_local(key)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn shard_slug(&self) -> String {
|
||||
if self.count > ShardCount(0) {
|
||||
format!("-{:02x}{:02x}", self.number.0, self.count.0)
|
||||
} else {
|
||||
String::new()
|
||||
}
|
||||
}
|
||||
|
||||
/// Convenience for checking if this identity is the 0th shard in a tenant,
|
||||
/// for special cases on shard 0 such as ingesting relation sizes.
|
||||
pub fn is_zero(&self) -> bool {
|
||||
self.number == ShardNumber(0)
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for ShardIndex {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
if serializer.is_human_readable() {
|
||||
serializer.collect_str(self)
|
||||
} else {
|
||||
// Binary encoding is not used in index_part.json, but is included in anticipation of
|
||||
// switching various structures (e.g. inter-process communication, remote metadata) to more
|
||||
// compact binary encodings in future.
|
||||
let mut packed: [u8; 2] = [0; 2];
|
||||
packed[0] = self.shard_number.0;
|
||||
packed[1] = self.shard_count.0;
|
||||
packed.serialize(serializer)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for ShardIndex {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
struct IdVisitor {
|
||||
is_human_readable_deserializer: bool,
|
||||
}
|
||||
|
||||
impl<'de> serde::de::Visitor<'de> for IdVisitor {
|
||||
type Value = ShardIndex;
|
||||
|
||||
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
if self.is_human_readable_deserializer {
|
||||
formatter.write_str("value in form of hex string")
|
||||
} else {
|
||||
formatter.write_str("value in form of integer array([u8; 2])")
|
||||
}
|
||||
}
|
||||
|
||||
fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
|
||||
where
|
||||
A: serde::de::SeqAccess<'de>,
|
||||
{
|
||||
let s = serde::de::value::SeqAccessDeserializer::new(seq);
|
||||
let id: [u8; 2] = Deserialize::deserialize(s)?;
|
||||
Ok(ShardIndex::from(id))
|
||||
}
|
||||
|
||||
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
|
||||
where
|
||||
E: serde::de::Error,
|
||||
{
|
||||
ShardIndex::from_str(v).map_err(E::custom)
|
||||
}
|
||||
}
|
||||
|
||||
if deserializer.is_human_readable() {
|
||||
deserializer.deserialize_str(IdVisitor {
|
||||
is_human_readable_deserializer: true,
|
||||
})
|
||||
} else {
|
||||
deserializer.deserialize_tuple(
|
||||
2,
|
||||
IdVisitor {
|
||||
is_human_readable_deserializer: false,
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys
|
||||
/// in order to be able to serve basebackup requests without peer communication).
|
||||
fn key_is_shard0(key: &Key) -> bool {
|
||||
// To decide what to shard out to shards >0, we apply a simple rule that only
|
||||
// relation pages are distributed to shards other than shard zero. Everything else gets
|
||||
// stored on shard 0. This guarantees that shard 0 can independently serve basebackup
|
||||
// requests, and any request other than those for particular blocks in relations.
|
||||
!is_rel_block_key(key)
|
||||
}
|
||||
|
||||
/// Provide the same result as the function in postgres `hashfn.h` with the same name
|
||||
fn murmurhash32(mut h: u32) -> u32 {
|
||||
h ^= h >> 16;
|
||||
h = h.wrapping_mul(0x85ebca6b);
|
||||
h ^= h >> 13;
|
||||
h = h.wrapping_mul(0xc2b2ae35);
|
||||
h ^= h >> 16;
|
||||
h
|
||||
}
|
||||
|
||||
/// Provide the same result as the function in postgres `hashfn.h` with the same name
|
||||
fn hash_combine(mut a: u32, mut b: u32) -> u32 {
|
||||
b = b.wrapping_add(0x9e3779b9);
|
||||
b = b.wrapping_add(a << 6);
|
||||
b = b.wrapping_add(a >> 2);
|
||||
|
||||
a ^= b;
|
||||
a
|
||||
}
|
||||
|
||||
/// Where a Key is to be distributed across shards, select the shard. This function
|
||||
/// does not account for keys that should be broadcast across shards.
|
||||
///
|
||||
/// The hashing in this function must exactly match what we do in postgres smgr
|
||||
/// code. The resulting distribution of pages is intended to preserve locality within
|
||||
/// `stripe_size` ranges of contiguous block numbers in the same relation, while otherwise
|
||||
/// distributing data pseudo-randomly.
|
||||
///
|
||||
/// The mapping of key to shard is not stable across changes to ShardCount: this is intentional
|
||||
/// and will be handled at higher levels when shards are split.
|
||||
fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Key) -> ShardNumber {
|
||||
// Fast path for un-sharded tenants or broadcast keys
|
||||
if count < ShardCount(2) || key_is_shard0(key) {
|
||||
return ShardNumber(0);
|
||||
}
|
||||
|
||||
// relNode
|
||||
let mut hash = murmurhash32(key.field4);
|
||||
// blockNum/stripe size
|
||||
hash = hash_combine(hash, murmurhash32(key.field6 / stripe_size.0));
|
||||
|
||||
ShardNumber((hash % count.0 as u32) as u8)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::str::FromStr;
|
||||
@@ -706,91 +318,4 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn shard_identity_validation() -> Result<(), ShardConfigError> {
|
||||
// Happy cases
|
||||
ShardIdentity::new(ShardNumber(0), ShardCount(1), DEFAULT_STRIPE_SIZE)?;
|
||||
ShardIdentity::new(ShardNumber(0), ShardCount(1), ShardStripeSize(1))?;
|
||||
ShardIdentity::new(ShardNumber(254), ShardCount(255), ShardStripeSize(1))?;
|
||||
|
||||
assert_eq!(
|
||||
ShardIdentity::new(ShardNumber(0), ShardCount(0), DEFAULT_STRIPE_SIZE),
|
||||
Err(ShardConfigError::InvalidCount)
|
||||
);
|
||||
assert_eq!(
|
||||
ShardIdentity::new(ShardNumber(10), ShardCount(10), DEFAULT_STRIPE_SIZE),
|
||||
Err(ShardConfigError::InvalidNumber)
|
||||
);
|
||||
assert_eq!(
|
||||
ShardIdentity::new(ShardNumber(11), ShardCount(10), DEFAULT_STRIPE_SIZE),
|
||||
Err(ShardConfigError::InvalidNumber)
|
||||
);
|
||||
assert_eq!(
|
||||
ShardIdentity::new(ShardNumber(255), ShardCount(255), DEFAULT_STRIPE_SIZE),
|
||||
Err(ShardConfigError::InvalidNumber)
|
||||
);
|
||||
assert_eq!(
|
||||
ShardIdentity::new(ShardNumber(0), ShardCount(1), ShardStripeSize(0)),
|
||||
Err(ShardConfigError::InvalidStripeSize)
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn shard_index_human_encoding() -> Result<(), hex::FromHexError> {
|
||||
let example = ShardIndex {
|
||||
shard_number: ShardNumber(13),
|
||||
shard_count: ShardCount(17),
|
||||
};
|
||||
let expected: String = "0d11".to_string();
|
||||
let encoded = format!("{example}");
|
||||
assert_eq!(&encoded, &expected);
|
||||
|
||||
let decoded = ShardIndex::from_str(&encoded)?;
|
||||
assert_eq!(example, decoded);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn shard_index_binary_encoding() -> Result<(), hex::FromHexError> {
|
||||
let example = ShardIndex {
|
||||
shard_number: ShardNumber(13),
|
||||
shard_count: ShardCount(17),
|
||||
};
|
||||
let expected: [u8; 2] = [0x0d, 0x11];
|
||||
|
||||
let encoded = bincode::serialize(&example).unwrap();
|
||||
assert_eq!(Hex(&encoded), Hex(&expected));
|
||||
let decoded = bincode::deserialize(&encoded).unwrap();
|
||||
assert_eq!(example, decoded);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// These are only smoke tests to spot check that our implementation doesn't
|
||||
// deviate from a few examples values: not aiming to validate the overall
|
||||
// hashing algorithm.
|
||||
#[test]
|
||||
fn murmur_hash() {
|
||||
assert_eq!(murmurhash32(0), 0);
|
||||
|
||||
assert_eq!(hash_combine(0xb1ff3b40, 0), 0xfb7923c9);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn shard_mapping() {
|
||||
let key = Key {
|
||||
field1: 0x00,
|
||||
field2: 0x67f,
|
||||
field3: 0x5,
|
||||
field4: 0x400c,
|
||||
field5: 0x00,
|
||||
field6: 0x7d06,
|
||||
};
|
||||
|
||||
let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key);
|
||||
assert_eq!(shard, ShardNumber(8));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,12 +35,6 @@ pub enum QueryError {
|
||||
/// We were instructed to shutdown while processing the query
|
||||
#[error("Shutting down")]
|
||||
Shutdown,
|
||||
/// Query handler indicated that client should reconnect
|
||||
#[error("Server requested reconnect")]
|
||||
Reconnect,
|
||||
/// Query named an entity that was not found
|
||||
#[error("Not found: {0}")]
|
||||
NotFound(std::borrow::Cow<'static, str>),
|
||||
/// Authentication failure
|
||||
#[error("Unauthorized: {0}")]
|
||||
Unauthorized(std::borrow::Cow<'static, str>),
|
||||
@@ -60,9 +54,9 @@ impl From<io::Error> for QueryError {
|
||||
impl QueryError {
|
||||
pub fn pg_error_code(&self) -> &'static [u8; 5] {
|
||||
match self {
|
||||
Self::Disconnected(_) | Self::SimulatedConnectionError | Self::Reconnect => b"08006", // connection failure
|
||||
Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure
|
||||
Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
|
||||
Self::Unauthorized(_) | Self::NotFound(_) => SQLSTATE_INTERNAL_ERROR,
|
||||
Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR,
|
||||
Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
|
||||
}
|
||||
}
|
||||
@@ -431,11 +425,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
info!("Stopped due to shutdown");
|
||||
Ok(())
|
||||
}
|
||||
Err(QueryError::Reconnect) => {
|
||||
// Dropping out of this loop implicitly disconnects
|
||||
info!("Stopped due to handler reconnect request");
|
||||
Ok(())
|
||||
}
|
||||
Err(QueryError::Disconnected(e)) => {
|
||||
info!("Disconnected ({e:#})");
|
||||
// Disconnection is not an error: we just use it that way internally to drop
|
||||
@@ -985,9 +974,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, I
|
||||
pub fn short_error(e: &QueryError) -> String {
|
||||
match e {
|
||||
QueryError::Disconnected(connection_error) => connection_error.to_string(),
|
||||
QueryError::Reconnect => "reconnect".to_string(),
|
||||
QueryError::Shutdown => "shutdown".to_string(),
|
||||
QueryError::NotFound(_) => "not found".to_string(),
|
||||
QueryError::Unauthorized(_e) => "JWT authentication error".to_string(),
|
||||
QueryError::SimulatedConnectionError => "simulated connection error".to_string(),
|
||||
QueryError::Other(e) => format!("{e:#}"),
|
||||
@@ -1009,15 +996,9 @@ fn log_query_error(query: &str, e: &QueryError) {
|
||||
QueryError::SimulatedConnectionError => {
|
||||
error!("query handler for query '{query}' failed due to a simulated connection error")
|
||||
}
|
||||
QueryError::Reconnect => {
|
||||
info!("query handler for '{query}' requested client to reconnect")
|
||||
}
|
||||
QueryError::Shutdown => {
|
||||
info!("query handler for '{query}' cancelled during tenant shutdown")
|
||||
}
|
||||
QueryError::NotFound(reason) => {
|
||||
info!("query handler for '{query}' entity not found: {reason}")
|
||||
}
|
||||
QueryError::Unauthorized(e) => {
|
||||
warn!("query handler for '{query}' failed with authentication error: {e}");
|
||||
}
|
||||
|
||||
@@ -163,18 +163,8 @@ impl PgConnectionConfig {
|
||||
}
|
||||
|
||||
/// Connect using postgres protocol with TLS disabled.
|
||||
pub async fn connect_no_tls(
|
||||
&self,
|
||||
) -> Result<
|
||||
(
|
||||
tokio_postgres::Client,
|
||||
tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
|
||||
),
|
||||
postgres::Error,
|
||||
> {
|
||||
self.to_tokio_postgres_config()
|
||||
.connect(postgres::NoTls)
|
||||
.await
|
||||
pub fn connect_no_tls(&self) -> Result<postgres::Client, postgres::Error> {
|
||||
postgres::Config::from(self.to_tokio_postgres_config()).connect(postgres::NoTls)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -207,16 +207,10 @@ pub fn find_end_of_wal(
|
||||
let seg_offs = curr_lsn.segment_offset(wal_seg_size);
|
||||
segment.seek(SeekFrom::Start(seg_offs as u64))?;
|
||||
// loop inside segment
|
||||
while curr_lsn.segment_number(wal_seg_size) == segno {
|
||||
loop {
|
||||
let bytes_read = segment.read(&mut buf)?;
|
||||
if bytes_read == 0 {
|
||||
debug!(
|
||||
"find_end_of_wal reached end at {:?}, EOF in segment {:?} at offset {}",
|
||||
result,
|
||||
seg_file_path,
|
||||
curr_lsn.segment_offset(wal_seg_size)
|
||||
);
|
||||
return Ok(result);
|
||||
break; // EOF
|
||||
}
|
||||
curr_lsn += bytes_read as u64;
|
||||
decoder.feed_bytes(&buf[0..bytes_read]);
|
||||
@@ -335,8 +329,8 @@ impl CheckPoint {
|
||||
///
|
||||
/// Returns 'true' if the XID was updated.
|
||||
pub fn update_next_xid(&mut self, xid: u32) -> bool {
|
||||
// nextXid should be greater than any XID in WAL, so increment provided XID and check for wraparround.
|
||||
let mut new_xid = std::cmp::max(xid.wrapping_add(1), pg_constants::FIRST_NORMAL_TRANSACTION_ID);
|
||||
// nextXid should nw greater than any XID in WAL, so increment provided XID and check for wraparround.
|
||||
let mut new_xid = std::cmp::max(xid + 1, pg_constants::FIRST_NORMAL_TRANSACTION_ID);
|
||||
// To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL.
|
||||
// XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE
|
||||
new_xid =
|
||||
|
||||
@@ -10,7 +10,6 @@ byteorder.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
rand.workspace = true
|
||||
smallvec.workspace = true
|
||||
tokio.workspace = true
|
||||
tracing.workspace = true
|
||||
thiserror.workspace = true
|
||||
|
||||
@@ -7,8 +7,7 @@ pub mod framed;
|
||||
|
||||
use byteorder::{BigEndian, ReadBytesExt};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use smallvec::SmallVec;
|
||||
use std::{borrow::Cow, fmt, io, ops::Range, str};
|
||||
use std::{borrow::Cow, collections::HashMap, fmt, io, str};
|
||||
|
||||
// re-export for use in utils pageserver_feedback.rs
|
||||
pub use postgres_protocol::PG_EPOCH;
|
||||
@@ -50,67 +49,29 @@ pub enum FeStartupPacket {
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct StartupMessageParams {
|
||||
data: String,
|
||||
pairs: SmallVec<[Range<u32>; 4]>,
|
||||
// for easy access
|
||||
user: Option<Range<u32>>,
|
||||
database: Option<Range<u32>>,
|
||||
options: Option<Range<u32>>,
|
||||
replication: Option<Range<u32>>,
|
||||
}
|
||||
|
||||
impl fmt::Debug for StartupMessageParams {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.debug_map().entries(self.iter()).finish()
|
||||
}
|
||||
params: HashMap<String, String>,
|
||||
}
|
||||
|
||||
impl StartupMessageParams {
|
||||
/// Get parameter's value by its name.
|
||||
pub fn get(&self, name: &str) -> Option<&str> {
|
||||
self.pairs
|
||||
.iter()
|
||||
.map(|r| &self.data[r.start as usize..r.end as usize])
|
||||
.find_map(|pair| pair.strip_prefix(name).and_then(|x| x.strip_prefix('\0')))
|
||||
}
|
||||
|
||||
pub fn user(&self) -> Option<&str> {
|
||||
self.user
|
||||
.clone()
|
||||
.and_then(|r| self.data.get(r.start as usize..r.end as usize))
|
||||
}
|
||||
|
||||
pub fn database(&self) -> Option<&str> {
|
||||
self.database
|
||||
.clone()
|
||||
.and_then(|r| self.data.get(r.start as usize..r.end as usize))
|
||||
}
|
||||
|
||||
pub(crate) fn options_str(&self) -> Option<&str> {
|
||||
self.options
|
||||
.clone()
|
||||
.and_then(|r| self.data.get(r.start as usize..r.end as usize))
|
||||
}
|
||||
|
||||
pub fn replication(&self) -> Option<&str> {
|
||||
self.replication
|
||||
.clone()
|
||||
.and_then(|r| self.data.get(r.start as usize..r.end as usize))
|
||||
self.params.get(name).map(|s| s.as_str())
|
||||
}
|
||||
|
||||
/// Split command-line options according to PostgreSQL's logic,
|
||||
/// taking into account all escape sequences but leaving them as-is.
|
||||
/// [`None`] means that there's no `options` in [`Self`].
|
||||
pub fn options_raw(&self) -> Option<impl Iterator<Item = &str>> {
|
||||
self.options_str().map(Self::parse_options_raw)
|
||||
self.get("options").map(Self::parse_options_raw)
|
||||
}
|
||||
|
||||
/// Split command-line options according to PostgreSQL's logic,
|
||||
/// applying all escape sequences (using owned strings as needed).
|
||||
/// [`None`] means that there's no `options` in [`Self`].
|
||||
pub fn options_escaped(&self) -> Option<impl Iterator<Item = Cow<'_, str>>> {
|
||||
self.options_str().map(Self::parse_options_escaped)
|
||||
self.get("options").map(Self::parse_options_escaped)
|
||||
}
|
||||
|
||||
/// Split command-line options according to PostgreSQL's logic,
|
||||
@@ -150,44 +111,15 @@ impl StartupMessageParams {
|
||||
|
||||
/// Iterate through key-value pairs in an arbitrary order.
|
||||
pub fn iter(&self) -> impl Iterator<Item = (&str, &str)> {
|
||||
self.pairs
|
||||
.iter()
|
||||
.map(|r| &self.data[r.start as usize..r.end as usize])
|
||||
.flat_map(|pair| pair.split_once('\0'))
|
||||
self.params.iter().map(|(k, v)| (k.as_str(), v.as_str()))
|
||||
}
|
||||
|
||||
// This function is mostly useful in tests.
|
||||
#[doc(hidden)]
|
||||
pub fn new<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> Self {
|
||||
let mut this = Self {
|
||||
data: Default::default(),
|
||||
pairs: Default::default(),
|
||||
user: Default::default(),
|
||||
database: Default::default(),
|
||||
options: Default::default(),
|
||||
replication: Default::default(),
|
||||
};
|
||||
for (k, v) in pairs {
|
||||
let start = this.data.len();
|
||||
this.data.push_str(k);
|
||||
this.data.push('\0');
|
||||
let value_offset = this.data.len();
|
||||
this.data.push_str(v);
|
||||
let end = this.data.len();
|
||||
this.data.push('\0');
|
||||
let range = start as u32..end as u32;
|
||||
this.pairs.push(range);
|
||||
let value_range = value_offset as u32..end as u32;
|
||||
match k {
|
||||
"user" => this.user = Some(value_range),
|
||||
"database" => this.database = Some(value_range),
|
||||
"options" => this.options = Some(value_range),
|
||||
"replication" => this.replication = Some(value_range),
|
||||
_ => {}
|
||||
}
|
||||
Self {
|
||||
params: pairs.map(|(k, v)| (k.to_owned(), v.to_owned())).into(),
|
||||
}
|
||||
this.data.push('\0');
|
||||
this
|
||||
}
|
||||
}
|
||||
|
||||
@@ -357,10 +289,10 @@ impl FeStartupPacket {
|
||||
// We shouldn't advance `buf` as probably full message is not there yet,
|
||||
// so can't directly use Bytes::get_u32 etc.
|
||||
let len = (&buf[0..4]).read_u32::<BigEndian>().unwrap() as usize;
|
||||
// The proposed replacement is `!(8..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
|
||||
// The proposed replacement is `!(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
|
||||
// which is less readable
|
||||
#[allow(clippy::manual_range_contains)]
|
||||
if len < 8 || len > MAX_STARTUP_PACKET_LENGTH {
|
||||
if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
|
||||
return Err(ProtocolError::Protocol(format!(
|
||||
"invalid startup packet message length {}",
|
||||
len
|
||||
@@ -414,62 +346,33 @@ impl FeStartupPacket {
|
||||
|
||||
// Parse pairs of null-terminated strings (key, value).
|
||||
// See `postgres: ProcessStartupPacket, build_startup_packet`.
|
||||
let data = str::from_utf8(&msg)
|
||||
let mut tokens = str::from_utf8(&msg)
|
||||
.map_err(|_e| {
|
||||
ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned())
|
||||
})?
|
||||
.to_owned();
|
||||
|
||||
let mut params = StartupMessageParams {
|
||||
data,
|
||||
pairs: Default::default(),
|
||||
user: Default::default(),
|
||||
database: Default::default(),
|
||||
options: Default::default(),
|
||||
replication: Default::default(),
|
||||
};
|
||||
|
||||
let mut offset = 0;
|
||||
let mut rest = params.data.as_str();
|
||||
loop {
|
||||
let Some((key, rest1)) = rest.split_once('\0') else {
|
||||
return Err(ProtocolError::Protocol(
|
||||
.strip_suffix('\0') // drop packet's own null
|
||||
.ok_or_else(|| {
|
||||
ProtocolError::Protocol(
|
||||
"StartupMessage params: missing null terminator".to_string(),
|
||||
));
|
||||
};
|
||||
// pairs terminated
|
||||
if key.is_empty() {
|
||||
params.data.truncate(offset + 1);
|
||||
params.data.shrink_to_fit();
|
||||
break;
|
||||
}
|
||||
let Some((value, rest2)) = rest1.split_once('\0') else {
|
||||
return Err(ProtocolError::Protocol(
|
||||
"StartupMessage params: missing null terminator".to_string(),
|
||||
));
|
||||
};
|
||||
rest = rest2;
|
||||
)
|
||||
})?
|
||||
.split_terminator('\0');
|
||||
|
||||
let start = offset;
|
||||
let value_offset = offset + key.len() + 1;
|
||||
let end = value_offset + value.len();
|
||||
offset = end + 1;
|
||||
let mut params = HashMap::new();
|
||||
while let Some(name) = tokens.next() {
|
||||
let value = tokens.next().ok_or_else(|| {
|
||||
ProtocolError::Protocol(
|
||||
"StartupMessage params: key without value".to_string(),
|
||||
)
|
||||
})?;
|
||||
|
||||
params.pairs.push(start as u32..end as u32);
|
||||
let value_range = value_offset as u32..end as u32;
|
||||
match key {
|
||||
"user" => params.user = Some(value_range),
|
||||
"database" => params.database = Some(value_range),
|
||||
"options" => params.options = Some(value_range),
|
||||
"replication" => params.replication = Some(value_range),
|
||||
_ => {}
|
||||
}
|
||||
params.insert(name.to_owned(), value.to_owned());
|
||||
}
|
||||
|
||||
FeStartupPacket::StartupMessage {
|
||||
major_version,
|
||||
minor_version,
|
||||
params,
|
||||
params: StartupMessageParams { params },
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -1072,10 +975,4 @@ mod tests {
|
||||
let params = make_params("foo\\ bar \\ \\\\ baz\\ lol");
|
||||
assert_eq!(split_options(¶ms), ["foo bar", " \\", "baz ", "lol"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_fe_startup_packet_regression() {
|
||||
let data = [0, 0, 0, 7, 0, 0, 0, 0];
|
||||
FeStartupPacket::parse(&mut BytesMut::from_iter(data)).unwrap_err();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,18 +9,18 @@ anyhow.workspace = true
|
||||
async-trait.workspace = true
|
||||
once_cell.workspace = true
|
||||
aws-smithy-async.workspace = true
|
||||
aws-smithy-types.workspace = true
|
||||
aws-smithy-http.workspace = true
|
||||
aws-types.workspace = true
|
||||
aws-config.workspace = true
|
||||
aws-sdk-s3.workspace = true
|
||||
aws-credential-types.workspace = true
|
||||
bytes.workspace = true
|
||||
camino.workspace = true
|
||||
hyper = { workspace = true, features = ["stream"] }
|
||||
futures.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
|
||||
tokio-util = { workspace = true, features = ["compat"] }
|
||||
tokio-util.workspace = true
|
||||
toml_edit.workspace = true
|
||||
tracing.workspace = true
|
||||
scopeguard.workspace = true
|
||||
|
||||
@@ -1,34 +1,24 @@
|
||||
//! Azure Blob Storage wrapper
|
||||
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
use std::env;
|
||||
use std::num::NonZeroU32;
|
||||
use std::pin::Pin;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::time::SystemTime;
|
||||
use std::{borrow::Cow, io::Cursor};
|
||||
|
||||
use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
|
||||
use anyhow::Result;
|
||||
use azure_core::request_options::{MaxResults, Metadata, Range};
|
||||
use azure_core::RetryOptions;
|
||||
use azure_identity::DefaultAzureCredential;
|
||||
use azure_storage::StorageCredentials;
|
||||
use azure_storage_blobs::blob::CopyStatus;
|
||||
use azure_storage_blobs::prelude::ClientBuilder;
|
||||
use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
|
||||
use bytes::Bytes;
|
||||
use futures::stream::Stream;
|
||||
use futures_util::StreamExt;
|
||||
use http_types::{StatusCode, Url};
|
||||
use tokio::time::Instant;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use http_types::StatusCode;
|
||||
use tokio::io::AsyncRead;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::s3_bucket::RequestKind;
|
||||
use crate::TimeTravelError;
|
||||
use crate::{
|
||||
AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath,
|
||||
RemoteStorage, StorageMetadata,
|
||||
@@ -59,8 +49,7 @@ impl AzureBlobStorage {
|
||||
StorageCredentials::token_credential(Arc::new(token_credential))
|
||||
};
|
||||
|
||||
// we have an outer retry
|
||||
let builder = ClientBuilder::new(account, credentials).retry(RetryOptions::none());
|
||||
let builder = ClientBuilder::new(account, credentials);
|
||||
|
||||
let client = builder.container_client(azure_config.container_name.to_owned());
|
||||
|
||||
@@ -124,22 +113,12 @@ impl AzureBlobStorage {
|
||||
) -> Result<Download, DownloadError> {
|
||||
let mut response = builder.into_stream();
|
||||
|
||||
let mut etag = None;
|
||||
let mut last_modified = None;
|
||||
let mut metadata = HashMap::new();
|
||||
// TODO give proper streaming response instead of buffering into RAM
|
||||
// https://github.com/neondatabase/neon/issues/5563
|
||||
|
||||
let mut bufs = Vec::new();
|
||||
let mut buf = Vec::new();
|
||||
while let Some(part) = response.next().await {
|
||||
let part = part.map_err(to_download_error)?;
|
||||
let etag_str: &str = part.blob.properties.etag.as_ref();
|
||||
if etag.is_none() {
|
||||
etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
|
||||
}
|
||||
if last_modified.is_none() {
|
||||
last_modified = Some(part.blob.properties.last_modified.into());
|
||||
}
|
||||
if let Some(blob_meta) = part.blob.metadata {
|
||||
metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
|
||||
}
|
||||
@@ -148,12 +127,10 @@ impl AzureBlobStorage {
|
||||
.collect()
|
||||
.await
|
||||
.map_err(|e| DownloadError::Other(e.into()))?;
|
||||
bufs.push(data);
|
||||
buf.extend_from_slice(&data.slice(..));
|
||||
}
|
||||
Ok(Download {
|
||||
download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
|
||||
etag,
|
||||
last_modified,
|
||||
download_stream: Box::pin(Cursor::new(buf)),
|
||||
metadata: Some(StorageMetadata(metadata)),
|
||||
})
|
||||
}
|
||||
@@ -186,6 +163,7 @@ fn to_download_error(error: azure_core::Error) -> DownloadError {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for AzureBlobStorage {
|
||||
async fn list(
|
||||
&self,
|
||||
@@ -239,10 +217,9 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
}
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
mut from: impl AsyncRead + Unpin + Send + Sync + 'static,
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
@@ -250,12 +227,13 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
let _permit = self.permit(RequestKind::Put).await;
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
|
||||
|
||||
let from: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>> =
|
||||
Box::pin(from);
|
||||
|
||||
let from = NonSeekableStream::new(from, data_size_bytes);
|
||||
|
||||
let body = azure_core::Body::SeekableStream(Box::new(from));
|
||||
// TODO FIX THIS UGLY HACK and don't buffer the entire object
|
||||
// into RAM here, but use the streaming interface. For that,
|
||||
// we'd have to change the interface though...
|
||||
// https://github.com/neondatabase/neon/issues/5563
|
||||
let mut buf = Vec::with_capacity(data_size_bytes);
|
||||
tokio::io::copy(&mut from, &mut buf).await?;
|
||||
let body = azure_core::Body::Bytes(buf.into());
|
||||
|
||||
let mut builder = blob_client.put_block_blob(body);
|
||||
|
||||
@@ -288,12 +266,17 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
|
||||
let mut builder = blob_client.get();
|
||||
|
||||
let range: Range = if let Some(end_exclusive) = end_exclusive {
|
||||
(start_inclusive..end_exclusive).into()
|
||||
if let Some(end_exclusive) = end_exclusive {
|
||||
builder = builder.range(Range::new(start_inclusive, end_exclusive));
|
||||
} else {
|
||||
(start_inclusive..).into()
|
||||
};
|
||||
builder = builder.range(range);
|
||||
// Open ranges are not supported by the SDK so we work around
|
||||
// by setting the upper limit extremely high (but high enough
|
||||
// to still be representable by signed 64 bit integers).
|
||||
// TODO remove workaround once the SDK adds open range support
|
||||
// https://github.com/Azure/azure-sdk-for-rust/issues/1438
|
||||
let end_exclusive = u64::MAX / 4;
|
||||
builder = builder.range(Range::new(start_inclusive, end_exclusive));
|
||||
}
|
||||
|
||||
self.download_for_builder(builder).await
|
||||
}
|
||||
@@ -328,211 +311,4 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
|
||||
let _permit = self.permit(RequestKind::Copy).await;
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
|
||||
|
||||
let source_url = format!(
|
||||
"{}/{}",
|
||||
self.client.url()?,
|
||||
self.relative_path_to_name(from)
|
||||
);
|
||||
let builder = blob_client.copy(Url::from_str(&source_url)?);
|
||||
|
||||
let result = builder.into_future().await?;
|
||||
|
||||
let mut copy_status = result.copy_status;
|
||||
let start_time = Instant::now();
|
||||
const MAX_WAIT_TIME: Duration = Duration::from_secs(60);
|
||||
loop {
|
||||
match copy_status {
|
||||
CopyStatus::Aborted => {
|
||||
anyhow::bail!("Received abort for copy from {from} to {to}.");
|
||||
}
|
||||
CopyStatus::Failed => {
|
||||
anyhow::bail!("Received failure response for copy from {from} to {to}.");
|
||||
}
|
||||
CopyStatus::Success => return Ok(()),
|
||||
CopyStatus::Pending => (),
|
||||
}
|
||||
// The copy is taking longer. Waiting a second and then re-trying.
|
||||
// TODO estimate time based on copy_progress and adjust time based on that
|
||||
tokio::time::sleep(Duration::from_millis(1000)).await;
|
||||
let properties = blob_client.get_properties().into_future().await?;
|
||||
let Some(status) = properties.blob.properties.copy_status else {
|
||||
tracing::warn!("copy_status for copy is None!, from={from}, to={to}");
|
||||
return Ok(());
|
||||
};
|
||||
if start_time.elapsed() > MAX_WAIT_TIME {
|
||||
anyhow::bail!("Copy from from {from} to {to} took longer than limit MAX_WAIT_TIME={}s. copy_pogress={:?}.",
|
||||
MAX_WAIT_TIME.as_secs_f32(),
|
||||
properties.blob.properties.copy_progress,
|
||||
);
|
||||
}
|
||||
copy_status = status;
|
||||
}
|
||||
}
|
||||
|
||||
async fn time_travel_recover(
|
||||
&self,
|
||||
_prefix: Option<&RemotePath>,
|
||||
_timestamp: SystemTime,
|
||||
_done_if_after: SystemTime,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<(), TimeTravelError> {
|
||||
// TODO use Azure point in time recovery feature for this
|
||||
// https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
|
||||
Err(TimeTravelError::Unimplemented)
|
||||
}
|
||||
}
|
||||
|
||||
pin_project_lite::pin_project! {
|
||||
/// Hack to work around not being able to stream once with azure sdk.
|
||||
///
|
||||
/// Azure sdk clones streams around with the assumption that they are like
|
||||
/// `Arc<tokio::fs::File>` (except not supporting tokio), however our streams are not like
|
||||
/// that. For example for an `index_part.json` we just have a single chunk of [`Bytes`]
|
||||
/// representing the whole serialized vec. It could be trivially cloneable and "semi-trivially"
|
||||
/// seekable, but we can also just re-try the request easier.
|
||||
#[project = NonSeekableStreamProj]
|
||||
enum NonSeekableStream<S> {
|
||||
/// A stream wrappers initial form.
|
||||
///
|
||||
/// Mutex exists to allow moving when cloning. If the sdk changes to do less than 1
|
||||
/// clone before first request, then this must be changed.
|
||||
Initial {
|
||||
inner: std::sync::Mutex<Option<tokio_util::compat::Compat<tokio_util::io::StreamReader<S, Bytes>>>>,
|
||||
len: usize,
|
||||
},
|
||||
/// The actually readable variant, produced by cloning the Initial variant.
|
||||
///
|
||||
/// The sdk currently always clones once, even without retry policy.
|
||||
Actual {
|
||||
#[pin]
|
||||
inner: tokio_util::compat::Compat<tokio_util::io::StreamReader<S, Bytes>>,
|
||||
len: usize,
|
||||
read_any: bool,
|
||||
},
|
||||
/// Most likely unneeded, but left to make life easier, in case more clones are added.
|
||||
Cloned {
|
||||
len_was: usize,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<S> NonSeekableStream<S>
|
||||
where
|
||||
S: Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
{
|
||||
fn new(inner: S, len: usize) -> NonSeekableStream<S> {
|
||||
use tokio_util::compat::TokioAsyncReadCompatExt;
|
||||
|
||||
let inner = tokio_util::io::StreamReader::new(inner).compat();
|
||||
let inner = Some(inner);
|
||||
let inner = std::sync::Mutex::new(inner);
|
||||
NonSeekableStream::Initial { inner, len }
|
||||
}
|
||||
}
|
||||
|
||||
impl<S> std::fmt::Debug for NonSeekableStream<S> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::Initial { len, .. } => f.debug_struct("Initial").field("len", len).finish(),
|
||||
Self::Actual { len, .. } => f.debug_struct("Actual").field("len", len).finish(),
|
||||
Self::Cloned { len_was, .. } => f.debug_struct("Cloned").field("len", len_was).finish(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<S> futures::io::AsyncRead for NonSeekableStream<S>
|
||||
where
|
||||
S: Stream<Item = std::io::Result<Bytes>>,
|
||||
{
|
||||
fn poll_read(
|
||||
self: std::pin::Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
buf: &mut [u8],
|
||||
) -> std::task::Poll<std::io::Result<usize>> {
|
||||
match self.project() {
|
||||
NonSeekableStreamProj::Actual {
|
||||
inner, read_any, ..
|
||||
} => {
|
||||
*read_any = true;
|
||||
inner.poll_read(cx, buf)
|
||||
}
|
||||
// NonSeekableStream::Initial does not support reading because it is just much easier
|
||||
// to have the mutex in place where one does not poll the contents, or that's how it
|
||||
// seemed originally. If there is a version upgrade which changes the cloning, then
|
||||
// that support needs to be hacked in.
|
||||
//
|
||||
// including {self:?} into the message would be useful, but unsure how to unproject.
|
||||
_ => std::task::Poll::Ready(Err(std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
"cloned or initial values cannot be read",
|
||||
))),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<S> Clone for NonSeekableStream<S> {
|
||||
/// Weird clone implementation exists to support the sdk doing cloning before issuing the first
|
||||
/// request, see type documentation.
|
||||
fn clone(&self) -> Self {
|
||||
use NonSeekableStream::*;
|
||||
|
||||
match self {
|
||||
Initial { inner, len } => {
|
||||
if let Some(inner) = inner.lock().unwrap().take() {
|
||||
Actual {
|
||||
inner,
|
||||
len: *len,
|
||||
read_any: false,
|
||||
}
|
||||
} else {
|
||||
Self::Cloned { len_was: *len }
|
||||
}
|
||||
}
|
||||
Actual { len, .. } => Cloned { len_was: *len },
|
||||
Cloned { len_was } => Cloned { len_was: *len_was },
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl<S> azure_core::SeekableStream for NonSeekableStream<S>
|
||||
where
|
||||
S: Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync + 'static,
|
||||
{
|
||||
async fn reset(&mut self) -> azure_core::error::Result<()> {
|
||||
use NonSeekableStream::*;
|
||||
|
||||
let msg = match self {
|
||||
Initial { inner, .. } => {
|
||||
if inner.get_mut().unwrap().is_some() {
|
||||
return Ok(());
|
||||
} else {
|
||||
"reset after first clone is not supported"
|
||||
}
|
||||
}
|
||||
Actual { read_any, .. } if !*read_any => return Ok(()),
|
||||
Actual { .. } => "reset after reading is not supported",
|
||||
Cloned { .. } => "reset after second clone is not supported",
|
||||
};
|
||||
Err(azure_core::error::Error::new(
|
||||
azure_core::error::ErrorKind::Io,
|
||||
std::io::Error::new(std::io::ErrorKind::Other, msg),
|
||||
))
|
||||
}
|
||||
|
||||
// Note: it is not documented if this should be the total or remaining length, total passes the
|
||||
// tests.
|
||||
fn len(&self) -> usize {
|
||||
use NonSeekableStream::*;
|
||||
match self {
|
||||
Initial { len, .. } => *len,
|
||||
Actual { len, .. } => *len,
|
||||
Cloned { len_was, .. } => *len_was,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,18 +14,13 @@ mod local_fs;
|
||||
mod s3_bucket;
|
||||
mod simulate_failures;
|
||||
|
||||
use std::{
|
||||
collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc, time::SystemTime,
|
||||
};
|
||||
use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc};
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
|
||||
use bytes::Bytes;
|
||||
use futures::stream::Stream;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::sync::Semaphore;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tokio::{io, sync::Semaphore};
|
||||
use toml_edit::Item;
|
||||
use tracing::info;
|
||||
|
||||
@@ -143,7 +138,7 @@ pub struct Listing {
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
/// This storage tries to be unaware of any layered repository context,
|
||||
/// providing basic CRUD operations for storage files.
|
||||
#[allow(async_fn_in_trait)]
|
||||
#[async_trait::async_trait]
|
||||
pub trait RemoteStorage: Send + Sync + 'static {
|
||||
/// Lists all top level subdirectories for a given prefix
|
||||
/// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
|
||||
@@ -184,7 +179,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
/// Streams the local file contents into remote into the remote storage entry.
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
// S3 PUT request requires the content length to be specified,
|
||||
// otherwise it starts to fail with the concurrent connection count increasing.
|
||||
data_size_bytes: usize,
|
||||
@@ -208,27 +203,10 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;
|
||||
|
||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
|
||||
|
||||
/// Copy a remote object inside a bucket from one path to another.
|
||||
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>;
|
||||
|
||||
/// Resets the content of everything with the given prefix to the given state
|
||||
async fn time_travel_recover(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
timestamp: SystemTime,
|
||||
done_if_after: SystemTime,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<(), TimeTravelError>;
|
||||
}
|
||||
|
||||
pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
|
||||
pub struct Download {
|
||||
pub download_stream: DownloadStream,
|
||||
/// The last time the file was modified (`last-modified` HTTP header)
|
||||
pub last_modified: Option<SystemTime>,
|
||||
/// A way to identify this specific version of the resource (`etag` HTTP header)
|
||||
pub etag: Option<String>,
|
||||
pub download_stream: Pin<Box<dyn io::AsyncRead + Unpin + Send + Sync>>,
|
||||
/// Extra key-value data, associated with the current remote file.
|
||||
pub metadata: Option<StorageMetadata>,
|
||||
}
|
||||
@@ -269,57 +247,17 @@ impl std::fmt::Display for DownloadError {
|
||||
|
||||
impl std::error::Error for DownloadError {}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum TimeTravelError {
|
||||
/// Validation or other error happened due to user input.
|
||||
BadInput(anyhow::Error),
|
||||
/// The used remote storage does not have time travel recovery implemented
|
||||
Unimplemented,
|
||||
/// The number of versions/deletion markers is above our limit.
|
||||
TooManyVersions,
|
||||
/// A cancellation token aborted the process, typically during
|
||||
/// request closure or process shutdown.
|
||||
Cancelled,
|
||||
/// Other errors
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
impl std::fmt::Display for TimeTravelError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
TimeTravelError::BadInput(e) => {
|
||||
write!(
|
||||
f,
|
||||
"Failed to time travel recover a prefix due to user input: {e}"
|
||||
)
|
||||
}
|
||||
TimeTravelError::Unimplemented => write!(
|
||||
f,
|
||||
"time travel recovery is not implemented for the current storage backend"
|
||||
),
|
||||
TimeTravelError::Cancelled => write!(f, "Cancelled, shutting down"),
|
||||
TimeTravelError::TooManyVersions => {
|
||||
write!(f, "Number of versions/delete markers above limit")
|
||||
}
|
||||
TimeTravelError::Other(e) => write!(f, "Failed to time travel recover a prefix: {e:?}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for TimeTravelError {}
|
||||
|
||||
/// Every storage, currently supported.
|
||||
/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
|
||||
#[derive(Clone)]
|
||||
// Require Clone for `Other` due to https://github.com/rust-lang/rust/issues/26925
|
||||
pub enum GenericRemoteStorage<Other: Clone = Arc<UnreliableWrapper>> {
|
||||
pub enum GenericRemoteStorage {
|
||||
LocalFs(LocalFs),
|
||||
AwsS3(Arc<S3Bucket>),
|
||||
AzureBlob(Arc<AzureBlobStorage>),
|
||||
Unreliable(Other),
|
||||
Unreliable(Arc<UnreliableWrapper>),
|
||||
}
|
||||
|
||||
impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
impl GenericRemoteStorage {
|
||||
pub async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
@@ -362,7 +300,7 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
|
||||
pub async fn upload(
|
||||
&self,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
@@ -427,42 +365,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
Self::Unreliable(s) => s.delete_objects(paths).await,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn copy_object(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.copy(from, to).await,
|
||||
Self::AwsS3(s) => s.copy(from, to).await,
|
||||
Self::AzureBlob(s) => s.copy(from, to).await,
|
||||
Self::Unreliable(s) => s.copy(from, to).await,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn time_travel_recover(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
timestamp: SystemTime,
|
||||
done_if_after: SystemTime,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<(), TimeTravelError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => {
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
.await
|
||||
}
|
||||
Self::AwsS3(s) => {
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
.await
|
||||
}
|
||||
Self::AzureBlob(s) => {
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
.await
|
||||
}
|
||||
Self::Unreliable(s) => {
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
.await
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
@@ -473,12 +375,7 @@ impl GenericRemoteStorage {
|
||||
Self::LocalFs(LocalFs::new(root.clone())?)
|
||||
}
|
||||
RemoteStorageKind::AwsS3(s3_config) => {
|
||||
// The profile and access key id are only printed here for debugging purposes,
|
||||
// their values don't indicate the eventually taken choice for auth.
|
||||
let profile = std::env::var("AWS_PROFILE").unwrap_or_else(|_| "<none>".into());
|
||||
let access_key_id =
|
||||
std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "<none>".into());
|
||||
info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}",
|
||||
info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'",
|
||||
s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
|
||||
Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?))
|
||||
}
|
||||
@@ -501,7 +398,7 @@ impl GenericRemoteStorage {
|
||||
/// this path is used for the remote object id conversion only.
|
||||
pub async fn upload_storage_object(
|
||||
&self,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
from_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
) -> anyhow::Result<()> {
|
||||
@@ -754,8 +651,6 @@ impl ConcurrencyLimiter {
|
||||
RequestKind::Put => &self.write,
|
||||
RequestKind::List => &self.read,
|
||||
RequestKind::Delete => &self.write,
|
||||
RequestKind::Copy => &self.write,
|
||||
RequestKind::TimeTravel => &self.write,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -4,23 +4,18 @@
|
||||
//! This storage used in tests, but can also be used in cases when a certain persistent
|
||||
//! volume is mounted to the local FS.
|
||||
|
||||
use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin, time::SystemTime};
|
||||
use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use bytes::Bytes;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use futures::stream::Stream;
|
||||
use tokio::{
|
||||
fs,
|
||||
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
|
||||
};
|
||||
use tokio_util::{io::ReaderStream, sync::CancellationToken};
|
||||
use tracing::*;
|
||||
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
|
||||
|
||||
use crate::{
|
||||
Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath, TimeTravelError,
|
||||
};
|
||||
use crate::{Download, DownloadError, Listing, ListingMode, RemotePath};
|
||||
|
||||
use super::{RemoteStorage, StorageMetadata};
|
||||
|
||||
@@ -104,35 +99,27 @@ impl LocalFs {
|
||||
};
|
||||
|
||||
// If we were given a directory, we may use it as our starting point.
|
||||
// Otherwise, we must go up to the first ancestor dir that exists. This is because
|
||||
// Otherwise, we must go up to the parent directory. This is because
|
||||
// S3 object list prefixes can be arbitrary strings, but when reading
|
||||
// the local filesystem we need a directory to start calling read_dir on.
|
||||
let mut initial_dir = full_path.clone();
|
||||
loop {
|
||||
// Did we make it to the root?
|
||||
if initial_dir.parent().is_none() {
|
||||
anyhow::bail!("list_files: failed to find valid ancestor dir for {full_path}");
|
||||
}
|
||||
|
||||
match fs::metadata(initial_dir.clone()).await {
|
||||
Ok(meta) if meta.is_dir() => {
|
||||
// We found a directory, break
|
||||
break;
|
||||
}
|
||||
Ok(_meta) => {
|
||||
match fs::metadata(full_path.clone()).await {
|
||||
Ok(meta) => {
|
||||
if !meta.is_dir() {
|
||||
// It's not a directory: strip back to the parent
|
||||
initial_dir.pop();
|
||||
}
|
||||
Err(e) if e.kind() == ErrorKind::NotFound => {
|
||||
// It's not a file that exists: strip the prefix back to the parent directory
|
||||
initial_dir.pop();
|
||||
}
|
||||
Err(e) => {
|
||||
// Unexpected I/O error
|
||||
anyhow::bail!(e)
|
||||
}
|
||||
}
|
||||
Err(e) if e.kind() == ErrorKind::NotFound => {
|
||||
// It's not a file that exists: strip the prefix back to the parent directory
|
||||
initial_dir.pop();
|
||||
}
|
||||
Err(e) => {
|
||||
// Unexpected I/O error
|
||||
anyhow::bail!(e)
|
||||
}
|
||||
}
|
||||
|
||||
// Note that Utf8PathBuf starts_with only considers full path segments, but
|
||||
// object prefixes are arbitrary strings, so we need the strings for doing
|
||||
// starts_with later.
|
||||
@@ -159,6 +146,7 @@ impl LocalFs {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for LocalFs {
|
||||
async fn list(
|
||||
&self,
|
||||
@@ -223,7 +211,7 @@ impl RemoteStorage for LocalFs {
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
|
||||
data: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
@@ -256,12 +244,9 @@ impl RemoteStorage for LocalFs {
|
||||
);
|
||||
|
||||
let from_size_bytes = data_size_bytes as u64;
|
||||
let data = tokio_util::io::StreamReader::new(data);
|
||||
let data = std::pin::pin!(data);
|
||||
let mut buffer_to_read = data.take(from_size_bytes);
|
||||
|
||||
// alternatively we could just write the bytes to a file, but local_fs is a testing utility
|
||||
let bytes_read = io::copy_buf(&mut buffer_to_read, &mut destination)
|
||||
let bytes_read = io::copy(&mut buffer_to_read, &mut destination)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
@@ -315,7 +300,7 @@ impl RemoteStorage for LocalFs {
|
||||
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
|
||||
let target_path = from.with_base(&self.storage_root);
|
||||
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
|
||||
let source = ReaderStream::new(
|
||||
let source = io::BufReader::new(
|
||||
fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&target_path)
|
||||
@@ -332,8 +317,6 @@ impl RemoteStorage for LocalFs {
|
||||
.map_err(DownloadError::Other)?;
|
||||
Ok(Download {
|
||||
metadata,
|
||||
last_modified: None,
|
||||
etag: None,
|
||||
download_stream: Box::pin(source),
|
||||
})
|
||||
} else {
|
||||
@@ -357,14 +340,16 @@ impl RemoteStorage for LocalFs {
|
||||
}
|
||||
let target_path = from.with_base(&self.storage_root);
|
||||
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
|
||||
let mut source = tokio::fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&target_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to open source file {target_path:?} to use in the download")
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
let mut source = io::BufReader::new(
|
||||
fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&target_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to open source file {target_path:?} to use in the download")
|
||||
})
|
||||
.map_err(DownloadError::Other)?,
|
||||
);
|
||||
source
|
||||
.seek(io::SeekFrom::Start(start_inclusive))
|
||||
.await
|
||||
@@ -375,17 +360,15 @@ impl RemoteStorage for LocalFs {
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let download_stream: DownloadStream = match end_exclusive {
|
||||
Some(end_exclusive) => Box::pin(ReaderStream::new(
|
||||
source.take(end_exclusive - start_inclusive),
|
||||
)),
|
||||
None => Box::pin(ReaderStream::new(source)),
|
||||
};
|
||||
Ok(Download {
|
||||
metadata,
|
||||
last_modified: None,
|
||||
etag: None,
|
||||
download_stream,
|
||||
Ok(match end_exclusive {
|
||||
Some(end_exclusive) => Download {
|
||||
metadata,
|
||||
download_stream: Box::pin(source.take(end_exclusive - start_inclusive)),
|
||||
},
|
||||
None => Download {
|
||||
metadata,
|
||||
download_stream: Box::pin(source),
|
||||
},
|
||||
})
|
||||
} else {
|
||||
Err(DownloadError::NotFound)
|
||||
@@ -410,31 +393,6 @@ impl RemoteStorage for LocalFs {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
|
||||
let from_path = from.with_base(&self.storage_root);
|
||||
let to_path = to.with_base(&self.storage_root);
|
||||
create_target_directory(&to_path).await?;
|
||||
fs::copy(&from_path, &to_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to copy file from '{from_path}' to '{to_path}'",
|
||||
from_path = from_path,
|
||||
to_path = to_path
|
||||
)
|
||||
})?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[allow(clippy::diverging_sub_expression)]
|
||||
async fn time_travel_recover(
|
||||
&self,
|
||||
_prefix: Option<&RemotePath>,
|
||||
_timestamp: SystemTime,
|
||||
_done_if_after: SystemTime,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<(), TimeTravelError> {
|
||||
Err(TimeTravelError::Unimplemented)
|
||||
}
|
||||
}
|
||||
|
||||
fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
|
||||
@@ -509,9 +467,7 @@ fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
|
||||
mod fs_tests {
|
||||
use super::*;
|
||||
|
||||
use bytes::Bytes;
|
||||
use camino_tempfile::tempdir;
|
||||
use futures_util::Stream;
|
||||
use std::{collections::HashMap, io::Write};
|
||||
|
||||
async fn read_and_assert_remote_file_contents(
|
||||
@@ -521,7 +477,7 @@ mod fs_tests {
|
||||
remote_storage_path: &RemotePath,
|
||||
expected_metadata: Option<&StorageMetadata>,
|
||||
) -> anyhow::Result<String> {
|
||||
let download = storage
|
||||
let mut download = storage
|
||||
.download(remote_storage_path)
|
||||
.await
|
||||
.map_err(|e| anyhow::anyhow!("Download failed: {e}"))?;
|
||||
@@ -530,9 +486,13 @@ mod fs_tests {
|
||||
"Unexpected metadata returned for the downloaded file"
|
||||
);
|
||||
|
||||
let contents = aggregate(download.download_stream).await?;
|
||||
|
||||
String::from_utf8(contents).map_err(anyhow::Error::new)
|
||||
let mut contents = String::new();
|
||||
download
|
||||
.download_stream
|
||||
.read_to_string(&mut contents)
|
||||
.await
|
||||
.context("Failed to read remote file contents into string")?;
|
||||
Ok(contents)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -561,26 +521,25 @@ mod fs_tests {
|
||||
let storage = create_storage()?;
|
||||
|
||||
let id = RemotePath::new(Utf8Path::new("dummy"))?;
|
||||
let content = Bytes::from_static(b"12345");
|
||||
let content = move || futures::stream::once(futures::future::ready(Ok(content.clone())));
|
||||
let content = std::io::Cursor::new(b"12345");
|
||||
|
||||
// Check that you get an error if the size parameter doesn't match the actual
|
||||
// size of the stream.
|
||||
storage
|
||||
.upload(content(), 0, &id, None)
|
||||
.upload(Box::new(content.clone()), 0, &id, None)
|
||||
.await
|
||||
.expect_err("upload with zero size succeeded");
|
||||
storage
|
||||
.upload(content(), 4, &id, None)
|
||||
.upload(Box::new(content.clone()), 4, &id, None)
|
||||
.await
|
||||
.expect_err("upload with too short size succeeded");
|
||||
storage
|
||||
.upload(content(), 6, &id, None)
|
||||
.upload(Box::new(content.clone()), 6, &id, None)
|
||||
.await
|
||||
.expect_err("upload with too large size succeeded");
|
||||
|
||||
// Correct size is 5, this should succeed.
|
||||
storage.upload(content(), 5, &id, None).await?;
|
||||
storage.upload(Box::new(content), 5, &id, None).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -628,7 +587,7 @@ mod fs_tests {
|
||||
let uploaded_bytes = dummy_contents(upload_name).into_bytes();
|
||||
let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);
|
||||
|
||||
let first_part_download = storage
|
||||
let mut first_part_download = storage
|
||||
.download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
|
||||
.await?;
|
||||
assert!(
|
||||
@@ -636,13 +595,21 @@ mod fs_tests {
|
||||
"No metadata should be returned for no metadata upload"
|
||||
);
|
||||
|
||||
let first_part_remote = aggregate(first_part_download.download_stream).await?;
|
||||
let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
io::copy(
|
||||
&mut first_part_download.download_stream,
|
||||
&mut first_part_remote,
|
||||
)
|
||||
.await?;
|
||||
first_part_remote.flush().await?;
|
||||
let first_part_remote = first_part_remote.into_inner().into_inner();
|
||||
assert_eq!(
|
||||
first_part_local, first_part_remote,
|
||||
first_part_local,
|
||||
first_part_remote.as_slice(),
|
||||
"First part bytes should be returned when requested"
|
||||
);
|
||||
|
||||
let second_part_download = storage
|
||||
let mut second_part_download = storage
|
||||
.download_byte_range(
|
||||
&upload_target,
|
||||
first_part_local.len() as u64,
|
||||
@@ -654,9 +621,17 @@ mod fs_tests {
|
||||
"No metadata should be returned for no metadata upload"
|
||||
);
|
||||
|
||||
let second_part_remote = aggregate(second_part_download.download_stream).await?;
|
||||
let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
io::copy(
|
||||
&mut second_part_download.download_stream,
|
||||
&mut second_part_remote,
|
||||
)
|
||||
.await?;
|
||||
second_part_remote.flush().await?;
|
||||
let second_part_remote = second_part_remote.into_inner().into_inner();
|
||||
assert_eq!(
|
||||
second_part_local, second_part_remote,
|
||||
second_part_local,
|
||||
second_part_remote.as_slice(),
|
||||
"Second part bytes should be returned when requested"
|
||||
);
|
||||
|
||||
@@ -746,10 +721,17 @@ mod fs_tests {
|
||||
let uploaded_bytes = dummy_contents(upload_name).into_bytes();
|
||||
let (first_part_local, _) = uploaded_bytes.split_at(3);
|
||||
|
||||
let partial_download_with_metadata = storage
|
||||
let mut partial_download_with_metadata = storage
|
||||
.download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
|
||||
.await?;
|
||||
let first_part_remote = aggregate(partial_download_with_metadata.download_stream).await?;
|
||||
let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
io::copy(
|
||||
&mut partial_download_with_metadata.download_stream,
|
||||
&mut first_part_remote,
|
||||
)
|
||||
.await?;
|
||||
first_part_remote.flush().await?;
|
||||
let first_part_remote = first_part_remote.into_inner().into_inner();
|
||||
assert_eq!(
|
||||
first_part_local,
|
||||
first_part_remote.as_slice(),
|
||||
@@ -825,16 +807,16 @@ mod fs_tests {
|
||||
)
|
||||
})?;
|
||||
|
||||
let file = tokio_util::io::ReaderStream::new(file);
|
||||
|
||||
storage.upload(file, size, &relative_path, metadata).await?;
|
||||
storage
|
||||
.upload(Box::new(file), size, &relative_path, metadata)
|
||||
.await?;
|
||||
Ok(relative_path)
|
||||
}
|
||||
|
||||
async fn create_file_for_upload(
|
||||
path: &Utf8Path,
|
||||
contents: &str,
|
||||
) -> anyhow::Result<(fs::File, usize)> {
|
||||
) -> anyhow::Result<(io::BufReader<fs::File>, usize)> {
|
||||
std::fs::create_dir_all(path.parent().unwrap())?;
|
||||
let mut file_for_writing = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
@@ -844,7 +826,7 @@ mod fs_tests {
|
||||
drop(file_for_writing);
|
||||
let file_size = path.metadata()?.len() as usize;
|
||||
Ok((
|
||||
fs::OpenOptions::new().read(true).open(&path).await?,
|
||||
io::BufReader::new(fs::OpenOptions::new().read(true).open(&path).await?),
|
||||
file_size,
|
||||
))
|
||||
}
|
||||
@@ -858,16 +840,4 @@ mod fs_tests {
|
||||
files.sort_by(|a, b| a.0.cmp(&b.0));
|
||||
Ok(files)
|
||||
}
|
||||
|
||||
async fn aggregate(
|
||||
stream: impl Stream<Item = std::io::Result<Bytes>>,
|
||||
) -> anyhow::Result<Vec<u8>> {
|
||||
use futures::stream::StreamExt;
|
||||
let mut out = Vec::new();
|
||||
let mut stream = std::pin::pin!(stream);
|
||||
while let Some(res) = stream.next().await {
|
||||
out.extend_from_slice(&res?[..]);
|
||||
}
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,49 +4,38 @@
|
||||
//! allowing multiple api users to independently work with the same S3 bucket, if
|
||||
//! their bucket prefixes are both specified and different.
|
||||
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
collections::HashMap,
|
||||
pin::Pin,
|
||||
sync::Arc,
|
||||
task::{Context, Poll},
|
||||
time::SystemTime,
|
||||
};
|
||||
use std::{borrow::Cow, sync::Arc};
|
||||
|
||||
use anyhow::{anyhow, Context as _};
|
||||
use anyhow::Context;
|
||||
use aws_config::{
|
||||
environment::credentials::EnvironmentVariableCredentialsProvider,
|
||||
imds::credentials::ImdsCredentialsProvider,
|
||||
meta::credentials::CredentialsProviderChain,
|
||||
profile::ProfileFileCredentialsProvider,
|
||||
provider_config::ProviderConfig,
|
||||
retry::{RetryConfigBuilder, RetryMode},
|
||||
web_identity_token::WebIdentityTokenCredentialsProvider,
|
||||
BehaviorVersion,
|
||||
};
|
||||
use aws_credential_types::provider::SharedCredentialsProvider;
|
||||
use aws_credential_types::cache::CredentialsCache;
|
||||
use aws_sdk_s3::{
|
||||
config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
|
||||
config::{AsyncSleep, Config, Region, SharedAsyncSleep},
|
||||
error::SdkError,
|
||||
operation::get_object::GetObjectError,
|
||||
types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion},
|
||||
primitives::ByteStream,
|
||||
types::{Delete, ObjectIdentifier},
|
||||
Client,
|
||||
};
|
||||
use aws_smithy_async::rt::sleep::TokioSleep;
|
||||
|
||||
use aws_smithy_types::byte_stream::ByteStream;
|
||||
use aws_smithy_types::{body::SdkBody, DateTime};
|
||||
use bytes::Bytes;
|
||||
use futures::stream::Stream;
|
||||
use aws_smithy_http::body::SdkBody;
|
||||
use hyper::Body;
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::backoff;
|
||||
use tokio::io::{self, AsyncRead};
|
||||
use tokio_util::io::ReaderStream;
|
||||
use tracing::debug;
|
||||
|
||||
use super::StorageMetadata;
|
||||
use crate::{
|
||||
ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
|
||||
S3Config, TimeTravelError, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
S3Config, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
};
|
||||
|
||||
pub(super) mod metrics;
|
||||
@@ -72,36 +61,28 @@ struct GetObjectRequest {
|
||||
impl S3Bucket {
|
||||
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
|
||||
pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
|
||||
tracing::debug!(
|
||||
debug!(
|
||||
"Creating s3 remote storage for S3 bucket {}",
|
||||
aws_config.bucket_name
|
||||
);
|
||||
|
||||
let region = Some(Region::new(aws_config.bucket_region.clone()));
|
||||
|
||||
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
|
||||
|
||||
let credentials_provider = {
|
||||
// uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
|
||||
CredentialsProviderChain::first_try(
|
||||
"env",
|
||||
EnvironmentVariableCredentialsProvider::new(),
|
||||
)
|
||||
// uses "AWS_PROFILE" / `aws sso login --profile <profile>`
|
||||
.or_else(
|
||||
"profile-sso",
|
||||
ProfileFileCredentialsProvider::builder()
|
||||
.configure(&provider_conf)
|
||||
.build(),
|
||||
)
|
||||
// uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
|
||||
// needed to access remote extensions bucket
|
||||
.or_else(
|
||||
"token",
|
||||
.or_else("token", {
|
||||
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
|
||||
|
||||
WebIdentityTokenCredentialsProvider::builder()
|
||||
.configure(&provider_conf)
|
||||
.build(),
|
||||
)
|
||||
.build()
|
||||
})
|
||||
// uses imds v2
|
||||
.or_else("imds", ImdsCredentialsProvider::builder().build())
|
||||
};
|
||||
@@ -117,20 +98,18 @@ impl S3Bucket {
|
||||
.set_max_attempts(Some(1))
|
||||
.set_mode(Some(RetryMode::Adaptive));
|
||||
|
||||
let mut config_builder = Builder::default()
|
||||
.behavior_version(BehaviorVersion::v2023_11_09())
|
||||
let mut config_builder = Config::builder()
|
||||
.region(region)
|
||||
.identity_cache(IdentityCache::lazy().build())
|
||||
.credentials_provider(SharedCredentialsProvider::new(credentials_provider))
|
||||
.retry_config(retry_config.build())
|
||||
.sleep_impl(SharedAsyncSleep::from(sleep_impl));
|
||||
.credentials_cache(CredentialsCache::lazy())
|
||||
.credentials_provider(credentials_provider)
|
||||
.sleep_impl(SharedAsyncSleep::from(sleep_impl))
|
||||
.retry_config(retry_config.build());
|
||||
|
||||
if let Some(custom_endpoint) = aws_config.endpoint.clone() {
|
||||
config_builder = config_builder
|
||||
.endpoint_url(custom_endpoint)
|
||||
.force_path_style(true);
|
||||
}
|
||||
|
||||
let client = Client::from_conf(config_builder.build());
|
||||
|
||||
let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
|
||||
@@ -232,152 +211,58 @@ impl S3Bucket {
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
|
||||
if get_object.is_err() {
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Err,
|
||||
started_at,
|
||||
);
|
||||
}
|
||||
|
||||
match get_object {
|
||||
Ok(object_output) => {
|
||||
let metadata = object_output.metadata().cloned().map(StorageMetadata);
|
||||
let etag = object_output.e_tag.clone();
|
||||
let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
|
||||
|
||||
let body = object_output.body;
|
||||
let body = ByteStreamAsStream::from(body);
|
||||
let body = PermitCarrying::new(permit, body);
|
||||
let body = TimedDownload::new(started_at, body);
|
||||
|
||||
Ok(Download {
|
||||
metadata,
|
||||
etag,
|
||||
last_modified,
|
||||
download_stream: Box::pin(body),
|
||||
download_stream: Box::pin(io::BufReader::new(TimedDownload::new(
|
||||
started_at,
|
||||
RatelimitedAsyncRead::new(permit, object_output.body.into_async_read()),
|
||||
))),
|
||||
})
|
||||
}
|
||||
Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
|
||||
// Count this in the AttemptOutcome::Ok bucket, because 404 is not
|
||||
// an error: we expect to sometimes fetch an object and find it missing,
|
||||
// e.g. when probing for timeline indices.
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Ok,
|
||||
started_at,
|
||||
);
|
||||
Err(DownloadError::NotFound)
|
||||
}
|
||||
Err(e) => {
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Err,
|
||||
started_at,
|
||||
);
|
||||
|
||||
Err(DownloadError::Other(
|
||||
anyhow::Error::new(e).context("download s3 object"),
|
||||
))
|
||||
}
|
||||
Err(e) => Err(DownloadError::Other(
|
||||
anyhow::Error::new(e).context("download s3 object"),
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete_oids(
|
||||
&self,
|
||||
kind: RequestKind,
|
||||
delete_objects: &[ObjectIdentifier],
|
||||
) -> anyhow::Result<()> {
|
||||
for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let resp = self
|
||||
.client
|
||||
.delete_objects()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.delete(
|
||||
Delete::builder()
|
||||
.set_objects(Some(chunk.to_vec()))
|
||||
.build()?,
|
||||
)
|
||||
.send()
|
||||
.await;
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &resp, started_at);
|
||||
|
||||
let resp = resp?;
|
||||
metrics::BUCKET_METRICS
|
||||
.deleted_objects_total
|
||||
.inc_by(chunk.len() as u64);
|
||||
if let Some(errors) = resp.errors {
|
||||
// Log a bounded number of the errors within the response:
|
||||
// these requests can carry 1000 keys so logging each one
|
||||
// would be too verbose, especially as errors may lead us
|
||||
// to retry repeatedly.
|
||||
const LOG_UP_TO_N_ERRORS: usize = 10;
|
||||
for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
|
||||
tracing::warn!(
|
||||
"DeleteObjects key {} failed: {}: {}",
|
||||
e.key.as_ref().map(Cow::from).unwrap_or("".into()),
|
||||
e.code.as_ref().map(Cow::from).unwrap_or("".into()),
|
||||
e.message.as_ref().map(Cow::from).unwrap_or("".into())
|
||||
);
|
||||
}
|
||||
|
||||
return Err(anyhow::format_err!(
|
||||
"Failed to delete {} objects",
|
||||
errors.len()
|
||||
));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pin_project_lite::pin_project! {
|
||||
struct ByteStreamAsStream {
|
||||
#[pin]
|
||||
inner: aws_smithy_types::byte_stream::ByteStream
|
||||
}
|
||||
}
|
||||
|
||||
impl From<aws_smithy_types::byte_stream::ByteStream> for ByteStreamAsStream {
|
||||
fn from(inner: aws_smithy_types::byte_stream::ByteStream) -> Self {
|
||||
ByteStreamAsStream { inner }
|
||||
}
|
||||
}
|
||||
|
||||
impl Stream for ByteStreamAsStream {
|
||||
type Item = std::io::Result<Bytes>;
|
||||
|
||||
fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
|
||||
// this does the std::io::ErrorKind::Other conversion
|
||||
self.project().inner.poll_next(cx).map_err(|x| x.into())
|
||||
}
|
||||
|
||||
// cannot implement size_hint because inner.size_hint is remaining size in bytes, which makes
|
||||
// sense and Stream::size_hint does not really
|
||||
}
|
||||
|
||||
pin_project_lite::pin_project! {
|
||||
/// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
|
||||
struct PermitCarrying<S> {
|
||||
struct RatelimitedAsyncRead<S> {
|
||||
permit: tokio::sync::OwnedSemaphorePermit,
|
||||
#[pin]
|
||||
inner: S,
|
||||
}
|
||||
}
|
||||
|
||||
impl<S> PermitCarrying<S> {
|
||||
impl<S: AsyncRead> RatelimitedAsyncRead<S> {
|
||||
fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
|
||||
Self { permit, inner }
|
||||
RatelimitedAsyncRead { permit, inner }
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for PermitCarrying<S> {
|
||||
type Item = <S as Stream>::Item;
|
||||
|
||||
fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
|
||||
self.project().inner.poll_next(cx)
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
self.inner.size_hint()
|
||||
impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
|
||||
fn poll_read(
|
||||
self: std::pin::Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
buf: &mut io::ReadBuf<'_>,
|
||||
) -> std::task::Poll<std::io::Result<()>> {
|
||||
let this = self.project();
|
||||
this.inner.poll_read(cx, buf)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -397,7 +282,7 @@ pin_project_lite::pin_project! {
|
||||
}
|
||||
}
|
||||
|
||||
impl<S> TimedDownload<S> {
|
||||
impl<S: AsyncRead> TimedDownload<S> {
|
||||
fn new(started_at: std::time::Instant, inner: S) -> Self {
|
||||
TimedDownload {
|
||||
started_at,
|
||||
@@ -407,29 +292,29 @@ impl<S> TimedDownload<S> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
|
||||
type Item = <S as Stream>::Item;
|
||||
|
||||
fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
|
||||
use std::task::ready;
|
||||
|
||||
impl<S: AsyncRead> AsyncRead for TimedDownload<S> {
|
||||
fn poll_read(
|
||||
self: std::pin::Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
buf: &mut io::ReadBuf<'_>,
|
||||
) -> std::task::Poll<std::io::Result<()>> {
|
||||
let this = self.project();
|
||||
let before = buf.filled().len();
|
||||
let read = std::task::ready!(this.inner.poll_read(cx, buf));
|
||||
|
||||
let res = ready!(this.inner.poll_next(cx));
|
||||
match &res {
|
||||
Some(Ok(_)) => {}
|
||||
Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err,
|
||||
None => *this.outcome = metrics::AttemptOutcome::Ok,
|
||||
let read_eof = buf.filled().len() == before;
|
||||
|
||||
match read {
|
||||
Ok(()) if read_eof => *this.outcome = AttemptOutcome::Ok,
|
||||
Ok(()) => { /* still in progress */ }
|
||||
Err(_) => *this.outcome = AttemptOutcome::Err,
|
||||
}
|
||||
|
||||
Poll::Ready(res)
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
self.inner.size_hint()
|
||||
std::task::Poll::Ready(read)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for S3Bucket {
|
||||
async fn list(
|
||||
&self,
|
||||
@@ -486,11 +371,11 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
let response = response?;
|
||||
|
||||
let keys = response.contents();
|
||||
let keys = response.contents().unwrap_or_default();
|
||||
let empty = Vec::new();
|
||||
let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);
|
||||
|
||||
tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
|
||||
tracing::info!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
|
||||
|
||||
for object in keys {
|
||||
let object_path = object.key().expect("response does not contain a key");
|
||||
@@ -515,7 +400,7 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
from_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
@@ -525,8 +410,8 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let body = Body::wrap_stream(from);
|
||||
let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body));
|
||||
let body = Body::wrap_stream(ReaderStream::new(from));
|
||||
let bytes_stream = ByteStream::new(SdkBody::from(body));
|
||||
|
||||
let res = self
|
||||
.client
|
||||
@@ -549,38 +434,6 @@ impl RemoteStorage for S3Bucket {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
|
||||
let kind = RequestKind::Copy;
|
||||
let _guard = self.permit(kind).await;
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
// we need to specify bucket_name as a prefix
|
||||
let copy_source = format!(
|
||||
"{}/{}",
|
||||
self.bucket_name,
|
||||
self.relative_path_to_s3_object(from)
|
||||
);
|
||||
|
||||
let res = self
|
||||
.client
|
||||
.copy_object()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.key(self.relative_path_to_s3_object(to))
|
||||
.copy_source(copy_source)
|
||||
.send()
|
||||
.await;
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &res, started_at);
|
||||
|
||||
res?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
|
||||
// if prefix is not none then download file `prefix/from`
|
||||
// if prefix is none then download file `from`
|
||||
@@ -621,222 +474,64 @@ impl RemoteStorage for S3Bucket {
|
||||
for path in paths {
|
||||
let obj_id = ObjectIdentifier::builder()
|
||||
.set_key(Some(self.relative_path_to_s3_object(path)))
|
||||
.build()?;
|
||||
.build();
|
||||
delete_objects.push(obj_id);
|
||||
}
|
||||
|
||||
self.delete_oids(kind, &delete_objects).await
|
||||
for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let resp = self
|
||||
.client
|
||||
.delete_objects()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.delete(Delete::builder().set_objects(Some(chunk.to_vec())).build())
|
||||
.send()
|
||||
.await;
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &resp, started_at);
|
||||
|
||||
match resp {
|
||||
Ok(resp) => {
|
||||
metrics::BUCKET_METRICS
|
||||
.deleted_objects_total
|
||||
.inc_by(chunk.len() as u64);
|
||||
if let Some(errors) = resp.errors {
|
||||
// Log a bounded number of the errors within the response:
|
||||
// these requests can carry 1000 keys so logging each one
|
||||
// would be too verbose, especially as errors may lead us
|
||||
// to retry repeatedly.
|
||||
const LOG_UP_TO_N_ERRORS: usize = 10;
|
||||
for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
|
||||
tracing::warn!(
|
||||
"DeleteObjects key {} failed: {}: {}",
|
||||
e.key.as_ref().map(Cow::from).unwrap_or("".into()),
|
||||
e.code.as_ref().map(Cow::from).unwrap_or("".into()),
|
||||
e.message.as_ref().map(Cow::from).unwrap_or("".into())
|
||||
);
|
||||
}
|
||||
|
||||
return Err(anyhow::format_err!(
|
||||
"Failed to delete {} objects",
|
||||
errors.len()
|
||||
));
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
let paths = std::array::from_ref(path);
|
||||
self.delete_objects(paths).await
|
||||
}
|
||||
|
||||
async fn time_travel_recover(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
timestamp: SystemTime,
|
||||
done_if_after: SystemTime,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<(), TimeTravelError> {
|
||||
let kind = RequestKind::TimeTravel;
|
||||
let _guard = self.permit(kind).await;
|
||||
|
||||
let timestamp = DateTime::from(timestamp);
|
||||
let done_if_after = DateTime::from(done_if_after);
|
||||
|
||||
tracing::trace!("Target time: {timestamp:?}, done_if_after {done_if_after:?}");
|
||||
|
||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||
let prefix = prefix
|
||||
.map(|p| self.relative_path_to_s3_object(p))
|
||||
.or_else(|| self.prefix_in_bucket.clone());
|
||||
|
||||
let warn_threshold = 3;
|
||||
let max_retries = 10;
|
||||
let is_permanent = |_e: &_| false;
|
||||
|
||||
let mut key_marker = None;
|
||||
let mut version_id_marker = None;
|
||||
let mut versions_and_deletes = Vec::new();
|
||||
|
||||
loop {
|
||||
let response = backoff::retry(
|
||||
|| async {
|
||||
self.client
|
||||
.list_object_versions()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.set_prefix(prefix.clone())
|
||||
.set_key_marker(key_marker.clone())
|
||||
.set_version_id_marker(version_id_marker.clone())
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| TimeTravelError::Other(e.into()))
|
||||
},
|
||||
is_permanent,
|
||||
warn_threshold,
|
||||
max_retries,
|
||||
"listing object versions for time_travel_recover",
|
||||
backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled),
|
||||
)
|
||||
.await?;
|
||||
|
||||
tracing::trace!(
|
||||
" Got List response version_id_marker={:?}, key_marker={:?}",
|
||||
response.version_id_marker,
|
||||
response.key_marker
|
||||
);
|
||||
let versions = response
|
||||
.versions
|
||||
.unwrap_or_default()
|
||||
.into_iter()
|
||||
.map(VerOrDelete::from_version);
|
||||
let deletes = response
|
||||
.delete_markers
|
||||
.unwrap_or_default()
|
||||
.into_iter()
|
||||
.map(VerOrDelete::from_delete_marker);
|
||||
itertools::process_results(versions.chain(deletes), |n_vds| {
|
||||
versions_and_deletes.extend(n_vds)
|
||||
})
|
||||
.map_err(TimeTravelError::Other)?;
|
||||
fn none_if_empty(v: Option<String>) -> Option<String> {
|
||||
v.filter(|v| !v.is_empty())
|
||||
}
|
||||
version_id_marker = none_if_empty(response.next_version_id_marker);
|
||||
key_marker = none_if_empty(response.next_key_marker);
|
||||
if version_id_marker.is_none() {
|
||||
// The final response is not supposed to be truncated
|
||||
if response.is_truncated.unwrap_or_default() {
|
||||
return Err(TimeTravelError::Other(anyhow::anyhow!(
|
||||
"Received truncated ListObjectVersions response for prefix={prefix:?}"
|
||||
)));
|
||||
}
|
||||
break;
|
||||
}
|
||||
// Limit the number of versions deletions, mostly so that we don't
|
||||
// keep requesting forever if the list is too long, as we'd put the
|
||||
// list in RAM.
|
||||
// Building a list of 100k entries that reaches the limit roughly takes
|
||||
// 40 seconds, and roughly corresponds to tenants of 2 TiB physical size.
|
||||
const COMPLEXITY_LIMIT: usize = 100_000;
|
||||
if versions_and_deletes.len() >= COMPLEXITY_LIMIT {
|
||||
return Err(TimeTravelError::TooManyVersions);
|
||||
}
|
||||
}
|
||||
|
||||
tracing::info!(
|
||||
"Built list for time travel with {} versions and deletions",
|
||||
versions_and_deletes.len()
|
||||
);
|
||||
|
||||
// Work on the list of references instead of the objects directly,
|
||||
// otherwise we get lifetime errors in the sort_by_key call below.
|
||||
let mut versions_and_deletes = versions_and_deletes.iter().collect::<Vec<_>>();
|
||||
|
||||
versions_and_deletes.sort_by_key(|vd| (&vd.key, &vd.last_modified));
|
||||
|
||||
let mut vds_for_key = HashMap::<_, Vec<_>>::new();
|
||||
|
||||
for vd in &versions_and_deletes {
|
||||
let VerOrDelete {
|
||||
version_id, key, ..
|
||||
} = &vd;
|
||||
if version_id == "null" {
|
||||
return Err(TimeTravelError::Other(anyhow!("Received ListVersions response for key={key} with version_id='null', \
|
||||
indicating either disabled versioning, or legacy objects with null version id values")));
|
||||
}
|
||||
tracing::trace!(
|
||||
"Parsing version key={key} version_id={version_id} kind={:?}",
|
||||
vd.kind
|
||||
);
|
||||
|
||||
vds_for_key.entry(key).or_default().push(vd);
|
||||
}
|
||||
for (key, versions) in vds_for_key {
|
||||
let last_vd = versions.last().unwrap();
|
||||
if last_vd.last_modified > done_if_after {
|
||||
tracing::trace!("Key {key} has version later than done_if_after, skipping");
|
||||
continue;
|
||||
}
|
||||
// the version we want to restore to.
|
||||
let version_to_restore_to =
|
||||
match versions.binary_search_by_key(×tamp, |tpl| tpl.last_modified) {
|
||||
Ok(v) => v,
|
||||
Err(e) => e,
|
||||
};
|
||||
if version_to_restore_to == versions.len() {
|
||||
tracing::trace!("Key {key} has no changes since timestamp, skipping");
|
||||
continue;
|
||||
}
|
||||
let mut do_delete = false;
|
||||
if version_to_restore_to == 0 {
|
||||
// All versions more recent, so the key didn't exist at the specified time point.
|
||||
tracing::trace!(
|
||||
"All {} versions more recent for {key}, deleting",
|
||||
versions.len()
|
||||
);
|
||||
do_delete = true;
|
||||
} else {
|
||||
match &versions[version_to_restore_to - 1] {
|
||||
VerOrDelete {
|
||||
kind: VerOrDeleteKind::Version,
|
||||
version_id,
|
||||
..
|
||||
} => {
|
||||
tracing::trace!("Copying old version {version_id} for {key}...");
|
||||
// Restore the state to the last version by copying
|
||||
let source_id =
|
||||
format!("{}/{key}?versionId={version_id}", self.bucket_name);
|
||||
|
||||
backoff::retry(
|
||||
|| async {
|
||||
self.client
|
||||
.copy_object()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.key(key)
|
||||
.copy_source(&source_id)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| TimeTravelError::Other(e.into()))
|
||||
},
|
||||
is_permanent,
|
||||
warn_threshold,
|
||||
max_retries,
|
||||
"copying object version for time_travel_recover",
|
||||
backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled),
|
||||
)
|
||||
.await?;
|
||||
tracing::info!(%version_id, %key, "Copied old version in S3");
|
||||
}
|
||||
VerOrDelete {
|
||||
kind: VerOrDeleteKind::DeleteMarker,
|
||||
..
|
||||
} => {
|
||||
do_delete = true;
|
||||
}
|
||||
}
|
||||
};
|
||||
if do_delete {
|
||||
if matches!(last_vd.kind, VerOrDeleteKind::DeleteMarker) {
|
||||
// Key has since been deleted (but there was some history), no need to do anything
|
||||
tracing::trace!("Key {key} already deleted, skipping.");
|
||||
} else {
|
||||
tracing::trace!("Deleting {key}...");
|
||||
|
||||
let oid = ObjectIdentifier::builder()
|
||||
.key(key.to_owned())
|
||||
.build()
|
||||
.map_err(|e| TimeTravelError::Other(anyhow::Error::new(e)))?;
|
||||
self.delete_oids(kind, &[oid])
|
||||
.await
|
||||
.map_err(TimeTravelError::Other)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
|
||||
@@ -861,62 +556,6 @@ fn start_measuring_requests(
|
||||
})
|
||||
}
|
||||
|
||||
// Save RAM and only store the needed data instead of the entire ObjectVersion/DeleteMarkerEntry
|
||||
struct VerOrDelete {
|
||||
kind: VerOrDeleteKind,
|
||||
last_modified: DateTime,
|
||||
version_id: String,
|
||||
key: String,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
enum VerOrDeleteKind {
|
||||
Version,
|
||||
DeleteMarker,
|
||||
}
|
||||
|
||||
impl VerOrDelete {
|
||||
fn with_kind(
|
||||
kind: VerOrDeleteKind,
|
||||
last_modified: Option<DateTime>,
|
||||
version_id: Option<String>,
|
||||
key: Option<String>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let lvk = (last_modified, version_id, key);
|
||||
let (Some(last_modified), Some(version_id), Some(key)) = lvk else {
|
||||
anyhow::bail!(
|
||||
"One (or more) of last_modified, key, and id is None. \
|
||||
Is versioning enabled in the bucket? last_modified={:?}, version_id={:?}, key={:?}",
|
||||
lvk.0,
|
||||
lvk.1,
|
||||
lvk.2,
|
||||
);
|
||||
};
|
||||
Ok(Self {
|
||||
kind,
|
||||
last_modified,
|
||||
version_id,
|
||||
key,
|
||||
})
|
||||
}
|
||||
fn from_version(v: ObjectVersion) -> anyhow::Result<Self> {
|
||||
Self::with_kind(
|
||||
VerOrDeleteKind::Version,
|
||||
v.last_modified,
|
||||
v.version_id,
|
||||
v.key,
|
||||
)
|
||||
}
|
||||
fn from_delete_marker(v: DeleteMarkerEntry) -> anyhow::Result<Self> {
|
||||
Self::with_kind(
|
||||
VerOrDeleteKind::DeleteMarker,
|
||||
v.last_modified,
|
||||
v.version_id,
|
||||
v.key,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use camino::Utf8Path;
|
||||
|
||||
@@ -11,8 +11,6 @@ pub(crate) enum RequestKind {
|
||||
Put = 1,
|
||||
Delete = 2,
|
||||
List = 3,
|
||||
Copy = 4,
|
||||
TimeTravel = 5,
|
||||
}
|
||||
|
||||
use RequestKind::*;
|
||||
@@ -24,8 +22,6 @@ impl RequestKind {
|
||||
Put => "put_object",
|
||||
Delete => "delete_object",
|
||||
List => "list_objects",
|
||||
Copy => "copy_object",
|
||||
TimeTravel => "time_travel_recover",
|
||||
}
|
||||
}
|
||||
const fn as_index(&self) -> usize {
|
||||
@@ -33,7 +29,7 @@ impl RequestKind {
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) struct RequestTyped<C>([C; 6]);
|
||||
pub(super) struct RequestTyped<C>([C; 4]);
|
||||
|
||||
impl<C> RequestTyped<C> {
|
||||
pub(super) fn get(&self, kind: RequestKind) -> &C {
|
||||
@@ -42,8 +38,8 @@ impl<C> RequestTyped<C> {
|
||||
|
||||
fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
|
||||
use RequestKind::*;
|
||||
let mut it = [Get, Put, Delete, List, Copy, TimeTravel].into_iter();
|
||||
let arr = std::array::from_fn::<C, 6, _>(|index| {
|
||||
let mut it = [Get, Put, Delete, List].into_iter();
|
||||
let arr = std::array::from_fn::<C, 4, _>(|index| {
|
||||
let next = it.next().unwrap();
|
||||
assert_eq!(index, next.as_index());
|
||||
f(next)
|
||||
|
||||
@@ -1,21 +1,16 @@
|
||||
//! This module provides a wrapper around a real RemoteStorage implementation that
|
||||
//! causes the first N attempts at each upload or download operatio to fail. For
|
||||
//! testing purposes.
|
||||
use bytes::Bytes;
|
||||
use futures::stream::Stream;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Mutex;
|
||||
use std::time::SystemTime;
|
||||
use std::{collections::hash_map::Entry, sync::Arc};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use crate::{
|
||||
Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
|
||||
StorageMetadata, TimeTravelError,
|
||||
Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
|
||||
};
|
||||
|
||||
pub struct UnreliableWrapper {
|
||||
inner: GenericRemoteStorage<Arc<VoidStorage>>,
|
||||
inner: crate::GenericRemoteStorage,
|
||||
|
||||
// This many attempts of each operation will fail, then we let it succeed.
|
||||
attempts_to_fail: u64,
|
||||
@@ -32,21 +27,11 @@ enum RemoteOp {
|
||||
Download(RemotePath),
|
||||
Delete(RemotePath),
|
||||
DeleteObjects(Vec<RemotePath>),
|
||||
TimeTravelRecover(Option<RemotePath>),
|
||||
}
|
||||
|
||||
impl UnreliableWrapper {
|
||||
pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self {
|
||||
assert!(attempts_to_fail > 0);
|
||||
let inner = match inner {
|
||||
GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s),
|
||||
GenericRemoteStorage::AzureBlob(s) => GenericRemoteStorage::AzureBlob(s),
|
||||
GenericRemoteStorage::LocalFs(s) => GenericRemoteStorage::LocalFs(s),
|
||||
// We could also make this a no-op, as in, extract the inner of the passed generic remote storage
|
||||
GenericRemoteStorage::Unreliable(_s) => {
|
||||
panic!("Can't wrap unreliable wrapper unreliably")
|
||||
}
|
||||
};
|
||||
UnreliableWrapper {
|
||||
inner,
|
||||
attempts_to_fail,
|
||||
@@ -97,9 +82,7 @@ impl UnreliableWrapper {
|
||||
}
|
||||
}
|
||||
|
||||
// We never construct this, so the type is not important, just has to not be UnreliableWrapper and impl RemoteStorage.
|
||||
type VoidStorage = crate::LocalFs;
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for UnreliableWrapper {
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
@@ -125,7 +108,7 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
// S3 PUT request requires the content length to be specified,
|
||||
// otherwise it starts to fail with the concurrent connection count increasing.
|
||||
data_size_bytes: usize,
|
||||
@@ -177,25 +160,4 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
|
||||
// copy is equivalent to download + upload
|
||||
self.attempt(RemoteOp::Download(from.clone()))?;
|
||||
self.attempt(RemoteOp::Upload(to.clone()))?;
|
||||
self.inner.copy_object(from, to).await
|
||||
}
|
||||
|
||||
async fn time_travel_recover(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
timestamp: SystemTime,
|
||||
done_if_after: SystemTime,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<(), TimeTravelError> {
|
||||
self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))
|
||||
.map_err(|e| TimeTravelError::Other(anyhow::Error::new(e)))?;
|
||||
self.inner
|
||||
.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,200 +0,0 @@
|
||||
use std::collections::HashSet;
|
||||
use std::ops::ControlFlow;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8Path;
|
||||
use futures::stream::Stream;
|
||||
use once_cell::sync::OnceCell;
|
||||
use remote_storage::{Download, GenericRemoteStorage, RemotePath};
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
static LOGGING_DONE: OnceCell<()> = OnceCell::new();
|
||||
|
||||
pub(crate) fn upload_stream(
|
||||
content: std::borrow::Cow<'static, [u8]>,
|
||||
) -> (
|
||||
impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
usize,
|
||||
) {
|
||||
use std::borrow::Cow;
|
||||
|
||||
let content = match content {
|
||||
Cow::Borrowed(x) => Bytes::from_static(x),
|
||||
Cow::Owned(vec) => Bytes::from(vec),
|
||||
};
|
||||
wrap_stream(content)
|
||||
}
|
||||
|
||||
pub(crate) fn wrap_stream(
|
||||
content: bytes::Bytes,
|
||||
) -> (
|
||||
impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
usize,
|
||||
) {
|
||||
let len = content.len();
|
||||
let content = futures::future::ready(Ok(content));
|
||||
|
||||
(futures::stream::once(content), len)
|
||||
}
|
||||
|
||||
pub(crate) async fn download_to_vec(dl: Download) -> anyhow::Result<Vec<u8>> {
|
||||
let mut buf = Vec::new();
|
||||
tokio::io::copy_buf(
|
||||
&mut tokio_util::io::StreamReader::new(dl.download_stream),
|
||||
&mut buf,
|
||||
)
|
||||
.await?;
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
// Uploads files `folder{j}/blob{i}.txt`. See test description for more details.
|
||||
pub(crate) async fn upload_simple_remote_data(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
upload_tasks_count: usize,
|
||||
) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
|
||||
info!("Creating {upload_tasks_count} remote files");
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
upload_tasks.spawn(async move {
|
||||
let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
|
||||
let blob_path = RemotePath::new(
|
||||
Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"),
|
||||
)
|
||||
.with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
|
||||
task_client.upload(data, len, &blob_path, None).await?;
|
||||
|
||||
Ok::<_, anyhow::Error>(blob_path)
|
||||
});
|
||||
}
|
||||
|
||||
let mut upload_tasks_failed = false;
|
||||
let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
|
||||
while let Some(task_run_result) = upload_tasks.join_next().await {
|
||||
match task_run_result
|
||||
.context("task join failed")
|
||||
.and_then(|task_result| task_result.context("upload task failed"))
|
||||
{
|
||||
Ok(upload_path) => {
|
||||
uploaded_blobs.insert(upload_path);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Upload task failed: {e:?}");
|
||||
upload_tasks_failed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if upload_tasks_failed {
|
||||
ControlFlow::Break(uploaded_blobs)
|
||||
} else {
|
||||
ControlFlow::Continue(uploaded_blobs)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn cleanup(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
objects_to_delete: HashSet<RemotePath>,
|
||||
) {
|
||||
info!(
|
||||
"Removing {} objects from the remote storage during cleanup",
|
||||
objects_to_delete.len()
|
||||
);
|
||||
let mut delete_tasks = JoinSet::new();
|
||||
for object_to_delete in objects_to_delete {
|
||||
let task_client = Arc::clone(client);
|
||||
delete_tasks.spawn(async move {
|
||||
debug!("Deleting remote item at path {object_to_delete:?}");
|
||||
task_client
|
||||
.delete(&object_to_delete)
|
||||
.await
|
||||
.with_context(|| format!("{object_to_delete:?} removal"))
|
||||
});
|
||||
}
|
||||
|
||||
while let Some(task_run_result) = delete_tasks.join_next().await {
|
||||
match task_run_result {
|
||||
Ok(task_result) => match task_result {
|
||||
Ok(()) => {}
|
||||
Err(e) => error!("Delete task failed: {e:?}"),
|
||||
},
|
||||
Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
pub(crate) struct Uploads {
|
||||
pub(crate) prefixes: HashSet<RemotePath>,
|
||||
pub(crate) blobs: HashSet<RemotePath>,
|
||||
}
|
||||
|
||||
pub(crate) async fn upload_remote_data(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
base_prefix_str: &'static str,
|
||||
upload_tasks_count: usize,
|
||||
) -> ControlFlow<Uploads, Uploads> {
|
||||
info!("Creating {upload_tasks_count} remote files");
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
upload_tasks.spawn(async move {
|
||||
let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
|
||||
let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
|
||||
.with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
|
||||
let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let (data, data_len) =
|
||||
upload_stream(format!("remote blob data {i}").into_bytes().into());
|
||||
task_client.upload(data, data_len, &blob_path, None).await?;
|
||||
|
||||
Ok::<_, anyhow::Error>((blob_prefix, blob_path))
|
||||
});
|
||||
}
|
||||
|
||||
let mut upload_tasks_failed = false;
|
||||
let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
|
||||
let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
|
||||
while let Some(task_run_result) = upload_tasks.join_next().await {
|
||||
match task_run_result
|
||||
.context("task join failed")
|
||||
.and_then(|task_result| task_result.context("upload task failed"))
|
||||
{
|
||||
Ok((upload_prefix, upload_path)) => {
|
||||
uploaded_prefixes.insert(upload_prefix);
|
||||
uploaded_blobs.insert(upload_path);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Upload task failed: {e:?}");
|
||||
upload_tasks_failed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let uploads = Uploads {
|
||||
prefixes: uploaded_prefixes,
|
||||
blobs: uploaded_blobs,
|
||||
};
|
||||
if upload_tasks_failed {
|
||||
ControlFlow::Break(uploads)
|
||||
} else {
|
||||
ControlFlow::Continue(uploads)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn ensure_logging_ready() {
|
||||
LOGGING_DONE.get_or_init(|| {
|
||||
utils::logging::init(
|
||||
utils::logging::LogFormat::Test,
|
||||
utils::logging::TracingErrorLayerEnablement::Disabled,
|
||||
utils::logging::Output::Stdout,
|
||||
)
|
||||
.expect("logging init failed");
|
||||
});
|
||||
}
|
||||
@@ -1,288 +0,0 @@
|
||||
use anyhow::Context;
|
||||
use camino::Utf8Path;
|
||||
use remote_storage::RemotePath;
|
||||
use std::collections::HashSet;
|
||||
use std::sync::Arc;
|
||||
use test_context::test_context;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::common::{download_to_vec, upload_stream, wrap_stream};
|
||||
|
||||
use super::{
|
||||
MaybeEnabledStorage, MaybeEnabledStorageWithSimpleTestBlobs, MaybeEnabledStorageWithTestBlobs,
|
||||
};
|
||||
|
||||
/// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries.
|
||||
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
|
||||
/// See the client creation in [`create_s3_client`] for details on the required env vars.
|
||||
/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
|
||||
/// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
|
||||
///
|
||||
/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`]
|
||||
/// where
|
||||
/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference
|
||||
/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
|
||||
///
|
||||
/// Then, verifies that the client does return correct prefixes when queried:
|
||||
/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
|
||||
/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
|
||||
///
|
||||
/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys.
|
||||
/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
|
||||
/// since current default AWS S3 pagination limit is 1000.
|
||||
/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax)
|
||||
///
|
||||
/// Lastly, the test attempts to clean up and remove all uploaded S3 files.
|
||||
/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
|
||||
#[test_context(MaybeEnabledStorageWithTestBlobs)]
|
||||
#[tokio::test]
|
||||
async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledStorageWithTestBlobs::Enabled(ctx) => ctx,
|
||||
MaybeEnabledStorageWithTestBlobs::Disabled => return Ok(()),
|
||||
MaybeEnabledStorageWithTestBlobs::UploadsFailed(e, _) => {
|
||||
anyhow::bail!("S3 init failed: {e:?}")
|
||||
}
|
||||
};
|
||||
|
||||
let test_client = Arc::clone(&ctx.enabled.client);
|
||||
let expected_remote_prefixes = ctx.remote_prefixes.clone();
|
||||
|
||||
let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
|
||||
.context("common_prefix construction")?;
|
||||
let root_remote_prefixes = test_client
|
||||
.list_prefixes(None)
|
||||
.await
|
||||
.context("client list root prefixes failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
assert_eq!(
|
||||
root_remote_prefixes, HashSet::from([base_prefix.clone()]),
|
||||
"remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
|
||||
);
|
||||
|
||||
let nested_remote_prefixes = test_client
|
||||
.list_prefixes(Some(&base_prefix))
|
||||
.await
|
||||
.context("client list nested prefixes failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
let remote_only_prefixes = nested_remote_prefixes
|
||||
.difference(&expected_remote_prefixes)
|
||||
.collect::<HashSet<_>>();
|
||||
let missing_uploaded_prefixes = expected_remote_prefixes
|
||||
.difference(&nested_remote_prefixes)
|
||||
.collect::<HashSet<_>>();
|
||||
assert_eq!(
|
||||
remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
|
||||
"remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Tests that S3 client can list all files in a folder, even if the response comes paginated and requirees multiple S3 queries.
|
||||
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. Test will skip real code and pass if env vars not set.
|
||||
/// See `s3_pagination_should_work` for more information.
|
||||
///
|
||||
/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
|
||||
/// Then performs the following queries:
|
||||
/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
|
||||
/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
|
||||
#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
|
||||
#[tokio::test]
|
||||
async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
|
||||
MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
|
||||
MaybeEnabledStorageWithSimpleTestBlobs::UploadsFailed(e, _) => {
|
||||
anyhow::bail!("S3 init failed: {e:?}")
|
||||
}
|
||||
};
|
||||
let test_client = Arc::clone(&ctx.enabled.client);
|
||||
let base_prefix =
|
||||
RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
|
||||
let root_files = test_client
|
||||
.list_files(None)
|
||||
.await
|
||||
.context("client list root files failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
assert_eq!(
|
||||
root_files,
|
||||
ctx.remote_blobs.clone(),
|
||||
"remote storage list_files on root mismatches with the uploads."
|
||||
);
|
||||
let nested_remote_files = test_client
|
||||
.list_files(Some(&base_prefix))
|
||||
.await
|
||||
.context("client list nested files failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
let trim_remote_blobs: HashSet<_> = ctx
|
||||
.remote_blobs
|
||||
.iter()
|
||||
.map(|x| x.get_path())
|
||||
.filter(|x| x.starts_with("folder1"))
|
||||
.map(|x| RemotePath::new(x).expect("must be valid path"))
|
||||
.collect();
|
||||
assert_eq!(
|
||||
nested_remote_files, trim_remote_blobs,
|
||||
"remote storage list_files on subdirrectory mismatches with the uploads."
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test_context(MaybeEnabledStorage)]
|
||||
#[tokio::test]
|
||||
async fn delete_non_exising_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledStorage::Enabled(ctx) => ctx,
|
||||
MaybeEnabledStorage::Disabled => return Ok(()),
|
||||
};
|
||||
|
||||
let path = RemotePath::new(Utf8Path::new(
|
||||
format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
|
||||
))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
ctx.client.delete(&path).await.expect("should succeed");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test_context(MaybeEnabledStorage)]
|
||||
#[tokio::test]
|
||||
async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledStorage::Enabled(ctx) => ctx,
|
||||
MaybeEnabledStorage::Disabled => return Ok(()),
|
||||
};
|
||||
|
||||
let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let (data, len) = upload_stream("remote blob data1".as_bytes().into());
|
||||
ctx.client.upload(data, len, &path1, None).await?;
|
||||
|
||||
let (data, len) = upload_stream("remote blob data2".as_bytes().into());
|
||||
ctx.client.upload(data, len, &path2, None).await?;
|
||||
|
||||
let (data, len) = upload_stream("remote blob data3".as_bytes().into());
|
||||
ctx.client.upload(data, len, &path3, None).await?;
|
||||
|
||||
ctx.client.delete_objects(&[path1, path2]).await?;
|
||||
|
||||
let prefixes = ctx.client.list_prefixes(None).await?;
|
||||
|
||||
assert_eq!(prefixes.len(), 1);
|
||||
|
||||
ctx.client.delete_objects(&[path3]).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test_context(MaybeEnabledStorage)]
|
||||
#[tokio::test]
|
||||
async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
|
||||
let MaybeEnabledStorage::Enabled(ctx) = ctx else {
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());
|
||||
|
||||
let (data, len) = wrap_stream(orig.clone());
|
||||
|
||||
ctx.client.upload(data, len, &path, None).await?;
|
||||
|
||||
// Normal download request
|
||||
let dl = ctx.client.download(&path).await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig);
|
||||
|
||||
// Full range (end specified)
|
||||
let dl = ctx
|
||||
.client
|
||||
.download_byte_range(&path, 0, Some(len as u64))
|
||||
.await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig);
|
||||
|
||||
// partial range (end specified)
|
||||
let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig[4..10]);
|
||||
|
||||
// partial range (end beyond real end)
|
||||
let dl = ctx
|
||||
.client
|
||||
.download_byte_range(&path, 8, Some(len as u64 * 100))
|
||||
.await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig[8..]);
|
||||
|
||||
// Partial range (end unspecified)
|
||||
let dl = ctx.client.download_byte_range(&path, 4, None).await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig[4..]);
|
||||
|
||||
// Full range (end unspecified)
|
||||
let dl = ctx.client.download_byte_range(&path, 0, None).await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig);
|
||||
|
||||
debug!("Cleanup: deleting file at path {path:?}");
|
||||
ctx.client
|
||||
.delete(&path)
|
||||
.await
|
||||
.with_context(|| format!("{path:?} removal"))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test_context(MaybeEnabledStorage)]
|
||||
#[tokio::test]
|
||||
async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
|
||||
let MaybeEnabledStorage::Enabled(ctx) = ctx else {
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
let path = RemotePath::new(Utf8Path::new(
|
||||
format!("{}/file_to_copy", ctx.base_prefix).as_str(),
|
||||
))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
let path_dest = RemotePath::new(Utf8Path::new(
|
||||
format!("{}/file_dest", ctx.base_prefix).as_str(),
|
||||
))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let orig = bytes::Bytes::from_static("remote blob data content".as_bytes());
|
||||
|
||||
let (data, len) = wrap_stream(orig.clone());
|
||||
|
||||
ctx.client.upload(data, len, &path, None).await?;
|
||||
|
||||
// Normal download request
|
||||
ctx.client.copy_object(&path, &path_dest).await?;
|
||||
|
||||
let dl = ctx.client.download(&path_dest).await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig);
|
||||
|
||||
debug!("Cleanup: deleting file at path {path:?}");
|
||||
ctx.client
|
||||
.delete_objects(&[path.clone(), path_dest.clone()])
|
||||
.await
|
||||
.with_context(|| format!("{path:?} removal"))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -2,27 +2,290 @@ use std::collections::HashSet;
|
||||
use std::env;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::ops::ControlFlow;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::time::UNIX_EPOCH;
|
||||
|
||||
use anyhow::Context;
|
||||
use camino::Utf8Path;
|
||||
use once_cell::sync::OnceCell;
|
||||
use remote_storage::{
|
||||
AzureConfig, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
|
||||
AzureConfig, Download, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
|
||||
};
|
||||
use test_context::AsyncTestContext;
|
||||
use tracing::info;
|
||||
use test_context::{test_context, AsyncTestContext};
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
mod common;
|
||||
|
||||
#[path = "common/tests.rs"]
|
||||
mod tests_azure;
|
||||
|
||||
use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_remote_data};
|
||||
static LOGGING_DONE: OnceCell<()> = OnceCell::new();
|
||||
|
||||
const ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_AZURE_REMOTE_STORAGE";
|
||||
|
||||
const BASE_PREFIX: &str = "test";
|
||||
|
||||
/// Tests that the Azure client can list all prefixes, even if the response comes paginated and requires multiple HTTP queries.
|
||||
/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified.
|
||||
/// See the client creation in [`create_azure_client`] for details on the required env vars.
|
||||
/// If real Azure tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
|
||||
/// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
|
||||
///
|
||||
/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_azure_data`]
|
||||
/// where
|
||||
/// * `random_prefix_part` is set for the entire Azure client during the Azure client creation in [`create_azure_client`], to avoid multiple test runs interference
|
||||
/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
|
||||
///
|
||||
/// Then, verifies that the client does return correct prefixes when queried:
|
||||
/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
|
||||
/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
|
||||
///
|
||||
/// With the real Azure enabled and `#[cfg(test)]` Rust configuration used, the Azure client test adds a `max-keys` param to limit the response keys.
|
||||
/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to Azure.
|
||||
///
|
||||
/// Lastly, the test attempts to clean up and remove all uploaded Azure files.
|
||||
/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
|
||||
#[test_context(MaybeEnabledAzureWithTestBlobs)]
|
||||
#[tokio::test]
|
||||
async fn azure_pagination_should_work(
|
||||
ctx: &mut MaybeEnabledAzureWithTestBlobs,
|
||||
) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledAzureWithTestBlobs::Enabled(ctx) => ctx,
|
||||
MaybeEnabledAzureWithTestBlobs::Disabled => return Ok(()),
|
||||
MaybeEnabledAzureWithTestBlobs::UploadsFailed(e, _) => {
|
||||
anyhow::bail!("Azure init failed: {e:?}")
|
||||
}
|
||||
};
|
||||
|
||||
let test_client = Arc::clone(&ctx.enabled.client);
|
||||
let expected_remote_prefixes = ctx.remote_prefixes.clone();
|
||||
|
||||
let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
|
||||
.context("common_prefix construction")?;
|
||||
let root_remote_prefixes = test_client
|
||||
.list_prefixes(None)
|
||||
.await
|
||||
.context("client list root prefixes failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
assert_eq!(
|
||||
root_remote_prefixes, HashSet::from([base_prefix.clone()]),
|
||||
"remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
|
||||
);
|
||||
|
||||
let nested_remote_prefixes = test_client
|
||||
.list_prefixes(Some(&base_prefix))
|
||||
.await
|
||||
.context("client list nested prefixes failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
let remote_only_prefixes = nested_remote_prefixes
|
||||
.difference(&expected_remote_prefixes)
|
||||
.collect::<HashSet<_>>();
|
||||
let missing_uploaded_prefixes = expected_remote_prefixes
|
||||
.difference(&nested_remote_prefixes)
|
||||
.collect::<HashSet<_>>();
|
||||
assert_eq!(
|
||||
remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
|
||||
"remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Tests that Azure client can list all files in a folder, even if the response comes paginated and requirees multiple Azure queries.
|
||||
/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified. Test will skip real code and pass if env vars not set.
|
||||
/// See `Azure_pagination_should_work` for more information.
|
||||
///
|
||||
/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_azure_data`]
|
||||
/// Then performs the following queries:
|
||||
/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
|
||||
/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
|
||||
#[test_context(MaybeEnabledAzureWithSimpleTestBlobs)]
|
||||
#[tokio::test]
|
||||
async fn azure_list_files_works(
|
||||
ctx: &mut MaybeEnabledAzureWithSimpleTestBlobs,
|
||||
) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledAzureWithSimpleTestBlobs::Enabled(ctx) => ctx,
|
||||
MaybeEnabledAzureWithSimpleTestBlobs::Disabled => return Ok(()),
|
||||
MaybeEnabledAzureWithSimpleTestBlobs::UploadsFailed(e, _) => {
|
||||
anyhow::bail!("Azure init failed: {e:?}")
|
||||
}
|
||||
};
|
||||
let test_client = Arc::clone(&ctx.enabled.client);
|
||||
let base_prefix =
|
||||
RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
|
||||
let root_files = test_client
|
||||
.list_files(None)
|
||||
.await
|
||||
.context("client list root files failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
assert_eq!(
|
||||
root_files,
|
||||
ctx.remote_blobs.clone(),
|
||||
"remote storage list_files on root mismatches with the uploads."
|
||||
);
|
||||
let nested_remote_files = test_client
|
||||
.list_files(Some(&base_prefix))
|
||||
.await
|
||||
.context("client list nested files failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
let trim_remote_blobs: HashSet<_> = ctx
|
||||
.remote_blobs
|
||||
.iter()
|
||||
.map(|x| x.get_path())
|
||||
.filter(|x| x.starts_with("folder1"))
|
||||
.map(|x| RemotePath::new(x).expect("must be valid path"))
|
||||
.collect();
|
||||
assert_eq!(
|
||||
nested_remote_files, trim_remote_blobs,
|
||||
"remote storage list_files on subdirrectory mismatches with the uploads."
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test_context(MaybeEnabledAzure)]
|
||||
#[tokio::test]
|
||||
async fn azure_delete_non_exising_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledAzure::Enabled(ctx) => ctx,
|
||||
MaybeEnabledAzure::Disabled => return Ok(()),
|
||||
};
|
||||
|
||||
let path = RemotePath::new(Utf8Path::new(
|
||||
format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
|
||||
))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
ctx.client.delete(&path).await.expect("should succeed");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test_context(MaybeEnabledAzure)]
|
||||
#[tokio::test]
|
||||
async fn azure_delete_objects_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledAzure::Enabled(ctx) => ctx,
|
||||
MaybeEnabledAzure::Disabled => return Ok(()),
|
||||
};
|
||||
|
||||
let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let data1 = "remote blob data1".as_bytes();
|
||||
let data1_len = data1.len();
|
||||
let data2 = "remote blob data2".as_bytes();
|
||||
let data2_len = data2.len();
|
||||
let data3 = "remote blob data3".as_bytes();
|
||||
let data3_len = data3.len();
|
||||
ctx.client
|
||||
.upload(std::io::Cursor::new(data1), data1_len, &path1, None)
|
||||
.await?;
|
||||
|
||||
ctx.client
|
||||
.upload(std::io::Cursor::new(data2), data2_len, &path2, None)
|
||||
.await?;
|
||||
|
||||
ctx.client
|
||||
.upload(std::io::Cursor::new(data3), data3_len, &path3, None)
|
||||
.await?;
|
||||
|
||||
ctx.client.delete_objects(&[path1, path2]).await?;
|
||||
|
||||
let prefixes = ctx.client.list_prefixes(None).await?;
|
||||
|
||||
assert_eq!(prefixes.len(), 1);
|
||||
|
||||
ctx.client.delete_objects(&[path3]).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test_context(MaybeEnabledAzure)]
|
||||
#[tokio::test]
|
||||
async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
|
||||
let MaybeEnabledAzure::Enabled(ctx) = ctx else {
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let data = "remote blob data here".as_bytes();
|
||||
let data_len = data.len() as u64;
|
||||
|
||||
ctx.client
|
||||
.upload(std::io::Cursor::new(data), data.len(), &path, None)
|
||||
.await?;
|
||||
|
||||
async fn download_and_compare(mut dl: Download) -> anyhow::Result<Vec<u8>> {
|
||||
let mut buf = Vec::new();
|
||||
tokio::io::copy(&mut dl.download_stream, &mut buf).await?;
|
||||
Ok(buf)
|
||||
}
|
||||
// Normal download request
|
||||
let dl = ctx.client.download(&path).await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(buf, data);
|
||||
|
||||
// Full range (end specified)
|
||||
let dl = ctx
|
||||
.client
|
||||
.download_byte_range(&path, 0, Some(data_len))
|
||||
.await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(buf, data);
|
||||
|
||||
// partial range (end specified)
|
||||
let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(buf, data[4..10]);
|
||||
|
||||
// partial range (end beyond real end)
|
||||
let dl = ctx
|
||||
.client
|
||||
.download_byte_range(&path, 8, Some(data_len * 100))
|
||||
.await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(buf, data[8..]);
|
||||
|
||||
// Partial range (end unspecified)
|
||||
let dl = ctx.client.download_byte_range(&path, 4, None).await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(buf, data[4..]);
|
||||
|
||||
// Full range (end unspecified)
|
||||
let dl = ctx.client.download_byte_range(&path, 0, None).await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(buf, data);
|
||||
|
||||
debug!("Cleanup: deleting file at path {path:?}");
|
||||
ctx.client
|
||||
.delete(&path)
|
||||
.await
|
||||
.with_context(|| format!("{path:?} removal"))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn ensure_logging_ready() {
|
||||
LOGGING_DONE.get_or_init(|| {
|
||||
utils::logging::init(
|
||||
utils::logging::LogFormat::Test,
|
||||
utils::logging::TracingErrorLayerEnablement::Disabled,
|
||||
)
|
||||
.expect("logging init failed");
|
||||
});
|
||||
}
|
||||
|
||||
struct EnabledAzure {
|
||||
client: Arc<GenericRemoteStorage>,
|
||||
base_prefix: &'static str,
|
||||
@@ -41,13 +304,13 @@ impl EnabledAzure {
|
||||
}
|
||||
}
|
||||
|
||||
enum MaybeEnabledStorage {
|
||||
enum MaybeEnabledAzure {
|
||||
Enabled(EnabledAzure),
|
||||
Disabled,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl AsyncTestContext for MaybeEnabledStorage {
|
||||
impl AsyncTestContext for MaybeEnabledAzure {
|
||||
async fn setup() -> Self {
|
||||
ensure_logging_ready();
|
||||
|
||||
@@ -63,7 +326,7 @@ impl AsyncTestContext for MaybeEnabledStorage {
|
||||
}
|
||||
}
|
||||
|
||||
enum MaybeEnabledStorageWithTestBlobs {
|
||||
enum MaybeEnabledAzureWithTestBlobs {
|
||||
Enabled(AzureWithTestBlobs),
|
||||
Disabled,
|
||||
UploadsFailed(anyhow::Error, AzureWithTestBlobs),
|
||||
@@ -76,7 +339,7 @@ struct AzureWithTestBlobs {
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
|
||||
impl AsyncTestContext for MaybeEnabledAzureWithTestBlobs {
|
||||
async fn setup() -> Self {
|
||||
ensure_logging_ready();
|
||||
if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
|
||||
@@ -92,7 +355,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
|
||||
|
||||
let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await;
|
||||
|
||||
match upload_remote_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
|
||||
match upload_azure_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
|
||||
ControlFlow::Continue(uploads) => {
|
||||
info!("Remote objects created successfully");
|
||||
|
||||
@@ -127,7 +390,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
|
||||
// However, they are not idential. The list_prefixes function is concerned with listing prefixes,
|
||||
// whereas the list_files function is concerned with listing files.
|
||||
// See `RemoteStorage::list_files` documentation for more details
|
||||
enum MaybeEnabledStorageWithSimpleTestBlobs {
|
||||
enum MaybeEnabledAzureWithSimpleTestBlobs {
|
||||
Enabled(AzureWithSimpleTestBlobs),
|
||||
Disabled,
|
||||
UploadsFailed(anyhow::Error, AzureWithSimpleTestBlobs),
|
||||
@@ -138,7 +401,7 @@ struct AzureWithSimpleTestBlobs {
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
|
||||
impl AsyncTestContext for MaybeEnabledAzureWithSimpleTestBlobs {
|
||||
async fn setup() -> Self {
|
||||
ensure_logging_ready();
|
||||
if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
|
||||
@@ -154,7 +417,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
|
||||
|
||||
let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await;
|
||||
|
||||
match upload_simple_remote_data(&enabled.client, upload_tasks_count).await {
|
||||
match upload_simple_azure_data(&enabled.client, upload_tasks_count).await {
|
||||
ControlFlow::Continue(uploads) => {
|
||||
info!("Remote objects created successfully");
|
||||
|
||||
@@ -218,3 +481,143 @@ fn create_azure_client(
|
||||
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
|
||||
))
|
||||
}
|
||||
|
||||
struct Uploads {
|
||||
prefixes: HashSet<RemotePath>,
|
||||
blobs: HashSet<RemotePath>,
|
||||
}
|
||||
|
||||
async fn upload_azure_data(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
base_prefix_str: &'static str,
|
||||
upload_tasks_count: usize,
|
||||
) -> ControlFlow<Uploads, Uploads> {
|
||||
info!("Creating {upload_tasks_count} Azure files");
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
upload_tasks.spawn(async move {
|
||||
let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
|
||||
let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
|
||||
.with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
|
||||
let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let data = format!("remote blob data {i}").into_bytes();
|
||||
let data_len = data.len();
|
||||
task_client
|
||||
.upload(std::io::Cursor::new(data), data_len, &blob_path, None)
|
||||
.await?;
|
||||
|
||||
Ok::<_, anyhow::Error>((blob_prefix, blob_path))
|
||||
});
|
||||
}
|
||||
|
||||
let mut upload_tasks_failed = false;
|
||||
let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
|
||||
let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
|
||||
while let Some(task_run_result) = upload_tasks.join_next().await {
|
||||
match task_run_result
|
||||
.context("task join failed")
|
||||
.and_then(|task_result| task_result.context("upload task failed"))
|
||||
{
|
||||
Ok((upload_prefix, upload_path)) => {
|
||||
uploaded_prefixes.insert(upload_prefix);
|
||||
uploaded_blobs.insert(upload_path);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Upload task failed: {e:?}");
|
||||
upload_tasks_failed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let uploads = Uploads {
|
||||
prefixes: uploaded_prefixes,
|
||||
blobs: uploaded_blobs,
|
||||
};
|
||||
if upload_tasks_failed {
|
||||
ControlFlow::Break(uploads)
|
||||
} else {
|
||||
ControlFlow::Continue(uploads)
|
||||
}
|
||||
}
|
||||
|
||||
async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<RemotePath>) {
|
||||
info!(
|
||||
"Removing {} objects from the remote storage during cleanup",
|
||||
objects_to_delete.len()
|
||||
);
|
||||
let mut delete_tasks = JoinSet::new();
|
||||
for object_to_delete in objects_to_delete {
|
||||
let task_client = Arc::clone(client);
|
||||
delete_tasks.spawn(async move {
|
||||
debug!("Deleting remote item at path {object_to_delete:?}");
|
||||
task_client
|
||||
.delete(&object_to_delete)
|
||||
.await
|
||||
.with_context(|| format!("{object_to_delete:?} removal"))
|
||||
});
|
||||
}
|
||||
|
||||
while let Some(task_run_result) = delete_tasks.join_next().await {
|
||||
match task_run_result {
|
||||
Ok(task_result) => match task_result {
|
||||
Ok(()) => {}
|
||||
Err(e) => error!("Delete task failed: {e:?}"),
|
||||
},
|
||||
Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Uploads files `folder{j}/blob{i}.txt`. See test description for more details.
|
||||
async fn upload_simple_azure_data(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
upload_tasks_count: usize,
|
||||
) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
|
||||
info!("Creating {upload_tasks_count} Azure files");
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
upload_tasks.spawn(async move {
|
||||
let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
|
||||
let blob_path = RemotePath::new(
|
||||
Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"),
|
||||
)
|
||||
.with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let data = format!("remote blob data {i}").into_bytes();
|
||||
let data_len = data.len();
|
||||
task_client
|
||||
.upload(std::io::Cursor::new(data), data_len, &blob_path, None)
|
||||
.await?;
|
||||
|
||||
Ok::<_, anyhow::Error>(blob_path)
|
||||
});
|
||||
}
|
||||
|
||||
let mut upload_tasks_failed = false;
|
||||
let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
|
||||
while let Some(task_run_result) = upload_tasks.join_next().await {
|
||||
match task_run_result
|
||||
.context("task join failed")
|
||||
.and_then(|task_result| task_result.context("upload task failed"))
|
||||
{
|
||||
Ok(upload_path) => {
|
||||
uploaded_blobs.insert(upload_path);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Upload task failed: {e:?}");
|
||||
upload_tasks_failed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if upload_tasks_failed {
|
||||
ControlFlow::Break(uploaded_blobs)
|
||||
} else {
|
||||
ControlFlow::Continue(uploaded_blobs)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,80 +1,171 @@
|
||||
use std::collections::HashSet;
|
||||
use std::env;
|
||||
use std::fmt::{Debug, Display};
|
||||
use std::num::NonZeroUsize;
|
||||
use std::ops::ControlFlow;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, UNIX_EPOCH};
|
||||
use std::{collections::HashSet, time::SystemTime};
|
||||
use std::time::UNIX_EPOCH;
|
||||
|
||||
use crate::common::{download_to_vec, upload_stream};
|
||||
use anyhow::Context;
|
||||
use camino::Utf8Path;
|
||||
use futures_util::Future;
|
||||
use once_cell::sync::OnceCell;
|
||||
use remote_storage::{
|
||||
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
|
||||
};
|
||||
use test_context::test_context;
|
||||
use test_context::AsyncTestContext;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::info;
|
||||
use test_context::{test_context, AsyncTestContext};
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
mod common;
|
||||
|
||||
#[path = "common/tests.rs"]
|
||||
mod tests_s3;
|
||||
|
||||
use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_remote_data};
|
||||
use utils::backoff;
|
||||
static LOGGING_DONE: OnceCell<()> = OnceCell::new();
|
||||
|
||||
const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
|
||||
|
||||
const BASE_PREFIX: &str = "test";
|
||||
|
||||
#[test_context(MaybeEnabledStorage)]
|
||||
/// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries.
|
||||
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
|
||||
/// See the client creation in [`create_s3_client`] for details on the required env vars.
|
||||
/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
|
||||
/// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
|
||||
///
|
||||
/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_s3_data`]
|
||||
/// where
|
||||
/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference
|
||||
/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
|
||||
///
|
||||
/// Then, verifies that the client does return correct prefixes when queried:
|
||||
/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
|
||||
/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
|
||||
///
|
||||
/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys.
|
||||
/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
|
||||
/// since current default AWS S3 pagination limit is 1000.
|
||||
/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax)
|
||||
///
|
||||
/// Lastly, the test attempts to clean up and remove all uploaded S3 files.
|
||||
/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
|
||||
#[test_context(MaybeEnabledS3WithTestBlobs)]
|
||||
#[tokio::test]
|
||||
async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
|
||||
async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledStorage::Enabled(ctx) => ctx,
|
||||
MaybeEnabledStorage::Disabled => return Ok(()),
|
||||
MaybeEnabledS3WithTestBlobs::Enabled(ctx) => ctx,
|
||||
MaybeEnabledS3WithTestBlobs::Disabled => return Ok(()),
|
||||
MaybeEnabledS3WithTestBlobs::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"),
|
||||
};
|
||||
// Our test depends on discrepancies in the clock between S3 and the environment the tests
|
||||
// run in. Therefore, wait a little bit before and after. The alternative would be
|
||||
// to take the time from S3 response headers.
|
||||
const WAIT_TIME: Duration = Duration::from_millis(3_000);
|
||||
|
||||
async fn retry<T, O, F, E>(op: O) -> Result<T, E>
|
||||
where
|
||||
E: Display + Debug + 'static,
|
||||
O: FnMut() -> F,
|
||||
F: Future<Output = Result<T, E>>,
|
||||
{
|
||||
let warn_threshold = 3;
|
||||
let max_retries = 10;
|
||||
backoff::retry(
|
||||
op,
|
||||
|_e| false,
|
||||
warn_threshold,
|
||||
max_retries,
|
||||
"test retry",
|
||||
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
|
||||
)
|
||||
let test_client = Arc::clone(&ctx.enabled.client);
|
||||
let expected_remote_prefixes = ctx.remote_prefixes.clone();
|
||||
|
||||
let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
|
||||
.context("common_prefix construction")?;
|
||||
let root_remote_prefixes = test_client
|
||||
.list_prefixes(None)
|
||||
.await
|
||||
}
|
||||
.context("client list root prefixes failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
assert_eq!(
|
||||
root_remote_prefixes, HashSet::from([base_prefix.clone()]),
|
||||
"remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
|
||||
);
|
||||
|
||||
async fn time_point() -> SystemTime {
|
||||
tokio::time::sleep(WAIT_TIME).await;
|
||||
let ret = SystemTime::now();
|
||||
tokio::time::sleep(WAIT_TIME).await;
|
||||
ret
|
||||
}
|
||||
let nested_remote_prefixes = test_client
|
||||
.list_prefixes(Some(&base_prefix))
|
||||
.await
|
||||
.context("client list nested prefixes failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
let remote_only_prefixes = nested_remote_prefixes
|
||||
.difference(&expected_remote_prefixes)
|
||||
.collect::<HashSet<_>>();
|
||||
let missing_uploaded_prefixes = expected_remote_prefixes
|
||||
.difference(&nested_remote_prefixes)
|
||||
.collect::<HashSet<_>>();
|
||||
assert_eq!(
|
||||
remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
|
||||
"remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
|
||||
);
|
||||
|
||||
async fn list_files(client: &Arc<GenericRemoteStorage>) -> anyhow::Result<HashSet<RemotePath>> {
|
||||
Ok(retry(|| client.list_files(None))
|
||||
.await
|
||||
.context("list root files failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>())
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Tests that S3 client can list all files in a folder, even if the response comes paginated and requirees multiple S3 queries.
|
||||
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. Test will skip real code and pass if env vars not set.
|
||||
/// See `s3_pagination_should_work` for more information.
|
||||
///
|
||||
/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_s3_data`]
|
||||
/// Then performs the following queries:
|
||||
/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
|
||||
/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
|
||||
#[test_context(MaybeEnabledS3WithSimpleTestBlobs)]
|
||||
#[tokio::test]
|
||||
async fn s3_list_files_works(ctx: &mut MaybeEnabledS3WithSimpleTestBlobs) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledS3WithSimpleTestBlobs::Enabled(ctx) => ctx,
|
||||
MaybeEnabledS3WithSimpleTestBlobs::Disabled => return Ok(()),
|
||||
MaybeEnabledS3WithSimpleTestBlobs::UploadsFailed(e, _) => {
|
||||
anyhow::bail!("S3 init failed: {e:?}")
|
||||
}
|
||||
};
|
||||
let test_client = Arc::clone(&ctx.enabled.client);
|
||||
let base_prefix =
|
||||
RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
|
||||
let root_files = test_client
|
||||
.list_files(None)
|
||||
.await
|
||||
.context("client list root files failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
assert_eq!(
|
||||
root_files,
|
||||
ctx.remote_blobs.clone(),
|
||||
"remote storage list_files on root mismatches with the uploads."
|
||||
);
|
||||
let nested_remote_files = test_client
|
||||
.list_files(Some(&base_prefix))
|
||||
.await
|
||||
.context("client list nested files failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
let trim_remote_blobs: HashSet<_> = ctx
|
||||
.remote_blobs
|
||||
.iter()
|
||||
.map(|x| x.get_path())
|
||||
.filter(|x| x.starts_with("folder1"))
|
||||
.map(|x| RemotePath::new(x).expect("must be valid path"))
|
||||
.collect();
|
||||
assert_eq!(
|
||||
nested_remote_files, trim_remote_blobs,
|
||||
"remote storage list_files on subdirrectory mismatches with the uploads."
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test_context(MaybeEnabledS3)]
|
||||
#[tokio::test]
|
||||
async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledS3::Enabled(ctx) => ctx,
|
||||
MaybeEnabledS3::Disabled => return Ok(()),
|
||||
};
|
||||
|
||||
let path = RemotePath::new(Utf8Path::new(
|
||||
format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
|
||||
))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
ctx.client.delete(&path).await.expect("should succeed");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test_context(MaybeEnabledS3)]
|
||||
#[tokio::test]
|
||||
async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledS3::Enabled(ctx) => ctx,
|
||||
MaybeEnabledS3::Disabled => return Ok(()),
|
||||
};
|
||||
|
||||
let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
@@ -85,99 +176,45 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
retry(|| {
|
||||
let (data, len) = upload_stream("remote blob data1".as_bytes().into());
|
||||
ctx.client.upload(data, len, &path1, None)
|
||||
})
|
||||
.await?;
|
||||
|
||||
let t0_files = list_files(&ctx.client).await?;
|
||||
let t0 = time_point().await;
|
||||
println!("at t0: {t0_files:?}");
|
||||
|
||||
let old_data = "remote blob data2";
|
||||
|
||||
retry(|| {
|
||||
let (data, len) = upload_stream(old_data.as_bytes().into());
|
||||
ctx.client.upload(data, len, &path2, None)
|
||||
})
|
||||
.await?;
|
||||
|
||||
let t1_files = list_files(&ctx.client).await?;
|
||||
let t1 = time_point().await;
|
||||
println!("at t1: {t1_files:?}");
|
||||
|
||||
// A little check to ensure that our clock is not too far off from the S3 clock
|
||||
{
|
||||
let dl = retry(|| ctx.client.download(&path2)).await?;
|
||||
let last_modified = dl.last_modified.unwrap();
|
||||
let half_wt = WAIT_TIME.mul_f32(0.5);
|
||||
let t0_hwt = t0 + half_wt;
|
||||
let t1_hwt = t1 - half_wt;
|
||||
if !(t0_hwt..=t1_hwt).contains(&last_modified) {
|
||||
panic!("last_modified={last_modified:?} is not between t0_hwt={t0_hwt:?} and t1_hwt={t1_hwt:?}. \
|
||||
This likely means a large lock discrepancy between S3 and the local clock.");
|
||||
}
|
||||
}
|
||||
|
||||
retry(|| {
|
||||
let (data, len) = upload_stream("remote blob data3".as_bytes().into());
|
||||
ctx.client.upload(data, len, &path3, None)
|
||||
})
|
||||
.await?;
|
||||
|
||||
let new_data = "new remote blob data2";
|
||||
|
||||
retry(|| {
|
||||
let (data, len) = upload_stream(new_data.as_bytes().into());
|
||||
ctx.client.upload(data, len, &path2, None)
|
||||
})
|
||||
.await?;
|
||||
|
||||
retry(|| ctx.client.delete(&path1)).await?;
|
||||
let t2_files = list_files(&ctx.client).await?;
|
||||
let t2 = time_point().await;
|
||||
println!("at t2: {t2_files:?}");
|
||||
|
||||
// No changes after recovery to t2 (no-op)
|
||||
let t_final = time_point().await;
|
||||
let data1 = "remote blob data1".as_bytes();
|
||||
let data1_len = data1.len();
|
||||
let data2 = "remote blob data2".as_bytes();
|
||||
let data2_len = data2.len();
|
||||
let data3 = "remote blob data3".as_bytes();
|
||||
let data3_len = data3.len();
|
||||
ctx.client
|
||||
.time_travel_recover(None, t2, t_final, CancellationToken::new())
|
||||
.upload(std::io::Cursor::new(data1), data1_len, &path1, None)
|
||||
.await?;
|
||||
let t2_files_recovered = list_files(&ctx.client).await?;
|
||||
println!("after recovery to t2: {t2_files_recovered:?}");
|
||||
assert_eq!(t2_files, t2_files_recovered);
|
||||
let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2).await?).await?;
|
||||
assert_eq!(path2_recovered_t2, new_data.as_bytes());
|
||||
|
||||
// after recovery to t1: path1 is back, path2 has the old content
|
||||
let t_final = time_point().await;
|
||||
ctx.client
|
||||
.time_travel_recover(None, t1, t_final, CancellationToken::new())
|
||||
.upload(std::io::Cursor::new(data2), data2_len, &path2, None)
|
||||
.await?;
|
||||
let t1_files_recovered = list_files(&ctx.client).await?;
|
||||
println!("after recovery to t1: {t1_files_recovered:?}");
|
||||
assert_eq!(t1_files, t1_files_recovered);
|
||||
let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2).await?).await?;
|
||||
assert_eq!(path2_recovered_t1, old_data.as_bytes());
|
||||
|
||||
// after recovery to t0: everything is gone except for path1
|
||||
let t_final = time_point().await;
|
||||
ctx.client
|
||||
.time_travel_recover(None, t0, t_final, CancellationToken::new())
|
||||
.upload(std::io::Cursor::new(data3), data3_len, &path3, None)
|
||||
.await?;
|
||||
let t0_files_recovered = list_files(&ctx.client).await?;
|
||||
println!("after recovery to t0: {t0_files_recovered:?}");
|
||||
assert_eq!(t0_files, t0_files_recovered);
|
||||
|
||||
// cleanup
|
||||
ctx.client.delete_objects(&[path1, path2]).await?;
|
||||
|
||||
let paths = &[path1, path2, path3];
|
||||
retry(|| ctx.client.delete_objects(paths)).await?;
|
||||
let prefixes = ctx.client.list_prefixes(None).await?;
|
||||
|
||||
assert_eq!(prefixes.len(), 1);
|
||||
|
||||
ctx.client.delete_objects(&[path3]).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn ensure_logging_ready() {
|
||||
LOGGING_DONE.get_or_init(|| {
|
||||
utils::logging::init(
|
||||
utils::logging::LogFormat::Test,
|
||||
utils::logging::TracingErrorLayerEnablement::Disabled,
|
||||
)
|
||||
.expect("logging init failed");
|
||||
});
|
||||
}
|
||||
|
||||
struct EnabledS3 {
|
||||
client: Arc<GenericRemoteStorage>,
|
||||
base_prefix: &'static str,
|
||||
@@ -196,13 +233,13 @@ impl EnabledS3 {
|
||||
}
|
||||
}
|
||||
|
||||
enum MaybeEnabledStorage {
|
||||
enum MaybeEnabledS3 {
|
||||
Enabled(EnabledS3),
|
||||
Disabled,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl AsyncTestContext for MaybeEnabledStorage {
|
||||
impl AsyncTestContext for MaybeEnabledS3 {
|
||||
async fn setup() -> Self {
|
||||
ensure_logging_ready();
|
||||
|
||||
@@ -218,7 +255,7 @@ impl AsyncTestContext for MaybeEnabledStorage {
|
||||
}
|
||||
}
|
||||
|
||||
enum MaybeEnabledStorageWithTestBlobs {
|
||||
enum MaybeEnabledS3WithTestBlobs {
|
||||
Enabled(S3WithTestBlobs),
|
||||
Disabled,
|
||||
UploadsFailed(anyhow::Error, S3WithTestBlobs),
|
||||
@@ -231,7 +268,7 @@ struct S3WithTestBlobs {
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
|
||||
impl AsyncTestContext for MaybeEnabledS3WithTestBlobs {
|
||||
async fn setup() -> Self {
|
||||
ensure_logging_ready();
|
||||
if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
|
||||
@@ -247,7 +284,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
|
||||
|
||||
let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await;
|
||||
|
||||
match upload_remote_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
|
||||
match upload_s3_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
|
||||
ControlFlow::Continue(uploads) => {
|
||||
info!("Remote objects created successfully");
|
||||
|
||||
@@ -282,7 +319,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
|
||||
// However, they are not idential. The list_prefixes function is concerned with listing prefixes,
|
||||
// whereas the list_files function is concerned with listing files.
|
||||
// See `RemoteStorage::list_files` documentation for more details
|
||||
enum MaybeEnabledStorageWithSimpleTestBlobs {
|
||||
enum MaybeEnabledS3WithSimpleTestBlobs {
|
||||
Enabled(S3WithSimpleTestBlobs),
|
||||
Disabled,
|
||||
UploadsFailed(anyhow::Error, S3WithSimpleTestBlobs),
|
||||
@@ -293,7 +330,7 @@ struct S3WithSimpleTestBlobs {
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
|
||||
impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {
|
||||
async fn setup() -> Self {
|
||||
ensure_logging_ready();
|
||||
if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
|
||||
@@ -309,7 +346,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
|
||||
|
||||
let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await;
|
||||
|
||||
match upload_simple_remote_data(&enabled.client, upload_tasks_count).await {
|
||||
match upload_simple_s3_data(&enabled.client, upload_tasks_count).await {
|
||||
ControlFlow::Continue(uploads) => {
|
||||
info!("Remote objects created successfully");
|
||||
|
||||
@@ -372,3 +409,143 @@ fn create_s3_client(
|
||||
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
|
||||
))
|
||||
}
|
||||
|
||||
struct Uploads {
|
||||
prefixes: HashSet<RemotePath>,
|
||||
blobs: HashSet<RemotePath>,
|
||||
}
|
||||
|
||||
async fn upload_s3_data(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
base_prefix_str: &'static str,
|
||||
upload_tasks_count: usize,
|
||||
) -> ControlFlow<Uploads, Uploads> {
|
||||
info!("Creating {upload_tasks_count} S3 files");
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
upload_tasks.spawn(async move {
|
||||
let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
|
||||
let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
|
||||
.with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
|
||||
let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let data = format!("remote blob data {i}").into_bytes();
|
||||
let data_len = data.len();
|
||||
task_client
|
||||
.upload(std::io::Cursor::new(data), data_len, &blob_path, None)
|
||||
.await?;
|
||||
|
||||
Ok::<_, anyhow::Error>((blob_prefix, blob_path))
|
||||
});
|
||||
}
|
||||
|
||||
let mut upload_tasks_failed = false;
|
||||
let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
|
||||
let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
|
||||
while let Some(task_run_result) = upload_tasks.join_next().await {
|
||||
match task_run_result
|
||||
.context("task join failed")
|
||||
.and_then(|task_result| task_result.context("upload task failed"))
|
||||
{
|
||||
Ok((upload_prefix, upload_path)) => {
|
||||
uploaded_prefixes.insert(upload_prefix);
|
||||
uploaded_blobs.insert(upload_path);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Upload task failed: {e:?}");
|
||||
upload_tasks_failed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let uploads = Uploads {
|
||||
prefixes: uploaded_prefixes,
|
||||
blobs: uploaded_blobs,
|
||||
};
|
||||
if upload_tasks_failed {
|
||||
ControlFlow::Break(uploads)
|
||||
} else {
|
||||
ControlFlow::Continue(uploads)
|
||||
}
|
||||
}
|
||||
|
||||
async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<RemotePath>) {
|
||||
info!(
|
||||
"Removing {} objects from the remote storage during cleanup",
|
||||
objects_to_delete.len()
|
||||
);
|
||||
let mut delete_tasks = JoinSet::new();
|
||||
for object_to_delete in objects_to_delete {
|
||||
let task_client = Arc::clone(client);
|
||||
delete_tasks.spawn(async move {
|
||||
debug!("Deleting remote item at path {object_to_delete:?}");
|
||||
task_client
|
||||
.delete(&object_to_delete)
|
||||
.await
|
||||
.with_context(|| format!("{object_to_delete:?} removal"))
|
||||
});
|
||||
}
|
||||
|
||||
while let Some(task_run_result) = delete_tasks.join_next().await {
|
||||
match task_run_result {
|
||||
Ok(task_result) => match task_result {
|
||||
Ok(()) => {}
|
||||
Err(e) => error!("Delete task failed: {e:?}"),
|
||||
},
|
||||
Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Uploads files `folder{j}/blob{i}.txt`. See test description for more details.
|
||||
async fn upload_simple_s3_data(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
upload_tasks_count: usize,
|
||||
) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
|
||||
info!("Creating {upload_tasks_count} S3 files");
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
upload_tasks.spawn(async move {
|
||||
let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
|
||||
let blob_path = RemotePath::new(
|
||||
Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"),
|
||||
)
|
||||
.with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let data = format!("remote blob data {i}").into_bytes();
|
||||
let data_len = data.len();
|
||||
task_client
|
||||
.upload(std::io::Cursor::new(data), data_len, &blob_path, None)
|
||||
.await?;
|
||||
|
||||
Ok::<_, anyhow::Error>(blob_path)
|
||||
});
|
||||
}
|
||||
|
||||
let mut upload_tasks_failed = false;
|
||||
let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
|
||||
while let Some(task_run_result) = upload_tasks.join_next().await {
|
||||
match task_run_result
|
||||
.context("task join failed")
|
||||
.and_then(|task_result| task_result.context("upload task failed"))
|
||||
{
|
||||
Ok(upload_path) => {
|
||||
uploaded_blobs.insert(upload_path);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Upload task failed: {e:?}");
|
||||
upload_tasks_failed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if upload_tasks_failed {
|
||||
ControlFlow::Break(uploaded_blobs)
|
||||
} else {
|
||||
ControlFlow::Continue(uploaded_blobs)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -51,9 +51,3 @@ pub struct SkTimelineInfo {
|
||||
#[serde(default)]
|
||||
pub http_connstr: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct TimelineCopyRequest {
|
||||
pub target_timeline_id: TimelineId,
|
||||
pub until_lsn: Lsn,
|
||||
}
|
||||
|
||||
@@ -4,12 +4,6 @@ version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
default = []
|
||||
# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
|
||||
# which adds some runtime cost to run tests on outage conditions
|
||||
testing = ["fail/failpoints"]
|
||||
|
||||
[dependencies]
|
||||
arc-swap.workspace = true
|
||||
sentry.workspace = true
|
||||
@@ -22,7 +16,6 @@ chrono.workspace = true
|
||||
heapless.workspace = true
|
||||
hex = { workspace = true, features = ["serde"] }
|
||||
hyper = { workspace = true, features = ["full"] }
|
||||
fail.workspace = true
|
||||
futures = { workspace = true}
|
||||
jsonwebtoken.workspace = true
|
||||
nix.workspace = true
|
||||
@@ -57,8 +50,6 @@ const_format.workspace = true
|
||||
# why is it only here? no other crate should use it, streams are rarely needed.
|
||||
tokio-stream = { version = "0.1.14" }
|
||||
|
||||
serde_path_to_error.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user