Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-23 21:30:36 +00:00)

Compare commits: `actorsssss` ... `proxy-forw` (88 commits)
Commit SHA1s in this comparison:

db7b244fdb e00127e84b 6506fd14c4 46fb1a90ce 56171cbe8c 48b05b7c50 0856fe6676 4133d14a77
30c9e145d7 24e916d37f 23f58145ed 350865392c 1be5e564ce 7a70ef991f be30388901 3525080031
527cdbc010 39be2b0108 fa52cd575e d2c410c748 221531c9db 4c173456dc e82625b77d 0ac1e71524
271133d960 3d5fab127a 66719d7eaf 9a9d9beaee 2bfc831c60 799db161d3 47380be12d c7b02ce8ec
4010adf653 e10a7ee391 e8c9a51273 3c3ee8f3e8 6928a34f59 bc684e9d3b 08532231ee 79137a089f
e3cb715e8a c70bf9150f 8e4da52069 2ff1a5cecd ec8dcc2231 b844c6f0c7 6a85a06e1b b04a6acd6c
0c7b89235c 1e9a50bca8 511e730cc0 c1148dc9ac 8253cf1931 3a82430432 734755eaca e34166a28f
3a36a0a227 58f6cb649e dcc7610ad6 4c245b0f5a 55b7cde665 5b34d5f561 26c55b0255 12e9b2a909
918b03b3b0 d36623ad74 689ad72e92 fd4cce9417 d52b81340f 8dee9908f8 19ed230708 463b6a26b5
c9b1657e4c b92be77e19 8cb8c8d7b5 210700d0d9 a0a3ba85e7 d820aa1d08 996abc9563 a72af29d12
4f51824820 743f6dfb9b faf275d4a2 001f0d6db7 42c17a6fc6 37638fce79 50288c16b1 e03f8abba9
@@ -1,27 +1,28 @@
*
!rust-toolchain.toml
!Cargo.toml
# Files
!Cargo.lock
!Cargo.toml
!Makefile
!rust-toolchain.toml
!scripts/combine_control_files.py
!scripts/ninstall.sh
!vm-cgconfig.conf

# Directories
!.cargo/
!.config/
!control_plane/
!compute_tools/
!control_plane/
!libs/
!neon_local/
!pageserver/
!patches/
!pgxn/
!proxy/
!safekeeper/
!s3_scrubber/
!safekeeper/
!storage_broker/
!trace/
!vendor/postgres-v14/
!vendor/postgres-v15/
!vendor/postgres-v16/
!vendor/postgres-*/
!workspace_hack/
!neon_local/
!scripts/ninstall.sh
!scripts/combine_control_files.py
!vm-cgconfig.conf
.github/actionlint.yml (vendored, 2 changed lines)

@@ -4,6 +4,8 @@ self-hosted-runner:
  - dev
  - gen3
  - large
  # Remove `macos-14` from the list after https://github.com/rhysd/actionlint/pull/392 is merged.
  - macos-14
  - small
  - us-east-2
config-variables:
@@ -179,23 +179,6 @@ runs:
          aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
        fi

    - name: Store Allure test stat in the DB
      if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
      shell: bash -euxo pipefail {0}
      env:
        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        REPORT_JSON_URL: ${{ steps.generate-report.outputs.report-json-url }}
      run: |
        export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR}

        ./scripts/pysync

        poetry run python3 scripts/ingest_regress_test_result.py \
          --revision ${COMMIT_SHA} \
          --reference ${GITHUB_REF} \
          --build-type unified \
          --ingest ${WORKDIR}/report/data/suites.json

    - name: Store Allure test stat in the DB (new)
      if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
      shell: bash -euxo pipefail {0}
@@ -69,7 +69,15 @@ jobs:
        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json

      - name: Kaniko build
        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
        run: |
          /kaniko/executor \
            --reproducible \
            --snapshotMode=redo \
            --skip-unused-stages \
            --dockerfile ${{ inputs.dockerfile-path }} \
            --cache=true \
            --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
            --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64

  kaniko-arm:
    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'

@@ -85,7 +93,15 @@ jobs:
        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json

      - name: Kaniko build
        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
        run: |
          /kaniko/executor \
            --reproducible \
            --snapshotMode=redo \
            --skip-unused-stages \
            --dockerfile ${{ inputs.dockerfile-path }} \
            --cache=true \
            --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
            --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64

  manifest:
    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'

@@ -99,7 +115,10 @@ jobs:

    steps:
      - name: Create manifest
        run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
        run: |
          docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} \
            --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 \
            --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64

      - name: Push manifest
        run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}
.github/workflows/build_and_test.yml (vendored, 68 changed lines)

@@ -21,6 +21,8 @@ env:
  COPT: '-Werror'
  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
  # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
  E2E_CONCURRENCY_GROUP: ${{ github.repository }}-${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}

jobs:
  check-permissions:

@@ -44,6 +46,20 @@ jobs:

          exit 1

  cancel-previous-e2e-tests:
    needs: [ check-permissions ]
    if: github.event_name == 'pull_request'
    runs-on: ubuntu-latest

    steps:
      - name: Cancel previous e2e-tests runs for this PR
        env:
          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
        run: |
          gh workflow --repo neondatabase/cloud \
            run cancel-previous-in-concurrency-group.yml \
            --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"

  tag:
    needs: [ check-permissions ]
    runs-on: [ self-hosted, gen3, small ]

@@ -186,7 +202,11 @@ jobs:
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
      options: --init
      # Raise locked memory limit for tokio-epoll-uring.
      # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
      # io_uring will account the memory of the CQ and SQ as locked.
      # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391
      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
    strategy:
      fail-fast: false
      matrix:
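The comment in the hunk above explains why the CI container raises the locked-memory limit for `tokio-epoll-uring`: on older kernels io_uring's submission and completion queues count against `RLIMIT_MEMLOCK`. As a minimal sketch (not code from this repository; it assumes the `libc` crate on 64-bit Linux), this is roughly what `--ulimit memlock=67108864:67108864` achieves, done from inside a process instead:

```rust
// Sketch: inspect and raise the RLIMIT_MEMLOCK soft limit on Linux.
// Assumes the `libc` crate is available; error handling is kept minimal.
fn raise_memlock_limit(bytes: u64) -> std::io::Result<()> {
    unsafe {
        let mut rl = libc::rlimit { rlim_cur: 0, rlim_max: 0 };
        if libc::getrlimit(libc::RLIMIT_MEMLOCK, &mut rl) != 0 {
            return Err(std::io::Error::last_os_error());
        }
        // Only the soft limit can be raised without privileges, and only up to the hard limit.
        rl.rlim_cur = bytes.min(rl.rlim_max);
        if libc::setrlimit(libc::RLIMIT_MEMLOCK, &rl) != 0 {
            return Err(std::io::Error::last_os_error());
        }
    }
    Ok(())
}

fn main() {
    // 64 MiB, the value used by the workflow above.
    match raise_memlock_limit(64 * 1024 * 1024) {
        Ok(()) => println!("RLIMIT_MEMLOCK soft limit raised"),
        Err(e) => eprintln!("could not raise RLIMIT_MEMLOCK: {e}"),
    }
}
```

Raising the limit at the container level, as the workflow does, avoids needing any such code or extra privileges inside the test processes.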
@@ -340,8 +360,12 @@ jobs:
          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests

      - name: Run rust tests
        env:
          NEXTEST_RETRIES: 3
        run: |
          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
          for io_engine in std-fs tokio-epoll-uring ; do
            NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
          done

          # Run separate tests for real S3
          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty

@@ -419,8 +443,8 @@ jobs:
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
      # Default shared memory is 64mb
      options: --init --shm-size=512mb
      # for changed limits, see comments on `options:` earlier in this file
      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
    strategy:
      fail-fast: false
      matrix:

@@ -448,6 +472,7 @@ jobs:
          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
          BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs

      - name: Merge and upload coverage data
        if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'

@@ -458,12 +483,13 @@ jobs:
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
      # Default shared memory is 64mb
      options: --init --shm-size=512mb
      # for changed limits, see comments on `options:` earlier in this file
      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
    strategy:
      fail-fast: false
      matrix:
        # the amount of groups (N) should be reflected in `extra_params: --splits N ...`
        pytest_split_group: [ 1, 2, 3, 4 ]
        build_type: [ release ]
    steps:

@@ -477,11 +503,12 @@ jobs:
          test_selection: performance
          run_in_parallel: false
          save_perf_report: ${{ github.ref_name == 'main' }}
          extra_params: --splits ${{ strategy.job-total }} --group ${{ matrix.pytest_split_group }}
          extra_params: --splits 4 --group ${{ matrix.pytest_split_group }}
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

@@ -504,7 +531,6 @@ jobs:
        with:
          store-test-results-into-db: true
        env:
          REGRESS_TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
          REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}

      - uses: actions/github-script@v6

@@ -582,17 +608,6 @@ jobs:
            --input-objects=/tmp/coverage/binaries.list \
            --format=lcov

      - name: Upload coverage report
        id: upload-coverage-report
        env:
          BUCKET: neon-github-public-dev
          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        run: |
          aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://${BUCKET}/code-coverage/${COMMIT_SHA}

          REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html
          echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT

      - name: Build coverage report NEW
        id: upload-coverage-report-new
        env:

@@ -629,21 +644,11 @@ jobs:

      - uses: actions/github-script@v6
        env:
          REPORT_URL: ${{ steps.upload-coverage-report.outputs.report-url }}
          REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }}
          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        with:
          script: |
            const { REPORT_URL, REPORT_URL_NEW, COMMIT_SHA } = process.env

            await github.rest.repos.createCommitStatus({
              owner: context.repo.owner,
              repo: context.repo.repo,
              sha: `${COMMIT_SHA}`,
              state: 'success',
              target_url: `${REPORT_URL}`,
              context: 'Code coverage report',
            })
            const { REPORT_URL_NEW, COMMIT_SHA } = process.env

            await github.rest.repos.createCommitStatus({
              owner: context.repo.owner,

@@ -695,7 +700,8 @@ jobs:
              \"commit_hash\": \"$COMMIT_SHA\",
              \"remote_repo\": \"${{ github.repository }}\",
              \"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
              \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\"
              \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
              \"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
            }
          }"
26
.github/workflows/neon_extra_builds.yml
vendored
26
.github/workflows/neon_extra_builds.yml
vendored
@@ -26,7 +26,7 @@ jobs:
|
||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
|
||||
github.ref_name == 'main'
|
||||
timeout-minutes: 90
|
||||
runs-on: macos-latest
|
||||
runs-on: macos-14
|
||||
|
||||
env:
|
||||
# Use release build only, to have less debug info around
|
||||
@@ -60,21 +60,21 @@ jobs:
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: pg_install/v14
|
||||
key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v15 build
|
||||
id: cache_pg_15
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: pg_install/v15
|
||||
key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v16 build
|
||||
id: cache_pg_16
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: pg_install/v16
|
||||
key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Set extra env for macOS
|
||||
run: |
|
||||
@@ -89,7 +89,7 @@ jobs:
|
||||
!~/.cargo/registry/src
|
||||
~/.cargo/git
|
||||
target
|
||||
key: v1-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
|
||||
|
||||
- name: Build postgres v14
|
||||
if: steps.cache_pg_14.outputs.cache-hit != 'true'
|
||||
@@ -110,7 +110,7 @@ jobs:
|
||||
run: make walproposer-lib -j$(sysctl -n hw.ncpu)
|
||||
|
||||
- name: Run cargo build
|
||||
run: cargo build --all --release
|
||||
run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release
|
||||
|
||||
- name: Check that no warnings are produced
|
||||
run: ./run_clippy.sh
|
||||
@@ -124,12 +124,12 @@ jobs:
|
||||
# Hence keeping target/ (and general cache size) smaller
|
||||
BUILD_TYPE: release
|
||||
CARGO_FEATURES: --features testing
|
||||
CARGO_FLAGS: --locked --release
|
||||
CARGO_FLAGS: --release
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -210,18 +210,20 @@ jobs:
|
||||
|
||||
- name: Run cargo build
|
||||
run: |
|
||||
mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
|
||||
mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests
|
||||
|
||||
- name: Run cargo test
|
||||
env:
|
||||
NEXTEST_RETRIES: 3
|
||||
run: |
|
||||
cargo test $CARGO_FLAGS $CARGO_FEATURES
|
||||
cargo nextest run $CARGO_FEATURES
|
||||
|
||||
# Run separate tests for real S3
|
||||
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
|
||||
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
|
||||
export REMOTE_STORAGE_S3_REGION=eu-central-1
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
|
||||
cargo nextest run --package remote_storage --test test_real_s3
|
||||
|
||||
# Run separate tests for real Azure Blob Storage
|
||||
# XXX: replace region with `eu-central-1`-like region
|
||||
@@ -231,7 +233,7 @@ jobs:
|
||||
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
|
||||
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
|
||||
cargo nextest run --package remote_storage --test test_real_azure
|
||||
|
||||
check-codestyle-rust-arm:
|
||||
timeout-minutes: 90
|
||||
|
||||
122
.github/workflows/update_build_tools_image.yml
vendored
122
.github/workflows/update_build_tools_image.yml
vendored
@@ -20,111 +20,51 @@ defaults:
|
||||
run:
|
||||
shell: bash -euo pipefail {0}
|
||||
|
||||
env:
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
tag-image:
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container: golang:1.19-bullseye
|
||||
|
||||
env:
|
||||
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
|
||||
FROM_TAG: ${{ inputs.from-tag }}
|
||||
TO_TAG: ${{ inputs.to-tag }}
|
||||
outputs:
|
||||
next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }}
|
||||
prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }}
|
||||
|
||||
steps:
|
||||
- name: Install Crane & ECR helper
|
||||
run: |
|
||||
go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
|
||||
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
|
||||
|
||||
- name: Configure ECR login
|
||||
run: |
|
||||
mkdir /github/home/.docker/
|
||||
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
|
||||
|
||||
- name: Get source image digest
|
||||
id: next-digest
|
||||
run: |
|
||||
NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true)
|
||||
if [ -z "${NEXT_DIGEST}" ]; then
|
||||
echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}"
|
||||
echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Get destination image digest (if already exists)
|
||||
id: prev-digest
|
||||
run: |
|
||||
PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true)
|
||||
if [ -z "${PREV_DIGEST}" ]; then
|
||||
echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)"
|
||||
else
|
||||
echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}"
|
||||
|
||||
echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Tag image
|
||||
run: |
|
||||
crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}"
|
||||
|
||||
rollback-tag-image:
|
||||
needs: tag-image
|
||||
if: ${{ !success() }}
|
||||
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container: golang:1.19-bullseye
|
||||
|
||||
env:
|
||||
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
|
||||
ECR_IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
|
||||
DOCKER_HUB_IMAGE: docker.io/neondatabase/build-tools
|
||||
FROM_TAG: ${{ inputs.from-tag }}
|
||||
TO_TAG: ${{ inputs.to-tag }}
|
||||
|
||||
steps:
|
||||
- name: Install Crane & ECR helper
|
||||
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
||||
# The default value is ~/.docker
|
||||
- name: Set custom docker config directory
|
||||
run: |
|
||||
go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
|
||||
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
|
||||
mkdir -p .docker-custom
|
||||
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
|
||||
|
||||
- name: Configure ECR login
|
||||
- uses: docker/login-action@v2
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- uses: docker/login-action@v2
|
||||
with:
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.21'
|
||||
|
||||
- name: Install crane
|
||||
run: |
|
||||
mkdir /github/home/.docker/
|
||||
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
|
||||
go install github.com/google/go-containerregistry/cmd/crane@a0658aa1d0cc7a7f1bcc4a3af9155335b6943f40 # v0.18.0
|
||||
|
||||
- name: Restore previous tag if needed
|
||||
- name: Copy images
|
||||
run: |
|
||||
NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}"
|
||||
PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}"
|
||||
crane copy "${ECR_IMAGE}:${FROM_TAG}" "${ECR_IMAGE}:${TO_TAG}"
|
||||
crane copy "${ECR_IMAGE}:${FROM_TAG}" "${DOCKER_HUB_IMAGE}:${TO_TAG}"
|
||||
|
||||
if [ -z "${NEXT_DIGEST}" ]; then
|
||||
echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [ -z "${PREV_DIGEST}" ]; then
|
||||
# I guess we should delete the tag here/untag the image, but crane does not support it
|
||||
# - https://github.com/google/go-containerregistry/issues/999
|
||||
|
||||
echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback"
|
||||
|
||||
exit 0
|
||||
fi
|
||||
|
||||
CURRENT_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}")
|
||||
if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then
|
||||
crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}"
|
||||
|
||||
echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}"
|
||||
else
|
||||
echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored"
|
||||
fi
|
||||
- name: Remove custom docker config directory
|
||||
if: always()
|
||||
run: |
|
||||
rm -rf .docker-custom
|
||||
|
||||
@@ -20,7 +20,7 @@ ln -s ../../pre-commit.py .git/hooks/pre-commit

This will run following checks on staged files before each commit:
- `rustfmt`
- checks for python files, see [obligatory checks](/docs/sourcetree.md#obligatory-checks).
- checks for Python files, see [obligatory checks](/docs/sourcetree.md#obligatory-checks).

There is also a separate script `./run_clippy.sh` that runs `cargo clippy` on the whole project
and `./scripts/reformat` that runs all formatting tools to ensure the project is up to date.
218
Cargo.lock
generated
218
Cargo.lock
generated
@@ -278,13 +278,13 @@ dependencies = [
|
||||
"camino",
|
||||
"clap",
|
||||
"control_plane",
|
||||
"diesel",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hyper",
|
||||
"metrics",
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"postgres_backend",
|
||||
"postgres_connection",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@@ -1144,16 +1144,6 @@ version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b"
|
||||
|
||||
[[package]]
|
||||
name = "close_fds"
|
||||
version = "0.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3bc416f33de9d59e79e57560f450d21ff8393adcf1cdfc3e6d8fb93d5f88a2ed"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "colorchoice"
|
||||
version = "1.0.0"
|
||||
@@ -1327,6 +1317,8 @@ dependencies = [
|
||||
"clap",
|
||||
"comfy-table",
|
||||
"compute_api",
|
||||
"diesel",
|
||||
"diesel_migrations",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hex",
|
||||
@@ -1341,6 +1333,7 @@ dependencies = [
|
||||
"regex",
|
||||
"reqwest",
|
||||
"safekeeper_api",
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
@@ -1636,6 +1629,52 @@ dependencies = [
|
||||
"rusticata-macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "diesel"
|
||||
version = "2.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "62c6fcf842f17f8c78ecf7c81d75c5ce84436b41ee07e03f490fbb5f5a8731d8"
|
||||
dependencies = [
|
||||
"bitflags 2.4.1",
|
||||
"byteorder",
|
||||
"diesel_derives",
|
||||
"itoa",
|
||||
"pq-sys",
|
||||
"serde_json",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "diesel_derives"
|
||||
version = "2.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ef8337737574f55a468005a83499da720f20c65586241ffea339db9ecdfd2b44"
|
||||
dependencies = [
|
||||
"diesel_table_macro_syntax",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "diesel_migrations"
|
||||
version = "2.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6036b3f0120c5961381b570ee20a02432d7e2d27ea60de9578799cf9156914ac"
|
||||
dependencies = [
|
||||
"diesel",
|
||||
"migrations_internals",
|
||||
"migrations_macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "diesel_table_macro_syntax"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fc5557efc453706fed5e4fa85006fe9817c224c3f480a34c7e5959fd700921c5"
|
||||
dependencies = [
|
||||
"syn 2.0.32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "digest"
|
||||
version = "0.10.7"
|
||||
@@ -2562,6 +2601,16 @@ dependencies = [
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "io-uring"
|
||||
version = "0.6.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "460648e47a07a43110fbfa2e0b14afb2be920093c31e5dccc50e49568e099762"
|
||||
dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ipnet"
|
||||
version = "2.9.0"
|
||||
@@ -2676,6 +2725,12 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "libm"
|
||||
version = "0.2.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.1.4"
|
||||
@@ -2746,6 +2801,15 @@ version = "2.6.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
|
||||
|
||||
[[package]]
|
||||
name = "memoffset"
|
||||
version = "0.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memoffset"
|
||||
version = "0.8.0"
|
||||
@@ -2772,9 +2836,33 @@ dependencies = [
|
||||
"libc",
|
||||
"once_cell",
|
||||
"prometheus",
|
||||
"rand 0.8.5",
|
||||
"rand_distr",
|
||||
"twox-hash",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "migrations_internals"
|
||||
version = "2.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0f23f71580015254b020e856feac3df5878c2c7a8812297edd6c0a485ac9dada"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"toml",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "migrations_macros"
|
||||
version = "2.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cce3325ac70e67bbab5bd837a31cae01f1a6db64e0e744a33cb03a543469ef08"
|
||||
dependencies = [
|
||||
"migrations_internals",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mime"
|
||||
version = "0.3.17"
|
||||
@@ -2854,6 +2942,19 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nix"
|
||||
version = "0.26.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "598beaf3cc6fdd9a5dfb1630c2800c7acd31df7aaf0f565796fba2b53ca1af1b"
|
||||
dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"memoffset 0.7.1",
|
||||
"pin-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nix"
|
||||
version = "0.27.1"
|
||||
@@ -2976,6 +3077,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"libm",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3306,7 +3408,6 @@ dependencies = [
|
||||
"camino-tempfile",
|
||||
"chrono",
|
||||
"clap",
|
||||
"close_fds",
|
||||
"const_format",
|
||||
"consumption_metrics",
|
||||
"crc32c",
|
||||
@@ -3360,6 +3461,7 @@ dependencies = [
|
||||
"tenant_size_model",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-epoll-uring",
|
||||
"tokio-io-timeout",
|
||||
"tokio-postgres",
|
||||
"tokio-stream",
|
||||
@@ -3381,6 +3483,7 @@ dependencies = [
|
||||
"bincode",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"const_format",
|
||||
"enum-map",
|
||||
"hex",
|
||||
@@ -3782,6 +3885,15 @@ version = "0.2.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
|
||||
|
||||
[[package]]
|
||||
name = "pq-sys"
|
||||
version = "0.4.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "31c0052426df997c0cbd30789eb44ca097e3541717a7b8fa36b1c464ee7edebd"
|
||||
dependencies = [
|
||||
"vcpkg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pq_proto"
|
||||
version = "0.1.0"
|
||||
@@ -3791,6 +3903,7 @@ dependencies = [
|
||||
"pin-project-lite",
|
||||
"postgres-protocol",
|
||||
"rand 0.8.5",
|
||||
"smallvec",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tracing",
|
||||
@@ -3980,6 +4093,8 @@ dependencies = [
|
||||
"sync_wrapper",
|
||||
"task-local-extensions",
|
||||
"thiserror",
|
||||
"tikv-jemalloc-ctl",
|
||||
"tikv-jemallocator",
|
||||
"tls-listener",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
@@ -4080,6 +4195,16 @@ dependencies = [
|
||||
"getrandom 0.2.11",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_distr"
|
||||
version = "0.4.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
"rand 0.8.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_hc"
|
||||
version = "0.2.0"
|
||||
@@ -5112,9 +5237,9 @@ checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9"
|
||||
|
||||
[[package]]
|
||||
name = "smol_str"
|
||||
version = "0.2.0"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "74212e6bbe9a4352329b2f68ba3130c15a3f26fe88ff22dbdc6cdd58fa85e99c"
|
||||
checksum = "e6845563ada680337a52d43bb0b29f396f2d911616f6573012645b9e3d048a49"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
@@ -5381,18 +5506,18 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "1.0.40"
|
||||
version = "1.0.47"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac"
|
||||
checksum = "97a802ec30afc17eee47b2855fc72e0c4cd62be9b4efe6591edde0ec5bd68d8f"
|
||||
dependencies = [
|
||||
"thiserror-impl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror-impl"
|
||||
version = "1.0.40"
|
||||
version = "1.0.47"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
|
||||
checksum = "6bb623b56e39ab7dcd4b1b98bb6c8f8d907ed255b18de254088016b27a8ee19b"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -5420,6 +5545,37 @@ dependencies = [
|
||||
"ordered-float 2.10.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tikv-jemalloc-ctl"
|
||||
version = "0.5.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "619bfed27d807b54f7f776b9430d4f8060e66ee138a28632ca898584d462c31c"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"paste",
|
||||
"tikv-jemalloc-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tikv-jemalloc-sys"
|
||||
version = "0.5.4+5.3.0-patched"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9402443cb8fd499b6f327e40565234ff34dbda27460c5b47db0db77443dd85d1"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tikv-jemallocator"
|
||||
version = "0.5.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "965fe0c26be5c56c94e38ba547249074803efd52adfb66de62107d95aab3eaca"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"tikv-jemalloc-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time"
|
||||
version = "0.3.21"
|
||||
@@ -5516,6 +5672,22 @@ dependencies = [
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-epoll-uring"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0e1af4ccddf2f01805cfc9eaefa97ee13c04b52d"
|
||||
dependencies = [
|
||||
"futures",
|
||||
"nix 0.26.4",
|
||||
"once_cell",
|
||||
"scopeguard",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"uring-common",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-io-timeout"
|
||||
version = "1.2.0"
|
||||
@@ -6025,6 +6197,15 @@ dependencies = [
|
||||
"webpki-roots 0.23.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "uring-common"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0e1af4ccddf2f01805cfc9eaefa97ee13c04b52d"
|
||||
dependencies = [
|
||||
"io-uring",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "url"
|
||||
version = "2.3.1"
|
||||
@@ -6586,6 +6767,7 @@ dependencies = [
|
||||
"clap",
|
||||
"clap_builder",
|
||||
"crossbeam-utils",
|
||||
"diesel",
|
||||
"either",
|
||||
"fail",
|
||||
"futures-channel",
|
||||
|
||||
@@ -64,7 +64,6 @@ camino = "1.1.6"
cfg-if = "1.0.0"
chrono = { version = "0.4", default-features = false, features = ["clock"] }
clap = { version = "4.0", features = ["derive"] }
close_fds = "0.3.2"
comfy-table = "6.1"
const_format = "0.2"
crc32c = "0.6"

@@ -149,8 +148,11 @@ tar = "0.4"
task-local-extensions = "0.1.4"
test-context = "0.1"
thiserror = "1.0"
tikv-jemallocator = "0.5"
tikv-jemalloc-ctl = "0.5"
tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
tokio = { version = "1.17", features = ["macros"] }
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
tokio-io-timeout = "1.2.0"
tokio-postgres-rustls = "0.10.0"
tokio-rustls = "0.24"

@@ -164,6 +166,7 @@ tracing = "0.1"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.20.0"
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
twox-hash = { version = "1.6.3", default-features = false }
url = "2.2"
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
walkdir = "2.3.2"
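The hunks above add `tikv-jemallocator` and `tikv-jemalloc-ctl` to the workspace dependency list. As a rough sketch (not taken from this repository), this is the usual way a binary opts into jemalloc and reads an allocation statistic once those crates are available:

```rust
// Sketch only: typical tikv-jemallocator / tikv-jemalloc-ctl usage.
// Assumes both crates are declared in Cargo.toml, as in the hunk above.

#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;

fn main() {
    // jemalloc statistics are cached; advancing the epoch refreshes them.
    tikv_jemalloc_ctl::epoch::advance().expect("advance jemalloc epoch");
    let allocated = tikv_jemalloc_ctl::stats::allocated::read().expect("read stats.allocated");
    println!("heap bytes currently allocated: {allocated}");
}
```

How the crates are actually wired up in the affected binaries is not shown in this comparison; the snippet only illustrates the pattern the new dependencies enable.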
@@ -53,6 +53,7 @@ RUN set -e \
    --bin pagectl \
    --bin safekeeper \
    --bin storage_broker \
    --bin attachment_service \
    --bin proxy \
    --bin neon_local \
    --locked --release \

@@ -80,6 +81,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/attachment_service /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
@@ -52,7 +52,7 @@ RUN cd postgres && \
|
||||
# We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser.
|
||||
# In vanilla postgres this function is limited to Postgres role superuser.
|
||||
# In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases.
|
||||
# We could add the additional grant statements to the postgres repository but it would be hard to maintain,
|
||||
# We could add the additional grant statements to the postgres repository but it would be hard to maintain,
|
||||
# whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
|
||||
# so we do it here.
|
||||
old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
|
||||
@@ -63,14 +63,14 @@ RUN cd postgres && \
|
||||
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
|
||||
fi; \
|
||||
done; \
|
||||
# the second loop is for pg_stat_statement extension versions >= 1.7,
|
||||
# the second loop is for pg_stat_statement extension versions >= 1.7,
|
||||
# where pg_stat_statement_reset() got 3 additional arguments
|
||||
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
|
||||
filename=$(basename "$file"); \
|
||||
if ! echo "$old_list" | grep -q -F "$filename"; then \
|
||||
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
|
||||
fi; \
|
||||
done
|
||||
done
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -144,30 +144,23 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti
|
||||
FROM build-deps AS plv8-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ARG PG_VERSION
|
||||
RUN apt update && \
|
||||
apt install -y ninja-build python3-dev libncurses5 binutils clang
|
||||
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v14" | "v15") \
|
||||
export PLV8_VERSION=3.1.5 \
|
||||
export PLV8_CHECKSUM=1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 \
|
||||
;; \
|
||||
"v16") \
|
||||
export PLV8_VERSION=3.1.8 \
|
||||
export PLV8_CHECKSUM=92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 \
|
||||
;; \
|
||||
*) \
|
||||
echo "Export the valid PG_VERSION variable" && exit 1 \
|
||||
;; \
|
||||
esac && \
|
||||
wget https://github.com/plv8/plv8/archive/refs/tags/v${PLV8_VERSION}.tar.gz -O plv8.tar.gz && \
|
||||
echo "${PLV8_CHECKSUM} plv8.tar.gz" | sha256sum --check && \
|
||||
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \
|
||||
echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \
|
||||
mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
|
||||
# generate and copy upgrade scripts
|
||||
mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \
|
||||
cp upgrade/* /usr/local/pgsql/share/extension/ && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
rm -rf /plv8-* && \
|
||||
find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
|
||||
# don't break computes with installed old version of plv8
|
||||
cd /usr/local/pgsql/lib/ && \
|
||||
ln -s plv8-3.1.10.so plv8-3.1.5.so && \
|
||||
ln -s plv8-3.1.10.so plv8-3.1.8.so && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control
|
||||
@@ -248,9 +241,12 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
|
||||
FROM build-deps AS vector-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
|
||||
echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
|
||||
COPY patches/pgvector.patch /pgvector.patch
|
||||
|
||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.6.0.tar.gz -O pgvector.tar.gz && \
|
||||
echo "b0cf4ba1ab016335ac8fb1cada0d2106235889a194fffeece217c5bda90b2f19 pgvector.tar.gz" | sha256sum --check && \
|
||||
mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
|
||||
patch -p1 < /pgvector.patch && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
|
||||
@@ -527,8 +523,7 @@ RUN apt-get update && \
|
||||
libboost-regex1.74-dev \
|
||||
libboost-serialization1.74-dev \
|
||||
libboost-system1.74-dev \
|
||||
libeigen3-dev \
|
||||
libfreetype6-dev
|
||||
libeigen3-dev
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
|
||||
RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
|
||||
@@ -553,6 +548,8 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.
|
||||
-D PostgreSQL_TYPE_INCLUDE_DIR=`pg_config --includedir-server` \
|
||||
-D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \
|
||||
-D RDK_INSTALL_INTREE=OFF \
|
||||
-D RDK_INSTALL_COMIC_FONTS=OFF \
|
||||
-D RDK_BUILD_FREETYPE_SUPPORT=OFF \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
. && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
@@ -907,7 +904,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
|
||||
# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
|
||||
# libxml2, libxslt1.1 for xml2
|
||||
# libzstd1 for zstd
|
||||
# libboost*, libfreetype6, and zlib1g for rdkit
|
||||
# libboost* for rdkit
|
||||
# ca-certificates for communicating with s3 by compute_ctl
|
||||
RUN apt update && \
|
||||
apt install --no-install-recommends -y \
|
||||
@@ -920,7 +917,6 @@ RUN apt update && \
|
||||
libboost-serialization1.74.0 \
|
||||
libboost-system1.74.0 \
|
||||
libossp-uuid16 \
|
||||
libfreetype6 \
|
||||
libgeos-c1v5 \
|
||||
libgdal28 \
|
||||
libproj19 \
|
||||
@@ -932,7 +928,6 @@ RUN apt update && \
|
||||
libcurl4-openssl-dev \
|
||||
locales \
|
||||
procps \
|
||||
zlib1g \
|
||||
ca-certificates && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
|
||||
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
|
||||
|
||||
Makefile (6 changed lines)

@@ -51,6 +51,8 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS))
CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
# Force cargo not to print progress bar
CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
# Set PQ_LIB_DIR to make sure `attachment_service` get linked with bundled libpq (through diesel)
CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib

#
# Top level Makefile to build Neon and PostgreSQL

@@ -174,10 +176,10 @@ neon-pg-ext-clean-%:

# Build walproposer as a static library. walproposer source code is located
# in the pgxn/neon directory.
#
#
# We also need to include libpgport.a and libpgcommon.a, because walproposer
# uses some functions from those libraries.
#
#
# Some object files are removed from libpgport.a and libpgcommon.a because
# they depend on openssl and other libraries that are not included in our
# Rust build.
README.md (20 changed lines)

@@ -14,8 +14,8 @@ Alternatively, compile and run the project [locally](#running-local-installation
A Neon installation consists of compute nodes and the Neon storage engine. Compute nodes are stateless PostgreSQL nodes backed by the Neon storage engine.

The Neon storage engine consists of two major components:
- Pageserver. Scalable storage backend for the compute nodes.
- Safekeepers. The safekeepers form a redundant WAL service that received WAL from the compute node, and stores it durably until it has been processed by the pageserver and uploaded to cloud storage.
- Pageserver: Scalable storage backend for the compute nodes.
- Safekeepers: The safekeepers form a redundant WAL service that received WAL from the compute node, and stores it durably until it has been processed by the pageserver and uploaded to cloud storage.

See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more information.

@@ -81,9 +81,9 @@ The project uses [rust toolchain file](./rust-toolchain.toml) to define the vers
This file is automatically picked up by [`rustup`](https://rust-lang.github.io/rustup/overrides.html#the-toolchain-file) that installs (if absent) and uses the toolchain version pinned in the file.

rustup users who want to build with another toolchain can use [`rustup override`](https://rust-lang.github.io/rustup/overrides.html#directory-overrides) command to set a specific toolchain for the project's directory.
rustup users who want to build with another toolchain can use the [`rustup override`](https://rust-lang.github.io/rustup/overrides.html#directory-overrides) command to set a specific toolchain for the project's directory.

non-rustup users most probably are not getting the same toolchain automatically from the file, so are responsible to manually verify their toolchain matches the version in the file.
non-rustup users most probably are not getting the same toolchain automatically from the file, so are responsible to manually verify that their toolchain matches the version in the file.
Newer rustc versions most probably will work fine, yet older ones might not be supported due to some new features used by the project or the crates.

#### Building on Linux

@@ -124,7 +124,7 @@ make -j`sysctl -n hw.logicalcpu` -s
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively.

To run the integration tests or Python scripts (not required to use the code), install
Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory.
Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory.

#### Running neon database

@@ -166,7 +166,7 @@ Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55432/postgres'

2. Now, it is possible to connect to postgres and run some queries:
```text
> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres
> psql -p 55432 -h 127.0.0.1 -U cloud_admin postgres
postgres=# CREATE TABLE t(key int primary key, value text);
CREATE TABLE
postgres=# insert into t values(1,1);

@@ -205,7 +205,7 @@ Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres'

# this new postgres instance will have all the data from 'main' postgres,
# but all modifications would not affect data in original postgres
> psql -p55434 -h 127.0.0.1 -U cloud_admin postgres
> psql -p 55434 -h 127.0.0.1 -U cloud_admin postgres
postgres=# select * from t;
 key | value
-----+-------

@@ -216,7 +216,7 @@ postgres=# insert into t values(2,2);
INSERT 0 1

# check that the new change doesn't affect the 'main' postgres
> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres
> psql -p 55432 -h 127.0.0.1 -U cloud_admin postgres
postgres=# select * from t;
 key | value
-----+-------

@@ -224,7 +224,7 @@ postgres=# select * from t;
(1 row)
```

4. If you want to run tests afterward (see below), you must stop all the running of the pageserver, safekeeper, and postgres instances
4. If you want to run tests afterwards (see below), you must stop all the running pageserver, safekeeper, and postgres instances
   you have just started. You can terminate them all with one command:
```sh
> cargo neon stop

@@ -243,7 +243,7 @@ CARGO_BUILD_FLAGS="--features=testing" make
```

By default, this runs both debug and release modes, and all supported postgres versions. When
testing locally, it is convenient to run just run one set of permutations, like this:
testing locally, it is convenient to run just one set of permutations, like this:

```sh
DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
@@ -319,7 +319,7 @@ impl ComputeNode {
    // Get basebackup from the libpq connection to pageserver using `connstr` and
    // unarchive it to `pgdata` directory overriding all its previous content.
    #[instrument(skip_all, fields(%lsn))]
    fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
    fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
        let spec = compute_state.pspec.as_ref().expect("spec must be set");
        let start_time = Instant::now();

@@ -390,6 +390,34 @@ impl ComputeNode {
        Ok(())
    }

    // Gets the basebackup in a retry loop
    #[instrument(skip_all, fields(%lsn))]
    pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
        let mut retry_period_ms = 500;
        let mut attempts = 0;
        let max_attempts = 5;
        loop {
            let result = self.try_get_basebackup(compute_state, lsn);
            match result {
                Ok(_) => {
                    return result;
                }
                Err(ref e) if attempts < max_attempts => {
                    warn!(
                        "Failed to get basebackup: {} (attempt {}/{})",
                        e, attempts, max_attempts
                    );
                    std::thread::sleep(std::time::Duration::from_millis(retry_period_ms));
                    retry_period_ms *= 2;
                }
                Err(_) => {
                    return result;
                }
            }
            attempts += 1;
        }
    }
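With these constants the retry loop makes at most six calls to `try_get_basebackup`, sleeping 0.5 s, 1 s, 2 s, 4 s and 8 s between failures (roughly 15.5 s of backoff in total) before the final error is returned.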
    pub async fn check_safekeepers_synced_async(
        &self,
        compute_state: &ComputeState,
@@ -758,6 +758,14 @@ BEGIN
    END LOOP;
END $$;
        "#,
        r#"
DO $$
BEGIN
    IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
        EXECUTE 'GRANT pg_create_subscription TO neon_superuser';
    END IF;
END
$$;"#,
    ];

    let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
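The hunk above appends one more statement (a version-guarded `GRANT pg_create_subscription`) to the compute node's migration list. As a simplified sketch of applying such a list in order — the real `compute_ctl` apply loop is not shown in this comparison, so the helper name, connection string, and use of the blocking `postgres` crate are assumptions — it could look like:

```rust
use postgres::{Client, NoTls};

/// Hypothetical helper: run each migration statement in order.
/// The real code also records progress in the `neon_migration` schema
/// created by the query above; that bookkeeping is omitted here.
fn apply_migrations(client: &mut Client, migrations: &[&str]) -> Result<(), postgres::Error> {
    for (i, stmt) in migrations.iter().copied().enumerate() {
        // `simple_query` accepts multi-statement strings such as DO $$ ... $$ blocks.
        client.simple_query(stmt)?;
        println!("applied migration {i}");
    }
    Ok(())
}

fn main() -> Result<(), postgres::Error> {
    // Illustrative connection parameters only.
    let mut client = Client::connect("host=127.0.0.1 user=cloud_admin dbname=postgres", NoTls)?;
    // One statement taken from the diff above, purely as an example input.
    let migrations = ["CREATE SCHEMA IF NOT EXISTS neon_migration"];
    apply_migrations(&mut client, &migrations)
}
```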
@@ -10,6 +10,8 @@ async-trait.workspace = true
camino.workspace = true
clap.workspace = true
comfy-table.workspace = true
diesel = { version = "2.1.4", features = ["postgres"]}
diesel_migrations = { version = "2.1.0", features = ["postgres"]}
futures.workspace = true
git-version.workspace = true
nix.workspace = true

@@ -19,6 +21,7 @@ hex.workspace = true
hyper.workspace = true
regex.workspace = true
reqwest = { workspace = true, features = ["blocking", "json"] }
scopeguard.workspace = true
serde.workspace = true
serde_json.workspace = true
serde_with.workspace = true

@@ -21,9 +21,7 @@ tokio.workspace = true
tokio-util.workspace = true
tracing.workspace = true

# TODO: remove this after DB persistence is added, it is only used for
# a parsing function when loading pageservers from neon_local LocalEnv
postgres_backend.workspace = true
diesel = { version = "2.1.4", features = ["serde_json", "postgres"] }

utils = { path = "../../libs/utils/" }
metrics = { path = "../../libs/metrics/" }
0
control_plane/attachment_service/migrations/.keep
Normal file
0
control_plane/attachment_service/migrations/.keep
Normal file
@@ -0,0 +1,6 @@
|
||||
-- This file was automatically created by Diesel to setup helper functions
|
||||
-- and other internal bookkeeping. This file is safe to edit, any future
|
||||
-- changes will be added to existing projects as new migrations.
|
||||
|
||||
DROP FUNCTION IF EXISTS diesel_manage_updated_at(_tbl regclass);
|
||||
DROP FUNCTION IF EXISTS diesel_set_updated_at();
|
||||
@@ -0,0 +1,36 @@
-- This file was automatically created by Diesel to setup helper functions
-- and other internal bookkeeping. This file is safe to edit, any future
-- changes will be added to existing projects as new migrations.


-- Sets up a trigger for the given table to automatically set a column called
-- `updated_at` whenever the row is modified (unless `updated_at` was included
-- in the modified columns)
--
-- # Example
--
-- ```sql
-- CREATE TABLE users (id SERIAL PRIMARY KEY, updated_at TIMESTAMP NOT NULL DEFAULT NOW());
--
-- SELECT diesel_manage_updated_at('users');
-- ```
CREATE OR REPLACE FUNCTION diesel_manage_updated_at(_tbl regclass) RETURNS VOID AS $$
BEGIN
    EXECUTE format('CREATE TRIGGER set_updated_at BEFORE UPDATE ON %s
                    FOR EACH ROW EXECUTE PROCEDURE diesel_set_updated_at()', _tbl);
END;
$$ LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION diesel_set_updated_at() RETURNS trigger AS $$
BEGIN
    IF (
        NEW IS DISTINCT FROM OLD AND
        NEW.updated_at IS NOT DISTINCT FROM OLD.updated_at
    ) THEN
        NEW.updated_at := current_timestamp;
    END IF;
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

@@ -0,0 +1 @@
DROP TABLE tenant_shards;
@@ -0,0 +1,12 @@
CREATE TABLE tenant_shards (
    tenant_id VARCHAR NOT NULL,
    shard_number INTEGER NOT NULL,
    shard_count INTEGER NOT NULL,
    PRIMARY KEY(tenant_id, shard_number, shard_count),
    shard_stripe_size INTEGER NOT NULL,
    generation INTEGER NOT NULL,
    generation_pageserver BIGINT NOT NULL,
    placement_policy VARCHAR NOT NULL,
    -- config is JSON encoded, opaque to the database.
    config TEXT NOT NULL
);
@@ -0,0 +1 @@
DROP TABLE nodes;
@@ -0,0 +1,10 @@
CREATE TABLE nodes (
    node_id BIGINT PRIMARY KEY NOT NULL,

    scheduling_policy VARCHAR NOT NULL,

    listen_http_addr VARCHAR NOT NULL,
    listen_http_port INTEGER NOT NULL,
    listen_pg_addr VARCHAR NOT NULL,
    listen_pg_port INTEGER NOT NULL
);

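Note that `generation_pageserver` is declared `BIGINT NOT NULL` rather than nullable; later in this diff the service writes `i64::MAX` into it to mean "not attached anywhere" (see `detach`, and the `!= i64::MAX` check in `Service::spawn`). A small illustrative sketch of that mapping, with hypothetical helper names that do not appear in the diff:

use utils::id::NodeId;

// Hypothetical helpers showing the sentinel convention for the BIGINT column:
// i64::MAX in generation_pageserver means "no pageserver currently attached".
fn attached_node_to_column(node: Option<NodeId>) -> i64 {
    node.map(|n| n.0 as i64).unwrap_or(i64::MAX)
}

fn column_to_attached_node(v: i64) -> Option<NodeId> {
    if v == i64::MAX {
        None
    } else {
        Some(NodeId(v as u64))
    }
}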
@@ -1,14 +1,18 @@
|
||||
use crate::reconciler::ReconcileError;
|
||||
use crate::service::Service;
|
||||
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
|
||||
use hyper::{Body, Request, Response};
|
||||
use hyper::{StatusCode, Uri};
|
||||
use pageserver_api::models::{TenantCreateRequest, TimelineCreateRequest};
|
||||
use pageserver_api::models::{
|
||||
TenantCreateRequest, TenantLocationConfigRequest, TimelineCreateRequest,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
use utils::auth::SwappableJwtAuth;
|
||||
use utils::http::endpoint::{auth_middleware, request_span};
|
||||
use utils::http::request::parse_request_param;
|
||||
use utils::id::TenantId;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use utils::{
|
||||
http::{
|
||||
@@ -104,34 +108,163 @@ async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiErr
|
||||
json_response(StatusCode::OK, state.service.inspect(inspect_req))
|
||||
}
|
||||
|
||||
async fn handle_tenant_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn handle_tenant_create(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
|
||||
let state = get_state(&req);
|
||||
json_response(StatusCode::OK, service.tenant_create(create_req).await?)
|
||||
}
|
||||
|
||||
// For tenant and timeline deletions, which both implement an "initially return 202, then 404 once
// we're done" semantic, we wrap with a retry loop to expose a simpler API upstream. This avoids
// needing to track a "deleting" state for tenants.
|
||||
async fn deletion_wrapper<R, F>(service: Arc<Service>, f: F) -> Result<Response<Body>, ApiError>
|
||||
where
|
||||
R: std::future::Future<Output = Result<StatusCode, ApiError>> + Send + 'static,
|
||||
F: Fn(Arc<Service>) -> R + Send + Sync + 'static,
|
||||
{
|
||||
let started_at = Instant::now();
|
||||
// To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion
|
||||
// completed.
|
||||
let mut retry_period = Duration::from_secs(1);
|
||||
// On subsequent retries, wait longer.
|
||||
let max_retry_period = Duration::from_secs(5);
|
||||
// Enable callers with a 30 second request timeout to reliably get a response
|
||||
let max_wait = Duration::from_secs(25);
|
||||
|
||||
loop {
|
||||
let status = f(service.clone()).await?;
|
||||
match status {
|
||||
StatusCode::ACCEPTED => {
|
||||
tracing::info!("Deletion accepted, waiting to try again...");
|
||||
tokio::time::sleep(retry_period).await;
|
||||
retry_period = max_retry_period;
|
||||
}
|
||||
StatusCode::NOT_FOUND => {
|
||||
tracing::info!("Deletion complete");
|
||||
return json_response(StatusCode::OK, ());
|
||||
}
|
||||
_ => {
|
||||
tracing::warn!("Unexpected status {status}");
|
||||
return json_response(status, ());
|
||||
}
|
||||
}
|
||||
|
||||
let now = Instant::now();
|
||||
if now + retry_period > started_at + max_wait {
|
||||
tracing::info!("Deletion timed out waiting for 404");
|
||||
// REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of
|
||||
// the pageserver's swagger definition for this endpoint, and has the same desired
|
||||
// effect of causing the control plane to retry later.
|
||||
return json_response(StatusCode::CONFLICT, ());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_tenant_location_config(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
state.service.tenant_create(create_req).await?,
|
||||
service
|
||||
.tenant_location_config(tenant_id, config_req)
|
||||
.await?,
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_timeline_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn handle_tenant_delete(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
|
||||
deletion_wrapper(service, move |service| async move {
|
||||
service.tenant_delete(tenant_id).await
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
async fn handle_tenant_timeline_create(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
|
||||
|
||||
let state = get_state(&req);
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
state
|
||||
.service
|
||||
service
|
||||
.tenant_timeline_create(tenant_id, create_req)
|
||||
.await?,
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_locate(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn handle_tenant_timeline_delete(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
let state = get_state(&req);
|
||||
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
|
||||
|
||||
json_response(StatusCode::OK, state.service.tenant_locate(tenant_id)?)
|
||||
deletion_wrapper(service, move |service| async move {
|
||||
service.tenant_timeline_delete(tenant_id, timeline_id).await
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
async fn handle_tenant_timeline_passthrough(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
|
||||
let Some(path) = req.uri().path_and_query() else {
|
||||
// This should never happen, our request router only calls us if there is a path
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path")));
|
||||
};
|
||||
|
||||
tracing::info!("Proxying request for tenant {} ({})", tenant_id, path);
|
||||
|
||||
// Find the node that holds shard zero
|
||||
let (base_url, tenant_shard_id) = service.tenant_shard0_baseurl(tenant_id)?;
|
||||
|
||||
// Callers will always pass an unsharded tenant ID. Before proxying, we must
|
||||
// rewrite this to a shard-aware shard zero ID.
|
||||
let path = format!("{}", path);
|
||||
let tenant_str = tenant_id.to_string();
|
||||
let tenant_shard_str = format!("{}", tenant_shard_id);
|
||||
let path = path.replace(&tenant_str, &tenant_shard_str);
|
||||
|
||||
let client = mgmt_api::Client::new(base_url, service.get_config().jwt_token.as_deref());
|
||||
let resp = client.get_raw(path).await.map_err(|_e|
|
||||
// FIXME: give ApiError a proper Unavailable variant. We return 503 here because
|
||||
// if we can't successfully send a request to the pageserver, we aren't available.
|
||||
ApiError::ShuttingDown)?;
|
||||
|
||||
// We have a reqwest::Response, would like an http::Response
|
||||
let mut builder = hyper::Response::builder()
|
||||
.status(resp.status())
|
||||
.version(resp.version());
|
||||
for (k, v) in resp.headers() {
|
||||
builder = builder.header(k, v);
|
||||
}
|
||||
|
||||
let response = builder
|
||||
.body(Body::wrap_stream(resp.bytes_stream()))
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
async fn handle_tenant_locate(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
|
||||
}
|
||||
|
||||
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
@@ -141,6 +274,11 @@ async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>,
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let state = get_state(&req);
|
||||
json_response(StatusCode::OK, state.service.node_list().await?)
|
||||
}
|
||||
|
||||
async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let node_id: NodeId = parse_request_param(&req, "node_id")?;
|
||||
let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
|
||||
@@ -154,14 +292,15 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
|
||||
json_response(StatusCode::OK, state.service.node_configure(config_req)?)
|
||||
}
|
||||
|
||||
async fn handle_tenant_shard_migrate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn handle_tenant_shard_migrate(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
|
||||
let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
|
||||
let state = get_state(&req);
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
state
|
||||
.service
|
||||
service
|
||||
.tenant_shard_migrate(tenant_shard_id, migrate_req)
|
||||
.await?,
|
||||
)
|
||||
@@ -178,6 +317,35 @@ impl From<ReconcileError> for ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
/// be allowed to run if Service has finished its initial reconciliation.
|
||||
async fn tenant_service_handler<R, H>(request: Request<Body>, handler: H) -> R::Output
|
||||
where
|
||||
R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
|
||||
H: FnOnce(Arc<Service>, Request<Body>) -> R + Send + Sync + 'static,
|
||||
{
|
||||
let state = get_state(&request);
|
||||
let service = state.service.clone();
|
||||
|
||||
let startup_complete = service.startup_complete.clone();
|
||||
if tokio::time::timeout(STARTUP_RECONCILE_TIMEOUT, startup_complete.wait())
|
||||
.await
|
||||
.is_err()
|
||||
{
|
||||
// This shouldn't happen: it is the responsibility of [`Service::startup_reconcile`] to use appropriate
// timeouts around its remote calls, to bound its runtime.
|
||||
return Err(ApiError::Timeout(
|
||||
"Timed out waiting for service readiness".into(),
|
||||
));
|
||||
}
|
||||
|
||||
request_span(
|
||||
request,
|
||||
|request| async move { handler(service, request).await },
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
pub fn make_router(
|
||||
service: Arc<Service>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
@@ -196,23 +364,67 @@ pub fn make_router(
|
||||
|
||||
router
|
||||
.data(Arc::new(HttpState::new(service, auth)))
|
||||
// Non-prefixed generic endpoints (status, metrics)
|
||||
.get("/status", |r| request_span(r, handle_status))
|
||||
.post("/re-attach", |r| request_span(r, handle_re_attach))
|
||||
.post("/validate", |r| request_span(r, handle_validate))
|
||||
.post("/attach-hook", |r| request_span(r, handle_attach_hook))
|
||||
.post("/inspect", |r| request_span(r, handle_inspect))
|
||||
.post("/node", |r| request_span(r, handle_node_register))
|
||||
.put("/node/:node_id/config", |r| {
|
||||
// Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
|
||||
.post("/upcall/v1/re-attach", |r| {
|
||||
request_span(r, handle_re_attach)
|
||||
})
|
||||
.post("/upcall/v1/validate", |r| request_span(r, handle_validate))
|
||||
// Test/dev/debug endpoints
|
||||
.post("/debug/v1/attach-hook", |r| {
|
||||
request_span(r, handle_attach_hook)
|
||||
})
|
||||
.post("/debug/v1/inspect", |r| request_span(r, handle_inspect))
|
||||
.get("/control/v1/tenant/:tenant_id/locate", |r| {
|
||||
tenant_service_handler(r, handle_tenant_locate)
|
||||
})
|
||||
// Node operations
|
||||
.post("/control/v1/node", |r| {
|
||||
request_span(r, handle_node_register)
|
||||
})
|
||||
.get("/control/v1/node", |r| request_span(r, handle_node_list))
|
||||
.put("/control/v1/node/:node_id/config", |r| {
|
||||
request_span(r, handle_node_configure)
|
||||
})
|
||||
.post("/tenant", |r| request_span(r, handle_tenant_create))
|
||||
.post("/tenant/:tenant_id/timeline", |r| {
|
||||
request_span(r, handle_tenant_timeline_create)
|
||||
// Tenant Shard operations
|
||||
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
|
||||
tenant_service_handler(r, handle_tenant_shard_migrate)
|
||||
})
|
||||
.get("/tenant/:tenant_id/locate", |r| {
|
||||
request_span(r, handle_tenant_locate)
|
||||
// Tenant operations
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
|
||||
.post("/v1/tenant", |r| {
|
||||
tenant_service_handler(r, handle_tenant_create)
|
||||
})
|
||||
.delete("/v1/tenant/:tenant_id", |r| {
|
||||
tenant_service_handler(r, handle_tenant_delete)
|
||||
})
|
||||
.put("/v1/tenant/:tenant_id/location_config", |r| {
|
||||
tenant_service_handler(r, handle_tenant_location_config)
|
||||
})
|
||||
// Tenant Shard operations (low level/maintenance)
|
||||
.put("/tenant/:tenant_shard_id/migrate", |r| {
|
||||
request_span(r, handle_tenant_shard_migrate)
|
||||
tenant_service_handler(r, handle_tenant_shard_migrate)
|
||||
})
|
||||
// Timeline operations
|
||||
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
|
||||
tenant_service_handler(r, handle_tenant_timeline_delete)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/timeline", |r| {
|
||||
tenant_service_handler(r, handle_tenant_timeline_create)
|
||||
})
|
||||
// Tenant detail GET passthrough to shard zero
|
||||
.get("/v1/tenant/:tenant_id*", |r| {
|
||||
tenant_service_handler(r, handle_tenant_timeline_passthrough)
|
||||
})
|
||||
// Timeline GET passthrough to shard zero. Note that the `*` in the URL is a wildcard: any future
|
||||
// timeline GET APIs will be implicitly included.
|
||||
.get("/v1/tenant/:tenant_id/timeline*", |r| {
|
||||
tenant_service_handler(r, handle_tenant_timeline_passthrough)
|
||||
})
|
||||
// Path aliases for tests_forward_compatibility
|
||||
// TODO: remove these in future PR
|
||||
.post("/re-attach", |r| request_span(r, handle_re_attach))
|
||||
.post("/validate", |r| request_span(r, handle_validate))
|
||||
}
|
||||
|
||||
@@ -7,6 +7,7 @@ mod node;
|
||||
pub mod persistence;
|
||||
mod reconciler;
|
||||
mod scheduler;
|
||||
mod schema;
|
||||
pub mod service;
|
||||
mod tenant_state;
|
||||
|
||||
@@ -17,6 +18,8 @@ enum PlacementPolicy {
|
||||
/// Production-ready way to attach a tenant: one attached pageserver and
|
||||
/// some number of secondaries.
|
||||
Double(usize),
|
||||
/// Do not attach to any pageservers
|
||||
Detached,
|
||||
}
|
||||
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone)]
|
||||
|
||||
@@ -12,9 +12,9 @@ use camino::Utf8PathBuf;
|
||||
use clap::Parser;
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use std::sync::Arc;
|
||||
use tokio::signal::unix::SignalKind;
|
||||
use utils::auth::{JwtAuth, SwappableJwtAuth};
|
||||
use utils::logging::{self, LogFormat};
|
||||
use utils::signals::{ShutdownSignals, Signal};
|
||||
|
||||
use utils::{project_build_tag, project_git_version, tcp_listener};
|
||||
|
||||
@@ -39,7 +39,11 @@ struct Cli {
|
||||
|
||||
/// Path to the .json file to store state (will be created if it doesn't exist)
|
||||
#[arg(short, long)]
|
||||
path: Utf8PathBuf,
|
||||
path: Option<Utf8PathBuf>,
|
||||
|
||||
/// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
|
||||
#[arg(long)]
|
||||
database_url: String,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
@@ -58,7 +62,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
GIT_VERSION,
|
||||
launch_ts.to_string(),
|
||||
BUILD_TAG,
|
||||
args.path,
|
||||
args.path.as_ref().unwrap_or(&Utf8PathBuf::from("<none>")),
|
||||
args.listen
|
||||
);
|
||||
|
||||
@@ -66,9 +70,10 @@ async fn main() -> anyhow::Result<()> {
|
||||
jwt_token: args.jwt_token,
|
||||
};
|
||||
|
||||
let persistence = Arc::new(Persistence::new(&args.path).await);
|
||||
let json_path = args.path;
|
||||
let persistence = Arc::new(Persistence::new(args.database_url, json_path.clone()));
|
||||
|
||||
let service = Service::spawn(config, persistence).await?;
|
||||
let service = Service::spawn(config, persistence.clone()).await?;
|
||||
|
||||
let http_listener = tcp_listener::bind(args.listen)?;
|
||||
|
||||
@@ -81,20 +86,31 @@ async fn main() -> anyhow::Result<()> {
|
||||
let router = make_router(service, auth)
|
||||
.build()
|
||||
.map_err(|err| anyhow!(err))?;
|
||||
let service = utils::http::RouterService::new(router).unwrap();
|
||||
let server = hyper::Server::from_tcp(http_listener)?.serve(service);
|
||||
let router_service = utils::http::RouterService::new(router).unwrap();
|
||||
let server = hyper::Server::from_tcp(http_listener)?.serve(router_service);
|
||||
|
||||
tracing::info!("Serving on {0}", args.listen);
|
||||
|
||||
tokio::task::spawn(server);
|
||||
|
||||
ShutdownSignals::handle(|signal| match signal {
|
||||
Signal::Interrupt | Signal::Terminate | Signal::Quit => {
|
||||
tracing::info!("Got {}. Terminating", signal.name());
|
||||
// We're just a test helper: no graceful shutdown.
|
||||
std::process::exit(0);
|
||||
}
|
||||
})?;
|
||||
// Wait until we receive a signal
|
||||
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?;
|
||||
let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?;
|
||||
let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate())?;
|
||||
tokio::select! {
|
||||
_ = sigint.recv() => {},
|
||||
_ = sigterm.recv() => {},
|
||||
_ = sigquit.recv() => {},
|
||||
}
|
||||
tracing::info!("Terminating on signal");
|
||||
|
||||
Ok(())
|
||||
if json_path.is_some() {
|
||||
// Write out a JSON dump on shutdown: this is used in compat tests to avoid passing
|
||||
// full postgres dumps around.
|
||||
if let Err(e) = persistence.write_tenants_json().await {
|
||||
tracing::error!("Failed to write JSON on shutdown: {e}")
|
||||
}
|
||||
}
|
||||
|
||||
std::process::exit(0);
|
||||
}
|
||||
|
||||
@@ -1,6 +1,8 @@
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
use utils::id::NodeId;

use crate::persistence::NodePersistence;

#[derive(Clone)]
pub(crate) struct Node {
    pub(crate) id: NodeId,
@@ -34,4 +36,15 @@ impl Node {
            NodeSchedulingPolicy::Pause => false,
        }
    }

    pub(crate) fn to_persistent(&self) -> NodePersistence {
        NodePersistence {
            node_id: self.id.0 as i64,
            scheduling_policy: self.scheduling.into(),
            listen_http_addr: self.listen_http_addr.clone(),
            listen_http_port: self.listen_http_port as i32,
            listen_pg_addr: self.listen_pg_addr.clone(),
            listen_pg_port: self.listen_pg_port as i32,
        }
    }
}

@@ -1,181 +1,295 @@
|
||||
use std::{collections::HashMap, str::FromStr};
|
||||
use std::collections::HashMap;
|
||||
use std::str::FromStr;
|
||||
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use control_plane::{
|
||||
attachment_service::{NodeAvailability, NodeSchedulingPolicy},
|
||||
local_env::LocalEnv,
|
||||
};
|
||||
use pageserver_api::{
|
||||
models::TenantConfig,
|
||||
shard::{ShardCount, ShardNumber, TenantShardId},
|
||||
};
|
||||
use postgres_connection::parse_host_port;
|
||||
use camino::Utf8Path;
|
||||
use camino::Utf8PathBuf;
|
||||
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
|
||||
use diesel::pg::PgConnection;
|
||||
use diesel::prelude::*;
|
||||
use diesel::Connection;
|
||||
use pageserver_api::models::TenantConfig;
|
||||
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::{
|
||||
generation::Generation,
|
||||
id::{NodeId, TenantId},
|
||||
};
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
use crate::{node::Node, PlacementPolicy};
|
||||
use crate::node::Node;
|
||||
use crate::PlacementPolicy;
|
||||
|
||||
/// Placeholder for storage. This will be replaced with a database client.
/// ## What do we store?
///
/// The attachment service does not store most of its state durably.
///
/// The essential things to store durably are:
/// - generation numbers, as these must always advance monotonically to ensure data safety.
/// - Tenant's PlacementPolicy and TenantConfig, as the source of truth for these is something external.
/// - Node's scheduling policies, as the source of truth for these is something external.
///
/// Other things we store durably as an implementation detail:
/// - Node's host/port: this could be avoided if we made nodes emit a self-registering heartbeat,
///   but it is operationally simpler to make this service the authority for which nodes
///   it talks to.
///
/// ## Performance/efficiency
///
/// The attachment service does not go via the database for most things: there are
/// a couple of places where we must, and where efficiency matters:
/// - Incrementing generation numbers: the Reconciler has to wait for this to complete
///   before it can attach a tenant, so this acts as a bound on how fast things like
///   failover can happen.
/// - Pageserver re-attach: we will increment many shards' generations when this happens,
///   so it is important to avoid e.g. issuing O(N) queries.
///
/// Database calls relating to nodes have low performance requirements, as they are very rarely
/// updated, and reads of nodes are always from memory, not the database. We only require that
/// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline.
pub struct Persistence {
    state: std::sync::Mutex<PersistentState>,
    database_url: String,

    // In test environments, we support loading+saving a JSON file. This is temporary, for the benefit of
    // test_compatibility.py, so that we don't have to commit to making the database contents fully backward/forward
    // compatible just yet.
    json_path: Option<Utf8PathBuf>,
}
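The re-attach efficiency point above is what the `re_attach` method further down addresses; its current implementation issues an UPDATE followed by a separate SELECT (it carries a `TODO: UPDATE+SELECT in one query`). As a rough sketch only of the single-round-trip variant, reusing Diesel's `returning` support that this diff already uses in `increment_generation` (the standalone function name is hypothetical):

use diesel::prelude::*;

// Hypothetical single-query variant of the generation bump in `re_attach`:
// increment every shard attached to `node_id` and return the updated rows in
// the same round trip, instead of an UPDATE followed by a SELECT.
fn bump_generations_for_node(
    conn: &mut diesel::pg::PgConnection,
    node_id: i64,
) -> DatabaseResult<Vec<TenantShardPersistence>> {
    use crate::schema::tenant_shards::dsl::*;

    Ok(diesel::update(tenant_shards)
        .filter(generation_pageserver.eq(node_id))
        .set(generation.eq(generation + 1))
        .returning(TenantShardPersistence::as_returning())
        .get_results(conn)?)
}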
|
||||
|
||||
// Top level state available to all HTTP handlers
|
||||
/// Legacy format, for use in JSON compat objects in test environment
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct PersistentState {
|
||||
struct JsonPersistence {
|
||||
tenants: HashMap<TenantShardId, TenantShardPersistence>,
|
||||
|
||||
#[serde(skip)]
|
||||
path: Utf8PathBuf,
|
||||
}
|
||||
|
||||
/// A convenience for serializing the state inside a sync lock, and then
|
||||
/// writing it to disk outside of the lock. This will go away when switching
|
||||
/// to a database backend.
|
||||
struct PendingWrite {
|
||||
bytes: Vec<u8>,
|
||||
path: Utf8PathBuf,
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum DatabaseError {
|
||||
#[error(transparent)]
|
||||
Query(#[from] diesel::result::Error),
|
||||
#[error(transparent)]
|
||||
Connection(#[from] diesel::result::ConnectionError),
|
||||
#[error("Logical error: {0}")]
|
||||
Logical(String),
|
||||
}
|
||||
|
||||
impl PendingWrite {
|
||||
async fn commit(&self) -> anyhow::Result<()> {
|
||||
tokio::fs::write(&self.path, &self.bytes).await?;
|
||||
pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentState {
|
||||
fn save(&self) -> PendingWrite {
|
||||
PendingWrite {
|
||||
bytes: serde_json::to_vec(self).expect("Serialization error"),
|
||||
path: self.path.clone(),
|
||||
impl Persistence {
|
||||
pub fn new(database_url: String, json_path: Option<Utf8PathBuf>) -> Self {
|
||||
Self {
|
||||
database_url,
|
||||
json_path,
|
||||
}
|
||||
}
|
||||
|
||||
async fn load(path: &Utf8Path) -> anyhow::Result<Self> {
|
||||
let bytes = tokio::fs::read(path).await?;
|
||||
let mut decoded = serde_json::from_slice::<Self>(&bytes)?;
|
||||
decoded.path = path.to_owned();
|
||||
/// Call the provided function in a tokio blocking thread, with a Diesel database connection.
|
||||
async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
|
||||
where
|
||||
F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
|
||||
R: Send + 'static,
|
||||
{
|
||||
let database_url = self.database_url.clone();
|
||||
tokio::task::spawn_blocking(move || -> DatabaseResult<R> {
|
||||
// TODO: connection pooling, such as via diesel::r2d2
|
||||
let mut conn = PgConnection::establish(&database_url)?;
|
||||
func(&mut conn)
|
||||
})
|
||||
.await
|
||||
.expect("Task panic")
|
||||
}
|
||||
|
||||
/// When a node is first registered, persist it before using it for anything
|
||||
pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> {
|
||||
let np = node.to_persistent();
|
||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||
diesel::insert_into(crate::schema::nodes::table)
|
||||
.values(&np)
|
||||
.execute(conn)?;
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
/// At startup, populate the list of nodes which our shards may be placed on
|
||||
pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<Node>> {
|
||||
let nodes: Vec<Node> = self
|
||||
.with_conn(move |conn| -> DatabaseResult<_> {
|
||||
Ok(crate::schema::nodes::table
|
||||
.load::<NodePersistence>(conn)?
|
||||
.into_iter()
|
||||
.map(|n| Node {
|
||||
id: NodeId(n.node_id as u64),
|
||||
// At startup we consider a node offline until proven otherwise.
|
||||
availability: NodeAvailability::Offline,
|
||||
scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy)
|
||||
.expect("Bad scheduling policy in DB"),
|
||||
listen_http_addr: n.listen_http_addr,
|
||||
listen_http_port: n.listen_http_port as u16,
|
||||
listen_pg_addr: n.listen_pg_addr,
|
||||
listen_pg_port: n.listen_pg_port as u16,
|
||||
})
|
||||
.collect::<Vec<Node>>())
|
||||
})
|
||||
.await?;
|
||||
|
||||
tracing::info!("list_nodes: loaded {} nodes", nodes.len());
|
||||
|
||||
Ok(nodes)
|
||||
}
|
||||
|
||||
/// At startup, load the high level state for shards, such as their config + policy. This will
|
||||
/// be enriched at runtime with state discovered on pageservers.
|
||||
pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
|
||||
let loaded = self
|
||||
.with_conn(move |conn| -> DatabaseResult<_> {
|
||||
Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
|
||||
})
|
||||
.await?;
|
||||
|
||||
if loaded.is_empty() {
|
||||
if let Some(path) = &self.json_path {
|
||||
if tokio::fs::try_exists(path)
|
||||
.await
|
||||
.map_err(|e| DatabaseError::Logical(format!("Error stat'ing JSON file: {e}")))?
|
||||
{
|
||||
tracing::info!("Importing from legacy JSON format at {path}");
|
||||
return self.list_tenant_shards_json(path).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(loaded)
|
||||
}
|
||||
|
||||
/// Shim for automated compatibility tests: load tenants from a JSON file instead of database
|
||||
pub(crate) async fn list_tenant_shards_json(
|
||||
&self,
|
||||
path: &Utf8Path,
|
||||
) -> DatabaseResult<Vec<TenantShardPersistence>> {
|
||||
let bytes = tokio::fs::read(path)
|
||||
.await
|
||||
.map_err(|e| DatabaseError::Logical(format!("Failed to load JSON: {e}")))?;
|
||||
|
||||
let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
|
||||
.map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
|
||||
for (tenant_id, tenant) in &mut decoded.tenants {
|
||||
// Backward compat: an old attachments.json from before PR #6251, replace
|
||||
// empty strings with proper defaults.
|
||||
if tenant.tenant_id.is_empty() {
|
||||
tenant.tenant_id = format!("{}", tenant_id);
|
||||
tenant.config = serde_json::to_string(&TenantConfig::default())?;
|
||||
tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())?;
|
||||
tenant.tenant_id = tenant_id.to_string();
|
||||
tenant.config = serde_json::to_string(&TenantConfig::default())
|
||||
.map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
|
||||
tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())
|
||||
.map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(decoded)
|
||||
let tenants: Vec<TenantShardPersistence> = decoded.tenants.into_values().collect();
|
||||
|
||||
// Synchronize database with what is in the JSON file
|
||||
self.insert_tenant_shards(tenants.clone()).await?;
|
||||
|
||||
Ok(tenants)
|
||||
}
|
||||
|
||||
async fn load_or_new(path: &Utf8Path) -> Self {
|
||||
match Self::load(path).await {
|
||||
Ok(s) => {
|
||||
tracing::info!("Loaded state file at {}", path);
|
||||
s
|
||||
}
|
||||
Err(e)
|
||||
if e.downcast_ref::<std::io::Error>()
|
||||
.map(|e| e.kind() == std::io::ErrorKind::NotFound)
|
||||
.unwrap_or(false) =>
|
||||
{
|
||||
tracing::info!("Will create state file at {}", path);
|
||||
Self {
|
||||
tenants: HashMap::new(),
|
||||
path: path.to_owned(),
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path)
|
||||
}
|
||||
/// For use in testing environments, where we dump out JSON on shutdown.
|
||||
pub async fn write_tenants_json(&self) -> anyhow::Result<()> {
|
||||
let Some(path) = &self.json_path else {
|
||||
anyhow::bail!("Cannot write JSON if path isn't set (test environment bug)");
|
||||
};
|
||||
tracing::info!("Writing state to {path}...");
|
||||
let tenants = self.list_tenant_shards().await?;
|
||||
let mut tenants_map = HashMap::new();
|
||||
for tsp in tenants {
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
|
||||
shard_number: ShardNumber(tsp.shard_number as u8),
|
||||
shard_count: ShardCount(tsp.shard_count as u8),
|
||||
};
|
||||
|
||||
tenants_map.insert(tenant_shard_id, tsp);
|
||||
}
|
||||
}
|
||||
}
|
||||
let json = serde_json::to_string(&JsonPersistence {
|
||||
tenants: tenants_map,
|
||||
})?;
|
||||
|
||||
impl Persistence {
|
||||
pub async fn new(path: &Utf8Path) -> Self {
|
||||
let state = PersistentState::load_or_new(path).await;
|
||||
Self {
|
||||
state: std::sync::Mutex::new(state),
|
||||
}
|
||||
}
|
||||
tokio::fs::write(path, &json).await?;
|
||||
tracing::info!("Wrote {} bytes to {path}...", json.len());
|
||||
|
||||
/// When registering a node, persist it so that on next start we will be able to
|
||||
/// iterate over known nodes to synchronize their tenant shard states with our observed state.
|
||||
pub(crate) async fn insert_node(&self, _node: &Node) -> anyhow::Result<()> {
|
||||
// TODO: node persitence will come with database backend
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// At startup, we populate the service's list of nodes, and use this list to call into
|
||||
/// each node to do an initial reconciliation of the state of the world with our in-memory
|
||||
/// observed state.
|
||||
pub(crate) async fn list_nodes(&self) -> anyhow::Result<Vec<Node>> {
|
||||
let env = LocalEnv::load_config()?;
|
||||
// TODO: node persitence will come with database backend
|
||||
|
||||
// XXX hack: enable test_backward_compatibility to work by populating our list of
|
||||
// nodes from LocalEnv when it is not present in persistent storage. Otherwise at
|
||||
// first startup in the compat test, we may have shards but no nodes.
|
||||
let mut result = Vec::new();
|
||||
tracing::info!(
|
||||
"Loaded {} pageserver nodes from LocalEnv",
|
||||
env.pageservers.len()
|
||||
);
|
||||
for ps_conf in env.pageservers {
|
||||
let (pg_host, pg_port) =
|
||||
parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
|
||||
let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
|
||||
.expect("Unable to parse listen_http_addr");
|
||||
result.push(Node {
|
||||
id: ps_conf.id,
|
||||
listen_pg_addr: pg_host.to_string(),
|
||||
listen_pg_port: pg_port.unwrap_or(5432),
|
||||
listen_http_addr: http_host.to_string(),
|
||||
listen_http_port: http_port.unwrap_or(80),
|
||||
availability: NodeAvailability::Active,
|
||||
scheduling: NodeSchedulingPolicy::Active,
|
||||
});
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// At startup, we populate our map of tenant shards from persistent storage.
|
||||
pub(crate) async fn list_tenant_shards(&self) -> anyhow::Result<Vec<TenantShardPersistence>> {
|
||||
let locked = self.state.lock().unwrap();
|
||||
Ok(locked.tenants.values().cloned().collect())
|
||||
}
|
||||
|
||||
/// Tenants must be persisted before we schedule them for the first time. This enables us
|
||||
/// to correctly retain generation monotonicity, and the externally provided placement policy & config.
|
||||
pub(crate) async fn insert_tenant_shards(
|
||||
&self,
|
||||
shards: Vec<TenantShardPersistence>,
|
||||
) -> anyhow::Result<()> {
|
||||
let write = {
|
||||
let mut locked = self.state.lock().unwrap();
|
||||
for shard in shards {
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id: TenantId::from_str(shard.tenant_id.as_str())?,
|
||||
shard_number: ShardNumber(shard.shard_number as u8),
|
||||
shard_count: ShardCount(shard.shard_count as u8),
|
||||
};
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||
conn.transaction(|conn| -> QueryResult<()> {
|
||||
for tenant in &shards {
|
||||
diesel::insert_into(tenant_shards)
|
||||
.values(tenant)
|
||||
.execute(conn)?;
|
||||
}
|
||||
Ok(())
|
||||
})?;
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
locked.tenants.insert(tenant_shard_id, shard);
|
||||
}
|
||||
locked.save()
|
||||
};
|
||||
/// Ordering: call this _after_ deleting the tenant on pageservers, but _before_ dropping state for
|
||||
/// the tenant from memory on this server.
|
||||
#[allow(unused)]
|
||||
pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||
diesel::delete(tenant_shards)
|
||||
.filter(tenant_id.eq(del_tenant_id.to_string()))
|
||||
.execute(conn)?;
|
||||
|
||||
write.commit().await?;
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
Ok(())
|
||||
/// When a pageserver invokes the /re-attach API, this function is responsible for doing an efficient
/// batched increment of the generations of all tenants whose generation_pageserver is equal to
/// the node that called /re-attach.
|
||||
#[tracing::instrument(skip_all, fields(node_id))]
|
||||
pub(crate) async fn re_attach(
|
||||
&self,
|
||||
node_id: NodeId,
|
||||
) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
let updated = self
|
||||
.with_conn(move |conn| {
|
||||
let rows_updated = diesel::update(tenant_shards)
|
||||
.filter(generation_pageserver.eq(node_id.0 as i64))
|
||||
.set(generation.eq(generation + 1))
|
||||
.execute(conn)?;
|
||||
|
||||
tracing::info!("Incremented {} tenants' generations", rows_updated);
|
||||
|
||||
// TODO: UPDATE+SELECT in one query
|
||||
|
||||
let updated = tenant_shards
|
||||
.filter(generation_pageserver.eq(node_id.0 as i64))
|
||||
.select(TenantShardPersistence::as_select())
|
||||
.load(conn)?;
|
||||
Ok(updated)
|
||||
})
|
||||
.await?;
|
||||
|
||||
let mut result = HashMap::new();
|
||||
for tsp in updated {
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())
|
||||
.map_err(|e| DatabaseError::Logical(format!("Malformed tenant id: {e}")))?,
|
||||
shard_number: ShardNumber(tsp.shard_number as u8),
|
||||
shard_count: ShardCount(tsp.shard_count as u8),
|
||||
};
|
||||
result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Reconciler calls this immediately before attaching to a new pageserver, to acquire a unique, monotonically
|
||||
@@ -186,57 +300,48 @@ impl Persistence {
|
||||
tenant_shard_id: TenantShardId,
|
||||
node_id: NodeId,
|
||||
) -> anyhow::Result<Generation> {
|
||||
let (write, gen) = {
|
||||
let mut locked = self.state.lock().unwrap();
|
||||
let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
|
||||
anyhow::bail!("Tried to increment generation of unknown shard");
|
||||
};
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
let updated = self
|
||||
.with_conn(move |conn| {
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
|
||||
.set((
|
||||
generation.eq(generation + 1),
|
||||
generation_pageserver.eq(node_id.0 as i64),
|
||||
))
|
||||
// TODO: only returning() the generation column
|
||||
.returning(TenantShardPersistence::as_returning())
|
||||
.get_result(conn)?;
|
||||
|
||||
shard.generation += 1;
|
||||
shard.generation_pageserver = Some(node_id);
|
||||
Ok(updated)
|
||||
})
|
||||
.await?;
|
||||
|
||||
let gen = Generation::new(shard.generation);
|
||||
(locked.save(), gen)
|
||||
};
|
||||
|
||||
write.commit().await?;
|
||||
Ok(gen)
|
||||
Ok(Generation::new(updated.generation as u32))
|
||||
}
|
||||
|
||||
pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
|
||||
let write = {
|
||||
let mut locked = self.state.lock().unwrap();
|
||||
let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
|
||||
anyhow::bail!("Tried to increment generation of unknown shard");
|
||||
};
|
||||
shard.generation_pageserver = None;
|
||||
locked.save()
|
||||
};
|
||||
write.commit().await?;
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
self.with_conn(move |conn| {
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
|
||||
.set((
|
||||
generation_pageserver.eq(i64::MAX),
|
||||
placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
|
||||
))
|
||||
.execute(conn)?;
|
||||
|
||||
Ok(updated)
|
||||
})
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn re_attach(
|
||||
&self,
|
||||
node_id: NodeId,
|
||||
) -> anyhow::Result<HashMap<TenantShardId, Generation>> {
|
||||
let (write, result) = {
|
||||
let mut result = HashMap::new();
|
||||
let mut locked = self.state.lock().unwrap();
|
||||
for (tenant_shard_id, shard) in locked.tenants.iter_mut() {
|
||||
if shard.generation_pageserver == Some(node_id) {
|
||||
shard.generation += 1;
|
||||
result.insert(*tenant_shard_id, Generation::new(shard.generation));
|
||||
}
|
||||
}
|
||||
|
||||
(locked.save(), result)
|
||||
};
|
||||
|
||||
write.commit().await?;
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
// TODO: when we start shard splitting, we must durably mark the tenant so that
|
||||
// on restart, we know that we must go through recovery (list shards that exist
|
||||
// and pick up where we left off and/or revert to parent shards).
|
||||
@@ -254,7 +359,8 @@ impl Persistence {
|
||||
}
|
||||
|
||||
/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone)]
|
||||
#[diesel(table_name = crate::schema::tenant_shards)]
|
||||
pub(crate) struct TenantShardPersistence {
|
||||
#[serde(default)]
|
||||
pub(crate) tenant_id: String,
|
||||
@@ -265,16 +371,28 @@ pub(crate) struct TenantShardPersistence {
|
||||
#[serde(default)]
|
||||
pub(crate) shard_stripe_size: i32,
|
||||
|
||||
// Currently attached pageserver
|
||||
#[serde(rename = "pageserver")]
|
||||
pub(crate) generation_pageserver: Option<NodeId>,
|
||||
|
||||
// Latest generation number: next time we attach, increment this
|
||||
// and use the incremented number when attaching
|
||||
pub(crate) generation: u32,
|
||||
pub(crate) generation: i32,
|
||||
|
||||
// Currently attached pageserver
|
||||
#[serde(rename = "pageserver")]
|
||||
pub(crate) generation_pageserver: i64,
|
||||
|
||||
#[serde(default)]
|
||||
pub(crate) placement_policy: String,
|
||||
#[serde(default)]
|
||||
pub(crate) config: String,
|
||||
}
|
||||
|
||||
/// Parts of [`crate::node::Node`] that are stored durably
|
||||
#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable)]
|
||||
#[diesel(table_name = crate::schema::nodes)]
|
||||
pub(crate) struct NodePersistence {
|
||||
pub(crate) node_id: i64,
|
||||
pub(crate) scheduling_policy: String,
|
||||
pub(crate) listen_http_addr: String,
|
||||
pub(crate) listen_http_port: i32,
|
||||
pub(crate) listen_pg_addr: String,
|
||||
pub(crate) listen_pg_port: i32,
|
||||
}
|
||||
|
||||
control_plane/attachment_service/src/schema.rs (new file, 27 lines)
@@ -0,0 +1,27 @@
// @generated automatically by Diesel CLI.

diesel::table! {
    nodes (node_id) {
        node_id -> Int8,
        scheduling_policy -> Varchar,
        listen_http_addr -> Varchar,
        listen_http_port -> Int4,
        listen_pg_addr -> Varchar,
        listen_pg_port -> Int4,
    }
}

diesel::table! {
    tenant_shards (tenant_id, shard_number, shard_count) {
        tenant_id -> Varchar,
        shard_number -> Int4,
        shard_count -> Int4,
        shard_stripe_size -> Int4,
        generation -> Int4,
        generation_pageserver -> Int8,
        placement_policy -> Varchar,
        config -> Text,
    }
}

diesel::allow_tables_to_appear_in_same_query!(nodes, tenant_shards,);

@@ -11,6 +11,7 @@ use control_plane::attachment_service::{
|
||||
TenantCreateResponseShard, TenantLocateResponse, TenantLocateResponseShard,
|
||||
TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
};
|
||||
use diesel::result::DatabaseErrorKind;
|
||||
use hyper::StatusCode;
|
||||
use pageserver_api::{
|
||||
control_api::{
|
||||
@@ -20,22 +21,24 @@ use pageserver_api::{
|
||||
models,
|
||||
models::{
|
||||
LocationConfig, LocationConfigMode, ShardParameters, TenantConfig, TenantCreateRequest,
|
||||
TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation,
|
||||
TimelineCreateRequest, TimelineInfo,
|
||||
},
|
||||
shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId},
|
||||
};
|
||||
use pageserver_client::mgmt_api;
|
||||
use utils::{
|
||||
completion::Barrier,
|
||||
generation::Generation,
|
||||
http::error::ApiError,
|
||||
id::{NodeId, TenantId},
|
||||
id::{NodeId, TenantId, TimelineId},
|
||||
seqwait::SeqWait,
|
||||
};
|
||||
|
||||
use crate::{
|
||||
compute_hook::ComputeHook,
|
||||
node::Node,
|
||||
persistence::{Persistence, TenantShardPersistence},
|
||||
persistence::{DatabaseError, NodePersistence, Persistence, TenantShardPersistence},
|
||||
scheduler::Scheduler,
|
||||
tenant_state::{
|
||||
IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError,
|
||||
@@ -46,6 +49,10 @@ use crate::{
|
||||
|
||||
const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
|
||||
|
||||
/// How long [`Service::startup_reconcile`] is allowed to take before it should give
/// up on unresponsive pageservers and proceed.
pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
|
||||
|
||||
// Top level state available to all HTTP handlers
|
||||
struct ServiceState {
|
||||
tenants: BTreeMap<TenantShardId, TenantState>,
|
||||
@@ -79,10 +86,27 @@ pub struct Config {
|
||||
pub jwt_token: Option<String>,
|
||||
}
|
||||
|
||||
impl From<DatabaseError> for ApiError {
|
||||
fn from(err: DatabaseError) -> ApiError {
|
||||
match err {
|
||||
DatabaseError::Query(e) => ApiError::InternalServerError(e.into()),
|
||||
// FIXME: ApiError doesn't have an Unavailable variant, but ShuttingDown maps to 503.
|
||||
DatabaseError::Connection(_e) => ApiError::ShuttingDown,
|
||||
DatabaseError::Logical(reason) => {
|
||||
ApiError::InternalServerError(anyhow::anyhow!(reason))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Service {
|
||||
inner: Arc<std::sync::RwLock<ServiceState>>,
|
||||
config: Config,
|
||||
persistence: Arc<Persistence>,
|
||||
|
||||
/// This waits for initial reconciliation with pageservers to complete. Until this barrier
|
||||
/// passes, it isn't safe to do any actions that mutate tenants.
|
||||
pub(crate) startup_complete: Barrier,
|
||||
}
|
||||
|
||||
impl From<ReconcileWaitError> for ApiError {
|
||||
@@ -96,77 +120,32 @@ impl From<ReconcileWaitError> for ApiError {
|
||||
}
|
||||
|
||||
impl Service {
|
||||
pub async fn spawn(config: Config, persistence: Arc<Persistence>) -> anyhow::Result<Arc<Self>> {
|
||||
let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel();
|
||||
|
||||
tracing::info!("Loading nodes from database...");
|
||||
let mut nodes = persistence.list_nodes().await?;
|
||||
tracing::info!("Loaded {} nodes from database.", nodes.len());
|
||||
|
||||
tracing::info!("Loading shards from database...");
|
||||
let tenant_shard_persistence = persistence.list_tenant_shards().await?;
|
||||
tracing::info!(
|
||||
"Loaded {} shards from database.",
|
||||
tenant_shard_persistence.len()
|
||||
);
|
||||
|
||||
let mut tenants = BTreeMap::new();
|
||||
|
||||
for tsp in tenant_shard_persistence {
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
|
||||
shard_number: ShardNumber(tsp.shard_number as u8),
|
||||
shard_count: ShardCount(tsp.shard_count as u8),
|
||||
};
|
||||
let shard_identity = if tsp.shard_count == 0 {
|
||||
ShardIdentity::unsharded()
|
||||
} else {
|
||||
ShardIdentity::new(
|
||||
ShardNumber(tsp.shard_number as u8),
|
||||
ShardCount(tsp.shard_count as u8),
|
||||
ShardStripeSize(tsp.shard_stripe_size as u32),
|
||||
)?
|
||||
};
|
||||
let new_tenant = TenantState {
|
||||
tenant_shard_id,
|
||||
shard: shard_identity,
|
||||
sequence: Sequence::initial(),
|
||||
// Note that we load generation, but don't care about generation_pageserver. We will either end up finding
|
||||
// our existing attached location and it will match generation_pageserver, or we will attach somewhere new
|
||||
// and update generation_pageserver in the process.
|
||||
generation: Generation::new(tsp.generation),
|
||||
policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
|
||||
intent: IntentState::new(),
|
||||
observed: ObservedState::new(),
|
||||
config: serde_json::from_str(&tsp.config).unwrap(),
|
||||
reconciler: None,
|
||||
waiter: Arc::new(SeqWait::new(Sequence::initial())),
|
||||
error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
|
||||
last_error: Arc::default(),
|
||||
};
|
||||
|
||||
tenants.insert(tenant_shard_id, new_tenant);
|
||||
}
|
||||
pub fn get_config(&self) -> &Config {
|
||||
&self.config
|
||||
}
|
||||
|
||||
/// TODO: don't allow other API calls until this is done, don't start doing any background housekeeping
|
||||
/// until this is done.
|
||||
async fn startup_reconcile(&self) {
|
||||
// For all tenant shards, a vector of observed states on nodes (where None means
|
||||
// indeterminate, same as in [`ObservedStateLocation`])
|
||||
let mut observed = HashMap::new();
|
||||
|
||||
let nodes = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
locked.nodes.clone()
|
||||
};
|
||||
|
||||
// TODO: issue these requests concurrently
|
||||
for node in &mut nodes {
|
||||
let client = mgmt_api::Client::new(node.base_url(), config.jwt_token.as_deref());
|
||||
for node in nodes.values() {
|
||||
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
|
||||
|
||||
tracing::info!("Scanning shards on node {}...", node.id);
|
||||
match client.list_location_config().await {
|
||||
Err(e) => {
|
||||
tracing::warn!("Could not contact pageserver {} ({e})", node.id);
|
||||
// TODO: be more tolerant, apply a generous 5-10 second timeout
|
||||
// TODO: setting a node to Offline is a dramatic thing to do, and can
|
||||
// prevent neon_local from starting up (it starts this service before
|
||||
// any pageservers are running). It may make sense to give nodes
|
||||
// a Pending state to accomodate this situation, and allow (but deprioritize)
|
||||
// scheduling on Pending nodes.
|
||||
//node.availability = NodeAvailability::Offline;
|
||||
// TODO: be more tolerant, apply a generous 5-10 second timeout with retries, in case
|
||||
// pageserver is being restarted at the same time as we are
|
||||
}
|
||||
Ok(listing) => {
|
||||
tracing::info!(
|
||||
@@ -174,7 +153,6 @@ impl Service {
|
||||
listing.tenant_shards.len(),
|
||||
node.id
|
||||
);
|
||||
node.availability = NodeAvailability::Active;
|
||||
|
||||
for (tenant_shard_id, conf_opt) in listing.tenant_shards {
|
||||
observed.insert(tenant_shard_id, (node.id, conf_opt));
|
||||
@@ -186,41 +164,46 @@ impl Service {
|
||||
let mut cleanup = Vec::new();
|
||||
|
||||
// Populate intent and observed states for all tenants, based on reported state on pageservers
|
||||
for (tenant_shard_id, (node_id, observed_loc)) in observed {
|
||||
let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else {
|
||||
cleanup.push((tenant_shard_id, node_id));
|
||||
continue;
|
||||
};
|
||||
let shard_count = {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
for (tenant_shard_id, (node_id, observed_loc)) in observed {
|
||||
let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else {
|
||||
cleanup.push((tenant_shard_id, node_id));
|
||||
continue;
|
||||
};
|
||||
|
||||
tenant_state
|
||||
.observed
|
||||
.locations
|
||||
.insert(node_id, ObservedStateLocation { conf: observed_loc });
|
||||
}
|
||||
|
||||
// State of nodes is now frozen, transform to a HashMap.
|
||||
let mut nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.id, n)).collect();
|
||||
|
||||
// Populate each tenant's intent state
|
||||
let mut scheduler = Scheduler::new(&tenants, &nodes);
|
||||
for (tenant_shard_id, tenant_state) in tenants.iter_mut() {
|
||||
tenant_state.intent_from_observed();
|
||||
if let Err(e) = tenant_state.schedule(&mut scheduler) {
|
||||
// Non-fatal error: we are unable to properly schedule the tenant, perhaps because
|
||||
// not enough pageservers are available. The tenant may well still be available
|
||||
// to clients.
|
||||
tracing::error!("Failed to schedule tenant {tenant_shard_id} at startup: {e}");
|
||||
tenant_state
|
||||
.observed
|
||||
.locations
|
||||
.insert(node_id, ObservedStateLocation { conf: observed_loc });
|
||||
}
|
||||
}
|
||||
|
||||
// Populate each tenant's intent state
|
||||
let mut scheduler = Scheduler::new(&locked.tenants, &nodes);
|
||||
for (tenant_shard_id, tenant_state) in locked.tenants.iter_mut() {
|
||||
tenant_state.intent_from_observed();
|
||||
if let Err(e) = tenant_state.schedule(&mut scheduler) {
|
||||
// Non-fatal error: we are unable to properly schedule the tenant, perhaps because
|
||||
// not enough pageservers are available. The tenant may well still be available
|
||||
// to clients.
|
||||
tracing::error!("Failed to schedule tenant {tenant_shard_id} at startup: {e}");
|
||||
}
|
||||
}
|
||||
|
||||
locked.tenants.len()
|
||||
};
|
||||
|
||||
// TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that
|
||||
// generation_pageserver in the database.
|
||||
|
||||
// Clean up any tenants that were found on pageservers but are not known to us.
|
||||
for (tenant_shard_id, node_id) in cleanup {
|
||||
// A node reported a tenant_shard_id which is unknown to us: detach it.
|
||||
let node = nodes
|
||||
.get_mut(&node_id)
|
||||
.get(&node_id)
|
||||
.expect("Always exists: only known nodes are scanned");
|
||||
|
||||
let client = mgmt_api::Client::new(node.base_url(), config.jwt_token.as_deref());
|
||||
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
|
||||
match client
|
||||
.location_config(
|
||||
tenant_shard_id,
|
||||
@@ -252,13 +235,80 @@ impl Service {
|
||||
}
|
||||
}
|
||||
|
||||
let shard_count = tenants.len();
|
||||
// Finally, now that the service is up and running, launch reconcile operations for any tenants
|
||||
// which require it: under normal circumstances this should only include tenants that were in some
|
||||
// transient state before we restarted.
|
||||
let reconcile_tasks = self.reconcile_all();
|
||||
tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)");
|
||||
}
|
||||
|
||||
pub async fn spawn(config: Config, persistence: Arc<Persistence>) -> anyhow::Result<Arc<Self>> {
|
||||
let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel();
|
||||
|
||||
tracing::info!("Loading nodes from database...");
|
||||
let nodes = persistence.list_nodes().await?;
|
||||
let nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.id, n)).collect();
|
||||
tracing::info!("Loaded {} nodes from database.", nodes.len());
|
||||
|
||||
tracing::info!("Loading shards from database...");
|
||||
let tenant_shard_persistence = persistence.list_tenant_shards().await?;
|
||||
tracing::info!(
|
||||
"Loaded {} shards from database.",
|
||||
tenant_shard_persistence.len()
|
||||
);
|
||||
|
||||
let mut tenants = BTreeMap::new();
|
||||
|
||||
for tsp in tenant_shard_persistence {
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
|
||||
shard_number: ShardNumber(tsp.shard_number as u8),
|
||||
shard_count: ShardCount(tsp.shard_count as u8),
|
||||
};
|
||||
let shard_identity = if tsp.shard_count == 0 {
|
||||
ShardIdentity::unsharded()
|
||||
} else {
|
||||
ShardIdentity::new(
|
||||
ShardNumber(tsp.shard_number as u8),
|
||||
ShardCount(tsp.shard_count as u8),
|
||||
ShardStripeSize(tsp.shard_stripe_size as u32),
|
||||
)?
|
||||
};
|
||||
|
||||
// We will populate intent properly later in [`Self::startup_reconcile`], initially populate
|
||||
// it with what we can infer: the node for which a generation was most recently issued.
|
||||
let mut intent = IntentState::new();
|
||||
if tsp.generation_pageserver != i64::MAX {
|
||||
intent.attached = Some(NodeId(tsp.generation_pageserver as u64))
|
||||
}
|
||||
|
||||
let new_tenant = TenantState {
|
||||
tenant_shard_id,
|
||||
shard: shard_identity,
|
||||
sequence: Sequence::initial(),
|
||||
generation: Generation::new(tsp.generation as u32),
|
||||
policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
|
||||
intent,
|
||||
observed: ObservedState::new(),
|
||||
config: serde_json::from_str(&tsp.config).unwrap(),
|
||||
reconciler: None,
|
||||
waiter: Arc::new(SeqWait::new(Sequence::initial())),
|
||||
error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
|
||||
last_error: Arc::default(),
|
||||
};
|
||||
|
||||
tenants.insert(tenant_shard_id, new_tenant);
|
||||
}
|
||||
|
||||
let (startup_completion, startup_complete) = utils::completion::channel();
|
||||
|
||||
let this = Arc::new(Self {
|
||||
inner: Arc::new(std::sync::RwLock::new(ServiceState::new(
|
||||
result_tx, nodes, tenants,
|
||||
))),
|
||||
config,
|
||||
persistence,
|
||||
startup_complete,
|
||||
});
|
||||
|
||||
let result_task_this = this.clone();
|
||||
@@ -316,11 +366,13 @@ impl Service {
|
||||
}
|
||||
});
|
||||
|
||||
// Finally, now that the service is up and running, launch reconcile operations for any tenants
|
||||
// which require it: under normal circumstances this should only include tenants that were in some
|
||||
// transient state before we restarted.
|
||||
let reconcile_tasks = this.reconcile_all();
|
||||
tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)");
|
||||
let startup_reconcile_this = this.clone();
|
||||
tokio::task::spawn(async move {
|
||||
// Block the [`Service::startup_complete`] barrier until we're done
|
||||
let _completion = startup_completion;
|
||||
|
||||
startup_reconcile_this.startup_reconcile().await
|
||||
});
|
||||
|
||||
Ok(this)
|
||||
}
|
||||
@@ -336,7 +388,6 @@ impl Service {
|
||||
let locked = self.inner.write().unwrap();
|
||||
!locked.tenants.contains_key(&attach_req.tenant_shard_id)
|
||||
};
|
||||
|
||||
if insert {
|
||||
let tsp = TenantShardPersistence {
|
||||
tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(),
|
||||
@@ -344,22 +395,39 @@ impl Service {
|
||||
shard_count: attach_req.tenant_shard_id.shard_count.0 as i32,
|
||||
shard_stripe_size: 0,
|
||||
generation: 0,
|
||||
generation_pageserver: None,
|
||||
generation_pageserver: i64::MAX,
|
||||
placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(),
|
||||
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
|
||||
};
|
||||
|
||||
self.persistence.insert_tenant_shards(vec![tsp]).await?;
|
||||
match self.persistence.insert_tenant_shards(vec![tsp]).await {
|
||||
Err(e) => match e {
|
||||
DatabaseError::Query(diesel::result::Error::DatabaseError(
|
||||
DatabaseErrorKind::UniqueViolation,
|
||||
_,
|
||||
)) => {
|
||||
tracing::info!(
|
||||
"Raced with another request to insert tenant {}",
|
||||
attach_req.tenant_shard_id
|
||||
)
|
||||
}
|
||||
_ => return Err(e.into()),
|
||||
},
|
||||
Ok(()) => {
|
||||
tracing::info!("Inserted shard {} in database", attach_req.tenant_shard_id);
|
||||
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
locked.tenants.insert(
|
||||
attach_req.tenant_shard_id,
|
||||
TenantState::new(
|
||||
attach_req.tenant_shard_id,
|
||||
ShardIdentity::unsharded(),
|
||||
PlacementPolicy::Single,
|
||||
),
|
||||
);
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
locked.tenants.insert(
|
||||
attach_req.tenant_shard_id,
|
||||
TenantState::new(
|
||||
attach_req.tenant_shard_id,
|
||||
ShardIdentity::unsharded(),
|
||||
PlacementPolicy::Single,
|
||||
),
|
||||
);
|
||||
tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let new_generation = if let Some(req_node_id) = attach_req.node_id {
|
||||
@@ -381,6 +449,11 @@ impl Service {
|
||||
|
||||
if let Some(new_generation) = new_generation {
|
||||
tenant_state.generation = new_generation;
|
||||
} else {
|
||||
// This is a detach notification. We must update placement policy to avoid re-attaching
|
||||
// during background scheduling/reconciliation, or during attachment service restart.
|
||||
assert!(attach_req.node_id.is_none());
|
||||
tenant_state.policy = PlacementPolicy::Detached;
|
||||
}
|
||||
|
||||
if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
|
||||
@@ -501,6 +574,14 @@ impl Service {
|
||||
id: req_tenant.id,
|
||||
valid,
|
||||
});
|
||||
} else {
|
||||
// After tenant deletion, we may approve any validation. This avoids
|
||||
// spurious warnings on the pageserver if it has pending LSN updates
|
||||
// at the point a deletion happens.
|
||||
response.tenants.push(ValidateResponseTenant {
|
||||
id: req_tenant.id,
|
||||
valid: true,
|
||||
});
|
||||
}
|
||||
}
|
||||
response
|
||||
@@ -555,8 +636,8 @@ impl Service {
|
||||
shard_number: tenant_shard_id.shard_number.0 as i32,
|
||||
shard_count: tenant_shard_id.shard_count.0 as i32,
|
||||
shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32,
|
||||
generation: 0,
|
||||
generation_pageserver: None,
|
||||
generation: create_req.generation.map(|g| g as i32).unwrap_or(0),
|
||||
generation_pageserver: i64::MAX,
|
||||
placement_policy: serde_json::to_string(&placement_policy).unwrap(),
|
||||
config: serde_json::to_string(&create_req.config).unwrap(),
|
||||
})
|
||||
@@ -597,6 +678,7 @@ impl Service {
|
||||
})?;
|
||||
|
||||
response_shards.push(TenantCreateResponseShard {
|
||||
shard_id: tenant_shard_id,
|
||||
node_id: entry
|
||||
.get()
|
||||
.intent
|
||||
@@ -629,6 +711,7 @@ impl Service {
|
||||
})?;
|
||||
|
||||
response_shards.push(TenantCreateResponseShard {
|
||||
shard_id: tenant_shard_id,
|
||||
node_id: state
|
||||
.intent
|
||||
.attached
|
||||
@@ -662,14 +745,257 @@ impl Service {
|
||||
(waiters, response_shards)
|
||||
};
|
||||
|
||||
let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap();
|
||||
self.await_waiters(waiters).await?;
|
||||
|
||||
Ok(TenantCreateResponse {
|
||||
shards: response_shards,
|
||||
})
|
||||
}
|
||||
|
||||
/// Helper for functions that reconcile a number of shards, and would like to do a timeout-bounded
|
||||
/// wait for reconciliation to complete before responding.
|
||||
async fn await_waiters(
|
||||
&self,
|
||||
waiters: Vec<ReconcilerWaiter>,
|
||||
) -> Result<(), ReconcileWaitError> {
|
||||
let deadline = Instant::now().checked_add(Duration::from_secs(30)).unwrap();
|
||||
for waiter in waiters {
|
||||
let timeout = deadline.duration_since(Instant::now());
|
||||
waiter.wait_timeout(timeout).await?;
|
||||
}
|
||||
Ok(TenantCreateResponse {
|
||||
shards: response_shards,
|
||||
})
|
||||
|
||||
Ok(())
|
||||
}
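    // A minimal caller sketch (the surrounding locals such as `locked`, `pageservers`,
    // `compute_hook` and `tenant_id` are assumed context, not part of this change):
    // collect one waiter per shard that kicked off a reconcile, then bound the total
    // wait with a single deadline via await_waiters.
    //
    //     let mut waiters = Vec::new();
    //     for (_shard_id, shard) in locked
    //         .tenants
    //         .range_mut(TenantShardId::tenant_range(tenant_id))
    //     {
    //         if let Some(waiter) = shard.maybe_reconcile(
    //             result_tx.clone(),
    //             &pageservers,
    //             &compute_hook,
    //             &self.config,
    //             &self.persistence,
    //         ) {
    //             waiters.push(waiter);
    //         }
    //     }
    //     self.await_waiters(waiters).await?;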
|
||||
|
||||
/// This API is used by the cloud control plane to do coarse-grained control of tenants:
|
||||
/// - Call with mode Attached* to upsert the tenant.
|
||||
/// - Call with mode Detached to switch to PolicyMode::Detached
|
||||
///
|
||||
/// In future, calling with mode Secondary may switch to a detach-lite mode in which a tenant only has
|
||||
/// secondary locations.
|
||||
pub(crate) async fn tenant_location_config(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
req: TenantLocationConfigRequest,
|
||||
) -> Result<TenantLocationConfigResponse, ApiError> {
|
||||
if req.tenant_id.shard_count.0 > 1 {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"This API is for importing single-sharded or unsharded tenants"
|
||||
)));
|
||||
}
|
||||
|
||||
let mut waiters = Vec::new();
|
||||
let mut result = TenantLocationConfigResponse { shards: Vec::new() };
|
||||
let maybe_create = {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let result_tx = locked.result_tx.clone();
|
||||
let compute_hook = locked.compute_hook.clone();
|
||||
let pageservers = locked.nodes.clone();
|
||||
|
||||
let mut scheduler = Scheduler::new(&locked.tenants, &locked.nodes);
|
||||
|
||||
// Maybe we have existing shards
|
||||
let mut create = true;
|
||||
for (shard_id, shard) in locked
|
||||
.tenants
|
||||
.range_mut(TenantShardId::tenant_range(tenant_id))
|
||||
{
|
||||
// Saw an existing shard: this is not a creation
|
||||
create = false;
|
||||
|
||||
// Note that for existing tenants we do _not_ respect the generation in the request: this is likely
|
||||
// to be stale. Once a tenant is created in this service, our view of generation is authoritative, and
|
||||
// callers' generations may be ignored. This represents a one-way migration of tenants from the outer
|
||||
// cloud control plane into this service.
|
||||
|
||||
// Use location config mode as an indicator of policy: if they ask for
|
||||
// attached we go to default HA attached mode. If they ask for secondary
|
||||
// we go to secondary-only mode. If they ask for detached we detach.
|
||||
match req.config.mode {
|
||||
LocationConfigMode::Detached => {
|
||||
shard.policy = PlacementPolicy::Detached;
|
||||
}
|
||||
LocationConfigMode::Secondary => {
|
||||
// TODO: implement secondary-only mode.
|
||||
todo!();
|
||||
}
|
||||
LocationConfigMode::AttachedMulti
|
||||
| LocationConfigMode::AttachedSingle
|
||||
| LocationConfigMode::AttachedStale => {
|
||||
// TODO: persistence for changes in policy
|
||||
if pageservers.len() > 1 {
|
||||
shard.policy = PlacementPolicy::Double(1)
|
||||
} else {
|
||||
// Convenience for dev/test: if we just have one pageserver, import
|
||||
// tenants into Single mode so that scheduling will succeed.
|
||||
shard.policy = PlacementPolicy::Single
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
shard.schedule(&mut scheduler)?;
|
||||
|
||||
let maybe_waiter = shard.maybe_reconcile(
|
||||
result_tx.clone(),
|
||||
&pageservers,
|
||||
&compute_hook,
|
||||
&self.config,
|
||||
&self.persistence,
|
||||
);
|
||||
if let Some(waiter) = maybe_waiter {
|
||||
waiters.push(waiter);
|
||||
}
|
||||
|
||||
if let Some(node_id) = shard.intent.attached {
|
||||
result.shards.push(TenantShardLocation {
|
||||
shard_id: *shard_id,
|
||||
node_id,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
if create {
|
||||
// Validate request mode
|
||||
match req.config.mode {
|
||||
LocationConfigMode::Detached | LocationConfigMode::Secondary => {
|
||||
// When using this API to onboard an existing tenant to this service, it must start in
|
||||
// an attached state, because we need the request to come with a generation
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Imported tenant must be in attached mode"
|
||||
)));
|
||||
}
|
||||
|
||||
LocationConfigMode::AttachedMulti
|
||||
| LocationConfigMode::AttachedSingle
|
||||
| LocationConfigMode::AttachedStale => {
|
||||
// Pass
|
||||
}
|
||||
}
|
||||
|
||||
// Validate request generation
|
||||
let Some(generation) = req.config.generation else {
|
||||
// We can only import attached tenants, because we need the request to come with a generation
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Generation is mandatory when importing tenant"
|
||||
)));
|
||||
};
|
||||
|
||||
// Synthesize a creation request
|
||||
Some(TenantCreateRequest {
|
||||
new_tenant_id: TenantShardId::unsharded(tenant_id),
|
||||
generation: Some(generation),
|
||||
shard_parameters: ShardParameters {
|
||||
// Must preserve the incoming shard_count to distinguish unsharded (0)
|
||||
// from single-sharded (1): this distinction appears in the S3 keys of the tenant.
|
||||
count: req.tenant_id.shard_count,
|
||||
// We only import un-sharded or single-sharded tenants, so stripe
|
||||
// size can be made up arbitrarily here.
|
||||
stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
},
|
||||
config: req.config.tenant_conf,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(create_req) = maybe_create {
|
||||
let create_resp = self.tenant_create(create_req).await?;
|
||||
result.shards = create_resp
|
||||
.shards
|
||||
.into_iter()
|
||||
.map(|s| TenantShardLocation {
|
||||
node_id: s.node_id,
|
||||
shard_id: s.shard_id,
|
||||
})
|
||||
.collect();
|
||||
} else {
|
||||
// This was an update, wait for reconciliation
|
||||
self.await_waiters(waiters).await?;
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result<StatusCode, ApiError> {
|
||||
// TODO: refactor into helper
|
||||
let targets = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let mut targets = Vec::new();
|
||||
|
||||
for (tenant_shard_id, shard) in
|
||||
locked.tenants.range(TenantShardId::tenant_range(tenant_id))
|
||||
{
|
||||
let node_id = shard.intent.attached.ok_or_else(|| {
|
||||
ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
|
||||
})?;
|
||||
let node = locked
|
||||
.nodes
|
||||
.get(&node_id)
|
||||
.expect("Pageservers may not be deleted while referenced");
|
||||
|
||||
targets.push((*tenant_shard_id, node.clone()));
|
||||
}
|
||||
targets
|
||||
};
|
||||
|
||||
// TODO: error out if the tenant is not attached anywhere.
|
||||
|
||||
// Phase 1: delete on the pageservers
|
||||
let mut any_pending = false;
|
||||
for (tenant_shard_id, node) in targets {
|
||||
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
|
||||
// TODO: this, like many other places, requires proper retry handling for 503, timeout: those should not
|
||||
// surface immediately as an error to our caller.
|
||||
let status = client.tenant_delete(tenant_shard_id).await.map_err(|e| {
|
||||
ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Error deleting shard {tenant_shard_id} on node {}: {e}",
|
||||
node.id
|
||||
))
|
||||
})?;
|
||||
tracing::info!(
|
||||
"Shard {tenant_shard_id} on node {}, delete returned {}",
|
||||
node.id,
|
||||
status
|
||||
);
|
||||
if status == StatusCode::ACCEPTED {
|
||||
any_pending = true;
|
||||
}
|
||||
}
|
||||
|
||||
if any_pending {
|
||||
// Caller should call us again later. When we eventually see 404s from
|
||||
// all the shards, we may proceed to delete our records of the tenant.
|
||||
tracing::info!(
|
||||
"Tenant {} has some shards pending deletion, returning 202",
|
||||
tenant_id
|
||||
);
|
||||
return Ok(StatusCode::ACCEPTED);
|
||||
}
|
||||
|
||||
// Fall through: deletion of the tenant on pageservers is complete, we may proceed to drop
|
||||
// our in-memory state and database state.
|
||||
|
||||
// Ordering: we delete persistent state first: if we then
|
||||
// crash, we will drop the in-memory state.
|
||||
|
||||
// Drop persistent state.
|
||||
self.persistence.delete_tenant(tenant_id).await?;
|
||||
|
||||
// Drop in-memory state
|
||||
{
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
locked
|
||||
.tenants
|
||||
.retain(|tenant_shard_id, _shard| tenant_shard_id.tenant_id != tenant_id);
|
||||
tracing::info!(
|
||||
"Deleted tenant {tenant_id}, now have {} tenants",
|
||||
locked.tenants.len()
|
||||
);
|
||||
};
|
||||
|
||||
// Success is represented as 404, to imitate the existing pageserver deletion API
|
||||
Ok(StatusCode::NOT_FOUND)
|
||||
}
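    // Hedged caller sketch (the `service` handle and the retry interval are assumptions,
    // not part of this change): because deletion returns 202 while shards are still being
    // torn down and 404 once everything is gone, callers are expected to poll.
    //
    //     loop {
    //         let status = service.tenant_delete(tenant_id).await?;
    //         if status == StatusCode::NOT_FOUND {
    //             break; // fully deleted; 404 imitates the pageserver deletion API
    //         }
    //         debug_assert_eq!(status, StatusCode::ACCEPTED); // some shards still pending
    //         tokio::time::sleep(std::time::Duration::from_secs(1)).await;
    //     }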
|
||||
|
||||
pub(crate) async fn tenant_timeline_create(
|
||||
@@ -679,25 +1005,15 @@ impl Service {
|
||||
) -> Result<TimelineInfo, ApiError> {
|
||||
let mut timeline_info = None;
|
||||
|
||||
let ensure_waiters = {
|
||||
let locked = self.inner.write().unwrap();
|
||||
tracing::info!(
|
||||
"Creating timeline {}/{}, have {} pageservers",
|
||||
tenant_id,
|
||||
create_req.new_timeline_id,
|
||||
locked.nodes.len()
|
||||
);
|
||||
tracing::info!(
|
||||
"Creating timeline {}/{}",
|
||||
tenant_id,
|
||||
create_req.new_timeline_id,
|
||||
);
|
||||
|
||||
self.ensure_attached(locked, tenant_id)
|
||||
.map_err(ApiError::InternalServerError)?
|
||||
};
|
||||
|
||||
let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap();
|
||||
for waiter in ensure_waiters {
|
||||
let timeout = deadline.duration_since(Instant::now());
|
||||
waiter.wait_timeout(timeout).await?;
|
||||
}
|
||||
self.ensure_attached_wait(tenant_id).await?;
|
||||
|
||||
// TODO: refuse to do this if shard splitting is in progress
|
||||
let targets = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let mut targets = Vec::new();
|
||||
@@ -768,6 +1084,111 @@ impl Service {
|
||||
Ok(timeline_info.expect("targets cannot be empty"))
|
||||
}
|
||||
|
||||
pub(crate) async fn tenant_timeline_delete(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<StatusCode, ApiError> {
|
||||
tracing::info!("Deleting timeline {}/{}", tenant_id, timeline_id,);
|
||||
|
||||
self.ensure_attached_wait(tenant_id).await?;
|
||||
|
||||
// TODO: refuse to do this if shard splitting is in progress
|
||||
let targets = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let mut targets = Vec::new();
|
||||
|
||||
for (tenant_shard_id, shard) in
|
||||
locked.tenants.range(TenantShardId::tenant_range(tenant_id))
|
||||
{
|
||||
let node_id = shard.intent.attached.ok_or_else(|| {
|
||||
ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
|
||||
})?;
|
||||
let node = locked
|
||||
.nodes
|
||||
.get(&node_id)
|
||||
.expect("Pageservers may not be deleted while referenced");
|
||||
|
||||
targets.push((*tenant_shard_id, node.clone()));
|
||||
}
|
||||
targets
|
||||
};
|
||||
|
||||
if targets.is_empty() {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Tenant not found").into(),
|
||||
));
|
||||
}
|
||||
|
||||
// TODO: call into shards concurrently
|
||||
let mut any_pending = false;
|
||||
for (tenant_shard_id, node) in targets {
|
||||
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
|
||||
|
||||
tracing::info!(
|
||||
"Deleting timeline on shard {}/{}, attached to node {}",
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
node.id
|
||||
);
|
||||
|
||||
let status = client
|
||||
.timeline_delete(tenant_shard_id, timeline_id)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Error deleting timeline {timeline_id} on {tenant_shard_id} on node {}: {e}",
|
||||
node.id
|
||||
))
|
||||
})?;
|
||||
|
||||
if status == StatusCode::ACCEPTED {
|
||||
any_pending = true;
|
||||
}
|
||||
}
|
||||
|
||||
if any_pending {
|
||||
Ok(StatusCode::ACCEPTED)
|
||||
} else {
|
||||
Ok(StatusCode::NOT_FOUND)
|
||||
}
|
||||
}
|
||||
|
||||
/// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this
|
||||
/// function looks it up and returns the url. If the tenant isn't found, returns Err(ApiError::NotFound)
|
||||
pub(crate) fn tenant_shard0_baseurl(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(String, TenantShardId), ApiError> {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let Some((tenant_shard_id, shard)) = locked
|
||||
.tenants
|
||||
.range(TenantShardId::tenant_range(tenant_id))
|
||||
.next()
|
||||
else {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Tenant {tenant_id} not found").into(),
|
||||
));
|
||||
};
|
||||
|
||||
// TODO: should use the ID last published to compute_hook, rather than the intent: the intent might
|
||||
// point to somewhere we haven't attached yet.
|
||||
let Some(node_id) = shard.intent.attached else {
|
||||
return Err(ApiError::Conflict(
|
||||
"Cannot call timeline API on non-attached tenant".to_string(),
|
||||
));
|
||||
};
|
||||
|
||||
let Some(node) = locked.nodes.get(&node_id) else {
|
||||
// This should never happen
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Shard refers to nonexistent node"
|
||||
)));
|
||||
};
|
||||
|
||||
Ok((node.base_url(), *tenant_shard_id))
|
||||
}
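    // Hedged usage sketch (the handler shape and the pageserver path below are
    // illustrative assumptions, not taken from this change): resolve shard 0's
    // pageserver and forward a timeline request to it.
    //
    //     let (base_url, tenant_shard_id) = service.tenant_shard0_baseurl(tenant_id)?;
    //     let url = format!("{base_url}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}");
    //     let resp = reqwest::Client::new().get(url).send().await?;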
|
||||
|
||||
pub(crate) fn tenant_locate(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
@@ -870,7 +1291,6 @@ impl Service {
|
||||
} else {
|
||||
let old_attached = shard.intent.attached;
|
||||
|
||||
shard.intent.attached = Some(migrate_req.node_id);
|
||||
match shard.policy {
|
||||
PlacementPolicy::Single => {
|
||||
shard.intent.secondary.clear();
|
||||
@@ -884,7 +1304,13 @@ impl Service {
|
||||
shard.intent.secondary.push(old_attached);
|
||||
}
|
||||
}
|
||||
PlacementPolicy::Detached => {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Cannot migrate a tenant that is PlacementPolicy::Detached: configure it to an attached policy first"
|
||||
)))
|
||||
}
|
||||
}
|
||||
shard.intent.attached = Some(migrate_req.node_id);
|
||||
|
||||
tracing::info!("Migrating: new intent {:?}", shard.intent);
|
||||
shard.sequence = shard.sequence.next();
|
||||
@@ -908,6 +1334,20 @@ impl Service {
|
||||
Ok(TenantShardMigrateResponse {})
|
||||
}
|
||||
|
||||
pub(crate) async fn node_list(&self) -> Result<Vec<NodePersistence>, ApiError> {
|
||||
// It is convenient to avoid taking the big lock and converting Node to a serializable
|
||||
// structure, by fetching from storage instead of reading in-memory state.
|
||||
let nodes = self
|
||||
.persistence
|
||||
.list_nodes()
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|n| n.to_persistent())
|
||||
.collect();
|
||||
|
||||
Ok(nodes)
|
||||
}
|
||||
|
||||
pub(crate) async fn node_register(
|
||||
&self,
|
||||
register_req: NodeRegisterRequest,
|
||||
@@ -957,10 +1397,7 @@ impl Service {
|
||||
availability: NodeAvailability::Active,
|
||||
};
|
||||
// TODO: idempotency if the node already exists in the database
|
||||
self.persistence
|
||||
.insert_node(&new_node)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
self.persistence.insert_node(&new_node).await?;
|
||||
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let mut new_nodes = (*locked.nodes).clone();
|
||||
@@ -1084,7 +1521,7 @@ impl Service {
|
||||
/// Helper for methods that will try and call pageserver APIs for
|
||||
/// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
|
||||
/// is attached somewhere.
|
||||
fn ensure_attached(
|
||||
fn ensure_attached_schedule(
|
||||
&self,
|
||||
mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>,
|
||||
tenant_id: TenantId,
|
||||
@@ -1114,6 +1551,23 @@ impl Service {
|
||||
Ok(waiters)
|
||||
}
|
||||
|
||||
async fn ensure_attached_wait(&self, tenant_id: TenantId) -> Result<(), ApiError> {
|
||||
let ensure_waiters = {
|
||||
let locked = self.inner.write().unwrap();
|
||||
|
||||
self.ensure_attached_schedule(locked, tenant_id)
|
||||
.map_err(ApiError::InternalServerError)?
|
||||
};
|
||||
|
||||
let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap();
|
||||
for waiter in ensure_waiters {
|
||||
let timeout = deadline.duration_since(Instant::now());
|
||||
waiter.wait_timeout(timeout).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Check all tenants for pending reconciliation work, and reconcile those in need
|
||||
///
|
||||
/// Returns how many reconciliation tasks were started
|
||||
|
||||
@@ -312,6 +312,18 @@ impl TenantState {
|
||||
modified = true;
|
||||
}
|
||||
}
|
||||
Detached => {
|
||||
// Should have no attached or secondary pageservers
|
||||
if self.intent.attached.is_some() {
|
||||
self.intent.attached = None;
|
||||
modified = true;
|
||||
}
|
||||
|
||||
if !self.intent.secondary.is_empty() {
|
||||
self.intent.secondary.clear();
|
||||
modified = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if modified {
|
||||
|
||||
@@ -1,5 +1,11 @@
|
||||
use crate::{background_process, local_env::LocalEnv};
|
||||
use camino::Utf8PathBuf;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use diesel::{
|
||||
backend::Backend,
|
||||
query_builder::{AstPass, QueryFragment, QueryId},
|
||||
Connection, PgConnection, QueryResult, RunQueryDsl,
|
||||
};
|
||||
use diesel_migrations::{HarnessWithOutput, MigrationHarness};
|
||||
use hyper::Method;
|
||||
use pageserver_api::{
|
||||
models::{ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo},
|
||||
@@ -7,10 +13,11 @@ use pageserver_api::{
|
||||
};
|
||||
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
|
||||
use postgres_backend::AuthType;
|
||||
use postgres_connection::parse_host_port;
|
||||
use serde::{de::DeserializeOwned, Deserialize, Serialize};
|
||||
use std::{path::PathBuf, process::Child, str::FromStr};
|
||||
use std::{env, str::FromStr};
|
||||
use tokio::process::Command;
|
||||
use tracing::instrument;
|
||||
use url::Url;
|
||||
use utils::{
|
||||
auth::{Claims, Scope},
|
||||
id::{NodeId, TenantId},
|
||||
@@ -19,14 +26,17 @@ use utils::{
|
||||
pub struct AttachmentService {
|
||||
env: LocalEnv,
|
||||
listen: String,
|
||||
path: PathBuf,
|
||||
path: Utf8PathBuf,
|
||||
jwt_token: Option<String>,
|
||||
public_key_path: Option<Utf8PathBuf>,
|
||||
postgres_port: u16,
|
||||
client: reqwest::Client,
|
||||
}
|
||||
|
||||
const COMMAND: &str = "attachment_service";
|
||||
|
||||
const ATTACHMENT_SERVICE_POSTGRES_VERSION: u32 = 16;
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct AttachHookRequest {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
@@ -50,6 +60,7 @@ pub struct InspectResponse {
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantCreateResponseShard {
|
||||
pub shard_id: TenantShardId,
|
||||
pub node_id: NodeId,
|
||||
pub generation: u32,
|
||||
}
|
||||
@@ -169,7 +180,9 @@ pub struct TenantShardMigrateResponse {}
|
||||
|
||||
impl AttachmentService {
|
||||
pub fn from_env(env: &LocalEnv) -> Self {
|
||||
let path = env.base_data_dir.join("attachments.json");
|
||||
let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
|
||||
.unwrap()
|
||||
.join("attachments.json");
|
||||
|
||||
// Makes no sense to construct this if pageservers aren't going to use it: assume
|
||||
// pageservers have control plane API set
|
||||
@@ -181,6 +194,13 @@ impl AttachmentService {
|
||||
listen_url.port().unwrap()
|
||||
);
|
||||
|
||||
// Convention: NeonEnv in python tests reserves the next port after the control_plane_api
|
||||
// port, for use by our captive postgres.
|
||||
let postgres_port = listen_url
|
||||
.port()
|
||||
.expect("Control plane API setting should always have a port")
|
||||
+ 1;
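        // Example under the default used elsewhere in this change: with
        // control_plane_api = http://127.0.0.1:1234/upcall/v1/, the captive
        // postgres listens on port 1235.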
|
||||
|
||||
// Assume all pageservers have symmetric auth configuration: this service
|
||||
// expects to use one JWT token to talk to all of them.
|
||||
let ps_conf = env
|
||||
@@ -209,6 +229,7 @@ impl AttachmentService {
|
||||
listen,
|
||||
jwt_token,
|
||||
public_key_path,
|
||||
postgres_port,
|
||||
client: reqwest::ClientBuilder::new()
|
||||
.build()
|
||||
.expect("Failed to construct http client"),
|
||||
@@ -220,13 +241,214 @@ impl AttachmentService {
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
pub async fn start(&self) -> anyhow::Result<Child> {
|
||||
let path_str = self.path.to_string_lossy();
|
||||
/// PIDFile for the postgres instance used to store attachment service state
|
||||
fn postgres_pid_file(&self) -> Utf8PathBuf {
|
||||
Utf8PathBuf::from_path_buf(
|
||||
self.env
|
||||
.base_data_dir
|
||||
.join("attachment_service_postgres.pid"),
|
||||
)
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
let mut args = vec!["-l", &self.listen, "-p", &path_str]
|
||||
.into_iter()
|
||||
.map(|s| s.to_string())
|
||||
.collect::<Vec<_>>();
|
||||
/// In order to access database migrations, we need to find the Neon source tree
|
||||
async fn find_source_root(&self) -> anyhow::Result<Utf8PathBuf> {
|
||||
// We assume that either our working directory or our binary is in the source tree. The former is usually
|
||||
// true for automated test runners, the latter is usually true for developer workstations. Often
|
||||
// both are true, which is fine.
|
||||
let candidate_start_points = [
|
||||
// Current working directory
|
||||
Utf8PathBuf::from_path_buf(std::env::current_dir()?).unwrap(),
|
||||
// Directory containing the binary we're running inside
|
||||
Utf8PathBuf::from_path_buf(env::current_exe()?.parent().unwrap().to_owned()).unwrap(),
|
||||
];
|
||||
|
||||
// For each candidate start point, search through ancestors looking for a neon.git source tree root
|
||||
for start_point in &candidate_start_points {
|
||||
// Start from the build dir: assumes we are running out of a built neon source tree
|
||||
for path in start_point.ancestors() {
|
||||
// A crude approximation: the root of the source tree is whatever contains a "control_plane"
|
||||
// subdirectory.
|
||||
let control_plane = path.join("control_plane");
|
||||
if tokio::fs::try_exists(&control_plane).await? {
|
||||
return Ok(path.to_owned());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fall-through
|
||||
Err(anyhow::anyhow!(
|
||||
"Could not find control_plane src dir, after searching ancestors of {candidate_start_points:?}"
|
||||
))
|
||||
}
|
||||
|
||||
/// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
|
||||
///
|
||||
/// This usually uses ATTACHMENT_SERVICE_POSTGRES_VERSION of postgres, but will fall back
|
||||
/// to other versions if that one isn't found. Some automated tests create circumstances
|
||||
/// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
|
||||
pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
|
||||
let prefer_versions = [ATTACHMENT_SERVICE_POSTGRES_VERSION, 15, 14];
|
||||
|
||||
for v in prefer_versions {
|
||||
let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap();
|
||||
if tokio::fs::try_exists(&path).await? {
|
||||
return Ok(path);
|
||||
}
|
||||
}
|
||||
|
||||
// Fall through
|
||||
anyhow::bail!(
|
||||
"Postgres binaries not found in {}",
|
||||
self.env.pg_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
|
||||
/// Readiness check for our postgres process
|
||||
async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
|
||||
let bin_path = pg_bin_dir.join("pg_isready");
|
||||
let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)];
|
||||
let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;
|
||||
|
||||
Ok(exitcode.success())
|
||||
}
|
||||
|
||||
/// Create our database if it doesn't exist, and run migrations.
|
||||
///
|
||||
/// This function is equivalent to the `diesel setup` command in the diesel CLI. We implement
|
||||
/// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers
|
||||
/// who just want to run `cargo neon_local` without knowing about diesel.
|
||||
///
|
||||
/// Returns the database url
|
||||
pub async fn setup_database(&self) -> anyhow::Result<String> {
|
||||
let database_url = format!(
|
||||
"postgresql://localhost:{}/attachment_service",
|
||||
self.postgres_port
|
||||
);
|
||||
println!("Running attachment service database setup...");
|
||||
fn change_database_of_url(database_url: &str, default_database: &str) -> (String, String) {
|
||||
let base = ::url::Url::parse(database_url).unwrap();
|
||||
let database = base.path_segments().unwrap().last().unwrap().to_owned();
|
||||
let mut new_url = base.join(default_database).unwrap();
|
||||
new_url.set_query(base.query());
|
||||
(database, new_url.into())
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CreateDatabaseStatement {
|
||||
db_name: String,
|
||||
}
|
||||
|
||||
impl CreateDatabaseStatement {
|
||||
pub fn new(db_name: &str) -> Self {
|
||||
CreateDatabaseStatement {
|
||||
db_name: db_name.to_owned(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<DB: Backend> QueryFragment<DB> for CreateDatabaseStatement {
|
||||
fn walk_ast<'b>(&'b self, mut out: AstPass<'_, 'b, DB>) -> QueryResult<()> {
|
||||
out.push_sql("CREATE DATABASE ");
|
||||
out.push_identifier(&self.db_name)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<Conn> RunQueryDsl<Conn> for CreateDatabaseStatement {}
|
||||
|
||||
impl QueryId for CreateDatabaseStatement {
|
||||
type QueryId = ();
|
||||
|
||||
const HAS_STATIC_QUERY_ID: bool = false;
|
||||
}
|
||||
if PgConnection::establish(&database_url).is_err() {
|
||||
let (database, postgres_url) = change_database_of_url(&database_url, "postgres");
|
||||
println!("Creating database: {database}");
|
||||
let mut conn = PgConnection::establish(&postgres_url)?;
|
||||
CreateDatabaseStatement::new(&database).execute(&mut conn)?;
|
||||
}
|
||||
let mut conn = PgConnection::establish(&database_url)?;
|
||||
|
||||
let migrations_dir = self
|
||||
.find_source_root()
|
||||
.await?
|
||||
.join("control_plane/attachment_service/migrations");
|
||||
|
||||
let migrations = diesel_migrations::FileBasedMigrations::from_path(migrations_dir)?;
|
||||
println!("Running migrations in {}", migrations.path().display());
|
||||
HarnessWithOutput::write_to_stdout(&mut conn)
|
||||
.run_pending_migrations(migrations)
|
||||
.map(|_| ())
|
||||
.map_err(|e| anyhow::anyhow!(e))?;
|
||||
|
||||
println!("Migrations complete");
|
||||
|
||||
Ok(database_url)
|
||||
}
|
||||
|
||||
pub async fn start(&self) -> anyhow::Result<()> {
|
||||
// Start a vanilla Postgres process used by the attachment service for persistence.
|
||||
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
|
||||
.unwrap()
|
||||
.join("attachment_service_db");
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
let pg_log_path = pg_data_path.join("postgres.log");
|
||||
|
||||
if !tokio::fs::try_exists(&pg_data_path).await? {
|
||||
// Initialize empty database
|
||||
let initdb_path = pg_bin_dir.join("initdb");
|
||||
let mut child = Command::new(&initdb_path)
|
||||
.args(["-D", pg_data_path.as_ref()])
|
||||
.spawn()
|
||||
.expect("Failed to spawn initdb");
|
||||
let status = child.wait().await?;
|
||||
if !status.success() {
|
||||
anyhow::bail!("initdb failed with status {status}");
|
||||
}
|
||||
|
||||
tokio::fs::write(
|
||||
&pg_data_path.join("postgresql.conf"),
|
||||
format!("port = {}", self.postgres_port),
|
||||
)
|
||||
.await?;
|
||||
};
|
||||
|
||||
println!("Starting attachment service database...");
|
||||
let db_start_args = [
|
||||
"-w",
|
||||
"-D",
|
||||
pg_data_path.as_ref(),
|
||||
"-l",
|
||||
pg_log_path.as_ref(),
|
||||
"start",
|
||||
];
|
||||
|
||||
background_process::start_process(
|
||||
"attachment_service_db",
|
||||
&self.env.base_data_dir,
|
||||
pg_bin_dir.join("pg_ctl").as_std_path(),
|
||||
db_start_args,
|
||||
[],
|
||||
background_process::InitialPidFile::Create(self.postgres_pid_file()),
|
||||
|| self.pg_isready(&pg_bin_dir),
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Run migrations on every startup, in case something changed.
|
||||
let database_url = self.setup_database().await?;
|
||||
|
||||
let mut args = vec![
|
||||
"-l",
|
||||
&self.listen,
|
||||
"-p",
|
||||
self.path.as_ref(),
|
||||
"--database-url",
|
||||
&database_url,
|
||||
]
|
||||
.into_iter()
|
||||
.map(|s| s.to_string())
|
||||
.collect::<Vec<_>>();
|
||||
if let Some(jwt_token) = &self.jwt_token {
|
||||
args.push(format!("--jwt-token={jwt_token}"));
|
||||
}
|
||||
@@ -235,7 +457,7 @@ impl AttachmentService {
|
||||
args.push(format!("--public-key={public_key_path}"));
|
||||
}
|
||||
|
||||
let result = background_process::start_process(
|
||||
background_process::start_process(
|
||||
COMMAND,
|
||||
&self.env.base_data_dir,
|
||||
&self.env.attachment_service_bin(),
|
||||
@@ -252,29 +474,46 @@ impl AttachmentService {
|
||||
}
|
||||
},
|
||||
)
|
||||
.await;
|
||||
.await?;
|
||||
|
||||
for ps_conf in &self.env.pageservers {
|
||||
let (pg_host, pg_port) =
|
||||
parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
|
||||
let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
|
||||
.expect("Unable to parse listen_http_addr");
|
||||
self.node_register(NodeRegisterRequest {
|
||||
node_id: ps_conf.id,
|
||||
listen_pg_addr: pg_host.to_string(),
|
||||
listen_pg_port: pg_port.unwrap_or(5432),
|
||||
listen_http_addr: http_host.to_string(),
|
||||
listen_http_port: http_port.unwrap_or(80),
|
||||
})
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||
background_process::stop_process(immediate, COMMAND, &self.pid_file())?;
|
||||
|
||||
let pg_data_path = self.env.base_data_dir.join("attachment_service_db");
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
|
||||
println!("Stopping attachment service database...");
|
||||
let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"];
|
||||
let stop_status = Command::new(pg_bin_dir.join("pg_ctl"))
|
||||
.args(pg_stop_args)
|
||||
.spawn()?
|
||||
.wait()
|
||||
.await?;
|
||||
if !stop_status.success() {
|
||||
let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
|
||||
let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
|
||||
.args(pg_status_args)
|
||||
.spawn()?
|
||||
.wait()
|
||||
.await?;
|
||||
|
||||
// pg_ctl status returns this exit code if postgres is not running: in this case it is
|
||||
// fine that stop failed. Otherwise it is an error that stop failed.
|
||||
const PG_STATUS_NOT_RUNNING: i32 = 3;
|
||||
if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
|
||||
println!("Attachment service data base is already stopped");
|
||||
return Ok(());
|
||||
} else {
|
||||
anyhow::bail!("Failed to stop attachment service database: {stop_status}")
|
||||
}
|
||||
}
|
||||
|
||||
result
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||
background_process::stop_process(immediate, COMMAND, &self.pid_file())
|
||||
}
|
||||
/// Simple HTTP request wrapper for calling into attachment service
|
||||
async fn dispatch<RQ, RS>(
|
||||
&self,
|
||||
@@ -286,13 +525,15 @@ impl AttachmentService {
|
||||
RQ: Serialize + Sized,
|
||||
RS: DeserializeOwned + Sized,
|
||||
{
|
||||
let url = self
|
||||
.env
|
||||
.control_plane_api
|
||||
.clone()
|
||||
.unwrap()
|
||||
.join(&path)
|
||||
.unwrap();
|
||||
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
|
||||
// for general purpose API access.
|
||||
let listen_url = self.env.control_plane_api.clone().unwrap();
|
||||
let url = Url::from_str(&format!(
|
||||
"http://{}:{}/{path}",
|
||||
listen_url.host_str().unwrap(),
|
||||
listen_url.port().unwrap()
|
||||
))
|
||||
.unwrap();
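        // Example under the default config elsewhere in this change: with
        // control_plane_api = http://127.0.0.1:1234/upcall/v1/, a management call such
        // as "debug/v1/attach-hook" is dispatched to
        // http://127.0.0.1:1234/debug/v1/attach-hook, i.e. host:port plus the bare
        // path, without the /upcall/v1 prefix.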
|
||||
|
||||
let mut builder = self.client.request(method, url);
|
||||
if let Some(body) = body {
|
||||
@@ -329,7 +570,7 @@ impl AttachmentService {
|
||||
let response = self
|
||||
.dispatch::<_, AttachHookResponse>(
|
||||
Method::POST,
|
||||
"attach-hook".to_string(),
|
||||
"debug/v1/attach-hook".to_string(),
|
||||
Some(request),
|
||||
)
|
||||
.await?;
|
||||
@@ -345,7 +586,11 @@ impl AttachmentService {
|
||||
let request = InspectRequest { tenant_shard_id };
|
||||
|
||||
let response = self
|
||||
.dispatch::<_, InspectResponse>(Method::POST, "inspect".to_string(), Some(request))
|
||||
.dispatch::<_, InspectResponse>(
|
||||
Method::POST,
|
||||
"debug/v1/inspect".to_string(),
|
||||
Some(request),
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(response.attachment)
|
||||
@@ -356,14 +601,18 @@ impl AttachmentService {
|
||||
&self,
|
||||
req: TenantCreateRequest,
|
||||
) -> anyhow::Result<TenantCreateResponse> {
|
||||
self.dispatch(Method::POST, "tenant".to_string(), Some(req))
|
||||
self.dispatch(Method::POST, "v1/tenant".to_string(), Some(req))
|
||||
.await
|
||||
}
|
||||
|
||||
#[instrument(skip(self))]
|
||||
pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
|
||||
self.dispatch::<(), _>(Method::GET, format!("tenant/{tenant_id}/locate"), None)
|
||||
.await
|
||||
self.dispatch::<(), _>(
|
||||
Method::GET,
|
||||
format!("control/v1/tenant/{tenant_id}/locate"),
|
||||
None,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
#[instrument(skip(self))]
|
||||
@@ -385,7 +634,7 @@ impl AttachmentService {
|
||||
|
||||
#[instrument(skip_all, fields(node_id=%req.node_id))]
|
||||
pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
|
||||
self.dispatch::<_, ()>(Method::POST, "node".to_string(), Some(req))
|
||||
self.dispatch::<_, ()>(Method::POST, "control/v1/node".to_string(), Some(req))
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -393,7 +642,7 @@ impl AttachmentService {
|
||||
pub async fn node_configure(&self, req: NodeConfigureRequest) -> anyhow::Result<()> {
|
||||
self.dispatch::<_, ()>(
|
||||
Method::PUT,
|
||||
format!("node/{}/config", req.node_id),
|
||||
format!("control/v1/node/{}/config", req.node_id),
|
||||
Some(req),
|
||||
)
|
||||
.await
|
||||
@@ -413,7 +662,7 @@ impl AttachmentService {
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
self.dispatch(
|
||||
Method::POST,
|
||||
format!("tenant/{tenant_id}/timeline"),
|
||||
format!("v1/tenant/{tenant_id}/timeline"),
|
||||
Some(req),
|
||||
)
|
||||
.await
|
||||
|
||||
@@ -17,7 +17,7 @@ use std::io::Write;
|
||||
use std::os::unix::prelude::AsRawFd;
|
||||
use std::os::unix::process::CommandExt;
|
||||
use std::path::Path;
|
||||
use std::process::{Child, Command};
|
||||
use std::process::Command;
|
||||
use std::time::Duration;
|
||||
use std::{fs, io, thread};
|
||||
|
||||
@@ -60,7 +60,7 @@ pub async fn start_process<F, Fut, AI, A, EI>(
|
||||
envs: EI,
|
||||
initial_pid_file: InitialPidFile,
|
||||
process_status_check: F,
|
||||
) -> anyhow::Result<Child>
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
F: Fn() -> Fut,
|
||||
Fut: std::future::Future<Output = anyhow::Result<bool>>,
|
||||
@@ -98,7 +98,7 @@ where
|
||||
InitialPidFile::Expect(path) => path,
|
||||
};
|
||||
|
||||
let mut spawned_process = filled_cmd.spawn().with_context(|| {
|
||||
let spawned_process = filled_cmd.spawn().with_context(|| {
|
||||
format!("Could not spawn {process_name}, see console output and log files for details.")
|
||||
})?;
|
||||
let pid = spawned_process.id();
|
||||
@@ -106,12 +106,26 @@ where
|
||||
i32::try_from(pid)
|
||||
.with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?,
|
||||
);
|
||||
// set up a scopeguard to kill & wait for the child in case we panic or bail below
|
||||
let spawned_process = scopeguard::guard(spawned_process, |mut spawned_process| {
|
||||
println!("SIGKILL & wait the started process");
|
||||
(|| {
|
||||
// TODO: use another signal that can be caught by the child so it can clean up any children it spawned (e.g., walredo).
|
||||
spawned_process.kill().context("SIGKILL child")?;
|
||||
spawned_process.wait().context("wait() for child process")?;
|
||||
anyhow::Ok(())
|
||||
})()
|
||||
.with_context(|| format!("scopeguard kill&wait child {process_name:?}"))
|
||||
.unwrap();
|
||||
});
|
||||
|
||||
for retries in 0..RETRIES {
|
||||
match process_started(pid, pid_file_to_check, &process_status_check).await {
|
||||
Ok(true) => {
|
||||
println!("\n{process_name} started, pid: {pid}");
|
||||
return Ok(spawned_process);
|
||||
println!("\n{process_name} started and passed status check, pid: {pid}");
|
||||
// leak the child process, it'll outlive this neon_local invocation
|
||||
drop(scopeguard::ScopeGuard::into_inner(spawned_process));
|
||||
return Ok(());
|
||||
}
|
||||
Ok(false) => {
|
||||
if retries == NOTICE_AFTER_RETRIES {
|
||||
@@ -126,16 +140,15 @@ where
|
||||
thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
|
||||
}
|
||||
Err(e) => {
|
||||
println!("{process_name} failed to start: {e:#}");
|
||||
if let Err(e) = spawned_process.kill() {
|
||||
println!("Could not stop {process_name} subprocess: {e:#}")
|
||||
};
|
||||
println!("error starting process {process_name:?}: {e:#}");
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
println!();
|
||||
anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds");
|
||||
anyhow::bail!(
|
||||
"{process_name} did not start+pass status checks within {RETRY_UNTIL_SECS} seconds"
|
||||
);
|
||||
}
|
||||
|
||||
/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
|
||||
@@ -243,7 +256,9 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
|
||||
for env_key in [
|
||||
"AWS_ACCESS_KEY_ID",
|
||||
"AWS_SECRET_ACCESS_KEY",
|
||||
"AWS_SESSION_TOKEN",
|
||||
"AWS_PROFILE",
|
||||
// HOME is needed in combination with `AWS_PROFILE` to pick up the SSO sessions.
|
||||
"HOME",
|
||||
"AZURE_STORAGE_ACCOUNT",
|
||||
"AZURE_STORAGE_ACCESS_KEY",
|
||||
] {
|
||||
|
||||
@@ -51,7 +51,7 @@ project_git_version!(GIT_VERSION);
|
||||
|
||||
const DEFAULT_PG_VERSION: &str = "15";
|
||||
|
||||
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/";
|
||||
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
|
||||
|
||||
fn default_conf(num_pageservers: u16) -> String {
|
||||
let mut template = format!(
|
||||
@@ -135,7 +135,7 @@ fn main() -> Result<()> {
|
||||
"tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
|
||||
"timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
|
||||
"start" => rt.block_on(handle_start_all(sub_args, &env)),
|
||||
"stop" => handle_stop_all(sub_args, &env),
|
||||
"stop" => rt.block_on(handle_stop_all(sub_args, &env)),
|
||||
"pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
|
||||
"attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)),
|
||||
"safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
|
||||
@@ -1056,8 +1056,9 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
|
||||
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
match sub_match.subcommand() {
|
||||
Some(("start", subcommand_args)) => {
|
||||
let register = subcommand_args.get_one::<bool>("register").unwrap_or(&true);
|
||||
if let Err(e) = get_pageserver(env, subcommand_args)?
|
||||
.start(&pageserver_config_overrides(subcommand_args))
|
||||
.start(&pageserver_config_overrides(subcommand_args), *register)
|
||||
.await
|
||||
{
|
||||
eprintln!("pageserver start failed: {e}");
|
||||
@@ -1086,24 +1087,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
}
|
||||
|
||||
if let Err(e) = pageserver
|
||||
.start(&pageserver_config_overrides(subcommand_args))
|
||||
.await
|
||||
{
|
||||
eprintln!("pageserver start failed: {e}");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
Some(("migrate", subcommand_args)) => {
|
||||
let pageserver = get_pageserver(env, subcommand_args)?;
|
||||
//TODO what shutdown strategy should we use here?
|
||||
if let Err(e) = pageserver.stop(false) {
|
||||
eprintln!("pageserver stop failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if let Err(e) = pageserver
|
||||
.start(&pageserver_config_overrides(subcommand_args))
|
||||
.start(&pageserver_config_overrides(subcommand_args), false)
|
||||
.await
|
||||
{
|
||||
eprintln!("pageserver start failed: {e}");
|
||||
@@ -1161,7 +1145,7 @@ async fn handle_attachment_service(
|
||||
.map(|s| s.as_str())
|
||||
== Some("immediate");
|
||||
|
||||
if let Err(e) = svc.stop(immediate) {
|
||||
if let Err(e) = svc.stop(immediate).await {
|
||||
eprintln!("stop failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
@@ -1257,7 +1241,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
if let Err(e) = attachment_service.start().await {
|
||||
eprintln!("attachment_service start failed: {:#}", e);
|
||||
try_stop_all(env, true);
|
||||
try_stop_all(env, true).await;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
@@ -1265,11 +1249,11 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
for ps_conf in &env.pageservers {
|
||||
let pageserver = PageServerNode::from_env(env, ps_conf);
|
||||
if let Err(e) = pageserver
|
||||
.start(&pageserver_config_overrides(sub_match))
|
||||
.start(&pageserver_config_overrides(sub_match), true)
|
||||
.await
|
||||
{
|
||||
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
|
||||
try_stop_all(env, true);
|
||||
try_stop_all(env, true).await;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
@@ -1278,23 +1262,23 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
let safekeeper = SafekeeperNode::from_env(env, node);
|
||||
if let Err(e) = safekeeper.start(vec![]).await {
|
||||
eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
|
||||
try_stop_all(env, false);
|
||||
try_stop_all(env, false).await;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
async fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
let immediate =
|
||||
sub_match.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
|
||||
|
||||
try_stop_all(env, immediate);
|
||||
try_stop_all(env, immediate).await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
|
||||
async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
|
||||
// Stop all endpoints
|
||||
match ComputeControlPlane::load(env.clone()) {
|
||||
Ok(cplane) => {
|
||||
@@ -1329,7 +1313,7 @@ fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
|
||||
|
||||
if env.control_plane_api.is_some() {
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
if let Err(e) = attachment_service.stop(immediate) {
|
||||
if let Err(e) = attachment_service.stop(immediate).await {
|
||||
eprintln!("attachment service stop failed: {e:#}");
|
||||
}
|
||||
}
|
||||
@@ -1549,7 +1533,11 @@ fn cli() -> Command {
|
||||
.subcommand(Command::new("status"))
|
||||
.subcommand(Command::new("start")
|
||||
.about("Start local pageserver")
|
||||
.arg(pageserver_config_args.clone())
|
||||
.arg(pageserver_config_args.clone()).arg(Arg::new("register")
|
||||
.long("register")
|
||||
.default_value("true").required(false)
|
||||
.value_parser(value_parser!(bool))
|
||||
.value_name("register"))
|
||||
)
|
||||
.subcommand(Command::new("stop")
|
||||
.about("Stop local pageserver")
|
||||
|
||||
@@ -438,7 +438,7 @@ impl Endpoint {
|
||||
}
|
||||
|
||||
fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> {
|
||||
// TODO use background_process::stop_process instead
|
||||
// TODO use background_process::stop_process instead: https://github.com/neondatabase/neon/pull/6482
|
||||
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
|
||||
let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
|
||||
let pid = nix::unistd::Pid::from_raw(pid as i32);
|
||||
@@ -583,9 +583,21 @@ impl Endpoint {
|
||||
}
|
||||
|
||||
let child = cmd.spawn()?;
|
||||
// set up a scopeguard to kill & wait for the child in case we panic or bail below
|
||||
let child = scopeguard::guard(child, |mut child| {
|
||||
println!("SIGKILL & wait the started process");
|
||||
(|| {
|
||||
// TODO: use another signal that can be caught by the child so it can clean up any children it spawned
|
||||
child.kill().context("SIGKILL child")?;
|
||||
child.wait().context("wait() for child process")?;
|
||||
anyhow::Ok(())
|
||||
})()
|
||||
.with_context(|| format!("scopeguard kill&wait child {child:?}"))
|
||||
.unwrap();
|
||||
});
|
||||
|
||||
// Write down the pid so we can wait for it when we want to stop
|
||||
// TODO use background_process::start_process instead
|
||||
// TODO use background_process::start_process instead: https://github.com/neondatabase/neon/pull/6482
|
||||
let pid = child.id();
|
||||
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
|
||||
std::fs::write(pidfile_path, pid.to_string())?;
|
||||
@@ -634,6 +646,9 @@ impl Endpoint {
|
||||
std::thread::sleep(ATTEMPT_INTERVAL);
|
||||
}
|
||||
|
||||
// disarm the scopeguard, let the child outlive this function (and neon_local invocation)
|
||||
drop(scopeguard::ScopeGuard::into_inner(child));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -223,7 +223,11 @@ impl LocalEnv {
|
||||
}
|
||||
|
||||
pub fn attachment_service_bin(&self) -> PathBuf {
|
||||
self.neon_distrib_dir.join("attachment_service")
|
||||
// Irrespective of configuration, attachment service binary is always
|
||||
// run from the same location as neon_local. This means that for compatibility
|
||||
// tests that run old pageserver/safekeeper, they still run latest attachment service.
|
||||
let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned();
|
||||
neon_local_bin_dir.join("attachment_service")
|
||||
}
|
||||
|
||||
pub fn safekeeper_bin(&self) -> PathBuf {
|
||||
|
||||
@@ -11,7 +11,7 @@ use std::io;
|
||||
use std::io::Write;
|
||||
use std::num::NonZeroU64;
|
||||
use std::path::PathBuf;
|
||||
use std::process::{Child, Command};
|
||||
use std::process::Command;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
@@ -30,6 +30,7 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::attachment_service::{AttachmentService, NodeRegisterRequest};
|
||||
use crate::local_env::PageServerConf;
|
||||
use crate::{background_process, local_env::LocalEnv};
|
||||
|
||||
@@ -161,8 +162,8 @@ impl PageServerNode {
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
|
||||
self.start_node(config_overrides, false).await
|
||||
pub async fn start(&self, config_overrides: &[&str], register: bool) -> anyhow::Result<()> {
|
||||
self.start_node(config_overrides, false, register).await
|
||||
}
|
||||
|
||||
fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
|
||||
@@ -207,7 +208,8 @@ impl PageServerNode {
|
||||
&self,
|
||||
config_overrides: &[&str],
|
||||
update_config: bool,
|
||||
) -> anyhow::Result<Child> {
|
||||
register: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
// TODO: using a thread here because start_process() is not async but we need to call check_status()
|
||||
let datadir = self.repo_path();
|
||||
print!(
|
||||
@@ -244,7 +246,26 @@ impl PageServerNode {
|
||||
}
|
||||
},
|
||||
)
|
||||
.await
|
||||
.await?;
|
||||
|
||||
if register {
|
||||
let attachment_service = AttachmentService::from_env(&self.env);
|
||||
let (pg_host, pg_port) =
|
||||
parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
|
||||
let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
|
||||
.expect("Unable to parse listen_http_addr");
|
||||
attachment_service
|
||||
.node_register(NodeRegisterRequest {
|
||||
node_id: self.conf.id,
|
||||
listen_pg_addr: pg_host.to_string(),
|
||||
listen_pg_port: pg_port.unwrap_or(5432),
|
||||
listen_http_addr: http_host.to_string(),
|
||||
listen_http_port: http_port.unwrap_or(80),
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn pageserver_basic_args<'a>(
|
||||
@@ -374,6 +395,11 @@ impl PageServerNode {
|
||||
.transpose()
|
||||
.context("Failed to parse 'gc_feedback' as bool")?,
|
||||
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
|
||||
lazy_slru_download: settings
|
||||
.remove("lazy_slru_download")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'lazy_slru_download' as bool")?,
|
||||
};
|
||||
if !settings.is_empty() {
|
||||
bail!("Unrecognized tenant settings: {settings:?}")
|
||||
@@ -474,6 +500,11 @@ impl PageServerNode {
|
||||
.transpose()
|
||||
.context("Failed to parse 'gc_feedback' as bool")?,
|
||||
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
|
||||
lazy_slru_download: settings
|
||||
.remove("lazy_slru_download")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'lazy_slru_download' as bool")?,
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
//! ```
|
||||
use std::io::Write;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Child;
|
||||
use std::{io, result};
|
||||
|
||||
use anyhow::Context;
|
||||
@@ -104,7 +103,7 @@ impl SafekeeperNode {
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
|
||||
pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<()> {
|
||||
print!(
|
||||
"Starting safekeeper at '{}' in '{}'",
|
||||
self.pg_connection_config.raw_address(),
|
||||
|
||||
diesel.toml (new file, 9 lines)
@@ -0,0 +1,9 @@
|
||||
# For documentation on how to configure this file,
|
||||
# see https://diesel.rs/guides/configuring-diesel-cli
|
||||
|
||||
[print_schema]
|
||||
file = "control_plane/attachment_service/src/schema.rs"
|
||||
custom_type_derives = ["diesel::query_builder::QueryId"]
|
||||
|
||||
[migrations_directory]
|
||||
dir = "control_plane/attachment_service/migrations"
|
||||
@@ -9,5 +9,10 @@ prometheus.workspace = true
|
||||
libc.workspace = true
|
||||
once_cell.workspace = true
|
||||
chrono.workspace = true
|
||||
twox-hash.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.8"
|
||||
rand_distr = "0.4.3"
|
||||
|
||||
523
libs/metrics/src/hll.rs
Normal file
@@ -0,0 +1,523 @@
|
||||
//! HyperLogLog is an algorithm for the count-distinct problem,
//! approximating the number of distinct elements in a multiset.
//! Calculating the exact cardinality of the distinct elements
//! of a multiset requires an amount of memory proportional to
//! the cardinality, which is impractical for very large data sets.
//! Probabilistic cardinality estimators, such as the HyperLogLog algorithm,
//! use significantly less memory than this, but can only approximate the cardinality.
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
hash::{BuildHasher, BuildHasherDefault, Hash, Hasher},
|
||||
sync::{atomic::AtomicU8, Arc, RwLock},
|
||||
};
|
||||
|
||||
use prometheus::{
|
||||
core::{self, Describer},
|
||||
proto, Opts,
|
||||
};
|
||||
use twox_hash::xxh3;
|
||||
|
||||
/// Create an [`HyperLogLogVec`] and registers to default registry.
|
||||
#[macro_export(local_inner_macros)]
|
||||
macro_rules! register_hll_vec {
|
||||
($N:literal, $OPTS:expr, $LABELS_NAMES:expr $(,)?) => {{
|
||||
let hll_vec = $crate::HyperLogLogVec::<$N>::new($OPTS, $LABELS_NAMES).unwrap();
|
||||
$crate::register(Box::new(hll_vec.clone())).map(|_| hll_vec)
|
||||
}};
|
||||
|
||||
($N:literal, $NAME:expr, $HELP:expr, $LABELS_NAMES:expr $(,)?) => {{
|
||||
$crate::register_hll_vec!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
|
||||
}};
|
||||
}
|
||||
|
||||
/// Create an [`HyperLogLog`] and registers to default registry.
|
||||
#[macro_export(local_inner_macros)]
|
||||
macro_rules! register_hll {
|
||||
($N:literal, $OPTS:expr $(,)?) => {{
|
||||
let hll = $crate::HyperLogLog::<$N>::with_opts($OPTS).unwrap();
|
||||
$crate::register(Box::new(hll.clone())).map(|_| hll)
|
||||
}};
|
||||
|
||||
($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{
|
||||
$crate::register_hll!($N, $crate::opts!($NAME, $HELP))
|
||||
}};
|
||||
}
|
||||
|
||||
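A hypothetical usage sketch for the macros above, modeled on the `proxy_connecting_endpoints` metric mentioned in the doc comment below. The crate name `metrics`, the once_cell static, and the metric name/help strings are assumptions, not part of this diff:

```rust
use metrics::{register_hll_vec, HyperLogLogVec};
use once_cell::sync::Lazy;

// 32 shards, partitioned by connection protocol (assumed label set).
static CONNECTING_ENDPOINTS: Lazy<HyperLogLogVec<32>> = Lazy::new(|| {
    register_hll_vec!(
        32,
        "proxy_connecting_endpoints",
        "HLL approximate cardinality of endpoints that are connecting",
        &["protocol"]
    )
    .expect("failed to register metric")
});

fn on_connect(protocol: &str, endpoint_id: &str) {
    // measure() hashes the item and raises the max rank stored in exactly one
    // of the 32 shard gauges for this label combination.
    CONNECTING_ENDPOINTS
        .with_label_values(&[protocol])
        .measure(&endpoint_id);
}
```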
/// HLL is a probabilistic cardinality measure.
|
||||
///
|
||||
/// How to use this time-series for a metric name `my_metrics_total_hll`:
|
||||
///
|
||||
/// ```promql
|
||||
/// # harmonic mean
|
||||
/// 1 / (
|
||||
/// sum (
|
||||
/// 2 ^ -(
|
||||
/// # HLL merge operation
|
||||
/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
|
||||
/// )
|
||||
/// ) without (hll_shard)
|
||||
/// )
|
||||
/// * alpha
|
||||
/// * shards_count
|
||||
/// * shards_count
|
||||
/// ```
|
||||
///
|
||||
/// If you want an estimate over time, you can use the following query:
|
||||
///
|
||||
/// ```promql
|
||||
/// # harmonic mean
|
||||
/// 1 / (
|
||||
/// sum (
|
||||
/// 2 ^ -(
|
||||
/// # HLL merge operation
|
||||
/// max (
|
||||
/// max_over_time(my_metrics_total_hll{}[$__rate_interval])
|
||||
/// ) by (hll_shard, other_labels...)
|
||||
/// )
|
||||
/// ) without (hll_shard)
|
||||
/// )
|
||||
/// * alpha
|
||||
/// * shards_count
|
||||
/// * shards_count
|
||||
/// ```
|
||||
///
|
||||
/// In the case of low cardinality, you might want to use the linear counting approximation:
|
||||
///
|
||||
/// ```promql
|
||||
/// # LinearCounting(m, V) = m log (m / V)
|
||||
/// shards_count * ln(shards_count /
|
||||
/// # calculate V = how many shards contain a 0
|
||||
/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
|
||||
/// )
|
||||
/// ```
|
||||
///
|
||||
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
|
||||
#[derive(Clone)]
|
||||
pub struct HyperLogLogVec<const N: usize> {
|
||||
core: Arc<HyperLogLogVecCore<N>>,
|
||||
}
|
||||
|
||||
struct HyperLogLogVecCore<const N: usize> {
|
||||
pub children: RwLock<HashMap<u64, HyperLogLog<N>, BuildHasherDefault<xxh3::Hash64>>>,
|
||||
pub desc: core::Desc,
|
||||
pub opts: Opts,
|
||||
}
|
||||
|
||||
impl<const N: usize> core::Collector for HyperLogLogVec<N> {
|
||||
fn desc(&self) -> Vec<&core::Desc> {
|
||||
vec![&self.core.desc]
|
||||
}
|
||||
|
||||
fn collect(&self) -> Vec<proto::MetricFamily> {
|
||||
let mut m = proto::MetricFamily::default();
|
||||
m.set_name(self.core.desc.fq_name.clone());
|
||||
m.set_help(self.core.desc.help.clone());
|
||||
m.set_field_type(proto::MetricType::GAUGE);
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
for child in self.core.children.read().unwrap().values() {
|
||||
child.core.collect_into(&mut metrics);
|
||||
}
|
||||
m.set_metric(metrics);
|
||||
|
||||
vec![m]
|
||||
}
|
||||
}
|
||||
|
||||
impl<const N: usize> HyperLogLogVec<N> {
|
||||
/// Create a new [`HyperLogLogVec`] based on the provided
|
||||
/// [`Opts`] and partitioned by the given label names. At least one label name must be
|
||||
/// provided.
|
||||
pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result<Self> {
|
||||
assert!(N.is_power_of_two());
|
||||
let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect();
|
||||
let opts = opts.variable_labels(variable_names);
|
||||
|
||||
let desc = opts.describe()?;
|
||||
let v = HyperLogLogVecCore {
|
||||
children: RwLock::new(HashMap::default()),
|
||||
desc,
|
||||
opts,
|
||||
};
|
||||
|
||||
Ok(Self { core: Arc::new(v) })
|
||||
}
|
||||
|
||||
/// `get_metric_with_label_values` returns the [`HyperLogLog<P>`] for the given slice
|
||||
/// of label values (same order as the VariableLabels in Desc). If that combination of
|
||||
/// label values is accessed for the first time, a new [`HyperLogLog<P>`] is created.
|
||||
///
|
||||
/// An error is returned if the number of label values is not the same as the
|
||||
/// number of VariableLabels in Desc.
|
||||
pub fn get_metric_with_label_values(
|
||||
&self,
|
||||
vals: &[&str],
|
||||
) -> prometheus::Result<HyperLogLog<N>> {
|
||||
self.core.get_metric_with_label_values(vals)
|
||||
}
|
||||
|
||||
/// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
|
||||
/// occurs.
|
||||
pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog<N> {
|
||||
self.get_metric_with_label_values(vals).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
impl<const N: usize> HyperLogLogVecCore<N> {
|
||||
pub fn get_metric_with_label_values(
|
||||
&self,
|
||||
vals: &[&str],
|
||||
) -> prometheus::Result<HyperLogLog<N>> {
|
||||
let h = self.hash_label_values(vals)?;
|
||||
|
||||
if let Some(metric) = self.children.read().unwrap().get(&h).cloned() {
|
||||
return Ok(metric);
|
||||
}
|
||||
|
||||
self.get_or_create_metric(h, vals)
|
||||
}
|
||||
|
||||
pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result<u64> {
|
||||
if vals.len() != self.desc.variable_labels.len() {
|
||||
return Err(prometheus::Error::InconsistentCardinality {
|
||||
expect: self.desc.variable_labels.len(),
|
||||
got: vals.len(),
|
||||
});
|
||||
}
|
||||
|
||||
let mut h = xxh3::Hash64::default();
|
||||
for val in vals {
|
||||
h.write(val.as_bytes());
|
||||
}
|
||||
|
||||
Ok(h.finish())
|
||||
}
|
||||
|
||||
fn get_or_create_metric(
|
||||
&self,
|
||||
hash: u64,
|
||||
label_values: &[&str],
|
||||
) -> prometheus::Result<HyperLogLog<N>> {
|
||||
let mut children = self.children.write().unwrap();
|
||||
// Check exist first.
|
||||
if let Some(metric) = children.get(&hash).cloned() {
|
||||
return Ok(metric);
|
||||
}
|
||||
|
||||
let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?;
|
||||
children.insert(hash, metric.clone());
|
||||
Ok(metric)
|
||||
}
|
||||
}
|
||||
|
||||
/// HLL is a probabilistic cardinality measure.
|
||||
///
|
||||
/// How to use this time-series for a metric name `my_metrics_total_hll`:
|
||||
///
|
||||
/// ```promql
|
||||
/// # harmonic mean
|
||||
/// 1 / (
|
||||
/// sum (
|
||||
/// 2 ^ -(
|
||||
/// # HLL merge operation
|
||||
/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
|
||||
/// )
|
||||
/// ) without (hll_shard)
|
||||
/// )
|
||||
/// * alpha
|
||||
/// * shards_count
|
||||
/// * shards_count
|
||||
/// ```
|
||||
///
|
||||
/// If you want an estimate over time, you can use the following query:
|
||||
///
|
||||
/// ```promql
|
||||
/// # harmonic mean
|
||||
/// 1 / (
|
||||
/// sum (
|
||||
/// 2 ^ -(
|
||||
/// # HLL merge operation
|
||||
/// max (
|
||||
/// max_over_time(my_metrics_total_hll{}[$__rate_interval])
|
||||
/// ) by (hll_shard, other_labels...)
|
||||
/// )
|
||||
/// ) without (hll_shard)
|
||||
/// )
|
||||
/// * alpha
|
||||
/// * shards_count
|
||||
/// * shards_count
|
||||
/// ```
|
||||
///
|
||||
/// In the case of low cardinality, you might want to use the linear counting approximation:
|
||||
///
|
||||
/// ```promql
|
||||
/// # LinearCounting(m, V) = m log (m / V)
|
||||
/// shards_count * ln(shards_count /
|
||||
/// # calculate V = how many shards contain a 0
|
||||
/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
|
||||
/// )
|
||||
/// ```
|
||||
///
|
||||
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
|
||||
#[derive(Clone)]
|
||||
pub struct HyperLogLog<const N: usize> {
|
||||
core: Arc<HyperLogLogCore<N>>,
|
||||
}
|
||||
|
||||
impl<const N: usize> HyperLogLog<N> {
|
||||
/// Create a [`HyperLogLog`] with the `name` and `help` arguments.
|
||||
pub fn new<S1: Into<String>, S2: Into<String>>(name: S1, help: S2) -> prometheus::Result<Self> {
|
||||
assert!(N.is_power_of_two());
|
||||
let opts = Opts::new(name, help);
|
||||
Self::with_opts(opts)
|
||||
}
|
||||
|
||||
/// Create a [`HyperLogLog`] with the `opts` options.
|
||||
pub fn with_opts(opts: Opts) -> prometheus::Result<Self> {
|
||||
Self::with_opts_and_label_values(&opts, &[])
|
||||
}
|
||||
|
||||
fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result<Self> {
|
||||
let desc = opts.describe()?;
|
||||
let labels = make_label_pairs(&desc, label_values)?;
|
||||
|
||||
let v = HyperLogLogCore {
|
||||
shards: [0; N].map(AtomicU8::new),
|
||||
desc,
|
||||
labels,
|
||||
};
|
||||
Ok(Self { core: Arc::new(v) })
|
||||
}
|
||||
|
||||
pub fn measure(&self, item: &impl Hash) {
|
||||
// changing the hasher will break compatibility with previous measurements.
|
||||
self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
|
||||
}
|
||||
|
||||
fn record(&self, hash: u64) {
|
||||
let p = N.ilog2() as u8;
|
||||
let j = hash & (N as u64 - 1);
|
||||
let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
|
||||
self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
struct HyperLogLogCore<const N: usize> {
|
||||
shards: [AtomicU8; N],
|
||||
desc: core::Desc,
|
||||
labels: Vec<proto::LabelPair>,
|
||||
}
|
||||
|
||||
impl<const N: usize> core::Collector for HyperLogLog<N> {
|
||||
fn desc(&self) -> Vec<&core::Desc> {
|
||||
vec![&self.core.desc]
|
||||
}
|
||||
|
||||
fn collect(&self) -> Vec<proto::MetricFamily> {
|
||||
let mut m = proto::MetricFamily::default();
|
||||
m.set_name(self.core.desc.fq_name.clone());
|
||||
m.set_help(self.core.desc.help.clone());
|
||||
m.set_field_type(proto::MetricType::GAUGE);
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
self.core.collect_into(&mut metrics);
|
||||
m.set_metric(metrics);
|
||||
|
||||
vec![m]
|
||||
}
|
||||
}
|
||||
|
||||
impl<const N: usize> HyperLogLogCore<N> {
|
||||
fn collect_into(&self, metrics: &mut Vec<proto::Metric>) {
|
||||
self.shards.iter().enumerate().for_each(|(i, x)| {
|
||||
let mut shard_label = proto::LabelPair::default();
|
||||
shard_label.set_name("hll_shard".to_owned());
|
||||
shard_label.set_value(format!("{i}"));
|
||||
|
||||
// We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus.
|
||||
|
||||
// This seems like it would be a race condition,
|
||||
// but HLL is not impacted by a write in one shard happening in between.
|
||||
// This is because in PromQL we will be implementing a harmonic mean of all buckets.
|
||||
// we will also merge samples in a time series using `max by (hll_shard)`.
|
||||
|
||||
// TODO: maybe we shouldn't reset this on every collect, instead, only after a time window.
|
||||
// this would mean that a dev port-forwarding the metrics url won't break the sampling.
|
||||
let v = x.swap(0, std::sync::atomic::Ordering::Relaxed);
|
||||
|
||||
let mut m = proto::Metric::default();
|
||||
let mut c = proto::Gauge::default();
|
||||
c.set_value(v as f64);
|
||||
m.set_gauge(c);
|
||||
|
||||
let mut labels = Vec::with_capacity(self.labels.len() + 1);
|
||||
labels.extend_from_slice(&self.labels);
|
||||
labels.push(shard_label);
|
||||
|
||||
m.set_label(labels);
|
||||
metrics.push(m);
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn make_label_pairs(
|
||||
desc: &core::Desc,
|
||||
label_values: &[&str],
|
||||
) -> prometheus::Result<Vec<proto::LabelPair>> {
|
||||
if desc.variable_labels.len() != label_values.len() {
|
||||
return Err(prometheus::Error::InconsistentCardinality {
|
||||
expect: desc.variable_labels.len(),
|
||||
got: label_values.len(),
|
||||
});
|
||||
}
|
||||
|
||||
let total_len = desc.variable_labels.len() + desc.const_label_pairs.len();
|
||||
if total_len == 0 {
|
||||
return Ok(vec![]);
|
||||
}
|
||||
|
||||
if desc.variable_labels.is_empty() {
|
||||
return Ok(desc.const_label_pairs.clone());
|
||||
}
|
||||
|
||||
let mut label_pairs = Vec::with_capacity(total_len);
|
||||
for (i, n) in desc.variable_labels.iter().enumerate() {
|
||||
let mut label_pair = proto::LabelPair::default();
|
||||
label_pair.set_name(n.clone());
|
||||
label_pair.set_value(label_values[i].to_owned());
|
||||
label_pairs.push(label_pair);
|
||||
}
|
||||
|
||||
for label_pair in &desc.const_label_pairs {
|
||||
label_pairs.push(label_pair.clone());
|
||||
}
|
||||
label_pairs.sort();
|
||||
Ok(label_pairs)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::HashSet;
|
||||
|
||||
use prometheus::{proto, Opts};
|
||||
use rand::{rngs::StdRng, Rng, SeedableRng};
|
||||
use rand_distr::{Distribution, Zipf};
|
||||
|
||||
use crate::HyperLogLogVec;
|
||||
|
||||
fn collect(hll: &HyperLogLogVec<32>) -> Vec<proto::Metric> {
|
||||
let mut metrics = vec![];
|
||||
hll.core
|
||||
.children
|
||||
.read()
|
||||
.unwrap()
|
||||
.values()
|
||||
.for_each(|c| c.core.collect_into(&mut metrics));
|
||||
metrics
|
||||
}
|
||||
fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 {
|
||||
let mut buckets = [0.0; 32];
|
||||
for metric in metrics.chunks_exact(32) {
|
||||
if filter(&metric[0]) {
|
||||
for (i, m) in metric.iter().enumerate() {
|
||||
buckets[i] = f64::max(buckets[i], m.get_gauge().get_value());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
buckets
|
||||
.into_iter()
|
||||
.map(|f| 2.0f64.powf(-f))
|
||||
.sum::<f64>()
|
||||
.recip()
|
||||
* 0.697
|
||||
* 32.0
|
||||
* 32.0
|
||||
}
|
||||
|
||||
fn test_cardinality(n: usize, dist: impl Distribution<f64>) -> ([usize; 3], [f64; 3]) {
|
||||
let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap();
|
||||
|
||||
let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist);
|
||||
let mut set_a = HashSet::new();
|
||||
let mut set_b = HashSet::new();
|
||||
|
||||
for x in iter.by_ref().take(n) {
|
||||
set_a.insert(x.to_bits());
|
||||
hll.with_label_values(&["a"]).measure(&x.to_bits());
|
||||
}
|
||||
for x in iter.by_ref().take(n) {
|
||||
set_b.insert(x.to_bits());
|
||||
hll.with_label_values(&["b"]).measure(&x.to_bits());
|
||||
}
|
||||
let merge = &set_a | &set_b;
|
||||
|
||||
let metrics = collect(&hll);
|
||||
let len = get_cardinality(&metrics, |_| true);
|
||||
let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a");
|
||||
let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b");
|
||||
|
||||
([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b])
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cardinality_small() {
|
||||
let (actual, estimate) = test_cardinality(100, Zipf::new(100, 1.2f64).unwrap());
|
||||
|
||||
assert_eq!(actual, [46, 30, 32]);
|
||||
assert!(51.3 < estimate[0] && estimate[0] < 51.4);
|
||||
assert!(44.0 < estimate[1] && estimate[1] < 44.1);
|
||||
assert!(39.0 < estimate[2] && estimate[2] < 39.1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cardinality_medium() {
|
||||
let (actual, estimate) = test_cardinality(10000, Zipf::new(10000, 1.2f64).unwrap());
|
||||
|
||||
assert_eq!(actual, [2529, 1618, 1629]);
|
||||
assert!(2309.1 < estimate[0] && estimate[0] < 2309.2);
|
||||
assert!(1566.6 < estimate[1] && estimate[1] < 1566.7);
|
||||
assert!(1629.5 < estimate[2] && estimate[2] < 1629.6);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cardinality_large() {
|
||||
let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(1_000_000, 1.2f64).unwrap());
|
||||
|
||||
assert_eq!(actual, [129077, 79579, 79630]);
|
||||
assert!(126067.2 < estimate[0] && estimate[0] < 126067.3);
|
||||
assert!(83076.8 < estimate[1] && estimate[1] < 83076.9);
|
||||
assert!(64251.2 < estimate[2] && estimate[2] < 64251.3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cardinality_small2() {
|
||||
let (actual, estimate) = test_cardinality(100, Zipf::new(200, 0.8f64).unwrap());
|
||||
|
||||
assert_eq!(actual, [92, 58, 60]);
|
||||
assert!(116.1 < estimate[0] && estimate[0] < 116.2);
|
||||
assert!(81.7 < estimate[1] && estimate[1] < 81.8);
|
||||
assert!(69.3 < estimate[2] && estimate[2] < 69.4);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cardinality_medium2() {
|
||||
let (actual, estimate) = test_cardinality(10000, Zipf::new(20000, 0.8f64).unwrap());
|
||||
|
||||
assert_eq!(actual, [8201, 5131, 5051]);
|
||||
assert!(6846.4 < estimate[0] && estimate[0] < 6846.5);
|
||||
assert!(5239.1 < estimate[1] && estimate[1] < 5239.2);
|
||||
assert!(4292.8 < estimate[2] && estimate[2] < 4292.9);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cardinality_large2() {
|
||||
let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(2_000_000, 0.8f64).unwrap());
|
||||
|
||||
assert_eq!(actual, [777847, 482069, 482246]);
|
||||
assert!(699437.4 < estimate[0] && estimate[0] < 699437.5);
|
||||
assert!(374948.9 < estimate[1] && estimate[1] < 374949.0);
|
||||
assert!(434609.7 < estimate[2] && estimate[2] < 434609.8);
|
||||
}
|
||||
}
|
||||
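To make the estimates checked by the tests above easier to follow, here is a self-contained sketch of the same estimator: the shard index comes from the low bits of the hash, the rank from the leading zeros of the remaining bits, and the estimate is a harmonic mean scaled by the alpha ≈ 0.697 constant the tests use. It uses std's default hasher purely for illustration; the real code uses xxh3 and atomic shards.

```rust
// Minimal, single-threaded sketch of the HyperLogLog update and estimate above.
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

const N: usize = 32; // number of shards; must be a power of two

fn shard_update(shards: &mut [u8; N], item: &impl Hash) {
    let mut h = DefaultHasher::new();
    item.hash(&mut h);
    let hash = h.finish();

    let p = N.ilog2() as u8;
    let j = (hash & (N as u64 - 1)) as usize; // shard index from the low bits
    let rho = (hash >> p).leading_zeros() as u8 + 1 - p; // rank of the remaining bits
    shards[j] = shards[j].max(rho);
}

fn estimate(shards: &[u8; N]) -> f64 {
    // Harmonic mean of 2^-shard, scaled by alpha * m^2 (alpha ≈ 0.697 for m = 32).
    let sum: f64 = shards.iter().map(|&s| 2f64.powi(-(s as i32))).sum();
    0.697 * (N as f64) * (N as f64) / sum
}

fn main() {
    let mut shards = [0u8; N];
    for i in 0..1000u32 {
        shard_update(&mut shards, &i);
    }
    println!("estimated cardinality ≈ {:.0}", estimate(&shards));
}
```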
@@ -28,7 +28,9 @@ use prometheus::{Registry, Result};
|
||||
pub mod launch_timestamp;
|
||||
mod wrappers;
|
||||
pub use wrappers::{CountedReader, CountedWriter};
|
||||
mod hll;
|
||||
pub mod metric_vec_duration;
|
||||
pub use hll::{HyperLogLog, HyperLogLogVec};
|
||||
|
||||
pub type UIntGauge = GenericGauge<AtomicU64>;
|
||||
pub type UIntGaugeVec = GenericGaugeVec<AtomicU64>;
|
||||
|
||||
@@ -20,6 +20,7 @@ strum_macros.workspace = true
|
||||
hex.workspace = true
|
||||
thiserror.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
chrono.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
|
||||
@@ -63,16 +63,84 @@ impl KeySpace {
|
||||
KeyPartitioning { parts }
|
||||
}
|
||||
|
||||
/// Update the keyspace such that it doesn't contain any range
|
||||
/// that is overlapping with `other`. This can involve splitting or
|
||||
/// removing of existing ranges.
|
||||
pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
|
||||
let (self_start, self_end) = match (self.start(), self.end()) {
|
||||
(Some(start), Some(end)) => (start, end),
|
||||
_ => {
|
||||
// self is empty
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
// Key spaces are sorted by definition, so skip ahead to the first
|
||||
// potentially intersecting range. Similarly, ignore ranges that start
|
||||
// after the current keyspace ends.
|
||||
let other_ranges = other
|
||||
.ranges
|
||||
.iter()
|
||||
.skip_while(|range| self_start >= range.end)
|
||||
.take_while(|range| self_end > range.start);
|
||||
|
||||
for range in other_ranges {
|
||||
while let Some(overlap_at) = self.overlaps_at(range) {
|
||||
let overlapped = self.ranges[overlap_at].clone();
|
||||
|
||||
if overlapped.start < range.start && overlapped.end <= range.end {
|
||||
// Higher part of the range is completely overlapped.
|
||||
self.ranges[overlap_at].end = range.start;
|
||||
}
|
||||
if overlapped.start >= range.start && overlapped.end > range.end {
|
||||
// Lower part of the range is completely overlapped.
|
||||
self.ranges[overlap_at].start = range.end;
|
||||
}
|
||||
if overlapped.start < range.start && overlapped.end > range.end {
|
||||
// Middle part of the range is overlapped.
|
||||
self.ranges[overlap_at].end = range.start;
|
||||
self.ranges
|
||||
.insert(overlap_at + 1, range.end..overlapped.end);
|
||||
}
|
||||
if overlapped.start >= range.start && overlapped.end <= range.end {
|
||||
// Whole range is overlapped
|
||||
self.ranges.remove(overlap_at);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start(&self) -> Option<Key> {
|
||||
self.ranges.first().map(|range| range.start)
|
||||
}
|
||||
|
||||
pub fn end(&self) -> Option<Key> {
|
||||
self.ranges.last().map(|range| range.end)
|
||||
}
|
||||
|
||||
#[allow(unused)]
|
||||
pub fn total_size(&self) -> usize {
|
||||
self.ranges
|
||||
.iter()
|
||||
.map(|range| key_range_size(range) as usize)
|
||||
.sum()
|
||||
}
|
||||
|
||||
fn overlaps_at(&self, range: &Range<Key>) -> Option<usize> {
|
||||
match self.ranges.binary_search_by_key(&range.end, |r| r.start) {
|
||||
Ok(0) => None,
|
||||
Err(0) => None,
|
||||
Ok(index) if self.ranges[index - 1].end > range.start => Some(index - 1),
|
||||
Err(index) if self.ranges[index - 1].end > range.start => Some(index - 1),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Check if key space contains overlapping range
|
||||
///
|
||||
pub fn overlaps(&self, range: &Range<Key>) -> bool {
|
||||
match self.ranges.binary_search_by_key(&range.end, |r| r.start) {
|
||||
Ok(0) => false,
|
||||
Err(0) => false,
|
||||
Ok(index) => self.ranges[index - 1].end > range.start,
|
||||
Err(index) => self.ranges[index - 1].end > range.start,
|
||||
}
|
||||
self.overlaps_at(range).is_some()
|
||||
}
|
||||
}
|
||||
|
||||
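The four branches in `remove_overlapping_with` above boil down to trimming, splitting, or dropping the kept range depending on how the other range covers it. A simplified, hypothetical illustration over plain u32 ranges (not the real pageserver `Key` type):

```rust
// Each branch mirrors one case in remove_overlapping_with above.
use std::ops::Range;

fn remove_overlap(kept: Range<u32>, other: &Range<u32>) -> Vec<Range<u32>> {
    if other.end <= kept.start || other.start >= kept.end {
        vec![kept] // no overlap: keep as-is
    } else if other.start <= kept.start && other.end >= kept.end {
        vec![] // whole range overlapped: remove it
    } else if other.start > kept.start && other.end < kept.end {
        vec![kept.start..other.start, other.end..kept.end] // middle overlapped: split
    } else if other.start <= kept.start {
        vec![other.end..kept.end] // lower part overlapped: trim the start
    } else {
        vec![kept.start..other.start] // higher part overlapped: trim the end
    }
}

fn main() {
    assert_eq!(remove_overlap(1..4, &(2..3)), vec![1..2, 3..4]); // split
    assert_eq!(remove_overlap(10..12, &(11..13)), vec![10..11]); // trim end
    assert_eq!(remove_overlap(2..3, &(1..4)), Vec::<Range<u32>>::new()); // drop
}
```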
@@ -104,6 +172,7 @@ pub struct KeySpaceAccum {
|
||||
accum: Option<Range<Key>>,
|
||||
|
||||
ranges: Vec<Range<Key>>,
|
||||
size: u64,
|
||||
}
|
||||
|
||||
impl KeySpaceAccum {
|
||||
@@ -111,6 +180,7 @@ impl KeySpaceAccum {
|
||||
Self {
|
||||
accum: None,
|
||||
ranges: Vec::new(),
|
||||
size: 0,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -121,6 +191,8 @@ impl KeySpaceAccum {
|
||||
|
||||
#[inline(always)]
|
||||
pub fn add_range(&mut self, range: Range<Key>) {
|
||||
self.size += key_range_size(&range) as u64;
|
||||
|
||||
match self.accum.as_mut() {
|
||||
Some(accum) => {
|
||||
if range.start == accum.end {
|
||||
@@ -146,6 +218,23 @@ impl KeySpaceAccum {
|
||||
ranges: self.ranges,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn consume_keyspace(&mut self) -> KeySpace {
|
||||
if let Some(accum) = self.accum.take() {
|
||||
self.ranges.push(accum);
|
||||
}
|
||||
|
||||
let mut prev_accum = KeySpaceAccum::new();
|
||||
std::mem::swap(self, &mut prev_accum);
|
||||
|
||||
KeySpace {
|
||||
ranges: prev_accum.ranges,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn size(&self) -> u64 {
|
||||
self.size
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
@@ -254,6 +343,30 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn keyspace_consume() {
|
||||
let ranges = vec![kr(0..10), kr(20..35), kr(40..45)];
|
||||
|
||||
let mut accum = KeySpaceAccum::new();
|
||||
for range in &ranges {
|
||||
accum.add_range(range.clone());
|
||||
}
|
||||
|
||||
let expected_size: u64 = ranges.iter().map(|r| key_range_size(r) as u64).sum();
|
||||
assert_eq!(accum.size(), expected_size);
|
||||
|
||||
assert_ks_eq(&accum.consume_keyspace(), ranges.clone());
|
||||
assert_eq!(accum.size(), 0);
|
||||
|
||||
assert_ks_eq(&accum.consume_keyspace(), vec![]);
|
||||
assert_eq!(accum.size(), 0);
|
||||
|
||||
for range in &ranges {
|
||||
accum.add_range(range.clone());
|
||||
}
|
||||
assert_ks_eq(&accum.to_keyspace(), ranges);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn keyspace_add_range() {
|
||||
// two separate ranges
|
||||
@@ -396,4 +509,118 @@ mod tests {
|
||||
// xxxxxxxxxxx
|
||||
assert!(ks.overlaps(&kr(0..30))); // XXXXX This fails currently!
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_remove_full_overlapps() {
|
||||
let mut key_space1 = KeySpace {
|
||||
ranges: vec![
|
||||
Key::from_i128(1)..Key::from_i128(4),
|
||||
Key::from_i128(5)..Key::from_i128(8),
|
||||
Key::from_i128(10)..Key::from_i128(12),
|
||||
],
|
||||
};
|
||||
let key_space2 = KeySpace {
|
||||
ranges: vec![
|
||||
Key::from_i128(2)..Key::from_i128(3),
|
||||
Key::from_i128(6)..Key::from_i128(7),
|
||||
Key::from_i128(11)..Key::from_i128(13),
|
||||
],
|
||||
};
|
||||
key_space1.remove_overlapping_with(&key_space2);
|
||||
assert_eq!(
|
||||
key_space1.ranges,
|
||||
vec![
|
||||
Key::from_i128(1)..Key::from_i128(2),
|
||||
Key::from_i128(3)..Key::from_i128(4),
|
||||
Key::from_i128(5)..Key::from_i128(6),
|
||||
Key::from_i128(7)..Key::from_i128(8),
|
||||
Key::from_i128(10)..Key::from_i128(11)
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_remove_partial_overlaps() {
|
||||
// Test partial overlaps
|
||||
let mut key_space1 = KeySpace {
|
||||
ranges: vec![
|
||||
Key::from_i128(1)..Key::from_i128(5),
|
||||
Key::from_i128(7)..Key::from_i128(10),
|
||||
Key::from_i128(12)..Key::from_i128(15),
|
||||
],
|
||||
};
|
||||
let key_space2 = KeySpace {
|
||||
ranges: vec![
|
||||
Key::from_i128(3)..Key::from_i128(6),
|
||||
Key::from_i128(8)..Key::from_i128(11),
|
||||
Key::from_i128(14)..Key::from_i128(17),
|
||||
],
|
||||
};
|
||||
key_space1.remove_overlapping_with(&key_space2);
|
||||
assert_eq!(
|
||||
key_space1.ranges,
|
||||
vec![
|
||||
Key::from_i128(1)..Key::from_i128(3),
|
||||
Key::from_i128(7)..Key::from_i128(8),
|
||||
Key::from_i128(12)..Key::from_i128(14),
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_remove_no_overlaps() {
|
||||
let mut key_space1 = KeySpace {
|
||||
ranges: vec![
|
||||
Key::from_i128(1)..Key::from_i128(5),
|
||||
Key::from_i128(7)..Key::from_i128(10),
|
||||
Key::from_i128(12)..Key::from_i128(15),
|
||||
],
|
||||
};
|
||||
let key_space2 = KeySpace {
|
||||
ranges: vec![
|
||||
Key::from_i128(6)..Key::from_i128(7),
|
||||
Key::from_i128(11)..Key::from_i128(12),
|
||||
Key::from_i128(15)..Key::from_i128(17),
|
||||
],
|
||||
};
|
||||
key_space1.remove_overlapping_with(&key_space2);
|
||||
assert_eq!(
|
||||
key_space1.ranges,
|
||||
vec![
|
||||
Key::from_i128(1)..Key::from_i128(5),
|
||||
Key::from_i128(7)..Key::from_i128(10),
|
||||
Key::from_i128(12)..Key::from_i128(15),
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_remove_one_range_overlaps_multiple() {
|
||||
let mut key_space1 = KeySpace {
|
||||
ranges: vec![
|
||||
Key::from_i128(1)..Key::from_i128(3),
|
||||
Key::from_i128(3)..Key::from_i128(6),
|
||||
Key::from_i128(6)..Key::from_i128(10),
|
||||
Key::from_i128(12)..Key::from_i128(15),
|
||||
Key::from_i128(17)..Key::from_i128(20),
|
||||
Key::from_i128(20)..Key::from_i128(30),
|
||||
Key::from_i128(30)..Key::from_i128(40),
|
||||
],
|
||||
};
|
||||
let key_space2 = KeySpace {
|
||||
ranges: vec![Key::from_i128(9)..Key::from_i128(19)],
|
||||
};
|
||||
key_space1.remove_overlapping_with(&key_space2);
|
||||
assert_eq!(
|
||||
key_space1.ranges,
|
||||
vec![
|
||||
Key::from_i128(1)..Key::from_i128(3),
|
||||
Key::from_i128(3)..Key::from_i128(6),
|
||||
Key::from_i128(6)..Key::from_i128(9),
|
||||
Key::from_i128(19)..Key::from_i128(20),
|
||||
Key::from_i128(20)..Key::from_i128(30),
|
||||
Key::from_i128(30)..Key::from_i128(40),
|
||||
]
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,6 +8,7 @@ use std::{
|
||||
};
|
||||
|
||||
use byteorder::{BigEndian, ReadBytesExt};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::serde_as;
|
||||
use strum_macros;
|
||||
@@ -271,6 +272,7 @@ pub struct TenantConfig {
|
||||
pub evictions_low_residence_duration_metric_threshold: Option<String>,
|
||||
pub gc_feedback: Option<bool>,
|
||||
pub heatmap_period: Option<String>,
|
||||
pub lazy_slru_download: Option<bool>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
@@ -364,6 +366,19 @@ pub struct TenantLocationConfigRequest {
|
||||
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantShardLocation {
|
||||
pub shard_id: TenantShardId,
|
||||
pub node_id: NodeId,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantLocationConfigResponse {
|
||||
pub shards: Vec<TenantShardLocation>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantConfigRequest {
|
||||
@@ -439,6 +454,8 @@ pub struct TenantDetails {
|
||||
#[serde(flatten)]
|
||||
pub tenant_info: TenantInfo,
|
||||
|
||||
pub walredo: Option<WalRedoManagerStatus>,
|
||||
|
||||
pub timelines: Vec<TimelineId>,
|
||||
}
|
||||
|
||||
@@ -626,6 +643,12 @@ pub struct TimelineGcRequest {
|
||||
pub gc_horizon: Option<u64>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct WalRedoManagerStatus {
|
||||
pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
|
||||
pub pid: Option<u32>,
|
||||
}
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
#[derive(PartialEq, Eq, Debug)]
|
||||
pub enum PagestreamFeMessage {
|
||||
@@ -633,6 +656,7 @@ pub enum PagestreamFeMessage {
|
||||
Nblocks(PagestreamNblocksRequest),
|
||||
GetPage(PagestreamGetPageRequest),
|
||||
DbSize(PagestreamDbSizeRequest),
|
||||
GetSlruSegment(PagestreamGetSlruSegmentRequest),
|
||||
}
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
@@ -643,6 +667,7 @@ pub enum PagestreamBeMessage {
|
||||
GetPage(PagestreamGetPageResponse),
|
||||
Error(PagestreamErrorResponse),
|
||||
DbSize(PagestreamDbSizeResponse),
|
||||
GetSlruSegment(PagestreamGetSlruSegmentResponse),
|
||||
}
|
||||
|
||||
// Keep in sync with `pagestore_client.h`
|
||||
@@ -653,6 +678,7 @@ enum PagestreamBeMessageTag {
|
||||
GetPage = 102,
|
||||
Error = 103,
|
||||
DbSize = 104,
|
||||
GetSlruSegment = 105,
|
||||
}
|
||||
impl TryFrom<u8> for PagestreamBeMessageTag {
|
||||
type Error = u8;
|
||||
@@ -663,6 +689,7 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
|
||||
102 => Ok(PagestreamBeMessageTag::GetPage),
|
||||
103 => Ok(PagestreamBeMessageTag::Error),
|
||||
104 => Ok(PagestreamBeMessageTag::DbSize),
|
||||
105 => Ok(PagestreamBeMessageTag::GetSlruSegment),
|
||||
_ => Err(value),
|
||||
}
|
||||
}
|
||||
@@ -697,6 +724,14 @@ pub struct PagestreamDbSizeRequest {
|
||||
pub dbnode: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub struct PagestreamGetSlruSegmentRequest {
|
||||
pub latest: bool,
|
||||
pub lsn: Lsn,
|
||||
pub kind: u8,
|
||||
pub segno: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamExistsResponse {
|
||||
pub exists: bool,
|
||||
@@ -712,6 +747,11 @@ pub struct PagestreamGetPageResponse {
|
||||
pub page: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamGetSlruSegmentResponse {
|
||||
pub segment: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamErrorResponse {
|
||||
pub message: String,
|
||||
@@ -775,6 +815,14 @@ impl PagestreamFeMessage {
|
||||
bytes.put_u64(req.lsn.0);
|
||||
bytes.put_u32(req.dbnode);
|
||||
}
|
||||
|
||||
Self::GetSlruSegment(req) => {
|
||||
bytes.put_u8(4);
|
||||
bytes.put_u8(u8::from(req.latest));
|
||||
bytes.put_u64(req.lsn.0);
|
||||
bytes.put_u8(req.kind);
|
||||
bytes.put_u32(req.segno);
|
||||
}
|
||||
}
|
||||
|
||||
bytes.into()
|
||||
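A hedged round-trip sketch for the new GetSlruSegment variant, assuming the surrounding serialize/parse helpers in this impl keep the shapes shown in the hunks above and below (serialize producing Bytes, parse reading from an io::Read). The field values are arbitrary examples:

```rust
fn getslru_roundtrip_sketch() -> anyhow::Result<()> {
    let req = PagestreamFeMessage::GetSlruSegment(PagestreamGetSlruSegmentRequest {
        latest: true,
        lsn: Lsn(0x1234_5678),
        kind: 0, // SlruKind::Clog on the wire
        segno: 7,
    });

    // Tag byte 4, then the latest flag, LSN, kind and segno, as written above.
    let bytes = req.serialize();
    // The matching parse arm added below reconstructs the same request.
    let parsed = PagestreamFeMessage::parse(&mut bytes.as_ref())?;
    assert_eq!(parsed, req);
    Ok(())
}
```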
@@ -825,6 +873,14 @@ impl PagestreamFeMessage {
|
||||
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
})),
|
||||
4 => Ok(PagestreamFeMessage::GetSlruSegment(
|
||||
PagestreamGetSlruSegmentRequest {
|
||||
latest: body.read_u8()? != 0,
|
||||
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
kind: body.read_u8()?,
|
||||
segno: body.read_u32::<BigEndian>()?,
|
||||
},
|
||||
)),
|
||||
_ => bail!("unknown smgr message tag: {:?}", msg_tag),
|
||||
}
|
||||
}
|
||||
@@ -860,6 +916,12 @@ impl PagestreamBeMessage {
|
||||
bytes.put_u8(Tag::DbSize as u8);
|
||||
bytes.put_i64(resp.db_size);
|
||||
}
|
||||
|
||||
Self::GetSlruSegment(resp) => {
|
||||
bytes.put_u8(Tag::GetSlruSegment as u8);
|
||||
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
|
||||
bytes.put(&resp.segment[..]);
|
||||
}
|
||||
}
|
||||
|
||||
bytes.into()
|
||||
@@ -900,6 +962,14 @@ impl PagestreamBeMessage {
|
||||
let db_size = buf.read_i64::<BigEndian>()?;
|
||||
Self::DbSize(PagestreamDbSizeResponse { db_size })
|
||||
}
|
||||
Tag::GetSlruSegment => {
|
||||
let n_blocks = buf.read_u32::<BigEndian>()?;
|
||||
let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize];
|
||||
buf.read_exact(&mut segment)?;
|
||||
Self::GetSlruSegment(PagestreamGetSlruSegmentResponse {
|
||||
segment: segment.into(),
|
||||
})
|
||||
}
|
||||
};
|
||||
let remaining = buf.into_inner();
|
||||
if !remaining.is_empty() {
|
||||
@@ -918,6 +988,7 @@ impl PagestreamBeMessage {
|
||||
Self::GetPage(_) => "GetPage",
|
||||
Self::Error(_) => "Error",
|
||||
Self::DbSize(_) => "DbSize",
|
||||
Self::GetSlruSegment(_) => "GetSlruSegment",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -111,9 +111,23 @@ impl RelTag {
|
||||
/// These files are divided into segments, which are divided into
|
||||
/// pages of the same BLCKSZ as used for relation files.
|
||||
///
|
||||
#[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
#[derive(
|
||||
Debug,
|
||||
Clone,
|
||||
Copy,
|
||||
Hash,
|
||||
Serialize,
|
||||
Deserialize,
|
||||
PartialEq,
|
||||
Eq,
|
||||
PartialOrd,
|
||||
Ord,
|
||||
strum_macros::EnumIter,
|
||||
strum_macros::FromRepr,
|
||||
)]
|
||||
#[repr(u8)]
|
||||
pub enum SlruKind {
|
||||
Clog,
|
||||
Clog = 0,
|
||||
MultiXactMembers,
|
||||
MultiXactOffsets,
|
||||
}
|
||||
|
||||
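The explicit `Clog = 0` discriminant plus `strum_macros::FromRepr` above make it possible to map the raw `kind` byte of a GetSlruSegment request back into the enum. A small assumed-usage sketch:

```rust
// from_repr() is generated by strum_macros::FromRepr and returns None for
// bytes that do not correspond to a variant.
fn decode_slru_kind(kind: u8) -> Option<SlruKind> {
    SlruKind::from_repr(kind)
}

fn example() {
    assert_eq!(decode_slru_kind(0), Some(SlruKind::Clog));
    assert_eq!(decode_slru_kind(1), Some(SlruKind::MultiXactMembers));
    assert!(decode_slru_kind(42).is_none());
}
```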
@@ -207,10 +207,16 @@ pub fn find_end_of_wal(
|
||||
let seg_offs = curr_lsn.segment_offset(wal_seg_size);
|
||||
segment.seek(SeekFrom::Start(seg_offs as u64))?;
|
||||
// loop inside segment
|
||||
loop {
|
||||
while curr_lsn.segment_number(wal_seg_size) == segno {
|
||||
let bytes_read = segment.read(&mut buf)?;
|
||||
if bytes_read == 0 {
|
||||
break; // EOF
|
||||
debug!(
|
||||
"find_end_of_wal reached end at {:?}, EOF in segment {:?} at offset {}",
|
||||
result,
|
||||
seg_file_path,
|
||||
curr_lsn.segment_offset(wal_seg_size)
|
||||
);
|
||||
return Ok(result);
|
||||
}
|
||||
curr_lsn += bytes_read as u64;
|
||||
decoder.feed_bytes(&buf[0..bytes_read]);
|
||||
|
||||
@@ -10,6 +10,7 @@ byteorder.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
rand.workspace = true
|
||||
smallvec.workspace = true
|
||||
tokio.workspace = true
|
||||
tracing.workspace = true
|
||||
thiserror.workspace = true
|
||||
|
||||
@@ -7,7 +7,8 @@ pub mod framed;
|
||||
|
||||
use byteorder::{BigEndian, ReadBytesExt};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use std::{borrow::Cow, collections::HashMap, fmt, io, str};
|
||||
use smallvec::SmallVec;
|
||||
use std::{borrow::Cow, fmt, io, ops::Range, str};
|
||||
|
||||
// re-export for use in utils pageserver_feedback.rs
|
||||
pub use postgres_protocol::PG_EPOCH;
|
||||
@@ -49,29 +50,67 @@ pub enum FeStartupPacket {
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct StartupMessageParams {
|
||||
params: HashMap<String, String>,
|
||||
data: String,
|
||||
pairs: SmallVec<[Range<u32>; 4]>,
|
||||
// for easy access
|
||||
user: Option<Range<u32>>,
|
||||
database: Option<Range<u32>>,
|
||||
options: Option<Range<u32>>,
|
||||
replication: Option<Range<u32>>,
|
||||
}
|
||||
|
||||
impl fmt::Debug for StartupMessageParams {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.debug_map().entries(self.iter()).finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl StartupMessageParams {
|
||||
/// Get parameter's value by its name.
|
||||
pub fn get(&self, name: &str) -> Option<&str> {
|
||||
self.params.get(name).map(|s| s.as_str())
|
||||
self.pairs
|
||||
.iter()
|
||||
.map(|r| &self.data[r.start as usize..r.end as usize])
|
||||
.find_map(|pair| pair.strip_prefix(name).and_then(|x| x.strip_prefix('\0')))
|
||||
}
|
||||
|
||||
pub fn user(&self) -> Option<&str> {
|
||||
self.user
|
||||
.clone()
|
||||
.and_then(|r| self.data.get(r.start as usize..r.end as usize))
|
||||
}
|
||||
|
||||
pub fn database(&self) -> Option<&str> {
|
||||
self.database
|
||||
.clone()
|
||||
.and_then(|r| self.data.get(r.start as usize..r.end as usize))
|
||||
}
|
||||
|
||||
pub(crate) fn options_str(&self) -> Option<&str> {
|
||||
self.options
|
||||
.clone()
|
||||
.and_then(|r| self.data.get(r.start as usize..r.end as usize))
|
||||
}
|
||||
|
||||
pub fn replication(&self) -> Option<&str> {
|
||||
self.replication
|
||||
.clone()
|
||||
.and_then(|r| self.data.get(r.start as usize..r.end as usize))
|
||||
}
|
||||
|
||||
/// Split command-line options according to PostgreSQL's logic,
|
||||
/// taking into account all escape sequences but leaving them as-is.
|
||||
/// [`None`] means that there's no `options` in [`Self`].
|
||||
pub fn options_raw(&self) -> Option<impl Iterator<Item = &str>> {
|
||||
self.get("options").map(Self::parse_options_raw)
|
||||
self.options_str().map(Self::parse_options_raw)
|
||||
}
|
||||
|
||||
/// Split command-line options according to PostgreSQL's logic,
|
||||
/// applying all escape sequences (using owned strings as needed).
|
||||
/// [`None`] means that there's no `options` in [`Self`].
|
||||
pub fn options_escaped(&self) -> Option<impl Iterator<Item = Cow<'_, str>>> {
|
||||
self.get("options").map(Self::parse_options_escaped)
|
||||
self.options_str().map(Self::parse_options_escaped)
|
||||
}
|
||||
|
||||
/// Split command-line options according to PostgreSQL's logic,
|
||||
@@ -111,15 +150,44 @@ impl StartupMessageParams {
|
||||
|
||||
/// Iterate through key-value pairs in an arbitrary order.
|
||||
pub fn iter(&self) -> impl Iterator<Item = (&str, &str)> {
|
||||
self.params.iter().map(|(k, v)| (k.as_str(), v.as_str()))
|
||||
self.pairs
|
||||
.iter()
|
||||
.map(|r| &self.data[r.start as usize..r.end as usize])
|
||||
.flat_map(|pair| pair.split_once('\0'))
|
||||
}
|
||||
|
||||
// This function is mostly useful in tests.
|
||||
#[doc(hidden)]
|
||||
pub fn new<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> Self {
|
||||
Self {
|
||||
params: pairs.map(|(k, v)| (k.to_owned(), v.to_owned())).into(),
|
||||
let mut this = Self {
|
||||
data: Default::default(),
|
||||
pairs: Default::default(),
|
||||
user: Default::default(),
|
||||
database: Default::default(),
|
||||
options: Default::default(),
|
||||
replication: Default::default(),
|
||||
};
|
||||
for (k, v) in pairs {
|
||||
let start = this.data.len();
|
||||
this.data.push_str(k);
|
||||
this.data.push('\0');
|
||||
let value_offset = this.data.len();
|
||||
this.data.push_str(v);
|
||||
let end = this.data.len();
|
||||
this.data.push('\0');
|
||||
let range = start as u32..end as u32;
|
||||
this.pairs.push(range);
|
||||
let value_range = value_offset as u32..end as u32;
|
||||
match k {
|
||||
"user" => this.user = Some(value_range),
|
||||
"database" => this.database = Some(value_range),
|
||||
"options" => this.options = Some(value_range),
|
||||
"replication" => this.replication = Some(value_range),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
this.data.push('\0');
|
||||
this
|
||||
}
|
||||
}
|
||||
|
||||
@@ -346,33 +414,62 @@ impl FeStartupPacket {
|
||||
|
||||
// Parse pairs of null-terminated strings (key, value).
|
||||
// See `postgres: ProcessStartupPacket, build_startup_packet`.
|
||||
let mut tokens = str::from_utf8(&msg)
|
||||
let data = str::from_utf8(&msg)
|
||||
.map_err(|_e| {
|
||||
ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned())
|
||||
})?
|
||||
.strip_suffix('\0') // drop packet's own null
|
||||
.ok_or_else(|| {
|
||||
ProtocolError::Protocol(
|
||||
.to_owned();
|
||||
|
||||
let mut params = StartupMessageParams {
|
||||
data,
|
||||
pairs: Default::default(),
|
||||
user: Default::default(),
|
||||
database: Default::default(),
|
||||
options: Default::default(),
|
||||
replication: Default::default(),
|
||||
};
|
||||
|
||||
let mut offset = 0;
|
||||
let mut rest = params.data.as_str();
|
||||
loop {
|
||||
let Some((key, rest1)) = rest.split_once('\0') else {
|
||||
return Err(ProtocolError::Protocol(
|
||||
"StartupMessage params: missing null terminator".to_string(),
|
||||
)
|
||||
})?
|
||||
.split_terminator('\0');
|
||||
));
|
||||
};
|
||||
// pairs terminated
|
||||
if key.is_empty() {
|
||||
params.data.truncate(offset + 1);
|
||||
params.data.shrink_to_fit();
|
||||
break;
|
||||
}
|
||||
let Some((value, rest2)) = rest1.split_once('\0') else {
|
||||
return Err(ProtocolError::Protocol(
|
||||
"StartupMessage params: missing null terminator".to_string(),
|
||||
));
|
||||
};
|
||||
rest = rest2;
|
||||
|
||||
let mut params = HashMap::new();
|
||||
while let Some(name) = tokens.next() {
|
||||
let value = tokens.next().ok_or_else(|| {
|
||||
ProtocolError::Protocol(
|
||||
"StartupMessage params: key without value".to_string(),
|
||||
)
|
||||
})?;
|
||||
let start = offset;
|
||||
let value_offset = offset + key.len() + 1;
|
||||
let end = value_offset + value.len();
|
||||
offset = end + 1;
|
||||
|
||||
params.insert(name.to_owned(), value.to_owned());
|
||||
params.pairs.push(start as u32..end as u32);
|
||||
let value_range = value_offset as u32..end as u32;
|
||||
match key {
|
||||
"user" => params.user = Some(value_range),
|
||||
"database" => params.database = Some(value_range),
|
||||
"options" => params.options = Some(value_range),
|
||||
"replication" => params.replication = Some(value_range),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
FeStartupPacket::StartupMessage {
|
||||
major_version,
|
||||
minor_version,
|
||||
params: StartupMessageParams { params },
|
||||
params,
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
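The rework above stores every "key\0value" pair in one backing String and addresses it by byte ranges, so `get()` walks the ranges instead of hitting a HashMap. A standalone, hypothetical miniature of that layout (names are illustrative, not the real pq_proto types):

```rust
// Flattened parameter storage: one String plus a range per "key\0value" pair.
struct FlatParams {
    data: String,
    pairs: Vec<std::ops::Range<u32>>,
}

impl FlatParams {
    fn get(&self, name: &str) -> Option<&str> {
        self.pairs
            .iter()
            .map(|r| &self.data[r.start as usize..r.end as usize])
            // A pair matches only if the key is followed immediately by '\0',
            // so "user" does not match a "username" key.
            .find_map(|pair| pair.strip_prefix(name).and_then(|rest| rest.strip_prefix('\0')))
    }
}

fn main() {
    let data = "user\0admin\0database\0neondb\0".to_string();
    let params = FlatParams {
        pairs: vec![0..10, 11..26], // "user\0admin", "database\0neondb"
        data,
    };
    assert_eq!(params.get("user"), Some("admin"));
    assert_eq!(params.get("database"), Some("neondb"));
    assert_eq!(params.get("options"), None);
}
```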
@@ -8,6 +8,7 @@ use std::pin::Pin;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::time::SystemTime;
|
||||
|
||||
use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
|
||||
use anyhow::Result;
|
||||
@@ -23,9 +24,11 @@ use futures::stream::Stream;
|
||||
use futures_util::StreamExt;
|
||||
use http_types::{StatusCode, Url};
|
||||
use tokio::time::Instant;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::s3_bucket::RequestKind;
|
||||
use crate::TimeTravelError;
|
||||
use crate::{
|
||||
AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath,
|
||||
RemoteStorage, StorageMetadata,
|
||||
@@ -183,7 +186,6 @@ fn to_download_error(error: azure_core::Error) -> DownloadError {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for AzureBlobStorage {
|
||||
async fn list(
|
||||
&self,
|
||||
@@ -371,6 +373,18 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
copy_status = status;
|
||||
}
|
||||
}
|
||||
|
||||
async fn time_travel_recover(
|
||||
&self,
|
||||
_prefix: Option<&RemotePath>,
|
||||
_timestamp: SystemTime,
|
||||
_done_if_after: SystemTime,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<(), TimeTravelError> {
|
||||
// TODO use Azure point in time recovery feature for this
|
||||
// https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
|
||||
Err(TimeTravelError::Unimplemented)
|
||||
}
|
||||
}
|
||||
|
||||
pin_project_lite::pin_project! {
|
||||
|
||||
@@ -25,6 +25,7 @@ use bytes::Bytes;
|
||||
use futures::stream::Stream;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::sync::Semaphore;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use toml_edit::Item;
|
||||
use tracing::info;
|
||||
|
||||
@@ -142,7 +143,7 @@ pub struct Listing {
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
/// This storage tries to be unaware of any layered repository context,
|
||||
/// providing basic CRUD operations for storage files.
|
||||
#[async_trait::async_trait]
|
||||
#[allow(async_fn_in_trait)]
|
||||
pub trait RemoteStorage: Send + Sync + 'static {
|
||||
/// Lists all top level subdirectories for a given prefix
|
||||
/// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
|
||||
@@ -210,6 +211,15 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
|
||||
/// Copy a remote object inside a bucket from one path to another.
|
||||
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>;
|
||||
|
||||
/// Resets the content of everything with the given prefix to the given state
|
||||
async fn time_travel_recover(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
timestamp: SystemTime,
|
||||
done_if_after: SystemTime,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<(), TimeTravelError>;
|
||||
}
|
||||
|
||||
pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
|
||||
@@ -259,17 +269,57 @@ impl std::fmt::Display for DownloadError {
|
||||
|
||||
impl std::error::Error for DownloadError {}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum TimeTravelError {
|
||||
/// Validation or other error happened due to user input.
|
||||
BadInput(anyhow::Error),
|
||||
/// The used remote storage does not have time travel recovery implemented
|
||||
Unimplemented,
|
||||
/// The number of versions/deletion markers is above our limit.
|
||||
TooManyVersions,
|
||||
/// A cancellation token aborted the process, typically during
|
||||
/// request closure or process shutdown.
|
||||
Cancelled,
|
||||
/// Other errors
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
impl std::fmt::Display for TimeTravelError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
TimeTravelError::BadInput(e) => {
|
||||
write!(
|
||||
f,
|
||||
"Failed to time travel recover a prefix due to user input: {e}"
|
||||
)
|
||||
}
|
||||
TimeTravelError::Unimplemented => write!(
|
||||
f,
|
||||
"time travel recovery is not implemented for the current storage backend"
|
||||
),
|
||||
TimeTravelError::Cancelled => write!(f, "Cancelled, shutting down"),
|
||||
TimeTravelError::TooManyVersions => {
|
||||
write!(f, "Number of versions/delete markers above limit")
|
||||
}
|
||||
TimeTravelError::Other(e) => write!(f, "Failed to time travel recover a prefix: {e:?}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for TimeTravelError {}
|
||||
|
||||
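A hedged sketch, written as if from inside the remote_storage crate, of how a caller might drive the new `time_travel_recover` method and map its error variants; the one-hour window and the error messages are placeholders, not part of this diff:

```rust
use std::time::{Duration, SystemTime};

use tokio_util::sync::CancellationToken;

use crate::{GenericRemoteStorage, RemotePath, TimeTravelError};

async fn recover_prefix(
    storage: &GenericRemoteStorage,
    prefix: &RemotePath,
) -> anyhow::Result<()> {
    // Roll everything under `prefix` back to its state one hour ago; anything
    // written after `done_if_after` is treated as already recovered.
    let timestamp = SystemTime::now() - Duration::from_secs(3600);
    let done_if_after = SystemTime::now();

    match storage
        .time_travel_recover(Some(prefix), timestamp, done_if_after, CancellationToken::new())
        .await
    {
        Ok(()) => Ok(()),
        Err(TimeTravelError::Unimplemented) => {
            anyhow::bail!("this storage backend has no time travel support")
        }
        Err(TimeTravelError::Cancelled) => anyhow::bail!("cancelled during recovery"),
        Err(other) => Err(anyhow::anyhow!("time travel recovery failed: {other}")),
    }
}
```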
/// Every storage, currently supported.
|
||||
/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
|
||||
#[derive(Clone)]
|
||||
pub enum GenericRemoteStorage {
|
||||
// Require Clone for `Other` due to https://github.com/rust-lang/rust/issues/26925
|
||||
pub enum GenericRemoteStorage<Other: Clone = Arc<UnreliableWrapper>> {
|
||||
LocalFs(LocalFs),
|
||||
AwsS3(Arc<S3Bucket>),
|
||||
AzureBlob(Arc<AzureBlobStorage>),
|
||||
Unreliable(Arc<UnreliableWrapper>),
|
||||
Unreliable(Other),
|
||||
}
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
pub async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
@@ -386,6 +436,33 @@ impl GenericRemoteStorage {
|
||||
Self::Unreliable(s) => s.copy(from, to).await,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn time_travel_recover(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
timestamp: SystemTime,
|
||||
done_if_after: SystemTime,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<(), TimeTravelError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => {
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
.await
|
||||
}
|
||||
Self::AwsS3(s) => {
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
.await
|
||||
}
|
||||
Self::AzureBlob(s) => {
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
.await
|
||||
}
|
||||
Self::Unreliable(s) => {
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
.await
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
@@ -396,7 +473,12 @@ impl GenericRemoteStorage {
|
||||
Self::LocalFs(LocalFs::new(root.clone())?)
|
||||
}
|
||||
RemoteStorageKind::AwsS3(s3_config) => {
|
||||
info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'",
|
||||
// The profile and access key id are only printed here for debugging purposes,
|
||||
// their values don't indicate the eventually taken choice for auth.
|
||||
let profile = std::env::var("AWS_PROFILE").unwrap_or_else(|_| "<none>".into());
|
||||
let access_key_id =
|
||||
std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "<none>".into());
|
||||
info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}",
|
||||
s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
|
||||
Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?))
|
||||
}
|
||||
@@ -673,6 +755,7 @@ impl ConcurrencyLimiter {
|
||||
RequestKind::List => &self.read,
|
||||
RequestKind::Delete => &self.write,
|
||||
RequestKind::Copy => &self.write,
|
||||
RequestKind::TimeTravel => &self.write,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
//! This storage used in tests, but can also be used in cases when a certain persistent
|
||||
//! volume is mounted to the local FS.
|
||||
|
||||
use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin};
|
||||
use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin, time::SystemTime};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use bytes::Bytes;
|
||||
@@ -14,11 +14,13 @@ use tokio::{
|
||||
fs,
|
||||
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
|
||||
};
|
||||
use tokio_util::io::ReaderStream;
|
||||
use tokio_util::{io::ReaderStream, sync::CancellationToken};
|
||||
use tracing::*;
|
||||
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
|
||||
|
||||
use crate::{Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath};
|
||||
use crate::{
|
||||
Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath, TimeTravelError,
|
||||
};
|
||||
|
||||
use super::{RemoteStorage, StorageMetadata};
|
||||
|
||||
@@ -157,7 +159,6 @@ impl LocalFs {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for LocalFs {
|
||||
async fn list(
|
||||
&self,
|
||||
@@ -423,6 +424,17 @@ impl RemoteStorage for LocalFs {
|
||||
})?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[allow(clippy::diverging_sub_expression)]
|
||||
async fn time_travel_recover(
|
||||
&self,
|
||||
_prefix: Option<&RemotePath>,
|
||||
_timestamp: SystemTime,
|
||||
_done_if_after: SystemTime,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<(), TimeTravelError> {
|
||||
Err(TimeTravelError::Unimplemented)
|
||||
}
|
||||
}
|
||||
|
||||
fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
|
||||
|
||||
@@ -6,12 +6,14 @@
|
||||
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
collections::HashMap,
|
||||
pin::Pin,
|
||||
sync::Arc,
|
||||
task::{Context, Poll},
|
||||
time::SystemTime,
|
||||
};
|
||||
|
||||
use anyhow::Context as _;
|
||||
use anyhow::{anyhow, Context as _};
|
||||
use aws_config::{
|
||||
environment::credentials::EnvironmentVariableCredentialsProvider,
|
||||
imds::credentials::ImdsCredentialsProvider,
|
||||
@@ -27,22 +29,24 @@ use aws_sdk_s3::{
|
||||
config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
|
||||
error::SdkError,
|
||||
operation::get_object::GetObjectError,
|
||||
types::{Delete, ObjectIdentifier},
|
||||
types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion},
|
||||
Client,
|
||||
};
|
||||
use aws_smithy_async::rt::sleep::TokioSleep;
|
||||
|
||||
use aws_smithy_types::body::SdkBody;
|
||||
use aws_smithy_types::byte_stream::ByteStream;
|
||||
use aws_smithy_types::{body::SdkBody, DateTime};
|
||||
use bytes::Bytes;
|
||||
use futures::stream::Stream;
|
||||
use hyper::Body;
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::backoff;
|
||||
|
||||
use super::StorageMetadata;
|
||||
use crate::{
|
||||
ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
|
||||
S3Config, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
S3Config, TimeTravelError, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
};
|
||||
|
||||
pub(super) mod metrics;
|
||||
@@ -270,6 +274,59 @@ impl S3Bucket {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete_oids(
|
||||
&self,
|
||||
kind: RequestKind,
|
||||
delete_objects: &[ObjectIdentifier],
|
||||
) -> anyhow::Result<()> {
|
||||
for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let resp = self
|
||||
.client
|
||||
.delete_objects()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.delete(
|
||||
Delete::builder()
|
||||
.set_objects(Some(chunk.to_vec()))
|
||||
.build()?,
|
||||
)
|
||||
.send()
|
||||
.await;
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &resp, started_at);
|
||||
|
||||
let resp = resp?;
|
||||
metrics::BUCKET_METRICS
|
||||
.deleted_objects_total
|
||||
.inc_by(chunk.len() as u64);
|
||||
if let Some(errors) = resp.errors {
|
||||
// Log a bounded number of the errors within the response:
|
||||
// these requests can carry 1000 keys so logging each one
|
||||
// would be too verbose, especially as errors may lead us
|
||||
// to retry repeatedly.
|
||||
const LOG_UP_TO_N_ERRORS: usize = 10;
|
||||
for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
|
||||
tracing::warn!(
|
||||
"DeleteObjects key {} failed: {}: {}",
|
||||
e.key.as_ref().map(Cow::from).unwrap_or("".into()),
|
||||
e.code.as_ref().map(Cow::from).unwrap_or("".into()),
|
||||
e.message.as_ref().map(Cow::from).unwrap_or("".into())
|
||||
);
|
||||
}
|
||||
|
||||
return Err(anyhow::format_err!(
|
||||
"Failed to delete {} objects",
|
||||
errors.len()
|
||||
));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pin_project_lite::pin_project! {
|
||||
@@ -373,7 +430,6 @@ impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for S3Bucket {
|
||||
async fn list(
|
||||
&self,
|
||||
@@ -569,64 +625,218 @@ impl RemoteStorage for S3Bucket {
|
||||
delete_objects.push(obj_id);
|
||||
}
|
||||
|
||||
for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let resp = self
|
||||
.client
|
||||
.delete_objects()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.delete(
|
||||
Delete::builder()
|
||||
.set_objects(Some(chunk.to_vec()))
|
||||
.build()?,
|
||||
)
|
||||
.send()
|
||||
.await;
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &resp, started_at);
|
||||
|
||||
match resp {
|
||||
Ok(resp) => {
|
||||
metrics::BUCKET_METRICS
|
||||
.deleted_objects_total
|
||||
.inc_by(chunk.len() as u64);
|
||||
if let Some(errors) = resp.errors {
|
||||
// Log a bounded number of the errors within the response:
|
||||
// these requests can carry 1000 keys so logging each one
|
||||
// would be too verbose, especially as errors may lead us
|
||||
// to retry repeatedly.
|
||||
const LOG_UP_TO_N_ERRORS: usize = 10;
|
||||
for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
|
||||
tracing::warn!(
|
||||
"DeleteObjects key {} failed: {}: {}",
|
||||
e.key.as_ref().map(Cow::from).unwrap_or("".into()),
|
||||
e.code.as_ref().map(Cow::from).unwrap_or("".into()),
|
||||
e.message.as_ref().map(Cow::from).unwrap_or("".into())
|
||||
);
|
||||
}
|
||||
|
||||
return Err(anyhow::format_err!(
|
||||
"Failed to delete {} objects",
|
||||
errors.len()
|
||||
));
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
self.delete_oids(kind, &delete_objects).await
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
let paths = std::array::from_ref(path);
|
||||
self.delete_objects(paths).await
|
||||
}
|
||||
|
||||
async fn time_travel_recover(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
timestamp: SystemTime,
|
||||
done_if_after: SystemTime,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<(), TimeTravelError> {
|
||||
let kind = RequestKind::TimeTravel;
|
||||
let _guard = self.permit(kind).await;
|
||||
|
||||
let timestamp = DateTime::from(timestamp);
|
||||
let done_if_after = DateTime::from(done_if_after);
|
||||
|
||||
tracing::trace!("Target time: {timestamp:?}, done_if_after {done_if_after:?}");
|
||||
|
||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||
let prefix = prefix
|
||||
.map(|p| self.relative_path_to_s3_object(p))
|
||||
.or_else(|| self.prefix_in_bucket.clone());
|
||||
|
||||
let warn_threshold = 3;
|
||||
let max_retries = 10;
|
||||
let is_permanent = |_e: &_| false;
|
||||
|
||||
let mut key_marker = None;
|
||||
let mut version_id_marker = None;
|
||||
let mut versions_and_deletes = Vec::new();
|
||||
|
||||
loop {
|
||||
let response = backoff::retry(
|
||||
|| async {
|
||||
self.client
|
||||
.list_object_versions()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.set_prefix(prefix.clone())
|
||||
.set_key_marker(key_marker.clone())
|
||||
.set_version_id_marker(version_id_marker.clone())
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| TimeTravelError::Other(e.into()))
|
||||
},
|
||||
is_permanent,
|
||||
warn_threshold,
|
||||
max_retries,
|
||||
"listing object versions for time_travel_recover",
|
||||
backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled),
|
||||
)
|
||||
.await?;
|
||||
|
||||
tracing::trace!(
|
||||
" Got List response version_id_marker={:?}, key_marker={:?}",
|
||||
response.version_id_marker,
|
||||
response.key_marker
|
||||
);
|
||||
let versions = response
|
||||
.versions
|
||||
.unwrap_or_default()
|
||||
.into_iter()
|
||||
.map(VerOrDelete::from_version);
|
||||
let deletes = response
|
||||
.delete_markers
|
||||
.unwrap_or_default()
|
||||
.into_iter()
|
||||
.map(VerOrDelete::from_delete_marker);
|
||||
itertools::process_results(versions.chain(deletes), |n_vds| {
|
||||
versions_and_deletes.extend(n_vds)
|
||||
})
|
||||
.map_err(TimeTravelError::Other)?;
|
||||
fn none_if_empty(v: Option<String>) -> Option<String> {
|
||||
v.filter(|v| !v.is_empty())
|
||||
}
|
||||
version_id_marker = none_if_empty(response.next_version_id_marker);
|
||||
key_marker = none_if_empty(response.next_key_marker);
|
||||
if version_id_marker.is_none() {
|
||||
// The final response is not supposed to be truncated
|
||||
if response.is_truncated.unwrap_or_default() {
|
||||
return Err(TimeTravelError::Other(anyhow::anyhow!(
|
||||
"Received truncated ListObjectVersions response for prefix={prefix:?}"
|
||||
)));
|
||||
}
|
||||
break;
|
||||
}
|
||||
// Limit the number of versions deletions, mostly so that we don't
|
||||
// keep requesting forever if the list is too long, as we'd put the
|
||||
// list in RAM.
|
||||
// Building a list of 100k entries that reaches the limit roughly takes
|
||||
// 40 seconds, and roughly corresponds to tenants of 2 TiB physical size.
|
||||
const COMPLEXITY_LIMIT: usize = 100_000;
|
||||
if versions_and_deletes.len() >= COMPLEXITY_LIMIT {
|
||||
return Err(TimeTravelError::TooManyVersions);
|
||||
}
|
||||
}
|
||||
|
||||
tracing::info!(
|
||||
"Built list for time travel with {} versions and deletions",
|
||||
versions_and_deletes.len()
|
||||
);
|
||||
|
||||
// Work on the list of references instead of the objects directly,
|
||||
// otherwise we get lifetime errors in the sort_by_key call below.
|
||||
let mut versions_and_deletes = versions_and_deletes.iter().collect::<Vec<_>>();
|
||||
|
||||
versions_and_deletes.sort_by_key(|vd| (&vd.key, &vd.last_modified));
|
||||
|
||||
let mut vds_for_key = HashMap::<_, Vec<_>>::new();
|
||||
|
||||
for vd in &versions_and_deletes {
|
||||
let VerOrDelete {
|
||||
version_id, key, ..
|
||||
} = &vd;
|
||||
if version_id == "null" {
|
||||
return Err(TimeTravelError::Other(anyhow!("Received ListVersions response for key={key} with version_id='null', \
|
||||
indicating either disabled versioning, or legacy objects with null version id values")));
|
||||
}
|
||||
tracing::trace!(
|
||||
"Parsing version key={key} version_id={version_id} kind={:?}",
|
||||
vd.kind
|
||||
);
|
||||
|
||||
vds_for_key.entry(key).or_default().push(vd);
|
||||
}
|
||||
for (key, versions) in vds_for_key {
|
||||
let last_vd = versions.last().unwrap();
|
||||
if last_vd.last_modified > done_if_after {
|
||||
tracing::trace!("Key {key} has version later than done_if_after, skipping");
|
||||
continue;
|
||||
}
|
||||
// the version we want to restore to.
|
||||
let version_to_restore_to =
|
||||
match versions.binary_search_by_key(×tamp, |tpl| tpl.last_modified) {
|
||||
Ok(v) => v,
|
||||
Err(e) => e,
|
||||
};
|
||||
if version_to_restore_to == versions.len() {
|
||||
tracing::trace!("Key {key} has no changes since timestamp, skipping");
|
||||
continue;
|
||||
}
|
||||
let mut do_delete = false;
|
||||
if version_to_restore_to == 0 {
|
||||
// All versions more recent, so the key didn't exist at the specified time point.
|
||||
tracing::trace!(
|
||||
"All {} versions more recent for {key}, deleting",
|
||||
versions.len()
|
||||
);
|
||||
do_delete = true;
|
||||
} else {
|
||||
match &versions[version_to_restore_to - 1] {
|
||||
VerOrDelete {
|
||||
kind: VerOrDeleteKind::Version,
|
||||
version_id,
|
||||
..
|
||||
} => {
|
||||
tracing::trace!("Copying old version {version_id} for {key}...");
|
||||
// Restore the state to the last version by copying
|
||||
let source_id =
|
||||
format!("{}/{key}?versionId={version_id}", self.bucket_name);
|
||||
|
||||
backoff::retry(
|
||||
|| async {
|
||||
self.client
|
||||
.copy_object()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.key(key)
|
||||
.copy_source(&source_id)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| TimeTravelError::Other(e.into()))
|
||||
},
|
||||
is_permanent,
|
||||
warn_threshold,
|
||||
max_retries,
|
||||
"copying object version for time_travel_recover",
|
||||
backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled),
|
||||
)
|
||||
.await?;
|
||||
tracing::info!(%version_id, %key, "Copied old version in S3");
|
||||
}
|
||||
VerOrDelete {
|
||||
kind: VerOrDeleteKind::DeleteMarker,
|
||||
..
|
||||
} => {
|
||||
do_delete = true;
|
||||
}
|
||||
}
|
||||
};
|
||||
if do_delete {
|
||||
if matches!(last_vd.kind, VerOrDeleteKind::DeleteMarker) {
|
||||
// Key has since been deleted (but there was some history), no need to do anything
|
||||
tracing::trace!("Key {key} already deleted, skipping.");
|
||||
} else {
|
||||
tracing::trace!("Deleting {key}...");
|
||||
|
||||
let oid = ObjectIdentifier::builder()
|
||||
.key(key.to_owned())
|
||||
.build()
|
||||
.map_err(|e| TimeTravelError::Other(anyhow::Error::new(e)))?;
|
||||
self.delete_oids(kind, &[oid])
|
||||
.await
|
||||
.map_err(TimeTravelError::Other)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
@@ -651,6 +861,62 @@ fn start_measuring_requests(
    })
}

// Save RAM and only store the needed data instead of the entire ObjectVersion/DeleteMarkerEntry
struct VerOrDelete {
    kind: VerOrDeleteKind,
    last_modified: DateTime,
    version_id: String,
    key: String,
}

#[derive(Debug)]
enum VerOrDeleteKind {
    Version,
    DeleteMarker,
}

impl VerOrDelete {
    fn with_kind(
        kind: VerOrDeleteKind,
        last_modified: Option<DateTime>,
        version_id: Option<String>,
        key: Option<String>,
    ) -> anyhow::Result<Self> {
        let lvk = (last_modified, version_id, key);
        let (Some(last_modified), Some(version_id), Some(key)) = lvk else {
            anyhow::bail!(
                "One (or more) of last_modified, key, and id is None. \
                Is versioning enabled in the bucket? last_modified={:?}, version_id={:?}, key={:?}",
                lvk.0,
                lvk.1,
                lvk.2,
            );
        };
        Ok(Self {
            kind,
            last_modified,
            version_id,
            key,
        })
    }
    fn from_version(v: ObjectVersion) -> anyhow::Result<Self> {
        Self::with_kind(
            VerOrDeleteKind::Version,
            v.last_modified,
            v.version_id,
            v.key,
        )
    }
    fn from_delete_marker(v: DeleteMarkerEntry) -> anyhow::Result<Self> {
        Self::with_kind(
            VerOrDeleteKind::DeleteMarker,
            v.last_modified,
            v.version_id,
            v.key,
        )
    }
}

#[cfg(test)]
mod tests {
    use camino::Utf8Path;

@@ -12,6 +12,7 @@ pub(crate) enum RequestKind {
    Delete = 2,
    List = 3,
    Copy = 4,
    TimeTravel = 5,
}

use RequestKind::*;
@@ -24,6 +25,7 @@ impl RequestKind {
            Delete => "delete_object",
            List => "list_objects",
            Copy => "copy_object",
            TimeTravel => "time_travel_recover",
        }
    }
    const fn as_index(&self) -> usize {
@@ -31,7 +33,7 @@ impl RequestKind {
    }
}

pub(super) struct RequestTyped<C>([C; 5]);
pub(super) struct RequestTyped<C>([C; 6]);

impl<C> RequestTyped<C> {
    pub(super) fn get(&self, kind: RequestKind) -> &C {
@@ -40,8 +42,8 @@ impl<C> RequestTyped<C> {

    fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
        use RequestKind::*;
        let mut it = [Get, Put, Delete, List, Copy].into_iter();
        let arr = std::array::from_fn::<C, 5, _>(|index| {
        let mut it = [Get, Put, Delete, List, Copy, TimeTravel].into_iter();
        let arr = std::array::from_fn::<C, 6, _>(|index| {
            let next = it.next().unwrap();
            assert_eq!(index, next.as_index());
            f(next)

@@ -3,16 +3,19 @@
//! testing purposes.
use bytes::Bytes;
use futures::stream::Stream;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::sync::Mutex;
use std::time::SystemTime;
use std::{collections::hash_map::Entry, sync::Arc};
use tokio_util::sync::CancellationToken;

use crate::{
    Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
    Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
    StorageMetadata, TimeTravelError,
};

pub struct UnreliableWrapper {
    inner: crate::GenericRemoteStorage,
    inner: GenericRemoteStorage<Arc<VoidStorage>>,

    // This many attempts of each operation will fail, then we let it succeed.
    attempts_to_fail: u64,
@@ -29,11 +32,21 @@ enum RemoteOp {
    Download(RemotePath),
    Delete(RemotePath),
    DeleteObjects(Vec<RemotePath>),
    TimeTravelRecover(Option<RemotePath>),
}

impl UnreliableWrapper {
    pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self {
        assert!(attempts_to_fail > 0);
        let inner = match inner {
            GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s),
            GenericRemoteStorage::AzureBlob(s) => GenericRemoteStorage::AzureBlob(s),
            GenericRemoteStorage::LocalFs(s) => GenericRemoteStorage::LocalFs(s),
            // We could also make this a no-op, as in, extract the inner of the passed generic remote storage
            GenericRemoteStorage::Unreliable(_s) => {
                panic!("Can't wrap unreliable wrapper unreliably")
            }
        };
        UnreliableWrapper {
            inner,
            attempts_to_fail,
@@ -84,7 +97,9 @@ impl UnreliableWrapper {
    }
}

#[async_trait::async_trait]
// We never construct this, so the type is not important, just has to not be UnreliableWrapper and impl RemoteStorage.
type VoidStorage = crate::LocalFs;

impl RemoteStorage for UnreliableWrapper {
    async fn list_prefixes(
        &self,
@@ -169,4 +184,18 @@ impl RemoteStorage for UnreliableWrapper {
        self.attempt(RemoteOp::Upload(to.clone()))?;
        self.inner.copy_object(from, to).await
    }

    async fn time_travel_recover(
        &self,
        prefix: Option<&RemotePath>,
        timestamp: SystemTime,
        done_if_after: SystemTime,
        cancel: CancellationToken,
    ) -> Result<(), TimeTravelError> {
        self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))
            .map_err(|e| TimeTravelError::Other(anyhow::Error::new(e)))?;
        self.inner
            .time_travel_recover(prefix, timestamp, done_if_after, cancel)
            .await
    }
}

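// Illustrative usage sketch, not part of the diff: tests wrap a real backend so that the
// first `attempts_to_fail` attempts of each distinct operation return an error, exercising
// the callers' retry paths. With attempts_to_fail = 1 every operation fails exactly once
// before succeeding. The helper name below is hypothetical.
fn make_flaky_example(real: crate::GenericRemoteStorage) -> UnreliableWrapper {
    UnreliableWrapper::new(real, 1)
}
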
@@ -1,15 +1,21 @@
|
||||
use std::collections::HashSet;
|
||||
use std::env;
|
||||
use std::fmt::{Debug, Display};
|
||||
use std::num::NonZeroUsize;
|
||||
use std::ops::ControlFlow;
|
||||
use std::sync::Arc;
|
||||
use std::time::UNIX_EPOCH;
|
||||
use std::time::{Duration, UNIX_EPOCH};
|
||||
use std::{collections::HashSet, time::SystemTime};
|
||||
|
||||
use crate::common::{download_to_vec, upload_stream};
|
||||
use anyhow::Context;
|
||||
use camino::Utf8Path;
|
||||
use futures_util::Future;
|
||||
use remote_storage::{
|
||||
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
|
||||
};
|
||||
use test_context::test_context;
|
||||
use test_context::AsyncTestContext;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::info;
|
||||
|
||||
mod common;
|
||||
@@ -18,11 +24,160 @@ mod common;
|
||||
mod tests_s3;
|
||||
|
||||
use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_remote_data};
|
||||
use utils::backoff;
|
||||
|
||||
const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
|
||||
|
||||
const BASE_PREFIX: &str = "test";
|
||||
|
||||
#[test_context(MaybeEnabledStorage)]
|
||||
#[tokio::test]
|
||||
async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledStorage::Enabled(ctx) => ctx,
|
||||
MaybeEnabledStorage::Disabled => return Ok(()),
|
||||
};
|
||||
// Our test depends on discrepancies in the clock between S3 and the environment the tests
|
||||
// run in. Therefore, wait a little bit before and after. The alternative would be
|
||||
// to take the time from S3 response headers.
|
||||
const WAIT_TIME: Duration = Duration::from_millis(3_000);
|
||||
|
||||
async fn retry<T, O, F, E>(op: O) -> Result<T, E>
|
||||
where
|
||||
E: Display + Debug + 'static,
|
||||
O: FnMut() -> F,
|
||||
F: Future<Output = Result<T, E>>,
|
||||
{
|
||||
let warn_threshold = 3;
|
||||
let max_retries = 10;
|
||||
backoff::retry(
|
||||
op,
|
||||
|_e| false,
|
||||
warn_threshold,
|
||||
max_retries,
|
||||
"test retry",
|
||||
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn time_point() -> SystemTime {
|
||||
tokio::time::sleep(WAIT_TIME).await;
|
||||
let ret = SystemTime::now();
|
||||
tokio::time::sleep(WAIT_TIME).await;
|
||||
ret
|
||||
}
|
||||
|
||||
async fn list_files(client: &Arc<GenericRemoteStorage>) -> anyhow::Result<HashSet<RemotePath>> {
|
||||
Ok(retry(|| client.list_files(None))
|
||||
.await
|
||||
.context("list root files failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>())
|
||||
}
|
||||
|
||||
let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
retry(|| {
|
||||
let (data, len) = upload_stream("remote blob data1".as_bytes().into());
|
||||
ctx.client.upload(data, len, &path1, None)
|
||||
})
|
||||
.await?;
|
||||
|
||||
let t0_files = list_files(&ctx.client).await?;
|
||||
let t0 = time_point().await;
|
||||
println!("at t0: {t0_files:?}");
|
||||
|
||||
let old_data = "remote blob data2";
|
||||
|
||||
retry(|| {
|
||||
let (data, len) = upload_stream(old_data.as_bytes().into());
|
||||
ctx.client.upload(data, len, &path2, None)
|
||||
})
|
||||
.await?;
|
||||
|
||||
let t1_files = list_files(&ctx.client).await?;
|
||||
let t1 = time_point().await;
|
||||
println!("at t1: {t1_files:?}");
|
||||
|
||||
// A little check to ensure that our clock is not too far off from the S3 clock
|
||||
{
|
||||
let dl = retry(|| ctx.client.download(&path2)).await?;
|
||||
let last_modified = dl.last_modified.unwrap();
|
||||
let half_wt = WAIT_TIME.mul_f32(0.5);
|
||||
let t0_hwt = t0 + half_wt;
|
||||
let t1_hwt = t1 - half_wt;
|
||||
if !(t0_hwt..=t1_hwt).contains(&last_modified) {
|
||||
panic!("last_modified={last_modified:?} is not between t0_hwt={t0_hwt:?} and t1_hwt={t1_hwt:?}. \
|
||||
This likely means a large lock discrepancy between S3 and the local clock.");
|
||||
}
|
||||
}
|
||||
|
||||
retry(|| {
|
||||
let (data, len) = upload_stream("remote blob data3".as_bytes().into());
|
||||
ctx.client.upload(data, len, &path3, None)
|
||||
})
|
||||
.await?;
|
||||
|
||||
let new_data = "new remote blob data2";
|
||||
|
||||
retry(|| {
|
||||
let (data, len) = upload_stream(new_data.as_bytes().into());
|
||||
ctx.client.upload(data, len, &path2, None)
|
||||
})
|
||||
.await?;
|
||||
|
||||
retry(|| ctx.client.delete(&path1)).await?;
|
||||
let t2_files = list_files(&ctx.client).await?;
|
||||
let t2 = time_point().await;
|
||||
println!("at t2: {t2_files:?}");
|
||||
|
||||
// No changes after recovery to t2 (no-op)
|
||||
let t_final = time_point().await;
|
||||
ctx.client
|
||||
.time_travel_recover(None, t2, t_final, CancellationToken::new())
|
||||
.await?;
|
||||
let t2_files_recovered = list_files(&ctx.client).await?;
|
||||
println!("after recovery to t2: {t2_files_recovered:?}");
|
||||
assert_eq!(t2_files, t2_files_recovered);
|
||||
let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2).await?).await?;
|
||||
assert_eq!(path2_recovered_t2, new_data.as_bytes());
|
||||
|
||||
// after recovery to t1: path1 is back, path2 has the old content
|
||||
let t_final = time_point().await;
|
||||
ctx.client
|
||||
.time_travel_recover(None, t1, t_final, CancellationToken::new())
|
||||
.await?;
|
||||
let t1_files_recovered = list_files(&ctx.client).await?;
|
||||
println!("after recovery to t1: {t1_files_recovered:?}");
|
||||
assert_eq!(t1_files, t1_files_recovered);
|
||||
let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2).await?).await?;
|
||||
assert_eq!(path2_recovered_t1, old_data.as_bytes());
|
||||
|
||||
// after recovery to t0: everything is gone except for path1
|
||||
let t_final = time_point().await;
|
||||
ctx.client
|
||||
.time_travel_recover(None, t0, t_final, CancellationToken::new())
|
||||
.await?;
|
||||
let t0_files_recovered = list_files(&ctx.client).await?;
|
||||
println!("after recovery to t0: {t0_files_recovered:?}");
|
||||
assert_eq!(t0_files, t0_files_recovered);
|
||||
|
||||
// cleanup
|
||||
|
||||
let paths = &[path1, path2, path3];
|
||||
retry(|| ctx.client.delete_objects(paths)).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
struct EnabledS3 {
|
||||
client: Arc<GenericRemoteStorage>,
|
||||
base_prefix: &'static str,
|
||||
|
||||
@@ -112,6 +112,55 @@ pub async fn fsync_async(path: impl AsRef<Utf8Path>) -> Result<(), std::io::Erro
    tokio::fs::File::open(path.as_ref()).await?.sync_all().await
}

pub async fn fsync_async_opt(
    path: impl AsRef<Utf8Path>,
    do_fsync: bool,
) -> Result<(), std::io::Error> {
    if do_fsync {
        fsync_async(path.as_ref()).await?;
    }
    Ok(())
}

/// Like postgres' durable_rename, renames file issuing fsyncs to make it
/// durable. After return, file and rename are guaranteed to be persisted.
///
/// Unlike postgres, it only does fsyncs to 1) file to be renamed to make
/// contents durable; 2) its directory entry to make rename durable 3) again to
/// already renamed file, which is not required by standards but postgres does
/// it, let's stick to that. Postgres additionally fsyncs newpath *before*
/// rename if it exists to ensure that at least one of the files survives, but
/// current callers don't need that.
///
/// virtual_file.rs has similar code, but it doesn't use vfs.
///
/// Useful links: <https://lwn.net/Articles/457667/>
/// <https://www.postgresql.org/message-id/flat/56583BDD.9060302%402ndquadrant.com>
/// <https://thunk.org/tytso/blog/2009/03/15/dont-fear-the-fsync/>
pub async fn durable_rename(
    old_path: impl AsRef<Utf8Path>,
    new_path: impl AsRef<Utf8Path>,
    do_fsync: bool,
) -> io::Result<()> {
    // first fsync the file
    fsync_async_opt(old_path.as_ref(), do_fsync).await?;

    // Time to do the real deal.
    tokio::fs::rename(old_path.as_ref(), new_path.as_ref()).await?;

    // Postgres'ish fsync of renamed file.
    fsync_async_opt(new_path.as_ref(), do_fsync).await?;

    // Now fsync the parent
    let parent = match new_path.as_ref().parent() {
        Some(p) => p,
        None => Utf8Path::new("./"), // assume current dir if there is no parent
    };
    fsync_async_opt(parent, do_fsync).await?;

    Ok(())
}

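// Illustrative usage sketch, not part of the diff: the usual "write a temp file, then
// durably rename it into place" pattern that durable_rename() supports. The helper name
// and paths are made up.
async fn persist_file_example(final_path: &Utf8Path, contents: &[u8]) -> io::Result<()> {
    let tmp_path = final_path.with_extension("tmp");
    tokio::fs::write(&tmp_path, contents).await?;
    // fsync the temp file, rename it, then fsync the renamed file and its parent directory
    durable_rename(&tmp_path, final_path, true).await
}
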
#[cfg(test)]
mod tests {

@@ -131,7 +131,9 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
        ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
        ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
        ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
        _ => error!("Error processing HTTP request: {api_error:#}"),
        ApiError::ShuttingDown => info!("Shut down while processing HTTP request"),
        ApiError::Timeout(_) => info!("Timeout while processing HTTP request: {api_error:#}"),
        _ => info!("Error processing HTTP request: {api_error:#}"),
    }

    api_error.into_response()

@@ -1,4 +1,10 @@
|
||||
use std::{sync::Arc, time::Duration};
|
||||
use std::{
|
||||
sync::{
|
||||
atomic::{AtomicBool, Ordering},
|
||||
Arc,
|
||||
},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
/// Gates are a concurrency helper, primarily used for implementing safe shutdown.
|
||||
///
|
||||
@@ -6,62 +12,70 @@ use std::{sync::Arc, time::Duration};
|
||||
/// the resource calls `close()` when they want to ensure that all holders of guards
|
||||
/// have released them, and that no future guards will be issued.
|
||||
pub struct Gate {
|
||||
/// Each caller of enter() takes one unit from the semaphore. In close(), we
|
||||
/// take all the units to ensure all GateGuards are destroyed.
|
||||
sem: Arc<tokio::sync::Semaphore>,
|
||||
|
||||
/// For observability only: a name that will be used to log warnings if a particular
|
||||
/// gate is holding up shutdown
|
||||
name: String,
|
||||
inner: Arc<GateInner>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for Gate {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "Gate<{}>", self.name)
|
||||
f.debug_struct("Gate")
|
||||
// use this for identification
|
||||
.field("ptr", &Arc::as_ptr(&self.inner))
|
||||
.field("inner", &self.inner)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
struct GateInner {
|
||||
sem: tokio::sync::Semaphore,
|
||||
closing: std::sync::atomic::AtomicBool,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for GateInner {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let avail = self.sem.available_permits();
|
||||
|
||||
let guards = u32::try_from(avail)
|
||||
.ok()
|
||||
// the sem only supports 32-bit ish amount, but lets play it safe
|
||||
.and_then(|x| Gate::MAX_UNITS.checked_sub(x));
|
||||
|
||||
let closing = self.closing.load(Ordering::Relaxed);
|
||||
|
||||
if let Some(guards) = guards {
|
||||
f.debug_struct("Gate")
|
||||
.field("remaining_guards", &guards)
|
||||
.field("closing", &closing)
|
||||
.finish()
|
||||
} else {
|
||||
f.debug_struct("Gate")
|
||||
.field("avail_permits", &avail)
|
||||
.field("closing", &closing)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
|
||||
/// not complete.
|
||||
#[derive(Debug)]
|
||||
pub struct GateGuard(tokio::sync::OwnedSemaphorePermit);
|
||||
pub struct GateGuard {
|
||||
// Record the span where the gate was entered, so that we can identify who was blocking Gate::close
|
||||
span_at_enter: tracing::Span,
|
||||
gate: Arc<GateInner>,
|
||||
}
|
||||
|
||||
/// Observability helper: every `warn_period`, emit a log warning that we're still waiting on this gate
|
||||
async fn warn_if_stuck<Fut: std::future::Future>(
|
||||
fut: Fut,
|
||||
name: &str,
|
||||
warn_period: std::time::Duration,
|
||||
) -> <Fut as std::future::Future>::Output {
|
||||
let started = std::time::Instant::now();
|
||||
|
||||
let mut fut = std::pin::pin!(fut);
|
||||
|
||||
let mut warned = false;
|
||||
let ret = loop {
|
||||
match tokio::time::timeout(warn_period, &mut fut).await {
|
||||
Ok(ret) => break ret,
|
||||
Err(_) => {
|
||||
tracing::warn!(
|
||||
gate = name,
|
||||
elapsed_ms = started.elapsed().as_millis(),
|
||||
"still waiting, taking longer than expected..."
|
||||
);
|
||||
warned = true;
|
||||
}
|
||||
impl Drop for GateGuard {
|
||||
fn drop(&mut self) {
|
||||
if self.gate.closing.load(Ordering::Relaxed) {
|
||||
self.span_at_enter.in_scope(
|
||||
|| tracing::info!(gate = ?Arc::as_ptr(&self.gate), "kept the gate from closing"),
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
// If we emitted a warning for slowness, also emit a message when we complete, so that
|
||||
// someone debugging a shutdown can know for sure whether we have moved past this operation.
|
||||
if warned {
|
||||
tracing::info!(
|
||||
gate = name,
|
||||
elapsed_ms = started.elapsed().as_millis(),
|
||||
"completed, after taking longer than expected"
|
||||
)
|
||||
// when the permit was acquired, it was forgotten to allow us to manage it's lifecycle
|
||||
// manually, so "return" the permit now.
|
||||
self.gate.sem.add_permits(1);
|
||||
}
|
||||
|
||||
ret
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -69,15 +83,19 @@ pub enum GateError {
|
||||
GateClosed,
|
||||
}
|
||||
|
||||
impl Gate {
|
||||
const MAX_UNITS: u32 = u32::MAX;
|
||||
|
||||
pub fn new(name: String) -> Self {
|
||||
impl Default for Gate {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
sem: Arc::new(tokio::sync::Semaphore::new(Self::MAX_UNITS as usize)),
|
||||
name,
|
||||
inner: Arc::new(GateInner {
|
||||
sem: tokio::sync::Semaphore::new(Self::MAX_UNITS as usize),
|
||||
closing: AtomicBool::new(false),
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Gate {
|
||||
const MAX_UNITS: u32 = u32::MAX;
|
||||
|
||||
/// Acquire a guard that will prevent close() calls from completing. If close()
|
||||
/// was already called, this will return an error which should be interpreted
|
||||
@@ -88,11 +106,23 @@ impl Gate {
|
||||
/// to avoid blocking close() indefinitely: typically types that contain a Gate will
|
||||
/// also contain a CancellationToken.
|
||||
pub fn enter(&self) -> Result<GateGuard, GateError> {
|
||||
self.sem
|
||||
.clone()
|
||||
.try_acquire_owned()
|
||||
.map(GateGuard)
|
||||
.map_err(|_| GateError::GateClosed)
|
||||
let permit = self
|
||||
.inner
|
||||
.sem
|
||||
.try_acquire()
|
||||
.map_err(|_| GateError::GateClosed)?;
|
||||
|
||||
// we now have the permit, let's disable the normal raii functionality and leave
|
||||
// "returning" the permit to our GateGuard::drop.
|
||||
//
|
||||
// this is done to avoid the need for multiple Arcs (one for semaphore, next for other
|
||||
// fields).
|
||||
permit.forget();
|
||||
|
||||
Ok(GateGuard {
|
||||
span_at_enter: tracing::Span::current(),
|
||||
gate: self.inner.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Types with a shutdown() method and a gate should call this method at the
|
||||
@@ -102,48 +132,88 @@ impl Gate {
|
||||
/// important that the holders of such guards are respecting a CancellationToken which has
|
||||
/// been cancelled before entering this function.
|
||||
pub async fn close(&self) {
|
||||
warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await
|
||||
let started_at = std::time::Instant::now();
|
||||
let mut do_close = std::pin::pin!(self.do_close());
|
||||
|
||||
let nag_after = Duration::from_secs(1);
|
||||
|
||||
let Err(_timeout) = tokio::time::timeout(nag_after, &mut do_close).await else {
|
||||
return;
|
||||
};
|
||||
|
||||
tracing::info!(
|
||||
gate = ?self.as_ptr(),
|
||||
elapsed_ms = started_at.elapsed().as_millis(),
|
||||
"closing is taking longer than expected"
|
||||
);
|
||||
|
||||
// close operation is not trying to be cancellation safe as pageserver does not need it.
|
||||
//
|
||||
// note: "closing" is not checked in Gate::enter -- it exists just for observability,
|
||||
// dropping of GateGuard after this will log who they were.
|
||||
self.inner.closing.store(true, Ordering::Relaxed);
|
||||
|
||||
do_close.await;
|
||||
|
||||
tracing::info!(
|
||||
gate = ?self.as_ptr(),
|
||||
elapsed_ms = started_at.elapsed().as_millis(),
|
||||
"close completed"
|
||||
);
|
||||
}
|
||||
|
||||
/// Used as an identity of a gate. This identity will be resolved to something useful when
|
||||
/// it's actually closed in a hopefully sensible `tracing::Span` which will describe it even
|
||||
/// more.
|
||||
///
|
||||
/// `GateGuard::drop` also logs this pointer when it has realized it has been keeping the gate
|
||||
/// open for too long.
|
||||
fn as_ptr(&self) -> *const GateInner {
|
||||
Arc::as_ptr(&self.inner)
|
||||
}
|
||||
|
||||
/// Check if [`Self::close()`] has finished waiting for all [`Self::enter()`] users to finish. This
|
||||
/// is usually analoguous for "Did shutdown finish?" for types that include a Gate, whereas checking
|
||||
/// the CancellationToken on such types is analogous to "Did shutdown start?"
|
||||
pub fn close_complete(&self) -> bool {
|
||||
self.sem.is_closed()
|
||||
self.inner.sem.is_closed()
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(gate = ?self.as_ptr()))]
|
||||
async fn do_close(&self) {
|
||||
tracing::debug!(gate = self.name, "Closing Gate...");
|
||||
match self.sem.acquire_many(Self::MAX_UNITS).await {
|
||||
Ok(_units) => {
|
||||
tracing::debug!("Closing Gate...");
|
||||
|
||||
match self.inner.sem.acquire_many(Self::MAX_UNITS).await {
|
||||
Ok(_permit) => {
|
||||
// While holding all units, close the semaphore. All subsequent calls to enter() will fail.
|
||||
self.sem.close();
|
||||
self.inner.sem.close();
|
||||
}
|
||||
Err(_) => {
|
||||
Err(_closed) => {
|
||||
// Semaphore closed: we are the only function that can do this, so it indicates a double-call.
|
||||
// This is legal. Timeline::shutdown for example is not protected from being called more than
|
||||
// once.
|
||||
tracing::debug!(gate = self.name, "Double close")
|
||||
tracing::debug!("Double close")
|
||||
}
|
||||
}
|
||||
tracing::debug!(gate = self.name, "Closed Gate.")
|
||||
tracing::debug!("Closed Gate.")
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use futures::FutureExt;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_idle_gate() {
|
||||
// Having taken no gates, we should not be blocked in close
|
||||
let gate = Gate::new("test".to_string());
|
||||
async fn close_unused() {
|
||||
// Having taken no guards, we should not be blocked in close
|
||||
let gate = Gate::default();
|
||||
gate.close().await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn close_idle() {
|
||||
// If a guard is dropped before entering, close should not be blocked
|
||||
let gate = Gate::new("test".to_string());
|
||||
let gate = Gate::default();
|
||||
let guard = gate.enter().unwrap();
|
||||
drop(guard);
|
||||
gate.close().await;
|
||||
@@ -152,25 +222,30 @@ mod tests {
|
||||
gate.enter().expect_err("enter should fail after close");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_busy_gate() {
|
||||
let gate = Gate::new("test".to_string());
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn close_busy_gate() {
|
||||
let gate = Gate::default();
|
||||
let forever = Duration::from_secs(24 * 7 * 365);
|
||||
|
||||
let guard = gate.enter().unwrap();
|
||||
let guard =
|
||||
tracing::info_span!("i am holding back the gate").in_scope(|| gate.enter().unwrap());
|
||||
|
||||
let mut close_fut = std::pin::pin!(gate.close());
|
||||
|
||||
// Close should be blocked
|
||||
assert!(close_fut.as_mut().now_or_never().is_none());
|
||||
// Close should be waiting for guards to drop
|
||||
tokio::time::timeout(forever, &mut close_fut)
|
||||
.await
|
||||
.unwrap_err();
|
||||
|
||||
// Attempting to enter() should fail, even though close isn't done yet.
|
||||
gate.enter()
|
||||
.expect_err("enter should fail after entering close");
|
||||
|
||||
// this will now log, which we cannot verify except manually
|
||||
drop(guard);
|
||||
|
||||
// Guard is gone, close should finish
|
||||
assert!(close_fut.as_mut().now_or_never().is_some());
|
||||
close_fut.await;
|
||||
|
||||
// Attempting to enter() is still forbidden
|
||||
gate.enter().expect_err("enter should fail finishing close");
|
||||
|
||||
@@ -21,7 +21,6 @@ camino.workspace = true
camino-tempfile.workspace = true
chrono = { workspace = true, features = ["serde"] }
clap = { workspace = true, features = ["string"] }
close_fds.workspace = true
const_format.workspace = true
consumption_metrics.workspace = true
crc32c.workspace = true
@@ -61,6 +60,7 @@ sync_wrapper.workspace = true
tokio-tar.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
tokio-epoll-uring.workspace = true
tokio-io-timeout.workspace = true
tokio-postgres.workspace = true
tokio-stream.workspace = true

@@ -69,6 +69,25 @@ impl Client {
        resp.json().await.map_err(Error::ReceiveBody)
    }

    /// Get an arbitrary path and return a streaming Response. This function is suitable
    /// for pass-through/proxy use cases where we don't care what the response content looks
    /// like.
    ///
    /// Use/add one of the properly typed methods below if you know you aren't proxying, and
    /// know what kind of response you expect.
    pub async fn get_raw(&self, path: String) -> Result<reqwest::Response> {
        debug_assert!(path.starts_with('/'));
        let uri = format!("{}{}", self.mgmt_api_endpoint, path);

        let req = self.client.request(Method::GET, uri);
        let req = if let Some(value) = &self.authorization_header {
            req.header(reqwest::header::AUTHORIZATION, value)
        } else {
            req
        };
        req.send().await.map_err(Error::ReceiveBody)
    }

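    // Illustrative sketch, not part of the diff: proxying an arbitrary pageserver API path
    // without interpreting the body. The method name and path here are made up.
    pub async fn proxy_status_example(&self) -> Result<reqwest::Response> {
        // The response is handed back untouched so the caller can stream it onward.
        self.get_raw("/v1/status".to_string()).await
    }
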
    pub async fn tenant_details(
        &self,
        tenant_shard_id: TenantShardId,
@@ -171,6 +190,25 @@ impl Client {
            .map_err(Error::ReceiveBody)
    }

    /// The tenant deletion API can return 202 if deletion is incomplete, or
    /// 404 if it is complete. Callers are responsible for checking the status
    /// code and retrying. Error codes other than 404 will return Err().
    pub async fn tenant_delete(&self, tenant_shard_id: TenantShardId) -> Result<StatusCode> {
        let uri = format!("{}/v1/tenant/{tenant_shard_id}", self.mgmt_api_endpoint);

        match self.request(Method::DELETE, &uri, ()).await {
            Err(Error::ApiError(status_code, msg)) => {
                if status_code == StatusCode::NOT_FOUND {
                    Ok(StatusCode::NOT_FOUND)
                } else {
                    Err(Error::ApiError(status_code, msg))
                }
            }
            Err(e) => Err(e),
            Ok(response) => Ok(response.status()),
        }
    }

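    // Illustrative sketch, not part of the diff: because tenant_delete() above maps 404 to
    // "already gone" and anything else to "still in progress" or an error, a caller polls
    // until it sees NOT_FOUND. The method name and polling interval are made up, and this
    // assumes TenantShardId is Copy; real callers should also bound the number of attempts.
    pub async fn tenant_delete_and_wait_example(&self, tenant_shard_id: TenantShardId) -> Result<()> {
        loop {
            if self.tenant_delete(tenant_shard_id).await? == StatusCode::NOT_FOUND {
                return Ok(());
            }
            tokio::time::sleep(std::time::Duration::from_millis(500)).await;
        }
    }
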
    pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
        let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
        self.request(Method::PUT, &uri, req).await?;
@@ -234,6 +272,32 @@ impl Client {
            .map_err(Error::ReceiveBody)
    }

    /// The timeline deletion API can return 202 if deletion is incomplete, or
    /// 404 if it is complete. Callers are responsible for checking the status
    /// code and retrying. Error codes other than 404 will return Err().
    pub async fn timeline_delete(
        &self,
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
    ) -> Result<StatusCode> {
        let uri = format!(
            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
            self.mgmt_api_endpoint
        );

        match self.request(Method::DELETE, &uri, ()).await {
            Err(Error::ApiError(status_code, msg)) => {
                if status_code == StatusCode::NOT_FOUND {
                    Ok(StatusCode::NOT_FOUND)
                } else {
                    Err(Error::ApiError(status_code, msg))
                }
            }
            Err(e) => Err(e),
            Ok(response) => Ok(response.status()),
        }
    }

    pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
        let uri = format!(
            "{}/v1/tenant/{}/reset",

@@ -156,7 +156,8 @@ impl PagestreamClient {
|
||||
PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e),
|
||||
PagestreamBeMessage::Exists(_)
|
||||
| PagestreamBeMessage::Nblocks(_)
|
||||
| PagestreamBeMessage::DbSize(_) => {
|
||||
| PagestreamBeMessage::DbSize(_)
|
||||
| PagestreamBeMessage::GetSlruSegment(_) => {
|
||||
anyhow::bail!(
|
||||
"unexpected be message kind in response to getpage request: {}",
|
||||
msg.kind()
|
||||
|
||||
@@ -18,7 +18,7 @@ use pageserver::tenant::block_io::FileBlockReader;
|
||||
use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
|
||||
use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE};
|
||||
use pageserver::tenant::storage_layer::range_overlaps;
|
||||
use pageserver::virtual_file::VirtualFile;
|
||||
use pageserver::virtual_file::{self, VirtualFile};
|
||||
|
||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
|
||||
@@ -142,7 +142,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
|
||||
// Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
|
||||
pageserver::virtual_file::init(10);
|
||||
pageserver::virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
|
||||
pageserver::page_cache::init(100);
|
||||
|
||||
let mut total_delta_layers = 0usize;
|
||||
|
||||
@@ -59,7 +59,7 @@ pub(crate) enum LayerCmd {
|
||||
|
||||
async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
|
||||
let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
|
||||
virtual_file::init(10);
|
||||
virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
|
||||
page_cache::init(100);
|
||||
let file = FileBlockReader::new(VirtualFile::open(path).await?);
|
||||
let summary_blk = file.read_blk(0, ctx).await?;
|
||||
@@ -187,7 +187,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
new_tenant_id,
|
||||
new_timeline_id,
|
||||
} => {
|
||||
pageserver::virtual_file::init(10);
|
||||
pageserver::virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
|
||||
pageserver::page_cache::init(100);
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
|
||||
@@ -123,7 +123,7 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {
|
||||
|
||||
async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(10);
|
||||
virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
|
||||
page_cache::init(100);
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
dump_layerfile_from_path(path, true, &ctx).await
|
||||
|
||||
@@ -423,8 +423,8 @@ async fn client(
|
||||
tokio::select! {
|
||||
res = do_requests => { res },
|
||||
_ = cancel.cancelled() => {
|
||||
client.shutdown().await;
|
||||
return;
|
||||
// fallthrough to shutdown
|
||||
}
|
||||
}
|
||||
client.shutdown().await;
|
||||
}
|
||||
|
||||
@@ -66,13 +66,10 @@ impl serde::Serialize for LatencyPercentiles {
|
||||
{
|
||||
use serde::ser::SerializeMap;
|
||||
let mut ser = serializer.serialize_map(Some(LATENCY_PERCENTILES.len()))?;
|
||||
for p in LATENCY_PERCENTILES {
|
||||
for (p, v) in LATENCY_PERCENTILES.iter().zip(&self.latency_percentiles) {
|
||||
ser.serialize_entry(
|
||||
&format!("p{p}"),
|
||||
&format!(
|
||||
"{}",
|
||||
&humantime::format_duration(self.latency_percentiles[0])
|
||||
),
|
||||
&format!("{}", humantime::format_duration(*v)),
|
||||
)?;
|
||||
}
|
||||
ser.end()
|
||||
|
||||
@@ -11,8 +11,9 @@
|
||||
//! from data stored in object storage.
|
||||
//!
|
||||
use anyhow::{anyhow, bail, ensure, Context};
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use fail::fail_point;
|
||||
use pageserver_api::key::{key_to_slru_block, Key};
|
||||
use postgres_ffi::pg_constants;
|
||||
use std::fmt::Write as FmtWrite;
|
||||
use std::time::SystemTime;
|
||||
@@ -133,6 +134,87 @@ where
|
||||
ctx: &'a RequestContext,
|
||||
}
|
||||
|
||||
/// A sink that accepts SLRU blocks ordered by key and forwards
|
||||
/// full segments to the archive.
|
||||
struct SlruSegmentsBuilder<'a, 'b, W>
|
||||
where
|
||||
W: AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
ar: &'a mut Builder<&'b mut W>,
|
||||
buf: Vec<u8>,
|
||||
current_segment: Option<(SlruKind, u32)>,
|
||||
}
|
||||
|
||||
impl<'a, 'b, W> SlruSegmentsBuilder<'a, 'b, W>
|
||||
where
|
||||
W: AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
fn new(ar: &'a mut Builder<&'b mut W>) -> Self {
|
||||
Self {
|
||||
ar,
|
||||
buf: Vec::new(),
|
||||
current_segment: None,
|
||||
}
|
||||
}
|
||||
|
||||
async fn add_block(&mut self, key: &Key, block: Bytes) -> anyhow::Result<()> {
|
||||
let (kind, segno, _) = key_to_slru_block(*key)?;
|
||||
|
||||
match kind {
|
||||
SlruKind::Clog => {
|
||||
ensure!(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8);
|
||||
}
|
||||
SlruKind::MultiXactMembers | SlruKind::MultiXactOffsets => {
|
||||
ensure!(block.len() == BLCKSZ as usize);
|
||||
}
|
||||
}
|
||||
|
||||
let segment = (kind, segno);
|
||||
match self.current_segment {
|
||||
None => {
|
||||
self.current_segment = Some(segment);
|
||||
self.buf
|
||||
.extend_from_slice(block.slice(..BLCKSZ as usize).as_ref());
|
||||
}
|
||||
Some(current_seg) if current_seg == segment => {
|
||||
self.buf
|
||||
.extend_from_slice(block.slice(..BLCKSZ as usize).as_ref());
|
||||
}
|
||||
Some(_) => {
|
||||
self.flush().await?;
|
||||
|
||||
self.current_segment = Some(segment);
|
||||
self.buf
|
||||
.extend_from_slice(block.slice(..BLCKSZ as usize).as_ref());
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn flush(&mut self) -> anyhow::Result<()> {
|
||||
let nblocks = self.buf.len() / BLCKSZ as usize;
|
||||
let (kind, segno) = self.current_segment.take().unwrap();
|
||||
let segname = format!("{}/{:>04X}", kind.to_str(), segno);
|
||||
let header = new_tar_header(&segname, self.buf.len() as u64)?;
|
||||
self.ar.append(&header, self.buf.as_slice()).await?;
|
||||
|
||||
trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
|
||||
|
||||
self.buf.clear();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn finish(mut self) -> anyhow::Result<()> {
|
||||
if self.current_segment.is_none() || self.buf.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
self.flush().await
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, W> Basebackup<'a, W>
|
||||
where
|
||||
W: AsyncWrite + Send + Sync + Unpin,
|
||||
@@ -140,6 +222,8 @@ where
|
||||
async fn send_tarball(mut self) -> anyhow::Result<()> {
|
||||
// TODO include checksum
|
||||
|
||||
let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup;
|
||||
|
||||
// Create pgdata subdirs structure
|
||||
for dir in PGDATA_SUBDIRS.iter() {
|
||||
let header = new_tar_header_dir(dir)?;
|
||||
@@ -166,20 +250,27 @@ where
|
||||
.context("could not add config file to basebackup tarball")?;
|
||||
}
|
||||
}
|
||||
|
||||
// Gather non-relational files from object storage pages.
|
||||
for kind in [
|
||||
SlruKind::Clog,
|
||||
SlruKind::MultiXactOffsets,
|
||||
SlruKind::MultiXactMembers,
|
||||
] {
|
||||
for segno in self
|
||||
if !lazy_slru_download {
|
||||
// Gather non-relational files from object storage pages.
|
||||
let slru_partitions = self
|
||||
.timeline
|
||||
.list_slru_segments(kind, Version::Lsn(self.lsn), self.ctx)
|
||||
.get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
|
||||
.await?
|
||||
{
|
||||
self.add_slru_segment(kind, segno).await?;
|
||||
.partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64);
|
||||
|
||||
let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
|
||||
|
||||
for part in slru_partitions.parts {
|
||||
let blocks = self
|
||||
.timeline
|
||||
.get_vectored(&part.ranges, self.lsn, self.ctx)
|
||||
.await?;
|
||||
|
||||
for (key, block) in blocks {
|
||||
slru_builder.add_block(&key, block?).await?;
|
||||
}
|
||||
}
|
||||
slru_builder.finish().await?;
|
||||
}
|
||||
|
||||
let mut min_restart_lsn: Lsn = Lsn::MAX;
|
||||
@@ -305,39 +396,6 @@ where
|
||||
Ok(())
|
||||
}
|
||||
|
||||
//
|
||||
// Generate SLRU segment files from repository.
|
||||
//
|
||||
async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
|
||||
let nblocks = self
|
||||
.timeline
|
||||
.get_slru_segment_size(slru, segno, Version::Lsn(self.lsn), self.ctx)
|
||||
.await?;
|
||||
|
||||
let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
|
||||
for blknum in 0..nblocks {
|
||||
let img = self
|
||||
.timeline
|
||||
.get_slru_page_at_lsn(slru, segno, blknum, self.lsn, self.ctx)
|
||||
.await?;
|
||||
|
||||
if slru == SlruKind::Clog {
|
||||
ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8);
|
||||
} else {
|
||||
ensure!(img.len() == BLCKSZ as usize);
|
||||
}
|
||||
|
||||
slru_buf.extend_from_slice(&img[..BLCKSZ as usize]);
|
||||
}
|
||||
|
||||
let segname = format!("{}/{:>04X}", slru.to_str(), segno);
|
||||
let header = new_tar_header(&segname, slru_buf.len() as u64)?;
|
||||
self.ar.append(&header, slru_buf.as_slice()).await?;
|
||||
|
||||
trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
//
|
||||
// Include database/tablespace directories.
|
||||
//
|
||||
|
||||
@@ -33,12 +33,10 @@ use pageserver::{
|
||||
use postgres_backend::AuthType;
|
||||
use utils::failpoint_support;
|
||||
use utils::logging::TracingErrorLayerEnablement;
|
||||
use utils::signals::ShutdownSignals;
|
||||
use utils::{
|
||||
auth::{JwtAuth, SwappableJwtAuth},
|
||||
logging, project_build_tag, project_git_version,
|
||||
sentry_init::init_sentry,
|
||||
signals::Signal,
|
||||
tcp_listener,
|
||||
};
|
||||
|
||||
@@ -130,7 +128,7 @@ fn main() -> anyhow::Result<()> {
|
||||
let scenario = failpoint_support::init();
|
||||
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(conf.max_file_descriptors);
|
||||
virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine);
|
||||
page_cache::init(conf.page_cache_size);
|
||||
|
||||
start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
|
||||
@@ -656,34 +654,42 @@ fn start_pageserver(
|
||||
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
|
||||
|
||||
// All started up! Now just sit and wait for shutdown signal.
|
||||
ShutdownSignals::handle(|signal| match signal {
|
||||
Signal::Quit => {
|
||||
info!(
|
||||
"Got {}. Terminating in immediate shutdown mode",
|
||||
signal.name()
|
||||
);
|
||||
std::process::exit(111);
|
||||
}
|
||||
{
|
||||
use signal_hook::consts::*;
|
||||
let signal_handler = BACKGROUND_RUNTIME.spawn_blocking(move || {
|
||||
let mut signals =
|
||||
signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap();
|
||||
return signals
|
||||
.forever()
|
||||
.next()
|
||||
.expect("forever() never returns None unless explicitly closed");
|
||||
});
|
||||
let signal = BACKGROUND_RUNTIME
|
||||
.block_on(signal_handler)
|
||||
.expect("join error");
|
||||
match signal {
|
||||
SIGQUIT => {
|
||||
info!("Got signal {signal}. Terminating in immediate shutdown mode",);
|
||||
std::process::exit(111);
|
||||
}
|
||||
SIGINT | SIGTERM => {
|
||||
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
|
||||
|
||||
Signal::Interrupt | Signal::Terminate => {
|
||||
info!(
|
||||
"Got {}. Terminating gracefully in fast shutdown mode",
|
||||
signal.name()
|
||||
);
|
||||
|
||||
// This cancels the `shutdown_pageserver` cancellation tree.
|
||||
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
|
||||
// The plan is to change that over time.
|
||||
shutdown_pageserver.take();
|
||||
let bg_remote_storage = remote_storage.clone();
|
||||
let bg_deletion_queue = deletion_queue.clone();
|
||||
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
|
||||
bg_remote_storage.map(|_| bg_deletion_queue),
|
||||
0,
|
||||
));
|
||||
unreachable!()
|
||||
// This cancels the `shutdown_pageserver` cancellation tree.
|
||||
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
|
||||
// The plan is to change that over time.
|
||||
shutdown_pageserver.take();
|
||||
let bg_remote_storage = remote_storage.clone();
|
||||
let bg_deletion_queue = deletion_queue.clone();
|
||||
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
|
||||
bg_remote_storage.map(|_| bg_deletion_queue),
|
||||
0,
|
||||
));
|
||||
unreachable!()
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn create_remote_storage_client(
|
||||
|
||||
@@ -36,6 +36,7 @@ use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::{
|
||||
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
|
||||
};
|
||||
use crate::virtual_file;
|
||||
use crate::{
|
||||
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
|
||||
TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
|
||||
@@ -43,6 +44,8 @@ use crate::{
|
||||
|
||||
use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
|
||||
|
||||
use self::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE;
|
||||
|
||||
pub mod defaults {
|
||||
use crate::tenant::config::defaults::*;
|
||||
use const_format::formatcp;
|
||||
@@ -79,6 +82,8 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
|
||||
|
||||
pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";
|
||||
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
///
|
||||
@@ -114,6 +119,8 @@ pub mod defaults {
|
||||
|
||||
#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE}
|
||||
|
||||
#virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}'
|
||||
|
||||
[tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
|
||||
@@ -247,6 +254,8 @@ pub struct PageServerConf {
|
||||
|
||||
/// Maximum number of WAL records to be ingested and committed at the same time
|
||||
pub ingest_batch_size: u64,
|
||||
|
||||
pub virtual_file_io_engine: virtual_file::IoEngineKind,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -331,6 +340,8 @@ struct PageServerConfigBuilder {
|
||||
secondary_download_concurrency: BuilderValue<usize>,
|
||||
|
||||
ingest_batch_size: BuilderValue<u64>,
|
||||
|
||||
virtual_file_io_engine: BuilderValue<virtual_file::IoEngineKind>,
|
||||
}
|
||||
|
||||
impl Default for PageServerConfigBuilder {
|
||||
@@ -406,6 +417,8 @@ impl Default for PageServerConfigBuilder {
|
||||
secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
|
||||
|
||||
ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
|
||||
|
||||
virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -562,6 +575,10 @@ impl PageServerConfigBuilder {
|
||||
self.ingest_batch_size = BuilderValue::Set(ingest_batch_size)
|
||||
}
|
||||
|
||||
pub fn virtual_file_io_engine(&mut self, value: virtual_file::IoEngineKind) {
|
||||
self.virtual_file_io_engine = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let concurrent_tenant_warmup = self
|
||||
.concurrent_tenant_warmup
|
||||
@@ -669,6 +686,9 @@ impl PageServerConfigBuilder {
|
||||
ingest_batch_size: self
|
||||
.ingest_batch_size
|
||||
.ok_or(anyhow!("missing ingest_batch_size"))?,
|
||||
virtual_file_io_engine: self
|
||||
.virtual_file_io_engine
|
||||
.ok_or(anyhow!("missing virtual_file_io_engine"))?,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -920,6 +940,9 @@ impl PageServerConf {
|
||||
builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize)
|
||||
},
|
||||
"ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?),
|
||||
"virtual_file_io_engine" => {
|
||||
builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?)
|
||||
}
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -993,6 +1016,7 @@ impl PageServerConf {
|
||||
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
|
||||
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
|
||||
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
|
||||
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1225,6 +1249,7 @@ background_task_maximum_delay = '334 s'
|
||||
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
|
||||
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
|
||||
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
|
||||
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -1288,6 +1313,7 @@ background_task_maximum_delay = '334 s'
|
||||
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
|
||||
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
|
||||
ingest_batch_size: 100,
|
||||
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
|
||||
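The new `virtual_file_io_engine` key is parsed via `parse_toml_from_str`, so the engine type only needs a `FromStr` implementation. A minimal standalone sketch of that round-trip follows; the enum and its single variant are assumptions for illustration, since the real `virtual_file::IoEngineKind` definition is not part of this diff.

```rust
use std::str::FromStr;

/// Hypothetical stand-in for `virtual_file::IoEngineKind`; the real enum may
/// have more variants and different spellings.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum IoEngineKind {
    StdFs,
}

impl FromStr for IoEngineKind {
    type Err = String;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            // "std-fs" matches DEFAULT_VIRTUAL_FILE_IO_ENGINE above.
            "std-fs" => Ok(IoEngineKind::StdFs),
            other => Err(format!("unknown virtual_file_io_engine: {other:?}")),
        }
    }
}

fn main() {
    // Mirrors how the default is materialized: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()
    let engine: IoEngineKind = "std-fs".parse().unwrap();
    assert_eq!(engine, IoEngineKind::StdFs);
}
```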
@@ -97,23 +97,86 @@ pub enum EvictionOrder {
|
||||
|
||||
/// Order the layers to be evicted by how recently they have been accessed relatively within
|
||||
/// the set of resident layers of a tenant.
|
||||
///
|
||||
/// This strategy will evict layers more fairly but is untested.
|
||||
RelativeAccessed {
|
||||
#[serde(default)]
|
||||
/// Determines if the tenant with the most layers should lose first.
|
||||
///
|
||||
/// Having this enabled is currently the only reasonable option, because the order in which
|
||||
/// we read tenants is deterministic. If we find the need to use this as `false`, we need
|
||||
/// to ensure nondeterminism by adding in a random number to break the
|
||||
/// `relative_last_activity==0.0` ties.
|
||||
#[serde(default = "default_highest_layer_count_loses_first")]
|
||||
highest_layer_count_loses_first: bool,
|
||||
},
|
||||
}
|
||||
|
||||
fn default_highest_layer_count_loses_first() -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
impl EvictionOrder {
|
||||
/// Return true, if with [`Self::RelativeAccessed`] order the tenants with the highest layer
|
||||
/// counts should be the first ones to have their layers evicted.
|
||||
fn highest_layer_count_loses_first(&self) -> bool {
|
||||
fn sort(&self, candidates: &mut [(MinResidentSizePartition, EvictionCandidate)]) {
|
||||
use EvictionOrder::*;
|
||||
|
||||
match self {
|
||||
EvictionOrder::AbsoluteAccessed => false,
|
||||
EvictionOrder::RelativeAccessed {
|
||||
AbsoluteAccessed => {
|
||||
candidates.sort_unstable_by_key(|(partition, candidate)| {
|
||||
(*partition, candidate.last_activity_ts)
|
||||
});
|
||||
}
|
||||
RelativeAccessed { .. } => candidates.sort_unstable_by_key(|(partition, candidate)| {
|
||||
(*partition, candidate.relative_last_activity)
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
/// Called to fill in the [`EvictionCandidate::relative_last_activity`] while iterating tenants
|
||||
/// layers in **most** recently used order.
|
||||
fn relative_last_activity(&self, total: usize, index: usize) -> finite_f32::FiniteF32 {
|
||||
use EvictionOrder::*;
|
||||
|
||||
match self {
|
||||
AbsoluteAccessed => finite_f32::FiniteF32::ZERO,
|
||||
RelativeAccessed {
|
||||
highest_layer_count_loses_first,
|
||||
} => *highest_layer_count_loses_first,
|
||||
} => {
|
||||
// keeping the -1 (or not) decides whether every tenant should lose its least recently
// accessed layer, OR whether this should happen in the order of highest layer count first:
|
||||
let fudge = if *highest_layer_count_loses_first {
|
||||
// relative_last_activity vs. tenant layer count:
|
||||
// - 0.1..=1.0 (10 layers)
|
||||
// - 0.01..=1.0 (100 layers)
|
||||
// - 0.001..=1.0 (1000 layers)
|
||||
//
|
||||
// leading to evicting less of the smallest tenants.
|
||||
0
|
||||
} else {
|
||||
// use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
|
||||
// layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
|
||||
// be that less than 10k layer evictions is enough, so we would not need to evict from
|
||||
// all tenants.
|
||||
//
|
||||
// as the tenant ordering is now deterministic this could hit the same tenants
|
||||
// disproportionately on multiple invocations. an alternative could be to remember how many
|
||||
// layers did we evict last time from this tenant, and inject that as an additional
|
||||
// fudge here.
|
||||
1
|
||||
};
|
||||
|
||||
let total = total.checked_sub(fudge).filter(|&x| x > 1).unwrap_or(1);
|
||||
let divider = total as f32;
|
||||
|
||||
// most recently used is always (total - 0) / divider == 1.0
|
||||
// least recently used depends on the fudge:
|
||||
// - (total - 1) - (total - 1) / total => 0 / total
|
||||
// - total - (total - 1) / total => 1 / total
|
||||
let distance = (total - index) as f32;
|
||||
|
||||
finite_f32::FiniteF32::try_from_normalized(distance / divider)
|
||||
.unwrap_or_else(|val| {
|
||||
tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={index}, total={total}: {val}");
|
||||
finite_f32::FiniteF32::ZERO
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
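As a standalone illustration of the arithmetic above (not part of the patch): for ten resident layers, `highest_layer_count_loses_first = true` keeps the fudge at 0 and yields values from 1.0 down to 0.1, while `false` shrinks the divider by one so the least recently used layer reaches 0.0, matching the unit tests added at the end of this file.

```rust
// Simplified, free-function sketch of the relative_last_activity normalization above.
fn relative_last_activity(total: usize, index: usize, highest_layer_count_loses_first: bool) -> f32 {
    let fudge = if highest_layer_count_loses_first { 0 } else { 1 };
    let total = total.checked_sub(fudge).filter(|&x| x > 1).unwrap_or(1);
    let divider = total as f32;
    // The most recently used layer (index == 0) always maps to 1.0.
    total.saturating_sub(index) as f32 / divider
}

fn main() {
    let with_fudge: Vec<f32> = (0..10).map(|i| relative_last_activity(10, i, true)).collect();
    let without: Vec<f32> = (0..10).map(|i| relative_last_activity(10, i, false)).collect();
    assert_eq!(with_fudge.first(), Some(&1.0));
    assert_eq!(with_fudge.last(), Some(&0.1)); // 0.1..=1.0 for 10 layers
    assert_eq!(without.last(), Some(&0.0)); // full 0.0..=1.0 range
    println!("{with_fudge:?}");
    println!("{without:?}");
}
```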
@@ -389,52 +452,6 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
|
||||
let selection = select_victims(&candidates, usage_pre);
|
||||
|
||||
let mut candidates = candidates;
|
||||
|
||||
let selection = if matches!(eviction_order, EvictionOrder::RelativeAccessed { .. }) {
|
||||
// we currently have the layers ordered by AbsoluteAccessed so that we can get the summary
|
||||
// for comparison here. this is a temporary measure to develop alternatives.
|
||||
use std::fmt::Write;
|
||||
|
||||
let mut summary_buf = String::with_capacity(256);
|
||||
|
||||
{
|
||||
let absolute_summary = candidates
|
||||
.iter()
|
||||
.take(selection.amount)
|
||||
.map(|(_, candidate)| candidate)
|
||||
.collect::<summary::EvictionSummary>();
|
||||
|
||||
write!(summary_buf, "{absolute_summary}").expect("string grows");
|
||||
|
||||
info!("absolute accessed selection summary: {summary_buf}");
|
||||
}
|
||||
|
||||
candidates.sort_unstable_by_key(|(partition, candidate)| {
|
||||
(*partition, candidate.relative_last_activity)
|
||||
});
|
||||
|
||||
let selection = select_victims(&candidates, usage_pre);
|
||||
|
||||
{
|
||||
summary_buf.clear();
|
||||
|
||||
let relative_summary = candidates
|
||||
.iter()
|
||||
.take(selection.amount)
|
||||
.map(|(_, candidate)| candidate)
|
||||
.collect::<summary::EvictionSummary>();
|
||||
|
||||
write!(summary_buf, "{relative_summary}").expect("string grows");
|
||||
|
||||
info!("relative accessed selection summary: {summary_buf}");
|
||||
}
|
||||
|
||||
selection
|
||||
} else {
|
||||
selection
|
||||
};
|
||||
|
||||
let (evicted_amount, usage_planned) = selection.into_amount_and_planned();
|
||||
|
||||
// phase2: evict layers
|
||||
@@ -835,54 +852,12 @@ async fn collect_eviction_candidates(
|
||||
.sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
|
||||
let mut cumsum: i128 = 0;
|
||||
|
||||
// keeping the -1 or not decides if every tenant should lose their least recently accessed
|
||||
// layer OR if this should happen in the order of having highest layer count:
|
||||
let fudge = if eviction_order.highest_layer_count_loses_first() {
|
||||
// relative_age vs. tenant layer count:
|
||||
// - 0.1..=1.0 (10 layers)
|
||||
// - 0.01..=1.0 (100 layers)
|
||||
// - 0.001..=1.0 (1000 layers)
|
||||
//
|
||||
// leading to evicting less of the smallest tenants.
|
||||
0
|
||||
} else {
|
||||
// use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
|
||||
// layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
|
||||
// be that less than 10k layer evictions is enough, so we would not need to evict from
|
||||
// all tenants.
|
||||
//
|
||||
// as the tenant ordering is now deterministic this could hit the same tenants
|
||||
// disproportionately on multiple invocations. an alternative could be to remember how many
|
||||
// layers did we evict last time from this tenant, and inject that as an additional
|
||||
// fudge here.
|
||||
1
|
||||
};
|
||||
|
||||
let total = tenant_candidates
|
||||
.len()
|
||||
.checked_sub(fudge)
|
||||
.filter(|&x| x > 0)
|
||||
// support 0 or 1 resident layer tenants as well
|
||||
.unwrap_or(1);
|
||||
let divider = total as f32;
|
||||
let total = tenant_candidates.len();
|
||||
|
||||
for (i, mut candidate) in tenant_candidates.into_iter().enumerate() {
|
||||
// as we iterate this reverse sorted list, the most recently accessed layer will always
|
||||
// be 1.0; this is for us to evict it last.
|
||||
candidate.relative_last_activity = if matches!(
|
||||
eviction_order,
|
||||
EvictionOrder::RelativeAccessed { .. }
|
||||
) {
|
||||
// another possibility: use buckets, like (256.0 * relative_last_activity) as u8 or
|
||||
// similarly for u16. unsure how it would help.
|
||||
finite_f32::FiniteF32::try_from_normalized((total - i) as f32 / divider)
|
||||
.unwrap_or_else(|val| {
|
||||
tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={i}, total={total}: {val}");
|
||||
finite_f32::FiniteF32::ZERO
|
||||
})
|
||||
} else {
|
||||
finite_f32::FiniteF32::ZERO
|
||||
};
|
||||
candidate.relative_last_activity = eviction_order.relative_last_activity(total, i);
|
||||
|
||||
let partition = if cumsum > min_resident_size as i128 {
|
||||
MinResidentSizePartition::Above
|
||||
@@ -927,10 +902,7 @@ async fn collect_eviction_candidates(
|
||||
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
|
||||
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
|
||||
|
||||
// always behave as if AbsoluteAccessed was selected. if RelativeAccessed is in use, we
|
||||
// will sort later by candidate.relative_last_activity to compare evictions.
|
||||
candidates
|
||||
.sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
|
||||
eviction_order.sort(&mut candidates);
|
||||
|
||||
Ok(EvictionCandidates::Finished(candidates))
|
||||
}
|
||||
@@ -1070,6 +1042,12 @@ pub(crate) mod finite_f32 {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<FiniteF32> for f32 {
|
||||
fn from(value: FiniteF32) -> f32 {
|
||||
value.0
|
||||
}
|
||||
}
|
||||
|
||||
impl FiniteF32 {
|
||||
pub const ZERO: FiniteF32 = FiniteF32(0.0);
|
||||
|
||||
@@ -1082,136 +1060,9 @@ pub(crate) mod finite_f32 {
|
||||
Err(value)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mod summary {
|
||||
use super::finite_f32::FiniteF32;
|
||||
use super::{EvictionCandidate, LayerCount};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::time::SystemTime;
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub(super) struct EvictionSummary {
|
||||
evicted_per_tenant: HashMap<TenantShardId, LayerCount>,
|
||||
total: LayerCount,
|
||||
|
||||
last_absolute: Option<SystemTime>,
|
||||
last_relative: Option<FiniteF32>,
|
||||
}
|
||||
|
||||
impl<'a> FromIterator<&'a EvictionCandidate> for EvictionSummary {
|
||||
fn from_iter<T: IntoIterator<Item = &'a EvictionCandidate>>(iter: T) -> Self {
|
||||
let mut summary = EvictionSummary::default();
|
||||
for item in iter {
|
||||
let counts = summary
|
||||
.evicted_per_tenant
|
||||
.entry(*item.layer.get_tenant_shard_id())
|
||||
.or_default();
|
||||
|
||||
let sz = item.layer.get_file_size();
|
||||
|
||||
counts.file_sizes += sz;
|
||||
counts.count += 1;
|
||||
|
||||
summary.total.file_sizes += sz;
|
||||
summary.total.count += 1;
|
||||
|
||||
summary.last_absolute = Some(item.last_activity_ts);
|
||||
summary.last_relative = Some(item.relative_last_activity);
|
||||
}
|
||||
|
||||
summary
|
||||
}
|
||||
}
|
||||
|
||||
struct SiBytesAmount(u64);
|
||||
|
||||
impl std::fmt::Display for SiBytesAmount {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
if self.0 < 1024 {
|
||||
return write!(f, "{}B", self.0);
|
||||
}
|
||||
|
||||
let mut tmp = self.0;
|
||||
let mut ch = 0;
|
||||
let suffixes = b"KMGTPE";
|
||||
|
||||
while tmp > 1024 * 1024 && ch < suffixes.len() - 1 {
|
||||
tmp /= 1024;
|
||||
ch += 1;
|
||||
}
|
||||
|
||||
let ch = suffixes[ch] as char;
|
||||
|
||||
write!(f, "{:.1}{ch}iB", tmp as f64 / 1024.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for EvictionSummary {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
// wasteful, but it's for testing
|
||||
|
||||
let mut sorted: BTreeMap<usize, Vec<(TenantShardId, u64)>> = BTreeMap::new();
|
||||
|
||||
for (tenant_shard_id, count) in &self.evicted_per_tenant {
|
||||
sorted
|
||||
.entry(count.count)
|
||||
.or_default()
|
||||
.push((*tenant_shard_id, count.file_sizes));
|
||||
}
|
||||
|
||||
let total_file_sizes = SiBytesAmount(self.total.file_sizes);
|
||||
|
||||
writeln!(
|
||||
f,
|
||||
"selected {} layers of {total_file_sizes} up to ({:?}, {:.2?}):",
|
||||
self.total.count, self.last_absolute, self.last_relative,
|
||||
)?;
|
||||
|
||||
for (count, per_tenant) in sorted.iter().rev().take(10) {
|
||||
write!(f, "- {count} layers: ")?;
|
||||
|
||||
if per_tenant.len() < 3 {
|
||||
for (i, (tenant_shard_id, bytes)) in per_tenant.iter().enumerate() {
|
||||
if i > 0 {
|
||||
write!(f, ", ")?;
|
||||
}
|
||||
let bytes = SiBytesAmount(*bytes);
|
||||
write!(f, "{tenant_shard_id} ({bytes})")?;
|
||||
}
|
||||
} else {
|
||||
let num_tenants = per_tenant.len();
|
||||
let total_bytes = per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>();
|
||||
let total_bytes = SiBytesAmount(total_bytes);
|
||||
let layers = num_tenants * count;
|
||||
|
||||
write!(
|
||||
f,
|
||||
"{num_tenants} tenants {total_bytes} in total {layers} layers",
|
||||
)?;
|
||||
}
|
||||
|
||||
writeln!(f)?;
|
||||
}
|
||||
|
||||
if sorted.len() > 10 {
|
||||
let (rem_count, rem_bytes) = sorted
|
||||
.iter()
|
||||
.rev()
|
||||
.map(|(count, per_tenant)| {
|
||||
(
|
||||
count,
|
||||
per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>(),
|
||||
)
|
||||
})
|
||||
.fold((0, 0), |acc, next| (acc.0 + next.0, acc.1 + next.1));
|
||||
let rem_bytes = SiBytesAmount(rem_bytes);
|
||||
writeln!(f, "- rest of tenants ({}) not shown ({rem_count} layers or {:.1}%, {rem_bytes} or {:.1}% bytes)", sorted.len() - 10, 100.0 * rem_count as f64 / self.total.count as f64, 100.0 * rem_bytes.0 as f64 / self.total.file_sizes as f64)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
pub fn into_inner(self) -> f32 {
|
||||
self.into()
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1336,3 +1187,40 @@ mod filesystem_level_usage {
|
||||
assert!(!usage.has_pressure());
|
||||
}
|
||||
}
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn relative_equal_bounds() {
|
||||
let order = EvictionOrder::RelativeAccessed {
|
||||
highest_layer_count_loses_first: false,
|
||||
};
|
||||
|
||||
let len = 10;
|
||||
let v = (0..len)
|
||||
.map(|i| order.relative_last_activity(len, i).into_inner())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
assert_eq!(v.first(), Some(&1.0));
|
||||
assert_eq!(v.last(), Some(&0.0));
|
||||
assert!(v.windows(2).all(|slice| slice[0] > slice[1]));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn relative_spare_bounds() {
|
||||
let order = EvictionOrder::RelativeAccessed {
|
||||
highest_layer_count_loses_first: true,
|
||||
};
|
||||
|
||||
let len = 10;
|
||||
let v = (0..len)
|
||||
.map(|i| order.relative_last_activity(len, i).into_inner())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
assert_eq!(v.first(), Some(&1.0));
|
||||
assert_eq!(v.last(), Some(&0.1));
|
||||
assert!(v.windows(2).all(|slice| slice[0] > slice[1]));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -178,6 +178,64 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/time_travel_remote_storage:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: travel_to
|
||||
in: query
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: date-time
|
||||
- name: done_if_after
|
||||
in: query
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: date-time
|
||||
put:
|
||||
description: Time travel the tenant's remote storage to the given timestamp. The tenant must be detached from all pageservers before calling this.
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: string
|
||||
"400":
|
||||
description: Error when no tenant id found in path or invalid timestamp
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
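For reference, a hedged client-side sketch of driving this endpoint from Rust; the `reqwest` dependency, base URL, and example timestamps are assumptions for illustration, not part of the patch.

```rust
use anyhow::{ensure, Result};

/// Illustrative only: issue the PUT described above against a pageserver's HTTP API.
async fn time_travel_remote_storage(
    base_url: &str,
    tenant_id: &str,
    travel_to: &str,     // RFC 3339, e.g. "2024-01-10T12:00:00Z"
    done_if_after: &str, // RFC 3339, must not be earlier than travel_to
) -> Result<()> {
    let url = format!(
        "{base_url}/v1/tenant/{tenant_id}/time_travel_remote_storage?travel_to={travel_to}&done_if_after={done_if_after}"
    );
    let resp = reqwest::Client::new().put(url).send().await?;
    // 200 on success; 400 covers invalid timestamps or a tenant that is still attached.
    ensure!(resp.status().is_success(), "unexpected status: {}", resp.status());
    Ok(())
}
```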
/v1/tenant/{tenant_id}/timeline:
|
||||
parameters:
|
||||
@@ -419,12 +477,6 @@ paths:
|
||||
type: string
|
||||
format: date-time
|
||||
description: A timestamp to get the LSN
|
||||
- name: version
|
||||
in: query
|
||||
required: false
|
||||
schema:
|
||||
type: integer
|
||||
description: The version of the endpoint to use
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
@@ -674,6 +726,10 @@ paths:
|
||||
responses:
|
||||
"200":
|
||||
description: Tenant is now in requested state
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantLocationConfigResponse"
|
||||
"503":
|
||||
description: Tenant's state cannot be changed right now. Wait a few seconds and retry.
|
||||
content:
|
||||
@@ -877,6 +933,56 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
post:
|
||||
description: |
|
||||
Marks the initdb archive for preservation upon deletion of the timeline or tenant.
|
||||
This is meant to be part of the disaster recovery process.
|
||||
responses:
|
||||
"202":
|
||||
description: Initdb archive marked for preservation
|
||||
"404":
|
||||
description: No tenant or timeline found for the specified ids
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/synthetic_size:
|
||||
parameters:
|
||||
@@ -1376,6 +1482,28 @@ components:
|
||||
$ref: '#/components/schemas/SecondaryConfig'
|
||||
tenant_conf:
|
||||
$ref: '#/components/schemas/TenantConfig'
|
||||
TenantLocationConfigResponse:
|
||||
type: object
|
||||
required:
|
||||
- shards
|
||||
properties:
|
||||
shards:
|
||||
description: Pageservers where this tenant's shards are attached. Not populated for secondary locations.
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/TenantShardLocation"
|
||||
TenantShardLocation:
|
||||
type: object
|
||||
required:
|
||||
- node_id
|
||||
- shard_id
|
||||
properties:
|
||||
node_id:
|
||||
description: Pageserver node ID where this shard is attached
|
||||
type: integer
|
||||
shard_id:
|
||||
description: Tenant shard ID of the shard
|
||||
type: string
|
||||
SecondaryConfig:
|
||||
type: object
|
||||
properties:
@@ -17,6 +17,8 @@ use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use pageserver_api::models::LocationConfigListResponse;
|
||||
use pageserver_api::models::ShardParameters;
|
||||
use pageserver_api::models::TenantDetails;
|
||||
use pageserver_api::models::TenantLocationConfigResponse;
|
||||
use pageserver_api::models::TenantShardLocation;
|
||||
use pageserver_api::models::TenantState;
|
||||
use pageserver_api::models::{
|
||||
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
|
||||
@@ -24,6 +26,7 @@ use pageserver_api::models::{
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::TimeTravelError;
|
||||
use tenant_size_model::{SizeResult, StorageModel};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
@@ -45,6 +48,7 @@ use crate::tenant::mgr::{
|
||||
TenantSlotError, TenantSlotUpsertError, TenantStateError,
|
||||
};
|
||||
use crate::tenant::mgr::{TenantSlot, UpsertLocationError};
|
||||
use crate::tenant::remote_timeline_client;
|
||||
use crate::tenant::secondary::SecondaryController;
|
||||
use crate::tenant::size::ModelInputs;
|
||||
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||
@@ -75,8 +79,14 @@ use utils::{
|
||||
// For APIs that require an Active tenant, how long should we block waiting for that state?
|
||||
// This is not functionally necessary (clients will retry), but avoids generating a lot of
|
||||
// failed API calls while tenants are activating.
|
||||
#[cfg(not(feature = "testing"))]
|
||||
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
|
||||
|
||||
// Tests run on slow/oversubscribed nodes, and may need to wait much longer for tenants to
|
||||
// finish attaching, if calls to remote storage are slow.
|
||||
#[cfg(feature = "testing")]
|
||||
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
|
||||
|
||||
pub struct State {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
@@ -187,6 +197,7 @@ impl From<TenantSlotUpsertError> for ApiError {
|
||||
match e {
|
||||
InternalError(e) => ApiError::InternalServerError(anyhow::anyhow!("{e}")),
|
||||
MapState(e) => e.into(),
|
||||
ShuttingDown(_) => ApiError::ShuttingDown,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -495,6 +506,10 @@ async fn timeline_create_handler(
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::CREATED, timeline_info)
|
||||
}
|
||||
Err(_) if tenant.cancel.is_cancelled() => {
|
||||
// In case we get some ugly error type during shutdown, cast it into a clean 503.
|
||||
json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg("Tenant shutting down".to_string()))
|
||||
}
|
||||
Err(tenant::CreateTimelineError::Conflict | tenant::CreateTimelineError::AlreadyCreating) => {
|
||||
json_response(StatusCode::CONFLICT, ())
|
||||
}
|
||||
@@ -561,6 +576,43 @@ async fn timeline_list_handler(
|
||||
json_response(StatusCode::OK, response_data)
|
||||
}
|
||||
|
||||
async fn timeline_preserve_initdb_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
// Part of the process for disaster recovery from safekeeper-stored WAL:
|
||||
// If we don't recover into a new timeline but want to keep the timeline ID,
|
||||
// then the initdb archive is deleted. This endpoint copies it to a different
|
||||
// location where timeline recreation can find it.
|
||||
|
||||
async {
|
||||
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
|
||||
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, false)
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
|
||||
timeline
|
||||
.preserve_initdb_archive()
|
||||
.await
|
||||
.context("preserving initdb archive")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
Ok::<_, ApiError>(())
|
||||
}
|
||||
.instrument(info_span!("timeline_preserve_initdb_archive",
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
shard_id = %tenant_shard_id.shard_slug(),
|
||||
%timeline_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn timeline_detail_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -915,6 +967,7 @@ async fn tenant_status(
|
||||
attachment_status: state.attachment_status(),
|
||||
generation: tenant.generation().into(),
|
||||
},
|
||||
walredo: tenant.wal_redo_manager_status(),
|
||||
timelines: tenant.list_timeline_ids(),
|
||||
})
|
||||
}
|
||||
@@ -1220,19 +1273,9 @@ async fn tenant_create_handler(
|
||||
};
|
||||
// We created the tenant. Existing API semantics are that the tenant
|
||||
// is Active when this function returns.
|
||||
if let res @ Err(_) = new_tenant
|
||||
new_tenant
|
||||
.wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
|
||||
.await
|
||||
{
|
||||
// This shouldn't happen because we just created the tenant directory
|
||||
// in upsert_location, and there aren't any remote timelines
|
||||
// to load, so, nothing can really fail during load.
|
||||
// Don't do cleanup because we don't know how we got here.
|
||||
// The tenant will likely be in `Broken` state and subsequent
|
||||
// calls will fail.
|
||||
res.context("created tenant failed to become active")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
}
|
||||
.await?;
|
||||
|
||||
json_response(
|
||||
StatusCode::CREATED,
|
||||
@@ -1324,7 +1367,7 @@ async fn put_tenant_location_config_handler(
|
||||
let location_conf =
|
||||
LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
|
||||
|
||||
state
|
||||
let attached = state
|
||||
.tenant_manager
|
||||
.upsert_location(
|
||||
tenant_shard_id,
|
||||
@@ -1333,7 +1376,8 @@ async fn put_tenant_location_config_handler(
|
||||
tenant::SpawnMode::Normal,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
.await?
|
||||
.is_some();
|
||||
|
||||
if let Some(_flush_ms) = flush {
|
||||
match state
|
||||
@@ -1352,7 +1396,18 @@ async fn put_tenant_location_config_handler(
|
||||
tracing::info!("No flush requested when configuring");
|
||||
}
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
// This API returns a vector of pageservers where the tenant is attached: this is
|
||||
// primarily for use in the sharding service. For compatibility, we also return this
|
||||
// when called directly on a pageserver, but the payload is always zero or one shards.
|
||||
let mut response = TenantLocationConfigResponse { shards: Vec::new() };
|
||||
if attached {
|
||||
response.shards.push(TenantShardLocation {
|
||||
shard_id: tenant_shard_id,
|
||||
node_id: state.conf.id,
|
||||
})
|
||||
}
|
||||
|
||||
json_response(StatusCode::OK, response)
|
||||
}
|
||||
|
||||
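To make the new response shape concrete, a standalone sketch of the payload follows; the structs are illustrative mirrors of the `pageserver_api` models referenced above, with assumed field types, and the shard id is a placeholder.

```rust
use serde::Serialize;

// Illustrative mirrors of TenantShardLocation / TenantLocationConfigResponse.
#[derive(Serialize)]
struct TenantShardLocation {
    shard_id: String,
    node_id: u64,
}

#[derive(Serialize)]
struct TenantLocationConfigResponse {
    shards: Vec<TenantShardLocation>,
}

fn main() {
    // On a single pageserver the list holds zero entries (secondary) or one entry (attached).
    let resp = TenantLocationConfigResponse {
        shards: vec![TenantShardLocation {
            shard_id: "<tenant_shard_id>".to_string(),
            node_id: 1,
        }],
    };
    println!("{}", serde_json::to_string_pretty(&resp).unwrap());
}
```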
async fn list_location_config_handler(
|
||||
@@ -1377,6 +1432,79 @@ async fn list_location_config_handler(
|
||||
json_response(StatusCode::OK, result)
|
||||
}
|
||||
|
||||
// Do a time travel recovery on the given tenant/tenant shard. Tenant needs to be detached
|
||||
// (from all pageservers) as it invalidates consistency assumptions.
|
||||
async fn tenant_time_travel_remote_storage_handler(
|
||||
request: Request<Body>,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let timestamp_raw = must_get_query_param(&request, "travel_to")?;
|
||||
let timestamp = humantime::parse_rfc3339(×tamp_raw)
|
||||
.with_context(|| format!("Invalid time for travel_to: {timestamp_raw:?}"))
|
||||
.map_err(ApiError::BadRequest)?;
|
||||
|
||||
let done_if_after_raw = must_get_query_param(&request, "done_if_after")?;
|
||||
let done_if_after = humantime::parse_rfc3339(&done_if_after_raw)
|
||||
.with_context(|| format!("Invalid time for done_if_after: {done_if_after_raw:?}"))
|
||||
.map_err(ApiError::BadRequest)?;
|
||||
|
||||
// This is just a sanity check to fend off naive wrong usages of the API:
|
||||
// the tenant needs to be detached *everywhere*
|
||||
let state = get_state(&request);
|
||||
let we_manage_tenant = state.tenant_manager.manages_tenant_shard(tenant_shard_id);
|
||||
if we_manage_tenant {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"Tenant {tenant_shard_id} is already attached at this pageserver"
|
||||
)));
|
||||
}
|
||||
|
||||
let Some(storage) = state.remote_storage.as_ref() else {
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"remote storage not configured, cannot run time travel"
|
||||
)));
|
||||
};
|
||||
|
||||
if timestamp > done_if_after {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"The done_if_after timestamp comes before the timestamp to recover to"
|
||||
)));
|
||||
}
|
||||
|
||||
tracing::info!("Issuing time travel request internally. timestamp={timestamp_raw}, done_if_after={done_if_after_raw}");
|
||||
|
||||
remote_timeline_client::upload::time_travel_recover_tenant(
|
||||
storage,
|
||||
&tenant_shard_id,
|
||||
timestamp,
|
||||
done_if_after,
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
.map_err(|e| match e {
|
||||
TimeTravelError::BadInput(e) => {
|
||||
warn!("bad input error: {e}");
|
||||
ApiError::BadRequest(anyhow!("bad input error"))
|
||||
}
|
||||
TimeTravelError::Unimplemented => {
|
||||
ApiError::BadRequest(anyhow!("unimplemented for the configured remote storage"))
|
||||
}
|
||||
TimeTravelError::Cancelled => ApiError::InternalServerError(anyhow!("cancelled")),
|
||||
TimeTravelError::TooManyVersions => {
|
||||
ApiError::InternalServerError(anyhow!("too many versions in remote storage"))
|
||||
}
|
||||
TimeTravelError::Other(e) => {
|
||||
warn!("internal error: {e}");
|
||||
ApiError::InternalServerError(anyhow!("internal error"))
|
||||
}
|
||||
})?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
/// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`].
|
||||
async fn handle_tenant_break(
|
||||
r: Request<Body>,
|
||||
@@ -1922,6 +2050,10 @@ pub fn make_router(
|
||||
.get("/v1/location_config", |r| {
|
||||
api_handler(r, list_location_config_handler)
|
||||
})
|
||||
.put(
|
||||
"/v1/tenant/:tenant_shard_id/time_travel_remote_storage",
|
||||
|r| api_handler(r, tenant_time_travel_remote_storage_handler),
|
||||
)
|
||||
.get("/v1/tenant/:tenant_shard_id/timeline", |r| {
|
||||
api_handler(r, timeline_list_handler)
|
||||
})
|
||||
@@ -1943,6 +2075,10 @@ pub fn make_router(
|
||||
.post("/v1/tenant/:tenant_id/ignore", |r| {
|
||||
api_handler(r, tenant_ignore_handler)
|
||||
})
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
|
||||
|r| api_handler(r, timeline_preserve_initdb_handler),
|
||||
)
|
||||
.get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
|
||||
api_handler(r, timeline_detail_handler)
|
||||
})
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#![recursion_limit = "300"]
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
|
||||
mod auth;
|
||||
|
||||
@@ -150,6 +150,43 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) struct GetVectoredLatency {
|
||||
map: EnumMap<TaskKind, Option<Histogram>>,
|
||||
}
|
||||
|
||||
impl GetVectoredLatency {
|
||||
// Only these task types perform vectored gets. Filter all other tasks out to reduce total
|
||||
// cardinality of the metric.
|
||||
const TRACKED_TASK_KINDS: [TaskKind; 2] = [TaskKind::Compaction, TaskKind::PageRequestHandler];
|
||||
|
||||
pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> {
|
||||
self.map[task_kind].as_ref()
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(|| {
|
||||
let inner = register_histogram_vec!(
|
||||
"pageserver_get_vectored_seconds",
|
||||
"Time spent in get_vectored",
|
||||
&["task_kind"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
|
||||
GetVectoredLatency {
|
||||
map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| {
|
||||
let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind_idx);
|
||||
|
||||
if GetVectoredLatency::TRACKED_TASK_KINDS.contains(&task_kind) {
|
||||
let task_kind = task_kind.into();
|
||||
Some(inner.with_label_values(&[task_kind]))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})),
|
||||
}
|
||||
});
|
||||
|
||||
pub(crate) struct PageCacheMetricsForTaskKind {
|
||||
pub read_accesses_materialized_page: IntCounter,
|
||||
pub read_accesses_immutable: IntCounter,
|
||||
@@ -932,6 +969,7 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
#[cfg(not(test))]
|
||||
pub(crate) mod virtual_file_descriptor_cache {
|
||||
use super::*;
|
||||
|
||||
@@ -951,6 +989,20 @@ pub(crate) mod virtual_file_descriptor_cache {
|
||||
// ```
|
||||
}
|
||||
|
||||
#[cfg(not(test))]
|
||||
pub(crate) mod virtual_file_io_engine {
|
||||
use super::*;
|
||||
|
||||
pub(crate) static KIND: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_virtual_file_io_engine_kind",
|
||||
"The configured io engine for VirtualFile",
|
||||
&["kind"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct GlobalAndPerTimelineHistogram {
|
||||
global: Histogram,
|
||||
@@ -991,6 +1043,7 @@ pub enum SmgrQueryType {
|
||||
GetRelSize,
|
||||
GetPageAtLsn,
|
||||
GetDbSize,
|
||||
GetSlruSegment,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -1107,11 +1160,12 @@ mod smgr_query_time_tests {
|
||||
#[test]
|
||||
fn op_label_name() {
|
||||
use super::SmgrQueryType::*;
|
||||
let expect: [(super::SmgrQueryType, &'static str); 4] = [
|
||||
let expect: [(super::SmgrQueryType, &'static str); 5] = [
|
||||
(GetRelExists, "get_rel_exists"),
|
||||
(GetRelSize, "get_rel_size"),
|
||||
(GetPageAtLsn, "get_page_at_lsn"),
|
||||
(GetDbSize, "get_db_size"),
|
||||
(GetSlruSegment, "get_slru_segment"),
|
||||
];
|
||||
for (op, expect) in expect {
|
||||
let actual: &'static str = op.into();
|
||||
@@ -1597,11 +1651,18 @@ pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
#[rustfmt::skip]
|
||||
pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_process_launch_duration",
|
||||
"Histogram of the duration of successful WalRedoProcess::launch calls",
|
||||
redo_histogram_time_buckets!(),
|
||||
vec![
|
||||
0.0002, 0.0004, 0.0006, 0.0008, 0.0010,
|
||||
0.0020, 0.0040, 0.0060, 0.0080, 0.0100,
|
||||
0.0200, 0.0400, 0.0600, 0.0800, 0.1000,
|
||||
0.2000, 0.4000, 0.6000, 0.8000, 1.0000,
|
||||
1.5000, 2.0000, 2.5000, 3.0000, 4.0000, 10.0000
|
||||
],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
@@ -22,7 +22,8 @@ use pageserver_api::models::{
|
||||
PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
|
||||
PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
|
||||
PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
|
||||
PagestreamNblocksRequest, PagestreamNblocksResponse,
|
||||
PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest,
|
||||
PagestreamNblocksResponse,
|
||||
};
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
use pageserver_api::shard::{ShardCount, ShardNumber};
|
||||
@@ -74,8 +75,8 @@ use crate::tenant::GetTimelineError;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::trace::Tracer;
|
||||
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
use pageserver_api::reltag::SlruKind;
|
||||
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
|
||||
use postgres_ffi::BLCKSZ;
|
||||
|
||||
@@ -322,8 +323,8 @@ enum PageStreamError {
|
||||
Shutdown,
|
||||
|
||||
/// Something went wrong reading a page: this likely indicates a pageserver bug
|
||||
#[error("Read error: {0}")]
|
||||
Read(PageReconstructError),
|
||||
#[error("Read error")]
|
||||
Read(#[source] PageReconstructError),
|
||||
|
||||
/// Ran out of time waiting for an LSN
|
||||
#[error("LSN timeout: {0}")]
|
||||
@@ -332,11 +333,11 @@ enum PageStreamError {
|
||||
/// The entity required to serve the request (tenant or timeline) is not found,
|
||||
/// or is not found in a suitable state to serve a request.
|
||||
#[error("Not found: {0}")]
|
||||
NotFound(std::borrow::Cow<'static, str>),
|
||||
NotFound(Cow<'static, str>),
|
||||
|
||||
/// Request asked for something that doesn't make sense, like an invalid LSN
|
||||
#[error("Bad request: {0}")]
|
||||
BadRequest(std::borrow::Cow<'static, str>),
|
||||
BadRequest(Cow<'static, str>),
|
||||
}
|
||||
|
||||
impl From<PageReconstructError> for PageStreamError {
|
||||
@@ -368,6 +369,16 @@ impl From<WaitLsnError> for PageStreamError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<WaitLsnError> for QueryError {
|
||||
fn from(value: WaitLsnError) -> Self {
|
||||
match value {
|
||||
e @ WaitLsnError::Timeout(_) => Self::Other(anyhow::Error::new(e)),
|
||||
WaitLsnError::Shutdown => Self::Shutdown,
|
||||
WaitLsnError::BadState => Self::Reconnect,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PageServerHandler {
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
@@ -637,6 +648,15 @@ impl PageServerHandler {
|
||||
span,
|
||||
)
|
||||
}
|
||||
PagestreamFeMessage::GetSlruSegment(req) => {
|
||||
let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.lsn);
|
||||
(
|
||||
self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx)
|
||||
.instrument(span.clone())
|
||||
.await,
|
||||
span,
|
||||
)
|
||||
}
|
||||
};
|
||||
|
||||
match response {
|
||||
@@ -667,7 +687,10 @@ impl PageServerHandler {
|
||||
// print all the details to the log with {:#}, but for the client the
|
||||
// error message is enough. Do not log if shutting down, as the anyhow::Error
|
||||
// here includes cancellation which is not an error.
|
||||
span.in_scope(|| error!("error reading relation or page version: {:#}", e));
|
||||
let full = utils::error::report_compact_sources(&e);
|
||||
span.in_scope(|| {
|
||||
error!("error reading relation or page version: {full:#}")
|
||||
});
|
||||
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||
message: e.to_string(),
|
||||
})
|
||||
@@ -1124,6 +1147,33 @@ impl PageServerHandler {
|
||||
}))
|
||||
}
|
||||
|
||||
async fn handle_get_slru_segment_request(
|
||||
&mut self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
req: &PagestreamGetSlruSegmentRequest,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
|
||||
|
||||
let _timer = timeline
|
||||
.query_metrics
|
||||
.start_timer(metrics::SmgrQueryType::GetSlruSegment);
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn =
|
||||
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
|
||||
.await?;
|
||||
|
||||
let kind = SlruKind::from_repr(req.kind)
|
||||
.ok_or(PageStreamError::BadRequest("invalid SLRU kind".into()))?;
|
||||
let segment = timeline.get_slru_segment(kind, req.segno, lsn, ctx).await?;
|
||||
|
||||
Ok(PagestreamBeMessage::GetSlruSegment(
|
||||
PagestreamGetSlruSegmentResponse { segment },
|
||||
))
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[instrument(skip_all, fields(?lsn, ?prev_lsn, %full_backup))]
|
||||
async fn handle_basebackup_request<IO>(
|
||||
@@ -1136,7 +1186,7 @@ impl PageServerHandler {
|
||||
full_backup: bool,
|
||||
gzip: bool,
|
||||
ctx: RequestContext,
|
||||
) -> anyhow::Result<()>
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
@@ -1401,7 +1451,7 @@ where
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
anyhow::Ok(())
|
||||
Result::<(), QueryError>::Ok(())
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
@@ -1675,6 +1725,7 @@ impl From<GetActiveTenantError> for QueryError {
|
||||
| GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
|
||||
QueryError::Shutdown
|
||||
}
|
||||
e @ GetActiveTenantError::NotFound(_) => QueryError::NotFound(format!("{e}").into()),
|
||||
e => QueryError::Other(anyhow::anyhow!(e)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@ use crate::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use crate::repository::*;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::{ensure, Context};
|
||||
use bytes::{Buf, Bytes};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use pageserver_api::key::{
|
||||
dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
|
||||
rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
|
||||
@@ -27,6 +27,7 @@ use serde::{Deserialize, Serialize};
|
||||
use std::collections::{hash_map, HashMap, HashSet};
|
||||
use std::ops::ControlFlow;
|
||||
use std::ops::Range;
|
||||
use strum::IntoEnumIterator;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, trace, warn};
|
||||
use utils::bin_ser::DeserializeError;
|
||||
@@ -320,6 +321,27 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the whole SLRU segment
|
||||
pub(crate) async fn get_slru_segment(
|
||||
&self,
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
let n_blocks = self
|
||||
.get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx)
|
||||
.await?;
|
||||
let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
|
||||
for blkno in 0..n_blocks {
|
||||
let block = self
|
||||
.get_slru_page_at_lsn(kind, segno, blkno, lsn, ctx)
|
||||
.await?;
|
||||
segment.extend_from_slice(&block[..BLCKSZ as usize]);
|
||||
}
|
||||
Ok(segment.freeze())
|
||||
}
|
||||
|
||||
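The segment above is assembled with the `bytes` crate: a `BytesMut` sized for `n_blocks * BLCKSZ` is filled block by block and then frozen into an immutable `Bytes`. A standalone sketch of that pattern, with a made-up block source standing in for `get_slru_page_at_lsn`:

```rust
use bytes::{Bytes, BytesMut};

const BLCKSZ: usize = 8192; // Postgres block size, as in postgres_ffi::BLCKSZ

// Stand-in for get_slru_page_at_lsn: fabricate one block's worth of data.
fn read_block(blkno: u32) -> Vec<u8> {
    vec![blkno as u8; BLCKSZ]
}

fn assemble_segment(n_blocks: u32) -> Bytes {
    // Reserve the full segment up front to avoid reallocations while appending.
    let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ);
    for blkno in 0..n_blocks {
        let block = read_block(blkno);
        segment.extend_from_slice(&block[..BLCKSZ]);
    }
    // freeze() turns the mutable buffer into a cheaply cloneable, immutable Bytes.
    segment.freeze()
}

fn main() {
    let segment = assemble_segment(4);
    assert_eq!(segment.len(), 4 * BLCKSZ);
}
```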
/// Look up given SLRU page version.
|
||||
pub(crate) async fn get_slru_page_at_lsn(
|
||||
&self,
|
||||
@@ -533,6 +555,33 @@ impl Timeline {
|
||||
Ok(Default::default())
|
||||
}
|
||||
|
||||
pub(crate) async fn get_slru_keyspace(
|
||||
&self,
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<KeySpace, PageReconstructError> {
|
||||
let mut accum = KeySpaceAccum::new();
|
||||
|
||||
for kind in SlruKind::iter() {
|
||||
let mut segments: Vec<u32> = self
|
||||
.list_slru_segments(kind, version, ctx)
|
||||
.await?
|
||||
.into_iter()
|
||||
.collect();
|
||||
segments.sort_unstable();
|
||||
|
||||
for seg in segments {
|
||||
let block_count = self.get_slru_segment_size(kind, seg, version, ctx).await?;
|
||||
|
||||
accum.add_range(
|
||||
slru_block_to_key(kind, seg, 0)..slru_block_to_key(kind, seg, block_count),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(accum.to_keyspace())
|
||||
}
|
||||
|
||||
/// Get a list of SLRU segments
|
||||
pub(crate) async fn list_slru_segments(
|
||||
&self,
|
||||
|
||||
@@ -20,6 +20,7 @@ use futures::FutureExt;
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::models;
|
||||
use pageserver_api::models::TimelineState;
|
||||
use pageserver_api::models::WalRedoManagerStatus;
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::DownloadError;
|
||||
@@ -91,7 +92,6 @@ use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
use std::ops::Bound::Included;
|
||||
use std::process::Stdio;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::Arc;
|
||||
@@ -365,6 +365,14 @@ impl WalRedoManager {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
|
||||
match self {
|
||||
WalRedoManager::Prod(m) => m.status(),
|
||||
#[cfg(test)]
|
||||
WalRedoManager::Test(_) => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error, PartialEq, Eq)]
|
||||
@@ -628,9 +636,15 @@ impl Tenant {
|
||||
deletion_queue_client,
|
||||
));
|
||||
|
||||
// The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
|
||||
// we shut down while attaching.
|
||||
let Ok(attach_gate_guard) = tenant.gate.enter() else {
|
||||
// We just created the Tenant: nothing else can have shut it down yet
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
// Do all the hard work in the background
|
||||
let tenant_clone = Arc::clone(&tenant);
|
||||
|
||||
let ctx = ctx.detached_child(TaskKind::Attach, DownloadBehavior::Warn);
|
||||
task_mgr::spawn(
|
||||
&tokio::runtime::Handle::current(),
|
||||
@@ -640,6 +654,8 @@ impl Tenant {
|
||||
"attach tenant",
|
||||
false,
|
||||
async move {
|
||||
let _gate_guard = attach_gate_guard;
|
||||
|
||||
// Is this tenant being spawned as part of process startup?
|
||||
let starting_up = init_order.is_some();
|
||||
scopeguard::defer! {
|
||||
@@ -814,7 +830,7 @@ impl Tenant {
|
||||
SpawnMode::Create => None,
|
||||
SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
|
||||
};
|
||||
match tenant_clone.attach(preload, &ctx).await {
|
||||
match tenant_clone.attach(preload, mode, &ctx).await {
|
||||
Ok(()) => {
|
||||
info!("attach finished, activating");
|
||||
if let Some(t)= attach_timer {t.observe_duration();}
|
||||
@@ -901,15 +917,20 @@ impl Tenant {
|
||||
async fn attach(
|
||||
self: &Arc<Tenant>,
|
||||
preload: Option<TenantPreload>,
|
||||
mode: SpawnMode,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
failpoint_support::sleep_millis_async!("before-attaching-tenant");
|
||||
|
||||
let preload = match preload {
|
||||
Some(p) => p,
|
||||
None => {
|
||||
let preload = match (preload, mode) {
|
||||
(Some(p), _) => p,
|
||||
(None, SpawnMode::Create) => TenantPreload {
|
||||
deleting: false,
|
||||
timelines: HashMap::new(),
|
||||
},
|
||||
(None, SpawnMode::Normal) => {
|
||||
// Deprecated dev mode: load from local disk state instead of remote storage
|
||||
// https://github.com/neondatabase/neon/issues/5624
|
||||
return self.load_local(ctx).await;
|
||||
@@ -1008,6 +1029,7 @@ impl Tenant {
|
||||
Some(remote_timeline_client),
|
||||
self.deletion_queue_client.clone(),
|
||||
)
|
||||
.instrument(tracing::info_span!("timeline_delete", %timeline_id))
|
||||
.await
|
||||
.context("resume_deletion")
|
||||
.map_err(LoadLocalTimelineError::ResumeDeletion)?;
|
||||
@@ -1017,7 +1039,10 @@ impl Tenant {
|
||||
// IndexPart is the source of truth.
|
||||
self.clean_up_timelines(&existent_timelines)?;
|
||||
|
||||
failpoint_support::sleep_millis_async!("attach-before-activate", &self.cancel);
|
||||
fail::fail_point!("attach-before-activate", |_| {
|
||||
anyhow::bail!("attach-before-activate");
|
||||
});
|
||||
failpoint_support::sleep_millis_async!("attach-before-activate-sleep", &self.cancel);
|
||||
|
||||
info!("Done");
|
||||
|
||||
@@ -1681,9 +1706,13 @@ impl Tenant {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||
if !self.is_active() {
|
||||
return Err(CreateTimelineError::Other(anyhow::anyhow!(
|
||||
"Cannot create timelines on inactive tenant"
|
||||
)));
|
||||
if matches!(self.current_state(), TenantState::Stopping { .. }) {
|
||||
return Err(CreateTimelineError::ShuttingDown);
|
||||
} else {
|
||||
return Err(CreateTimelineError::Other(anyhow::anyhow!(
|
||||
"Cannot create timelines on inactive tenant"
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
let _gate = self
|
||||
@@ -1937,6 +1966,10 @@ impl Tenant {
|
||||
self.generation
|
||||
}
|
||||
|
||||
pub(crate) fn wal_redo_manager_status(&self) -> Option<WalRedoManagerStatus> {
|
||||
self.walredo_mgr.status()
|
||||
}
|
||||
|
||||
/// Changes tenant status to active, unless shutdown was already requested.
|
||||
///
|
||||
/// `background_jobs_can_start` is an optional barrier set to a value during pageserver startup
|
||||
@@ -2074,7 +2107,10 @@ impl Tenant {
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
timelines.values().for_each(|timeline| {
|
||||
let timeline = Arc::clone(timeline);
|
||||
let span = Span::current();
|
||||
let timeline_id = timeline.timeline_id;
|
||||
|
||||
let span =
|
||||
tracing::info_span!("timeline_shutdown", %timeline_id, ?freeze_and_flush);
|
||||
js.spawn(async move {
|
||||
if freeze_and_flush {
|
||||
timeline.flush_and_shutdown().instrument(span).await
|
||||
@@ -2674,7 +2710,7 @@ impl Tenant {
|
||||
activate_now_sem: tokio::sync::Semaphore::new(0),
|
||||
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
|
||||
cancel: CancellationToken::default(),
|
||||
gate: Gate::new(format!("Tenant<{tenant_shard_id}>")),
|
||||
gate: Gate::default(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3759,27 +3795,30 @@ async fn run_initdb(
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", &initdb_lib_dir)
|
||||
.env("DYLD_LIBRARY_PATH", &initdb_lib_dir)
|
||||
.stdout(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
// If the `select!` below doesn't finish the `wait_with_output`,
|
||||
// let the task get `wait()`ed for asynchronously by tokio.
|
||||
// This means there is a slim chance we can go over the INIT_DB_SEMAPHORE.
|
||||
// TODO: fix for this is non-trivial, see
|
||||
// https://github.com/neondatabase/neon/pull/5921#pullrequestreview-1750858021
|
||||
//
|
||||
.kill_on_drop(true)
|
||||
.stdin(std::process::Stdio::null())
|
||||
// stdout invocation produces the same output every time, we don't need it
|
||||
.stdout(std::process::Stdio::null())
|
||||
// we would be interested in the stderr output, if there was any
|
||||
.stderr(std::process::Stdio::piped())
|
||||
.spawn()?;
|
||||
|
||||
tokio::select! {
|
||||
initdb_output = initdb_command.wait_with_output() => {
|
||||
let initdb_output = initdb_output?;
|
||||
if !initdb_output.status.success() {
|
||||
return Err(InitdbError::Failed(initdb_output.status, initdb_output.stderr));
|
||||
}
|
||||
}
|
||||
_ = cancel.cancelled() => {
|
||||
return Err(InitdbError::Cancelled);
|
||||
}
|
||||
// Ideally we'd select here with the cancellation token, but the problem is that
|
||||
// we can't safely terminate initdb: it launches processes of its own, and killing
|
||||
// initdb doesn't kill them. After we return from this function, we want the target
|
||||
// directory to be able to be cleaned up.
|
||||
// See https://github.com/neondatabase/neon/issues/6385
|
||||
let initdb_output = initdb_command.wait_with_output().await?;
|
||||
if !initdb_output.status.success() {
|
||||
return Err(InitdbError::Failed(
|
||||
initdb_output.status,
|
||||
initdb_output.stderr,
|
||||
));
|
||||
}
|
||||
|
||||
// This isn't true cancellation support, see above. Still return an error to
|
||||
// exercise the cancellation code path.
|
||||
if cancel.is_cancelled() {
|
||||
return Err(InitdbError::Cancelled);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -3881,6 +3920,7 @@ pub(crate) mod harness {
|
||||
),
|
||||
gc_feedback: Some(tenant_conf.gc_feedback),
|
||||
heatmap_period: Some(tenant_conf.heatmap_period),
|
||||
lazy_slru_download: Some(tenant_conf.lazy_slru_download),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -4035,7 +4075,7 @@ pub(crate) mod harness {
|
||||
.instrument(info_span!("try_load_preload", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
|
||||
.await?;
|
||||
tenant
|
||||
.attach(Some(preload), ctx)
|
||||
.attach(Some(preload), SpawnMode::Normal, ctx)
|
||||
.instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
|
||||
.await?;
|
||||
}
|
||||
@@ -5203,7 +5243,7 @@ mod tests {
|
||||
let raw_tline = tline.raw_timeline().unwrap();
|
||||
raw_tline
|
||||
.shutdown()
|
||||
.instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id))
|
||||
.instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, timeline_id=%TIMELINE_ID))
|
||||
.await;
|
||||
std::mem::forget(tline);
|
||||
}
|
||||
|
||||
@@ -5,10 +5,10 @@
|
||||
use super::ephemeral_file::EphemeralFile;
|
||||
use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
|
||||
use crate::page_cache::{self, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use bytes::Bytes;
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::ops::Deref;
|
||||
|
||||
/// This is implemented by anything that can read 8 kB (PAGE_SZ)
|
||||
/// blocks, using the page cache
|
||||
@@ -39,6 +39,8 @@ pub enum BlockLease<'a> {
|
||||
EphemeralFileMutableTail(&'a [u8; PAGE_SZ]),
|
||||
#[cfg(test)]
|
||||
Arc(std::sync::Arc<[u8; PAGE_SZ]>),
|
||||
#[cfg(test)]
|
||||
Vec(Vec<u8>),
|
||||
}
|
||||
|
||||
impl From<PageReadGuard<'static>> for BlockLease<'static> {
|
||||
@@ -63,6 +65,10 @@ impl<'a> Deref for BlockLease<'a> {
|
||||
BlockLease::EphemeralFileMutableTail(v) => v,
|
||||
#[cfg(test)]
|
||||
BlockLease::Arc(v) => v.deref(),
|
||||
#[cfg(test)]
|
||||
BlockLease::Vec(v) => {
|
||||
TryFrom::try_from(&v[..]).expect("caller must ensure that v has PAGE_SZ")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -169,10 +175,14 @@ impl FileBlockReader {
|
||||
}
|
||||
|
||||
/// Read a page from the underlying file into given buffer.
|
||||
async fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), std::io::Error> {
|
||||
async fn fill_buffer(
|
||||
&self,
|
||||
buf: PageWriteGuard<'static>,
|
||||
blkno: u32,
|
||||
) -> Result<PageWriteGuard<'static>, std::io::Error> {
|
||||
assert!(buf.len() == PAGE_SZ);
|
||||
self.file
|
||||
.read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
|
||||
.read_exact_at_page(buf, blkno as u64 * PAGE_SZ as u64)
|
||||
.await
|
||||
}
|
||||
/// Read a block.
|
||||
@@ -196,9 +206,9 @@ impl FileBlockReader {
|
||||
)
|
||||
})? {
|
||||
ReadBufResult::Found(guard) => Ok(guard.into()),
|
||||
ReadBufResult::NotFound(mut write_guard) => {
|
||||
ReadBufResult::NotFound(write_guard) => {
|
||||
// Read the page from disk into the buffer
|
||||
self.fill_buffer(write_guard.deref_mut(), blknum).await?;
|
||||
let write_guard = self.fill_buffer(write_guard, blknum).await?;
|
||||
Ok(write_guard.mark_valid().into())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -345,6 +345,9 @@ pub struct TenantConf {
|
||||
/// may be disabled if a Tenant will not have secondary locations: only secondary
|
||||
/// locations will use the heatmap uploaded by attached locations.
|
||||
pub heatmap_period: Duration,
|
||||
|
||||
/// If true, SLRU segments are downloaded on demand; if false, SLRU segments are included in the basebackup
|
||||
pub lazy_slru_download: bool,
|
||||
}
|
||||
|
||||
/// Same as TenantConf, but this struct preserves the information about
|
||||
@@ -430,6 +433,10 @@ pub struct TenantConfOpt {
|
||||
#[serde(with = "humantime_serde")]
|
||||
#[serde(default)]
|
||||
pub heatmap_period: Option<Duration>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub lazy_slru_download: Option<bool>,
|
||||
}
|
||||
|
||||
impl TenantConfOpt {
|
||||
@@ -475,6 +482,9 @@ impl TenantConfOpt {
|
||||
.unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold),
|
||||
gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback),
|
||||
heatmap_period: self.heatmap_period.unwrap_or(global_conf.heatmap_period),
|
||||
lazy_slru_download: self
|
||||
.lazy_slru_download
|
||||
.unwrap_or(global_conf.lazy_slru_download),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -513,6 +523,7 @@ impl Default for TenantConf {
|
||||
.expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
|
||||
gc_feedback: false,
|
||||
heatmap_period: Duration::ZERO,
|
||||
lazy_slru_download: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -584,6 +595,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
|
||||
.map(humantime),
|
||||
gc_feedback: value.gc_feedback,
|
||||
heatmap_period: value.heatmap_period.map(humantime),
|
||||
lazy_slru_download: value.lazy_slru_download,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -136,7 +136,11 @@ async fn schedule_ordered_timeline_deletions(
|
||||
let mut already_running_deletions = vec![];
|
||||
|
||||
for (timeline_id, _) in sorted.into_iter().rev() {
|
||||
if let Err(e) = DeleteTimelineFlow::run(tenant, timeline_id, true).await {
|
||||
let span = tracing::info_span!("timeline_delete", %timeline_id);
|
||||
let res = DeleteTimelineFlow::run(tenant, timeline_id, true)
|
||||
.instrument(span)
|
||||
.await;
|
||||
if let Err(e) = res {
|
||||
match e {
|
||||
DeleteTimelineError::NotFound => {
|
||||
// Timeline deletion finished after call to clone above but before call
|
||||
@@ -409,7 +413,10 @@ impl DeleteTenantFlow {
|
||||
.await
|
||||
.expect("cant be stopping or broken");
|
||||
|
||||
tenant.attach(preload, ctx).await.context("attach")?;
|
||||
tenant
|
||||
.attach(preload, super::SpawnMode::Normal, ctx)
|
||||
.await
|
||||
.context("attach")?;
|
||||
|
||||
Self::background(
|
||||
guard,
|
||||
|
||||
@@ -5,11 +5,11 @@ use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::{self, PAGE_SZ};
|
||||
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::cmp::min;
|
||||
use std::fs::OpenOptions;
|
||||
|
||||
use std::io::{self, ErrorKind};
|
||||
use std::ops::DerefMut;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
@@ -47,7 +47,10 @@ impl EphemeralFile {
|
||||
|
||||
let file = VirtualFile::open_with_options(
|
||||
&filename,
|
||||
OpenOptions::new().read(true).write(true).create(true),
|
||||
virtual_file::OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.create(true),
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -89,11 +92,10 @@ impl EphemeralFile {
|
||||
page_cache::ReadBufResult::Found(guard) => {
|
||||
return Ok(BlockLease::PageReadGuard(guard))
|
||||
}
|
||||
page_cache::ReadBufResult::NotFound(mut write_guard) => {
|
||||
let buf: &mut [u8] = write_guard.deref_mut();
|
||||
debug_assert_eq!(buf.len(), PAGE_SZ);
|
||||
self.file
|
||||
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
|
||||
page_cache::ReadBufResult::NotFound(write_guard) => {
|
||||
let write_guard = self
|
||||
.file
|
||||
.read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64)
|
||||
.await?;
|
||||
let read_guard = write_guard.mark_valid();
|
||||
return Ok(BlockLease::PageReadGuard(read_guard));
|
||||
|
||||
@@ -51,7 +51,10 @@ use crate::keyspace::KeyPartitioning;
|
||||
use crate::repository::Key;
|
||||
use crate::tenant::storage_layer::InMemoryLayer;
|
||||
use anyhow::Result;
|
||||
use std::collections::VecDeque;
|
||||
use pageserver_api::keyspace::KeySpaceAccum;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::{BTreeMap, VecDeque};
|
||||
use std::iter::Peekable;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use utils::lsn::Lsn;
|
||||
@@ -144,11 +147,221 @@ impl Drop for BatchedUpdates<'_> {
|
||||
}
|
||||
|
||||
/// Return value of LayerMap::search
|
||||
#[derive(Eq, PartialEq, Debug)]
|
||||
pub struct SearchResult {
|
||||
pub layer: Arc<PersistentLayerDesc>,
|
||||
pub lsn_floor: Lsn,
|
||||
}
|
||||
|
||||
pub struct OrderedSearchResult(SearchResult);
|
||||
|
||||
impl Ord for OrderedSearchResult {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
self.0.lsn_floor.cmp(&other.0.lsn_floor)
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for OrderedSearchResult {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for OrderedSearchResult {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.0.lsn_floor == other.0.lsn_floor
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for OrderedSearchResult {}
|
||||
|
||||
pub struct RangeSearchResult {
|
||||
pub found: BTreeMap<OrderedSearchResult, KeySpaceAccum>,
|
||||
pub not_found: KeySpaceAccum,
|
||||
}
|
||||
|
||||
impl RangeSearchResult {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
found: BTreeMap::new(),
|
||||
not_found: KeySpaceAccum::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Collector for results of range search queries on the LayerMap.
|
||||
/// It should be provided with two iterators for the delta and image coverage
|
||||
/// that contain all the changes for layers which intersect the range.
|
||||
struct RangeSearchCollector<Iter>
|
||||
where
|
||||
Iter: Iterator<Item = (i128, Option<Arc<PersistentLayerDesc>>)>,
|
||||
{
|
||||
delta_coverage: Peekable<Iter>,
|
||||
image_coverage: Peekable<Iter>,
|
||||
key_range: Range<Key>,
|
||||
end_lsn: Lsn,
|
||||
|
||||
current_delta: Option<Arc<PersistentLayerDesc>>,
|
||||
current_image: Option<Arc<PersistentLayerDesc>>,
|
||||
|
||||
result: RangeSearchResult,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
enum NextLayerType {
|
||||
Delta(i128),
|
||||
Image(i128),
|
||||
Both(i128),
|
||||
}
|
||||
|
||||
impl NextLayerType {
|
||||
fn next_change_at_key(&self) -> Key {
|
||||
match self {
|
||||
NextLayerType::Delta(at) => Key::from_i128(*at),
|
||||
NextLayerType::Image(at) => Key::from_i128(*at),
|
||||
NextLayerType::Both(at) => Key::from_i128(*at),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<Iter> RangeSearchCollector<Iter>
|
||||
where
|
||||
Iter: Iterator<Item = (i128, Option<Arc<PersistentLayerDesc>>)>,
|
||||
{
|
||||
fn new(
|
||||
key_range: Range<Key>,
|
||||
end_lsn: Lsn,
|
||||
delta_coverage: Iter,
|
||||
image_coverage: Iter,
|
||||
) -> Self {
|
||||
Self {
|
||||
delta_coverage: delta_coverage.peekable(),
|
||||
image_coverage: image_coverage.peekable(),
|
||||
key_range,
|
||||
end_lsn,
|
||||
current_delta: None,
|
||||
current_image: None,
|
||||
result: RangeSearchResult::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Run the collector. Collection is implemented via a two-pointer algorithm.
/// One pointer tracks the start of the current range and the other tracks
|
||||
/// the beginning of the next range which will overlap with the next change
|
||||
/// in coverage across both image and delta.
|
||||
fn collect(mut self) -> RangeSearchResult {
|
||||
let next_layer_type = self.choose_next_layer_type();
|
||||
let mut current_range_start = match next_layer_type {
|
||||
None => {
|
||||
// No changes for the range
|
||||
self.pad_range(self.key_range.clone());
|
||||
return self.result;
|
||||
}
|
||||
Some(layer_type) if self.key_range.end <= layer_type.next_change_at_key() => {
|
||||
// Changes only after the end of the range
|
||||
self.pad_range(self.key_range.clone());
|
||||
return self.result;
|
||||
}
|
||||
Some(layer_type) => {
|
||||
// Changes for the range exist. Record anything before the first
|
||||
// coverage change as not found.
|
||||
let coverage_start = layer_type.next_change_at_key();
|
||||
let range_before = self.key_range.start..coverage_start;
|
||||
self.pad_range(range_before);
|
||||
|
||||
self.advance(&layer_type);
|
||||
coverage_start
|
||||
}
|
||||
};
|
||||
|
||||
while current_range_start < self.key_range.end {
|
||||
let next_layer_type = self.choose_next_layer_type();
|
||||
match next_layer_type {
|
||||
Some(t) => {
|
||||
let current_range_end = t.next_change_at_key();
|
||||
self.add_range(current_range_start..current_range_end);
|
||||
current_range_start = current_range_end;
|
||||
|
||||
self.advance(&t);
|
||||
}
|
||||
None => {
|
||||
self.add_range(current_range_start..self.key_range.end);
|
||||
current_range_start = self.key_range.end;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.result
|
||||
}
|
||||
|
||||
/// Mark a range as not found (i.e. no layers intersect it)
|
||||
fn pad_range(&mut self, key_range: Range<Key>) {
|
||||
if !key_range.is_empty() {
|
||||
self.result.not_found.add_range(key_range);
|
||||
}
|
||||
}
|
||||
|
||||
/// Select the appropriate layer for the given range and update
/// the collector.
|
||||
fn add_range(&mut self, covered_range: Range<Key>) {
|
||||
let selected = LayerMap::select_layer(
|
||||
self.current_delta.clone(),
|
||||
self.current_image.clone(),
|
||||
self.end_lsn,
|
||||
);
|
||||
|
||||
match selected {
|
||||
Some(search_result) => self
|
||||
.result
|
||||
.found
|
||||
.entry(OrderedSearchResult(search_result))
|
||||
.or_default()
|
||||
.add_range(covered_range),
|
||||
None => self.pad_range(covered_range),
|
||||
}
|
||||
}
|
||||
|
||||
/// Move to the next coverage change.
|
||||
fn advance(&mut self, layer_type: &NextLayerType) {
|
||||
match layer_type {
|
||||
NextLayerType::Delta(_) => {
|
||||
let (_, layer) = self.delta_coverage.next().unwrap();
|
||||
self.current_delta = layer;
|
||||
}
|
||||
NextLayerType::Image(_) => {
|
||||
let (_, layer) = self.image_coverage.next().unwrap();
|
||||
self.current_image = layer;
|
||||
}
|
||||
NextLayerType::Both(_) => {
|
||||
let (_, image_layer) = self.image_coverage.next().unwrap();
|
||||
let (_, delta_layer) = self.delta_coverage.next().unwrap();
|
||||
|
||||
self.current_image = image_layer;
|
||||
self.current_delta = delta_layer;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Pick the next coverage change: the one at the lesser key, or both if they're aligned.
fn choose_next_layer_type(&mut self) -> Option<NextLayerType> {
|
||||
let next_delta_at = self.delta_coverage.peek().map(|(key, _)| key);
|
||||
let next_image_at = self.image_coverage.peek().map(|(key, _)| key);
|
||||
|
||||
match (next_delta_at, next_image_at) {
|
||||
(None, None) => None,
|
||||
(Some(next_delta_at), None) => Some(NextLayerType::Delta(*next_delta_at)),
|
||||
(None, Some(next_image_at)) => Some(NextLayerType::Image(*next_image_at)),
|
||||
(Some(next_delta_at), Some(next_image_at)) if next_image_at < next_delta_at => {
|
||||
Some(NextLayerType::Image(*next_image_at))
|
||||
}
|
||||
(Some(next_delta_at), Some(next_image_at)) if next_delta_at < next_image_at => {
|
||||
Some(NextLayerType::Delta(*next_delta_at))
|
||||
}
|
||||
(Some(next_delta_at), Some(_)) => Some(NextLayerType::Both(*next_delta_at)),
|
||||
}
|
||||
}
|
||||
}
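The RangeSearchCollector above merges two sorted streams of coverage changes (delta and image) with two peekable cursors. The core of that two-pointer walk can be sketched in isolation; the types below (plain i128 keys and &str layer names) are simplified stand-ins for the real coverage iterators:

// Simplified stand-in for the delta/image coverage iterators: each yields
// (change_key, layer_in_effect_from_that_key). None means "no layer".
type Change = (i128, Option<&'static str>);

// Two-pointer walk over both sorted change streams, advancing whichever cursor(s)
// sit at the smallest next key -- the same idea as choose_next_layer_type()/advance().
fn merge_changes(
    delta: Vec<Change>,
    image: Vec<Change>,
) -> Vec<(i128, Option<&'static str>, Option<&'static str>)> {
    let mut delta = delta.into_iter().peekable();
    let mut image = image.into_iter().peekable();
    let (mut cur_delta, mut cur_image) = (None, None);
    let mut out = Vec::new();

    loop {
        let next_key = match (delta.peek(), image.peek()) {
            (None, None) => break,
            (Some((d, _)), None) => *d,
            (None, Some((i, _))) => *i,
            (Some((d, _)), Some((i, _))) => (*d).min(*i),
        };
        if delta.peek().map(|(k, _)| *k) == Some(next_key) {
            cur_delta = delta.next().unwrap().1;
        }
        if image.peek().map(|(k, _)| *k) == Some(next_key) {
            cur_image = image.next().unwrap().1;
        }
        // From `next_key` onwards this (delta, image) combination is in effect.
        out.push((next_key, cur_delta, cur_image));
    }
    out
}

fn main() {
    let delta = vec![(10, Some("delta-a")), (20, None)];
    let image = vec![(15, Some("img-b")), (25, None)];
    let merged = merge_changes(delta, image);
    assert_eq!(merged.len(), 4); // change points at keys 10, 15, 20 and 25
    assert_eq!(merged[1], (15, Some("delta-a"), Some("img-b")));
}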
impl LayerMap {
|
||||
///
|
||||
/// Find the latest layer (by lsn.end) that covers the given
|
||||
@@ -186,7 +399,18 @@ impl LayerMap {
|
||||
let latest_delta = version.delta_coverage.query(key.to_i128());
|
||||
let latest_image = version.image_coverage.query(key.to_i128());
|
||||
|
||||
match (latest_delta, latest_image) {
|
||||
Self::select_layer(latest_delta, latest_image, end_lsn)
|
||||
}
|
||||
|
||||
fn select_layer(
|
||||
delta_layer: Option<Arc<PersistentLayerDesc>>,
|
||||
image_layer: Option<Arc<PersistentLayerDesc>>,
|
||||
end_lsn: Lsn,
|
||||
) -> Option<SearchResult> {
|
||||
assert!(delta_layer.as_ref().map_or(true, |l| l.is_delta()));
|
||||
assert!(image_layer.as_ref().map_or(true, |l| !l.is_delta()));
|
||||
|
||||
match (delta_layer, image_layer) {
|
||||
(None, None) => None,
|
||||
(None, Some(image)) => {
|
||||
let lsn_floor = image.get_lsn_range().start;
|
||||
@@ -223,6 +447,17 @@ impl LayerMap {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn range_search(&self, key_range: Range<Key>, end_lsn: Lsn) -> Option<RangeSearchResult> {
|
||||
let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?;
|
||||
|
||||
let raw_range = key_range.start.to_i128()..key_range.end.to_i128();
|
||||
let delta_changes = version.delta_coverage.range_overlaps(&raw_range);
|
||||
let image_changes = version.image_coverage.range_overlaps(&raw_range);
|
||||
|
||||
let collector = RangeSearchCollector::new(key_range, end_lsn, delta_changes, image_changes);
|
||||
Some(collector.collect())
|
||||
}
|
||||
|
||||
/// Start a batch of updates, applied on drop
|
||||
pub fn batch_update(&mut self) -> BatchedUpdates<'_> {
|
||||
BatchedUpdates { layer_map: self }
|
||||
@@ -283,15 +518,15 @@ impl LayerMap {
|
||||
///
|
||||
/// This is used for garbage collection, to determine if an old layer can
|
||||
/// be deleted.
|
||||
pub fn image_layer_exists(&self, key: &Range<Key>, lsn: &Range<Lsn>) -> Result<bool> {
|
||||
pub fn image_layer_exists(&self, key: &Range<Key>, lsn: &Range<Lsn>) -> bool {
|
||||
if key.is_empty() {
|
||||
// Vacuously true. There's a newer image for all 0 of the keys in the range.
return Ok(true);
|
||||
return true;
|
||||
}
|
||||
|
||||
let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) {
|
||||
Some(v) => v,
|
||||
None => return Ok(false),
|
||||
None => return false,
|
||||
};
|
||||
|
||||
let start = key.start.to_i128();
|
||||
@@ -304,17 +539,17 @@ impl LayerMap {
|
||||
|
||||
// Check the start is covered
|
||||
if !layer_covers(version.image_coverage.query(start)) {
|
||||
return Ok(false);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check after all changes of coverage
|
||||
for (_, change_val) in version.image_coverage.range(start..end) {
|
||||
if !layer_covers(change_val) {
|
||||
return Ok(false);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(true)
|
||||
true
|
||||
}
|
||||
|
||||
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<PersistentLayerDesc>> {
|
||||
@@ -325,18 +560,14 @@ impl LayerMap {
|
||||
/// Divide the whole given range of keys into sub-ranges based on the latest
|
||||
/// image layer that covers each range at the specified lsn (inclusive).
|
||||
/// This is used when creating new image layers.
|
||||
///
|
||||
// FIXME: clippy complains that the result type is very complex. She's probably
|
||||
// right...
|
||||
#[allow(clippy::type_complexity)]
|
||||
pub fn image_coverage(
|
||||
&self,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
) -> Result<Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)>> {
|
||||
) -> Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)> {
|
||||
let version = match self.historic.get().unwrap().get_version(lsn.0) {
|
||||
Some(v) => v,
|
||||
None => return Ok(vec![]),
|
||||
None => return vec![],
|
||||
};
|
||||
|
||||
let start = key_range.start.to_i128();
|
||||
@@ -359,7 +590,7 @@ impl LayerMap {
|
||||
let kr = Key::from_i128(current_key)..Key::from_i128(end);
|
||||
coverage.push((kr, current_val.take()));
|
||||
|
||||
Ok(coverage)
|
||||
coverage
|
||||
}
|
||||
|
||||
pub fn is_l0(layer: &PersistentLayerDesc) -> bool {
|
||||
@@ -410,24 +641,19 @@ impl LayerMap {
|
||||
/// This number is used to compute the largest number of deltas that
|
||||
/// we'll need to visit for any page reconstruction in this region.
|
||||
/// We use this heuristic to decide whether to create an image layer.
|
||||
pub fn count_deltas(
|
||||
&self,
|
||||
key: &Range<Key>,
|
||||
lsn: &Range<Lsn>,
|
||||
limit: Option<usize>,
|
||||
) -> Result<usize> {
|
||||
pub fn count_deltas(&self, key: &Range<Key>, lsn: &Range<Lsn>, limit: Option<usize>) -> usize {
|
||||
// We get the delta coverage of the region, and for each part of the coverage
|
||||
// we recurse right underneath the delta. The recursion depth is limited by
|
||||
// the largest result this function could return, which is in practice between
|
||||
// 3 and 10 (since we usually try to create an image when the number gets larger).
|
||||
|
||||
if lsn.is_empty() || key.is_empty() || limit == Some(0) {
|
||||
return Ok(0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) {
|
||||
Some(v) => v,
|
||||
None => return Ok(0),
|
||||
None => return 0,
|
||||
};
|
||||
|
||||
let start = key.start.to_i128();
|
||||
@@ -448,8 +674,7 @@ impl LayerMap {
|
||||
if !kr.is_empty() {
|
||||
let base_count = Self::is_reimage_worthy(&val, key) as usize;
|
||||
let new_limit = limit.map(|l| l - base_count);
|
||||
let max_stacked_deltas_underneath =
|
||||
self.count_deltas(&kr, &lr, new_limit)?;
|
||||
let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit);
|
||||
max_stacked_deltas = std::cmp::max(
|
||||
max_stacked_deltas,
|
||||
base_count + max_stacked_deltas_underneath,
|
||||
@@ -471,7 +696,7 @@ impl LayerMap {
|
||||
if !kr.is_empty() {
|
||||
let base_count = Self::is_reimage_worthy(&val, key) as usize;
|
||||
let new_limit = limit.map(|l| l - base_count);
|
||||
let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?;
|
||||
let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit);
|
||||
max_stacked_deltas = std::cmp::max(
|
||||
max_stacked_deltas,
|
||||
base_count + max_stacked_deltas_underneath,
|
||||
@@ -480,7 +705,7 @@ impl LayerMap {
|
||||
}
|
||||
}
|
||||
|
||||
Ok(max_stacked_deltas)
|
||||
max_stacked_deltas
|
||||
}
|
||||
|
||||
/// Count how many reimage-worthy layers we need to visit for given key-lsn pair.
|
||||
@@ -592,10 +817,7 @@ impl LayerMap {
|
||||
if limit == Some(difficulty) {
|
||||
break;
|
||||
}
|
||||
for (img_range, last_img) in self
|
||||
.image_coverage(range, lsn)
|
||||
.expect("why would this err?")
|
||||
{
|
||||
for (img_range, last_img) in self.image_coverage(range, lsn) {
|
||||
if limit == Some(difficulty) {
|
||||
break;
|
||||
}
|
||||
@@ -606,9 +828,7 @@ impl LayerMap {
|
||||
};
|
||||
|
||||
if img_lsn < lsn {
|
||||
let num_deltas = self
|
||||
.count_deltas(&img_range, &(img_lsn..lsn), limit)
|
||||
.expect("why would this err lol?");
|
||||
let num_deltas = self.count_deltas(&img_range, &(img_lsn..lsn), limit);
|
||||
difficulty = std::cmp::max(difficulty, num_deltas);
|
||||
}
|
||||
}
|
||||
@@ -646,3 +866,126 @@ impl LayerMap {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[derive(Clone)]
|
||||
struct LayerDesc {
|
||||
key_range: Range<Key>,
|
||||
lsn_range: Range<Lsn>,
|
||||
is_delta: bool,
|
||||
}
|
||||
|
||||
fn create_layer_map(layers: Vec<LayerDesc>) -> LayerMap {
|
||||
let mut layer_map = LayerMap::default();
|
||||
|
||||
for layer in layers {
|
||||
layer_map.insert_historic_noflush(PersistentLayerDesc::new_test(
|
||||
layer.key_range,
|
||||
layer.lsn_range,
|
||||
layer.is_delta,
|
||||
));
|
||||
}
|
||||
|
||||
layer_map.flush_updates();
|
||||
layer_map
|
||||
}
|
||||
|
||||
fn assert_range_search_result_eq(lhs: RangeSearchResult, rhs: RangeSearchResult) {
|
||||
assert_eq!(lhs.not_found.to_keyspace(), rhs.not_found.to_keyspace());
|
||||
let lhs: Vec<_> = lhs
|
||||
.found
|
||||
.into_iter()
|
||||
.map(|(search_result, accum)| (search_result.0, accum.to_keyspace()))
|
||||
.collect();
|
||||
let rhs: Vec<_> = rhs
|
||||
.found
|
||||
.into_iter()
|
||||
.map(|(search_result, accum)| (search_result.0, accum.to_keyspace()))
|
||||
.collect();
|
||||
|
||||
assert_eq!(lhs, rhs);
|
||||
}
|
||||
|
||||
fn brute_force_range_search(
|
||||
layer_map: &LayerMap,
|
||||
key_range: Range<Key>,
|
||||
end_lsn: Lsn,
|
||||
) -> RangeSearchResult {
|
||||
let mut range_search_result = RangeSearchResult::new();
|
||||
|
||||
let mut key = key_range.start;
|
||||
while key != key_range.end {
|
||||
let res = layer_map.search(key, end_lsn);
|
||||
match res {
|
||||
Some(res) => {
|
||||
range_search_result
|
||||
.found
|
||||
.entry(OrderedSearchResult(res))
|
||||
.or_default()
|
||||
.add_key(key);
|
||||
}
|
||||
None => {
|
||||
range_search_result.not_found.add_key(key);
|
||||
}
|
||||
}
|
||||
|
||||
key = key.next();
|
||||
}
|
||||
|
||||
range_search_result
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ranged_search_on_empty_layer_map() {
|
||||
let layer_map = LayerMap::default();
|
||||
let range = Key::from_i128(100)..Key::from_i128(200);
|
||||
|
||||
let res = layer_map.range_search(range, Lsn(100));
|
||||
assert!(res.is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ranged_search() {
|
||||
let layers = vec![
|
||||
LayerDesc {
|
||||
key_range: Key::from_i128(15)..Key::from_i128(50),
|
||||
lsn_range: Lsn(0)..Lsn(5),
|
||||
is_delta: false,
|
||||
},
|
||||
LayerDesc {
|
||||
key_range: Key::from_i128(10)..Key::from_i128(20),
|
||||
lsn_range: Lsn(5)..Lsn(20),
|
||||
is_delta: true,
|
||||
},
|
||||
LayerDesc {
|
||||
key_range: Key::from_i128(15)..Key::from_i128(25),
|
||||
lsn_range: Lsn(20)..Lsn(30),
|
||||
is_delta: true,
|
||||
},
|
||||
LayerDesc {
|
||||
key_range: Key::from_i128(35)..Key::from_i128(40),
|
||||
lsn_range: Lsn(25)..Lsn(35),
|
||||
is_delta: true,
|
||||
},
|
||||
LayerDesc {
|
||||
key_range: Key::from_i128(35)..Key::from_i128(40),
|
||||
lsn_range: Lsn(35)..Lsn(40),
|
||||
is_delta: false,
|
||||
},
|
||||
];
|
||||
|
||||
let layer_map = create_layer_map(layers.clone());
|
||||
for start in 0..60 {
|
||||
for end in (start + 1)..60 {
|
||||
let range = Key::from_i128(start)..Key::from_i128(end);
|
||||
let result = layer_map.range_search(range.clone(), Lsn(100)).unwrap();
|
||||
let expected = brute_force_range_search(&layer_map, range, Lsn(100));
|
||||
|
||||
assert_range_search_result_eq(result, expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -129,6 +129,42 @@ impl<Value: Clone> LayerCoverage<Value> {
|
||||
.map(|(k, v)| (*k, v.as_ref().map(|x| x.1.clone())))
|
||||
}
|
||||
|
||||
/// Returns an iterator which includes all coverage changes for layers that intersect
|
||||
/// with the provided range.
|
||||
pub fn range_overlaps(
|
||||
&self,
|
||||
key_range: &Range<i128>,
|
||||
) -> impl Iterator<Item = (i128, Option<Value>)> + '_
|
||||
where
|
||||
Value: Eq,
|
||||
{
|
||||
let first_change = self.query(key_range.start);
|
||||
match first_change {
|
||||
Some(change) => {
|
||||
// If the start of the range is covered, we have to deal with two cases:
|
||||
// 1. Start of the range is aligned with the start of a layer.
|
||||
// In this case the return of `self.range` will contain the layer which aligns with the start of the key range.
|
||||
// We advance said iterator to avoid duplicating the first change.
|
||||
// 2. Start of the range is not aligned with the start of a layer.
|
||||
let range = key_range.start..key_range.end;
|
||||
let mut range_coverage = self.range(range).peekable();
|
||||
if range_coverage
|
||||
.peek()
|
||||
.is_some_and(|c| c.1.as_ref() == Some(&change))
|
||||
{
|
||||
range_coverage.next();
|
||||
}
|
||||
itertools::Either::Left(
|
||||
std::iter::once((key_range.start, Some(change))).chain(range_coverage),
|
||||
)
|
||||
}
|
||||
None => {
|
||||
let range = key_range.start..key_range.end;
|
||||
let coverage = self.range(range);
|
||||
itertools::Either::Right(coverage)
|
||||
}
|
||||
}
|
||||
}
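The subtle part of range_overlaps is the first element: the change already in effect at the start of the range is emitted, unless the first in-range change coincides with it. A sketch of that lookup with a plain std BTreeMap standing in for the persistent coverage map (names and types simplified):

use std::collections::BTreeMap;
use std::ops::Range;

// Coverage: key -> layer in effect starting at that key (None = gap). A stand-in for
// the structure above; the BTreeMap plays the role of LayerCoverage::query / ::range.
fn range_overlaps(
    coverage: &BTreeMap<i128, Option<&'static str>>,
    key_range: Range<i128>,
) -> Vec<(i128, Option<&'static str>)> {
    // The change in effect at key_range.start, if any (last change at or before it).
    let first = coverage
        .range(..=key_range.start)
        .next_back()
        .and_then(|(_, v)| *v);
    let mut out = Vec::new();
    if first.is_some() {
        out.push((key_range.start, first));
    }
    for (k, v) in coverage.range(key_range.clone()) {
        // Skip a change exactly at the start if it was already reported above.
        if *k == key_range.start && first.is_some() && *v == first {
            continue;
        }
        out.push((*k, *v));
    }
    out
}

fn main() {
    let mut cov = BTreeMap::new();
    cov.insert(10, Some("layer-a"));
    cov.insert(30, None);
    // Query starts mid-layer: the change in effect at 15 is reported first.
    let expected: Vec<(i128, Option<&'static str>)> = vec![(15, Some("layer-a")), (30, None)];
    assert_eq!(range_overlaps(&cov, 15..40), expected);
    // Query aligned with a layer start: no duplicate entry for key 10.
    let expected: Vec<(i128, Option<&'static str>)> = vec![(10, Some("layer-a")), (30, None)];
    assert_eq!(range_overlaps(&cov, 10..40), expected);
}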
/// O(1) clone
|
||||
pub fn clone(&self) -> Self {
|
||||
Self {
|
||||
|
||||
@@ -7,6 +7,7 @@ use pageserver_api::models::ShardParameters;
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use std::borrow::Cow;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::ops::Deref;
|
||||
use std::sync::Arc;
|
||||
@@ -32,7 +33,8 @@ use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::config::{
|
||||
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, TenantConfOpt,
|
||||
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
|
||||
TenantConfOpt,
|
||||
};
|
||||
use crate::tenant::delete::DeleteTenantFlow;
|
||||
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
|
||||
@@ -466,6 +468,26 @@ pub async fn init_tenant_mgr(
|
||||
// We have a generation map: treat it as the authority for whether
|
||||
// this tenant is really attached.
|
||||
if let Some(gen) = generations.get(&tenant_shard_id) {
|
||||
if let LocationMode::Attached(attached) = &location_conf.mode {
|
||||
if attached.generation > *gen {
|
||||
tracing::error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
|
||||
"Control plane gave decreasing generation ({gen:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary",
|
||||
attached.generation
|
||||
);
|
||||
|
||||
// We cannot safely attach this tenant given a bogus generation number, but let's avoid throwing away
|
||||
// local disk content: demote to secondary rather than detaching.
|
||||
tenants.insert(
|
||||
tenant_shard_id,
|
||||
TenantSlot::Secondary(SecondaryTenant::new(
|
||||
tenant_shard_id,
|
||||
location_conf.shard,
|
||||
location_conf.tenant_conf,
|
||||
&SecondaryLocationConfig { warm: false },
|
||||
)),
|
||||
);
|
||||
}
|
||||
}
|
||||
*gen
|
||||
} else {
|
||||
match &location_conf.mode {
|
||||
@@ -721,7 +743,7 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
|
||||
tokio::select! {
|
||||
Some(joined) = join_set.join_next() => {
|
||||
match joined {
|
||||
Ok(()) => {}
|
||||
Ok(()) => {},
|
||||
Err(join_error) if join_error.is_cancelled() => {
|
||||
unreachable!("we are not cancelling any of the tasks");
|
||||
}
|
||||
@@ -876,13 +898,24 @@ impl TenantManager {
|
||||
}
|
||||
}
|
||||
|
||||
/// Whether the `TenantManager` is responsible for the tenant shard
|
||||
pub(crate) fn manages_tenant_shard(&self, tenant_shard_id: TenantShardId) -> bool {
|
||||
let locked = self.tenants.read().unwrap();
|
||||
|
||||
let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
|
||||
.ok()
|
||||
.flatten();
|
||||
|
||||
peek_slot.is_some()
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
|
||||
pub(crate) async fn upsert_location(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
new_location_config: LocationConf,
|
||||
flush: Option<Duration>,
|
||||
spawn_mode: SpawnMode,
|
||||
mut spawn_mode: SpawnMode,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Option<Arc<Tenant>>, UpsertLocationError> {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
@@ -902,19 +935,29 @@ impl TenantManager {
|
||||
tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?;
|
||||
match (&new_location_config.mode, peek_slot) {
|
||||
(LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
|
||||
if attach_conf.generation == tenant.generation {
|
||||
// A transition from Attached to Attached in the same generation, we may
|
||||
// take our fast path and just provide the updated configuration
|
||||
// to the tenant.
|
||||
tenant.set_new_location_config(
|
||||
AttachedTenantConf::try_from(new_location_config.clone())
|
||||
.map_err(UpsertLocationError::BadRequest)?,
|
||||
);
|
||||
match attach_conf.generation.cmp(&tenant.generation) {
|
||||
Ordering::Equal => {
|
||||
// A transition from Attached to Attached in the same generation, we may
|
||||
// take our fast path and just provide the updated configuration
|
||||
// to the tenant.
|
||||
tenant.set_new_location_config(
|
||||
AttachedTenantConf::try_from(new_location_config.clone())
|
||||
.map_err(UpsertLocationError::BadRequest)?,
|
||||
);
|
||||
|
||||
Some(FastPathModified::Attached(tenant.clone()))
|
||||
} else {
|
||||
// Different generations, fall through to general case
|
||||
None
|
||||
Some(FastPathModified::Attached(tenant.clone()))
|
||||
}
|
||||
Ordering::Less => {
|
||||
return Err(UpsertLocationError::BadRequest(anyhow::anyhow!(
|
||||
"Generation {:?} is less than existing {:?}",
|
||||
attach_conf.generation,
|
||||
tenant.generation
|
||||
)));
|
||||
}
|
||||
Ordering::Greater => {
|
||||
// Generation advanced, fall through to general case of replacing `Tenant` object
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
(
|
||||
@@ -1019,6 +1062,12 @@ impl TenantManager {
|
||||
}
|
||||
}
|
||||
slot_guard.drop_old_value().expect("We just shut it down");
|
||||
|
||||
// Edge case: if we were called with SpawnMode::Create, but a Tenant already existed, then
|
||||
// the caller thinks they're creating but the tenant already existed. We must switch to
|
||||
// Normal mode so that when starting this Tenant we properly probe remote storage for timelines,
|
||||
// rather than assuming it to be empty.
|
||||
spawn_mode = SpawnMode::Normal;
|
||||
}
|
||||
Some(TenantSlot::Secondary(state)) => {
|
||||
info!("Shutting down secondary tenant");
|
||||
@@ -1102,14 +1151,46 @@ impl TenantManager {
|
||||
None
|
||||
};
|
||||
|
||||
slot_guard.upsert(new_slot).map_err(|e| match e {
|
||||
TenantSlotUpsertError::InternalError(e) => {
|
||||
UpsertLocationError::Other(anyhow::anyhow!(e))
|
||||
match slot_guard.upsert(new_slot) {
|
||||
Err(TenantSlotUpsertError::InternalError(e)) => {
|
||||
Err(UpsertLocationError::Other(anyhow::anyhow!(e)))
|
||||
}
|
||||
TenantSlotUpsertError::MapState(e) => UpsertLocationError::Unavailable(e),
|
||||
})?;
|
||||
Err(TenantSlotUpsertError::MapState(e)) => Err(UpsertLocationError::Unavailable(e)),
|
||||
Err(TenantSlotUpsertError::ShuttingDown((new_slot, _completion))) => {
|
||||
// If we just called tenant_spawn() on a new tenant, and can't insert it into our map, then
|
||||
// we must not leak it: this would violate the invariant that after shutdown_all_tenants, all tenants
|
||||
// are shutdown.
|
||||
//
|
||||
// We must shut it down inline here.
|
||||
match new_slot {
|
||||
TenantSlot::InProgress(_) => {
|
||||
// Unreachable because we never insert an InProgress
|
||||
unreachable!()
|
||||
}
|
||||
TenantSlot::Attached(tenant) => {
|
||||
let (_guard, progress) = utils::completion::channel();
|
||||
info!("Shutting down just-spawned tenant, because tenant manager is shut down");
|
||||
match tenant.shutdown(progress, false).await {
|
||||
Ok(()) => {
|
||||
info!("Finished shutting down just-spawned tenant");
|
||||
}
|
||||
Err(barrier) => {
|
||||
info!("Shutdown already in progress, waiting for it to complete");
|
||||
barrier.wait().await;
|
||||
}
|
||||
}
|
||||
}
|
||||
TenantSlot::Secondary(secondary_tenant) => {
|
||||
secondary_tenant.shutdown().await;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(attached_tenant)
|
||||
Err(UpsertLocationError::Unavailable(
|
||||
TenantMapError::ShuttingDown,
|
||||
))
|
||||
}
|
||||
Ok(()) => Ok(attached_tenant),
|
||||
}
|
||||
}
|
||||
|
||||
/// Resetting a tenant is equivalent to detaching it, then attaching it again with the same
|
||||
@@ -1241,6 +1322,7 @@ impl TenantManager {
|
||||
tenant_shard_id: TenantShardId,
|
||||
activation_timeout: Duration,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
super::span::debug_assert_current_span_has_tenant_id();
|
||||
// We acquire a SlotGuard during this function to protect against concurrent
|
||||
// changes while the ::prepare phase of DeleteTenantFlow executes, but then
|
||||
// have to return the Tenant to the map while the background deletion runs.
|
||||
@@ -1728,14 +1810,31 @@ pub(crate) enum TenantSlotError {
|
||||
|
||||
/// Superset of TenantMapError: issues that can occur when using a SlotGuard
|
||||
/// to insert a new value.
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum TenantSlotUpsertError {
|
||||
#[derive(thiserror::Error)]
|
||||
pub(crate) enum TenantSlotUpsertError {
|
||||
/// An error where the slot is in an unexpected state, indicating a code bug
|
||||
#[error("Internal error updating Tenant")]
|
||||
InternalError(Cow<'static, str>),
|
||||
|
||||
#[error(transparent)]
|
||||
MapState(#[from] TenantMapError),
|
||||
MapState(TenantMapError),
|
||||
|
||||
// If we encounter TenantManager shutdown during upsert, we must carry the Completion
|
||||
// from the SlotGuard, so that the caller can hold it while they clean up: otherwise
|
||||
// TenantManager shutdown might race ahead before we're done cleaning up any Tenant that
|
||||
// was protected by the SlotGuard.
|
||||
#[error("Shutting down")]
|
||||
ShuttingDown((TenantSlot, utils::completion::Completion)),
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for TenantSlotUpsertError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::InternalError(reason) => write!(f, "Internal Error {reason}"),
|
||||
Self::MapState(map_error) => write!(f, "Tenant map state: {map_error:?}"),
|
||||
Self::ShuttingDown(_completion) => write!(f, "Tenant map shutting down"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
@@ -1784,7 +1883,7 @@ pub struct SlotGuard {
|
||||
|
||||
/// [`TenantSlot::InProgress`] carries the corresponding Barrier: it will
|
||||
/// release any waiters as soon as this SlotGuard is dropped.
|
||||
_completion: utils::completion::Completion,
|
||||
completion: utils::completion::Completion,
|
||||
}
|
||||
|
||||
impl SlotGuard {
|
||||
@@ -1797,7 +1896,7 @@ impl SlotGuard {
|
||||
tenant_shard_id,
|
||||
old_value,
|
||||
upserted: false,
|
||||
_completion: completion,
|
||||
completion,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1830,9 +1929,16 @@ impl SlotGuard {
|
||||
}
|
||||
|
||||
let m = match &mut *locked {
|
||||
TenantsMap::Initializing => return Err(TenantMapError::StillInitializing.into()),
|
||||
TenantsMap::Initializing => {
|
||||
return Err(TenantSlotUpsertError::MapState(
|
||||
TenantMapError::StillInitializing,
|
||||
))
|
||||
}
|
||||
TenantsMap::ShuttingDown(_) => {
|
||||
return Err(TenantMapError::ShuttingDown.into());
|
||||
return Err(TenantSlotUpsertError::ShuttingDown((
|
||||
new_value,
|
||||
self.completion.clone(),
|
||||
)));
|
||||
}
|
||||
TenantsMap::Open(m) => m,
|
||||
};
|
||||
@@ -1880,7 +1986,9 @@ impl SlotGuard {
|
||||
Err(TenantSlotUpsertError::InternalError(_)) => {
|
||||
// We already logged the error, nothing else we can do.
|
||||
}
|
||||
Err(TenantSlotUpsertError::MapState(_)) => {
|
||||
Err(
|
||||
TenantSlotUpsertError::MapState(_) | TenantSlotUpsertError::ShuttingDown(_),
|
||||
) => {
|
||||
// If the map is shutting down, we need not replace anything
|
||||
}
|
||||
Ok(()) => {}
|
||||
@@ -1978,18 +2086,22 @@ fn tenant_map_peek_slot<'a>(
|
||||
tenant_shard_id: &TenantShardId,
|
||||
mode: TenantSlotPeekMode,
|
||||
) -> Result<Option<&'a TenantSlot>, TenantMapError> {
|
||||
let m = match tenants.deref() {
|
||||
TenantsMap::Initializing => return Err(TenantMapError::StillInitializing),
|
||||
match tenants.deref() {
|
||||
TenantsMap::Initializing => Err(TenantMapError::StillInitializing),
|
||||
TenantsMap::ShuttingDown(m) => match mode {
|
||||
TenantSlotPeekMode::Read => m,
|
||||
TenantSlotPeekMode::Write => {
|
||||
return Err(TenantMapError::ShuttingDown);
|
||||
}
|
||||
TenantSlotPeekMode::Read => Ok(Some(
|
||||
// When reading in ShuttingDown state, we must translate None results
|
||||
// into a ShuttingDown error, because absence of a tenant shard ID in the map
|
||||
// isn't a reliable indicator of the tenant being gone: it might have been
|
||||
// InProgress when shutdown started, and cleaned up from that state such
|
||||
// that it's now no longer in the map. Callers will have to wait until
|
||||
// we next start up to get a proper answer. This avoids incorrect 404 API responses.
|
||||
m.get(tenant_shard_id).ok_or(TenantMapError::ShuttingDown)?,
|
||||
)),
|
||||
TenantSlotPeekMode::Write => Err(TenantMapError::ShuttingDown),
|
||||
},
|
||||
TenantsMap::Open(m) => m,
|
||||
};
|
||||
|
||||
Ok(m.get(tenant_shard_id))
|
||||
TenantsMap::Open(m) => Ok(m.get(tenant_shard_id)),
|
||||
}
|
||||
}
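The Read-mode branch above encodes a subtle rule: while the map is shutting down, a missing entry must surface as a ShuttingDown error rather than a plain "not found", so callers do not turn a transient state into a 404. A compact sketch of that lookup semantics with plain std types and invented names:

use std::collections::HashMap;

#[derive(Debug, PartialEq)]
enum MapError {
    StillInitializing,
    ShuttingDown,
}

enum MapState {
    Initializing,
    Open(HashMap<u32, &'static str>),
    ShuttingDown(HashMap<u32, &'static str>),
}

// Read-only peek mirroring tenant_map_peek_slot in Read mode: during shutdown a miss
// is reported as ShuttingDown, never as a plain None, to avoid bogus 404s.
fn peek(state: &MapState, id: u32) -> Result<Option<&'static str>, MapError> {
    match state {
        MapState::Initializing => Err(MapError::StillInitializing),
        MapState::ShuttingDown(m) => Ok(Some(m.get(&id).copied().ok_or(MapError::ShuttingDown)?)),
        MapState::Open(m) => Ok(m.get(&id).copied()),
    }
}

fn main() {
    let mut m = HashMap::new();
    m.insert(1, "tenant-a");

    assert_eq!(peek(&MapState::Open(m.clone()), 2), Ok(None)); // genuinely absent
    assert_eq!(
        peek(&MapState::ShuttingDown(m), 2),
        Err(MapError::ShuttingDown) // absent during shutdown is not a 404
    );
}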
enum TenantSlotAcquireMode {
|
||||
|
||||
@@ -257,6 +257,8 @@ pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
|
||||
|
||||
pub(crate) const INITDB_PATH: &str = "initdb.tar.zst";
|
||||
|
||||
pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst";
|
||||
|
||||
/// Default buffer size when interfacing with [`tokio::fs::File`].
|
||||
pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
|
||||
|
||||
@@ -1066,6 +1068,28 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn preserve_initdb_archive(
|
||||
self: &Arc<Self>,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
backoff::retry(
|
||||
|| async {
|
||||
upload::preserve_initdb_archive(&self.storage_impl, tenant_id, timeline_id, cancel)
|
||||
.await
|
||||
},
|
||||
|_e| false,
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"preserve_initdb_tar_zst",
|
||||
backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled!")),
|
||||
)
|
||||
.await
|
||||
.context("backing up initdb archive")?;
|
||||
Ok(())
|
||||
}
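preserve_initdb_archive wraps a single copy in the usual retry shape: an operation closure, a predicate for permanent errors, warn/attempt thresholds, and a cancellation hook. The hand-rolled, synchronous sketch below shows that shape only; it is not the real utils::backoff API and the identifiers are invented:

use std::{thread, time::Duration};

// Hand-rolled sketch of the retry-with-backoff shape used above (operation closure,
// permanent-error predicate, warn threshold, attempt cap). Not the real utils::backoff.
fn retry<T, E: std::fmt::Debug>(
    mut op: impl FnMut() -> Result<T, E>,
    is_permanent: impl Fn(&E) -> bool,
    warn_after: u32,
    max_attempts: u32,
    what: &str,
) -> Result<T, E> {
    let mut attempt = 0;
    loop {
        match op() {
            Ok(v) => return Ok(v),
            Err(e) if is_permanent(&e) || attempt + 1 >= max_attempts => return Err(e),
            Err(e) => {
                attempt += 1;
                if attempt >= warn_after {
                    eprintln!("{what}: attempt {attempt} failed: {e:?}, retrying");
                }
                // Exponential backoff, capped; the real helper also honors cancellation.
                thread::sleep(Duration::from_millis(10 * 2u64.pow(attempt.min(6))));
            }
        }
    }
}

fn main() {
    let mut remaining_failures = 2;
    let res: Result<&str, &str> = retry(
        || {
            if remaining_failures > 0 {
                remaining_failures -= 1;
                Err("transient")
            } else {
                Ok("copied initdb archive")
            }
        },
        |e| *e == "permanent",
        3,
        10,
        "preserve_initdb_tar_zst",
    );
    assert_eq!(res, Ok("copied initdb archive"));
}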
/// Prerequisites: the UploadQueue should be in the stopped state and deleted_at should be successfully set.
/// The function deletes layer files one by one, then lists the prefix to see if we leaked anything,
/// deletes any leaked files, and finally deletes the index file.
|
||||
@@ -1101,6 +1125,14 @@ impl RemoteTimelineClient {
|
||||
let layer_deletion_count = layers.len();
|
||||
self.deletion_queue_client.push_immediate(layers).await?;
|
||||
|
||||
// Delete the initdb.tar.zst, which is not always present; deletion attempts of
// nonexistent objects are not considered errors.
|
||||
let initdb_path =
|
||||
remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &self.timeline_id);
|
||||
self.deletion_queue_client
|
||||
.push_immediate(vec![initdb_path])
|
||||
.await?;
|
||||
|
||||
// Do not delete the index part yet; it is needed for a possible retry. If we removed it first
// and the retry arrived at a different pageserver, there would be no trace of it in remote storage.
|
||||
let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id);
|
||||
@@ -1148,10 +1180,8 @@ impl RemoteTimelineClient {
|
||||
if p == &latest_index {
|
||||
return false;
|
||||
}
|
||||
if let Some(name) = p.object_name() {
|
||||
if name == INITDB_PATH {
|
||||
return false;
|
||||
}
|
||||
if p.object_name() == Some(INITDB_PRESERVED_PATH) {
|
||||
return false;
|
||||
}
|
||||
true
|
||||
})
|
||||
@@ -1689,6 +1719,11 @@ pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
|
||||
RemotePath::from_string(&path).expect("Failed to construct path")
|
||||
}
|
||||
|
||||
fn remote_timelines_path_unsharded(tenant_id: &TenantId) -> RemotePath {
|
||||
let path = format!("tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}");
|
||||
RemotePath::from_string(&path).expect("Failed to construct path")
|
||||
}
|
||||
|
||||
pub fn remote_timeline_path(
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
@@ -1724,6 +1759,16 @@ pub fn remote_initdb_archive_path(tenant_id: &TenantId, timeline_id: &TimelineId
|
||||
.expect("Failed to construct path")
|
||||
}
|
||||
|
||||
pub fn remote_initdb_preserved_archive_path(
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
) -> RemotePath {
|
||||
RemotePath::from_string(&format!(
|
||||
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{INITDB_PRESERVED_PATH}"
|
||||
))
|
||||
.expect("Failed to construct path")
|
||||
}
|
||||
|
||||
pub fn remote_index_path(
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
|
||||
@@ -32,7 +32,8 @@ use utils::id::TimelineId;
|
||||
use super::index::{IndexPart, LayerFileMetadata};
|
||||
use super::{
|
||||
parse_remote_index_path, remote_index_path, remote_initdb_archive_path,
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
|
||||
remote_initdb_preserved_archive_path, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES,
|
||||
INITDB_PATH,
|
||||
};
|
||||
|
||||
///
|
||||
@@ -430,6 +431,9 @@ pub(crate) async fn download_initdb_tar_zst(
|
||||
|
||||
let remote_path = remote_initdb_archive_path(&tenant_shard_id.tenant_id, timeline_id);
|
||||
|
||||
let remote_preserved_path =
|
||||
remote_initdb_preserved_archive_path(&tenant_shard_id.tenant_id, timeline_id);
|
||||
|
||||
let timeline_path = conf.timelines_path(tenant_shard_id);
|
||||
|
||||
if !timeline_path.exists() {
|
||||
@@ -456,8 +460,16 @@ pub(crate) async fn download_initdb_tar_zst(
|
||||
.with_context(|| format!("tempfile creation {temp_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let download =
|
||||
download_cancellable(&cancel_inner, storage.download(&remote_path)).await?;
|
||||
let download = match download_cancellable(&cancel_inner, storage.download(&remote_path))
|
||||
.await
|
||||
{
|
||||
Ok(dl) => dl,
|
||||
Err(DownloadError::NotFound) => {
|
||||
download_cancellable(&cancel_inner, storage.download(&remote_preserved_path))
|
||||
.await?
|
||||
}
|
||||
Err(other) => Err(other)?,
|
||||
};
|
||||
let mut download = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file);
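The download above tries the primary initdb object first and falls back to the preserved copy only on NotFound, propagating every other error unchanged. A synchronous sketch of that control flow with invented types (not the real remote_storage API):

#[derive(Debug, PartialEq)]
enum DownloadError {
    NotFound,
    Other(String),
}

// Stand-in for a remote download; in the real code this is storage.download(&path).
fn download(path: &str) -> Result<Vec<u8>, DownloadError> {
    match path {
        "initdb-preserved.tar.zst" => Ok(b"preserved archive".to_vec()),
        _ => Err(DownloadError::NotFound),
    }
}

// Try the primary object first; fall back to the preserved copy only when the primary
// is missing, and propagate every other error unchanged -- same shape as above.
fn download_with_fallback(primary: &str, preserved: &str) -> Result<Vec<u8>, DownloadError> {
    match download(primary) {
        Ok(body) => Ok(body),
        Err(DownloadError::NotFound) => download(preserved),
        Err(other) => Err(other),
    }
}

fn main() {
    let body = download_with_fallback("initdb.tar.zst", "initdb-preserved.tar.zst").unwrap();
    assert_eq!(body, b"preserved archive".to_vec());
}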
@@ -5,19 +5,21 @@ use camino::Utf8Path;
|
||||
use fail::fail_point;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::io::{ErrorKind, SeekFrom};
|
||||
use std::time::SystemTime;
|
||||
use tokio::fs::{self, File};
|
||||
use tokio::io::AsyncSeekExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::backoff;
|
||||
|
||||
use super::Generation;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
tenant::remote_timeline_client::{
|
||||
index::IndexPart, remote_index_path, remote_initdb_archive_path, remote_path,
|
||||
upload_cancellable,
|
||||
index::IndexPart, remote_index_path, remote_initdb_archive_path,
|
||||
remote_initdb_preserved_archive_path, remote_path, upload_cancellable,
|
||||
},
|
||||
};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::{GenericRemoteStorage, TimeTravelError};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::index::LayerFileMetadata;
|
||||
@@ -144,3 +146,58 @@ pub(crate) async fn upload_initdb_dir(
|
||||
.await
|
||||
.with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
|
||||
}
|
||||
|
||||
pub(crate) async fn preserve_initdb_archive(
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let source_path = remote_initdb_archive_path(tenant_id, timeline_id);
|
||||
let dest_path = remote_initdb_preserved_archive_path(tenant_id, timeline_id);
|
||||
upload_cancellable(cancel, storage.copy_object(&source_path, &dest_path))
|
||||
.await
|
||||
.with_context(|| format!("backing up initdb archive for '{tenant_id} / {timeline_id}'"))
|
||||
}
|
||||
|
||||
pub(crate) async fn time_travel_recover_tenant(
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timestamp: SystemTime,
|
||||
done_if_after: SystemTime,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), TimeTravelError> {
|
||||
let warn_after = 3;
|
||||
let max_attempts = 10;
|
||||
let mut prefixes = Vec::with_capacity(2);
|
||||
if tenant_shard_id.is_zero() {
|
||||
// Also recover the unsharded prefix for a shard of zero:
|
||||
// - if the tenant is totally unsharded, the unsharded prefix contains all the data
|
||||
// - if the tenant is sharded, we still want to recover the initdb data, but we only
|
||||
// want to do it once, so let's do it on the 0 shard
|
||||
let timelines_path_unsharded =
|
||||
super::remote_timelines_path_unsharded(&tenant_shard_id.tenant_id);
|
||||
prefixes.push(timelines_path_unsharded);
|
||||
}
|
||||
if !tenant_shard_id.is_unsharded() {
|
||||
// If the tenant is sharded, we need to recover the sharded prefix
|
||||
let timelines_path = super::remote_timelines_path(tenant_shard_id);
|
||||
prefixes.push(timelines_path);
|
||||
}
|
||||
for prefix in &prefixes {
|
||||
backoff::retry(
|
||||
|| async {
|
||||
storage
|
||||
.time_travel_recover(Some(prefix), timestamp, done_if_after, cancel.clone())
|
||||
.await
|
||||
},
|
||||
|e| !matches!(e, TimeTravelError::Other(_)),
|
||||
warn_after,
|
||||
max_attempts,
|
||||
"time travel recovery of tenant prefix",
|
||||
backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Ok(())
|
||||
}
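The prefix selection above is the part worth double-checking: shard zero also recovers the legacy unsharded prefix (so initdb data is recovered exactly once), and sharded tenants recover their shard-specific prefix. A tiny sketch of just that decision, with booleans standing in for TenantShardId::is_zero / is_unsharded and invented path strings:

// Which remote prefixes a shard should time-travel-recover, mirroring the branch above.
// `is_zero` / `is_unsharded` stand in for TenantShardId::is_zero / ::is_unsharded.
fn prefixes_to_recover(is_zero: bool, is_unsharded: bool) -> Vec<&'static str> {
    let mut prefixes = Vec::with_capacity(2);
    if is_zero {
        // Shard zero (or the only shard) also covers the legacy unsharded prefix,
        // so initdb data is recovered exactly once.
        prefixes.push("tenants/<tenant_id>/timelines");
    }
    if !is_unsharded {
        // Sharded tenants additionally recover their shard-specific prefix.
        prefixes.push("tenants/<tenant_id>-<shard>/timelines");
    }
    prefixes
}

fn main() {
    // Unsharded tenant: only the unsharded prefix.
    assert_eq!(prefixes_to_recover(true, true).len(), 1);
    // Shard 0 of a sharded tenant: both prefixes.
    assert_eq!(prefixes_to_recover(true, false).len(), 2);
    // Any other shard: only its own prefix.
    assert_eq!(prefixes_to_recover(false, false).len(), 1);
}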
@@ -112,7 +112,7 @@ impl SecondaryTenant {
|
||||
// on shutdown we walk the tenants and fire their
|
||||
// individual cancellations?
|
||||
cancel: CancellationToken::new(),
|
||||
gate: Gate::new(format!("SecondaryTenant {tenant_shard_id}")),
|
||||
gate: Gate::default(),
|
||||
|
||||
shard_identity,
|
||||
tenant_conf: std::sync::Mutex::new(tenant_conf),
|
||||
|
||||
@@ -36,7 +36,7 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::Timeline;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use crate::{walrecord, TEMP_FILE_SUFFIX};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
@@ -649,7 +649,7 @@ impl DeltaLayer {
|
||||
{
|
||||
let file = VirtualFile::open_with_options(
|
||||
path,
|
||||
&*std::fs::OpenOptions::new().read(true).write(true),
|
||||
virtual_file::OpenOptions::new().read(true).write(true),
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open file '{}'", path))?;
|
||||
@@ -884,7 +884,7 @@ impl DeltaLayerInner {
|
||||
|
||||
let keys = self.load_keys(ctx).await?;
|
||||
|
||||
async fn dump_blob(val: ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
|
||||
async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
|
||||
let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
|
||||
let val = Value::des(&buf)?;
|
||||
let desc = match val {
|
||||
@@ -906,13 +906,32 @@ impl DeltaLayerInner {
|
||||
|
||||
for entry in keys {
|
||||
let DeltaEntry { key, lsn, val, .. } = entry;
|
||||
let desc = match dump_blob(val, ctx).await {
|
||||
let desc = match dump_blob(&val, ctx).await {
|
||||
Ok(desc) => desc,
|
||||
Err(err) => {
|
||||
format!("ERROR: {err}")
|
||||
}
|
||||
};
|
||||
println!(" key {key} at {lsn}: {desc}");
|
||||
|
||||
// Print more details about CHECKPOINT records. It would be nice to print details
// of many other record types too, but these are particularly interesting, as we
// have a lot of special processing for them in walingest.rs.
|
||||
use pageserver_api::key::CHECKPOINT_KEY;
|
||||
use postgres_ffi::CheckPoint;
|
||||
if key == CHECKPOINT_KEY {
|
||||
let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
|
||||
let val = Value::des(&buf)?;
|
||||
match val {
|
||||
Value::Image(img) => {
|
||||
let checkpoint = CheckPoint::decode(&img)?;
|
||||
println!(" CHECKPOINT: {:?}", checkpoint);
|
||||
}
|
||||
Value::WalRecord(_rec) => {
|
||||
println!(" unexpected walrecord value for checkpoint key");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -34,7 +34,7 @@ use crate::tenant::storage_layer::{
|
||||
LayerAccessStats, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::tenant::Timeline;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::Bytes;
|
||||
@@ -327,7 +327,7 @@ impl ImageLayer {
|
||||
{
|
||||
let file = VirtualFile::open_with_options(
|
||||
path,
|
||||
&*std::fs::OpenOptions::new().read(true).write(true),
|
||||
virtual_file::OpenOptions::new().read(true).write(true),
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open file '{}'", path))?;
|
||||
@@ -492,11 +492,15 @@ impl ImageLayerWriterInner {
|
||||
},
|
||||
);
|
||||
info!("new image layer {path}");
|
||||
let mut file = VirtualFile::open_with_options(
|
||||
&path,
|
||||
std::fs::OpenOptions::new().write(true).create_new(true),
|
||||
)
|
||||
.await?;
|
||||
let mut file = {
|
||||
VirtualFile::open_with_options(
|
||||
&path,
|
||||
virtual_file::OpenOptions::new()
|
||||
.write(true)
|
||||
.create_new(true),
|
||||
)
|
||||
.await?
|
||||
};
|
||||
// make room for the header block
|
||||
file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
|
||||
let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
|
||||
|
||||
@@ -55,13 +55,13 @@ impl PersistentLayerDesc {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn new_test(key_range: Range<Key>) -> Self {
|
||||
pub fn new_test(key_range: Range<Key>, lsn_range: Range<Lsn>, is_delta: bool) -> Self {
|
||||
Self {
|
||||
tenant_shard_id: TenantShardId::unsharded(TenantId::generate()),
|
||||
timeline_id: TimelineId::generate(),
|
||||
key_range,
|
||||
lsn_range: Lsn(0)..Lsn(1),
|
||||
is_delta: false,
|
||||
lsn_range,
|
||||
is_delta,
|
||||
file_size: 0,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@ use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::metrics::TENANT_TASK_EVENTS;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
|
||||
use crate::tenant::timeline::CompactionError;
|
||||
use crate::tenant::{Tenant, TenantState};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
@@ -181,8 +182,11 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
);
|
||||
error_run_count += 1;
|
||||
let wait_duration = Duration::from_secs_f64(wait_duration);
|
||||
error!(
|
||||
"Compaction failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
|
||||
log_compaction_error(
|
||||
&e,
|
||||
error_run_count,
|
||||
&wait_duration,
|
||||
cancel.is_cancelled(),
|
||||
);
|
||||
wait_duration
|
||||
} else {
|
||||
@@ -210,6 +214,58 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
|
||||
}
|
||||
|
||||
fn log_compaction_error(
|
||||
e: &CompactionError,
|
||||
error_run_count: u32,
|
||||
sleep_duration: &std::time::Duration,
|
||||
task_cancelled: bool,
|
||||
) {
|
||||
use crate::tenant::upload_queue::NotInitialized;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use CompactionError::*;
|
||||
|
||||
enum LooksLike {
|
||||
Info,
|
||||
Error,
|
||||
}
|
||||
|
||||
let decision = match e {
|
||||
ShuttingDown => None,
|
||||
_ if task_cancelled => Some(LooksLike::Info),
|
||||
Other(e) => {
|
||||
let root_cause = e.root_cause();
|
||||
|
||||
let is_stopping = {
|
||||
let upload_queue = root_cause
|
||||
.downcast_ref::<NotInitialized>()
|
||||
.is_some_and(|e| e.is_stopping());
|
||||
|
||||
let timeline = root_cause
|
||||
.downcast_ref::<PageReconstructError>()
|
||||
.is_some_and(|e| e.is_stopping());
|
||||
|
||||
upload_queue || timeline
|
||||
};
|
||||
|
||||
if is_stopping {
|
||||
Some(LooksLike::Info)
|
||||
} else {
|
||||
Some(LooksLike::Error)
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
match decision {
|
||||
Some(LooksLike::Info) => info!(
|
||||
"Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:#}",
|
||||
),
|
||||
Some(LooksLike::Error) => error!(
|
||||
"Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:?}",
|
||||
),
|
||||
None => {}
|
||||
}
|
||||
}
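log_compaction_error decides the log level by walking the anyhow chain to its root cause and downcasting to known shutdown-like error types. A compact sketch of that classification follows; it assumes anyhow and thiserror as dependencies, and QueueStopping is an invented stand-in for the crate's NotInitialized / PageReconstructError checks:

use anyhow::anyhow;

// Invented stand-in for a "we are shutting down" error buried in an anyhow chain.
#[derive(Debug, thiserror::Error)]
#[error("upload queue is stopping")]
struct QueueStopping;

enum LogLevel {
    Info,
    Error,
}

// Pick a log level for a compaction failure: shutdown-like root causes are expected
// during tenant teardown, so they are only informational -- same idea as above.
fn classify(err: &anyhow::Error, task_cancelled: bool) -> LogLevel {
    if task_cancelled || err.root_cause().downcast_ref::<QueueStopping>().is_some() {
        LogLevel::Info
    } else {
        LogLevel::Error
    }
}

fn main() {
    let benign = anyhow::Error::new(QueueStopping).context("flushing frozen layer");
    assert!(matches!(classify(&benign, false), LogLevel::Info));

    let real = anyhow!("disk full");
    assert!(matches!(classify(&real, false), LogLevel::Error));
}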
///
|
||||
/// GC task's main loop
|
||||
///
|
||||
|
||||
@@ -14,6 +14,7 @@ use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::{
|
||||
keyspace::{key_range_size, KeySpaceAccum},
|
||||
models::{
|
||||
DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
|
||||
LayerMapInfo, TimelineState,
|
||||
@@ -32,7 +33,7 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::sync::gate::Gate;
|
||||
|
||||
use std::collections::{BinaryHeap, HashMap, HashSet};
|
||||
use std::collections::{BTreeMap, BinaryHeap, HashMap, HashSet};
|
||||
use std::ops::{Deref, Range};
|
||||
use std::pin::pin;
|
||||
use std::sync::atomic::Ordering as AtomicOrdering;
|
||||
@@ -123,7 +124,7 @@ pub(super) enum FlushLoopState {
|
||||
|
||||
/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct Hole {
|
||||
pub(crate) struct Hole {
|
||||
key_range: Range<Key>,
|
||||
coverage_size: usize,
|
||||
}
|
||||
@@ -391,8 +392,7 @@ pub(crate) enum PageReconstructError {
|
||||
#[error("Ancestor LSN wait error: {0}")]
|
||||
AncestorLsnTimeout(#[from] WaitLsnError),
|
||||
|
||||
/// The operation was cancelled
|
||||
#[error("Cancelled")]
|
||||
#[error("timeline shutting down")]
|
||||
Cancelled,
|
||||
|
||||
/// The ancestor of this is being stopped
|
||||
@@ -404,6 +404,34 @@ pub(crate) enum PageReconstructError {
|
||||
WalRedo(anyhow::Error),
|
||||
}
|
||||
|
||||
impl PageReconstructError {
|
||||
/// Returns true if this error indicates a tenant/timeline shutdown alike situation
|
||||
pub(crate) fn is_stopping(&self) -> bool {
|
||||
use PageReconstructError::*;
|
||||
match self {
|
||||
Other(_) => false,
|
||||
AncestorLsnTimeout(_) => false,
|
||||
Cancelled | AncestorStopping(_) => true,
|
||||
WalRedo(_) => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
enum CreateImageLayersError {
|
||||
#[error("timeline shutting down")]
|
||||
Cancelled,
|
||||
|
||||
#[error(transparent)]
|
||||
GetVectoredError(GetVectoredError),
|
||||
|
||||
#[error(transparent)]
|
||||
PageReconstructError(PageReconstructError),
|
||||
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
enum FlushLayerError {
|
||||
/// Timeline cancellation token was cancelled
|
||||
@@ -411,7 +439,34 @@ enum FlushLayerError {
|
||||
Cancelled,
|
||||
|
||||
#[error(transparent)]
|
||||
PageReconstructError(#[from] PageReconstructError),
|
||||
CreateImageLayersError(CreateImageLayersError),
|
||||
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum GetVectoredError {
|
||||
#[error("timeline shutting down")]
|
||||
Cancelled,
|
||||
|
||||
#[error("Requested too many keys: {0} > {}", Timeline::MAX_GET_VECTORED_KEYS)]
|
||||
Oversized(u64),
|
||||
|
||||
#[error("Requested at invalid LSN: {0}")]
|
||||
InvalidLsn(Lsn),
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum GetReadyAncestorError {
|
||||
#[error("ancestor timeline {0} is being stopped")]
|
||||
AncestorStopping(TimelineId),
|
||||
|
||||
#[error("Ancestor LSN wait error: {0}")]
|
||||
AncestorLsnTimeout(#[from] WaitLsnError),
|
||||
|
||||
#[error("Cancelled")]
|
||||
Cancelled,
|
||||
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
@@ -456,22 +511,73 @@ pub(crate) enum WaitLsnError {
|
||||
Timeout(String),
|
||||
}
|
||||
|
||||
// The impls below achieve cancellation mapping for errors.
|
||||
// Perhaps there's a way of achieving this with less cruft.
|
||||
|
||||
impl From<CreateImageLayersError> for CompactionError {
|
||||
fn from(e: CreateImageLayersError) -> Self {
|
||||
match e {
|
||||
CreateImageLayersError::Cancelled => CompactionError::ShuttingDown,
|
||||
_ => CompactionError::Other(e.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<CreateImageLayersError> for FlushLayerError {
|
||||
fn from(e: CreateImageLayersError) -> Self {
|
||||
match e {
|
||||
CreateImageLayersError::Cancelled => FlushLayerError::Cancelled,
|
||||
any => FlushLayerError::CreateImageLayersError(any),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<PageReconstructError> for CreateImageLayersError {
|
||||
fn from(e: PageReconstructError) -> Self {
|
||||
match e {
|
||||
PageReconstructError::Cancelled => CreateImageLayersError::Cancelled,
|
||||
_ => CreateImageLayersError::PageReconstructError(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<GetVectoredError> for CreateImageLayersError {
|
||||
fn from(e: GetVectoredError) -> Self {
|
||||
match e {
|
||||
GetVectoredError::Cancelled => CreateImageLayersError::Cancelled,
|
||||
_ => CreateImageLayersError::GetVectoredError(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<GetReadyAncestorError> for PageReconstructError {
|
||||
fn from(e: GetReadyAncestorError) -> Self {
|
||||
use GetReadyAncestorError::*;
|
||||
match e {
|
||||
AncestorStopping(tid) => PageReconstructError::AncestorStopping(tid),
|
||||
AncestorLsnTimeout(wait_err) => PageReconstructError::AncestorLsnTimeout(wait_err),
|
||||
Cancelled => PageReconstructError::Cancelled,
|
||||
Other(other) => PageReconstructError::Other(other),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
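
Editor's note: the From impls in the hunk above exist so that the `?` operator can move an error up through the call chain while keeping the shutdown/cancellation case distinguishable from genuine failures. A rough, self-contained sketch of the same pattern follows; the InnerError/OuterError names and the helper functions are made up for illustration and are not the pageserver's real types.

use anyhow::anyhow;

#[derive(thiserror::Error, Debug)]
enum InnerError {
    #[error("shutting down")]
    Cancelled,
    #[error(transparent)]
    Other(#[from] anyhow::Error),
}

#[derive(thiserror::Error, Debug)]
enum OuterError {
    #[error("shutting down")]
    Cancelled,
    #[error(transparent)]
    Inner(InnerError),
}

impl From<InnerError> for OuterError {
    fn from(e: InnerError) -> Self {
        match e {
            // Keep cancellation recognizable instead of burying it in an opaque error.
            InnerError::Cancelled => OuterError::Cancelled,
            other => OuterError::Inner(other),
        }
    }
}

fn inner_op(shutting_down: bool) -> Result<(), InnerError> {
    if shutting_down {
        return Err(InnerError::Cancelled);
    }
    Err(anyhow!("real failure").into())
}

fn outer_op(shutting_down: bool) -> Result<(), OuterError> {
    inner_op(shutting_down)?; // `?` applies the From impl above
    Ok(())
}

A caller can then treat OuterError::Cancelled as a quiet shutdown and log everything else as a real error, which appears to be the intent of the mapping impls in this diff.
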
/// Public interface functions
impl Timeline {
    /// Get the LSN where this branch was created
    pub fn get_ancestor_lsn(&self) -> Lsn {
    pub(crate) fn get_ancestor_lsn(&self) -> Lsn {
        self.ancestor_lsn
    }

    /// Get the ancestor's timeline id
    pub fn get_ancestor_timeline_id(&self) -> Option<TimelineId> {
    pub(crate) fn get_ancestor_timeline_id(&self) -> Option<TimelineId> {
        self.ancestor_timeline
            .as_ref()
            .map(|ancestor| ancestor.timeline_id)
    }

    /// Lock and get timeline's GC cutoff
    pub fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard<Lsn> {
    pub(crate) fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard<Lsn> {
        self.latest_gc_cutoff_lsn.read()
    }

@@ -575,28 +681,79 @@ impl Timeline {
        res
    }

    pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32;

    /// Look up multiple page versions at a given LSN
    ///
    /// This naive implementation will be replaced with a more efficient one
    /// which actually vectorizes the read path.
    pub(crate) async fn get_vectored(
        &self,
        key_ranges: &[Range<Key>],
        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
        if !lsn.is_valid() {
            return Err(GetVectoredError::InvalidLsn(lsn));
        }

        let key_count = key_ranges
            .iter()
            .map(|range| key_range_size(range) as u64)
            .sum();
        if key_count > Timeline::MAX_GET_VECTORED_KEYS {
            return Err(GetVectoredError::Oversized(key_count));
        }

        let _timer = crate::metrics::GET_VECTORED_LATENCY
            .for_task_kind(ctx.task_kind())
            .map(|t| t.start_timer());

        let mut values = BTreeMap::new();
        for range in key_ranges {
            let mut key = range.start;
            while key != range.end {
                assert!(!self.shard_identity.is_key_disposable(&key));

                let block = self.get(key, lsn, ctx).await;

                if matches!(
                    block,
                    Err(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
                ) {
                    return Err(GetVectoredError::Cancelled);
                }

                values.insert(key, block);
                key = key.next();
            }
        }

        Ok(values)
    }
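
Editor's note: the new get_vectored above validates the request (valid LSN, at most MAX_GET_VECTORED_KEYS keys) and then, for now, simply loops over every key and reuses the single-key read path, recording one result per key. A simplified, self-contained model of that shape is sketched below; keys are plain u64 values and the `get` closure stands in for Timeline::get, so none of these names are the real pageserver API.

use std::collections::BTreeMap;
use std::ops::Range;

const MAX_GET_VECTORED_KEYS: u64 = 32;

fn get_vectored_sketch(
    key_ranges: &[Range<u64>],
    get: impl Fn(u64) -> Result<Vec<u8>, String>,
) -> Result<BTreeMap<u64, Result<Vec<u8>, String>>, String> {
    // Reject oversized requests up front, mirroring GetVectoredError::Oversized.
    let key_count: u64 = key_ranges.iter().map(|r| r.end - r.start).sum();
    if key_count > MAX_GET_VECTORED_KEYS {
        return Err(format!("requested too many keys: {key_count}"));
    }

    let mut values = BTreeMap::new();
    for range in key_ranges {
        for key in range.clone() {
            // Each key gets its own Ok/Err slot; only a request-wide problem aborts the call.
            values.insert(key, get(key));
        }
    }
    Ok(values)
}

The real method additionally rejects invalid LSNs, records a per-task-kind latency metric, and aborts the whole request when an individual lookup reports cancellation.
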
    /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
    pub fn get_last_record_lsn(&self) -> Lsn {
    pub(crate) fn get_last_record_lsn(&self) -> Lsn {
        self.last_record_lsn.load().last
    }

    pub fn get_prev_record_lsn(&self) -> Lsn {
    pub(crate) fn get_prev_record_lsn(&self) -> Lsn {
        self.last_record_lsn.load().prev
    }

    /// Atomically get both last and prev.
    pub fn get_last_record_rlsn(&self) -> RecordLsn {
    pub(crate) fn get_last_record_rlsn(&self) -> RecordLsn {
        self.last_record_lsn.load()
    }

    pub fn get_disk_consistent_lsn(&self) -> Lsn {
    pub(crate) fn get_disk_consistent_lsn(&self) -> Lsn {
        self.disk_consistent_lsn.load()
    }

    /// remote_consistent_lsn from the perspective of the tenant's current generation,
    /// not validated with control plane yet.
    /// See [`Self::get_remote_consistent_lsn_visible`].
    pub fn get_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
    pub(crate) fn get_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
        if let Some(remote_client) = &self.remote_client {
            remote_client.remote_consistent_lsn_projected()
        } else {
@@ -607,7 +764,7 @@ impl Timeline {
    /// remote_consistent_lsn which the tenant is guaranteed not to go backward from,
    /// i.e. a value of remote_consistent_lsn_projected which has undergone
    /// generation validation in the deletion queue.
    pub fn get_remote_consistent_lsn_visible(&self) -> Option<Lsn> {
    pub(crate) fn get_remote_consistent_lsn_visible(&self) -> Option<Lsn> {
        if let Some(remote_client) = &self.remote_client {
            remote_client.remote_consistent_lsn_visible()
        } else {
@@ -618,7 +775,7 @@ impl Timeline {
    /// The sum of the file size of all historic layers in the layer map.
    /// This method makes no distinction between local and remote layers.
    /// Hence, the result **does not represent local filesystem usage**.
    pub async fn layer_size_sum(&self) -> u64 {
    pub(crate) async fn layer_size_sum(&self) -> u64 {
        let guard = self.layers.read().await;
        let layer_map = guard.layer_map();
        let mut size = 0;
@@ -628,7 +785,7 @@ impl Timeline {
        size
    }

    pub fn resident_physical_size(&self) -> u64 {
    pub(crate) fn resident_physical_size(&self) -> u64 {
        self.metrics.resident_physical_size_get()
    }

@@ -704,7 +861,7 @@ impl Timeline {
    }

    /// Check that it is valid to request operations with that lsn.
    pub fn check_lsn_is_in_scope(
    pub(crate) fn check_lsn_is_in_scope(
        &self,
        lsn: Lsn,
        latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
@@ -720,7 +877,7 @@ impl Timeline {

    /// Flush to disk all data that was written with the put_* functions
    #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
    pub async fn freeze_and_flush(&self) -> anyhow::Result<()> {
    pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> {
        self.freeze_inmem_layer(false).await;
        self.flush_frozen_layers_and_wait().await
    }
@@ -864,7 +1021,7 @@ impl Timeline {
    }

    /// Mutate the timeline with a [`TimelineWriter`].
    pub async fn writer(&self) -> TimelineWriter<'_> {
    pub(crate) async fn writer(&self) -> TimelineWriter<'_> {
        TimelineWriter {
            tl: self,
            _write_guard: self.write_lock.lock().await,
@@ -876,7 +1033,7 @@ impl Timeline {
    ///
    /// Also flush after a period of time without new data -- it helps
    /// safekeepers to regard pageserver as caught up and suspend activity.
    pub async fn check_checkpoint_distance(self: &Arc<Timeline>) -> anyhow::Result<()> {
    pub(crate) async fn check_checkpoint_distance(self: &Arc<Timeline>) -> anyhow::Result<()> {
        let last_lsn = self.get_last_record_lsn();
        let open_layer_size = {
            let guard = self.layers.read().await;
@@ -914,13 +1071,16 @@ impl Timeline {
        Ok(())
    }

    pub fn activate(
    pub(crate) fn activate(
        self: &Arc<Self>,
        broker_client: BrokerClientChannel,
        background_jobs_can_start: Option<&completion::Barrier>,
        ctx: &RequestContext,
    ) {
        self.spawn_initial_logical_size_computation_task(ctx);
        if self.tenant_shard_id.is_zero() {
            // Logical size is only maintained accurately on shard zero.
            self.spawn_initial_logical_size_computation_task(ctx);
        }
        self.launch_wal_receiver(ctx, broker_client);
        self.set_state(TimelineState::Active);
        self.launch_eviction_task(background_jobs_can_start);
@@ -930,7 +1090,6 @@ impl Timeline {
    /// also to remote storage. This method can easily take multiple seconds for a busy timeline.
    ///
    /// While we are flushing, we continue to accept read I/O.
    #[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
    pub(crate) async fn flush_and_shutdown(&self) {
        debug_assert_current_span_has_tenant_and_timeline_id();

@@ -979,6 +1138,8 @@ impl Timeline {
    /// Shut down immediately, without waiting for any open layers to flush to disk. This is a subset of
    /// the graceful [`Timeline::flush_and_shutdown`] function.
    pub(crate) async fn shutdown(&self) {
        span::debug_assert_current_span_has_tenant_and_timeline_id();

        // Signal any subscribers to our cancellation token to drop out
        tracing::debug!("Cancelling CancellationToken");
        self.cancel.cancel();
@@ -1014,7 +1175,7 @@ impl Timeline {
        self.gate.close().await;
    }

    pub fn set_state(&self, new_state: TimelineState) {
    pub(crate) fn set_state(&self, new_state: TimelineState) {
        match (self.current_state(), new_state) {
            (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => {
                info!("Ignoring new state, equal to the existing one: {equal_state_2:?}");
@@ -1034,7 +1195,7 @@ impl Timeline {
        }
    }

    pub fn set_broken(&self, reason: String) {
    pub(crate) fn set_broken(&self, reason: String) {
        let backtrace_str: String = format!("{}", std::backtrace::Backtrace::force_capture());
        let broken_state = TimelineState::Broken {
            reason,
@@ -1048,27 +1209,27 @@ impl Timeline {
        self.cancel.cancel();
    }

    pub fn current_state(&self) -> TimelineState {
    pub(crate) fn current_state(&self) -> TimelineState {
        self.state.borrow().clone()
    }

    pub fn is_broken(&self) -> bool {
    pub(crate) fn is_broken(&self) -> bool {
        matches!(&*self.state.borrow(), TimelineState::Broken { .. })
    }

    pub fn is_active(&self) -> bool {
    pub(crate) fn is_active(&self) -> bool {
        self.current_state() == TimelineState::Active
    }

    pub fn is_stopping(&self) -> bool {
    pub(crate) fn is_stopping(&self) -> bool {
        self.current_state() == TimelineState::Stopping
    }

    pub fn subscribe_for_state_updates(&self) -> watch::Receiver<TimelineState> {
    pub(crate) fn subscribe_for_state_updates(&self) -> watch::Receiver<TimelineState> {
        self.state.subscribe()
    }

    pub async fn wait_to_become_active(
    pub(crate) async fn wait_to_become_active(
        &self,
        _ctx: &RequestContext, // Prepare for use by cancellation
    ) -> Result<(), TimelineState> {
@@ -1093,7 +1254,7 @@ impl Timeline {
        }
    }

    pub async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo {
    pub(crate) async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo {
        let guard = self.layers.read().await;
        let layer_map = guard.layer_map();
        let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1);
@@ -1117,7 +1278,10 @@ impl Timeline {
    }

    #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
    pub async fn download_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
    pub(crate) async fn download_layer(
        &self,
        layer_file_name: &str,
    ) -> anyhow::Result<Option<bool>> {
        let Some(layer) = self.find_layer(layer_file_name).await else {
            return Ok(None);
        };
@@ -1134,7 +1298,7 @@ impl Timeline {
    /// Evict just one layer.
    ///
    /// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`.
    pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
    pub(crate) async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
        let _gate = self
            .gate
            .enter()
@@ -1157,6 +1321,13 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;

// Private functions
impl Timeline {
    pub(crate) fn get_lazy_slru_download(&self) -> bool {
        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
        tenant_conf
            .lazy_slru_download
            .unwrap_or(self.conf.default_tenant_conf.lazy_slru_download)
    }

    fn get_checkpoint_distance(&self) -> u64 {
        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
        tenant_conf
@@ -1365,7 +1536,7 @@ impl Timeline {
            delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())),

            cancel,
            gate: Gate::new(format!("Timeline<{tenant_shard_id}/{timeline_id}>")),
            gate: Gate::default(),

            compaction_lock: tokio::sync::Mutex::default(),
            gc_lock: tokio::sync::Mutex::default(),
@@ -1687,6 +1858,12 @@ impl Timeline {
        priority: GetLogicalSizePriority,
        ctx: &RequestContext,
    ) -> logical_size::CurrentLogicalSize {
        if !self.tenant_shard_id.is_zero() {
            // Logical size is only accurately maintained on shard zero: when called elsewhere, for example
            // when HTTP API is serving a GET for timeline zero, return zero
            return logical_size::CurrentLogicalSize::Approximate(logical_size::Approximate::zero());
        }

        let current_size = self.current_logical_size.current_size();
        debug!("Current size: {current_size:?}");
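
Editor's note: the hunk above makes get_current_logical_size return an explicitly approximate zero on shards other than zero, because only shard zero tracks the logical size accurately. A simplified model of that exact-versus-approximate split is sketched below; the enum and function names are invented for illustration and stand in for the real logical_size types.

// Only shard zero maintains an accurate value; other shards return a labelled guess.
enum CurrentLogicalSizeSketch {
    Exact(u64),
    Approximate(u64),
}

impl CurrentLogicalSizeSketch {
    fn size_dont_care_about_accuracy(&self) -> u64 {
        match self {
            CurrentLogicalSizeSketch::Exact(v) | CurrentLogicalSizeSketch::Approximate(v) => *v,
        }
    }
}

fn current_logical_size_for(shard_is_zero: bool, tracked: u64) -> CurrentLogicalSizeSketch {
    if shard_is_zero {
        CurrentLogicalSizeSketch::Exact(tracked)
    } else {
        // Mirrors CurrentLogicalSize::Approximate(Approximate::zero()) in the diff.
        CurrentLogicalSizeSketch::Approximate(0)
    }
}

Keeping the "approximate" label in the type, rather than silently returning 0, lets callers that care about accuracy tell the two cases apart.
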
@@ -1929,7 +2106,7 @@ impl Timeline {
            .expect("only this task sets it");
    }

    pub fn spawn_ondemand_logical_size_calculation(
    pub(crate) fn spawn_ondemand_logical_size_calculation(
        self: &Arc<Self>,
        lsn: Lsn,
        cause: LogicalSizeCalculationCause,
@@ -1975,6 +2152,9 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> Result<u64, CalculateLogicalSizeError> {
        span::debug_assert_current_span_has_tenant_and_timeline_id();
        // We should never be calculating logical sizes on shard !=0, because these shards do not have
        // accurate relation sizes, and they do not emit consumption metrics.
        debug_assert!(self.tenant_shard_id.is_zero());

        let _guard = self.gate.enter();

@@ -2008,7 +2188,7 @@ impl Timeline {
    /// # Cancel-Safety
    ///
    /// This method is cancellation-safe.
    pub async fn calculate_logical_size(
    async fn calculate_logical_size(
        &self,
        up_to_lsn: Lsn,
        cause: LogicalSizeCalculationCause,
@@ -2262,60 +2442,8 @@ impl Timeline {
                    timeline.ancestor_lsn,
                    cont_lsn
                );
                let ancestor = match timeline.get_ancestor_timeline() {
                    Ok(timeline) => timeline,
                    Err(e) => return Err(PageReconstructError::from(e)),
                };

                // It's possible that the ancestor timeline isn't active yet, or
                // is active but hasn't yet caught up to the branch point. Wait
                // for it.
                //
                // This cannot happen while the pageserver is running normally,
                // because you cannot create a branch from a point that isn't
                // present in the pageserver yet. However, we don't wait for the
                // branch point to be uploaded to cloud storage before creating
                // a branch. I.e., the branch LSN need not be remote consistent
                // for the branching operation to succeed.
                //
                // Hence, if we try to load a tenant in such a state where
                // 1. the existence of the branch was persisted (in IndexPart and/or locally)
                // 2. but the ancestor state is behind branch_lsn because it was not yet persisted
                // then we will need to wait for the ancestor timeline to
                // re-stream WAL up to branch_lsn before we access it.
                //
                // How can a tenant get in such a state?
                // - ungraceful pageserver process exit
                // - detach+attach => this is a bug, https://github.com/neondatabase/neon/issues/4219
                //
                // NB: this could be avoided by requiring
                //     branch_lsn >= remote_consistent_lsn
                // during branch creation.
                match ancestor.wait_to_become_active(ctx).await {
                    Ok(()) => {}
                    Err(TimelineState::Stopping) => {
                        return Err(PageReconstructError::AncestorStopping(ancestor.timeline_id));
                    }
                    Err(state) => {
                        return Err(PageReconstructError::Other(anyhow::anyhow!(
                            "Timeline {} will not become active. Current state: {:?}",
                            ancestor.timeline_id,
                            &state,
                        )));
                    }
                }
                ancestor
                    .wait_lsn(timeline.ancestor_lsn, ctx)
                    .await
                    .map_err(|e| match e {
                        e @ WaitLsnError::Timeout(_) => PageReconstructError::AncestorLsnTimeout(e),
                        WaitLsnError::Shutdown => PageReconstructError::Cancelled,
                        e @ WaitLsnError::BadState => {
                            PageReconstructError::Other(anyhow::anyhow!(e))
                        }
                    })?;

                timeline_owned = ancestor;
                timeline_owned = timeline.get_ready_ancestor_timeline(ctx).await?;
                timeline = &*timeline_owned;
                prev_lsn = Lsn(u64::MAX);
                continue 'outer;
@@ -2445,6 +2573,66 @@ impl Timeline {
        Some((lsn, img))
    }

    async fn get_ready_ancestor_timeline(
        &self,
        ctx: &RequestContext,
    ) -> Result<Arc<Timeline>, GetReadyAncestorError> {
        let ancestor = match self.get_ancestor_timeline() {
            Ok(timeline) => timeline,
            Err(e) => return Err(GetReadyAncestorError::from(e)),
        };

        // It's possible that the ancestor timeline isn't active yet, or
        // is active but hasn't yet caught up to the branch point. Wait
        // for it.
        //
        // This cannot happen while the pageserver is running normally,
        // because you cannot create a branch from a point that isn't
        // present in the pageserver yet. However, we don't wait for the
        // branch point to be uploaded to cloud storage before creating
        // a branch. I.e., the branch LSN need not be remote consistent
        // for the branching operation to succeed.
        //
        // Hence, if we try to load a tenant in such a state where
        // 1. the existence of the branch was persisted (in IndexPart and/or locally)
        // 2. but the ancestor state is behind branch_lsn because it was not yet persisted
        // then we will need to wait for the ancestor timeline to
        // re-stream WAL up to branch_lsn before we access it.
        //
        // How can a tenant get in such a state?
        // - ungraceful pageserver process exit
        // - detach+attach => this is a bug, https://github.com/neondatabase/neon/issues/4219
        //
        // NB: this could be avoided by requiring
        //     branch_lsn >= remote_consistent_lsn
        // during branch creation.
        match ancestor.wait_to_become_active(ctx).await {
            Ok(()) => {}
            Err(TimelineState::Stopping) => {
                return Err(GetReadyAncestorError::AncestorStopping(
                    ancestor.timeline_id,
                ));
            }
            Err(state) => {
                return Err(GetReadyAncestorError::Other(anyhow::anyhow!(
                    "Timeline {} will not become active. Current state: {:?}",
                    ancestor.timeline_id,
                    &state,
                )));
            }
        }
        ancestor
            .wait_lsn(self.ancestor_lsn, ctx)
            .await
            .map_err(|e| match e {
                e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e),
                WaitLsnError::Shutdown => GetReadyAncestorError::Cancelled,
                e @ WaitLsnError::BadState => GetReadyAncestorError::Other(anyhow::anyhow!(e)),
            })?;

        Ok(ancestor)
    }

    fn get_ancestor_timeline(&self) -> anyhow::Result<Arc<Timeline>> {
        let ancestor = self.ancestor_timeline.as_ref().with_context(|| {
            format!(
@@ -2582,7 +2770,7 @@ impl Timeline {
                    return;
                }
                err @ Err(
                    FlushLayerError::Other(_) | FlushLayerError::PageReconstructError(_),
                    FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_),
                ) => {
                    error!("could not flush frozen layer: {err:?}");
                    break err;
@@ -2655,12 +2843,12 @@ impl Timeline {
    }

    /// Flush one frozen in-memory layer to disk, as a new delta layer.
    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id, layer=%frozen_layer))]
    async fn flush_frozen_layer(
        self: &Arc<Self>,
        frozen_layer: Arc<InMemoryLayer>,
        ctx: &RequestContext,
    ) -> Result<(), FlushLayerError> {
        span::debug_assert_current_span_has_tenant_and_timeline_id();
        // As a special case, when we have just imported an image into the repository,
        // instead of writing out a L0 delta layer, we directly write out image layer
        // files instead. This is possible as long as *all* the data imported into the
@@ -2859,6 +3047,21 @@ impl Timeline {
        Ok(())
    }

    pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> {
        if let Some(remote_client) = &self.remote_client {
            remote_client
                .preserve_initdb_archive(
                    &self.tenant_shard_id.tenant_id,
                    &self.timeline_id,
                    &self.cancel,
                )
                .await?;
        } else {
            bail!("No remote storage configured, but was asked to backup the initdb archive for {} / {}", self.tenant_shard_id.tenant_id, self.timeline_id);
        }
        Ok(())
    }

    // Write out the given frozen in-memory layer as a new L0 delta file. This L0 file will not be tracked
    // in layer map immediately. The caller is responsible to put it into the layer map.
    async fn create_delta_layer(
@@ -2950,11 +3153,7 @@ impl Timeline {
    }

    // Is it time to create a new image layer for the given partition?
    async fn time_for_new_image_layer(
        &self,
        partition: &KeySpace,
        lsn: Lsn,
    ) -> anyhow::Result<bool> {
    async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
        let threshold = self.get_image_creation_threshold();

        let guard = self.layers.read().await;
@@ -2974,20 +3173,20 @@ impl Timeline {
                    // but the range is already covered by image layers at more recent LSNs. Before we
                    // create a new image layer, check if the range is already covered at more recent LSNs.
                    if !layers
                        .image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1))?
                        .image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1))
                    {
                        debug!(
                            "Force generation of layer {}-{} wanted by GC, cutoff={}, lsn={})",
                            img_range.start, img_range.end, cutoff_lsn, lsn
                        );
                        return Ok(true);
                        return true;
                    }
                }
            }
        }

        for part_range in &partition.ranges {
            let image_coverage = layers.image_coverage(part_range, lsn)?;
            let image_coverage = layers.image_coverage(part_range, lsn);
            for (img_range, last_img) in image_coverage {
                let img_lsn = if let Some(last_img) = last_img {
                    last_img.get_lsn_range().end
@@ -3008,7 +3207,7 @@ impl Timeline {
                // after we read last_record_lsn, which is passed here in the 'lsn' argument.
                if img_lsn < lsn {
                    let num_deltas =
                        layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold))?;
                        layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold));

                    max_deltas = max_deltas.max(num_deltas);
                    if num_deltas >= threshold {
@@ -3016,7 +3215,7 @@ impl Timeline {
                            "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}",
                            img_range.start, img_range.end, num_deltas, img_lsn, lsn
                        );
                        return Ok(true);
                        return true;
                    }
                }
            }
@@ -3026,7 +3225,7 @@ impl Timeline {
            max_deltas,
            "none of the partitioned ranges had >= {threshold} deltas"
        );
        Ok(false)
        false
    }
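
Editor's note: time_for_new_image_layer above becomes infallible (plain bool) and keeps its heuristic: for each key range, count how many delta layers sit on top of the most recent image and create a new image once any range crosses a threshold. A rough, self-contained model of that decision follows; RangeState and the function name are hypothetical and stand in for the layer-map queries used in the real code.

// Rough model of the image-creation heuristic.
struct RangeState {
    deltas_since_last_image: usize,
}

fn time_for_new_image_layer_sketch(ranges: &[RangeState], threshold: usize) -> bool {
    let mut max_deltas = 0;
    for range in ranges {
        max_deltas = max_deltas.max(range.deltas_since_last_image);
        if range.deltas_since_last_image >= threshold {
            // Reads of this range would have to replay too many deltas,
            // so it is time to materialize a new image layer.
            return true;
        }
    }
    // None of the ranges crossed the threshold; the real code logs max_deltas here.
    let _ = max_deltas;
    false
}
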
    #[tracing::instrument(skip_all, fields(%lsn, %force))]
@@ -3036,7 +3235,7 @@ impl Timeline {
        lsn: Lsn,
        force: bool,
        ctx: &RequestContext,
    ) -> Result<Vec<ResidentLayer>, PageReconstructError> {
    ) -> Result<Vec<ResidentLayer>, CreateImageLayersError> {
        let timer = self.metrics.create_images_time_histo.start_timer();
        let mut image_layers = Vec::new();

@@ -3054,7 +3253,7 @@ impl Timeline {
        for partition in partitioning.parts.iter() {
            let img_range = start..partition.ranges.last().unwrap().end;
            start = img_range.end;
            if force || self.time_for_new_image_layer(partition, lsn).await? {
            if force || self.time_for_new_image_layer(partition, lsn).await {
                let mut image_layer_writer = ImageLayerWriter::new(
                    self.conf,
                    self.timeline_id,
@@ -3065,10 +3264,12 @@ impl Timeline {
                .await?;

                fail_point!("image-layer-writer-fail-before-finish", |_| {
                    Err(PageReconstructError::Other(anyhow::anyhow!(
                    Err(CreateImageLayersError::Other(anyhow::anyhow!(
                        "failpoint image-layer-writer-fail-before-finish"
                    )))
                });

                let mut key_request_accum = KeySpaceAccum::new();
                for range in &partition.ranges {
                    let mut key = range.start;
                    while key < range.end {
@@ -3081,34 +3282,55 @@ impl Timeline {
                            key = key.next();
                            continue;
                        }
                        let img = match self.get(key, lsn, ctx).await {
                            Ok(img) => img,
                            Err(err) => {
                                // If we fail to reconstruct a VM or FSM page, we can zero the
                                // page without losing any actual user data. That seems better
                                // than failing repeatedly and getting stuck.
                                //
                                // We had a bug at one point, where we truncated the FSM and VM
                                // in the pageserver, but the Postgres didn't know about that
                                // and continued to generate incremental WAL records for pages
                                // that didn't exist in the pageserver. Trying to replay those
                                // WAL records failed to find the previous image of the page.
                                // This special case allows us to recover from that situation.
                                // See https://github.com/neondatabase/neon/issues/2601.
                                //
                                // Unfortunately we cannot do this for the main fork, or for
                                // any metadata keys, keys, as that would lead to actual data
                                // loss.
                                if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) {
                                    warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}");
                                    ZERO_PAGE.clone()
                                } else {
                                    return Err(err);
                                }
                            }
                        };

                        image_layer_writer.put_image(key, &img).await?;
                        key_request_accum.add_key(key);
                        if key_request_accum.size() >= Timeline::MAX_GET_VECTORED_KEYS
                            || key.next() == range.end
                        {
                            let results = self
                                .get_vectored(
                                    &key_request_accum.consume_keyspace().ranges,
                                    lsn,
                                    ctx,
                                )
                                .await?;

                            for (img_key, img) in results {
                                let img = match img {
                                    Ok(img) => img,
                                    Err(err) => {
                                        // If we fail to reconstruct a VM or FSM page, we can zero the
                                        // page without losing any actual user data. That seems better
                                        // than failing repeatedly and getting stuck.
                                        //
                                        // We had a bug at one point, where we truncated the FSM and VM
                                        // in the pageserver, but the Postgres didn't know about that
                                        // and continued to generate incremental WAL records for pages
                                        // that didn't exist in the pageserver. Trying to replay those
                                        // WAL records failed to find the previous image of the page.
                                        // This special case allows us to recover from that situation.
                                        // See https://github.com/neondatabase/neon/issues/2601.
                                        //
                                        // Unfortunately we cannot do this for the main fork, or for
                                        // any metadata keys, keys, as that would lead to actual data
                                        // loss.
                                        if is_rel_fsm_block_key(img_key)
                                            || is_rel_vm_block_key(img_key)
                                        {
                                            warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
                                            ZERO_PAGE.clone()
                                        } else {
                                            return Err(
                                                CreateImageLayersError::PageReconstructError(err),
                                            );
                                        }
                                    }
                                };

                                image_layer_writer.put_image(img_key, &img).await?;
                            }
                        }

                        key = key.next();
                    }
                }
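
Editor's note: the new image-layer path above no longer fetches pages one at a time; it accumulates keys into a KeySpaceAccum and issues one get_vectored call whenever the batch reaches MAX_GET_VECTORED_KEYS or the range ends. A small, self-contained sketch of that batching pattern follows; keys are plain u64 and `read_batch` stands in for get_vectored, so none of these names are the real API.

const MAX_GET_VECTORED_KEYS: usize = 32;

fn for_each_batch(keys: impl Iterator<Item = u64>, mut read_batch: impl FnMut(&[u64])) {
    let mut accum: Vec<u64> = Vec::with_capacity(MAX_GET_VECTORED_KEYS);
    for key in keys {
        accum.push(key);
        if accum.len() >= MAX_GET_VECTORED_KEYS {
            // Batch is full: issue one vectored read for all accumulated keys.
            read_batch(&accum);
            accum.clear();
        }
    }
    if !accum.is_empty() {
        // Flush the trailing partial batch, mirroring the `key.next() == range.end` check.
        read_batch(&accum);
    }
}
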
@@ -3215,7 +3437,7 @@ enum DurationRecorder {
}

impl DurationRecorder {
    pub fn till_now(&self) -> DurationRecorder {
    fn till_now(&self) -> DurationRecorder {
        match self {
            DurationRecorder::NotStarted => {
                panic!("must only call on recorded measurements")
@@ -3226,7 +3448,7 @@ impl DurationRecorder {
            }
        }
    }
    pub fn into_recorded(self) -> Option<RecordedDuration> {
    fn into_recorded(self) -> Option<RecordedDuration> {
        match self {
            DurationRecorder::NotStarted => None,
            DurationRecorder::Recorded(recorded, _) => Some(recorded),
@@ -3484,7 +3706,7 @@ impl Timeline {
            // has not so much sense, because largest holes will corresponds field1/field2 changes.
            // But we are mostly interested to eliminate holes which cause generation of excessive image layers.
            // That is why it is better to measure size of hole as number of covering image layers.
            let coverage_size = layers.image_coverage(&key_range, last_record_lsn)?.len();
            let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len();
            if coverage_size >= min_hole_coverage_size {
                heap.push(Hole {
                    key_range,
@@ -4110,7 +4332,7 @@ impl Timeline {
            // we cannot remove C, even though it's older than 2500, because
            // the delta layer 2000-3000 depends on it.
            if !layers
                .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))?
                .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))
            {
                debug!("keeping {} because it is the latest layer", l.filename());
                // Collect delta key ranges that need image layers to allow garbage
@@ -4240,7 +4462,7 @@ impl Timeline {
            .walredo_mgr
            .request_redo(key, request_lsn, data.img, data.records, self.pg_version)
            .await
            .context("Failed to reconstruct a page image:")
            .context("reconstruct a page image")
        {
            Ok(img) => img,
            Err(e) => return Err(PageReconstructError::WalRedo(e)),
@@ -4426,7 +4648,9 @@ impl Timeline {
        }
    }

    pub fn get_download_all_remote_layers_task_info(&self) -> Option<DownloadRemoteLayersTaskInfo> {
    pub(crate) fn get_download_all_remote_layers_task_info(
        &self,
    ) -> Option<DownloadRemoteLayersTaskInfo> {
        self.download_all_remote_layers_task_info
            .read()
            .unwrap()
@@ -4522,7 +4746,7 @@ fn layer_traversal_error(msg: String, path: Vec<TraversalPathItem>) -> PageRecon
// TODO Currently, Deref is used to allow easy access to read methods from this trait.
// This is probably considered a bad practice in Rust and should be fixed eventually,
// but will cause large code changes.
pub struct TimelineWriter<'a> {
pub(crate) struct TimelineWriter<'a> {
    tl: &'a Timeline,
    _write_guard: tokio::sync::MutexGuard<'a, ()>,
}
@@ -4540,7 +4764,7 @@ impl<'a> TimelineWriter<'a> {
    ///
    /// This will implicitly extend the relation, if the page is beyond the
    /// current end-of-file.
    pub async fn put(
    pub(crate) async fn put(
        &self,
        key: Key,
        lsn: Lsn,

@@ -356,12 +356,14 @@ impl DeleteTimelineFlow {
    // NB: If this fails half-way through, and is retried, the retry will go through
    // all the same steps again. Make sure the code here is idempotent, and don't
    // error out if some of the shutdown tasks have already been completed!
    #[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_shard_id.tenant_id, shard_id=%tenant.tenant_shard_id.shard_slug()))]
    #[instrument(skip_all, fields(%inplace))]
    pub async fn run(
        tenant: &Arc<Tenant>,
        timeline_id: TimelineId,
        inplace: bool,
    ) -> Result<(), DeleteTimelineError> {
        super::debug_assert_current_span_has_tenant_and_timeline_id();

        let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;

        guard.mark_in_progress()?;

@@ -319,6 +319,13 @@ impl Timeline {
        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> ControlFlow<()> {
        if !self.tenant_shard_id.is_zero() {
            // Shards !=0 do not maintain accurate relation sizes, and do not need to calculate logical size
            // for consumption metrics (consumption metrics are only sent from shard 0). We may therefore
            // skip imitating logical size accesses for eviction purposes.
            return ControlFlow::Continue(());
        }

        let mut state = self.eviction_task_timeline_state.lock().await;

        // Only do the imitate_layer accesses approximately as often as the threshold. A little

@@ -101,6 +101,14 @@ impl From<&Exact> for u64 {
    }
}

impl Approximate {
    /// For use in situations where we don't have a sane logical size value but need
    /// to return something, e.g. in HTTP API on shard >0 of a sharded tenant.
    pub(crate) fn zero() -> Self {
        Self(0)
    }
}

impl CurrentLogicalSize {
    pub(crate) fn size_dont_care_about_accuracy(&self) -> u64 {
        match self {

@@ -426,13 +426,21 @@ pub(super) async fn handle_walreceiver_connection(

    // Send the replication feedback message.
    // Regular standby_status_update fields are put into this message.
    let current_timeline_size = timeline
        .get_current_logical_size(
            crate::tenant::timeline::GetLogicalSizePriority::User,
            &ctx,
        )
        // FIXME: https://github.com/neondatabase/neon/issues/5963
        .size_dont_care_about_accuracy();
    let current_timeline_size = if timeline.tenant_shard_id.is_zero() {
        timeline
            .get_current_logical_size(
                crate::tenant::timeline::GetLogicalSizePriority::User,
                &ctx,
            )
            // FIXME: https://github.com/neondatabase/neon/issues/5963
            .size_dont_care_about_accuracy()
    } else {
        // Non-zero shards send zero for logical size. The safekeeper will ignore
        // this number. This is because in a sharded tenant, only shard zero maintains
        // accurate logical size.
        0
    };

    let status_update = PageserverFeedback {
        current_timeline_size,
        last_received_lsn,
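
Editor's note: the walreceiver hunk above makes only shard zero report its logical size in the feedback message; other shards send 0 and rely on the safekeeper ignoring the field. A minimal sketch of that choice is shown below; FeedbackSketch and build_feedback are stand-ins, not the real PageserverFeedback definition.

struct FeedbackSketch {
    current_timeline_size: u64,
    last_received_lsn: u64,
}

fn build_feedback(shard_is_zero: bool, logical_size: u64, last_received_lsn: u64) -> FeedbackSketch {
    // Only shard zero knows an accurate logical size; everyone else reports 0.
    let current_timeline_size = if shard_is_zero { logical_size } else { 0 };
    FeedbackSketch {
        current_timeline_size,
        last_received_lsn,
    }
}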