mirror of
https://github.com/neondatabase/neon.git
synced 2026-03-28 20:50:37 +00:00
Compare commits
1 Commits
release-43
...
remove_ini
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
bd235a5fe3 |
@@ -1,3 +1,17 @@
|
|||||||
|
# The binaries are really slow, if you compile them in 'dev' mode with the defaults.
|
||||||
|
# Enable some optimizations even in 'dev' mode, to make tests faster. The basic
|
||||||
|
# optimizations enabled by "opt-level=1" don't affect debuggability too much.
|
||||||
|
#
|
||||||
|
# See https://www.reddit.com/r/rust/comments/gvrgca/this_is_a_neat_trick_for_getting_good_runtime/
|
||||||
|
#
|
||||||
|
[profile.dev.package."*"]
|
||||||
|
# Set the default for dependencies in Development mode.
|
||||||
|
opt-level = 3
|
||||||
|
|
||||||
|
[profile.dev]
|
||||||
|
# Turn on a small amount of optimization in Development mode.
|
||||||
|
opt-level = 1
|
||||||
|
|
||||||
[build]
|
[build]
|
||||||
# This is only present for local builds, as it will be overridden
|
# This is only present for local builds, as it will be overridden
|
||||||
# by the RUSTDOCFLAGS env var in CI.
|
# by the RUSTDOCFLAGS env var in CI.
|
||||||
|
|||||||
2
.github/PULL_REQUEST_TEMPLATE/release-pr.md
vendored
2
.github/PULL_REQUEST_TEMPLATE/release-pr.md
vendored
@@ -3,7 +3,7 @@
|
|||||||
**NB: this PR must be merged only by 'Create a merge commit'!**
|
**NB: this PR must be merged only by 'Create a merge commit'!**
|
||||||
|
|
||||||
### Checklist when preparing for release
|
### Checklist when preparing for release
|
||||||
- [ ] Read or refresh [the release flow guide](https://www.notion.so/neondatabase/Release-general-flow-61f2e39fd45d4d14a70c7749604bd70b)
|
- [ ] Read or refresh [the release flow guide](https://github.com/neondatabase/cloud/wiki/Release:-general-flow)
|
||||||
- [ ] Announce in the [cloud Slack channel](https://neondb.slack.com/archives/C033A2WE6BZ) that you are going to roll out the release. Any blockers?
|
- [ ] Announce in the [cloud Slack channel](https://neondb.slack.com/archives/C033A2WE6BZ) that you are going to roll out the release. Any blockers?
|
||||||
- [ ] Does this release contain any db migrations? Destructive ones? What is the rollback plan?
|
- [ ] Does this release contain any db migrations? Destructive ones? What is the rollback plan?
|
||||||
|
|
||||||
|
|||||||
2
.github/actionlint.yml
vendored
2
.github/actionlint.yml
vendored
@@ -1,7 +1,5 @@
|
|||||||
self-hosted-runner:
|
self-hosted-runner:
|
||||||
labels:
|
labels:
|
||||||
- arm64
|
|
||||||
- dev
|
|
||||||
- gen3
|
- gen3
|
||||||
- large
|
- large
|
||||||
- small
|
- small
|
||||||
|
|||||||
18
.github/workflows/build_and_test.yml
vendored
18
.github/workflows/build_and_test.yml
vendored
@@ -172,10 +172,10 @@ jobs:
|
|||||||
# https://github.com/EmbarkStudios/cargo-deny
|
# https://github.com/EmbarkStudios/cargo-deny
|
||||||
- name: Check rust licenses/bans/advisories/sources
|
- name: Check rust licenses/bans/advisories/sources
|
||||||
if: ${{ !cancelled() }}
|
if: ${{ !cancelled() }}
|
||||||
run: cargo deny check --hide-inclusion-graph
|
run: cargo deny check
|
||||||
|
|
||||||
build-neon:
|
build-neon:
|
||||||
needs: [ check-permissions, tag ]
|
needs: [ check-permissions ]
|
||||||
runs-on: [ self-hosted, gen3, large ]
|
runs-on: [ self-hosted, gen3, large ]
|
||||||
container:
|
container:
|
||||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||||
@@ -187,7 +187,6 @@ jobs:
|
|||||||
env:
|
env:
|
||||||
BUILD_TYPE: ${{ matrix.build_type }}
|
BUILD_TYPE: ${{ matrix.build_type }}
|
||||||
GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
|
GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||||
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
|
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Fix git ownership
|
- name: Fix git ownership
|
||||||
@@ -404,7 +403,7 @@ jobs:
|
|||||||
uses: ./.github/actions/save-coverage-data
|
uses: ./.github/actions/save-coverage-data
|
||||||
|
|
||||||
regress-tests:
|
regress-tests:
|
||||||
needs: [ check-permissions, build-neon, tag ]
|
needs: [ check-permissions, build-neon ]
|
||||||
runs-on: [ self-hosted, gen3, large ]
|
runs-on: [ self-hosted, gen3, large ]
|
||||||
container:
|
container:
|
||||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||||
@@ -436,7 +435,6 @@ jobs:
|
|||||||
env:
|
env:
|
||||||
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||||
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
|
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
|
||||||
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
|
|
||||||
|
|
||||||
- name: Merge and upload coverage data
|
- name: Merge and upload coverage data
|
||||||
if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
|
if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
|
||||||
@@ -587,13 +585,10 @@ jobs:
|
|||||||
id: upload-coverage-report-new
|
id: upload-coverage-report-new
|
||||||
env:
|
env:
|
||||||
BUCKET: neon-github-public-dev
|
BUCKET: neon-github-public-dev
|
||||||
# A differential coverage report is available only for PRs.
|
|
||||||
# (i.e. for pushes into main/release branches we have a regular coverage report)
|
|
||||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||||
BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
|
|
||||||
run: |
|
run: |
|
||||||
|
BASELINE="$(git merge-base HEAD origin/main)"
|
||||||
CURRENT="${COMMIT_SHA}"
|
CURRENT="${COMMIT_SHA}"
|
||||||
BASELINE="$(git merge-base $BASE_SHA $CURRENT)"
|
|
||||||
|
|
||||||
cp /tmp/coverage/report/lcov.info ./${CURRENT}.info
|
cp /tmp/coverage/report/lcov.info ./${CURRENT}.info
|
||||||
|
|
||||||
@@ -853,7 +848,7 @@ jobs:
|
|||||||
run:
|
run:
|
||||||
shell: sh -eu {0}
|
shell: sh -eu {0}
|
||||||
env:
|
env:
|
||||||
VM_BUILDER_VERSION: v0.19.0
|
VM_BUILDER_VERSION: v0.18.5
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
@@ -875,7 +870,8 @@ jobs:
|
|||||||
- name: Build vm image
|
- name: Build vm image
|
||||||
run: |
|
run: |
|
||||||
./vm-builder \
|
./vm-builder \
|
||||||
-spec=vm-image-spec.yaml \
|
-enable-file-cache \
|
||||||
|
-cgroup-uid=postgres \
|
||||||
-src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
|
-src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
|
||||||
-dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
-dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||||
|
|
||||||
|
|||||||
181
.github/workflows/neon_extra_builds.yml
vendored
181
.github/workflows/neon_extra_builds.yml
vendored
@@ -21,10 +21,7 @@ env:
|
|||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
check-macos-build:
|
check-macos-build:
|
||||||
if: |
|
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')
|
||||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') ||
|
|
||||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
|
|
||||||
github.ref_name == 'main'
|
|
||||||
timeout-minutes: 90
|
timeout-minutes: 90
|
||||||
runs-on: macos-latest
|
runs-on: macos-latest
|
||||||
|
|
||||||
@@ -115,182 +112,8 @@ jobs:
|
|||||||
- name: Check that no warnings are produced
|
- name: Check that no warnings are produced
|
||||||
run: ./run_clippy.sh
|
run: ./run_clippy.sh
|
||||||
|
|
||||||
check-linux-arm-build:
|
|
||||||
timeout-minutes: 90
|
|
||||||
runs-on: [ self-hosted, dev, arm64 ]
|
|
||||||
|
|
||||||
env:
|
|
||||||
# Use release build only, to have less debug info around
|
|
||||||
# Hence keeping target/ (and general cache size) smaller
|
|
||||||
BUILD_TYPE: release
|
|
||||||
CARGO_FEATURES: --features testing
|
|
||||||
CARGO_FLAGS: --locked --release
|
|
||||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
|
||||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
|
||||||
|
|
||||||
container:
|
|
||||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
|
||||||
options: --init
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- name: Fix git ownership
|
|
||||||
run: |
|
|
||||||
# Workaround for `fatal: detected dubious ownership in repository at ...`
|
|
||||||
#
|
|
||||||
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
|
|
||||||
# Ref https://github.com/actions/checkout/issues/785
|
|
||||||
#
|
|
||||||
git config --global --add safe.directory ${{ github.workspace }}
|
|
||||||
git config --global --add safe.directory ${GITHUB_WORKSPACE}
|
|
||||||
|
|
||||||
- name: Checkout
|
|
||||||
uses: actions/checkout@v4
|
|
||||||
with:
|
|
||||||
submodules: true
|
|
||||||
fetch-depth: 1
|
|
||||||
|
|
||||||
- name: Set pg 14 revision for caching
|
|
||||||
id: pg_v14_rev
|
|
||||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
|
|
||||||
|
|
||||||
- name: Set pg 15 revision for caching
|
|
||||||
id: pg_v15_rev
|
|
||||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
|
|
||||||
|
|
||||||
- name: Set pg 16 revision for caching
|
|
||||||
id: pg_v16_rev
|
|
||||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
|
|
||||||
|
|
||||||
- name: Set env variables
|
|
||||||
run: |
|
|
||||||
echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV
|
|
||||||
|
|
||||||
- name: Cache postgres v14 build
|
|
||||||
id: cache_pg_14
|
|
||||||
uses: actions/cache@v3
|
|
||||||
with:
|
|
||||||
path: pg_install/v14
|
|
||||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
|
||||||
|
|
||||||
- name: Cache postgres v15 build
|
|
||||||
id: cache_pg_15
|
|
||||||
uses: actions/cache@v3
|
|
||||||
with:
|
|
||||||
path: pg_install/v15
|
|
||||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
|
||||||
|
|
||||||
- name: Cache postgres v16 build
|
|
||||||
id: cache_pg_16
|
|
||||||
uses: actions/cache@v3
|
|
||||||
with:
|
|
||||||
path: pg_install/v16
|
|
||||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
|
||||||
|
|
||||||
- name: Build postgres v14
|
|
||||||
if: steps.cache_pg_14.outputs.cache-hit != 'true'
|
|
||||||
run: mold -run make postgres-v14 -j$(nproc)
|
|
||||||
|
|
||||||
- name: Build postgres v15
|
|
||||||
if: steps.cache_pg_15.outputs.cache-hit != 'true'
|
|
||||||
run: mold -run make postgres-v15 -j$(nproc)
|
|
||||||
|
|
||||||
- name: Build postgres v16
|
|
||||||
if: steps.cache_pg_16.outputs.cache-hit != 'true'
|
|
||||||
run: mold -run make postgres-v16 -j$(nproc)
|
|
||||||
|
|
||||||
- name: Build neon extensions
|
|
||||||
run: mold -run make neon-pg-ext -j$(nproc)
|
|
||||||
|
|
||||||
- name: Build walproposer-lib
|
|
||||||
run: mold -run make walproposer-lib -j$(nproc)
|
|
||||||
|
|
||||||
- name: Run cargo build
|
|
||||||
run: |
|
|
||||||
mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
|
|
||||||
|
|
||||||
- name: Run cargo test
|
|
||||||
run: |
|
|
||||||
cargo test $CARGO_FLAGS $CARGO_FEATURES
|
|
||||||
|
|
||||||
# Run separate tests for real S3
|
|
||||||
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
|
|
||||||
export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
|
|
||||||
export REMOTE_STORAGE_S3_REGION=eu-central-1
|
|
||||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
|
||||||
cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
|
|
||||||
|
|
||||||
# Run separate tests for real Azure Blob Storage
|
|
||||||
# XXX: replace region with `eu-central-1`-like region
|
|
||||||
export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
|
|
||||||
export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
|
|
||||||
export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
|
|
||||||
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
|
|
||||||
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
|
|
||||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
|
||||||
cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
|
|
||||||
|
|
||||||
check-codestyle-rust-arm:
|
|
||||||
timeout-minutes: 90
|
|
||||||
runs-on: [ self-hosted, dev, arm64 ]
|
|
||||||
|
|
||||||
container:
|
|
||||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
|
||||||
options: --init
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- name: Checkout
|
|
||||||
uses: actions/checkout@v4
|
|
||||||
with:
|
|
||||||
submodules: true
|
|
||||||
fetch-depth: 1
|
|
||||||
|
|
||||||
# Some of our rust modules use FFI and need those to be checked
|
|
||||||
- name: Get postgres headers
|
|
||||||
run: make postgres-headers -j$(nproc)
|
|
||||||
|
|
||||||
# cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
|
|
||||||
# This will catch compiler & clippy warnings in all feature combinations.
|
|
||||||
# TODO: use cargo hack for build and test as well, but, that's quite expensive.
|
|
||||||
# NB: keep clippy args in sync with ./run_clippy.sh
|
|
||||||
- run: |
|
|
||||||
CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
|
|
||||||
if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
|
|
||||||
echo "No clippy args found in .neon_clippy_args"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
|
|
||||||
- name: Run cargo clippy (debug)
|
|
||||||
run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
|
|
||||||
- name: Run cargo clippy (release)
|
|
||||||
run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
|
|
||||||
|
|
||||||
- name: Check documentation generation
|
|
||||||
run: cargo doc --workspace --no-deps --document-private-items
|
|
||||||
env:
|
|
||||||
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
|
|
||||||
|
|
||||||
# Use `${{ !cancelled() }}` to run quick tests after the longer clippy run
|
|
||||||
- name: Check formatting
|
|
||||||
if: ${{ !cancelled() }}
|
|
||||||
run: cargo fmt --all -- --check
|
|
||||||
|
|
||||||
# https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
|
|
||||||
- name: Check rust dependencies
|
|
||||||
if: ${{ !cancelled() }}
|
|
||||||
run: |
|
|
||||||
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
|
|
||||||
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
|
|
||||||
|
|
||||||
# https://github.com/EmbarkStudios/cargo-deny
|
|
||||||
- name: Check rust licenses/bans/advisories/sources
|
|
||||||
if: ${{ !cancelled() }}
|
|
||||||
run: cargo deny check
|
|
||||||
|
|
||||||
gather-rust-build-stats:
|
gather-rust-build-stats:
|
||||||
if: |
|
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats')
|
||||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
|
|
||||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
|
|
||||||
github.ref_name == 'main'
|
|
||||||
runs-on: [ self-hosted, gen3, large ]
|
runs-on: [ self-hosted, gen3, large ]
|
||||||
container:
|
container:
|
||||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||||
|
|||||||
2
.github/workflows/release.yml
vendored
2
.github/workflows/release.yml
vendored
@@ -2,7 +2,7 @@ name: Create Release Branch
|
|||||||
|
|
||||||
on:
|
on:
|
||||||
schedule:
|
schedule:
|
||||||
- cron: '0 6 * * 1'
|
- cron: '0 7 * * 5'
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
|
|||||||
@@ -9,24 +9,6 @@ refactoring, additional comments, and so forth. Let's try to raise the
|
|||||||
bar, and clean things up as we go. Try to leave code in a better shape
|
bar, and clean things up as we go. Try to leave code in a better shape
|
||||||
than it was before.
|
than it was before.
|
||||||
|
|
||||||
## Pre-commit hook
|
|
||||||
|
|
||||||
We have a sample pre-commit hook in `pre-commit.py`.
|
|
||||||
To set it up, run:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
ln -s ../../pre-commit.py .git/hooks/pre-commit
|
|
||||||
```
|
|
||||||
|
|
||||||
This will run the following checks on staged files before each commit:
|
|
||||||
- `rustfmt`
|
|
||||||
- checks for python files, see [obligatory checks](/docs/sourcetree.md#obligatory-checks).
|
|
||||||
|
|
||||||
There is also a separate script `./run_clippy.sh` that runs `cargo clippy` on the whole project
|
|
||||||
and `./scripts/reformat` that runs all formatting tools to ensure the project is up to date.
|
|
||||||
|
|
||||||
If you want to skip the hook, run `git commit` with `--no-verify` option.
|
|
||||||
|
|
||||||
## Submitting changes
|
## Submitting changes
|
||||||
|
|
||||||
1. Get at least one +1 on your PR before you push.
|
1. Get at least one +1 on your PR before you push.
|
||||||
|
|||||||
1137
Cargo.lock
generated
1137
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
32
Cargo.toml
32
Cargo.toml
@@ -37,7 +37,7 @@ license = "Apache-2.0"
|
|||||||
[workspace.dependencies]
|
[workspace.dependencies]
|
||||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||||
arc-swap = "1.6"
|
arc-swap = "1.6"
|
||||||
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
|
async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
|
||||||
azure_core = "0.16"
|
azure_core = "0.16"
|
||||||
azure_identity = "0.16"
|
azure_identity = "0.16"
|
||||||
azure_storage = "0.16"
|
azure_storage = "0.16"
|
||||||
@@ -45,11 +45,12 @@ azure_storage_blobs = "0.16"
|
|||||||
flate2 = "1.0.26"
|
flate2 = "1.0.26"
|
||||||
async-stream = "0.3"
|
async-stream = "0.3"
|
||||||
async-trait = "0.1"
|
async-trait = "0.1"
|
||||||
aws-config = { version = "1.0", default-features = false, features=["rustls"] }
|
aws-config = { version = "0.56", default-features = false, features=["rustls"] }
|
||||||
aws-sdk-s3 = "1.0"
|
aws-sdk-s3 = "0.29"
|
||||||
aws-smithy-async = { version = "1.0", default-features = false, features=["rt-tokio"] }
|
aws-smithy-http = "0.56"
|
||||||
aws-smithy-types = "1.0"
|
aws-smithy-async = { version = "0.56", default-features = false, features=["rt-tokio"] }
|
||||||
aws-credential-types = "1.0"
|
aws-credential-types = "0.56"
|
||||||
|
aws-types = "0.56"
|
||||||
axum = { version = "0.6.20", features = ["ws"] }
|
axum = { version = "0.6.20", features = ["ws"] }
|
||||||
base64 = "0.13.0"
|
base64 = "0.13.0"
|
||||||
bincode = "1.3"
|
bincode = "1.3"
|
||||||
@@ -82,13 +83,12 @@ hex = "0.4"
|
|||||||
hex-literal = "0.4"
|
hex-literal = "0.4"
|
||||||
hmac = "0.12.1"
|
hmac = "0.12.1"
|
||||||
hostname = "0.3.1"
|
hostname = "0.3.1"
|
||||||
http-types = { version = "2", default-features = false }
|
http-types = "2"
|
||||||
humantime = "2.1"
|
humantime = "2.1"
|
||||||
humantime-serde = "1.1.1"
|
humantime-serde = "1.1.1"
|
||||||
hyper = "0.14"
|
hyper = "0.14"
|
||||||
hyper-tungstenite = "0.11"
|
hyper-tungstenite = "0.11"
|
||||||
inotify = "0.10.2"
|
inotify = "0.10.2"
|
||||||
ipnet = "2.9.0"
|
|
||||||
itertools = "0.10"
|
itertools = "0.10"
|
||||||
jsonwebtoken = "8"
|
jsonwebtoken = "8"
|
||||||
libc = "0.2"
|
libc = "0.2"
|
||||||
@@ -122,24 +122,20 @@ rustls-pemfile = "1"
|
|||||||
rustls-split = "0.3"
|
rustls-split = "0.3"
|
||||||
scopeguard = "1.1"
|
scopeguard = "1.1"
|
||||||
sysinfo = "0.29.2"
|
sysinfo = "0.29.2"
|
||||||
sd-notify = "0.4.1"
|
|
||||||
sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
|
sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
|
||||||
serde = { version = "1.0", features = ["derive"] }
|
serde = { version = "1.0", features = ["derive"] }
|
||||||
serde_json = "1"
|
serde_json = "1"
|
||||||
serde_path_to_error = "0.1"
|
|
||||||
serde_with = "2.0"
|
serde_with = "2.0"
|
||||||
serde_assert = "0.5.0"
|
serde_assert = "0.5.0"
|
||||||
sha2 = "0.10.2"
|
sha2 = "0.10.2"
|
||||||
signal-hook = "0.3"
|
signal-hook = "0.3"
|
||||||
smallvec = "1.11"
|
smallvec = "1.11"
|
||||||
smol_str = { version = "0.2.0", features = ["serde"] }
|
|
||||||
socket2 = "0.5"
|
socket2 = "0.5"
|
||||||
strum = "0.24"
|
strum = "0.24"
|
||||||
strum_macros = "0.24"
|
strum_macros = "0.24"
|
||||||
svg_fmt = "0.4.1"
|
svg_fmt = "0.4.1"
|
||||||
sync_wrapper = "0.1.2"
|
sync_wrapper = "0.1.2"
|
||||||
tar = "0.4"
|
tar = "0.4"
|
||||||
task-local-extensions = "0.1.4"
|
|
||||||
test-context = "0.1"
|
test-context = "0.1"
|
||||||
thiserror = "1.0"
|
thiserror = "1.0"
|
||||||
tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
|
tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
|
||||||
@@ -168,11 +164,11 @@ env_logger = "0.10"
|
|||||||
log = "0.4"
|
log = "0.4"
|
||||||
|
|
||||||
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
||||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||||
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||||
|
|
||||||
## Other git libraries
|
## Other git libraries
|
||||||
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
|
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
|
||||||
@@ -209,7 +205,7 @@ tonic-build = "0.9"
|
|||||||
|
|
||||||
# This is only needed for proxy's tests.
|
# This is only needed for proxy's tests.
|
||||||
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
||||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||||
|
|
||||||
################# Binary contents sections
|
################# Binary contents sections
|
||||||
|
|
||||||
|
|||||||
@@ -387,10 +387,18 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
|||||||
ARG PG_VERSION
|
ARG PG_VERSION
|
||||||
ENV PATH "/usr/local/pgsql/bin:$PATH"
|
ENV PATH "/usr/local/pgsql/bin:$PATH"
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN case "${PG_VERSION}" in \
|
||||||
|
"v14" | "v15") \
|
||||||
|
export TIMESCALEDB_VERSION=2.10.1 \
|
||||||
|
export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
|
||||||
|
;; \
|
||||||
|
*) \
|
||||||
|
echo "TimescaleDB not supported on this PostgreSQL version. See https://github.com/timescale/timescaledb/issues/5752" && exit 0;; \
|
||||||
|
esac && \
|
||||||
|
apt-get update && \
|
||||||
apt-get install -y cmake && \
|
apt-get install -y cmake && \
|
||||||
wget https://github.com/timescale/timescaledb/archive/refs/tags/2.13.0.tar.gz -O timescaledb.tar.gz && \
|
wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
|
||||||
echo "584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d timescaledb.tar.gz" | sha256sum --check && \
|
echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \
|
||||||
mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
|
mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
|
||||||
./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
|
./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
|
||||||
cd build && \
|
cd build && \
|
||||||
@@ -706,23 +714,6 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -
|
|||||||
cargo pgrx install --release && \
|
cargo pgrx install --release && \
|
||||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control
|
echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control
|
||||||
|
|
||||||
#########################################################################################
|
|
||||||
#
|
|
||||||
# Layer "wal2json-build"
|
|
||||||
# Compile "wal2json" extension
|
|
||||||
#
|
|
||||||
#########################################################################################
|
|
||||||
|
|
||||||
FROM build-deps AS wal2json-pg-build
|
|
||||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
|
||||||
|
|
||||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
|
||||||
RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
|
|
||||||
echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
|
|
||||||
mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
|
|
||||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
|
||||||
make -j $(getconf _NPROCESSORS_ONLN) install
|
|
||||||
|
|
||||||
#########################################################################################
|
#########################################################################################
|
||||||
#
|
#
|
||||||
# Layer "neon-pg-ext-build"
|
# Layer "neon-pg-ext-build"
|
||||||
@@ -759,7 +750,6 @@ COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
|||||||
COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||||
COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||||
COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||||
COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
|
|
||||||
COPY pgxn/ pgxn/
|
COPY pgxn/ pgxn/
|
||||||
|
|
||||||
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||||
|
|||||||
@@ -149,9 +149,6 @@ tenant 9ef87a5bf0d92544f6fafeeb3239695c successfully created on the pageserver
|
|||||||
Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c
|
Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c
|
||||||
Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one
|
Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one
|
||||||
|
|
||||||
# create postgres compute node
|
|
||||||
> cargo neon endpoint create main
|
|
||||||
|
|
||||||
# start postgres compute node
|
# start postgres compute node
|
||||||
> cargo neon endpoint start main
|
> cargo neon endpoint start main
|
||||||
Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ...
|
Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ...
|
||||||
@@ -188,11 +185,8 @@ Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant:
|
|||||||
(L) main [de200bd42b49cc1814412c7e592dd6e9]
|
(L) main [de200bd42b49cc1814412c7e592dd6e9]
|
||||||
(L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601]
|
(L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601]
|
||||||
|
|
||||||
# create postgres on that branch
|
|
||||||
> cargo neon endpoint create migration_check --branch-name migration_check
|
|
||||||
|
|
||||||
# start postgres on that branch
|
# start postgres on that branch
|
||||||
> cargo neon endpoint start migration_check
|
> cargo neon endpoint start migration_check --branch-name migration_check
|
||||||
Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
|
Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
|
||||||
Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres'
|
Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres'
|
||||||
|
|
||||||
|
|||||||
@@ -38,4 +38,3 @@ toml_edit.workspace = true
|
|||||||
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
|
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
|
||||||
vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
|
vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
|
||||||
zstd = "0.12.4"
|
zstd = "0.12.4"
|
||||||
bytes = "1.0"
|
|
||||||
|
|||||||
@@ -31,7 +31,7 @@
|
|||||||
//! -C 'postgresql://cloud_admin@localhost/postgres' \
|
//! -C 'postgresql://cloud_admin@localhost/postgres' \
|
||||||
//! -S /var/db/postgres/specs/current.json \
|
//! -S /var/db/postgres/specs/current.json \
|
||||||
//! -b /usr/local/bin/postgres \
|
//! -b /usr/local/bin/postgres \
|
||||||
//! -r http://pg-ext-s3-gateway
|
//! -r {"bucket": "neon-dev-extensions-eu-central-1", "region": "eu-central-1"}
|
||||||
//! ```
|
//! ```
|
||||||
//!
|
//!
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
@@ -51,7 +51,7 @@ use compute_api::responses::ComputeStatus;
|
|||||||
|
|
||||||
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
|
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
|
||||||
use compute_tools::configurator::launch_configurator;
|
use compute_tools::configurator::launch_configurator;
|
||||||
use compute_tools::extension_server::get_pg_version;
|
use compute_tools::extension_server::{get_pg_version, init_remote_storage};
|
||||||
use compute_tools::http::api::launch_http_server;
|
use compute_tools::http::api::launch_http_server;
|
||||||
use compute_tools::logger::*;
|
use compute_tools::logger::*;
|
||||||
use compute_tools::monitor::launch_monitor;
|
use compute_tools::monitor::launch_monitor;
|
||||||
@@ -60,7 +60,7 @@ use compute_tools::spec::*;
|
|||||||
|
|
||||||
// this is an arbitrary build tag. Fine as a default / for testing purposes
|
// this is an arbitrary build tag. Fine as a default / for testing purposes
|
||||||
// in-case of not-set environment var
|
// in-case of not-set environment var
|
||||||
const BUILD_TAG_DEFAULT: &str = "latest";
|
const BUILD_TAG_DEFAULT: &str = "5670669815";
|
||||||
|
|
||||||
fn main() -> Result<()> {
|
fn main() -> Result<()> {
|
||||||
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
|
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
|
||||||
@@ -74,18 +74,10 @@ fn main() -> Result<()> {
|
|||||||
let pgbin_default = String::from("postgres");
|
let pgbin_default = String::from("postgres");
|
||||||
let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
|
let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
|
||||||
|
|
||||||
let ext_remote_storage = matches
|
let remote_ext_config = matches.get_one::<String>("remote-ext-config");
|
||||||
.get_one::<String>("remote-ext-config")
|
let ext_remote_storage = remote_ext_config.map(|x| {
|
||||||
// Compatibility hack: if the control plane specified any remote-ext-config
|
init_remote_storage(x).expect("cannot initialize remote extension storage from config")
|
||||||
// use the default value for extension storage proxy gateway.
|
});
|
||||||
// Remove this once the control plane is updated to pass the gateway URL
|
|
||||||
.map(|conf| {
|
|
||||||
if conf.starts_with("http") {
|
|
||||||
conf.trim_end_matches('/')
|
|
||||||
} else {
|
|
||||||
"http://pg-ext-s3-gateway"
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
let http_port = *matches
|
let http_port = *matches
|
||||||
.get_one::<u16>("http-port")
|
.get_one::<u16>("http-port")
|
||||||
@@ -206,7 +198,7 @@ fn main() -> Result<()> {
|
|||||||
live_config_allowed,
|
live_config_allowed,
|
||||||
state: Mutex::new(new_state),
|
state: Mutex::new(new_state),
|
||||||
state_changed: Condvar::new(),
|
state_changed: Condvar::new(),
|
||||||
ext_remote_storage: ext_remote_storage.map(|s| s.to_string()),
|
ext_remote_storage,
|
||||||
ext_download_progress: RwLock::new(HashMap::new()),
|
ext_download_progress: RwLock::new(HashMap::new()),
|
||||||
build_tag,
|
build_tag,
|
||||||
};
|
};
|
||||||
@@ -487,6 +479,13 @@ fn cli() -> clap::Command {
|
|||||||
)
|
)
|
||||||
.value_name("FILECACHE_CONNSTR"),
|
.value_name("FILECACHE_CONNSTR"),
|
||||||
)
|
)
|
||||||
|
.arg(
|
||||||
|
// DEPRECATED, NO LONGER DOES ANYTHING.
|
||||||
|
// See https://github.com/neondatabase/cloud/issues/7516
|
||||||
|
Arg::new("file-cache-on-disk")
|
||||||
|
.long("file-cache-on-disk")
|
||||||
|
.action(clap::ArgAction::SetTrue),
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus};
|
|||||||
use compute_api::spec::{ComputeMode, ComputeSpec};
|
use compute_api::spec::{ComputeMode, ComputeSpec};
|
||||||
use utils::measured_stream::MeasuredReader;
|
use utils::measured_stream::MeasuredReader;
|
||||||
|
|
||||||
use remote_storage::{DownloadError, RemotePath};
|
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
|
||||||
|
|
||||||
use crate::checker::create_availability_check_data;
|
use crate::checker::create_availability_check_data;
|
||||||
use crate::pg_helpers::*;
|
use crate::pg_helpers::*;
|
||||||
@@ -59,8 +59,8 @@ pub struct ComputeNode {
|
|||||||
pub state: Mutex<ComputeState>,
|
pub state: Mutex<ComputeState>,
|
||||||
/// `Condvar` to allow notifying waiters about state changes.
|
/// `Condvar` to allow notifying waiters about state changes.
|
||||||
pub state_changed: Condvar,
|
pub state_changed: Condvar,
|
||||||
/// the address of extension storage proxy gateway
|
/// the S3 bucket that we search for extensions in
|
||||||
pub ext_remote_storage: Option<String>,
|
pub ext_remote_storage: Option<GenericRemoteStorage>,
|
||||||
// key: ext_archive_name, value: started download time, download_completed?
|
// key: ext_archive_name, value: started download time, download_completed?
|
||||||
pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
|
pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
|
||||||
pub build_tag: String,
|
pub build_tag: String,
|
||||||
@@ -698,7 +698,6 @@ impl ComputeNode {
|
|||||||
handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
|
handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
|
||||||
handle_grants(spec, &mut client, self.connstr.as_str())?;
|
handle_grants(spec, &mut client, self.connstr.as_str())?;
|
||||||
handle_extensions(spec, &mut client)?;
|
handle_extensions(spec, &mut client)?;
|
||||||
handle_extension_neon(&mut client)?;
|
|
||||||
create_availability_check_data(&mut client)?;
|
create_availability_check_data(&mut client)?;
|
||||||
|
|
||||||
// 'Close' connection
|
// 'Close' connection
|
||||||
@@ -728,12 +727,7 @@ impl ComputeNode {
|
|||||||
|
|
||||||
// Write new config
|
// Write new config
|
||||||
let pgdata_path = Path::new(&self.pgdata);
|
let pgdata_path = Path::new(&self.pgdata);
|
||||||
let postgresql_conf_path = pgdata_path.join("postgresql.conf");
|
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
|
||||||
config::write_postgres_conf(&postgresql_conf_path, &spec, None)?;
|
|
||||||
// temporarily reset max_cluster_size in config
|
|
||||||
// to avoid the possibility of hitting the limit, while we are reconfiguring:
|
|
||||||
// creating new extensions, roles, etc...
|
|
||||||
config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
|
|
||||||
self.pg_reload_conf()?;
|
self.pg_reload_conf()?;
|
||||||
|
|
||||||
let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
|
let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
|
||||||
@@ -748,16 +742,11 @@ impl ComputeNode {
|
|||||||
handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
|
handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
|
||||||
handle_grants(&spec, &mut client, self.connstr.as_str())?;
|
handle_grants(&spec, &mut client, self.connstr.as_str())?;
|
||||||
handle_extensions(&spec, &mut client)?;
|
handle_extensions(&spec, &mut client)?;
|
||||||
handle_extension_neon(&mut client)?;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// 'Close' connection
|
// 'Close' connection
|
||||||
drop(client);
|
drop(client);
|
||||||
|
|
||||||
// reset max_cluster_size in config back to original value and reload config
|
|
||||||
config::compute_ctl_temp_override_remove(pgdata_path)?;
|
|
||||||
self.pg_reload_conf()?;
|
|
||||||
|
|
||||||
let unknown_op = "unknown".to_string();
|
let unknown_op = "unknown".to_string();
|
||||||
let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op);
|
let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op);
|
||||||
info!(
|
info!(
|
||||||
@@ -818,17 +807,7 @@ impl ComputeNode {
|
|||||||
|
|
||||||
let config_time = Utc::now();
|
let config_time = Utc::now();
|
||||||
if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
|
if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
|
||||||
let pgdata_path = Path::new(&self.pgdata);
|
|
||||||
// temporarily reset max_cluster_size in config
|
|
||||||
// to avoid the possibility of hitting the limit, while we are applying config:
|
|
||||||
// creating new extensions, roles, etc...
|
|
||||||
config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
|
|
||||||
self.pg_reload_conf()?;
|
|
||||||
|
|
||||||
self.apply_config(&compute_state)?;
|
self.apply_config(&compute_state)?;
|
||||||
|
|
||||||
config::compute_ctl_temp_override_remove(pgdata_path)?;
|
|
||||||
self.pg_reload_conf()?;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let startup_end_time = Utc::now();
|
let startup_end_time = Utc::now();
|
||||||
@@ -976,12 +955,12 @@ LIMIT 100",
|
|||||||
real_ext_name: String,
|
real_ext_name: String,
|
||||||
ext_path: RemotePath,
|
ext_path: RemotePath,
|
||||||
) -> Result<u64, DownloadError> {
|
) -> Result<u64, DownloadError> {
|
||||||
let ext_remote_storage =
|
let remote_storage = self
|
||||||
self.ext_remote_storage
|
.ext_remote_storage
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.ok_or(DownloadError::BadInput(anyhow::anyhow!(
|
.ok_or(DownloadError::BadInput(anyhow::anyhow!(
|
||||||
"Remote extensions storage is not configured",
|
"Remote extensions storage is not configured",
|
||||||
)))?;
|
)))?;
|
||||||
|
|
||||||
let ext_archive_name = ext_path.object_name().expect("bad path");
|
let ext_archive_name = ext_path.object_name().expect("bad path");
|
||||||
|
|
||||||
@@ -1037,7 +1016,7 @@ LIMIT 100",
|
|||||||
let download_size = extension_server::download_extension(
|
let download_size = extension_server::download_extension(
|
||||||
&real_ext_name,
|
&real_ext_name,
|
||||||
&ext_path,
|
&ext_path,
|
||||||
ext_remote_storage,
|
remote_storage,
|
||||||
&self.pgbin,
|
&self.pgbin,
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
|
|||||||
@@ -93,25 +93,5 @@ pub fn write_postgres_conf(
|
|||||||
writeln!(file, "neon.extension_server_port={}", port)?;
|
writeln!(file, "neon.extension_server_port={}", port)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
// This is essential to keep this line at the end of the file,
|
|
||||||
// because it is intended to override any settings above.
|
|
||||||
writeln!(file, "include_if_exists = 'compute_ctl_temp_override.conf'")?;
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// create file compute_ctl_temp_override.conf in pgdata_dir
|
|
||||||
/// add provided options to this file
|
|
||||||
pub fn compute_ctl_temp_override_create(pgdata_path: &Path, options: &str) -> Result<()> {
|
|
||||||
let path = pgdata_path.join("compute_ctl_temp_override.conf");
|
|
||||||
let mut file = File::create(path)?;
|
|
||||||
write!(file, "{}", options)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// remove file compute_ctl_temp_override.conf in pgdata_dir
|
|
||||||
pub fn compute_ctl_temp_override_remove(pgdata_path: &Path) -> Result<()> {
|
|
||||||
let path = pgdata_path.join("compute_ctl_temp_override.conf");
|
|
||||||
std::fs::remove_file(path)?;
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -71,16 +71,18 @@ More specifically, here is an example ext_index.json
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
|
use anyhow::Context;
|
||||||
use anyhow::{self, Result};
|
use anyhow::{self, Result};
|
||||||
use anyhow::{bail, Context};
|
|
||||||
use bytes::Bytes;
|
|
||||||
use compute_api::spec::RemoteExtSpec;
|
use compute_api::spec::RemoteExtSpec;
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
use remote_storage::*;
|
use remote_storage::*;
|
||||||
use reqwest::StatusCode;
|
use serde_json;
|
||||||
|
use std::io::Read;
|
||||||
|
use std::num::NonZeroUsize;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::str;
|
use std::str;
|
||||||
use tar::Archive;
|
use tar::Archive;
|
||||||
|
use tokio::io::AsyncReadExt;
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
use tracing::log::warn;
|
use tracing::log::warn;
|
||||||
use zstd::stream::read::Decoder;
|
use zstd::stream::read::Decoder;
|
||||||
@@ -131,36 +133,67 @@ fn parse_pg_version(human_version: &str) -> &str {
|
|||||||
panic!("Unsuported postgres version {human_version}");
|
panic!("Unsuported postgres version {human_version}");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::parse_pg_version;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_parse_pg_version() {
|
||||||
|
assert_eq!(parse_pg_version("PostgreSQL 15.4"), "v15");
|
||||||
|
assert_eq!(parse_pg_version("PostgreSQL 15.14"), "v15");
|
||||||
|
assert_eq!(
|
||||||
|
parse_pg_version("PostgreSQL 15.4 (Ubuntu 15.4-0ubuntu0.23.04.1)"),
|
||||||
|
"v15"
|
||||||
|
);
|
||||||
|
|
||||||
|
assert_eq!(parse_pg_version("PostgreSQL 14.15"), "v14");
|
||||||
|
assert_eq!(parse_pg_version("PostgreSQL 14.0"), "v14");
|
||||||
|
assert_eq!(
|
||||||
|
parse_pg_version("PostgreSQL 14.9 (Debian 14.9-1.pgdg120+1"),
|
||||||
|
"v14"
|
||||||
|
);
|
||||||
|
|
||||||
|
assert_eq!(parse_pg_version("PostgreSQL 16devel"), "v16");
|
||||||
|
assert_eq!(parse_pg_version("PostgreSQL 16beta1"), "v16");
|
||||||
|
assert_eq!(parse_pg_version("PostgreSQL 16rc2"), "v16");
|
||||||
|
assert_eq!(parse_pg_version("PostgreSQL 16extra"), "v16");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn test_parse_pg_unsupported_version() {
|
||||||
|
parse_pg_version("PostgreSQL 13.14");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn test_parse_pg_incorrect_version_format() {
|
||||||
|
parse_pg_version("PostgreSQL 14");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// download the archive for a given extension,
|
// download the archive for a given extension,
|
||||||
// unzip it, and place files in the appropriate locations (share/lib)
|
// unzip it, and place files in the appropriate locations (share/lib)
|
||||||
pub async fn download_extension(
|
pub async fn download_extension(
|
||||||
ext_name: &str,
|
ext_name: &str,
|
||||||
ext_path: &RemotePath,
|
ext_path: &RemotePath,
|
||||||
ext_remote_storage: &str,
|
remote_storage: &GenericRemoteStorage,
|
||||||
pgbin: &str,
|
pgbin: &str,
|
||||||
) -> Result<u64> {
|
) -> Result<u64> {
|
||||||
info!("Download extension {:?} from {:?}", ext_name, ext_path);
|
info!("Download extension {:?} from {:?}", ext_name, ext_path);
|
||||||
|
let mut download = remote_storage.download(ext_path).await?;
|
||||||
// TODO add retry logic
|
let mut download_buffer = Vec::new();
|
||||||
let download_buffer =
|
download
|
||||||
match download_extension_tar(ext_remote_storage, &ext_path.to_string()).await {
|
.download_stream
|
||||||
Ok(buffer) => buffer,
|
.read_to_end(&mut download_buffer)
|
||||||
Err(error_message) => {
|
.await?;
|
||||||
return Err(anyhow::anyhow!(
|
|
||||||
"error downloading extension {:?}: {:?}",
|
|
||||||
ext_name,
|
|
||||||
error_message
|
|
||||||
));
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let download_size = download_buffer.len() as u64;
|
let download_size = download_buffer.len() as u64;
|
||||||
info!("Download size {:?}", download_size);
|
|
||||||
// it's unclear whether it is more performant to decompress into memory or not
|
// it's unclear whether it is more performant to decompress into memory or not
|
||||||
// TODO: decompressing into memory can be avoided
|
// TODO: decompressing into memory can be avoided
|
||||||
let decoder = Decoder::new(download_buffer.as_ref())?;
|
let mut decoder = Decoder::new(download_buffer.as_slice())?;
|
||||||
let mut archive = Archive::new(decoder);
|
let mut decompress_buffer = Vec::new();
|
||||||
|
decoder.read_to_end(&mut decompress_buffer)?;
|
||||||
|
let mut archive = Archive::new(decompress_buffer.as_slice());
|
||||||
let unzip_dest = pgbin
|
let unzip_dest = pgbin
|
||||||
.strip_suffix("/bin/postgres")
|
.strip_suffix("/bin/postgres")
|
||||||
.expect("bad pgbin")
|
.expect("bad pgbin")
|
||||||
@@ -228,69 +261,27 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Do request to extension storage proxy, i.e.
|
// This function initializes the necessary structs to use remote storage
|
||||||
// curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst
|
pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
|
||||||
// using HHTP GET
|
#[derive(Debug, serde::Deserialize)]
|
||||||
// and return the response body as bytes
|
struct RemoteExtJson {
|
||||||
//
|
bucket: String,
|
||||||
async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result<Bytes> {
|
region: String,
|
||||||
let uri = format!("{}/{}", ext_remote_storage, ext_path);
|
endpoint: Option<String>,
|
||||||
|
prefix: Option<String>,
|
||||||
info!("Download extension {:?} from uri {:?}", ext_path, uri);
|
|
||||||
|
|
||||||
let resp = reqwest::get(uri).await?;
|
|
||||||
|
|
||||||
match resp.status() {
|
|
||||||
StatusCode::OK => match resp.bytes().await {
|
|
||||||
Ok(resp) => {
|
|
||||||
info!("Download extension {:?} completed successfully", ext_path);
|
|
||||||
Ok(resp)
|
|
||||||
}
|
|
||||||
Err(e) => bail!("could not deserialize remote extension response: {}", e),
|
|
||||||
},
|
|
||||||
StatusCode::SERVICE_UNAVAILABLE => bail!("remote extension is temporarily unavailable"),
|
|
||||||
_ => bail!(
|
|
||||||
"unexpected remote extension response status code: {}",
|
|
||||||
resp.status()
|
|
||||||
),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use super::parse_pg_version;
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_parse_pg_version() {
|
|
||||||
assert_eq!(parse_pg_version("PostgreSQL 15.4"), "v15");
|
|
||||||
assert_eq!(parse_pg_version("PostgreSQL 15.14"), "v15");
|
|
||||||
assert_eq!(
|
|
||||||
parse_pg_version("PostgreSQL 15.4 (Ubuntu 15.4-0ubuntu0.23.04.1)"),
|
|
||||||
"v15"
|
|
||||||
);
|
|
||||||
|
|
||||||
assert_eq!(parse_pg_version("PostgreSQL 14.15"), "v14");
|
|
||||||
assert_eq!(parse_pg_version("PostgreSQL 14.0"), "v14");
|
|
||||||
assert_eq!(
|
|
||||||
parse_pg_version("PostgreSQL 14.9 (Debian 14.9-1.pgdg120+1"),
|
|
||||||
"v14"
|
|
||||||
);
|
|
||||||
|
|
||||||
assert_eq!(parse_pg_version("PostgreSQL 16devel"), "v16");
|
|
||||||
assert_eq!(parse_pg_version("PostgreSQL 16beta1"), "v16");
|
|
||||||
assert_eq!(parse_pg_version("PostgreSQL 16rc2"), "v16");
|
|
||||||
assert_eq!(parse_pg_version("PostgreSQL 16extra"), "v16");
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
#[should_panic]
|
|
||||||
fn test_parse_pg_unsupported_version() {
|
|
||||||
parse_pg_version("PostgreSQL 13.14");
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
#[should_panic]
|
|
||||||
fn test_parse_pg_incorrect_version_format() {
|
|
||||||
parse_pg_version("PostgreSQL 14");
|
|
||||||
}
|
}
|
||||||
|
let remote_ext_json = serde_json::from_str::<RemoteExtJson>(remote_ext_config)?;
|
||||||
|
|
||||||
|
let config = S3Config {
|
||||||
|
bucket_name: remote_ext_json.bucket,
|
||||||
|
bucket_region: remote_ext_json.region,
|
||||||
|
prefix_in_bucket: remote_ext_json.prefix,
|
||||||
|
endpoint: remote_ext_json.endpoint,
|
||||||
|
concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"),
|
||||||
|
max_keys_per_list_response: None,
|
||||||
|
};
|
||||||
|
let config = RemoteStorageConfig {
|
||||||
|
storage: RemoteStorageKind::AwsS3(config),
|
||||||
|
};
|
||||||
|
GenericRemoteStorage::from_config(&config)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -123,7 +123,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// download extension files from remote extension storage on demand
|
// download extension files from S3 on demand
|
||||||
(&Method::POST, route) if route.starts_with("/extension_server/") => {
|
(&Method::POST, route) if route.starts_with("/extension_server/") => {
|
||||||
info!("serving {:?} POST request", route);
|
info!("serving {:?} POST request", route);
|
||||||
info!("req.uri {:?}", req.uri());
|
info!("req.uri {:?}", req.uri());
|
||||||
@@ -227,7 +227,7 @@ async fn handle_configure_request(
|
|||||||
|
|
||||||
let parsed_spec = match ParsedSpec::try_from(spec) {
|
let parsed_spec = match ParsedSpec::try_from(spec) {
|
||||||
Ok(ps) => ps,
|
Ok(ps) => ps,
|
||||||
Err(msg) => return Err((msg, StatusCode::BAD_REQUEST)),
|
Err(msg) => return Err((msg, StatusCode::PRECONDITION_FAILED)),
|
||||||
};
|
};
|
||||||
|
|
||||||
// XXX: wrap state update under lock in code blocks. Otherwise,
|
// XXX: wrap state update under lock in code blocks. Otherwise,
|
||||||
|
|||||||
@@ -156,17 +156,17 @@ paths:
|
|||||||
description: Error text or 'OK' if download succeeded.
|
description: Error text or 'OK' if download succeeded.
|
||||||
example: "OK"
|
example: "OK"
|
||||||
400:
|
400:
|
||||||
description: Request is invalid.
|
description: Request is invalid.
|
||||||
content:
|
content:
|
||||||
application/json:
|
application/json:
|
||||||
schema:
|
schema:
|
||||||
$ref: "#/components/schemas/GenericError"
|
$ref: "#/components/schemas/GenericError"
|
||||||
500:
|
500:
|
||||||
description: Extension download request failed.
|
description: Extension download request failed.
|
||||||
content:
|
content:
|
||||||
application/json:
|
application/json:
|
||||||
schema:
|
schema:
|
||||||
$ref: "#/components/schemas/GenericError"
|
$ref: "#/components/schemas/GenericError"
|
||||||
|
|
||||||
components:
|
components:
|
||||||
securitySchemes:
|
securitySchemes:
|
||||||
|
|||||||
@@ -118,6 +118,19 @@ pub fn get_spec_from_control_plane(
|
|||||||
spec
|
spec
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// It takes cluster specification and does the following:
|
||||||
|
/// - Serialize cluster config and put it into `postgresql.conf` completely rewriting the file.
|
||||||
|
/// - Update `pg_hba.conf` to allow external connections.
|
||||||
|
pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
|
||||||
|
// File `postgresql.conf` is no longer included into `basebackup`, so just
|
||||||
|
// always write all config into it creating new file.
|
||||||
|
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;
|
||||||
|
|
||||||
|
update_pg_hba(pgdata_path)?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
/// Check `pg_hba.conf` and update if needed to allow external connections.
|
/// Check `pg_hba.conf` and update if needed to allow external connections.
|
||||||
pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
|
pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
|
||||||
// XXX: consider making it a part of spec.json
|
// XXX: consider making it a part of spec.json
|
||||||
@@ -661,33 +674,3 @@ pub fn handle_extensions(spec: &ComputeSpec, client: &mut Client) -> Result<()>
|
|||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Run CREATE and ALTER EXTENSION neon UPDATE for postgres database
|
|
||||||
#[instrument(skip_all)]
|
|
||||||
pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
|
|
||||||
info!("handle extension neon");
|
|
||||||
|
|
||||||
let mut query = "CREATE SCHEMA IF NOT EXISTS neon";
|
|
||||||
client.simple_query(query)?;
|
|
||||||
|
|
||||||
query = "CREATE EXTENSION IF NOT EXISTS neon WITH SCHEMA neon";
|
|
||||||
info!("create neon extension with query: {}", query);
|
|
||||||
client.simple_query(query)?;
|
|
||||||
|
|
||||||
query = "UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'";
|
|
||||||
client.simple_query(query)?;
|
|
||||||
|
|
||||||
query = "ALTER EXTENSION neon SET SCHEMA neon";
|
|
||||||
info!("alter neon extension schema with query: {}", query);
|
|
||||||
client.simple_query(query)?;
|
|
||||||
|
|
||||||
// this will be a no-op if extension is already up to date,
|
|
||||||
// which may happen in two cases:
|
|
||||||
// - extension was just installed
|
|
||||||
// - extension was already installed and is up to date
|
|
||||||
let query = "ALTER EXTENSION neon UPDATE";
|
|
||||||
info!("update neon extension schema with query: {}", query);
|
|
||||||
client.simple_query(query)?;
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -9,7 +9,6 @@ pub struct AttachmentService {
|
|||||||
env: LocalEnv,
|
env: LocalEnv,
|
||||||
listen: String,
|
listen: String,
|
||||||
path: PathBuf,
|
path: PathBuf,
|
||||||
client: reqwest::blocking::Client,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const COMMAND: &str = "attachment_service";
|
const COMMAND: &str = "attachment_service";
|
||||||
@@ -25,16 +24,6 @@ pub struct AttachHookResponse {
|
|||||||
pub gen: Option<u32>,
|
pub gen: Option<u32>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
|
||||||
pub struct InspectRequest {
|
|
||||||
pub tenant_id: TenantId,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
|
||||||
pub struct InspectResponse {
|
|
||||||
pub attachment: Option<(u32, NodeId)>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl AttachmentService {
|
impl AttachmentService {
|
||||||
pub fn from_env(env: &LocalEnv) -> Self {
|
pub fn from_env(env: &LocalEnv) -> Self {
|
||||||
let path = env.base_data_dir.join("attachments.json");
|
let path = env.base_data_dir.join("attachments.json");
|
||||||
@@ -53,9 +42,6 @@ impl AttachmentService {
|
|||||||
env: env.clone(),
|
env: env.clone(),
|
||||||
path,
|
path,
|
||||||
listen,
|
listen,
|
||||||
client: reqwest::blocking::ClientBuilder::new()
|
|
||||||
.build()
|
|
||||||
.expect("Failed to construct http client"),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -98,13 +84,16 @@ impl AttachmentService {
|
|||||||
.unwrap()
|
.unwrap()
|
||||||
.join("attach-hook")
|
.join("attach-hook")
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
let client = reqwest::blocking::ClientBuilder::new()
|
||||||
|
.build()
|
||||||
|
.expect("Failed to construct http client");
|
||||||
|
|
||||||
let request = AttachHookRequest {
|
let request = AttachHookRequest {
|
||||||
tenant_id,
|
tenant_id,
|
||||||
node_id: Some(pageserver_id),
|
node_id: Some(pageserver_id),
|
||||||
};
|
};
|
||||||
|
|
||||||
let response = self.client.post(url).json(&request).send()?;
|
let response = client.post(url).json(&request).send()?;
|
||||||
if response.status() != StatusCode::OK {
|
if response.status() != StatusCode::OK {
|
||||||
return Err(anyhow!("Unexpected status {}", response.status()));
|
return Err(anyhow!("Unexpected status {}", response.status()));
|
||||||
}
|
}
|
||||||
@@ -112,26 +101,4 @@ impl AttachmentService {
|
|||||||
let response = response.json::<AttachHookResponse>()?;
|
let response = response.json::<AttachHookResponse>()?;
|
||||||
Ok(response.gen)
|
Ok(response.gen)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn inspect(&self, tenant_id: TenantId) -> anyhow::Result<Option<(u32, NodeId)>> {
|
|
||||||
use hyper::StatusCode;
|
|
||||||
|
|
||||||
let url = self
|
|
||||||
.env
|
|
||||||
.control_plane_api
|
|
||||||
.clone()
|
|
||||||
.unwrap()
|
|
||||||
.join("inspect")
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let request = InspectRequest { tenant_id };
|
|
||||||
|
|
||||||
let response = self.client.post(url).json(&request).send()?;
|
|
||||||
if response.status() != StatusCode::OK {
|
|
||||||
return Err(anyhow!("Unexpected status {}", response.status()));
|
|
||||||
}
|
|
||||||
|
|
||||||
let response = response.json::<InspectResponse>()?;
|
|
||||||
Ok(response.attachment)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -9,7 +9,6 @@ use clap::Parser;
|
|||||||
use hex::FromHex;
|
use hex::FromHex;
|
||||||
use hyper::StatusCode;
|
use hyper::StatusCode;
|
||||||
use hyper::{Body, Request, Response};
|
use hyper::{Body, Request, Response};
|
||||||
use pageserver_api::shard::TenantShardId;
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
use std::{collections::HashMap, sync::Arc};
|
use std::{collections::HashMap, sync::Arc};
|
||||||
@@ -33,9 +32,7 @@ use pageserver_api::control_api::{
|
|||||||
ValidateResponseTenant,
|
ValidateResponseTenant,
|
||||||
};
|
};
|
||||||
|
|
||||||
use control_plane::attachment_service::{
|
use control_plane::attachment_service::{AttachHookRequest, AttachHookResponse};
|
||||||
AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
|
|
||||||
};
|
|
||||||
|
|
||||||
#[derive(Parser)]
|
#[derive(Parser)]
|
||||||
#[command(author, version, about, long_about = None)]
|
#[command(author, version, about, long_about = None)]
|
||||||
@@ -174,8 +171,7 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
|
|||||||
if state.pageserver == Some(reattach_req.node_id) {
|
if state.pageserver == Some(reattach_req.node_id) {
|
||||||
state.generation += 1;
|
state.generation += 1;
|
||||||
response.tenants.push(ReAttachResponseTenant {
|
response.tenants.push(ReAttachResponseTenant {
|
||||||
// TODO(sharding): make this shard-aware
|
id: *t,
|
||||||
id: TenantShardId::unsharded(*t),
|
|
||||||
gen: state.generation,
|
gen: state.generation,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@@ -198,8 +194,7 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
|
|||||||
};
|
};
|
||||||
|
|
||||||
for req_tenant in validate_req.tenants {
|
for req_tenant in validate_req.tenants {
|
||||||
// TODO(sharding): make this shard-aware
|
if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
|
||||||
if let Some(tenant_state) = locked.tenants.get(&req_tenant.id.tenant_id) {
|
|
||||||
let valid = tenant_state.generation == req_tenant.gen;
|
let valid = tenant_state.generation == req_tenant.gen;
|
||||||
response.tenants.push(ValidateResponseTenant {
|
response.tenants.push(ValidateResponseTenant {
|
||||||
id: req_tenant.id,
|
id: req_tenant.id,
|
||||||
@@ -260,28 +255,12 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
|
||||||
let inspect_req = json_request::<InspectRequest>(&mut req).await?;
|
|
||||||
|
|
||||||
let state = get_state(&req).inner.clone();
|
|
||||||
let locked = state.write().await;
|
|
||||||
let tenant_state = locked.tenants.get(&inspect_req.tenant_id);
|
|
||||||
|
|
||||||
json_response(
|
|
||||||
StatusCode::OK,
|
|
||||||
InspectResponse {
|
|
||||||
attachment: tenant_state.and_then(|s| s.pageserver.map(|ps| (s.generation, ps))),
|
|
||||||
},
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
|
fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
|
||||||
endpoint::make_router()
|
endpoint::make_router()
|
||||||
.data(Arc::new(State::new(persistent_state)))
|
.data(Arc::new(State::new(persistent_state)))
|
||||||
.post("/re-attach", |r| request_span(r, handle_re_attach))
|
.post("/re-attach", |r| request_span(r, handle_re_attach))
|
||||||
.post("/validate", |r| request_span(r, handle_validate))
|
.post("/validate", |r| request_span(r, handle_validate))
|
||||||
.post("/attach-hook", |r| request_span(r, handle_attach_hook))
|
.post("/attach-hook", |r| request_span(r, handle_attach_hook))
|
||||||
.post("/inspect", |r| request_span(r, handle_inspect))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
@@ -289,7 +268,6 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
logging::init(
|
logging::init(
|
||||||
LogFormat::Plain,
|
LogFormat::Plain,
|
||||||
logging::TracingErrorLayerEnablement::Disabled,
|
logging::TracingErrorLayerEnablement::Disabled,
|
||||||
logging::Output::Stdout,
|
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
let args = Cli::parse();
|
let args = Cli::parse();
|
||||||
|
|||||||
@@ -11,14 +11,13 @@ use compute_api::spec::ComputeMode;
|
|||||||
use control_plane::attachment_service::AttachmentService;
|
use control_plane::attachment_service::AttachmentService;
|
||||||
use control_plane::endpoint::ComputeControlPlane;
|
use control_plane::endpoint::ComputeControlPlane;
|
||||||
use control_plane::local_env::LocalEnv;
|
use control_plane::local_env::LocalEnv;
|
||||||
use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
|
use control_plane::pageserver::PageServerNode;
|
||||||
use control_plane::safekeeper::SafekeeperNode;
|
use control_plane::safekeeper::SafekeeperNode;
|
||||||
use control_plane::tenant_migration::migrate_tenant;
|
|
||||||
use control_plane::{broker, local_env};
|
use control_plane::{broker, local_env};
|
||||||
use pageserver_api::models::TimelineInfo;
|
use pageserver_api::models::TimelineInfo;
|
||||||
use pageserver_api::{
|
use pageserver_api::{
|
||||||
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
|
DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR,
|
||||||
DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
|
DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR,
|
||||||
};
|
};
|
||||||
use postgres_backend::AuthType;
|
use postgres_backend::AuthType;
|
||||||
use safekeeper_api::{
|
use safekeeper_api::{
|
||||||
@@ -47,8 +46,8 @@ const DEFAULT_PG_VERSION: &str = "15";
|
|||||||
|
|
||||||
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/";
|
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/";
|
||||||
|
|
||||||
fn default_conf(num_pageservers: u16) -> String {
|
fn default_conf() -> String {
|
||||||
let mut template = format!(
|
format!(
|
||||||
r#"
|
r#"
|
||||||
# Default built-in configuration, defined in main.rs
|
# Default built-in configuration, defined in main.rs
|
||||||
control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
|
control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
|
||||||
@@ -56,33 +55,21 @@ control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
|
|||||||
[broker]
|
[broker]
|
||||||
listen_addr = '{DEFAULT_BROKER_ADDR}'
|
listen_addr = '{DEFAULT_BROKER_ADDR}'
|
||||||
|
|
||||||
|
[[pageservers]]
|
||||||
|
id = {DEFAULT_PAGESERVER_ID}
|
||||||
|
listen_pg_addr = '{DEFAULT_PAGESERVER_PG_ADDR}'
|
||||||
|
listen_http_addr = '{DEFAULT_PAGESERVER_HTTP_ADDR}'
|
||||||
|
pg_auth_type = '{trust_auth}'
|
||||||
|
http_auth_type = '{trust_auth}'
|
||||||
|
|
||||||
[[safekeepers]]
|
[[safekeepers]]
|
||||||
id = {DEFAULT_SAFEKEEPER_ID}
|
id = {DEFAULT_SAFEKEEPER_ID}
|
||||||
pg_port = {DEFAULT_SAFEKEEPER_PG_PORT}
|
pg_port = {DEFAULT_SAFEKEEPER_PG_PORT}
|
||||||
http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT}
|
http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT}
|
||||||
|
|
||||||
"#,
|
"#,
|
||||||
);
|
trust_auth = AuthType::Trust,
|
||||||
|
)
|
||||||
for i in 0..num_pageservers {
|
|
||||||
let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64);
|
|
||||||
let pg_port = DEFAULT_PAGESERVER_PG_PORT + i;
|
|
||||||
let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i;
|
|
||||||
|
|
||||||
template += &format!(
|
|
||||||
r#"
|
|
||||||
[[pageservers]]
|
|
||||||
id = {pageserver_id}
|
|
||||||
listen_pg_addr = '127.0.0.1:{pg_port}'
|
|
||||||
listen_http_addr = '127.0.0.1:{http_port}'
|
|
||||||
pg_auth_type = '{trust_auth}'
|
|
||||||
http_auth_type = '{trust_auth}'
|
|
||||||
"#,
|
|
||||||
trust_auth = AuthType::Trust,
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
template
|
|
||||||
}
|
}
|
||||||
|
|
||||||
///
|
///
|
||||||
@@ -308,9 +295,6 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
|
fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
|
||||||
let num_pageservers = init_match
|
|
||||||
.get_one::<u16>("num-pageservers")
|
|
||||||
.expect("num-pageservers arg has a default");
|
|
||||||
// Create config file
|
// Create config file
|
||||||
let toml_file: String = if let Some(config_path) = init_match.get_one::<PathBuf>("config") {
|
let toml_file: String = if let Some(config_path) = init_match.get_one::<PathBuf>("config") {
|
||||||
// load and parse the file
|
// load and parse the file
|
||||||
@@ -322,7 +306,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
|
|||||||
})?
|
})?
|
||||||
} else {
|
} else {
|
||||||
// Built-in default config
|
// Built-in default config
|
||||||
default_conf(*num_pageservers)
|
default_conf()
|
||||||
};
|
};
|
||||||
|
|
||||||
let pg_version = init_match
|
let pg_version = init_match
|
||||||
@@ -336,9 +320,6 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
|
|||||||
env.init(pg_version, force)
|
env.init(pg_version, force)
|
||||||
.context("Failed to initialize neon repository")?;
|
.context("Failed to initialize neon repository")?;
|
||||||
|
|
||||||
// Create remote storage location for default LocalFs remote storage
|
|
||||||
std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
|
|
||||||
|
|
||||||
// Initialize pageserver, create initial tenant and timeline.
|
// Initialize pageserver, create initial tenant and timeline.
|
||||||
for ps_conf in &env.pageservers {
|
for ps_conf in &env.pageservers {
|
||||||
PageServerNode::from_env(&env, ps_conf)
|
PageServerNode::from_env(&env, ps_conf)
|
||||||
@@ -415,7 +396,6 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
|
|||||||
None,
|
None,
|
||||||
None,
|
None,
|
||||||
Some(pg_version),
|
Some(pg_version),
|
||||||
None,
|
|
||||||
)?;
|
)?;
|
||||||
let new_timeline_id = timeline_info.timeline_id;
|
let new_timeline_id = timeline_info.timeline_id;
|
||||||
let last_record_lsn = timeline_info.last_record_lsn;
|
let last_record_lsn = timeline_info.last_record_lsn;
|
||||||
@@ -453,15 +433,6 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
|
|||||||
.with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
|
.with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
|
||||||
println!("tenant {tenant_id} successfully configured on the pageserver");
|
println!("tenant {tenant_id} successfully configured on the pageserver");
|
||||||
}
|
}
|
||||||
Some(("migrate", matches)) => {
|
|
||||||
let tenant_id = get_tenant_id(matches, env)?;
|
|
||||||
let new_pageserver = get_pageserver(env, matches)?;
|
|
||||||
let new_pageserver_id = new_pageserver.conf.id;
|
|
||||||
|
|
||||||
migrate_tenant(env, tenant_id, new_pageserver)?;
|
|
||||||
println!("tenant {tenant_id} migrated to {}", new_pageserver_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
|
Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
|
||||||
None => bail!("no tenant subcommand provided"),
|
None => bail!("no tenant subcommand provided"),
|
||||||
}
|
}
|
||||||
@@ -488,16 +459,8 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
|
|||||||
.copied()
|
.copied()
|
||||||
.context("Failed to parse postgres version from the argument string")?;
|
.context("Failed to parse postgres version from the argument string")?;
|
||||||
|
|
||||||
let new_timeline_id_opt = parse_timeline_id(create_match)?;
|
let timeline_info =
|
||||||
|
pageserver.timeline_create(tenant_id, None, None, None, Some(pg_version))?;
|
||||||
let timeline_info = pageserver.timeline_create(
|
|
||||||
tenant_id,
|
|
||||||
new_timeline_id_opt,
|
|
||||||
None,
|
|
||||||
None,
|
|
||||||
Some(pg_version),
|
|
||||||
None,
|
|
||||||
)?;
|
|
||||||
let new_timeline_id = timeline_info.timeline_id;
|
let new_timeline_id = timeline_info.timeline_id;
|
||||||
|
|
||||||
let last_record_lsn = timeline_info.last_record_lsn;
|
let last_record_lsn = timeline_info.last_record_lsn;
|
||||||
@@ -584,7 +547,6 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
|
|||||||
start_lsn,
|
start_lsn,
|
||||||
Some(ancestor_timeline_id),
|
Some(ancestor_timeline_id),
|
||||||
None,
|
None,
|
||||||
None,
|
|
||||||
)?;
|
)?;
|
||||||
let new_timeline_id = timeline_info.timeline_id;
|
let new_timeline_id = timeline_info.timeline_id;
|
||||||
|
|
||||||
@@ -611,9 +573,11 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
|
|||||||
};
|
};
|
||||||
let mut cplane = ComputeControlPlane::load(env.clone())?;
|
let mut cplane = ComputeControlPlane::load(env.clone())?;
|
||||||
|
|
||||||
|
// All subcommands take an optional --tenant-id option
|
||||||
|
let tenant_id = get_tenant_id(sub_args, env)?;
|
||||||
|
|
||||||
match sub_name {
|
match sub_name {
|
||||||
"list" => {
|
"list" => {
|
||||||
let tenant_id = get_tenant_id(sub_args, env)?;
|
|
||||||
let timeline_infos = get_timeline_infos(env, &tenant_id).unwrap_or_else(|e| {
|
let timeline_infos = get_timeline_infos(env, &tenant_id).unwrap_or_else(|e| {
|
||||||
eprintln!("Failed to load timeline info: {}", e);
|
eprintln!("Failed to load timeline info: {}", e);
|
||||||
HashMap::new()
|
HashMap::new()
|
||||||
@@ -673,7 +637,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
|
|||||||
println!("{table}");
|
println!("{table}");
|
||||||
}
|
}
|
||||||
"create" => {
|
"create" => {
|
||||||
let tenant_id = get_tenant_id(sub_args, env)?;
|
|
||||||
let branch_name = sub_args
|
let branch_name = sub_args
|
||||||
.get_one::<String>("branch-name")
|
.get_one::<String>("branch-name")
|
||||||
.map(|s| s.as_str())
|
.map(|s| s.as_str())
|
||||||
@@ -718,18 +681,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
|
|||||||
(Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
|
(Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
|
||||||
};
|
};
|
||||||
|
|
||||||
match (mode, hot_standby) {
|
|
||||||
(ComputeMode::Static(_), true) => {
|
|
||||||
bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
|
|
||||||
}
|
|
||||||
(ComputeMode::Primary, true) => {
|
|
||||||
bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
|
|
||||||
}
|
|
||||||
_ => {}
|
|
||||||
}
|
|
||||||
|
|
||||||
cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?;
|
|
||||||
|
|
||||||
cplane.new_endpoint(
|
cplane.new_endpoint(
|
||||||
&endpoint_id,
|
&endpoint_id,
|
||||||
tenant_id,
|
tenant_id,
|
||||||
@@ -742,6 +693,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
|
|||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
"start" => {
|
"start" => {
|
||||||
|
let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
|
||||||
|
let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
|
||||||
let endpoint_id = sub_args
|
let endpoint_id = sub_args
|
||||||
.get_one::<String>("endpoint_id")
|
.get_one::<String>("endpoint_id")
|
||||||
.ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;
|
.ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;
|
||||||
@@ -770,28 +723,80 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
|
|||||||
env.safekeepers.iter().map(|sk| sk.id).collect()
|
env.safekeepers.iter().map(|sk| sk.id).collect()
|
||||||
};
|
};
|
||||||
|
|
||||||
let endpoint = cplane
|
let endpoint = cplane.endpoints.get(endpoint_id.as_str());
|
||||||
.endpoints
|
|
||||||
.get(endpoint_id.as_str())
|
|
||||||
.ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?;
|
|
||||||
|
|
||||||
cplane.check_conflicting_endpoints(
|
|
||||||
endpoint.mode,
|
|
||||||
endpoint.tenant_id,
|
|
||||||
endpoint.timeline_id,
|
|
||||||
)?;
|
|
||||||
|
|
||||||
let ps_conf = env.get_pageserver_conf(pageserver_id)?;
|
let ps_conf = env.get_pageserver_conf(pageserver_id)?;
|
||||||
let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
|
let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
|
||||||
let claims = Claims::new(Some(endpoint.tenant_id), Scope::Tenant);
|
let claims = Claims::new(Some(tenant_id), Scope::Tenant);
|
||||||
|
|
||||||
Some(env.generate_auth_token(&claims)?)
|
Some(env.generate_auth_token(&claims)?)
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
};
|
};
|
||||||
|
|
||||||
println!("Starting existing endpoint {endpoint_id}...");
|
let hot_standby = sub_args
|
||||||
endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
|
.get_one::<bool>("hot-standby")
|
||||||
|
.copied()
|
||||||
|
.unwrap_or(false);
|
||||||
|
|
||||||
|
if let Some(endpoint) = endpoint {
|
||||||
|
match (&endpoint.mode, hot_standby) {
|
||||||
|
(ComputeMode::Static(_), true) => {
|
||||||
|
bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
|
||||||
|
}
|
||||||
|
(ComputeMode::Primary, true) => {
|
||||||
|
bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
println!("Starting existing endpoint {endpoint_id}...");
|
||||||
|
endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
|
||||||
|
} else {
|
||||||
|
let branch_name = sub_args
|
||||||
|
.get_one::<String>("branch-name")
|
||||||
|
.map(|s| s.as_str())
|
||||||
|
.unwrap_or(DEFAULT_BRANCH_NAME);
|
||||||
|
let timeline_id = env
|
||||||
|
.get_branch_timeline_id(branch_name, tenant_id)
|
||||||
|
.ok_or_else(|| {
|
||||||
|
anyhow!("Found no timeline id for branch name '{branch_name}'")
|
||||||
|
})?;
|
||||||
|
let lsn = sub_args
|
||||||
|
.get_one::<String>("lsn")
|
||||||
|
.map(|lsn_str| Lsn::from_str(lsn_str))
|
||||||
|
.transpose()
|
||||||
|
.context("Failed to parse Lsn from the request")?;
|
||||||
|
let pg_version = sub_args
|
||||||
|
.get_one::<u32>("pg-version")
|
||||||
|
.copied()
|
||||||
|
.context("Failed to `pg-version` from the argument string")?;
|
||||||
|
|
||||||
|
let mode = match (lsn, hot_standby) {
|
||||||
|
(Some(lsn), false) => ComputeMode::Static(lsn),
|
||||||
|
(None, true) => ComputeMode::Replica,
|
||||||
|
(None, false) => ComputeMode::Primary,
|
||||||
|
(Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
|
||||||
|
};
|
||||||
|
|
||||||
|
// when used with custom port this results in non obvious behaviour
|
||||||
|
// port is remembered from first start command, i e
|
||||||
|
// start --port X
|
||||||
|
// stop
|
||||||
|
// start <-- will also use port X even without explicit port argument
|
||||||
|
println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ...");
|
||||||
|
|
||||||
|
let ep = cplane.new_endpoint(
|
||||||
|
endpoint_id,
|
||||||
|
tenant_id,
|
||||||
|
timeline_id,
|
||||||
|
pg_port,
|
||||||
|
http_port,
|
||||||
|
pg_version,
|
||||||
|
mode,
|
||||||
|
pageserver_id,
|
||||||
|
)?;
|
||||||
|
ep.start(&auth_token, safekeepers, remote_ext_config)?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
"reconfigure" => {
|
"reconfigure" => {
|
||||||
let endpoint_id = sub_args
|
let endpoint_id = sub_args
|
||||||
@@ -862,20 +867,20 @@ fn handle_mappings(sub_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Res
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageServerNode> {
|
|
||||||
let node_id = if let Some(id_str) = args.get_one::<String>("pageserver-id") {
|
|
||||||
NodeId(id_str.parse().context("while parsing pageserver id")?)
|
|
||||||
} else {
|
|
||||||
DEFAULT_PAGESERVER_ID
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(PageServerNode::from_env(
|
|
||||||
env,
|
|
||||||
env.get_pageserver_conf(node_id)?,
|
|
||||||
))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||||
|
fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageServerNode> {
|
||||||
|
let node_id = if let Some(id_str) = args.get_one::<String>("pageserver-id") {
|
||||||
|
NodeId(id_str.parse().context("while parsing pageserver id")?)
|
||||||
|
} else {
|
||||||
|
DEFAULT_PAGESERVER_ID
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(PageServerNode::from_env(
|
||||||
|
env,
|
||||||
|
env.get_pageserver_conf(node_id)?,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
match sub_match.subcommand() {
|
match sub_match.subcommand() {
|
||||||
Some(("start", subcommand_args)) => {
|
Some(("start", subcommand_args)) => {
|
||||||
if let Err(e) = get_pageserver(env, subcommand_args)?
|
if let Err(e) = get_pageserver(env, subcommand_args)?
|
||||||
@@ -912,20 +917,6 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Some(("migrate", subcommand_args)) => {
|
|
||||||
let pageserver = get_pageserver(env, subcommand_args)?;
|
|
||||||
//TODO what shutdown strategy should we use here?
|
|
||||||
if let Err(e) = pageserver.stop(false) {
|
|
||||||
eprintln!("pageserver stop failed: {}", e);
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) {
|
|
||||||
eprintln!("pageserver start failed: {e}");
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Some(("status", subcommand_args)) => {
|
Some(("status", subcommand_args)) => {
|
||||||
match get_pageserver(env, subcommand_args)?.check_status() {
|
match get_pageserver(env, subcommand_args)?.check_status() {
|
||||||
Ok(_) => println!("Page server is up and running"),
|
Ok(_) => println!("Page server is up and running"),
|
||||||
@@ -1212,7 +1203,7 @@ fn cli() -> Command {
|
|||||||
let remote_ext_config_args = Arg::new("remote-ext-config")
|
let remote_ext_config_args = Arg::new("remote-ext-config")
|
||||||
.long("remote-ext-config")
|
.long("remote-ext-config")
|
||||||
.num_args(1)
|
.num_args(1)
|
||||||
.help("Configure the remote extensions storage proxy gateway to request for extensions.")
|
.help("Configure the S3 bucket that we search for extensions in.")
|
||||||
.required(false);
|
.required(false);
|
||||||
|
|
||||||
let lsn_arg = Arg::new("lsn")
|
let lsn_arg = Arg::new("lsn")
|
||||||
@@ -1233,13 +1224,6 @@ fn cli() -> Command {
|
|||||||
.help("Force initialization even if the repository is not empty")
|
.help("Force initialization even if the repository is not empty")
|
||||||
.required(false);
|
.required(false);
|
||||||
|
|
||||||
let num_pageservers_arg = Arg::new("num-pageservers")
|
|
||||||
.value_parser(value_parser!(u16))
|
|
||||||
.long("num-pageservers")
|
|
||||||
.help("How many pageservers to create (default 1)")
|
|
||||||
.required(false)
|
|
||||||
.default_value("1");
|
|
||||||
|
|
||||||
Command::new("Neon CLI")
|
Command::new("Neon CLI")
|
||||||
.arg_required_else_help(true)
|
.arg_required_else_help(true)
|
||||||
.version(GIT_VERSION)
|
.version(GIT_VERSION)
|
||||||
@@ -1247,7 +1231,6 @@ fn cli() -> Command {
|
|||||||
Command::new("init")
|
Command::new("init")
|
||||||
.about("Initialize a new Neon repository, preparing configs for services to start with")
|
.about("Initialize a new Neon repository, preparing configs for services to start with")
|
||||||
.arg(pageserver_config_args.clone())
|
.arg(pageserver_config_args.clone())
|
||||||
.arg(num_pageservers_arg.clone())
|
|
||||||
.arg(
|
.arg(
|
||||||
Arg::new("config")
|
Arg::new("config")
|
||||||
.long("config")
|
.long("config")
|
||||||
@@ -1275,7 +1258,6 @@ fn cli() -> Command {
|
|||||||
.subcommand(Command::new("create")
|
.subcommand(Command::new("create")
|
||||||
.about("Create a new blank timeline")
|
.about("Create a new blank timeline")
|
||||||
.arg(tenant_id_arg.clone())
|
.arg(tenant_id_arg.clone())
|
||||||
.arg(timeline_id_arg.clone())
|
|
||||||
.arg(branch_name_arg.clone())
|
.arg(branch_name_arg.clone())
|
||||||
.arg(pg_version_arg.clone())
|
.arg(pg_version_arg.clone())
|
||||||
)
|
)
|
||||||
@@ -1319,10 +1301,6 @@ fn cli() -> Command {
|
|||||||
.subcommand(Command::new("config")
|
.subcommand(Command::new("config")
|
||||||
.arg(tenant_id_arg.clone())
|
.arg(tenant_id_arg.clone())
|
||||||
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
|
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
|
||||||
.subcommand(Command::new("migrate")
|
|
||||||
.about("Migrate a tenant from one pageserver to another")
|
|
||||||
.arg(tenant_id_arg.clone())
|
|
||||||
.arg(pageserver_id_arg.clone()))
|
|
||||||
)
|
)
|
||||||
.subcommand(
|
.subcommand(
|
||||||
Command::new("pageserver")
|
Command::new("pageserver")
|
||||||
@@ -1397,7 +1375,15 @@ fn cli() -> Command {
|
|||||||
.subcommand(Command::new("start")
|
.subcommand(Command::new("start")
|
||||||
.about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
|
.about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
|
||||||
.arg(endpoint_id_arg.clone())
|
.arg(endpoint_id_arg.clone())
|
||||||
|
.arg(tenant_id_arg.clone())
|
||||||
|
.arg(branch_name_arg.clone())
|
||||||
|
.arg(timeline_id_arg.clone())
|
||||||
|
.arg(lsn_arg)
|
||||||
|
.arg(pg_port_arg)
|
||||||
|
.arg(http_port_arg)
|
||||||
.arg(endpoint_pageserver_id_arg.clone())
|
.arg(endpoint_pageserver_id_arg.clone())
|
||||||
|
.arg(pg_version_arg)
|
||||||
|
.arg(hot_standby_arg)
|
||||||
.arg(safekeepers_arg)
|
.arg(safekeepers_arg)
|
||||||
.arg(remote_ext_config_args)
|
.arg(remote_ext_config_args)
|
||||||
)
|
)
|
||||||
@@ -1410,6 +1396,7 @@ fn cli() -> Command {
|
|||||||
.subcommand(
|
.subcommand(
|
||||||
Command::new("stop")
|
Command::new("stop")
|
||||||
.arg(endpoint_id_arg)
|
.arg(endpoint_id_arg)
|
||||||
|
.arg(tenant_id_arg.clone())
|
||||||
.arg(
|
.arg(
|
||||||
Arg::new("destroy")
|
Arg::new("destroy")
|
||||||
.help("Also delete data directory (now optional, should be default in future)")
|
.help("Also delete data directory (now optional, should be default in future)")
|
||||||
|
|||||||
@@ -45,7 +45,6 @@ use std::sync::Arc;
|
|||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
||||||
use anyhow::{anyhow, bail, Context, Result};
|
use anyhow::{anyhow, bail, Context, Result};
|
||||||
use compute_api::spec::RemoteExtSpec;
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use utils::id::{NodeId, TenantId, TimelineId};
|
use utils::id::{NodeId, TenantId, TimelineId};
|
||||||
|
|
||||||
@@ -125,7 +124,6 @@ impl ComputeControlPlane {
|
|||||||
let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
|
let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
|
||||||
let pageserver =
|
let pageserver =
|
||||||
PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
|
PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
|
||||||
|
|
||||||
let ep = Arc::new(Endpoint {
|
let ep = Arc::new(Endpoint {
|
||||||
endpoint_id: endpoint_id.to_owned(),
|
endpoint_id: endpoint_id.to_owned(),
|
||||||
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
|
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
|
||||||
@@ -170,30 +168,6 @@ impl ComputeControlPlane {
|
|||||||
|
|
||||||
Ok(ep)
|
Ok(ep)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn check_conflicting_endpoints(
|
|
||||||
&self,
|
|
||||||
mode: ComputeMode,
|
|
||||||
tenant_id: TenantId,
|
|
||||||
timeline_id: TimelineId,
|
|
||||||
) -> Result<()> {
|
|
||||||
if matches!(mode, ComputeMode::Primary) {
|
|
||||||
// this check is not complete, as you could have a concurrent attempt at
|
|
||||||
// creating another primary, both reading the state before checking it here,
|
|
||||||
// but it's better than nothing.
|
|
||||||
let mut duplicates = self.endpoints.iter().filter(|(_k, v)| {
|
|
||||||
v.tenant_id == tenant_id
|
|
||||||
&& v.timeline_id == timeline_id
|
|
||||||
&& v.mode == mode
|
|
||||||
&& v.status() != "stopped"
|
|
||||||
});
|
|
||||||
|
|
||||||
if let Some((key, _)) = duplicates.next() {
|
|
||||||
bail!("attempting to create a duplicate primary endpoint on tenant {tenant_id}, timeline {timeline_id}: endpoint {key:?} exists already. please don't do this, it is not supported.");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
@@ -502,18 +476,6 @@ impl Endpoint {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// check for file remote_extensions_spec.json
|
|
||||||
// if it is present, read it and pass to compute_ctl
|
|
||||||
let remote_extensions_spec_path = self.endpoint_path().join("remote_extensions_spec.json");
|
|
||||||
let remote_extensions_spec = std::fs::File::open(remote_extensions_spec_path);
|
|
||||||
let remote_extensions: Option<RemoteExtSpec>;
|
|
||||||
|
|
||||||
if let Ok(spec_file) = remote_extensions_spec {
|
|
||||||
remote_extensions = serde_json::from_reader(spec_file).ok();
|
|
||||||
} else {
|
|
||||||
remote_extensions = None;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Create spec file
|
// Create spec file
|
||||||
let spec = ComputeSpec {
|
let spec = ComputeSpec {
|
||||||
skip_pg_catalog_updates: self.skip_pg_catalog_updates,
|
skip_pg_catalog_updates: self.skip_pg_catalog_updates,
|
||||||
@@ -535,7 +497,7 @@ impl Endpoint {
|
|||||||
pageserver_connstring: Some(pageserver_connstring),
|
pageserver_connstring: Some(pageserver_connstring),
|
||||||
safekeeper_connstrings,
|
safekeeper_connstrings,
|
||||||
storage_auth_token: auth_token.clone(),
|
storage_auth_token: auth_token.clone(),
|
||||||
remote_extensions,
|
remote_extensions: None,
|
||||||
};
|
};
|
||||||
let spec_path = self.endpoint_path().join("spec.json");
|
let spec_path = self.endpoint_path().join("spec.json");
|
||||||
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
|
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
|
||||||
|
|||||||
@@ -14,4 +14,3 @@ pub mod local_env;
|
|||||||
pub mod pageserver;
|
pub mod pageserver;
|
||||||
pub mod postgresql_conf;
|
pub mod postgresql_conf;
|
||||||
pub mod safekeeper;
|
pub mod safekeeper;
|
||||||
pub mod tenant_migration;
|
|
||||||
|
|||||||
@@ -11,15 +11,11 @@ use std::io::{BufReader, Write};
|
|||||||
use std::num::NonZeroU64;
|
use std::num::NonZeroU64;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::process::{Child, Command};
|
use std::process::{Child, Command};
|
||||||
use std::time::Duration;
|
|
||||||
use std::{io, result};
|
use std::{io, result};
|
||||||
|
|
||||||
use anyhow::{bail, Context};
|
use anyhow::{bail, Context};
|
||||||
use camino::Utf8PathBuf;
|
use camino::Utf8PathBuf;
|
||||||
use pageserver_api::models::{
|
use pageserver_api::models::{self, TenantInfo, TimelineInfo};
|
||||||
self, LocationConfig, TenantInfo, TenantLocationConfigRequest, TimelineInfo,
|
|
||||||
};
|
|
||||||
use pageserver_api::shard::TenantShardId;
|
|
||||||
use postgres_backend::AuthType;
|
use postgres_backend::AuthType;
|
||||||
use postgres_connection::{parse_host_port, PgConnectionConfig};
|
use postgres_connection::{parse_host_port, PgConnectionConfig};
|
||||||
use reqwest::blocking::{Client, RequestBuilder, Response};
|
use reqwest::blocking::{Client, RequestBuilder, Response};
|
||||||
@@ -35,9 +31,6 @@ use utils::{
|
|||||||
use crate::local_env::PageServerConf;
|
use crate::local_env::PageServerConf;
|
||||||
use crate::{background_process, local_env::LocalEnv};
|
use crate::{background_process, local_env::LocalEnv};
|
||||||
|
|
||||||
/// Directory within .neon which will be used by default for LocalFs remote storage.
|
|
||||||
pub const PAGESERVER_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/pageserver";
|
|
||||||
|
|
||||||
#[derive(Error, Debug)]
|
#[derive(Error, Debug)]
|
||||||
pub enum PageserverHttpError {
|
pub enum PageserverHttpError {
|
||||||
#[error("Reqwest error: {0}")]
|
#[error("Reqwest error: {0}")]
|
||||||
@@ -105,10 +98,8 @@ impl PageServerNode {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Merge overrides provided by the user on the command line with our default overides derived from neon_local configuration.
|
// pageserver conf overrides defined by neon_local configuration.
|
||||||
///
|
fn neon_local_overrides(&self) -> Vec<String> {
|
||||||
/// These all end up on the command line of the `pageserver` binary.
|
|
||||||
fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
|
|
||||||
let id = format!("id={}", self.conf.id);
|
let id = format!("id={}", self.conf.id);
|
||||||
// FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
|
// FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
|
||||||
let pg_distrib_dir_param = format!(
|
let pg_distrib_dir_param = format!(
|
||||||
@@ -141,25 +132,12 @@ impl PageServerNode {
|
|||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
if !cli_overrides
|
|
||||||
.iter()
|
|
||||||
.any(|c| c.starts_with("remote_storage"))
|
|
||||||
{
|
|
||||||
overrides.push(format!(
|
|
||||||
"remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}"
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust
|
if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust
|
||||||
{
|
{
|
||||||
// Keys are generated in the toplevel repo dir, pageservers' workdirs
|
// Keys are generated in the toplevel repo dir, pageservers' workdirs
|
||||||
// are one level below that, so refer to keys with ../
|
// are one level below that, so refer to keys with ../
|
||||||
overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
|
overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
|
||||||
}
|
}
|
||||||
|
|
||||||
// Apply the user-provided overrides
|
|
||||||
overrides.extend(cli_overrides.iter().map(|&c| c.to_owned()));
|
|
||||||
|
|
||||||
overrides
|
overrides
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -225,6 +203,9 @@ impl PageServerNode {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn start_node(&self, config_overrides: &[&str], update_config: bool) -> anyhow::Result<Child> {
|
fn start_node(&self, config_overrides: &[&str], update_config: bool) -> anyhow::Result<Child> {
|
||||||
|
let mut overrides = self.neon_local_overrides();
|
||||||
|
overrides.extend(config_overrides.iter().map(|&c| c.to_owned()));
|
||||||
|
|
||||||
let datadir = self.repo_path();
|
let datadir = self.repo_path();
|
||||||
print!(
|
print!(
|
||||||
"Starting pageserver node {} at '{}' in {:?}",
|
"Starting pageserver node {} at '{}' in {:?}",
|
||||||
@@ -267,7 +248,8 @@ impl PageServerNode {
|
|||||||
) -> Vec<Cow<'a, str>> {
|
) -> Vec<Cow<'a, str>> {
|
||||||
let mut args = vec![Cow::Borrowed("-D"), Cow::Borrowed(datadir_path_str)];
|
let mut args = vec![Cow::Borrowed("-D"), Cow::Borrowed(datadir_path_str)];
|
||||||
|
|
||||||
let overrides = self.neon_local_overrides(config_overrides);
|
let mut overrides = self.neon_local_overrides();
|
||||||
|
overrides.extend(config_overrides.iter().map(|&c| c.to_owned()));
|
||||||
for config_override in overrides {
|
for config_override in overrides {
|
||||||
args.push(Cow::Borrowed("-c"));
|
args.push(Cow::Borrowed("-c"));
|
||||||
args.push(Cow::Owned(config_override));
|
args.push(Cow::Owned(config_override));
|
||||||
@@ -410,7 +392,7 @@ impl PageServerNode {
|
|||||||
};
|
};
|
||||||
|
|
||||||
let request = models::TenantCreateRequest {
|
let request = models::TenantCreateRequest {
|
||||||
new_tenant_id: TenantShardId::unsharded(new_tenant_id),
|
new_tenant_id,
|
||||||
generation,
|
generation,
|
||||||
config,
|
config,
|
||||||
};
|
};
|
||||||
@@ -519,32 +501,6 @@ impl PageServerNode {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn location_config(
|
|
||||||
&self,
|
|
||||||
tenant_id: TenantId,
|
|
||||||
config: LocationConfig,
|
|
||||||
flush_ms: Option<Duration>,
|
|
||||||
) -> anyhow::Result<()> {
|
|
||||||
let req_body = TenantLocationConfigRequest { tenant_id, config };
|
|
||||||
|
|
||||||
let path = format!(
|
|
||||||
"{}/tenant/{}/location_config",
|
|
||||||
self.http_base_url, tenant_id
|
|
||||||
);
|
|
||||||
let path = if let Some(flush_ms) = flush_ms {
|
|
||||||
format!("{}?flush_ms={}", path, flush_ms.as_millis())
|
|
||||||
} else {
|
|
||||||
path
|
|
||||||
};
|
|
||||||
|
|
||||||
self.http_request(Method::PUT, path)?
|
|
||||||
.json(&req_body)
|
|
||||||
.send()?
|
|
||||||
.error_from_body()?;
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result<Vec<TimelineInfo>> {
|
pub fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result<Vec<TimelineInfo>> {
|
||||||
let timeline_infos: Vec<TimelineInfo> = self
|
let timeline_infos: Vec<TimelineInfo> = self
|
||||||
.http_request(
|
.http_request(
|
||||||
@@ -565,7 +521,6 @@ impl PageServerNode {
|
|||||||
ancestor_start_lsn: Option<Lsn>,
|
ancestor_start_lsn: Option<Lsn>,
|
||||||
ancestor_timeline_id: Option<TimelineId>,
|
ancestor_timeline_id: Option<TimelineId>,
|
||||||
pg_version: Option<u32>,
|
pg_version: Option<u32>,
|
||||||
existing_initdb_timeline_id: Option<TimelineId>,
|
|
||||||
) -> anyhow::Result<TimelineInfo> {
|
) -> anyhow::Result<TimelineInfo> {
|
||||||
// If timeline ID was not specified, generate one
|
// If timeline ID was not specified, generate one
|
||||||
let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
|
let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
|
||||||
@@ -579,7 +534,6 @@ impl PageServerNode {
|
|||||||
ancestor_start_lsn,
|
ancestor_start_lsn,
|
||||||
ancestor_timeline_id,
|
ancestor_timeline_id,
|
||||||
pg_version,
|
pg_version,
|
||||||
existing_initdb_timeline_id,
|
|
||||||
})
|
})
|
||||||
.send()?
|
.send()?
|
||||||
.error_from_body()?
|
.error_from_body()?
|
||||||
|
|||||||
@@ -1,197 +0,0 @@
|
|||||||
//!
|
|
||||||
//! Functionality for migrating tenants across pageservers: unlike most of neon_local, this code
|
|
||||||
//! isn't scoped to a particular physical service, as it needs to update compute endpoints to
|
|
||||||
//! point to the new pageserver.
|
|
||||||
//!
|
|
||||||
use crate::local_env::LocalEnv;
|
|
||||||
use crate::{
|
|
||||||
attachment_service::AttachmentService, endpoint::ComputeControlPlane,
|
|
||||||
pageserver::PageServerNode,
|
|
||||||
};
|
|
||||||
use pageserver_api::models::{
|
|
||||||
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
|
|
||||||
};
|
|
||||||
use std::collections::HashMap;
|
|
||||||
use std::time::Duration;
|
|
||||||
use utils::{
|
|
||||||
id::{TenantId, TimelineId},
|
|
||||||
lsn::Lsn,
|
|
||||||
};
|
|
||||||
|
|
||||||
/// Given an attached pageserver, retrieve the LSN for all timelines
|
|
||||||
fn get_lsns(
|
|
||||||
tenant_id: TenantId,
|
|
||||||
pageserver: &PageServerNode,
|
|
||||||
) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
|
|
||||||
let timelines = pageserver.timeline_list(&tenant_id)?;
|
|
||||||
Ok(timelines
|
|
||||||
.into_iter()
|
|
||||||
.map(|t| (t.timeline_id, t.last_record_lsn))
|
|
||||||
.collect())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Wait for the timeline LSNs on `pageserver` to catch up with or overtake
|
|
||||||
/// `baseline`.
|
|
||||||
fn await_lsn(
|
|
||||||
tenant_id: TenantId,
|
|
||||||
pageserver: &PageServerNode,
|
|
||||||
baseline: HashMap<TimelineId, Lsn>,
|
|
||||||
) -> anyhow::Result<()> {
|
|
||||||
loop {
|
|
||||||
let latest = match get_lsns(tenant_id, pageserver) {
|
|
||||||
Ok(l) => l,
|
|
||||||
Err(e) => {
|
|
||||||
println!(
|
|
||||||
"🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
|
|
||||||
pageserver.conf.id
|
|
||||||
);
|
|
||||||
std::thread::sleep(Duration::from_millis(500));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut any_behind: bool = false;
|
|
||||||
for (timeline_id, baseline_lsn) in &baseline {
|
|
||||||
match latest.get(timeline_id) {
|
|
||||||
Some(latest_lsn) => {
|
|
||||||
println!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}");
|
|
||||||
if latest_lsn < baseline_lsn {
|
|
||||||
any_behind = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
None => {
|
|
||||||
// Expected timeline isn't yet visible on migration destination.
|
|
||||||
// (IRL we would have to account for timeline deletion, but this
|
|
||||||
// is just test helper)
|
|
||||||
any_behind = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if !any_behind {
|
|
||||||
println!("✅ LSN caught up. Proceeding...");
|
|
||||||
break;
|
|
||||||
} else {
|
|
||||||
std::thread::sleep(Duration::from_millis(500));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// This function spans multiple services, to demonstrate live migration of a tenant
|
|
||||||
/// between pageservers:
|
|
||||||
/// - Coordinate attach/secondary/detach on pageservers
|
|
||||||
/// - call into attachment_service for generations
|
|
||||||
/// - reconfigure compute endpoints to point to new attached pageserver
|
|
||||||
pub fn migrate_tenant(
|
|
||||||
env: &LocalEnv,
|
|
||||||
tenant_id: TenantId,
|
|
||||||
dest_ps: PageServerNode,
|
|
||||||
) -> anyhow::Result<()> {
|
|
||||||
// Get a new generation
|
|
||||||
let attachment_service = AttachmentService::from_env(env);
|
|
||||||
|
|
||||||
fn build_location_config(
|
|
||||||
mode: LocationConfigMode,
|
|
||||||
generation: Option<u32>,
|
|
||||||
secondary_conf: Option<LocationConfigSecondary>,
|
|
||||||
) -> LocationConfig {
|
|
||||||
LocationConfig {
|
|
||||||
mode,
|
|
||||||
generation,
|
|
||||||
secondary_conf,
|
|
||||||
tenant_conf: TenantConfig::default(),
|
|
||||||
shard_number: 0,
|
|
||||||
shard_count: 0,
|
|
||||||
shard_stripe_size: 0,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let previous = attachment_service.inspect(tenant_id)?;
|
|
||||||
let mut baseline_lsns = None;
|
|
||||||
if let Some((generation, origin_ps_id)) = &previous {
|
|
||||||
let origin_ps = PageServerNode::from_env(env, env.get_pageserver_conf(*origin_ps_id)?);
|
|
||||||
|
|
||||||
if origin_ps_id == &dest_ps.conf.id {
|
|
||||||
println!("🔁 Already attached to {origin_ps_id}, freshening...");
|
|
||||||
let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
|
|
||||||
let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
|
|
||||||
dest_ps.location_config(tenant_id, dest_conf, None)?;
|
|
||||||
println!("✅ Migration complete");
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
|
|
||||||
println!("🔁 Switching origin pageserver {origin_ps_id} to stale mode");
|
|
||||||
|
|
||||||
let stale_conf =
|
|
||||||
build_location_config(LocationConfigMode::AttachedStale, Some(*generation), None);
|
|
||||||
origin_ps.location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))?;
|
|
||||||
|
|
||||||
baseline_lsns = Some(get_lsns(tenant_id, &origin_ps)?);
|
|
||||||
}
|
|
||||||
|
|
||||||
let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
|
|
||||||
let dest_conf = build_location_config(LocationConfigMode::AttachedMulti, gen, None);
|
|
||||||
|
|
||||||
println!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
|
|
||||||
dest_ps.location_config(tenant_id, dest_conf, None)?;
|
|
||||||
|
|
||||||
if let Some(baseline) = baseline_lsns {
|
|
||||||
println!("🕑 Waiting for LSN to catch up...");
|
|
||||||
await_lsn(tenant_id, &dest_ps, baseline)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
let cplane = ComputeControlPlane::load(env.clone())?;
|
|
||||||
for (endpoint_name, endpoint) in &cplane.endpoints {
|
|
||||||
if endpoint.tenant_id == tenant_id {
|
|
||||||
println!(
|
|
||||||
"🔁 Reconfiguring endpoint {} to use pageserver {}",
|
|
||||||
endpoint_name, dest_ps.conf.id
|
|
||||||
);
|
|
||||||
endpoint.reconfigure(Some(dest_ps.conf.id))?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for other_ps_conf in &env.pageservers {
|
|
||||||
if other_ps_conf.id == dest_ps.conf.id {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
let other_ps = PageServerNode::from_env(env, other_ps_conf);
|
|
||||||
let other_ps_tenants = other_ps.tenant_list()?;
|
|
||||||
|
|
||||||
// Check if this tenant is attached
|
|
||||||
let found = other_ps_tenants
|
|
||||||
.into_iter()
|
|
||||||
.map(|t| t.id)
|
|
||||||
.any(|i| i == tenant_id);
|
|
||||||
if !found {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Downgrade to a secondary location
|
|
||||||
let secondary_conf = build_location_config(
|
|
||||||
LocationConfigMode::Secondary,
|
|
||||||
None,
|
|
||||||
Some(LocationConfigSecondary { warm: true }),
|
|
||||||
);
|
|
||||||
|
|
||||||
println!(
|
|
||||||
"💤 Switching to secondary mode on pageserver {}",
|
|
||||||
other_ps.conf.id
|
|
||||||
);
|
|
||||||
other_ps.location_config(tenant_id, secondary_conf, None)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
println!(
|
|
||||||
"🔁 Switching to AttachedSingle mode on pageserver {}",
|
|
||||||
dest_ps.conf.id
|
|
||||||
);
|
|
||||||
let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
|
|
||||||
dest_ps.location_config(tenant_id, dest_conf, None)?;
|
|
||||||
|
|
||||||
println!("✅ Migration complete");
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
22
deny.toml
22
deny.toml
@@ -74,30 +74,10 @@ highlight = "all"
|
|||||||
workspace-default-features = "allow"
|
workspace-default-features = "allow"
|
||||||
external-default-features = "allow"
|
external-default-features = "allow"
|
||||||
allow = []
|
allow = []
|
||||||
|
deny = []
|
||||||
skip = []
|
skip = []
|
||||||
skip-tree = []
|
skip-tree = []
|
||||||
|
|
||||||
[[bans.deny]]
|
|
||||||
# we use tokio, the same rationale applies for async-{io,waker,global-executor,executor,channel,lock}, smol
|
|
||||||
# if you find yourself here while adding a dependency, try "default-features = false", ask around on #rust
|
|
||||||
name = "async-std"
|
|
||||||
|
|
||||||
[[bans.deny]]
|
|
||||||
name = "async-io"
|
|
||||||
|
|
||||||
[[bans.deny]]
|
|
||||||
name = "async-waker"
|
|
||||||
|
|
||||||
[[bans.deny]]
|
|
||||||
name = "async-global-executor"
|
|
||||||
|
|
||||||
[[bans.deny]]
|
|
||||||
name = "async-executor"
|
|
||||||
|
|
||||||
[[bans.deny]]
|
|
||||||
name = "smol"
|
|
||||||
|
|
||||||
# This section is considered when running `cargo deny check sources`.
|
# This section is considered when running `cargo deny check sources`.
|
||||||
# More documentation about the 'sources' section can be found here:
|
# More documentation about the 'sources' section can be found here:
|
||||||
# https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html
|
# https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html
|
||||||
|
|||||||
@@ -177,7 +177,7 @@ I e during migration create_branch can be called on old pageserver and newly cre
|
|||||||
|
|
||||||
The difference of simplistic approach from one described above is that it calls ignore on source tenant first and then calls attach on target pageserver. Approach above does it in opposite order thus opening a possibility for race conditions we strive to avoid.
|
The difference of simplistic approach from one described above is that it calls ignore on source tenant first and then calls attach on target pageserver. Approach above does it in opposite order thus opening a possibility for race conditions we strive to avoid.
|
||||||
|
|
||||||
The approach largely follows this guide: <https://www.notion.so/neondatabase/Cloud-Ad-hoc-tenant-relocation-f687474f7bfc42269e6214e3acba25c7>
|
The approach largely follows this guide: <https://github.com/neondatabase/cloud/wiki/Cloud:-Ad-hoc-tenant-relocation>
|
||||||
|
|
||||||
The happy path sequence:
|
The happy path sequence:
|
||||||
|
|
||||||
|
|||||||
@@ -1,205 +0,0 @@
|
|||||||
# Name
|
|
||||||
|
|
||||||
Created on: 2023-09-08
|
|
||||||
Author: Arpad Müller
|
|
||||||
|
|
||||||
## Summary
|
|
||||||
|
|
||||||
Enable the pageserver to recover from data corruption events by implementing
|
|
||||||
a feature to re-apply historic WAL records in parallel to the already occurring
|
|
||||||
WAL replay.
|
|
||||||
|
|
||||||
The feature is outside of the user-visible backup and history story, and only
|
|
||||||
serves as a second-level backup for the case that there is a bug in the
|
|
||||||
pageservers that corrupted the served pages.
|
|
||||||
|
|
||||||
The RFC proposes the addition of two new features:
|
|
||||||
* recover a broken branch from WAL (downtime is allowed)
|
|
||||||
* a test recovery system to recover random branches to make sure recovery works
|
|
||||||
|
|
||||||
## Motivation
|
|
||||||
|
|
||||||
The historic WAL is currently stored in S3 even after it has been replayed by
|
|
||||||
the pageserver and thus been integrated into the pageserver's storage system.
|
|
||||||
This is done to defend from data corruption failures inside the pageservers.
|
|
||||||
|
|
||||||
However, application of this WAL in the disaster recovery setting is currently
|
|
||||||
very manual and we want to automate this to make it easier.
|
|
||||||
|
|
||||||
### Use cases
|
|
||||||
|
|
||||||
There are various use cases for this feature, like:
|
|
||||||
|
|
||||||
* The main motivation is replaying in the instance of pageservers corrupting
|
|
||||||
data.
|
|
||||||
* We might want to, beyond the user-visible history features, through our
|
|
||||||
support channels and upon customer request, in select instances, recover
|
|
||||||
historic versions beyond the range of history that we officially support.
|
|
||||||
* Running the recovery process in the background for random tenant timelines
|
|
||||||
to figure out if there was a corruption of data (we would compare with what
|
|
||||||
the pageserver stores for the "official" timeline).
|
|
||||||
* Using the WAL to arrive at historic pages we can then back up to S3 so that
|
|
||||||
WAL itself can be discarded, or at least not used for future replays.
|
|
||||||
Again, this sounds a lot like what the pageserver is already doing, but the
|
|
||||||
point is to provide a fallback to the service provided by the pageserver.
|
|
||||||
|
|
||||||
## Design
|
|
||||||
|
|
||||||
### Design constraints
|
|
||||||
|
|
||||||
The main design constraint is that the feature needs to be *simple* enough that
|
|
||||||
the number of bugs is as low, and reliability as high, as possible: the main
|
|
||||||
goal of this endeavour is to achieve higher correctness than the pageserver.
|
|
||||||
|
|
||||||
For the background process, we cannot afford a downtime of the timeline that is
|
|
||||||
being cloned, as we don't want to restrict ourselves to offline tenants only.
|
|
||||||
In the scenario where we want to recover from disasters or roll back to a
|
|
||||||
historic lsn through support staff, downtimes are more affordable, and
|
|
||||||
inevitable if the original had been subject to the corruption. Ideally, the
|
|
||||||
two code paths would share code, so the solution would be designed for not
|
|
||||||
requiring downtimes.
|
|
||||||
|
|
||||||
### API endpoint changes
|
|
||||||
|
|
||||||
This RFC proposes two API endpoint changes in the safekeeper and the
|
|
||||||
pageserver.
|
|
||||||
|
|
||||||
Remember, the pageserver timeline API creation endpoint is to this URL:
|
|
||||||
|
|
||||||
```
|
|
||||||
/v1/tenant/{tenant_id}/timeline/
|
|
||||||
```
|
|
||||||
|
|
||||||
Where `{tenant_id}` is the ID of the tenant the timeline is created for,
|
|
||||||
and specified as part of the URL. The timeline ID is passed via the POST
|
|
||||||
request body as the only required parameter `new_timeline_id`.
|
|
||||||
|
|
||||||
This proposal adds one optional parameter called
|
|
||||||
`existing_initdb_timeline_id` to the request's json body. If the parameter
|
|
||||||
is not specified, behaviour should be as existing, so the pageserver runs
|
|
||||||
initdb.
|
|
||||||
If the parameter is specified, it is expected to point to a timeline ID.
|
|
||||||
In fact, that ID might match `new_timeline_id`; what's important is that
|
|
||||||
S3 storage contains a matching initdb under the URL matching the given
|
|
||||||
tenant and timeline.
|
|
||||||
|
|
||||||
Having both `ancestor_timeline_id` and `existing_initdb_timeline_id`
|
|
||||||
specified is illegal and will result in an HTTP error. This feature is
|
|
||||||
only meant for the "main" branch that doesn't have any ancestors
|
|
||||||
of its own, as only here initdb is relevant.
|
|
||||||
|
|
||||||
For the safekeeper, we propose the addition of the following copy endpoint:
|
|
||||||
|
|
||||||
```
|
|
||||||
/v1/tenant/{tenant_id}/timeline/{source_timeline_id}/copy
|
|
||||||
```
|
|
||||||
It is meant for POST requests with JSON, and takes the two URL parameters
|
|
||||||
`tenant_id` and `source_timeline_id`. The json request body contains
|
|
||||||
the two required parameters `target_timeline_id` and `until_lsn`.
|
|
||||||
|
|
||||||
After invoking, the copy endpoint starts a copy process of the WAL from
|
|
||||||
the source ID to the target ID. The lsn is updated according to the
|
|
||||||
progress of the API call.
|
|
||||||
|
|
||||||
### Higher level features
|
|
||||||
|
|
||||||
We want the API changes to support the following higher level features:
|
|
||||||
|
|
||||||
* recovery-after-corruption DR of the main timeline of a tenant. This
|
|
||||||
feature allows for downtime.
|
|
||||||
* test DR of the main timeline into a special copy timeline. This feature
|
|
||||||
is meant to run against selected production tenants in the background,
|
|
||||||
without the user noticing, so it does not allow for downtime.
|
|
||||||
|
|
||||||
The recovery-after-corruption DR only needs the pageserver changes.
|
|
||||||
It works as follows:
|
|
||||||
|
|
||||||
* delete the timeline from the pageservers via timeline deletion API
|
|
||||||
* re-create it via timeline creation API (same ID as before) and set
|
|
||||||
`existing_initdb_timeline_id` to the same timeline ID
|
|
||||||
|
|
||||||
The test DR requires also the copy primitive and works as follows:
|
|
||||||
|
|
||||||
* copy the WAL of the timeline to a new place
|
|
||||||
* create a new timeline for the tenant
|
|
||||||
|
|
||||||
## Non Goals
|
|
||||||
|
|
||||||
At the danger of being repetitive, the main goal of this feature is to be a
|
|
||||||
backup method, so reliability is very important. This implies that other
|
|
||||||
aspects like performance or space reduction are less important.
|
|
||||||
|
|
||||||
### Corrupt WAL
|
|
||||||
|
|
||||||
The process suggested by this RFC assumes that the WAL is free of corruption.
|
|
||||||
In some instances, corruption can make it into WAL, like for example when
|
|
||||||
higher level components like postgres or the application first read corrupt
|
|
||||||
data, and then execute a write with data derived from that earlier read. That
|
|
||||||
written data might then contain the corruption.
|
|
||||||
|
|
||||||
Common use cases can hit this quite easily. For example, an application reads
|
|
||||||
some counter, increments it, and then writes the new counter value to the
|
|
||||||
database.
|
|
||||||
On a lower level, the compute might put FPIs (Full Page Images) into the WAL,
|
|
||||||
which have corrupt data for rows unrelated to the write operation at hand.
|
|
||||||
|
|
||||||
Separating corrupt writes from non-corrupt ones is a hard problem in general,
|
|
||||||
and if the application was involved in making the corrupt write, a recovery
|
|
||||||
would also involve the application. Therefore, corruption that has made it into
|
|
||||||
the WAL is outside of the scope of this feature. However, the WAL replay can be
|
|
||||||
issued to right before the point in time where the corruption occurred. Then the
|
|
||||||
data loss is isolated to post-corruption writes only.
|
|
||||||
|
|
||||||
## Impacted components (e.g. pageserver, safekeeper, console, etc)
|
|
||||||
|
|
||||||
Most changes would happen to the pageservers.
|
|
||||||
For the higher level features, maybe other components like the console would
|
|
||||||
be involved.
|
|
||||||
|
|
||||||
We need to make sure that the shadow timelines are not subject to the usual
|
|
||||||
limits and billing we apply to existing timelines.
|
|
||||||
|
|
||||||
## Proposed implementation
|
|
||||||
|
|
||||||
The first problem to keep in mind is the reproducibility of `initdb`.
|
|
||||||
So an initial step would be to upload `initdb` snapshots to S3.
|
|
||||||
|
|
||||||
After that, we'd have the endpoint spawn a background process which
|
|
||||||
performs the replay of the WAL to that new timeline. This process should
|
|
||||||
follow the existing workflows as closely as possible, just using the
|
|
||||||
WAL records of a different timeline.
|
|
||||||
|
|
||||||
The timeline created will be in a special state that solely looks for WAL
|
|
||||||
entries of the timeline it is trying to copy. Once the target LSN is reached,
|
|
||||||
it turns into a normal timeline that also accepts writes to its own
|
|
||||||
timeline ID.
|
|
||||||
|
|
||||||
### Scalability
|
|
||||||
|
|
||||||
For now we want to run this entire process on a single node, and as
|
|
||||||
it is by nature linear, it's hard to parallelize. However, for the
|
|
||||||
verification workloads, we can easily start the WAL replay in parallel
|
|
||||||
for different points in time. This is valuable especially for tenants
|
|
||||||
with large WAL records.
|
|
||||||
|
|
||||||
Compare this with the tricks to make addition circuits execute with
|
|
||||||
lower latency by making them perform the addition for both possible
|
|
||||||
values of the carry bit, and then, in a second step, taking the
|
|
||||||
result for the carry bit that was actually obtained.
|
|
||||||
|
|
||||||
The other scalability dimension to consider is the WAL length, which
|
|
||||||
is a growing question as tenants accumulate changes. There are
|
|
||||||
possible approaches to this, including creating snapshots of the
|
|
||||||
page files and uploading them to S3, but if we do this for every single
|
|
||||||
branch, we lose the cheap branching property.
|
|
||||||
|
|
||||||
### Implementation by component
|
|
||||||
|
|
||||||
The proposed changes for the various components of the neon architecture
|
|
||||||
are written up in this notion page:
|
|
||||||
|
|
||||||
https://www.notion.so/neondatabase/Pageserver-disaster-recovery-one-pager-4ecfb5df16ce4f6bbfc3817ed1a6cbb2
|
|
||||||
|
|
||||||
### Unresolved questions
|
|
||||||
|
|
||||||
none known (outside of the mentioned ones).
|
|
||||||
@@ -17,10 +17,5 @@ postgres_ffi.workspace = true
|
|||||||
enum-map.workspace = true
|
enum-map.workspace = true
|
||||||
strum.workspace = true
|
strum.workspace = true
|
||||||
strum_macros.workspace = true
|
strum_macros.workspace = true
|
||||||
hex.workspace = true
|
|
||||||
thiserror.workspace = true
|
|
||||||
|
|
||||||
workspace_hack.workspace = true
|
workspace_hack.workspace = true
|
||||||
|
|
||||||
[dev-dependencies]
|
|
||||||
bincode.workspace = true
|
|
||||||
|
|||||||
@@ -4,9 +4,7 @@
|
|||||||
//! See docs/rfcs/025-generation-numbers.md
|
//! See docs/rfcs/025-generation-numbers.md
|
||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use utils::id::NodeId;
|
use utils::id::{NodeId, TenantId};
|
||||||
|
|
||||||
use crate::shard::TenantShardId;
|
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
#[derive(Serialize, Deserialize)]
|
||||||
pub struct ReAttachRequest {
|
pub struct ReAttachRequest {
|
||||||
@@ -15,7 +13,7 @@ pub struct ReAttachRequest {
|
|||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
#[derive(Serialize, Deserialize)]
|
||||||
pub struct ReAttachResponseTenant {
|
pub struct ReAttachResponseTenant {
|
||||||
pub id: TenantShardId,
|
pub id: TenantId,
|
||||||
pub gen: u32,
|
pub gen: u32,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -26,7 +24,7 @@ pub struct ReAttachResponse {
|
|||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
#[derive(Serialize, Deserialize)]
|
||||||
pub struct ValidateRequestTenant {
|
pub struct ValidateRequestTenant {
|
||||||
pub id: TenantShardId,
|
pub id: TenantId,
|
||||||
pub gen: u32,
|
pub gen: u32,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -42,6 +40,6 @@ pub struct ValidateResponse {
|
|||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
#[derive(Serialize, Deserialize)]
|
||||||
pub struct ValidateResponseTenant {
|
pub struct ValidateResponseTenant {
|
||||||
pub id: TenantShardId,
|
pub id: TenantId,
|
||||||
pub valid: bool,
|
pub valid: bool,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,142 +0,0 @@
|
|||||||
use anyhow::{bail, Result};
|
|
||||||
use byteorder::{ByteOrder, BE};
|
|
||||||
use serde::{Deserialize, Serialize};
|
|
||||||
use std::fmt;
|
|
||||||
|
|
||||||
/// Key used in the Repository kv-store.
|
|
||||||
///
|
|
||||||
/// The Repository treats this as an opaque struct, but see the code in pgdatadir_mapping.rs
|
|
||||||
/// for what we actually store in these fields.
|
|
||||||
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
|
|
||||||
pub struct Key {
|
|
||||||
pub field1: u8,
|
|
||||||
pub field2: u32,
|
|
||||||
pub field3: u32,
|
|
||||||
pub field4: u32,
|
|
||||||
pub field5: u8,
|
|
||||||
pub field6: u32,
|
|
||||||
}
|
|
||||||
|
|
||||||
pub const KEY_SIZE: usize = 18;
|
|
||||||
|
|
||||||
impl Key {
|
|
||||||
/// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
|
|
||||||
/// As long as Neon does not support tablespace (because of lack of access to local file system),
|
|
||||||
/// we can assume that only some predefined namespace OIDs are used which can fit in u16
|
|
||||||
pub fn to_i128(&self) -> i128 {
|
|
||||||
assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
|
|
||||||
(((self.field1 & 0xf) as i128) << 120)
|
|
||||||
| (((self.field2 & 0xFFFF) as i128) << 104)
|
|
||||||
| ((self.field3 as i128) << 72)
|
|
||||||
| ((self.field4 as i128) << 40)
|
|
||||||
| ((self.field5 as i128) << 32)
|
|
||||||
| self.field6 as i128
|
|
||||||
}
|
|
||||||
|
|
||||||
pub const fn from_i128(x: i128) -> Self {
|
|
||||||
Key {
|
|
||||||
field1: ((x >> 120) & 0xf) as u8,
|
|
||||||
field2: ((x >> 104) & 0xFFFF) as u32,
|
|
||||||
field3: (x >> 72) as u32,
|
|
||||||
field4: (x >> 40) as u32,
|
|
||||||
field5: (x >> 32) as u8,
|
|
||||||
field6: x as u32,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn next(&self) -> Key {
|
|
||||||
self.add(1)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn add(&self, x: u32) -> Key {
|
|
||||||
let mut key = *self;
|
|
||||||
|
|
||||||
let r = key.field6.overflowing_add(x);
|
|
||||||
key.field6 = r.0;
|
|
||||||
if r.1 {
|
|
||||||
let r = key.field5.overflowing_add(1);
|
|
||||||
key.field5 = r.0;
|
|
||||||
if r.1 {
|
|
||||||
let r = key.field4.overflowing_add(1);
|
|
||||||
key.field4 = r.0;
|
|
||||||
if r.1 {
|
|
||||||
let r = key.field3.overflowing_add(1);
|
|
||||||
key.field3 = r.0;
|
|
||||||
if r.1 {
|
|
||||||
let r = key.field2.overflowing_add(1);
|
|
||||||
key.field2 = r.0;
|
|
||||||
if r.1 {
|
|
||||||
let r = key.field1.overflowing_add(1);
|
|
||||||
key.field1 = r.0;
|
|
||||||
assert!(!r.1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
key
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn from_slice(b: &[u8]) -> Self {
|
|
||||||
Key {
|
|
||||||
field1: b[0],
|
|
||||||
field2: u32::from_be_bytes(b[1..5].try_into().unwrap()),
|
|
||||||
field3: u32::from_be_bytes(b[5..9].try_into().unwrap()),
|
|
||||||
field4: u32::from_be_bytes(b[9..13].try_into().unwrap()),
|
|
||||||
field5: b[13],
|
|
||||||
field6: u32::from_be_bytes(b[14..18].try_into().unwrap()),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
|
|
||||||
buf[0] = self.field1;
|
|
||||||
BE::write_u32(&mut buf[1..5], self.field2);
|
|
||||||
BE::write_u32(&mut buf[5..9], self.field3);
|
|
||||||
BE::write_u32(&mut buf[9..13], self.field4);
|
|
||||||
buf[13] = self.field5;
|
|
||||||
BE::write_u32(&mut buf[14..18], self.field6);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl fmt::Display for Key {
|
|
||||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
||||||
write!(
|
|
||||||
f,
|
|
||||||
"{:02X}{:08X}{:08X}{:08X}{:02X}{:08X}",
|
|
||||||
self.field1, self.field2, self.field3, self.field4, self.field5, self.field6
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Key {
|
|
||||||
pub const MIN: Key = Key {
|
|
||||||
field1: u8::MIN,
|
|
||||||
field2: u32::MIN,
|
|
||||||
field3: u32::MIN,
|
|
||||||
field4: u32::MIN,
|
|
||||||
field5: u8::MIN,
|
|
||||||
field6: u32::MIN,
|
|
||||||
};
|
|
||||||
pub const MAX: Key = Key {
|
|
||||||
field1: u8::MAX,
|
|
||||||
field2: u32::MAX,
|
|
||||||
field3: u32::MAX,
|
|
||||||
field4: u32::MAX,
|
|
||||||
field5: u8::MAX,
|
|
||||||
field6: u32::MAX,
|
|
||||||
};
|
|
||||||
|
|
||||||
pub fn from_hex(s: &str) -> Result<Self> {
|
|
||||||
if s.len() != 36 {
|
|
||||||
bail!("parse error");
|
|
||||||
}
|
|
||||||
Ok(Key {
|
|
||||||
field1: u8::from_str_radix(&s[0..2], 16)?,
|
|
||||||
field2: u32::from_str_radix(&s[2..10], 16)?,
|
|
||||||
field3: u32::from_str_radix(&s[10..18], 16)?,
|
|
||||||
field4: u32::from_str_radix(&s[18..26], 16)?,
|
|
||||||
field5: u8::from_str_radix(&s[26..28], 16)?,
|
|
||||||
field6: u32::from_str_radix(&s[28..36], 16)?,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -4,10 +4,8 @@ use const_format::formatcp;
|
|||||||
|
|
||||||
/// Public API types
|
/// Public API types
|
||||||
pub mod control_api;
|
pub mod control_api;
|
||||||
pub mod key;
|
|
||||||
pub mod models;
|
pub mod models;
|
||||||
pub mod reltag;
|
pub mod reltag;
|
||||||
pub mod shard;
|
|
||||||
|
|
||||||
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
|
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
|
||||||
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
|
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
|
||||||
|
|||||||
@@ -10,12 +10,13 @@ use serde_with::serde_as;
|
|||||||
use strum_macros;
|
use strum_macros;
|
||||||
use utils::{
|
use utils::{
|
||||||
completion,
|
completion,
|
||||||
|
generation::Generation,
|
||||||
history_buffer::HistoryBufferWithDropCounter,
|
history_buffer::HistoryBufferWithDropCounter,
|
||||||
id::{NodeId, TenantId, TimelineId},
|
id::{NodeId, TenantId, TimelineId},
|
||||||
lsn::Lsn,
|
lsn::Lsn,
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::{reltag::RelTag, shard::TenantShardId};
|
use crate::reltag::RelTag;
|
||||||
use anyhow::bail;
|
use anyhow::bail;
|
||||||
use bytes::{BufMut, Bytes, BytesMut};
|
use bytes::{BufMut, Bytes, BytesMut};
|
||||||
|
|
||||||
@@ -179,8 +180,6 @@ pub struct TimelineCreateRequest {
|
|||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub ancestor_timeline_id: Option<TimelineId>,
|
pub ancestor_timeline_id: Option<TimelineId>,
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub existing_initdb_timeline_id: Option<TimelineId>,
|
|
||||||
#[serde(default)]
|
|
||||||
pub ancestor_start_lsn: Option<Lsn>,
|
pub ancestor_start_lsn: Option<Lsn>,
|
||||||
pub pg_version: Option<u32>,
|
pub pg_version: Option<u32>,
|
||||||
}
|
}
|
||||||
@@ -188,7 +187,7 @@ pub struct TimelineCreateRequest {
|
|||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
#[serde(deny_unknown_fields)]
|
#[serde(deny_unknown_fields)]
|
||||||
pub struct TenantCreateRequest {
|
pub struct TenantCreateRequest {
|
||||||
pub new_tenant_id: TenantShardId,
|
pub new_tenant_id: TenantId,
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
#[serde(skip_serializing_if = "Option::is_none")]
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
pub generation: Option<u32>,
|
pub generation: Option<u32>,
|
||||||
@@ -263,19 +262,10 @@ pub struct LocationConfig {
|
|||||||
pub mode: LocationConfigMode,
|
pub mode: LocationConfigMode,
|
||||||
/// If attaching, in what generation?
|
/// If attaching, in what generation?
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub generation: Option<u32>,
|
pub generation: Option<Generation>,
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub secondary_conf: Option<LocationConfigSecondary>,
|
pub secondary_conf: Option<LocationConfigSecondary>,
|
||||||
|
|
||||||
// Shard parameters: if shard_count is nonzero, then other shard_* fields
|
|
||||||
// must be set accurately.
|
|
||||||
#[serde(default)]
|
|
||||||
pub shard_number: u8,
|
|
||||||
#[serde(default)]
|
|
||||||
pub shard_count: u8,
|
|
||||||
#[serde(default)]
|
|
||||||
pub shard_stripe_size: u32,
|
|
||||||
|
|
||||||
// If requesting mode `Secondary`, configuration for that.
|
// If requesting mode `Secondary`, configuration for that.
|
||||||
// Custom storage configuration for the tenant, if any
|
// Custom storage configuration for the tenant, if any
|
||||||
pub tenant_conf: TenantConfig,
|
pub tenant_conf: TenantConfig,
|
||||||
@@ -316,7 +306,25 @@ impl std::ops::Deref for TenantConfigRequest {
|
|||||||
|
|
||||||
impl TenantConfigRequest {
|
impl TenantConfigRequest {
|
||||||
pub fn new(tenant_id: TenantId) -> TenantConfigRequest {
|
pub fn new(tenant_id: TenantId) -> TenantConfigRequest {
|
||||||
let config = TenantConfig::default();
|
let config = TenantConfig {
|
||||||
|
checkpoint_distance: None,
|
||||||
|
checkpoint_timeout: None,
|
||||||
|
compaction_target_size: None,
|
||||||
|
compaction_period: None,
|
||||||
|
compaction_threshold: None,
|
||||||
|
gc_horizon: None,
|
||||||
|
gc_period: None,
|
||||||
|
image_creation_threshold: None,
|
||||||
|
pitr_interval: None,
|
||||||
|
walreceiver_connect_timeout: None,
|
||||||
|
lagging_wal_timeout: None,
|
||||||
|
max_lsn_wal_lag: None,
|
||||||
|
trace_read_requests: None,
|
||||||
|
eviction_policy: None,
|
||||||
|
min_resident_size_override: None,
|
||||||
|
evictions_low_residence_duration_metric_threshold: None,
|
||||||
|
gc_feedback: None,
|
||||||
|
};
|
||||||
TenantConfigRequest { tenant_id, config }
|
TenantConfigRequest { tenant_id, config }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -384,9 +392,7 @@ pub struct TimelineInfo {
|
|||||||
/// The LSN that we are advertizing to safekeepers
|
/// The LSN that we are advertizing to safekeepers
|
||||||
pub remote_consistent_lsn_visible: Lsn,
|
pub remote_consistent_lsn_visible: Lsn,
|
||||||
|
|
||||||
pub current_logical_size: u64,
|
pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
|
||||||
pub current_logical_size_is_accurate: bool,
|
|
||||||
|
|
||||||
/// Sum of the size of all layer files.
|
/// Sum of the size of all layer files.
|
||||||
/// If a layer is present in both local FS and S3, it counts only once.
|
/// If a layer is present in both local FS and S3, it counts only once.
|
||||||
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
|
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
|
||||||
|
|||||||
@@ -1,612 +0,0 @@
|
|||||||
use std::{ops::RangeInclusive, str::FromStr};
|
|
||||||
|
|
||||||
use hex::FromHex;
|
|
||||||
use serde::{Deserialize, Serialize};
|
|
||||||
use thiserror;
|
|
||||||
use utils::id::TenantId;
|
|
||||||
|
|
||||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
|
||||||
pub struct ShardNumber(pub u8);
|
|
||||||
|
|
||||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
|
||||||
pub struct ShardCount(pub u8);
|
|
||||||
|
|
||||||
impl ShardCount {
|
|
||||||
pub const MAX: Self = Self(u8::MAX);
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ShardNumber {
|
|
||||||
pub const MAX: Self = Self(u8::MAX);
|
|
||||||
}
|
|
||||||
|
|
||||||
/// TenantShardId identify the units of work for the Pageserver.
|
|
||||||
///
|
|
||||||
/// These are written as `<tenant_id>-<shard number><shard-count>`, for example:
|
|
||||||
///
|
|
||||||
/// # The second shard in a two-shard tenant
|
|
||||||
/// 072f1291a5310026820b2fe4b2968934-0102
|
|
||||||
///
|
|
||||||
/// Historically, tenants could not have multiple shards, and were identified
|
|
||||||
/// by TenantId. To support this, TenantShardId has a special legacy
|
|
||||||
/// mode where `shard_count` is equal to zero: this represents a single-sharded
|
|
||||||
/// tenant which should be written as a TenantId with no suffix.
|
|
||||||
///
|
|
||||||
/// The human-readable encoding of TenantShardId, such as used in API URLs,
|
|
||||||
/// is both forward and backward compatible: a legacy TenantId can be
|
|
||||||
/// decoded as a TenantShardId, and when re-encoded it will be parseable
|
|
||||||
/// as a TenantId.
|
|
||||||
///
|
|
||||||
/// Note that the binary encoding is _not_ backward compatible, because
|
|
||||||
/// at the time sharding is introduced, there are no existing binary structures
|
|
||||||
/// containing TenantId that we need to handle.
|
|
||||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
|
||||||
pub struct TenantShardId {
|
|
||||||
pub tenant_id: TenantId,
|
|
||||||
pub shard_number: ShardNumber,
|
|
||||||
pub shard_count: ShardCount,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl TenantShardId {
|
|
||||||
pub fn unsharded(tenant_id: TenantId) -> Self {
|
|
||||||
Self {
|
|
||||||
tenant_id,
|
|
||||||
shard_number: ShardNumber(0),
|
|
||||||
shard_count: ShardCount(0),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The range of all TenantShardId that belong to a particular TenantId. This is useful when
|
|
||||||
/// you have a BTreeMap of TenantShardId, and are querying by TenantId.
|
|
||||||
pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
|
|
||||||
RangeInclusive::new(
|
|
||||||
Self {
|
|
||||||
tenant_id,
|
|
||||||
shard_number: ShardNumber(0),
|
|
||||||
shard_count: ShardCount(0),
|
|
||||||
},
|
|
||||||
Self {
|
|
||||||
tenant_id,
|
|
||||||
shard_number: ShardNumber::MAX,
|
|
||||||
shard_count: ShardCount::MAX,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn shard_slug(&self) -> String {
|
|
||||||
format!("{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl std::fmt::Display for TenantShardId {
|
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
||||||
if self.shard_count != ShardCount(0) {
|
|
||||||
write!(
|
|
||||||
f,
|
|
||||||
"{}-{:02x}{:02x}",
|
|
||||||
self.tenant_id, self.shard_number.0, self.shard_count.0
|
|
||||||
)
|
|
||||||
} else {
|
|
||||||
// Legacy case (shard_count == 0) -- format as just the tenant id. Note that this
|
|
||||||
// is distinct from the normal single shard case (shard count == 1).
|
|
||||||
self.tenant_id.fmt(f)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl std::fmt::Debug for TenantShardId {
|
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
||||||
// Debug is the same as Display: the compact hex representation
|
|
||||||
write!(f, "{}", self)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl std::str::FromStr for TenantShardId {
|
|
||||||
type Err = hex::FromHexError;
|
|
||||||
|
|
||||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
|
||||||
// Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
|
|
||||||
if s.len() == 32 {
|
|
||||||
// Legacy case: no shard specified
|
|
||||||
Ok(Self {
|
|
||||||
tenant_id: TenantId::from_str(s)?,
|
|
||||||
shard_number: ShardNumber(0),
|
|
||||||
shard_count: ShardCount(0),
|
|
||||||
})
|
|
||||||
} else if s.len() == 37 {
|
|
||||||
let bytes = s.as_bytes();
|
|
||||||
let tenant_id = TenantId::from_hex(&bytes[0..32])?;
|
|
||||||
let mut shard_parts: [u8; 2] = [0u8; 2];
|
|
||||||
hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
|
|
||||||
Ok(Self {
|
|
||||||
tenant_id,
|
|
||||||
shard_number: ShardNumber(shard_parts[0]),
|
|
||||||
shard_count: ShardCount(shard_parts[1]),
|
|
||||||
})
|
|
||||||
} else {
|
|
||||||
Err(hex::FromHexError::InvalidStringLength)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<[u8; 18]> for TenantShardId {
|
|
||||||
fn from(b: [u8; 18]) -> Self {
|
|
||||||
let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap();
|
|
||||||
|
|
||||||
Self {
|
|
||||||
tenant_id: TenantId::from(tenant_id_bytes),
|
|
||||||
shard_number: ShardNumber(b[16]),
|
|
||||||
shard_count: ShardCount(b[17]),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// For use within the context of a particular tenant, when we need to know which
|
|
||||||
/// shard we're dealing with, but do not need to know the full ShardIdentity (because
|
|
||||||
/// we won't be doing any page->shard mapping), and do not need to know the fully qualified
|
|
||||||
/// TenantShardId.
|
|
||||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)]
|
|
||||||
pub struct ShardIndex {
|
|
||||||
pub shard_number: ShardNumber,
|
|
||||||
pub shard_count: ShardCount,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ShardIndex {
|
|
||||||
pub fn new(number: ShardNumber, count: ShardCount) -> Self {
|
|
||||||
Self {
|
|
||||||
shard_number: number,
|
|
||||||
shard_count: count,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
pub fn unsharded() -> Self {
|
|
||||||
Self {
|
|
||||||
shard_number: ShardNumber(0),
|
|
||||||
shard_count: ShardCount(0),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn is_unsharded(&self) -> bool {
|
|
||||||
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// For use in constructing remote storage paths: concatenate this with a TenantId
|
|
||||||
/// to get a fully qualified TenantShardId.
|
|
||||||
///
|
|
||||||
/// Backward compat: this function returns an empty string if Self::is_unsharded, such
|
|
||||||
/// that the legacy pre-sharding remote key format is preserved.
|
|
||||||
pub fn get_suffix(&self) -> String {
|
|
||||||
if self.is_unsharded() {
|
|
||||||
"".to_string()
|
|
||||||
} else {
|
|
||||||
format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl std::fmt::Display for ShardIndex {
|
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
||||||
write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl std::fmt::Debug for ShardIndex {
|
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
||||||
// Debug is the same as Display: the compact hex representation
|
|
||||||
write!(f, "{}", self)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl std::str::FromStr for ShardIndex {
|
|
||||||
type Err = hex::FromHexError;
|
|
||||||
|
|
||||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
|
||||||
// Expect format: 1 byte shard number, 1 byte shard count
|
|
||||||
if s.len() == 4 {
|
|
||||||
let bytes = s.as_bytes();
|
|
||||||
let mut shard_parts: [u8; 2] = [0u8; 2];
|
|
||||||
hex::decode_to_slice(bytes, &mut shard_parts)?;
|
|
||||||
Ok(Self {
|
|
||||||
shard_number: ShardNumber(shard_parts[0]),
|
|
||||||
shard_count: ShardCount(shard_parts[1]),
|
|
||||||
})
|
|
||||||
} else {
|
|
||||||
Err(hex::FromHexError::InvalidStringLength)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<[u8; 2]> for ShardIndex {
|
|
||||||
fn from(b: [u8; 2]) -> Self {
|
|
||||||
Self {
|
|
||||||
shard_number: ShardNumber(b[0]),
|
|
||||||
shard_count: ShardCount(b[1]),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Serialize for TenantShardId {
|
|
||||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
|
||||||
where
|
|
||||||
S: serde::Serializer,
|
|
||||||
{
|
|
||||||
if serializer.is_human_readable() {
|
|
||||||
serializer.collect_str(self)
|
|
||||||
} else {
|
|
||||||
let mut packed: [u8; 18] = [0; 18];
|
|
||||||
packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
|
|
||||||
packed[16] = self.shard_number.0;
|
|
||||||
packed[17] = self.shard_count.0;
|
|
||||||
|
|
||||||
packed.serialize(serializer)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'de> Deserialize<'de> for TenantShardId {
|
|
||||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
|
||||||
where
|
|
||||||
D: serde::Deserializer<'de>,
|
|
||||||
{
|
|
||||||
struct IdVisitor {
|
|
||||||
is_human_readable_deserializer: bool,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'de> serde::de::Visitor<'de> for IdVisitor {
|
|
||||||
type Value = TenantShardId;
|
|
||||||
|
|
||||||
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
|
||||||
if self.is_human_readable_deserializer {
|
|
||||||
formatter.write_str("value in form of hex string")
|
|
||||||
} else {
|
|
||||||
formatter.write_str("value in form of integer array([u8; 18])")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
|
|
||||||
where
|
|
||||||
A: serde::de::SeqAccess<'de>,
|
|
||||||
{
|
|
||||||
let s = serde::de::value::SeqAccessDeserializer::new(seq);
|
|
||||||
let id: [u8; 18] = Deserialize::deserialize(s)?;
|
|
||||||
Ok(TenantShardId::from(id))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
|
|
||||||
where
|
|
||||||
E: serde::de::Error,
|
|
||||||
{
|
|
||||||
TenantShardId::from_str(v).map_err(E::custom)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if deserializer.is_human_readable() {
|
|
||||||
deserializer.deserialize_str(IdVisitor {
|
|
||||||
is_human_readable_deserializer: true,
|
|
||||||
})
|
|
||||||
} else {
|
|
||||||
deserializer.deserialize_tuple(
|
|
||||||
18,
|
|
||||||
IdVisitor {
|
|
||||||
is_human_readable_deserializer: false,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Stripe size in number of pages
|
|
||||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
|
||||||
pub struct ShardStripeSize(pub u32);
|
|
||||||
|
|
||||||
/// Layout version: for future upgrades where we might change how the key->shard mapping works
|
|
||||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
|
||||||
pub struct ShardLayout(u8);
|
|
||||||
|
|
||||||
const LAYOUT_V1: ShardLayout = ShardLayout(1);
|
|
||||||
|
|
||||||
/// Default stripe size in pages: 256MiB divided by 8kiB page size.
|
|
||||||
const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
|
|
||||||
|
|
||||||
/// The ShardIdentity contains the information needed for one member of map
|
|
||||||
/// to resolve a key to a shard, and then check whether that shard is ==self.
|
|
||||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
|
||||||
pub struct ShardIdentity {
|
|
||||||
pub layout: ShardLayout,
|
|
||||||
pub number: ShardNumber,
|
|
||||||
pub count: ShardCount,
|
|
||||||
pub stripe_size: ShardStripeSize,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(thiserror::Error, Debug, PartialEq, Eq)]
|
|
||||||
pub enum ShardConfigError {
|
|
||||||
#[error("Invalid shard count")]
|
|
||||||
InvalidCount,
|
|
||||||
#[error("Invalid shard number")]
|
|
||||||
InvalidNumber,
|
|
||||||
#[error("Invalid stripe size")]
|
|
||||||
InvalidStripeSize,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ShardIdentity {
|
|
||||||
/// An identity with number=0 count=0 is a "none" identity, which represents legacy
|
|
||||||
/// tenants. Modern single-shard tenants should not use this: they should
|
|
||||||
/// have number=0 count=1.
|
|
||||||
pub fn unsharded() -> Self {
|
|
||||||
Self {
|
|
||||||
number: ShardNumber(0),
|
|
||||||
count: ShardCount(0),
|
|
||||||
layout: LAYOUT_V1,
|
|
||||||
stripe_size: DEFAULT_STRIPE_SIZE,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn is_unsharded(&self) -> bool {
|
|
||||||
self.number == ShardNumber(0) && self.count == ShardCount(0)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Count must be nonzero, and number must be < count. To construct
|
|
||||||
/// the legacy case (count==0), use Self::unsharded instead.
|
|
||||||
pub fn new(
|
|
||||||
number: ShardNumber,
|
|
||||||
count: ShardCount,
|
|
||||||
stripe_size: ShardStripeSize,
|
|
||||||
) -> Result<Self, ShardConfigError> {
|
|
||||||
if count.0 == 0 {
|
|
||||||
Err(ShardConfigError::InvalidCount)
|
|
||||||
} else if number.0 > count.0 - 1 {
|
|
||||||
Err(ShardConfigError::InvalidNumber)
|
|
||||||
} else if stripe_size.0 == 0 {
|
|
||||||
Err(ShardConfigError::InvalidStripeSize)
|
|
||||||
} else {
|
|
||||||
Ok(Self {
|
|
||||||
number,
|
|
||||||
count,
|
|
||||||
layout: LAYOUT_V1,
|
|
||||||
stripe_size,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Serialize for ShardIndex {
|
|
||||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
|
||||||
where
|
|
||||||
S: serde::Serializer,
|
|
||||||
{
|
|
||||||
if serializer.is_human_readable() {
|
|
||||||
serializer.collect_str(self)
|
|
||||||
} else {
|
|
||||||
// Binary encoding is not used in index_part.json, but is included in anticipation of
|
|
||||||
// switching various structures (e.g. inter-process communication, remote metadata) to more
|
|
||||||
// compact binary encodings in future.
|
|
||||||
let mut packed: [u8; 2] = [0; 2];
|
|
||||||
packed[0] = self.shard_number.0;
|
|
||||||
packed[1] = self.shard_count.0;
|
|
||||||
packed.serialize(serializer)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'de> Deserialize<'de> for ShardIndex {
|
|
||||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
|
||||||
where
|
|
||||||
D: serde::Deserializer<'de>,
|
|
||||||
{
|
|
||||||
struct IdVisitor {
|
|
||||||
is_human_readable_deserializer: bool,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'de> serde::de::Visitor<'de> for IdVisitor {
|
|
||||||
type Value = ShardIndex;
|
|
||||||
|
|
||||||
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
|
||||||
if self.is_human_readable_deserializer {
|
|
||||||
formatter.write_str("value in form of hex string")
|
|
||||||
} else {
|
|
||||||
formatter.write_str("value in form of integer array([u8; 2])")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
|
|
||||||
where
|
|
||||||
A: serde::de::SeqAccess<'de>,
|
|
||||||
{
|
|
||||||
let s = serde::de::value::SeqAccessDeserializer::new(seq);
|
|
||||||
let id: [u8; 2] = Deserialize::deserialize(s)?;
|
|
||||||
Ok(ShardIndex::from(id))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
|
|
||||||
where
|
|
||||||
E: serde::de::Error,
|
|
||||||
{
|
|
||||||
ShardIndex::from_str(v).map_err(E::custom)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if deserializer.is_human_readable() {
|
|
||||||
deserializer.deserialize_str(IdVisitor {
|
|
||||||
is_human_readable_deserializer: true,
|
|
||||||
})
|
|
||||||
} else {
|
|
||||||
deserializer.deserialize_tuple(
|
|
||||||
2,
|
|
||||||
IdVisitor {
|
|
||||||
is_human_readable_deserializer: false,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use std::str::FromStr;
|
|
||||||
|
|
||||||
use bincode;
|
|
||||||
use utils::{id::TenantId, Hex};
|
|
||||||
|
|
||||||
use super::*;
|
|
||||||
|
|
||||||
const EXAMPLE_TENANT_ID: &str = "1f359dd625e519a1a4e8d7509690f6fc";
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn tenant_shard_id_string() -> Result<(), hex::FromHexError> {
|
|
||||||
let example = TenantShardId {
|
|
||||||
tenant_id: TenantId::from_str(EXAMPLE_TENANT_ID).unwrap(),
|
|
||||||
shard_count: ShardCount(10),
|
|
||||||
shard_number: ShardNumber(7),
|
|
||||||
};
|
|
||||||
|
|
||||||
let encoded = format!("{example}");
|
|
||||||
|
|
||||||
let expected = format!("{EXAMPLE_TENANT_ID}-070a");
|
|
||||||
assert_eq!(&encoded, &expected);
|
|
||||||
|
|
||||||
let decoded = TenantShardId::from_str(&encoded)?;
|
|
||||||
|
|
||||||
assert_eq!(example, decoded);
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn tenant_shard_id_binary() -> Result<(), hex::FromHexError> {
|
|
||||||
let example = TenantShardId {
|
|
||||||
tenant_id: TenantId::from_str(EXAMPLE_TENANT_ID).unwrap(),
|
|
||||||
shard_count: ShardCount(10),
|
|
||||||
shard_number: ShardNumber(7),
|
|
||||||
};
|
|
||||||
|
|
||||||
let encoded = bincode::serialize(&example).unwrap();
|
|
||||||
let expected: [u8; 18] = [
|
|
||||||
0x1f, 0x35, 0x9d, 0xd6, 0x25, 0xe5, 0x19, 0xa1, 0xa4, 0xe8, 0xd7, 0x50, 0x96, 0x90,
|
|
||||||
0xf6, 0xfc, 0x07, 0x0a,
|
|
||||||
];
|
|
||||||
assert_eq!(Hex(&encoded), Hex(&expected));
|
|
||||||
|
|
||||||
let decoded = bincode::deserialize(&encoded).unwrap();
|
|
||||||
|
|
||||||
assert_eq!(example, decoded);
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn tenant_shard_id_backward_compat() -> Result<(), hex::FromHexError> {
|
|
||||||
// Test that TenantShardId can decode a TenantId in human
|
|
||||||
// readable form
|
|
||||||
let example = TenantId::from_str(EXAMPLE_TENANT_ID).unwrap();
|
|
||||||
let encoded = format!("{example}");
|
|
||||||
|
|
||||||
assert_eq!(&encoded, EXAMPLE_TENANT_ID);
|
|
||||||
|
|
||||||
let decoded = TenantShardId::from_str(&encoded)?;
|
|
||||||
|
|
||||||
assert_eq!(example, decoded.tenant_id);
|
|
||||||
assert_eq!(decoded.shard_count, ShardCount(0));
|
|
||||||
assert_eq!(decoded.shard_number, ShardNumber(0));
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn tenant_shard_id_forward_compat() -> Result<(), hex::FromHexError> {
|
|
||||||
// Test that a legacy TenantShardId encodes into a form that
|
|
||||||
// can be decoded as TenantId
|
|
||||||
let example_tenant_id = TenantId::from_str(EXAMPLE_TENANT_ID).unwrap();
|
|
||||||
let example = TenantShardId::unsharded(example_tenant_id);
|
|
||||||
let encoded = format!("{example}");
|
|
||||||
|
|
||||||
assert_eq!(&encoded, EXAMPLE_TENANT_ID);
|
|
||||||
|
|
||||||
let decoded = TenantId::from_str(&encoded)?;
|
|
||||||
|
|
||||||
assert_eq!(example_tenant_id, decoded);
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn tenant_shard_id_legacy_binary() -> Result<(), hex::FromHexError> {
|
|
||||||
// Unlike in human readable encoding, binary encoding does not
|
|
||||||
// do any special handling of legacy unsharded TenantIds: this test
|
|
||||||
// is equivalent to the main test for binary encoding, just verifying
|
|
||||||
// that the same behavior applies when we have used `unsharded()` to
|
|
||||||
// construct a TenantShardId.
|
|
||||||
let example = TenantShardId::unsharded(TenantId::from_str(EXAMPLE_TENANT_ID).unwrap());
|
|
||||||
let encoded = bincode::serialize(&example).unwrap();
|
|
||||||
|
|
||||||
let expected: [u8; 18] = [
|
|
||||||
0x1f, 0x35, 0x9d, 0xd6, 0x25, 0xe5, 0x19, 0xa1, 0xa4, 0xe8, 0xd7, 0x50, 0x96, 0x90,
|
|
||||||
0xf6, 0xfc, 0x00, 0x00,
|
|
||||||
];
|
|
||||||
assert_eq!(Hex(&encoded), Hex(&expected));
|
|
||||||
|
|
||||||
let decoded = bincode::deserialize::<TenantShardId>(&encoded).unwrap();
|
|
||||||
assert_eq!(example, decoded);
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn shard_identity_validation() -> Result<(), ShardConfigError> {
|
|
||||||
// Happy cases
|
|
||||||
ShardIdentity::new(ShardNumber(0), ShardCount(1), DEFAULT_STRIPE_SIZE)?;
|
|
||||||
ShardIdentity::new(ShardNumber(0), ShardCount(1), ShardStripeSize(1))?;
|
|
||||||
ShardIdentity::new(ShardNumber(254), ShardCount(255), ShardStripeSize(1))?;
|
|
||||||
|
|
||||||
assert_eq!(
|
|
||||||
ShardIdentity::new(ShardNumber(0), ShardCount(0), DEFAULT_STRIPE_SIZE),
|
|
||||||
Err(ShardConfigError::InvalidCount)
|
|
||||||
);
|
|
||||||
assert_eq!(
|
|
||||||
ShardIdentity::new(ShardNumber(10), ShardCount(10), DEFAULT_STRIPE_SIZE),
|
|
||||||
Err(ShardConfigError::InvalidNumber)
|
|
||||||
);
|
|
||||||
assert_eq!(
|
|
||||||
ShardIdentity::new(ShardNumber(11), ShardCount(10), DEFAULT_STRIPE_SIZE),
|
|
||||||
Err(ShardConfigError::InvalidNumber)
|
|
||||||
);
|
|
||||||
assert_eq!(
|
|
||||||
ShardIdentity::new(ShardNumber(255), ShardCount(255), DEFAULT_STRIPE_SIZE),
|
|
||||||
Err(ShardConfigError::InvalidNumber)
|
|
||||||
);
|
|
||||||
assert_eq!(
|
|
||||||
ShardIdentity::new(ShardNumber(0), ShardCount(1), ShardStripeSize(0)),
|
|
||||||
Err(ShardConfigError::InvalidStripeSize)
|
|
||||||
);
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn shard_index_human_encoding() -> Result<(), hex::FromHexError> {
|
|
||||||
let example = ShardIndex {
|
|
||||||
shard_number: ShardNumber(13),
|
|
||||||
shard_count: ShardCount(17),
|
|
||||||
};
|
|
||||||
let expected: String = "0d11".to_string();
|
|
||||||
let encoded = format!("{example}");
|
|
||||||
assert_eq!(&encoded, &expected);
|
|
||||||
|
|
||||||
let decoded = ShardIndex::from_str(&encoded)?;
|
|
||||||
assert_eq!(example, decoded);
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn shard_index_binary_encoding() -> Result<(), hex::FromHexError> {
|
|
||||||
let example = ShardIndex {
|
|
||||||
shard_number: ShardNumber(13),
|
|
||||||
shard_count: ShardCount(17),
|
|
||||||
};
|
|
||||||
let expected: [u8; 2] = [0x0d, 0x11];
|
|
||||||
|
|
||||||
let encoded = bincode::serialize(&example).unwrap();
|
|
||||||
assert_eq!(Hex(&encoded), Hex(&expected));
|
|
||||||
let decoded = bincode::deserialize(&encoded).unwrap();
|
|
||||||
assert_eq!(example, decoded);
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -9,7 +9,8 @@ anyhow.workspace = true
|
|||||||
async-trait.workspace = true
|
async-trait.workspace = true
|
||||||
once_cell.workspace = true
|
once_cell.workspace = true
|
||||||
aws-smithy-async.workspace = true
|
aws-smithy-async.workspace = true
|
||||||
aws-smithy-types.workspace = true
|
aws-smithy-http.workspace = true
|
||||||
|
aws-types.workspace = true
|
||||||
aws-config.workspace = true
|
aws-config.workspace = true
|
||||||
aws-sdk-s3.workspace = true
|
aws-sdk-s3.workspace = true
|
||||||
aws-credential-types.workspace = true
|
aws-credential-types.workspace = true
|
||||||
|
|||||||
@@ -14,20 +14,18 @@ use aws_config::{
|
|||||||
provider_config::ProviderConfig,
|
provider_config::ProviderConfig,
|
||||||
retry::{RetryConfigBuilder, RetryMode},
|
retry::{RetryConfigBuilder, RetryMode},
|
||||||
web_identity_token::WebIdentityTokenCredentialsProvider,
|
web_identity_token::WebIdentityTokenCredentialsProvider,
|
||||||
BehaviorVersion,
|
|
||||||
};
|
};
|
||||||
use aws_credential_types::provider::SharedCredentialsProvider;
|
use aws_credential_types::cache::CredentialsCache;
|
||||||
use aws_sdk_s3::{
|
use aws_sdk_s3::{
|
||||||
config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
|
config::{AsyncSleep, Config, Region, SharedAsyncSleep},
|
||||||
error::SdkError,
|
error::SdkError,
|
||||||
operation::get_object::GetObjectError,
|
operation::get_object::GetObjectError,
|
||||||
|
primitives::ByteStream,
|
||||||
types::{Delete, ObjectIdentifier},
|
types::{Delete, ObjectIdentifier},
|
||||||
Client,
|
Client,
|
||||||
};
|
};
|
||||||
use aws_smithy_async::rt::sleep::TokioSleep;
|
use aws_smithy_async::rt::sleep::TokioSleep;
|
||||||
|
use aws_smithy_http::body::SdkBody;
|
||||||
use aws_smithy_types::body::SdkBody;
|
|
||||||
use aws_smithy_types::byte_stream::ByteStream;
|
|
||||||
use hyper::Body;
|
use hyper::Body;
|
||||||
use scopeguard::ScopeGuard;
|
use scopeguard::ScopeGuard;
|
||||||
use tokio::io::{self, AsyncRead};
|
use tokio::io::{self, AsyncRead};
|
||||||
@@ -80,6 +78,7 @@ impl S3Bucket {
|
|||||||
// needed to access remote extensions bucket
|
// needed to access remote extensions bucket
|
||||||
.or_else("token", {
|
.or_else("token", {
|
||||||
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
|
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
|
||||||
|
|
||||||
WebIdentityTokenCredentialsProvider::builder()
|
WebIdentityTokenCredentialsProvider::builder()
|
||||||
.configure(&provider_conf)
|
.configure(&provider_conf)
|
||||||
.build()
|
.build()
|
||||||
@@ -99,20 +98,18 @@ impl S3Bucket {
|
|||||||
.set_max_attempts(Some(1))
|
.set_max_attempts(Some(1))
|
||||||
.set_mode(Some(RetryMode::Adaptive));
|
.set_mode(Some(RetryMode::Adaptive));
|
||||||
|
|
||||||
let mut config_builder = Builder::default()
|
let mut config_builder = Config::builder()
|
||||||
.behavior_version(BehaviorVersion::v2023_11_09())
|
|
||||||
.region(region)
|
.region(region)
|
||||||
.identity_cache(IdentityCache::lazy().build())
|
.credentials_cache(CredentialsCache::lazy())
|
||||||
.credentials_provider(SharedCredentialsProvider::new(credentials_provider))
|
.credentials_provider(credentials_provider)
|
||||||
.retry_config(retry_config.build())
|
.sleep_impl(SharedAsyncSleep::from(sleep_impl))
|
||||||
.sleep_impl(SharedAsyncSleep::from(sleep_impl));
|
.retry_config(retry_config.build());
|
||||||
|
|
||||||
if let Some(custom_endpoint) = aws_config.endpoint.clone() {
|
if let Some(custom_endpoint) = aws_config.endpoint.clone() {
|
||||||
config_builder = config_builder
|
config_builder = config_builder
|
||||||
.endpoint_url(custom_endpoint)
|
.endpoint_url(custom_endpoint)
|
||||||
.force_path_style(true);
|
.force_path_style(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
let client = Client::from_conf(config_builder.build());
|
let client = Client::from_conf(config_builder.build());
|
||||||
|
|
||||||
let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
|
let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
|
||||||
@@ -374,7 +371,7 @@ impl RemoteStorage for S3Bucket {
|
|||||||
|
|
||||||
let response = response?;
|
let response = response?;
|
||||||
|
|
||||||
let keys = response.contents();
|
let keys = response.contents().unwrap_or_default();
|
||||||
let empty = Vec::new();
|
let empty = Vec::new();
|
||||||
let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);
|
let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);
|
||||||
|
|
||||||
@@ -414,7 +411,7 @@ impl RemoteStorage for S3Bucket {
|
|||||||
let started_at = start_measuring_requests(kind);
|
let started_at = start_measuring_requests(kind);
|
||||||
|
|
||||||
let body = Body::wrap_stream(ReaderStream::new(from));
|
let body = Body::wrap_stream(ReaderStream::new(from));
|
||||||
let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body));
|
let bytes_stream = ByteStream::new(SdkBody::from(body));
|
||||||
|
|
||||||
let res = self
|
let res = self
|
||||||
.client
|
.client
|
||||||
@@ -477,7 +474,7 @@ impl RemoteStorage for S3Bucket {
|
|||||||
for path in paths {
|
for path in paths {
|
||||||
let obj_id = ObjectIdentifier::builder()
|
let obj_id = ObjectIdentifier::builder()
|
||||||
.set_key(Some(self.relative_path_to_s3_object(path)))
|
.set_key(Some(self.relative_path_to_s3_object(path)))
|
||||||
.build()?;
|
.build();
|
||||||
delete_objects.push(obj_id);
|
delete_objects.push(obj_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -488,11 +485,7 @@ impl RemoteStorage for S3Bucket {
|
|||||||
.client
|
.client
|
||||||
.delete_objects()
|
.delete_objects()
|
||||||
.bucket(self.bucket_name.clone())
|
.bucket(self.bucket_name.clone())
|
||||||
.delete(
|
.delete(Delete::builder().set_objects(Some(chunk.to_vec())).build())
|
||||||
Delete::builder()
|
|
||||||
.set_objects(Some(chunk.to_vec()))
|
|
||||||
.build()?,
|
|
||||||
)
|
|
||||||
.send()
|
.send()
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
|
|||||||
@@ -281,7 +281,6 @@ fn ensure_logging_ready() {
|
|||||||
utils::logging::init(
|
utils::logging::init(
|
||||||
utils::logging::LogFormat::Test,
|
utils::logging::LogFormat::Test,
|
||||||
utils::logging::TracingErrorLayerEnablement::Disabled,
|
utils::logging::TracingErrorLayerEnablement::Disabled,
|
||||||
utils::logging::Output::Stdout,
|
|
||||||
)
|
)
|
||||||
.expect("logging init failed");
|
.expect("logging init failed");
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -210,7 +210,6 @@ fn ensure_logging_ready() {
|
|||||||
utils::logging::init(
|
utils::logging::init(
|
||||||
utils::logging::LogFormat::Test,
|
utils::logging::LogFormat::Test,
|
||||||
utils::logging::TracingErrorLayerEnablement::Disabled,
|
utils::logging::TracingErrorLayerEnablement::Disabled,
|
||||||
utils::logging::Output::Stdout,
|
|
||||||
)
|
)
|
||||||
.expect("logging init failed");
|
.expect("logging init failed");
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -1,21 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# like restore_from_wal.sh, but takes existing initdb.tar.zst
|
|
||||||
|
|
||||||
set -euxo pipefail
|
|
||||||
|
|
||||||
PG_BIN=$1
|
|
||||||
WAL_PATH=$2
|
|
||||||
DATA_DIR=$3
|
|
||||||
PORT=$4
|
|
||||||
echo "port=$PORT" >> "$DATA_DIR"/postgresql.conf
|
|
||||||
echo "shared_preload_libraries='\$libdir/neon_rmgr.so'" >> "$DATA_DIR"/postgresql.conf
|
|
||||||
REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-)
|
|
||||||
declare -i WAL_SIZE=$REDO_POS+114
|
|
||||||
"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" start
|
|
||||||
"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" stop -m immediate
|
|
||||||
cp "$DATA_DIR"/pg_wal/000000010000000000000001 .
|
|
||||||
cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/
|
|
||||||
for partial in "$DATA_DIR"/pg_wal/*.partial ; do mv "$partial" "${partial%.partial}" ; done
|
|
||||||
dd if=000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
|
|
||||||
rm -f 000000010000000000000001
|
|
||||||
@@ -66,17 +66,9 @@ pub enum TracingErrorLayerEnablement {
|
|||||||
EnableWithRustLogFilter,
|
EnableWithRustLogFilter,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Where the logging should output to.
|
|
||||||
#[derive(Clone, Copy)]
|
|
||||||
pub enum Output {
|
|
||||||
Stdout,
|
|
||||||
Stderr,
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn init(
|
pub fn init(
|
||||||
log_format: LogFormat,
|
log_format: LogFormat,
|
||||||
tracing_error_layer_enablement: TracingErrorLayerEnablement,
|
tracing_error_layer_enablement: TracingErrorLayerEnablement,
|
||||||
output: Output,
|
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
// We fall back to printing all spans at info-level or above if
|
// We fall back to printing all spans at info-level or above if
|
||||||
// the RUST_LOG environment variable is not set.
|
// the RUST_LOG environment variable is not set.
|
||||||
@@ -93,12 +85,7 @@ pub fn init(
|
|||||||
let log_layer = tracing_subscriber::fmt::layer()
|
let log_layer = tracing_subscriber::fmt::layer()
|
||||||
.with_target(false)
|
.with_target(false)
|
||||||
.with_ansi(false)
|
.with_ansi(false)
|
||||||
.with_writer(move || -> Box<dyn std::io::Write> {
|
.with_writer(std::io::stdout);
|
||||||
match output {
|
|
||||||
Output::Stdout => Box::new(std::io::stdout()),
|
|
||||||
Output::Stderr => Box::new(std::io::stderr()),
|
|
||||||
}
|
|
||||||
});
|
|
||||||
let log_layer = match log_format {
|
let log_layer = match log_format {
|
||||||
LogFormat::Json => log_layer.json().boxed(),
|
LogFormat::Json => log_layer.json().boxed(),
|
||||||
LogFormat::Plain => log_layer.boxed(),
|
LogFormat::Plain => log_layer.boxed(),
|
||||||
|
|||||||
@@ -51,7 +51,6 @@ regex.workspace = true
|
|||||||
scopeguard.workspace = true
|
scopeguard.workspace = true
|
||||||
serde.workspace = true
|
serde.workspace = true
|
||||||
serde_json = { workspace = true, features = ["raw_value"] }
|
serde_json = { workspace = true, features = ["raw_value"] }
|
||||||
serde_path_to_error.workspace = true
|
|
||||||
serde_with.workspace = true
|
serde_with.workspace = true
|
||||||
signal-hook.workspace = true
|
signal-hook.workspace = true
|
||||||
smallvec = { workspace = true, features = ["write"] }
|
smallvec = { workspace = true, features = ["write"] }
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ use pageserver::repository::Key;
|
|||||||
use pageserver::tenant::layer_map::LayerMap;
|
use pageserver::tenant::layer_map::LayerMap;
|
||||||
use pageserver::tenant::storage_layer::LayerFileName;
|
use pageserver::tenant::storage_layer::LayerFileName;
|
||||||
use pageserver::tenant::storage_layer::PersistentLayerDesc;
|
use pageserver::tenant::storage_layer::PersistentLayerDesc;
|
||||||
use pageserver_api::shard::TenantShardId;
|
|
||||||
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
|
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
|
||||||
use std::cmp::{max, min};
|
use std::cmp::{max, min};
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
@@ -212,7 +211,7 @@ fn bench_sequential(c: &mut Criterion) {
|
|||||||
let i32 = (i as u32) % 100;
|
let i32 = (i as u32) % 100;
|
||||||
let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
|
let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
|
||||||
let layer = PersistentLayerDesc::new_img(
|
let layer = PersistentLayerDesc::new_img(
|
||||||
TenantShardId::unsharded(TenantId::generate()),
|
TenantId::generate(),
|
||||||
TimelineId::generate(),
|
TimelineId::generate(),
|
||||||
zero.add(10 * i32)..zero.add(10 * i32 + 1),
|
zero.add(10 * i32)..zero.add(10 * i32 + 1),
|
||||||
Lsn(i),
|
Lsn(i),
|
||||||
|
|||||||
@@ -18,5 +18,3 @@ tokio.workspace = true
|
|||||||
utils.workspace = true
|
utils.workspace = true
|
||||||
svg_fmt.workspace = true
|
svg_fmt.workspace = true
|
||||||
workspace_hack.workspace = true
|
workspace_hack.workspace = true
|
||||||
serde.workspace = true
|
|
||||||
serde_json.workspace = true
|
|
||||||
|
|||||||
@@ -1,38 +0,0 @@
|
|||||||
use std::collections::HashMap;
|
|
||||||
|
|
||||||
use anyhow::Context;
|
|
||||||
use camino::Utf8PathBuf;
|
|
||||||
use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
|
|
||||||
use pageserver::tenant::storage_layer::LayerFileName;
|
|
||||||
use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
|
|
||||||
use utils::lsn::Lsn;
|
|
||||||
|
|
||||||
#[derive(clap::Subcommand)]
|
|
||||||
pub(crate) enum IndexPartCmd {
|
|
||||||
Dump { path: Utf8PathBuf },
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
|
|
||||||
match cmd {
|
|
||||||
IndexPartCmd::Dump { path } => {
|
|
||||||
let bytes = tokio::fs::read(path).await.context("read file")?;
|
|
||||||
let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
|
|
||||||
#[derive(serde::Serialize)]
|
|
||||||
struct Output<'a> {
|
|
||||||
layer_metadata: &'a HashMap<LayerFileName, IndexLayerMetadata>,
|
|
||||||
disk_consistent_lsn: Lsn,
|
|
||||||
timeline_metadata: &'a TimelineMetadata,
|
|
||||||
}
|
|
||||||
|
|
||||||
let output = Output {
|
|
||||||
layer_metadata: &des.layer_metadata,
|
|
||||||
disk_consistent_lsn: des.get_disk_consistent_lsn(),
|
|
||||||
timeline_metadata: &des.metadata,
|
|
||||||
};
|
|
||||||
|
|
||||||
let output = serde_json::to_string_pretty(&output).context("serialize output")?;
|
|
||||||
println!("{output}");
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,15 +1,13 @@
|
|||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use camino::{Utf8Path, Utf8PathBuf};
|
use camino::Utf8Path;
|
||||||
use clap::Subcommand;
|
use clap::Subcommand;
|
||||||
use pageserver::context::{DownloadBehavior, RequestContext};
|
use pageserver::context::{DownloadBehavior, RequestContext};
|
||||||
use pageserver::task_mgr::TaskKind;
|
use pageserver::task_mgr::TaskKind;
|
||||||
use pageserver::tenant::block_io::BlockCursor;
|
use pageserver::tenant::block_io::BlockCursor;
|
||||||
use pageserver::tenant::disk_btree::DiskBtreeReader;
|
use pageserver::tenant::disk_btree::DiskBtreeReader;
|
||||||
use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
|
use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
|
||||||
use pageserver::tenant::storage_layer::{delta_layer, image_layer};
|
|
||||||
use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer};
|
|
||||||
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||||
use pageserver::{page_cache, virtual_file};
|
use pageserver::{page_cache, virtual_file};
|
||||||
use pageserver::{
|
use pageserver::{
|
||||||
@@ -22,7 +20,6 @@ use pageserver::{
|
|||||||
};
|
};
|
||||||
use std::fs;
|
use std::fs;
|
||||||
use utils::bin_ser::BeSer;
|
use utils::bin_ser::BeSer;
|
||||||
use utils::id::{TenantId, TimelineId};
|
|
||||||
|
|
||||||
use crate::layer_map_analyzer::parse_filename;
|
use crate::layer_map_analyzer::parse_filename;
|
||||||
|
|
||||||
@@ -48,13 +45,6 @@ pub(crate) enum LayerCmd {
|
|||||||
/// The id from list-layer command
|
/// The id from list-layer command
|
||||||
id: usize,
|
id: usize,
|
||||||
},
|
},
|
||||||
RewriteSummary {
|
|
||||||
layer_file_path: Utf8PathBuf,
|
|
||||||
#[clap(long)]
|
|
||||||
new_tenant_id: Option<TenantId>,
|
|
||||||
#[clap(long)]
|
|
||||||
new_timeline_id: Option<TimelineId>,
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
|
async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
|
||||||
@@ -110,7 +100,6 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
|||||||
println!("- timeline {}", timeline.file_name().to_string_lossy());
|
println!("- timeline {}", timeline.file_name().to_string_lossy());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
LayerCmd::ListLayer {
|
LayerCmd::ListLayer {
|
||||||
path,
|
path,
|
||||||
@@ -139,7 +128,6 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
|||||||
idx += 1;
|
idx += 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
LayerCmd::DumpLayer {
|
LayerCmd::DumpLayer {
|
||||||
path,
|
path,
|
||||||
@@ -180,63 +168,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
|||||||
idx += 1;
|
idx += 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
LayerCmd::RewriteSummary {
|
|
||||||
layer_file_path,
|
|
||||||
new_tenant_id,
|
|
||||||
new_timeline_id,
|
|
||||||
} => {
|
|
||||||
pageserver::virtual_file::init(10);
|
|
||||||
pageserver::page_cache::init(100);
|
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
|
||||||
|
|
||||||
macro_rules! rewrite_closure {
|
|
||||||
($($summary_ty:tt)*) => {{
|
|
||||||
|summary| $($summary_ty)* {
|
|
||||||
tenant_id: new_tenant_id.unwrap_or(summary.tenant_id),
|
|
||||||
timeline_id: new_timeline_id.unwrap_or(summary.timeline_id),
|
|
||||||
..summary
|
|
||||||
}
|
|
||||||
}};
|
|
||||||
}
|
|
||||||
|
|
||||||
let res = ImageLayer::rewrite_summary(
|
|
||||||
layer_file_path,
|
|
||||||
rewrite_closure!(image_layer::Summary),
|
|
||||||
&ctx,
|
|
||||||
)
|
|
||||||
.await;
|
|
||||||
match res {
|
|
||||||
Ok(()) => {
|
|
||||||
println!("Successfully rewrote summary of image layer {layer_file_path}");
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
Err(image_layer::RewriteSummaryError::MagicMismatch) => (), // fallthrough
|
|
||||||
Err(image_layer::RewriteSummaryError::Other(e)) => {
|
|
||||||
return Err(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let res = DeltaLayer::rewrite_summary(
|
|
||||||
layer_file_path,
|
|
||||||
rewrite_closure!(delta_layer::Summary),
|
|
||||||
&ctx,
|
|
||||||
)
|
|
||||||
.await;
|
|
||||||
match res {
|
|
||||||
Ok(()) => {
|
|
||||||
println!("Successfully rewrote summary of delta layer {layer_file_path}");
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
Err(delta_layer::RewriteSummaryError::MagicMismatch) => (), // fallthrough
|
|
||||||
Err(delta_layer::RewriteSummaryError::Other(e)) => {
|
|
||||||
return Err(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
anyhow::bail!("not an image or delta layer: {layer_file_path}");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,13 +5,11 @@
|
|||||||
//! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.
|
//! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.
|
||||||
|
|
||||||
mod draw_timeline_dir;
|
mod draw_timeline_dir;
|
||||||
mod index_part;
|
|
||||||
mod layer_map_analyzer;
|
mod layer_map_analyzer;
|
||||||
mod layers;
|
mod layers;
|
||||||
|
|
||||||
use camino::{Utf8Path, Utf8PathBuf};
|
use camino::{Utf8Path, Utf8PathBuf};
|
||||||
use clap::{Parser, Subcommand};
|
use clap::{Parser, Subcommand};
|
||||||
use index_part::IndexPartCmd;
|
|
||||||
use layers::LayerCmd;
|
use layers::LayerCmd;
|
||||||
use pageserver::{
|
use pageserver::{
|
||||||
context::{DownloadBehavior, RequestContext},
|
context::{DownloadBehavior, RequestContext},
|
||||||
@@ -40,8 +38,6 @@ struct CliOpts {
|
|||||||
#[derive(Subcommand)]
|
#[derive(Subcommand)]
|
||||||
enum Commands {
|
enum Commands {
|
||||||
Metadata(MetadataCmd),
|
Metadata(MetadataCmd),
|
||||||
#[command(subcommand)]
|
|
||||||
IndexPart(IndexPartCmd),
|
|
||||||
PrintLayerFile(PrintLayerFileCmd),
|
PrintLayerFile(PrintLayerFileCmd),
|
||||||
DrawTimeline {},
|
DrawTimeline {},
|
||||||
AnalyzeLayerMap(AnalyzeLayerMapCmd),
|
AnalyzeLayerMap(AnalyzeLayerMapCmd),
|
||||||
@@ -87,9 +83,6 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
Commands::Metadata(cmd) => {
|
Commands::Metadata(cmd) => {
|
||||||
handle_metadata(&cmd)?;
|
handle_metadata(&cmd)?;
|
||||||
}
|
}
|
||||||
Commands::IndexPart(cmd) => {
|
|
||||||
index_part::main(&cmd).await?;
|
|
||||||
}
|
|
||||||
Commands::DrawTimeline {} => {
|
Commands::DrawTimeline {} => {
|
||||||
draw_timeline_dir::main()?;
|
draw_timeline_dir::main()?;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -103,11 +103,7 @@ fn main() -> anyhow::Result<()> {
|
|||||||
} else {
|
} else {
|
||||||
TracingErrorLayerEnablement::Disabled
|
TracingErrorLayerEnablement::Disabled
|
||||||
};
|
};
|
||||||
logging::init(
|
logging::init(conf.log_format, tracing_error_layer_enablement)?;
|
||||||
conf.log_format,
|
|
||||||
tracing_error_layer_enablement,
|
|
||||||
logging::Output::Stdout,
|
|
||||||
)?;
|
|
||||||
|
|
||||||
// mind the order required here: 1. logging, 2. panic_hook, 3. sentry.
|
// mind the order required here: 1. logging, 2. panic_hook, 3. sentry.
|
||||||
// disarming this hook on pageserver, because we never tear down tracing.
|
// disarming this hook on pageserver, because we never tear down tracing.
|
||||||
@@ -625,7 +621,6 @@ fn start_pageserver(
|
|||||||
conf.synthetic_size_calculation_interval,
|
conf.synthetic_size_calculation_interval,
|
||||||
conf.id,
|
conf.id,
|
||||||
local_disk_storage,
|
local_disk_storage,
|
||||||
cancel,
|
|
||||||
metrics_ctx,
|
metrics_ctx,
|
||||||
)
|
)
|
||||||
.instrument(info_span!("metrics_collection"))
|
.instrument(info_span!("metrics_collection"))
|
||||||
|
|||||||
@@ -5,7 +5,6 @@
|
|||||||
//! See also `settings.md` for better description on every parameter.
|
//! See also `settings.md` for better description on every parameter.
|
||||||
|
|
||||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||||
use pageserver_api::shard::TenantShardId;
|
|
||||||
use remote_storage::{RemotePath, RemoteStorageConfig};
|
use remote_storage::{RemotePath, RemoteStorageConfig};
|
||||||
use serde::de::IntoDeserializer;
|
use serde::de::IntoDeserializer;
|
||||||
use std::env;
|
use std::env;
|
||||||
@@ -26,7 +25,7 @@ use toml_edit::{Document, Item};
|
|||||||
use camino::{Utf8Path, Utf8PathBuf};
|
use camino::{Utf8Path, Utf8PathBuf};
|
||||||
use postgres_backend::AuthType;
|
use postgres_backend::AuthType;
|
||||||
use utils::{
|
use utils::{
|
||||||
id::{NodeId, TimelineId},
|
id::{NodeId, TenantId, TimelineId},
|
||||||
logging::LogFormat,
|
logging::LogFormat,
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -629,13 +628,12 @@ impl PageServerConf {
|
|||||||
self.deletion_prefix().join(format!("header-{VERSION:02x}"))
|
self.deletion_prefix().join(format!("header-{VERSION:02x}"))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn tenant_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
|
pub fn tenant_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
|
||||||
self.tenants_path().join(tenant_shard_id.to_string())
|
self.tenants_path().join(tenant_id.to_string())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn tenant_ignore_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
|
pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
|
||||||
self.tenant_path(tenant_shard_id)
|
self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME)
|
||||||
.join(IGNORED_TENANT_FILE_NAME)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Points to a place in pageserver's local directory,
|
/// Points to a place in pageserver's local directory,
|
||||||
@@ -643,53 +641,47 @@ impl PageServerConf {
|
|||||||
///
|
///
|
||||||
/// Legacy: superseded by tenant_location_config_path. Eventually
|
/// Legacy: superseded by tenant_location_config_path. Eventually
|
||||||
/// remove this function.
|
/// remove this function.
|
||||||
pub fn tenant_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
|
pub fn tenant_config_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
|
||||||
self.tenant_path(tenant_shard_id).join(TENANT_CONFIG_NAME)
|
self.tenant_path(tenant_id).join(TENANT_CONFIG_NAME)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn tenant_location_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
|
pub fn tenant_location_config_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
|
||||||
self.tenant_path(tenant_shard_id)
|
self.tenant_path(tenant_id)
|
||||||
.join(TENANT_LOCATION_CONFIG_NAME)
|
.join(TENANT_LOCATION_CONFIG_NAME)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn timelines_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
|
pub fn timelines_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
|
||||||
self.tenant_path(tenant_shard_id)
|
self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME)
|
||||||
.join(TIMELINES_SEGMENT_NAME)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn timeline_path(
|
pub fn timeline_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> Utf8PathBuf {
|
||||||
&self,
|
self.timelines_path(tenant_id).join(timeline_id.to_string())
|
||||||
tenant_shard_id: &TenantShardId,
|
|
||||||
timeline_id: &TimelineId,
|
|
||||||
) -> Utf8PathBuf {
|
|
||||||
self.timelines_path(tenant_shard_id)
|
|
||||||
.join(timeline_id.to_string())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn timeline_uninit_mark_file_path(
|
pub fn timeline_uninit_mark_file_path(
|
||||||
&self,
|
&self,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_id: TenantId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
) -> Utf8PathBuf {
|
) -> Utf8PathBuf {
|
||||||
path_with_suffix_extension(
|
path_with_suffix_extension(
|
||||||
self.timeline_path(&tenant_shard_id, &timeline_id),
|
self.timeline_path(&tenant_id, &timeline_id),
|
||||||
TIMELINE_UNINIT_MARK_SUFFIX,
|
TIMELINE_UNINIT_MARK_SUFFIX,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn timeline_delete_mark_file_path(
|
pub fn timeline_delete_mark_file_path(
|
||||||
&self,
|
&self,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_id: TenantId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
) -> Utf8PathBuf {
|
) -> Utf8PathBuf {
|
||||||
path_with_suffix_extension(
|
path_with_suffix_extension(
|
||||||
self.timeline_path(&tenant_shard_id, &timeline_id),
|
self.timeline_path(&tenant_id, &timeline_id),
|
||||||
TIMELINE_DELETE_MARK_SUFFIX,
|
TIMELINE_DELETE_MARK_SUFFIX,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn tenant_deleted_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
|
pub fn tenant_deleted_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
|
||||||
self.tenant_path(tenant_shard_id)
|
self.tenant_path(tenant_id)
|
||||||
.join(TENANT_DELETED_MARKER_FILE_NAME)
|
.join(TENANT_DELETED_MARKER_FILE_NAME)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -699,24 +691,20 @@ impl PageServerConf {
|
|||||||
|
|
||||||
pub fn trace_path(
|
pub fn trace_path(
|
||||||
&self,
|
&self,
|
||||||
tenant_shard_id: &TenantShardId,
|
tenant_id: &TenantId,
|
||||||
timeline_id: &TimelineId,
|
timeline_id: &TimelineId,
|
||||||
connection_id: &ConnectionId,
|
connection_id: &ConnectionId,
|
||||||
) -> Utf8PathBuf {
|
) -> Utf8PathBuf {
|
||||||
self.traces_path()
|
self.traces_path()
|
||||||
.join(tenant_shard_id.to_string())
|
.join(tenant_id.to_string())
|
||||||
.join(timeline_id.to_string())
|
.join(timeline_id.to_string())
|
||||||
.join(connection_id.to_string())
|
.join(connection_id.to_string())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Points to a place in pageserver's local directory,
|
/// Points to a place in pageserver's local directory,
|
||||||
/// where certain timeline's metadata file should be located.
|
/// where certain timeline's metadata file should be located.
|
||||||
pub fn metadata_path(
|
pub fn metadata_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> Utf8PathBuf {
|
||||||
&self,
|
self.timeline_path(tenant_id, timeline_id)
|
||||||
tenant_shard_id: &TenantShardId,
|
|
||||||
timeline_id: &TimelineId,
|
|
||||||
) -> Utf8PathBuf {
|
|
||||||
self.timeline_path(tenant_shard_id, timeline_id)
|
|
||||||
.join(METADATA_FILE_NAME)
|
.join(METADATA_FILE_NAME)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -779,7 +767,7 @@ impl PageServerConf {
|
|||||||
builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?)
|
builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?)
|
||||||
}
|
}
|
||||||
"tenant_config" => {
|
"tenant_config" => {
|
||||||
t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
|
t_conf = Self::parse_toml_tenant_conf(item)?;
|
||||||
}
|
}
|
||||||
"id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
|
"id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
|
||||||
"broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
|
"broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
|
||||||
@@ -853,10 +841,114 @@ impl PageServerConf {
|
|||||||
Ok(conf)
|
Ok(conf)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// subroutine of parse_and_validate to parse `[tenant_conf]` section
|
||||||
|
|
||||||
|
pub fn parse_toml_tenant_conf(item: &toml_edit::Item) -> Result<TenantConfOpt> {
|
||||||
|
let mut t_conf: TenantConfOpt = Default::default();
|
||||||
|
if let Some(checkpoint_distance) = item.get("checkpoint_distance") {
|
||||||
|
t_conf.checkpoint_distance =
|
||||||
|
Some(parse_toml_u64("checkpoint_distance", checkpoint_distance)?);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(checkpoint_timeout) = item.get("checkpoint_timeout") {
|
||||||
|
t_conf.checkpoint_timeout = Some(parse_toml_duration(
|
||||||
|
"checkpoint_timeout",
|
||||||
|
checkpoint_timeout,
|
||||||
|
)?);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(compaction_target_size) = item.get("compaction_target_size") {
|
||||||
|
t_conf.compaction_target_size = Some(parse_toml_u64(
|
||||||
|
"compaction_target_size",
|
||||||
|
compaction_target_size,
|
||||||
|
)?);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(compaction_period) = item.get("compaction_period") {
|
||||||
|
t_conf.compaction_period =
|
||||||
|
Some(parse_toml_duration("compaction_period", compaction_period)?);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(compaction_threshold) = item.get("compaction_threshold") {
|
||||||
|
t_conf.compaction_threshold =
|
||||||
|
Some(parse_toml_u64("compaction_threshold", compaction_threshold)?.try_into()?);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(image_creation_threshold) = item.get("image_creation_threshold") {
|
||||||
|
t_conf.image_creation_threshold = Some(
|
||||||
|
parse_toml_u64("image_creation_threshold", image_creation_threshold)?.try_into()?,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(gc_horizon) = item.get("gc_horizon") {
|
||||||
|
t_conf.gc_horizon = Some(parse_toml_u64("gc_horizon", gc_horizon)?);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(gc_period) = item.get("gc_period") {
|
||||||
|
t_conf.gc_period = Some(parse_toml_duration("gc_period", gc_period)?);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(pitr_interval) = item.get("pitr_interval") {
|
||||||
|
t_conf.pitr_interval = Some(parse_toml_duration("pitr_interval", pitr_interval)?);
|
||||||
|
}
|
||||||
|
if let Some(walreceiver_connect_timeout) = item.get("walreceiver_connect_timeout") {
|
||||||
|
t_conf.walreceiver_connect_timeout = Some(parse_toml_duration(
|
||||||
|
"walreceiver_connect_timeout",
|
||||||
|
walreceiver_connect_timeout,
|
||||||
|
)?);
|
||||||
|
}
|
||||||
|
if let Some(lagging_wal_timeout) = item.get("lagging_wal_timeout") {
|
||||||
|
t_conf.lagging_wal_timeout = Some(parse_toml_duration(
|
||||||
|
"lagging_wal_timeout",
|
||||||
|
lagging_wal_timeout,
|
||||||
|
)?);
|
||||||
|
}
|
||||||
|
if let Some(max_lsn_wal_lag) = item.get("max_lsn_wal_lag") {
|
||||||
|
t_conf.max_lsn_wal_lag =
|
||||||
|
Some(deserialize_from_item("max_lsn_wal_lag", max_lsn_wal_lag)?);
|
||||||
|
}
|
||||||
|
if let Some(trace_read_requests) = item.get("trace_read_requests") {
|
||||||
|
t_conf.trace_read_requests =
|
||||||
|
Some(trace_read_requests.as_bool().with_context(|| {
|
||||||
|
"configure option trace_read_requests is not a bool".to_string()
|
||||||
|
})?);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(eviction_policy) = item.get("eviction_policy") {
|
||||||
|
t_conf.eviction_policy = Some(
|
||||||
|
deserialize_from_item("eviction_policy", eviction_policy)
|
||||||
|
.context("parse eviction_policy")?,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(item) = item.get("min_resident_size_override") {
|
||||||
|
t_conf.min_resident_size_override = Some(
|
||||||
|
deserialize_from_item("min_resident_size_override", item)
|
||||||
|
.context("parse min_resident_size_override")?,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(item) = item.get("evictions_low_residence_duration_metric_threshold") {
|
||||||
|
t_conf.evictions_low_residence_duration_metric_threshold = Some(parse_toml_duration(
|
||||||
|
"evictions_low_residence_duration_metric_threshold",
|
||||||
|
item,
|
||||||
|
)?);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(gc_feedback) = item.get("gc_feedback") {
|
||||||
|
t_conf.gc_feedback = Some(
|
||||||
|
gc_feedback
|
||||||
|
.as_bool()
|
||||||
|
.with_context(|| "configure option gc_feedback is not a bool".to_string())?,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(t_conf)
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf {
|
pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf {
|
||||||
let test_output_dir = std::env::var("TEST_OUTPUT").unwrap_or("../tmp_check".into());
|
Utf8PathBuf::from(format!("../tmp_check/test_{test_name}"))
|
||||||
Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}"))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
|
pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
|
||||||
@@ -1325,37 +1417,6 @@ trace_read_requests = {trace_read_requests}"#,
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn parse_incorrect_tenant_config() -> anyhow::Result<()> {
|
|
||||||
let config_string = r#"
|
|
||||||
[tenant_config]
|
|
||||||
checkpoint_distance = -1 # supposed to be an u64
|
|
||||||
"#
|
|
||||||
.to_string();
|
|
||||||
|
|
||||||
let toml: Document = config_string.parse()?;
|
|
||||||
let item = toml.get("tenant_config").unwrap();
|
|
||||||
let error = TenantConfOpt::try_from(item.to_owned()).unwrap_err();
|
|
||||||
|
|
||||||
let expected_error_str = "checkpoint_distance: invalid value: integer `-1`, expected u64";
|
|
||||||
assert_eq!(error.to_string(), expected_error_str);
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn parse_override_tenant_config() -> anyhow::Result<()> {
|
|
||||||
let config_string = r#"tenant_config={ min_resident_size_override = 400 }"#.to_string();
|
|
||||||
|
|
||||||
let toml: Document = config_string.parse()?;
|
|
||||||
let item = toml.get("tenant_config").unwrap();
|
|
||||||
let conf = TenantConfOpt::try_from(item.to_owned()).unwrap();
|
|
||||||
|
|
||||||
assert_eq!(conf.min_resident_size_override, Some(400));
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn eviction_pageserver_config_parse() -> anyhow::Result<()> {
|
fn eviction_pageserver_config_parse() -> anyhow::Result<()> {
|
||||||
let tempdir = tempdir()?;
|
let tempdir = tempdir()?;
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
use crate::context::{DownloadBehavior, RequestContext};
|
use crate::context::{DownloadBehavior, RequestContext};
|
||||||
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
|
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
|
||||||
use crate::tenant::tasks::BackgroundLoopKind;
|
use crate::tenant::tasks::BackgroundLoopKind;
|
||||||
use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError};
|
use crate::tenant::{mgr, LogicalSizeCalculationCause};
|
||||||
use camino::Utf8PathBuf;
|
use camino::Utf8PathBuf;
|
||||||
use consumption_metrics::EventType;
|
use consumption_metrics::EventType;
|
||||||
use pageserver_api::models::TenantState;
|
use pageserver_api::models::TenantState;
|
||||||
@@ -12,7 +12,6 @@ use std::collections::HashMap;
|
|||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::{Duration, SystemTime};
|
use std::time::{Duration, SystemTime};
|
||||||
use tokio::time::Instant;
|
use tokio::time::Instant;
|
||||||
use tokio_util::sync::CancellationToken;
|
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
use utils::id::NodeId;
|
use utils::id::NodeId;
|
||||||
|
|
||||||
@@ -38,7 +37,6 @@ type RawMetric = (MetricsKey, (EventType, u64));
|
|||||||
type Cache = HashMap<MetricsKey, (EventType, u64)>;
|
type Cache = HashMap<MetricsKey, (EventType, u64)>;
|
||||||
|
|
||||||
/// Main thread that serves metrics collection
|
/// Main thread that serves metrics collection
|
||||||
#[allow(clippy::too_many_arguments)]
|
|
||||||
pub async fn collect_metrics(
|
pub async fn collect_metrics(
|
||||||
metric_collection_endpoint: &Url,
|
metric_collection_endpoint: &Url,
|
||||||
metric_collection_interval: Duration,
|
metric_collection_interval: Duration,
|
||||||
@@ -46,7 +44,6 @@ pub async fn collect_metrics(
|
|||||||
synthetic_size_calculation_interval: Duration,
|
synthetic_size_calculation_interval: Duration,
|
||||||
node_id: NodeId,
|
node_id: NodeId,
|
||||||
local_disk_storage: Utf8PathBuf,
|
local_disk_storage: Utf8PathBuf,
|
||||||
cancel: CancellationToken,
|
|
||||||
ctx: RequestContext,
|
ctx: RequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
if _cached_metric_collection_interval != Duration::ZERO {
|
if _cached_metric_collection_interval != Duration::ZERO {
|
||||||
@@ -66,13 +63,9 @@ pub async fn collect_metrics(
|
|||||||
"synthetic size calculation",
|
"synthetic size calculation",
|
||||||
false,
|
false,
|
||||||
async move {
|
async move {
|
||||||
calculate_synthetic_size_worker(
|
calculate_synthetic_size_worker(synthetic_size_calculation_interval, &worker_ctx)
|
||||||
synthetic_size_calculation_interval,
|
.instrument(info_span!("synthetic_size_worker"))
|
||||||
&cancel,
|
.await?;
|
||||||
&worker_ctx,
|
|
||||||
)
|
|
||||||
.instrument(info_span!("synthetic_size_worker"))
|
|
||||||
.await?;
|
|
||||||
Ok(())
|
Ok(())
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
@@ -248,7 +241,6 @@ async fn reschedule(
|
|||||||
/// Caclculate synthetic size for each active tenant
|
/// Caclculate synthetic size for each active tenant
|
||||||
async fn calculate_synthetic_size_worker(
|
async fn calculate_synthetic_size_worker(
|
||||||
synthetic_size_calculation_interval: Duration,
|
synthetic_size_calculation_interval: Duration,
|
||||||
cancel: &CancellationToken,
|
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
info!("starting calculate_synthetic_size_worker");
|
info!("starting calculate_synthetic_size_worker");
|
||||||
@@ -280,12 +272,7 @@ async fn calculate_synthetic_size_worker(
|
|||||||
// Same for the loop that fetches computed metrics.
|
// Same for the loop that fetches computed metrics.
|
||||||
// By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
|
// By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
|
||||||
// which turns out is really handy to understand the system.
|
// which turns out is really handy to understand the system.
|
||||||
if let Err(e) = tenant.calculate_synthetic_size(cause, cancel, ctx).await {
|
if let Err(e) = tenant.calculate_synthetic_size(cause, ctx).await {
|
||||||
if let Some(PageReconstructError::Cancelled) =
|
|
||||||
e.downcast_ref::<PageReconstructError>()
|
|
||||||
{
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
|
error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize};
|
use crate::context::RequestContext;
|
||||||
|
use anyhow::Context;
|
||||||
use chrono::{DateTime, Utc};
|
use chrono::{DateTime, Utc};
|
||||||
use consumption_metrics::EventType;
|
use consumption_metrics::EventType;
|
||||||
use futures::stream::StreamExt;
|
use futures::stream::StreamExt;
|
||||||
@@ -350,12 +351,14 @@ impl TimelineSnapshot {
|
|||||||
let last_record_lsn = t.get_last_record_lsn();
|
let last_record_lsn = t.get_last_record_lsn();
|
||||||
|
|
||||||
let current_exact_logical_size = {
|
let current_exact_logical_size = {
|
||||||
let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_shard_id.tenant_id, timeline_id = %t.timeline_id);
|
let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
|
||||||
let size = span.in_scope(|| t.get_current_logical_size(ctx));
|
let res = span
|
||||||
match size {
|
.in_scope(|| t.get_current_logical_size(ctx))
|
||||||
|
.context("get_current_logical_size");
|
||||||
|
match res? {
|
||||||
// Only send timeline logical size when it is fully calculated.
|
// Only send timeline logical size when it is fully calculated.
|
||||||
CurrentLogicalSize::Exact(ref size) => Some(size.into()),
|
(size, is_exact) if is_exact => Some(size),
|
||||||
CurrentLogicalSize::Approximate(_) => None,
|
(_, _) => None,
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -1,15 +1,16 @@
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
|
||||||
use pageserver_api::{
|
use pageserver_api::control_api::{
|
||||||
control_api::{
|
ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
|
||||||
ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
|
|
||||||
},
|
|
||||||
shard::TenantShardId,
|
|
||||||
};
|
};
|
||||||
use serde::{de::DeserializeOwned, Serialize};
|
use serde::{de::DeserializeOwned, Serialize};
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
use url::Url;
|
use url::Url;
|
||||||
use utils::{backoff, generation::Generation, id::NodeId};
|
use utils::{
|
||||||
|
backoff,
|
||||||
|
generation::Generation,
|
||||||
|
id::{NodeId, TenantId},
|
||||||
|
};
|
||||||
|
|
||||||
use crate::config::PageServerConf;
|
use crate::config::PageServerConf;
|
||||||
|
|
||||||
@@ -30,11 +31,11 @@ pub enum RetryForeverError {
|
|||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
pub trait ControlPlaneGenerationsApi {
|
pub trait ControlPlaneGenerationsApi {
|
||||||
async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError>;
|
async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError>;
|
||||||
async fn validate(
|
async fn validate(
|
||||||
&self,
|
&self,
|
||||||
tenants: Vec<(TenantShardId, Generation)>,
|
tenants: Vec<(TenantId, Generation)>,
|
||||||
) -> Result<HashMap<TenantShardId, bool>, RetryForeverError>;
|
) -> Result<HashMap<TenantId, bool>, RetryForeverError>;
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ControlPlaneClient {
|
impl ControlPlaneClient {
|
||||||
@@ -126,7 +127,7 @@ impl ControlPlaneClient {
|
|||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||||
/// Block until we get a successful response, or error out if we are shut down
|
/// Block until we get a successful response, or error out if we are shut down
|
||||||
async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
|
async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError> {
|
||||||
let re_attach_path = self
|
let re_attach_path = self
|
||||||
.base_url
|
.base_url
|
||||||
.join("re-attach")
|
.join("re-attach")
|
||||||
@@ -153,8 +154,8 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
|||||||
/// Block until we get a successful response, or error out if we are shut down
|
/// Block until we get a successful response, or error out if we are shut down
|
||||||
async fn validate(
|
async fn validate(
|
||||||
&self,
|
&self,
|
||||||
tenants: Vec<(TenantShardId, Generation)>,
|
tenants: Vec<(TenantId, Generation)>,
|
||||||
) -> Result<HashMap<TenantShardId, bool>, RetryForeverError> {
|
) -> Result<HashMap<TenantId, bool>, RetryForeverError> {
|
||||||
let re_attach_path = self
|
let re_attach_path = self
|
||||||
.base_url
|
.base_url
|
||||||
.join("validate")
|
.join("validate")
|
||||||
|
|||||||
@@ -10,12 +10,11 @@ use crate::control_plane_client::ControlPlaneGenerationsApi;
|
|||||||
use crate::metrics;
|
use crate::metrics;
|
||||||
use crate::tenant::remote_timeline_client::remote_layer_path;
|
use crate::tenant::remote_timeline_client::remote_layer_path;
|
||||||
use crate::tenant::remote_timeline_client::remote_timeline_path;
|
use crate::tenant::remote_timeline_client::remote_timeline_path;
|
||||||
use crate::tenant::remote_timeline_client::LayerFileMetadata;
|
|
||||||
use crate::virtual_file::MaybeFatalIo;
|
use crate::virtual_file::MaybeFatalIo;
|
||||||
use crate::virtual_file::VirtualFile;
|
use crate::virtual_file::VirtualFile;
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use camino::Utf8PathBuf;
|
use camino::Utf8PathBuf;
|
||||||
use pageserver_api::shard::TenantShardId;
|
use hex::FromHex;
|
||||||
use remote_storage::{GenericRemoteStorage, RemotePath};
|
use remote_storage::{GenericRemoteStorage, RemotePath};
|
||||||
use serde::Deserialize;
|
use serde::Deserialize;
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
@@ -26,7 +25,7 @@ use tracing::Instrument;
|
|||||||
use tracing::{self, debug, error};
|
use tracing::{self, debug, error};
|
||||||
use utils::crashsafe::path_with_suffix_extension;
|
use utils::crashsafe::path_with_suffix_extension;
|
||||||
use utils::generation::Generation;
|
use utils::generation::Generation;
|
||||||
use utils::id::TimelineId;
|
use utils::id::{TenantId, TimelineId};
|
||||||
use utils::lsn::AtomicLsn;
|
use utils::lsn::AtomicLsn;
|
||||||
use utils::lsn::Lsn;
|
use utils::lsn::Lsn;
|
||||||
|
|
||||||
@@ -160,10 +159,11 @@ pub struct DeletionQueueClient {
|
|||||||
lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
|
lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
|
#[derive(Debug, Serialize, Deserialize)]
|
||||||
struct TenantDeletionList {
|
struct TenantDeletionList {
|
||||||
/// For each Timeline, a list of key fragments to append to the timeline remote path
|
/// For each Timeline, a list of key fragments to append to the timeline remote path
|
||||||
/// when reconstructing a full key
|
/// when reconstructing a full key
|
||||||
|
#[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
|
||||||
timelines: HashMap<TimelineId, Vec<String>>,
|
timelines: HashMap<TimelineId, Vec<String>>,
|
||||||
|
|
||||||
/// The generation in which this deletion was emitted: note that this may not be the
|
/// The generation in which this deletion was emitted: note that this may not be the
|
||||||
@@ -178,11 +178,43 @@ impl TenantDeletionList {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// For HashMaps using a `hex` compatible key, where we would like to encode the key as a string
|
||||||
|
fn to_hex_map<S, V, I>(input: &HashMap<I, V>, serializer: S) -> Result<S::Ok, S::Error>
|
||||||
|
where
|
||||||
|
S: serde::Serializer,
|
||||||
|
V: Serialize,
|
||||||
|
I: AsRef<[u8]>,
|
||||||
|
{
|
||||||
|
let transformed = input.iter().map(|(k, v)| (hex::encode(k), v));
|
||||||
|
|
||||||
|
transformed
|
||||||
|
.collect::<HashMap<String, &V>>()
|
||||||
|
.serialize(serializer)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// For HashMaps using a FromHex key, where we would like to decode the key
|
||||||
|
fn from_hex_map<'de, D, V, I>(deserializer: D) -> Result<HashMap<I, V>, D::Error>
|
||||||
|
where
|
||||||
|
D: serde::de::Deserializer<'de>,
|
||||||
|
V: Deserialize<'de>,
|
||||||
|
I: FromHex + std::hash::Hash + Eq,
|
||||||
|
{
|
||||||
|
let hex_map = HashMap::<String, V>::deserialize(deserializer)?;
|
||||||
|
hex_map
|
||||||
|
.into_iter()
|
||||||
|
.map(|(k, v)| {
|
||||||
|
I::from_hex(k)
|
||||||
|
.map(|k| (k, v))
|
||||||
|
.map_err(|_| serde::de::Error::custom("Invalid hex ID"))
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
/// Files ending with this suffix will be ignored and erased
|
/// Files ending with this suffix will be ignored and erased
|
||||||
/// during recovery as startup.
|
/// during recovery as startup.
|
||||||
const TEMP_SUFFIX: &str = "tmp";
|
const TEMP_SUFFIX: &str = "tmp";
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
|
#[derive(Debug, Serialize, Deserialize)]
|
||||||
struct DeletionList {
|
struct DeletionList {
|
||||||
/// Serialization version, for future use
|
/// Serialization version, for future use
|
||||||
version: u8,
|
version: u8,
|
||||||
@@ -194,7 +226,8 @@ struct DeletionList {
|
|||||||
/// nested HashMaps by TenantTimelineID. Each Tenant only appears once
|
/// nested HashMaps by TenantTimelineID. Each Tenant only appears once
|
||||||
/// with one unique generation ID: if someone tries to push a second generation
|
/// with one unique generation ID: if someone tries to push a second generation
|
||||||
/// ID for the same tenant, we will start a new DeletionList.
|
/// ID for the same tenant, we will start a new DeletionList.
|
||||||
tenants: HashMap<TenantShardId, TenantDeletionList>,
|
#[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
|
||||||
|
tenants: HashMap<TenantId, TenantDeletionList>,
|
||||||
|
|
||||||
/// Avoid having to walk `tenants` to calculate the number of keys in
|
/// Avoid having to walk `tenants` to calculate the number of keys in
|
||||||
/// the nested deletion lists
|
/// the nested deletion lists
|
||||||
@@ -266,7 +299,7 @@ impl DeletionList {
|
|||||||
/// deletion list.
|
/// deletion list.
|
||||||
fn push(
|
fn push(
|
||||||
&mut self,
|
&mut self,
|
||||||
tenant: &TenantShardId,
|
tenant: &TenantId,
|
||||||
timeline: &TimelineId,
|
timeline: &TimelineId,
|
||||||
generation: Generation,
|
generation: Generation,
|
||||||
objects: &mut Vec<RemotePath>,
|
objects: &mut Vec<RemotePath>,
|
||||||
@@ -358,7 +391,7 @@ struct TenantLsnState {
|
|||||||
|
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
struct VisibleLsnUpdates {
|
struct VisibleLsnUpdates {
|
||||||
tenants: HashMap<TenantShardId, TenantLsnState>,
|
tenants: HashMap<TenantId, TenantLsnState>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl VisibleLsnUpdates {
|
impl VisibleLsnUpdates {
|
||||||
@@ -415,7 +448,7 @@ impl DeletionQueueClient {
|
|||||||
|
|
||||||
pub(crate) fn recover(
|
pub(crate) fn recover(
|
||||||
&self,
|
&self,
|
||||||
attached_tenants: HashMap<TenantShardId, Generation>,
|
attached_tenants: HashMap<TenantId, Generation>,
|
||||||
) -> Result<(), DeletionQueueError> {
|
) -> Result<(), DeletionQueueError> {
|
||||||
self.do_push(
|
self.do_push(
|
||||||
&self.tx,
|
&self.tx,
|
||||||
@@ -432,7 +465,7 @@ impl DeletionQueueClient {
|
|||||||
/// backend will later wake up and notice that the tenant's generation requires validation.
|
/// backend will later wake up and notice that the tenant's generation requires validation.
|
||||||
pub(crate) async fn update_remote_consistent_lsn(
|
pub(crate) async fn update_remote_consistent_lsn(
|
||||||
&self,
|
&self,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_id: TenantId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
current_generation: Generation,
|
current_generation: Generation,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
@@ -443,13 +476,10 @@ impl DeletionQueueClient {
|
|||||||
.write()
|
.write()
|
||||||
.expect("Lock should never be poisoned");
|
.expect("Lock should never be poisoned");
|
||||||
|
|
||||||
let tenant_entry = locked
|
let tenant_entry = locked.tenants.entry(tenant_id).or_insert(TenantLsnState {
|
||||||
.tenants
|
timelines: HashMap::new(),
|
||||||
.entry(tenant_shard_id)
|
generation: current_generation,
|
||||||
.or_insert(TenantLsnState {
|
});
|
||||||
timelines: HashMap::new(),
|
|
||||||
generation: current_generation,
|
|
||||||
});
|
|
||||||
|
|
||||||
if tenant_entry.generation != current_generation {
|
if tenant_entry.generation != current_generation {
|
||||||
// Generation might have changed if we were detached and then re-attached: in this case,
|
// Generation might have changed if we were detached and then re-attached: in this case,
|
||||||
@@ -476,29 +506,27 @@ impl DeletionQueueClient {
|
|||||||
/// generations in `layers` are the generations in which those layers were written.
|
/// generations in `layers` are the generations in which those layers were written.
|
||||||
pub(crate) async fn push_layers(
|
pub(crate) async fn push_layers(
|
||||||
&self,
|
&self,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_id: TenantId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
current_generation: Generation,
|
current_generation: Generation,
|
||||||
layers: Vec<(LayerFileName, LayerFileMetadata)>,
|
layers: Vec<(LayerFileName, Generation)>,
|
||||||
) -> Result<(), DeletionQueueError> {
|
) -> Result<(), DeletionQueueError> {
|
||||||
if current_generation.is_none() {
|
if current_generation.is_none() {
|
||||||
debug!("Enqueuing deletions in legacy mode, skipping queue");
|
debug!("Enqueuing deletions in legacy mode, skipping queue");
|
||||||
|
|
||||||
let mut layer_paths = Vec::new();
|
let mut layer_paths = Vec::new();
|
||||||
for (layer, meta) in layers {
|
for (layer, generation) in layers {
|
||||||
layer_paths.push(remote_layer_path(
|
layer_paths.push(remote_layer_path(
|
||||||
&tenant_shard_id.tenant_id,
|
&tenant_id,
|
||||||
&timeline_id,
|
&timeline_id,
|
||||||
meta.shard,
|
|
||||||
&layer,
|
&layer,
|
||||||
meta.generation,
|
generation,
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
self.push_immediate(layer_paths).await?;
|
self.push_immediate(layer_paths).await?;
|
||||||
return self.flush_immediate().await;
|
return self.flush_immediate().await;
|
||||||
}
|
}
|
||||||
|
|
||||||
self.push_layers_sync(tenant_shard_id, timeline_id, current_generation, layers)
|
self.push_layers_sync(tenant_id, timeline_id, current_generation, layers)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// When a Tenant has a generation, push_layers is always synchronous because
|
/// When a Tenant has a generation, push_layers is always synchronous because
|
||||||
@@ -508,10 +536,10 @@ impl DeletionQueueClient {
|
|||||||
/// support (`<https://github.com/neondatabase/neon/issues/5395>`)
|
/// support (`<https://github.com/neondatabase/neon/issues/5395>`)
|
||||||
pub(crate) fn push_layers_sync(
|
pub(crate) fn push_layers_sync(
|
||||||
&self,
|
&self,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_id: TenantId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
current_generation: Generation,
|
current_generation: Generation,
|
||||||
layers: Vec<(LayerFileName, LayerFileMetadata)>,
|
layers: Vec<(LayerFileName, Generation)>,
|
||||||
) -> Result<(), DeletionQueueError> {
|
) -> Result<(), DeletionQueueError> {
|
||||||
metrics::DELETION_QUEUE
|
metrics::DELETION_QUEUE
|
||||||
.keys_submitted
|
.keys_submitted
|
||||||
@@ -519,7 +547,7 @@ impl DeletionQueueClient {
|
|||||||
self.do_push(
|
self.do_push(
|
||||||
&self.tx,
|
&self.tx,
|
||||||
ListWriterQueueMessage::Delete(DeletionOp {
|
ListWriterQueueMessage::Delete(DeletionOp {
|
||||||
tenant_shard_id,
|
tenant_id,
|
||||||
timeline_id,
|
timeline_id,
|
||||||
layers,
|
layers,
|
||||||
generation: current_generation,
|
generation: current_generation,
|
||||||
@@ -722,7 +750,6 @@ impl DeletionQueue {
|
|||||||
mod test {
|
mod test {
|
||||||
use camino::Utf8Path;
|
use camino::Utf8Path;
|
||||||
use hex_literal::hex;
|
use hex_literal::hex;
|
||||||
use pageserver_api::shard::ShardIndex;
|
|
||||||
use std::{io::ErrorKind, time::Duration};
|
use std::{io::ErrorKind, time::Duration};
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
@@ -787,12 +814,12 @@ mod test {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn set_latest_generation(&self, gen: Generation) {
|
fn set_latest_generation(&self, gen: Generation) {
|
||||||
let tenant_shard_id = self.harness.tenant_shard_id;
|
let tenant_id = self.harness.tenant_id;
|
||||||
self.mock_control_plane
|
self.mock_control_plane
|
||||||
.latest_generation
|
.latest_generation
|
||||||
.lock()
|
.lock()
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.insert(tenant_shard_id, gen);
|
.insert(tenant_id, gen);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns remote layer file name, suitable for use in assert_remote_files
|
/// Returns remote layer file name, suitable for use in assert_remote_files
|
||||||
@@ -801,8 +828,8 @@ mod test {
|
|||||||
file_name: LayerFileName,
|
file_name: LayerFileName,
|
||||||
gen: Generation,
|
gen: Generation,
|
||||||
) -> anyhow::Result<String> {
|
) -> anyhow::Result<String> {
|
||||||
let tenant_shard_id = self.harness.tenant_shard_id;
|
let tenant_id = self.harness.tenant_id;
|
||||||
let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
|
let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
|
||||||
let remote_timeline_path = self.remote_fs_dir.join(relative_remote_path.get_path());
|
let remote_timeline_path = self.remote_fs_dir.join(relative_remote_path.get_path());
|
||||||
std::fs::create_dir_all(&remote_timeline_path)?;
|
std::fs::create_dir_all(&remote_timeline_path)?;
|
||||||
let remote_layer_file_name = format!("{}{}", file_name, gen.get_suffix());
|
let remote_layer_file_name = format!("{}{}", file_name, gen.get_suffix());
|
||||||
@@ -820,7 +847,7 @@ mod test {
|
|||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
struct MockControlPlane {
|
struct MockControlPlane {
|
||||||
pub latest_generation: std::sync::Arc<std::sync::Mutex<HashMap<TenantShardId, Generation>>>,
|
pub latest_generation: std::sync::Arc<std::sync::Mutex<HashMap<TenantId, Generation>>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl MockControlPlane {
|
impl MockControlPlane {
|
||||||
@@ -834,20 +861,20 @@ mod test {
|
|||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl ControlPlaneGenerationsApi for MockControlPlane {
|
impl ControlPlaneGenerationsApi for MockControlPlane {
|
||||||
#[allow(clippy::diverging_sub_expression)] // False positive via async_trait
|
#[allow(clippy::diverging_sub_expression)] // False positive via async_trait
|
||||||
async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
|
async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError> {
|
||||||
unimplemented!()
|
unimplemented!()
|
||||||
}
|
}
|
||||||
async fn validate(
|
async fn validate(
|
||||||
&self,
|
&self,
|
||||||
tenants: Vec<(TenantShardId, Generation)>,
|
tenants: Vec<(TenantId, Generation)>,
|
||||||
) -> Result<HashMap<TenantShardId, bool>, RetryForeverError> {
|
) -> Result<HashMap<TenantId, bool>, RetryForeverError> {
|
||||||
let mut result = HashMap::new();
|
let mut result = HashMap::new();
|
||||||
|
|
||||||
let latest_generation = self.latest_generation.lock().unwrap();
|
let latest_generation = self.latest_generation.lock().unwrap();
|
||||||
|
|
||||||
for (tenant_shard_id, generation) in tenants {
|
for (tenant_id, generation) in tenants {
|
||||||
if let Some(latest) = latest_generation.get(&tenant_shard_id) {
|
if let Some(latest) = latest_generation.get(&tenant_id) {
|
||||||
result.insert(tenant_shard_id, *latest == generation);
|
result.insert(tenant_id, *latest == generation);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -951,10 +978,10 @@ mod test {
|
|||||||
client.recover(HashMap::new())?;
|
client.recover(HashMap::new())?;
|
||||||
|
|
||||||
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
|
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
|
||||||
let tenant_shard_id = ctx.harness.tenant_shard_id;
|
let tenant_id = ctx.harness.tenant_id;
|
||||||
|
|
||||||
let content: Vec<u8> = "victim1 contents".into();
|
let content: Vec<u8> = "victim1 contents".into();
|
||||||
let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
|
let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
|
||||||
let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
|
let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
|
||||||
let deletion_prefix = ctx.harness.conf.deletion_prefix();
|
let deletion_prefix = ctx.harness.conf.deletion_prefix();
|
||||||
|
|
||||||
@@ -962,8 +989,6 @@ mod test {
|
|||||||
// we delete, and the generation of the running Tenant.
|
// we delete, and the generation of the running Tenant.
|
||||||
let layer_generation = Generation::new(0xdeadbeef);
|
let layer_generation = Generation::new(0xdeadbeef);
|
||||||
let now_generation = Generation::new(0xfeedbeef);
|
let now_generation = Generation::new(0xfeedbeef);
|
||||||
let layer_metadata =
|
|
||||||
LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded());
|
|
||||||
|
|
||||||
let remote_layer_file_name_1 =
|
let remote_layer_file_name_1 =
|
||||||
format!("{}{}", layer_file_name_1, layer_generation.get_suffix());
|
format!("{}{}", layer_file_name_1, layer_generation.get_suffix());
|
||||||
@@ -984,10 +1009,10 @@ mod test {
|
|||||||
info!("Pushing");
|
info!("Pushing");
|
||||||
client
|
client
|
||||||
.push_layers(
|
.push_layers(
|
||||||
tenant_shard_id,
|
tenant_id,
|
||||||
TIMELINE_ID,
|
TIMELINE_ID,
|
||||||
now_generation,
|
now_generation,
|
||||||
[(layer_file_name_1.clone(), layer_metadata)].to_vec(),
|
[(layer_file_name_1.clone(), layer_generation)].to_vec(),
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
|
assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
|
||||||
@@ -1026,13 +1051,11 @@ mod test {
|
|||||||
let stale_generation = latest_generation.previous();
|
let stale_generation = latest_generation.previous();
|
||||||
// Generation that our example layer file was written with
|
// Generation that our example layer file was written with
|
||||||
let layer_generation = stale_generation.previous();
|
let layer_generation = stale_generation.previous();
|
||||||
let layer_metadata =
|
|
||||||
LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded());
|
|
||||||
|
|
||||||
ctx.set_latest_generation(latest_generation);
|
ctx.set_latest_generation(latest_generation);
|
||||||
|
|
||||||
let tenant_shard_id = ctx.harness.tenant_shard_id;
|
let tenant_id = ctx.harness.tenant_id;
|
||||||
let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
|
let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
|
||||||
let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
|
let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
|
||||||
|
|
||||||
// Initial state: a remote layer exists
|
// Initial state: a remote layer exists
|
||||||
@@ -1042,10 +1065,10 @@ mod test {
|
|||||||
tracing::debug!("Pushing...");
|
tracing::debug!("Pushing...");
|
||||||
client
|
client
|
||||||
.push_layers(
|
.push_layers(
|
||||||
tenant_shard_id,
|
tenant_id,
|
||||||
TIMELINE_ID,
|
TIMELINE_ID,
|
||||||
stale_generation,
|
stale_generation,
|
||||||
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
|
[(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
@@ -1057,10 +1080,10 @@ mod test {
|
|||||||
tracing::debug!("Pushing...");
|
tracing::debug!("Pushing...");
|
||||||
client
|
client
|
||||||
.push_layers(
|
.push_layers(
|
||||||
tenant_shard_id,
|
tenant_id,
|
||||||
TIMELINE_ID,
|
TIMELINE_ID,
|
||||||
latest_generation,
|
latest_generation,
|
||||||
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
|
[(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
@@ -1079,16 +1102,14 @@ mod test {
|
|||||||
let client = ctx.deletion_queue.new_client();
|
let client = ctx.deletion_queue.new_client();
|
||||||
client.recover(HashMap::new())?;
|
client.recover(HashMap::new())?;
|
||||||
|
|
||||||
let tenant_shard_id = ctx.harness.tenant_shard_id;
|
let tenant_id = ctx.harness.tenant_id;
|
||||||
|
|
||||||
let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
|
let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
|
||||||
let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
|
let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
|
||||||
let deletion_prefix = ctx.harness.conf.deletion_prefix();
|
let deletion_prefix = ctx.harness.conf.deletion_prefix();
|
||||||
|
|
||||||
let layer_generation = Generation::new(0xdeadbeef);
|
let layer_generation = Generation::new(0xdeadbeef);
|
||||||
let now_generation = Generation::new(0xfeedbeef);
|
let now_generation = Generation::new(0xfeedbeef);
|
||||||
let layer_metadata =
|
|
||||||
LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded());
|
|
||||||
|
|
||||||
// Inject a deletion in the generation before generation_now: after restart,
|
// Inject a deletion in the generation before generation_now: after restart,
|
||||||
// this deletion should _not_ get executed (only the immediately previous
|
// this deletion should _not_ get executed (only the immediately previous
|
||||||
@@ -1097,10 +1118,10 @@ mod test {
|
|||||||
ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?;
|
ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?;
|
||||||
client
|
client
|
||||||
.push_layers(
|
.push_layers(
|
||||||
tenant_shard_id,
|
tenant_id,
|
||||||
TIMELINE_ID,
|
TIMELINE_ID,
|
||||||
now_generation.previous(),
|
now_generation.previous(),
|
||||||
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
|
[(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
@@ -1111,10 +1132,10 @@ mod test {
|
|||||||
ctx.write_remote_layer(EXAMPLE_LAYER_NAME_ALT, layer_generation)?;
|
ctx.write_remote_layer(EXAMPLE_LAYER_NAME_ALT, layer_generation)?;
|
||||||
client
|
client
|
||||||
.push_layers(
|
.push_layers(
|
||||||
tenant_shard_id,
|
tenant_id,
|
||||||
TIMELINE_ID,
|
TIMELINE_ID,
|
||||||
now_generation,
|
now_generation,
|
||||||
[(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(),
|
[(EXAMPLE_LAYER_NAME_ALT.clone(), layer_generation)].to_vec(),
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
@@ -1142,7 +1163,7 @@ mod test {
|
|||||||
drop(client);
|
drop(client);
|
||||||
ctx.restart().await;
|
ctx.restart().await;
|
||||||
let client = ctx.deletion_queue.new_client();
|
let client = ctx.deletion_queue.new_client();
|
||||||
client.recover(HashMap::from([(tenant_shard_id, now_generation)]))?;
|
client.recover(HashMap::from([(tenant_id, now_generation)]))?;
|
||||||
|
|
||||||
info!("Flush-executing");
|
info!("Flush-executing");
|
||||||
client.flush_execute().await?;
|
client.flush_execute().await?;
|
||||||
@@ -1204,13 +1225,12 @@ pub(crate) mod mock {
|
|||||||
match msg {
|
match msg {
|
||||||
ListWriterQueueMessage::Delete(op) => {
|
ListWriterQueueMessage::Delete(op) => {
|
||||||
let mut objects = op.objects;
|
let mut objects = op.objects;
|
||||||
for (layer, meta) in op.layers {
|
for (layer, generation) in op.layers {
|
||||||
objects.push(remote_layer_path(
|
objects.push(remote_layer_path(
|
||||||
&op.tenant_shard_id.tenant_id,
|
&op.tenant_id,
|
||||||
&op.timeline_id,
|
&op.timeline_id,
|
||||||
meta.shard,
|
|
||||||
&layer,
|
&layer,
|
||||||
meta.generation,
|
generation,
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1290,34 +1310,4 @@ pub(crate) mod mock {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Test round-trip serialization/deserialization, and test stability of the format
|
|
||||||
/// vs. a static expected string for the serialized version.
|
|
||||||
#[test]
|
|
||||||
fn deletion_list_serialization() -> anyhow::Result<()> {
|
|
||||||
let tenant_id = "ad6c1a56f5680419d3a16ff55d97ec3c"
|
|
||||||
.to_string()
|
|
||||||
.parse::<TenantShardId>()?;
|
|
||||||
let timeline_id = "be322c834ed9e709e63b5c9698691910"
|
|
||||||
.to_string()
|
|
||||||
.parse::<TimelineId>()?;
|
|
||||||
let generation = Generation::new(123);
|
|
||||||
|
|
||||||
let object =
|
|
||||||
RemotePath::from_string(&format!("tenants/{tenant_id}/timelines/{timeline_id}/foo"))?;
|
|
||||||
let mut objects = [object].to_vec();
|
|
||||||
|
|
||||||
let mut example = DeletionList::new(1);
|
|
||||||
example.push(&tenant_id, &timeline_id, generation, &mut objects);
|
|
||||||
|
|
||||||
let encoded = serde_json::to_string(&example)?;
|
|
||||||
|
|
||||||
let expected = "{\"version\":1,\"sequence\":1,\"tenants\":{\"ad6c1a56f5680419d3a16ff55d97ec3c\":{\"timelines\":{\"be322c834ed9e709e63b5c9698691910\":[\"foo\"]},\"generation\":123}},\"size\":1}".to_string();
|
|
||||||
assert_eq!(encoded, expected);
|
|
||||||
|
|
||||||
let decoded = serde_json::from_str::<DeletionList>(&encoded)?;
|
|
||||||
assert_eq!(example, decoded);
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -19,7 +19,6 @@ use std::collections::HashMap;
|
|||||||
use std::fs::create_dir_all;
|
use std::fs::create_dir_all;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
||||||
use pageserver_api::shard::TenantShardId;
|
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
use remote_storage::RemotePath;
|
use remote_storage::RemotePath;
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
@@ -27,13 +26,13 @@ use tracing::debug;
|
|||||||
use tracing::info;
|
use tracing::info;
|
||||||
use tracing::warn;
|
use tracing::warn;
|
||||||
use utils::generation::Generation;
|
use utils::generation::Generation;
|
||||||
|
use utils::id::TenantId;
|
||||||
use utils::id::TimelineId;
|
use utils::id::TimelineId;
|
||||||
|
|
||||||
use crate::config::PageServerConf;
|
use crate::config::PageServerConf;
|
||||||
use crate::deletion_queue::TEMP_SUFFIX;
|
use crate::deletion_queue::TEMP_SUFFIX;
|
||||||
use crate::metrics;
|
use crate::metrics;
|
||||||
use crate::tenant::remote_timeline_client::remote_layer_path;
|
use crate::tenant::remote_timeline_client::remote_layer_path;
|
||||||
use crate::tenant::remote_timeline_client::LayerFileMetadata;
|
|
||||||
use crate::tenant::storage_layer::LayerFileName;
|
use crate::tenant::storage_layer::LayerFileName;
|
||||||
use crate::virtual_file::on_fatal_io_error;
|
use crate::virtual_file::on_fatal_io_error;
|
||||||
use crate::virtual_file::MaybeFatalIo;
|
use crate::virtual_file::MaybeFatalIo;
|
||||||
@@ -54,22 +53,22 @@ const FRONTEND_FLUSHING_TIMEOUT: Duration = Duration::from_millis(100);
|
|||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub(super) struct DeletionOp {
|
pub(super) struct DeletionOp {
|
||||||
pub(super) tenant_shard_id: TenantShardId,
|
pub(super) tenant_id: TenantId,
|
||||||
pub(super) timeline_id: TimelineId,
|
pub(super) timeline_id: TimelineId,
|
||||||
// `layers` and `objects` are both just lists of objects. `layers` is used if you do not
|
// `layers` and `objects` are both just lists of objects. `layers` is used if you do not
|
||||||
// have a config object handy to project it to a remote key, and need the consuming worker
|
// have a config object handy to project it to a remote key, and need the consuming worker
|
||||||
// to do it for you.
|
// to do it for you.
|
||||||
pub(super) layers: Vec<(LayerFileName, LayerFileMetadata)>,
|
pub(super) layers: Vec<(LayerFileName, Generation)>,
|
||||||
pub(super) objects: Vec<RemotePath>,
|
pub(super) objects: Vec<RemotePath>,
|
||||||
|
|
||||||
/// The _current_ generation of the Tenant shard attachment in which we are enqueuing
|
/// The _current_ generation of the Tenant attachment in which we are enqueuing
|
||||||
/// this deletion.
|
/// this deletion.
|
||||||
pub(super) generation: Generation,
|
pub(super) generation: Generation,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub(super) struct RecoverOp {
|
pub(super) struct RecoverOp {
|
||||||
pub(super) attached_tenants: HashMap<TenantShardId, Generation>,
|
pub(super) attached_tenants: HashMap<TenantId, Generation>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
@@ -206,7 +205,7 @@ impl ListWriter {
|
|||||||
|
|
||||||
async fn recover(
|
async fn recover(
|
||||||
&mut self,
|
&mut self,
|
||||||
attached_tenants: HashMap<TenantShardId, Generation>,
|
attached_tenants: HashMap<TenantId, Generation>,
|
||||||
) -> Result<(), anyhow::Error> {
|
) -> Result<(), anyhow::Error> {
|
||||||
debug!(
|
debug!(
|
||||||
"recovering with {} attached tenants",
|
"recovering with {} attached tenants",
|
||||||
@@ -309,8 +308,8 @@ impl ListWriter {
|
|||||||
// generation was issued to another node in the interval while we restarted,
|
// generation was issued to another node in the interval while we restarted,
|
||||||
// then we may treat deletion lists from the previous generation as if they
|
// then we may treat deletion lists from the previous generation as if they
|
||||||
// belong to our currently attached generation, and proceed to validate & execute.
|
// belong to our currently attached generation, and proceed to validate & execute.
|
||||||
for (tenant_shard_id, tenant_list) in &mut deletion_list.tenants {
|
for (tenant_id, tenant_list) in &mut deletion_list.tenants {
|
||||||
if let Some(attached_gen) = attached_tenants.get(tenant_shard_id) {
|
if let Some(attached_gen) = attached_tenants.get(tenant_id) {
|
||||||
if attached_gen.previous() == tenant_list.generation {
|
if attached_gen.previous() == tenant_list.generation {
|
||||||
tenant_list.generation = *attached_gen;
|
tenant_list.generation = *attached_gen;
|
||||||
}
|
}
|
||||||
@@ -388,26 +387,25 @@ impl ListWriter {
|
|||||||
);
|
);
|
||||||
|
|
||||||
let mut layer_paths = Vec::new();
|
let mut layer_paths = Vec::new();
|
||||||
for (layer, meta) in op.layers {
|
for (layer, generation) in op.layers {
|
||||||
layer_paths.push(remote_layer_path(
|
layer_paths.push(remote_layer_path(
|
||||||
&op.tenant_shard_id.tenant_id,
|
&op.tenant_id,
|
||||||
&op.timeline_id,
|
&op.timeline_id,
|
||||||
meta.shard,
|
|
||||||
&layer,
|
&layer,
|
||||||
meta.generation,
|
generation,
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
layer_paths.extend(op.objects);
|
layer_paths.extend(op.objects);
|
||||||
|
|
||||||
if !self.pending.push(
|
if !self.pending.push(
|
||||||
&op.tenant_shard_id,
|
&op.tenant_id,
|
||||||
&op.timeline_id,
|
&op.timeline_id,
|
||||||
op.generation,
|
op.generation,
|
||||||
&mut layer_paths,
|
&mut layer_paths,
|
||||||
) {
|
) {
|
||||||
self.flush().await;
|
self.flush().await;
|
||||||
let retry_succeeded = self.pending.push(
|
let retry_succeeded = self.pending.push(
|
||||||
&op.tenant_shard_id,
|
&op.tenant_id,
|
||||||
&op.timeline_id,
|
&op.timeline_id,
|
||||||
op.generation,
|
op.generation,
|
||||||
&mut layer_paths,
|
&mut layer_paths,
|
||||||
|
|||||||
@@ -178,14 +178,7 @@ where
|
|||||||
.unwrap_or(false);
|
.unwrap_or(false);
|
||||||
|
|
||||||
if valid && *validated_generation == tenant_lsn_state.generation {
|
if valid && *validated_generation == tenant_lsn_state.generation {
|
||||||
for (timeline_id, pending_lsn) in tenant_lsn_state.timelines {
|
for (_timeline_id, pending_lsn) in tenant_lsn_state.timelines {
|
||||||
tracing::debug!(
|
|
||||||
%tenant_id,
|
|
||||||
%timeline_id,
|
|
||||||
current = %pending_lsn.result_slot.load(),
|
|
||||||
projected = %pending_lsn.projected,
|
|
||||||
"advancing validated remote_consistent_lsn",
|
|
||||||
);
|
|
||||||
pending_lsn.result_slot.store(pending_lsn.projected);
|
pending_lsn.result_slot.store(pending_lsn.projected);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -310,7 +310,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
|||||||
.unwrap()
|
.unwrap()
|
||||||
.as_micros(),
|
.as_micros(),
|
||||||
partition,
|
partition,
|
||||||
desc.tenant_shard_id,
|
desc.tenant_id,
|
||||||
desc.timeline_id,
|
desc.timeline_id,
|
||||||
candidate.layer,
|
candidate.layer,
|
||||||
);
|
);
|
||||||
@@ -380,7 +380,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
|||||||
let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size)));
|
let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size)));
|
||||||
|
|
||||||
for (timeline, batch) in batched {
|
for (timeline, batch) in batched {
|
||||||
let tenant_shard_id = timeline.tenant_shard_id;
|
let tenant_id = timeline.tenant_id;
|
||||||
let timeline_id = timeline.timeline_id;
|
let timeline_id = timeline.timeline_id;
|
||||||
let batch_size =
|
let batch_size =
|
||||||
u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");
|
u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");
|
||||||
@@ -431,7 +431,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
|||||||
(evicted_bytes, evictions_failed)
|
(evicted_bytes, evictions_failed)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
.instrument(tracing::info_span!("evict_batch", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id, batch_size));
|
.instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size));
|
||||||
|
|
||||||
js.spawn(evict);
|
js.spawn(evict);
|
||||||
|
|
||||||
@@ -572,7 +572,7 @@ async fn collect_eviction_candidates(
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
let info = tl.get_local_layers_for_disk_usage_eviction().await;
|
let info = tl.get_local_layers_for_disk_usage_eviction().await;
|
||||||
debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
|
debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
|
||||||
tenant_candidates.extend(
|
tenant_candidates.extend(
|
||||||
info.resident_layers
|
info.resident_layers
|
||||||
.into_iter()
|
.into_iter()
|
||||||
|
|||||||
@@ -624,99 +624,6 @@ paths:
|
|||||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||||
|
|
||||||
|
|
||||||
/v1/tenant/{tenant_id}/location_config:
|
|
||||||
parameters:
|
|
||||||
- name: tenant_id
|
|
||||||
in: path
|
|
||||||
required: true
|
|
||||||
schema:
|
|
||||||
type: string
|
|
||||||
format: hex
|
|
||||||
- name: flush_ms
|
|
||||||
in: query
|
|
||||||
required: false
|
|
||||||
schema:
|
|
||||||
type: integer
|
|
||||||
put:
|
|
||||||
description: |
|
|
||||||
Configures a _tenant location_, that is how a particular pageserver handles
|
|
||||||
a particular tenant. This includes _attached_ tenants, i.e. those ingesting WAL
|
|
||||||
and page service requests, and _secondary_ tenants, i.e. those which are just keeping
|
|
||||||
a warm cache in anticipation of transitioning to attached state in the future.
|
|
||||||
|
|
||||||
This is a declarative, idempotent API: there are not separate endpoints
|
|
||||||
for different tenant location configurations. Rather, this single endpoint accepts
|
|
||||||
a description of the desired location configuration, and makes whatever changes
|
|
||||||
are required to reach that state.
|
|
||||||
|
|
||||||
In imperative terms, this API is used to attach and detach tenants, and
|
|
||||||
to transition tenants to and from secondary mode.
|
|
||||||
|
|
||||||
This is a synchronous API: there is no 202 response. State transitions should always
|
|
||||||
be fast (milliseconds), with the exception of requests setting `flush_ms`, in which case
|
|
||||||
the caller controls the runtime of the request.
|
|
||||||
|
|
||||||
In some state transitions, it makes sense to flush dirty data to remote storage: this includes transitions
|
|
||||||
to AttachedStale and Detached. Flushing is never necessary for correctness, but is an
|
|
||||||
important optimization when doing migrations. The `flush_ms` parameter controls whether
|
|
||||||
flushing should be attempted, and how much time is allowed for flushing. If the time limit expires,
|
|
||||||
the requested transition will continue without waiting for any outstanding data to flush. Callers
|
|
||||||
should use a duration which is substantially less than their HTTP client's request
|
|
||||||
timeout. It is safe to supply flush_ms irrespective of the request body: in state transitions
|
|
||||||
where flushing doesn't make sense, the server will ignore it.
|
|
||||||
|
|
||||||
It is safe to retry requests, but if one receives a 409 or 503 response, it is not
|
|
||||||
useful to retry aggressively: there is probably an existing request still ongoing.
|
|
||||||
requestBody:
|
|
||||||
required: false
|
|
||||||
content:
|
|
||||||
application/json:
|
|
||||||
schema:
|
|
||||||
$ref: "#/components/schemas/TenantLocationConfigRequest"
|
|
||||||
responses:
|
|
||||||
"200":
|
|
||||||
description: Tenant is now in requested state
|
|
||||||
"503":
|
|
||||||
description: Tenant's state cannot be changed right now. Wait a few seconds and retry.
|
|
||||||
content:
|
|
||||||
application/json:
|
|
||||||
schema:
|
|
||||||
$ref: "#/components/schemas/Error"
|
|
||||||
"401":
|
|
||||||
description: Unauthorized Error
|
|
||||||
content:
|
|
||||||
application/json:
|
|
||||||
schema:
|
|
||||||
$ref: "#/components/schemas/UnauthorizedError"
|
|
||||||
"403":
|
|
||||||
description: Forbidden Error
|
|
||||||
content:
|
|
||||||
application/json:
|
|
||||||
schema:
|
|
||||||
$ref: "#/components/schemas/ForbiddenError"
|
|
||||||
"409":
|
|
||||||
description: |
|
|
||||||
The tenant is already known to Pageserver in some way,
|
|
||||||
and hence this `/attach` call has been rejected.
|
|
||||||
|
|
||||||
Some examples of how this can happen:
|
|
||||||
- tenant was created on this pageserver
|
|
||||||
- tenant attachment was started by an earlier call to `/attach`.
|
|
||||||
|
|
||||||
Callers should poll the tenant status's `attachment_status` field,
|
|
||||||
like for status 202. See the longer description for `POST /attach`
|
|
||||||
for details.
|
|
||||||
content:
|
|
||||||
application/json:
|
|
||||||
schema:
|
|
||||||
$ref: "#/components/schemas/ConflictError"
|
|
||||||
"500":
|
|
||||||
description: Generic operation error
|
|
||||||
content:
|
|
||||||
application/json:
|
|
||||||
schema:
|
|
||||||
$ref: "#/components/schemas/Error"
|
|
||||||
|
|
||||||
/v1/tenant/{tenant_id}/detach:
|
/v1/tenant/{tenant_id}/detach:
|
||||||
parameters:
|
parameters:
|
||||||
- name: tenant_id
|
- name: tenant_id
|
||||||
@@ -1028,9 +935,6 @@ paths:
|
|||||||
format: hex
|
format: hex
|
||||||
pg_version:
|
pg_version:
|
||||||
type: integer
|
type: integer
|
||||||
existing_initdb_timeline_id:
|
|
||||||
type: string
|
|
||||||
format: hex
|
|
||||||
responses:
|
responses:
|
||||||
"201":
|
"201":
|
||||||
description: TimelineInfo
|
description: TimelineInfo
|
||||||
@@ -1370,31 +1274,6 @@ components:
|
|||||||
tenant_id:
|
tenant_id:
|
||||||
type: string
|
type: string
|
||||||
format: hex
|
format: hex
|
||||||
TenantLocationConfigRequest:
|
|
||||||
type: object
|
|
||||||
required:
|
|
||||||
- tenant_id
|
|
||||||
properties:
|
|
||||||
tenant_id:
|
|
||||||
type: string
|
|
||||||
format: hex
|
|
||||||
mode:
|
|
||||||
type: string
|
|
||||||
enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"]
|
|
||||||
description: Mode of functionality that this pageserver will run in for this tenant.
|
|
||||||
generation:
|
|
||||||
type: integer
|
|
||||||
description: Attachment generation number, mandatory when `mode` is an attached state
|
|
||||||
secondary_conf:
|
|
||||||
$ref: '#/components/schemas/SecondaryConfig'
|
|
||||||
tenant_conf:
|
|
||||||
$ref: '#/components/schemas/TenantConfig'
|
|
||||||
SecondaryConfig:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
warm:
|
|
||||||
type: boolean
|
|
||||||
description: Whether to poll remote storage for layers to download. If false, secondary locations don't download anything.
|
|
||||||
TenantConfig:
|
TenantConfig:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
|
|||||||
@@ -4,10 +4,8 @@
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::Duration;
|
|
||||||
|
|
||||||
use anyhow::{anyhow, Context, Result};
|
use anyhow::{anyhow, Context, Result};
|
||||||
use enumset::EnumSet;
|
|
||||||
use futures::TryFutureExt;
|
use futures::TryFutureExt;
|
||||||
use humantime::format_rfc3339;
|
use humantime::format_rfc3339;
|
||||||
use hyper::header;
|
use hyper::header;
|
||||||
@@ -18,7 +16,6 @@ use pageserver_api::models::{
|
|||||||
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
|
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
|
||||||
TenantLoadRequest, TenantLocationConfigRequest,
|
TenantLoadRequest, TenantLocationConfigRequest,
|
||||||
};
|
};
|
||||||
use pageserver_api::shard::TenantShardId;
|
|
||||||
use remote_storage::GenericRemoteStorage;
|
use remote_storage::GenericRemoteStorage;
|
||||||
use tenant_size_model::{SizeResult, StorageModel};
|
use tenant_size_model::{SizeResult, StorageModel};
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
@@ -44,7 +41,6 @@ use crate::tenant::mgr::{
|
|||||||
};
|
};
|
||||||
use crate::tenant::size::ModelInputs;
|
use crate::tenant::size::ModelInputs;
|
||||||
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||||
use crate::tenant::timeline::CompactFlags;
|
|
||||||
use crate::tenant::timeline::Timeline;
|
use crate::tenant::timeline::Timeline;
|
||||||
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
|
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
|
||||||
use crate::{config::PageServerConf, tenant::mgr};
|
use crate::{config::PageServerConf, tenant::mgr};
|
||||||
@@ -338,7 +334,13 @@ async fn build_timeline_info_common(
|
|||||||
Lsn(0) => None,
|
Lsn(0) => None,
|
||||||
lsn @ Lsn(_) => Some(lsn),
|
lsn @ Lsn(_) => Some(lsn),
|
||||||
};
|
};
|
||||||
let current_logical_size = timeline.get_current_logical_size(ctx);
|
let current_logical_size = match timeline.get_current_logical_size(ctx) {
|
||||||
|
Ok((size, _)) => Some(size),
|
||||||
|
Err(err) => {
|
||||||
|
error!("Timeline info creation failed to get current logical size: {err:?}");
|
||||||
|
None
|
||||||
|
}
|
||||||
|
};
|
||||||
let current_physical_size = Some(timeline.layer_size_sum().await);
|
let current_physical_size = Some(timeline.layer_size_sum().await);
|
||||||
let state = timeline.current_state();
|
let state = timeline.current_state();
|
||||||
let remote_consistent_lsn_projected = timeline
|
let remote_consistent_lsn_projected = timeline
|
||||||
@@ -351,8 +353,7 @@ async fn build_timeline_info_common(
|
|||||||
let walreceiver_status = timeline.walreceiver_status();
|
let walreceiver_status = timeline.walreceiver_status();
|
||||||
|
|
||||||
let info = TimelineInfo {
|
let info = TimelineInfo {
|
||||||
// TODO(sharding): add a shard_id field, or make tenant_id into a tenant_shard_id
|
tenant_id: timeline.tenant_id,
|
||||||
tenant_id: timeline.tenant_shard_id.tenant_id,
|
|
||||||
timeline_id: timeline.timeline_id,
|
timeline_id: timeline.timeline_id,
|
||||||
ancestor_timeline_id,
|
ancestor_timeline_id,
|
||||||
ancestor_lsn,
|
ancestor_lsn,
|
||||||
@@ -362,11 +363,7 @@ async fn build_timeline_info_common(
|
|||||||
last_record_lsn,
|
last_record_lsn,
|
||||||
prev_record_lsn: Some(timeline.get_prev_record_lsn()),
|
prev_record_lsn: Some(timeline.get_prev_record_lsn()),
|
||||||
latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
|
latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
|
||||||
current_logical_size: current_logical_size.size_dont_care_about_accuracy(),
|
current_logical_size,
|
||||||
current_logical_size_is_accurate: match current_logical_size.accuracy() {
|
|
||||||
tenant::timeline::logical_size::Accuracy::Approximate => false,
|
|
||||||
tenant::timeline::logical_size::Accuracy::Exact => true,
|
|
||||||
},
|
|
||||||
current_physical_size,
|
current_physical_size,
|
||||||
current_logical_size_non_incremental: None,
|
current_logical_size_non_incremental: None,
|
||||||
timeline_dir_layer_file_size_sum: None,
|
timeline_dir_layer_file_size_sum: None,
|
||||||
@@ -422,9 +419,9 @@ async fn timeline_create_handler(
|
|||||||
mut request: Request<Body>,
|
mut request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||||
let request_data: TimelineCreateRequest = json_request(&mut request).await?;
|
let request_data: TimelineCreateRequest = json_request(&mut request).await?;
|
||||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
check_permission(&request, Some(tenant_id))?;
|
||||||
|
|
||||||
let new_timeline_id = request_data.new_timeline_id;
|
let new_timeline_id = request_data.new_timeline_id;
|
||||||
|
|
||||||
@@ -433,13 +430,12 @@ async fn timeline_create_handler(
|
|||||||
let state = get_state(&request);
|
let state = get_state(&request);
|
||||||
|
|
||||||
async {
|
async {
|
||||||
let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, true)?;
|
let tenant = mgr::get_tenant(tenant_id, true)?;
|
||||||
match tenant.create_timeline(
|
match tenant.create_timeline(
|
||||||
new_timeline_id,
|
new_timeline_id,
|
||||||
request_data.ancestor_timeline_id.map(TimelineId::from),
|
request_data.ancestor_timeline_id.map(TimelineId::from),
|
||||||
request_data.ancestor_start_lsn,
|
request_data.ancestor_start_lsn,
|
||||||
request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
|
request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
|
||||||
request_data.existing_initdb_timeline_id,
|
|
||||||
state.broker_client.clone(),
|
state.broker_client.clone(),
|
||||||
&ctx,
|
&ctx,
|
||||||
)
|
)
|
||||||
@@ -468,10 +464,7 @@ async fn timeline_create_handler(
|
|||||||
Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
|
Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
.instrument(info_span!("timeline_create",
|
.instrument(info_span!("timeline_create", %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
|
||||||
tenant_id = %tenant_shard_id.tenant_id,
|
|
||||||
shard = %tenant_shard_id.shard_slug(),
|
|
||||||
timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
|
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -551,7 +544,7 @@ async fn timeline_detail_handler(
|
|||||||
|
|
||||||
async fn get_lsn_by_timestamp_handler(
|
async fn get_lsn_by_timestamp_handler(
|
||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_id))?;
|
||||||
@@ -567,9 +560,7 @@ async fn get_lsn_by_timestamp_handler(
|
|||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
||||||
let result = timeline
|
let result = timeline.find_lsn_for_timestamp(timestamp_pg, &ctx).await?;
|
||||||
.find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
if version.unwrap_or(0) > 1 {
|
if version.unwrap_or(0) > 1 {
|
||||||
#[derive(serde::Serialize)]
|
#[derive(serde::Serialize)]
|
||||||
@@ -669,15 +660,14 @@ async fn timeline_delete_handler(
|
|||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
check_permission(&request, Some(tenant_id))?;
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||||
let state = get_state(&request);
|
|
||||||
|
|
||||||
state.tenant_manager.delete_timeline(tenant_shard_id, timeline_id, &ctx)
|
mgr::delete_timeline(tenant_id, timeline_id, &ctx)
|
||||||
.instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id))
|
.instrument(info_span!("timeline_delete", %tenant_id, %timeline_id))
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
json_response(StatusCode::ACCEPTED, ())
|
json_response(StatusCode::ACCEPTED, ())
|
||||||
@@ -691,14 +681,11 @@ async fn tenant_detach_handler(
|
|||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_id))?;
|
||||||
let detach_ignored: Option<bool> = parse_query_param(&request, "detach_ignored")?;
|
let detach_ignored: Option<bool> = parse_query_param(&request, "detach_ignored")?;
|
||||||
|
|
||||||
// This is a legacy API (`/location_conf` is the replacement). It only supports unsharded tenants
|
|
||||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
|
||||||
|
|
||||||
let state = get_state(&request);
|
let state = get_state(&request);
|
||||||
let conf = state.conf;
|
let conf = state.conf;
|
||||||
mgr::detach_tenant(
|
mgr::detach_tenant(
|
||||||
conf,
|
conf,
|
||||||
tenant_shard_id,
|
tenant_id,
|
||||||
detach_ignored.unwrap_or(false),
|
detach_ignored.unwrap_or(false),
|
||||||
&state.deletion_queue_client,
|
&state.deletion_queue_client,
|
||||||
)
|
)
|
||||||
@@ -815,16 +802,13 @@ async fn tenant_delete_handler(
|
|||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
// TODO openapi spec
|
// TODO openapi spec
|
||||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
check_permission(&request, Some(tenant_id))?;
|
||||||
|
|
||||||
let state = get_state(&request);
|
let state = get_state(&request);
|
||||||
|
|
||||||
mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_shard_id)
|
mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_id)
|
||||||
.instrument(info_span!("tenant_delete_handler",
|
.instrument(info_span!("tenant_delete_handler", %tenant_id))
|
||||||
tenant_id = %tenant_shard_id.tenant_id,
|
|
||||||
shard = tenant_shard_id.shard_slug()
|
|
||||||
))
|
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
json_response(StatusCode::ACCEPTED, ())
|
json_response(StatusCode::ACCEPTED, ())
|
||||||
@@ -845,7 +829,7 @@ async fn tenant_delete_handler(
|
|||||||
/// without modifying anything anyway.
|
/// without modifying anything anyway.
|
||||||
async fn tenant_size_handler(
|
async fn tenant_size_handler(
|
||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_id))?;
|
||||||
@@ -861,7 +845,6 @@ async fn tenant_size_handler(
|
|||||||
.gather_size_inputs(
|
.gather_size_inputs(
|
||||||
retention_period,
|
retention_period,
|
||||||
LogicalSizeCalculationCause::TenantSizeHandler,
|
LogicalSizeCalculationCause::TenantSizeHandler,
|
||||||
&cancel,
|
|
||||||
&ctx,
|
&ctx,
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
@@ -1155,11 +1138,9 @@ async fn put_tenant_location_config_handler(
|
|||||||
mut request: Request<Body>,
|
mut request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
|
||||||
|
|
||||||
let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
|
let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
|
||||||
let flush = parse_query_param(&request, "flush_ms")?.map(Duration::from_millis);
|
let tenant_id = request_data.tenant_id;
|
||||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
check_permission(&request, Some(tenant_id))?;
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||||
let state = get_state(&request);
|
let state = get_state(&request);
|
||||||
@@ -1168,13 +1149,9 @@ async fn put_tenant_location_config_handler(
|
|||||||
// The `Detached` state is special, it doesn't upsert a tenant, it removes
|
// The `Detached` state is special, it doesn't upsert a tenant, it removes
|
||||||
// its local disk content and drops it from memory.
|
// its local disk content and drops it from memory.
|
||||||
if let LocationConfigMode::Detached = request_data.config.mode {
|
if let LocationConfigMode::Detached = request_data.config.mode {
|
||||||
if let Err(e) =
|
if let Err(e) = mgr::detach_tenant(conf, tenant_id, true, &state.deletion_queue_client)
|
||||||
mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
|
.instrument(info_span!("tenant_detach", %tenant_id))
|
||||||
.instrument(info_span!("tenant_detach",
|
.await
|
||||||
tenant_id = %tenant_shard_id.tenant_id,
|
|
||||||
shard = tenant_shard_id.shard_slug()
|
|
||||||
))
|
|
||||||
.await
|
|
||||||
{
|
{
|
||||||
match e {
|
match e {
|
||||||
TenantStateError::SlotError(TenantSlotError::NotFound(_)) => {
|
TenantStateError::SlotError(TenantSlotError::NotFound(_)) => {
|
||||||
@@ -1191,7 +1168,7 @@ async fn put_tenant_location_config_handler(
|
|||||||
|
|
||||||
state
|
state
|
||||||
.tenant_manager
|
.tenant_manager
|
||||||
.upsert_location(tenant_shard_id, location_conf, flush, &ctx)
|
.upsert_location(tenant_id, location_conf, &ctx)
|
||||||
.await
|
.await
|
||||||
// TODO: badrequest assumes the caller was asking for something unreasonable, but in
|
// TODO: badrequest assumes the caller was asking for something unreasonable, but in
|
||||||
// principle we might have hit something like concurrent API calls to the same tenant,
|
// principle we might have hit something like concurrent API calls to the same tenant,
|
||||||
@@ -1247,7 +1224,7 @@ async fn failpoints_handler(
|
|||||||
// Run GC immediately on given timeline.
|
// Run GC immediately on given timeline.
|
||||||
async fn timeline_gc_handler(
|
async fn timeline_gc_handler(
|
||||||
mut request: Request<Body>,
|
mut request: Request<Body>,
|
||||||
cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
@@ -1256,7 +1233,7 @@ async fn timeline_gc_handler(
|
|||||||
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
|
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let wait_task_done = mgr::immediate_gc(tenant_id, timeline_id, gc_req, cancel, &ctx).await?;
|
let wait_task_done = mgr::immediate_gc(tenant_id, timeline_id, gc_req, &ctx).await?;
|
||||||
let gc_result = wait_task_done
|
let gc_result = wait_task_done
|
||||||
.await
|
.await
|
||||||
.context("wait for gc task")
|
.context("wait for gc task")
|
||||||
@@ -1275,15 +1252,11 @@ async fn timeline_compact_handler(
|
|||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_id))?;
|
||||||
|
|
||||||
let mut flags = EnumSet::empty();
|
|
||||||
if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
|
|
||||||
flags |= CompactFlags::ForceRepartition;
|
|
||||||
}
|
|
||||||
async {
|
async {
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
||||||
timeline
|
timeline
|
||||||
.compact(&cancel, flags, &ctx)
|
.compact(&cancel, &ctx)
|
||||||
.await
|
.await
|
||||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||||
json_response(StatusCode::OK, ())
|
json_response(StatusCode::OK, ())
|
||||||
@@ -1300,11 +1273,6 @@ async fn timeline_checkpoint_handler(
|
|||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_id))?;
|
||||||
|
|
||||||
let mut flags = EnumSet::empty();
|
|
||||||
if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
|
|
||||||
flags |= CompactFlags::ForceRepartition;
|
|
||||||
}
|
|
||||||
async {
|
async {
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
||||||
@@ -1313,7 +1281,7 @@ async fn timeline_checkpoint_handler(
|
|||||||
.await
|
.await
|
||||||
.map_err(ApiError::InternalServerError)?;
|
.map_err(ApiError::InternalServerError)?;
|
||||||
timeline
|
timeline
|
||||||
.compact(&cancel, flags, &ctx)
|
.compact(&cancel, &ctx)
|
||||||
.await
|
.await
|
||||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||||
|
|
||||||
@@ -1510,7 +1478,7 @@ async fn timeline_collect_keyspace(
|
|||||||
let keys = timeline
|
let keys = timeline
|
||||||
.collect_keyspace(at_lsn, &ctx)
|
.collect_keyspace(at_lsn, &ctx)
|
||||||
.await
|
.await
|
||||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
.map_err(ApiError::InternalServerError)?;
|
||||||
|
|
||||||
json_response(StatusCode::OK, Partitioning { keys, at_lsn })
|
json_response(StatusCode::OK, Partitioning { keys, at_lsn })
|
||||||
}
|
}
|
||||||
@@ -1691,24 +1659,8 @@ where
|
|||||||
let token_cloned = token.clone();
|
let token_cloned = token.clone();
|
||||||
let result = handler(r, token).await;
|
let result = handler(r, token).await;
|
||||||
if token_cloned.is_cancelled() {
|
if token_cloned.is_cancelled() {
|
||||||
// dropguard has executed: we will never turn this result into response.
|
info!("Cancelled request finished");
|
||||||
//
|
|
||||||
// at least temporarily do {:?} logging; these failures are rare enough but
|
|
||||||
// could hide difficult errors.
|
|
||||||
match &result {
|
|
||||||
Ok(response) => {
|
|
||||||
let status = response.status();
|
|
||||||
info!(%status, "Cancelled request finished successfully")
|
|
||||||
}
|
|
||||||
Err(e) => error!("Cancelled request finished with an error: {e:?}"),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
// only logging for cancelled panicked request handlers is the tracing_panic_hook,
|
|
||||||
// which should suffice.
|
|
||||||
//
|
|
||||||
// there is still a chance to lose the result due to race between
|
|
||||||
// returning from here and the actual connection closing happening
|
|
||||||
// before outer task gets to execute. leaving that up for #5815.
|
|
||||||
result
|
result
|
||||||
}
|
}
|
||||||
.in_current_span(),
|
.in_current_span(),
|
||||||
@@ -1800,7 +1752,7 @@ pub fn make_router(
|
|||||||
.get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
|
.get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
|
||||||
.post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
|
.post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
|
||||||
.get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
|
.get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
|
||||||
.delete("/v1/tenant/:tenant_shard_id", |r| {
|
.delete("/v1/tenant/:tenant_id", |r| {
|
||||||
api_handler(r, tenant_delete_handler)
|
api_handler(r, tenant_delete_handler)
|
||||||
})
|
})
|
||||||
.get("/v1/tenant/:tenant_id/synthetic_size", |r| {
|
.get("/v1/tenant/:tenant_id/synthetic_size", |r| {
|
||||||
@@ -1812,13 +1764,13 @@ pub fn make_router(
|
|||||||
.get("/v1/tenant/:tenant_id/config", |r| {
|
.get("/v1/tenant/:tenant_id/config", |r| {
|
||||||
api_handler(r, get_tenant_config_handler)
|
api_handler(r, get_tenant_config_handler)
|
||||||
})
|
})
|
||||||
.put("/v1/tenant/:tenant_shard_id/location_config", |r| {
|
.put("/v1/tenant/:tenant_id/location_config", |r| {
|
||||||
api_handler(r, put_tenant_location_config_handler)
|
api_handler(r, put_tenant_location_config_handler)
|
||||||
})
|
})
|
||||||
.get("/v1/tenant/:tenant_id/timeline", |r| {
|
.get("/v1/tenant/:tenant_id/timeline", |r| {
|
||||||
api_handler(r, timeline_list_handler)
|
api_handler(r, timeline_list_handler)
|
||||||
})
|
})
|
||||||
.post("/v1/tenant/:tenant_shard_id/timeline", |r| {
|
.post("/v1/tenant/:tenant_id/timeline", |r| {
|
||||||
api_handler(r, timeline_create_handler)
|
api_handler(r, timeline_create_handler)
|
||||||
})
|
})
|
||||||
.post("/v1/tenant/:tenant_id/attach", |r| {
|
.post("/v1/tenant/:tenant_id/attach", |r| {
|
||||||
@@ -1862,7 +1814,7 @@ pub fn make_router(
|
|||||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
|
"/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
|
||||||
|r| api_handler(r, timeline_download_remote_layers_handler_get),
|
|r| api_handler(r, timeline_download_remote_layers_handler_get),
|
||||||
)
|
)
|
||||||
.delete("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
|
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
|
||||||
api_handler(r, timeline_delete_handler)
|
api_handler(r, timeline_delete_handler)
|
||||||
})
|
})
|
||||||
.get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| {
|
.get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| {
|
||||||
|
|||||||
@@ -3,26 +3,18 @@
|
|||||||
//! a neon Timeline.
|
//! a neon Timeline.
|
||||||
//!
|
//!
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
use std::pin::Pin;
|
|
||||||
use std::task::{self, Poll};
|
|
||||||
|
|
||||||
use anyhow::{bail, ensure, Context, Result};
|
use anyhow::{bail, ensure, Context, Result};
|
||||||
use async_compression::tokio::bufread::ZstdDecoder;
|
|
||||||
use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level};
|
|
||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
use camino::Utf8Path;
|
use camino::Utf8Path;
|
||||||
use futures::StreamExt;
|
use futures::StreamExt;
|
||||||
use nix::NixPath;
|
use tokio::io::{AsyncRead, AsyncReadExt};
|
||||||
use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};
|
|
||||||
use tokio_tar::Archive;
|
use tokio_tar::Archive;
|
||||||
use tokio_tar::Builder;
|
|
||||||
use tokio_tar::HeaderMode;
|
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
use walkdir::WalkDir;
|
use walkdir::WalkDir;
|
||||||
|
|
||||||
use crate::context::RequestContext;
|
use crate::context::RequestContext;
|
||||||
use crate::pgdatadir_mapping::*;
|
use crate::pgdatadir_mapping::*;
|
||||||
use crate::tenant::remote_timeline_client::INITDB_PATH;
|
|
||||||
use crate::tenant::Timeline;
|
use crate::tenant::Timeline;
|
||||||
use crate::walingest::WalIngest;
|
use crate::walingest::WalIngest;
|
||||||
use crate::walrecord::DecodedWALRecord;
|
use crate::walrecord::DecodedWALRecord;
|
||||||
@@ -41,9 +33,7 @@ use utils::lsn::Lsn;
|
|||||||
pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result<Lsn> {
|
pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result<Lsn> {
|
||||||
// Read control file to extract the LSN
|
// Read control file to extract the LSN
|
||||||
let controlfile_path = path.join("global").join("pg_control");
|
let controlfile_path = path.join("global").join("pg_control");
|
||||||
let controlfile_buf = std::fs::read(&controlfile_path)
|
let controlfile = ControlFileData::decode(&std::fs::read(controlfile_path)?)?;
|
||||||
.with_context(|| format!("reading controlfile: {controlfile_path}"))?;
|
|
||||||
let controlfile = ControlFileData::decode(&controlfile_buf)?;
|
|
||||||
let lsn = controlfile.checkPoint;
|
let lsn = controlfile.checkPoint;
|
||||||
|
|
||||||
Ok(Lsn(lsn))
|
Ok(Lsn(lsn))
|
||||||
@@ -628,118 +618,3 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result<Bytes>
|
|||||||
reader.read_to_end(&mut buf).await?;
|
reader.read_to_end(&mut buf).await?;
|
||||||
Ok(Bytes::from(buf))
|
Ok(Bytes::from(buf))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// An in-memory buffer implementing `AsyncWrite`, inserting yields every now and then
|
|
||||||
///
|
|
||||||
/// The number of yields is bounded by above by the number of times poll_write is called,
|
|
||||||
/// so calling it with 8 KB chunks and 8 MB chunks gives the same number of yields in total.
|
|
||||||
/// This is an explicit choice as the `YieldingVec` is meant to give the async executor
|
|
||||||
/// breathing room between units of CPU intensive preparation of buffers to be written.
|
|
||||||
/// Once a write call is issued, the whole buffer has been prepared already, so there is no
|
|
||||||
/// gain in splitting up the memcopy further.
|
|
||||||
struct YieldingVec {
|
|
||||||
yield_budget: usize,
|
|
||||||
// the buffer written into
|
|
||||||
buf: Vec<u8>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl YieldingVec {
|
|
||||||
fn new() -> Self {
|
|
||||||
Self {
|
|
||||||
yield_budget: 0,
|
|
||||||
buf: Vec::new(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Whether we should yield for a read operation of given size
|
|
||||||
fn should_yield(&mut self, add_buf_len: usize) -> bool {
|
|
||||||
// Set this limit to a small value so that we are a
|
|
||||||
// good async citizen and yield repeatedly (but not
|
|
||||||
// too often for many small writes to cause many yields)
|
|
||||||
const YIELD_DIST: usize = 1024;
|
|
||||||
|
|
||||||
let target_buf_len = self.buf.len() + add_buf_len;
|
|
||||||
let ret = self.yield_budget / YIELD_DIST < target_buf_len / YIELD_DIST;
|
|
||||||
if self.yield_budget < target_buf_len {
|
|
||||||
self.yield_budget += add_buf_len;
|
|
||||||
}
|
|
||||||
ret
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl AsyncWrite for YieldingVec {
|
|
||||||
fn poll_write(
|
|
||||||
mut self: Pin<&mut Self>,
|
|
||||||
cx: &mut task::Context<'_>,
|
|
||||||
buf: &[u8],
|
|
||||||
) -> Poll<std::io::Result<usize>> {
|
|
||||||
if self.should_yield(buf.len()) {
|
|
||||||
cx.waker().wake_by_ref();
|
|
||||||
return Poll::Pending;
|
|
||||||
}
|
|
||||||
self.get_mut().buf.extend_from_slice(buf);
|
|
||||||
Poll::Ready(Ok(buf.len()))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn poll_flush(self: Pin<&mut Self>, _cx: &mut task::Context<'_>) -> Poll<std::io::Result<()>> {
|
|
||||||
Poll::Ready(Ok(()))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn poll_shutdown(
|
|
||||||
self: Pin<&mut Self>,
|
|
||||||
_cx: &mut task::Context<'_>,
|
|
||||||
) -> Poll<std::io::Result<()>> {
|
|
||||||
Poll::Ready(Ok(()))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn create_tar_zst(pgdata_path: &Utf8Path) -> Result<Vec<u8>> {
|
|
||||||
let mut paths = Vec::new();
|
|
||||||
for entry in WalkDir::new(pgdata_path) {
|
|
||||||
let entry = entry?;
|
|
||||||
let metadata = entry.metadata().expect("error getting dir entry metadata");
|
|
||||||
// Also allow directories so that we also get empty directories
|
|
||||||
if !(metadata.is_file() || metadata.is_dir()) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
let path = entry.into_path();
|
|
||||||
paths.push(path);
|
|
||||||
}
|
|
||||||
// Do a sort to get a more consistent listing
|
|
||||||
paths.sort_unstable();
|
|
||||||
let zstd = ZstdEncoder::with_quality_and_params(
|
|
||||||
YieldingVec::new(),
|
|
||||||
Level::Default,
|
|
||||||
&[CParameter::enable_long_distance_matching(true)],
|
|
||||||
);
|
|
||||||
let mut builder = Builder::new(zstd);
|
|
||||||
// Use reproducible header mode
|
|
||||||
builder.mode(HeaderMode::Deterministic);
|
|
||||||
for path in paths {
|
|
||||||
let rel_path = path.strip_prefix(pgdata_path)?;
|
|
||||||
if rel_path.is_empty() {
|
|
||||||
// The top directory should not be compressed,
|
|
||||||
// the tar crate doesn't like that
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
builder.append_path_with_name(&path, rel_path).await?;
|
|
||||||
}
|
|
||||||
let mut zstd = builder.into_inner().await?;
|
|
||||||
zstd.shutdown().await?;
|
|
||||||
let compressed = zstd.into_inner();
|
|
||||||
let compressed_len = compressed.buf.len();
|
|
||||||
const INITDB_TAR_ZST_WARN_LIMIT: usize = 2_000_000;
|
|
||||||
if compressed_len > INITDB_TAR_ZST_WARN_LIMIT {
|
|
||||||
warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}.");
|
|
||||||
}
|
|
||||||
Ok(compressed.buf)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn extract_tar_zst(
|
|
||||||
pgdata_path: &Utf8Path,
|
|
||||||
tar_zst: impl AsyncBufRead + Unpin,
|
|
||||||
) -> Result<()> {
|
|
||||||
let tar = Box::pin(ZstdDecoder::new(tar_zst));
|
|
||||||
let mut archive = Archive::new(tar);
|
|
||||||
archive.unpack(pgdata_path).await?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ use metrics::{
|
|||||||
HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
||||||
};
|
};
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use pageserver_api::shard::TenantShardId;
|
|
||||||
use strum::{EnumCount, IntoEnumIterator, VariantNames};
|
use strum::{EnumCount, IntoEnumIterator, VariantNames};
|
||||||
use strum_macros::{EnumVariantNames, IntoStaticStr};
|
use strum_macros::{EnumVariantNames, IntoStaticStr};
|
||||||
use utils::id::{TenantId, TimelineId};
|
use utils::id::{TenantId, TimelineId};
|
||||||
@@ -403,126 +402,6 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
|||||||
.expect("failed to define current logical size metric")
|
.expect("failed to define current logical size metric")
|
||||||
});
|
});
|
||||||
|
|
||||||
pub(crate) mod initial_logical_size {
|
|
||||||
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
|
|
||||||
use once_cell::sync::Lazy;
|
|
||||||
|
|
||||||
use crate::task_mgr::TaskKind;
|
|
||||||
|
|
||||||
pub(crate) struct StartCalculation(IntCounterVec);
|
|
||||||
pub(crate) static START_CALCULATION: Lazy<StartCalculation> = Lazy::new(|| {
|
|
||||||
StartCalculation(
|
|
||||||
register_int_counter_vec!(
|
|
||||||
"pageserver_initial_logical_size_start_calculation",
|
|
||||||
"Incremented each time we start an initial logical size calculation attempt. \
|
|
||||||
The `task_kind` label is for the task kind that caused this attempt.",
|
|
||||||
&["attempt", "task_kind"]
|
|
||||||
)
|
|
||||||
.unwrap(),
|
|
||||||
)
|
|
||||||
});
|
|
||||||
|
|
||||||
struct DropCalculation {
|
|
||||||
first: IntCounter,
|
|
||||||
retry: IntCounter,
|
|
||||||
}
|
|
||||||
|
|
||||||
static DROP_CALCULATION: Lazy<DropCalculation> = Lazy::new(|| {
|
|
||||||
let vec = register_int_counter_vec!(
|
|
||||||
"pageserver_initial_logical_size_drop_calculation",
|
|
||||||
"Incremented each time we abort a started size calculation attmpt.",
|
|
||||||
&["attempt"]
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
DropCalculation {
|
|
||||||
first: vec.with_label_values(&["first"]),
|
|
||||||
retry: vec.with_label_values(&["retry"]),
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
pub(crate) struct Calculated {
|
|
||||||
pub(crate) births: IntCounter,
|
|
||||||
pub(crate) deaths: IntCounter,
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) static CALCULATED: Lazy<Calculated> = Lazy::new(|| Calculated {
|
|
||||||
births: register_int_counter!(
|
|
||||||
"pageserver_initial_logical_size_finish_calculation",
|
|
||||||
"Incremented every time we finish calculation of initial logical size.\
|
|
||||||
If everything is working well, this should happen at most once per Timeline object."
|
|
||||||
)
|
|
||||||
.unwrap(),
|
|
||||||
deaths: register_int_counter!(
|
|
||||||
"pageserver_initial_logical_size_drop_finished_calculation",
|
|
||||||
"Incremented when we drop a finished initial logical size calculation result.\
|
|
||||||
Mainly useful to turn pageserver_initial_logical_size_finish_calculation into a gauge."
|
|
||||||
)
|
|
||||||
.unwrap(),
|
|
||||||
});
|
|
||||||
|
|
||||||
pub(crate) struct OngoingCalculationGuard {
|
|
||||||
inc_drop_calculation: Option<IntCounter>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl StartCalculation {
|
|
||||||
pub(crate) fn first(&self, causing_task_kind: Option<TaskKind>) -> OngoingCalculationGuard {
|
|
||||||
let task_kind_label: &'static str =
|
|
||||||
causing_task_kind.map(|k| k.into()).unwrap_or_default();
|
|
||||||
self.0.with_label_values(&["first", task_kind_label]);
|
|
||||||
OngoingCalculationGuard {
|
|
||||||
inc_drop_calculation: Some(DROP_CALCULATION.first.clone()),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
pub(crate) fn retry(&self, causing_task_kind: Option<TaskKind>) -> OngoingCalculationGuard {
|
|
||||||
let task_kind_label: &'static str =
|
|
||||||
causing_task_kind.map(|k| k.into()).unwrap_or_default();
|
|
||||||
self.0.with_label_values(&["retry", task_kind_label]);
|
|
||||||
OngoingCalculationGuard {
|
|
||||||
inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Drop for OngoingCalculationGuard {
|
|
||||||
fn drop(&mut self) {
|
|
||||||
if let Some(counter) = self.inc_drop_calculation.take() {
|
|
||||||
counter.inc();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl OngoingCalculationGuard {
|
|
||||||
pub(crate) fn calculation_result_saved(mut self) -> FinishedCalculationGuard {
|
|
||||||
drop(self.inc_drop_calculation.take());
|
|
||||||
CALCULATED.births.inc();
|
|
||||||
FinishedCalculationGuard {
|
|
||||||
inc_on_drop: CALCULATED.deaths.clone(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) struct FinishedCalculationGuard {
|
|
||||||
inc_on_drop: IntCounter,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Drop for FinishedCalculationGuard {
|
|
||||||
fn drop(&mut self) {
|
|
||||||
self.inc_on_drop.inc();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// context: https://github.com/neondatabase/neon/issues/5963
|
|
||||||
pub(crate) static TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE: Lazy<IntCounter> =
|
|
||||||
Lazy::new(|| {
|
|
||||||
register_int_counter!(
|
|
||||||
"pageserver_initial_logical_size_timelines_where_walreceiver_got_approximate_size",
|
|
||||||
"Counter for the following event: walreceiver calls\
|
|
||||||
Timeline::get_current_logical_size() and it returns `Approximate` for the first time."
|
|
||||||
)
|
|
||||||
.unwrap()
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||||
register_uint_gauge_vec!(
|
register_uint_gauge_vec!(
|
||||||
"pageserver_tenant_states_count",
|
"pageserver_tenant_states_count",
|
||||||
@@ -759,7 +638,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
|
|||||||
///
|
///
|
||||||
/// Operations:
|
/// Operations:
|
||||||
/// - open ([`std::fs::OpenOptions::open`])
|
/// - open ([`std::fs::OpenOptions::open`])
|
||||||
/// - close (dropping [`crate::virtual_file::VirtualFile`])
|
/// - close (dropping [`std::fs::File`])
|
||||||
/// - close-by-replace (close by replacement algorithm)
|
/// - close-by-replace (close by replacement algorithm)
|
||||||
/// - read (`read_at`)
|
/// - read (`read_at`)
|
||||||
/// - write (`write_at`)
|
/// - write (`write_at`)
|
||||||
@@ -1373,55 +1252,6 @@ pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
|
|||||||
.unwrap()
|
.unwrap()
|
||||||
});
|
});
|
||||||
|
|
||||||
pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
|
||||||
register_histogram!(
|
|
||||||
"pageserver_wal_redo_process_launch_duration",
|
|
||||||
"Histogram of the duration of successful WalRedoProcess::launch calls",
|
|
||||||
redo_histogram_time_buckets!(),
|
|
||||||
)
|
|
||||||
.expect("failed to define a metric")
|
|
||||||
});
|
|
||||||
|
|
||||||
pub(crate) struct WalRedoProcessCounters {
|
|
||||||
pub(crate) started: IntCounter,
|
|
||||||
pub(crate) killed_by_cause: enum_map::EnumMap<WalRedoKillCause, IntCounter>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
|
|
||||||
pub(crate) enum WalRedoKillCause {
|
|
||||||
WalRedoProcessDrop,
|
|
||||||
NoLeakChildDrop,
|
|
||||||
Startup,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Default for WalRedoProcessCounters {
|
|
||||||
fn default() -> Self {
|
|
||||||
let started = register_int_counter!(
|
|
||||||
"pageserver_wal_redo_process_started_total",
|
|
||||||
"Number of WAL redo processes started",
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let killed = register_int_counter_vec!(
|
|
||||||
"pageserver_wal_redo_process_stopped_total",
|
|
||||||
"Number of WAL redo processes stopped",
|
|
||||||
&["cause"],
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
Self {
|
|
||||||
started,
|
|
||||||
killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| {
|
|
||||||
let cause = <WalRedoKillCause as enum_map::Enum>::from_usize(i);
|
|
||||||
let cause_str: &'static str = cause.into();
|
|
||||||
killed.with_label_values(&[cause_str])
|
|
||||||
})),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
|
|
||||||
Lazy::new(WalRedoProcessCounters::default);
|
|
||||||
|
|
||||||
/// Similar to `prometheus::HistogramTimer` but does not record on drop.
|
/// Similar to `prometheus::HistogramTimer` but does not record on drop.
|
||||||
pub struct StorageTimeMetricsTimer {
|
pub struct StorageTimeMetricsTimer {
|
||||||
metrics: StorageTimeMetrics,
|
metrics: StorageTimeMetrics,
|
||||||
@@ -1701,9 +1531,9 @@ pub struct RemoteTimelineClientMetrics {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl RemoteTimelineClientMetrics {
|
impl RemoteTimelineClientMetrics {
|
||||||
pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
|
pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
|
||||||
RemoteTimelineClientMetrics {
|
RemoteTimelineClientMetrics {
|
||||||
tenant_id: tenant_shard_id.tenant_id.to_string(),
|
tenant_id: tenant_id.to_string(),
|
||||||
timeline_id: timeline_id.to_string(),
|
timeline_id: timeline_id.to_string(),
|
||||||
calls_unfinished_gauge: Mutex::new(HashMap::default()),
|
calls_unfinished_gauge: Mutex::new(HashMap::default()),
|
||||||
bytes_started_counter: Mutex::new(HashMap::default()),
|
bytes_started_counter: Mutex::new(HashMap::default()),
|
||||||
@@ -2091,7 +1921,6 @@ pub fn preinitialize_metrics() {
|
|||||||
&WAL_REDO_TIME,
|
&WAL_REDO_TIME,
|
||||||
&WAL_REDO_RECORDS_HISTOGRAM,
|
&WAL_REDO_RECORDS_HISTOGRAM,
|
||||||
&WAL_REDO_BYTES_HISTOGRAM,
|
&WAL_REDO_BYTES_HISTOGRAM,
|
||||||
&WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
|
|
||||||
]
|
]
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.for_each(|h| {
|
.for_each(|h| {
|
||||||
|
|||||||
@@ -399,9 +399,6 @@ impl PageServerHandler {
|
|||||||
{
|
{
|
||||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
|
|
||||||
// TODO(sharding): enumerate local tenant shards for this tenant, and select the one
|
|
||||||
// that should serve this request.
|
|
||||||
|
|
||||||
// Make request tracer if needed
|
// Make request tracer if needed
|
||||||
let tenant = mgr::get_active_tenant_with_timeout(
|
let tenant = mgr::get_active_tenant_with_timeout(
|
||||||
tenant_id,
|
tenant_id,
|
||||||
@@ -411,10 +408,9 @@ impl PageServerHandler {
|
|||||||
.await?;
|
.await?;
|
||||||
let mut tracer = if tenant.get_trace_read_requests() {
|
let mut tracer = if tenant.get_trace_read_requests() {
|
||||||
let connection_id = ConnectionId::generate();
|
let connection_id = ConnectionId::generate();
|
||||||
let path =
|
let path = tenant
|
||||||
tenant
|
.conf
|
||||||
.conf
|
.trace_path(&tenant_id, &timeline_id, &connection_id);
|
||||||
.trace_path(&tenant.tenant_shard_id(), &timeline_id, &connection_id);
|
|
||||||
Some(Tracer::new(path))
|
Some(Tracer::new(path))
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
|
|||||||
@@ -21,9 +21,7 @@ use serde::{Deserialize, Serialize};
|
|||||||
use std::collections::{hash_map, HashMap, HashSet};
|
use std::collections::{hash_map, HashMap, HashSet};
|
||||||
use std::ops::ControlFlow;
|
use std::ops::ControlFlow;
|
||||||
use std::ops::Range;
|
use std::ops::Range;
|
||||||
use tokio_util::sync::CancellationToken;
|
|
||||||
use tracing::{debug, trace, warn};
|
use tracing::{debug, trace, warn};
|
||||||
use utils::bin_ser::DeserializeError;
|
|
||||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||||
|
|
||||||
/// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type.
|
/// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type.
|
||||||
@@ -31,33 +29,9 @@ pub type BlockNumber = u32;
|
|||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub enum LsnForTimestamp {
|
pub enum LsnForTimestamp {
|
||||||
/// Found commits both before and after the given timestamp
|
|
||||||
Present(Lsn),
|
Present(Lsn),
|
||||||
|
|
||||||
/// Found no commits after the given timestamp, this means
|
|
||||||
/// that the newest data in the branch is older than the given
|
|
||||||
/// timestamp.
|
|
||||||
///
|
|
||||||
/// All commits <= LSN happened before the given timestamp
|
|
||||||
Future(Lsn),
|
Future(Lsn),
|
||||||
|
|
||||||
/// The queried timestamp is past our horizon we look back at (PITR)
|
|
||||||
///
|
|
||||||
/// All commits > LSN happened after the given timestamp,
|
|
||||||
/// but any commits < LSN might have happened before or after
|
|
||||||
/// the given timestamp. We don't know because no data before
|
|
||||||
/// the given lsn is available.
|
|
||||||
Past(Lsn),
|
Past(Lsn),
|
||||||
|
|
||||||
/// We have found no commit with a timestamp,
|
|
||||||
/// so we can't return anything meaningful.
|
|
||||||
///
|
|
||||||
/// The associated LSN is the lower bound value we can safely
|
|
||||||
/// create branches on, but no statement is made if it is
|
|
||||||
/// older or newer than the timestamp.
|
|
||||||
///
|
|
||||||
/// This variant can e.g. be returned right after a
|
|
||||||
/// cluster import.
|
|
||||||
NoData(Lsn),
|
NoData(Lsn),
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -69,25 +43,6 @@ pub enum CalculateLogicalSizeError {
|
|||||||
Other(#[from] anyhow::Error),
|
Other(#[from] anyhow::Error),
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, thiserror::Error)]
|
|
||||||
pub(crate) enum CollectKeySpaceError {
|
|
||||||
#[error(transparent)]
|
|
||||||
Decode(#[from] DeserializeError),
|
|
||||||
#[error(transparent)]
|
|
||||||
PageRead(PageReconstructError),
|
|
||||||
#[error("cancelled")]
|
|
||||||
Cancelled,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<PageReconstructError> for CollectKeySpaceError {
|
|
||||||
fn from(err: PageReconstructError) -> Self {
|
|
||||||
match err {
|
|
||||||
PageReconstructError::Cancelled => Self::Cancelled,
|
|
||||||
err => Self::PageRead(err),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<PageReconstructError> for CalculateLogicalSizeError {
|
impl From<PageReconstructError> for CalculateLogicalSizeError {
|
||||||
fn from(pre: PageReconstructError) -> Self {
|
fn from(pre: PageReconstructError) -> Self {
|
||||||
match pre {
|
match pre {
|
||||||
@@ -366,15 +321,10 @@ impl Timeline {
|
|||||||
pub async fn find_lsn_for_timestamp(
|
pub async fn find_lsn_for_timestamp(
|
||||||
&self,
|
&self,
|
||||||
search_timestamp: TimestampTz,
|
search_timestamp: TimestampTz,
|
||||||
cancel: &CancellationToken,
|
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<LsnForTimestamp, PageReconstructError> {
|
) -> Result<LsnForTimestamp, PageReconstructError> {
|
||||||
let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
|
let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
|
||||||
// We use this method to figure out the branching LSN for the new branch, but the
|
let min_lsn = *gc_cutoff_lsn_guard;
|
||||||
// GC cutoff could be before the branching point and we cannot create a new branch
|
|
||||||
// with LSN < `ancestor_lsn`. Thus, pick the maximum of these two to be
|
|
||||||
// on the safe side.
|
|
||||||
let min_lsn = std::cmp::max(*gc_cutoff_lsn_guard, self.get_ancestor_lsn());
|
|
||||||
let max_lsn = self.get_last_record_lsn();
|
let max_lsn = self.get_last_record_lsn();
|
||||||
|
|
||||||
// LSNs are always 8-byte aligned. low/mid/high represent the
|
// LSNs are always 8-byte aligned. low/mid/high represent the
|
||||||
@@ -385,9 +335,6 @@ impl Timeline {
|
|||||||
let mut found_smaller = false;
|
let mut found_smaller = false;
|
||||||
let mut found_larger = false;
|
let mut found_larger = false;
|
||||||
while low < high {
|
while low < high {
|
||||||
if cancel.is_cancelled() {
|
|
||||||
return Err(PageReconstructError::Cancelled);
|
|
||||||
}
|
|
||||||
// cannot overflow, high and low are both smaller than u64::MAX / 2
|
// cannot overflow, high and low are both smaller than u64::MAX / 2
|
||||||
let mid = (high + low) / 2;
|
let mid = (high + low) / 2;
|
||||||
|
|
||||||
@@ -407,33 +354,30 @@ impl Timeline {
|
|||||||
low = mid + 1;
|
low = mid + 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// If `found_smaller == true`, `low = t + 1` where `t` is the target LSN,
|
|
||||||
// so the LSN of the last commit record before or at `search_timestamp`.
|
|
||||||
// Remove one from `low` to get `t`.
|
|
||||||
//
|
|
||||||
// FIXME: it would be better to get the LSN of the previous commit.
|
|
||||||
// Otherwise, if you restore to the returned LSN, the database will
|
|
||||||
// include physical changes from later commits that will be marked
|
|
||||||
// as aborted, and will need to be vacuumed away.
|
|
||||||
let commit_lsn = Lsn((low - 1) * 8);
|
|
||||||
match (found_smaller, found_larger) {
|
match (found_smaller, found_larger) {
|
||||||
(false, false) => {
|
(false, false) => {
|
||||||
// This can happen if no commit records have been processed yet, e.g.
|
// This can happen if no commit records have been processed yet, e.g.
|
||||||
// just after importing a cluster.
|
// just after importing a cluster.
|
||||||
Ok(LsnForTimestamp::NoData(min_lsn))
|
Ok(LsnForTimestamp::NoData(max_lsn))
|
||||||
|
}
|
||||||
|
(true, false) => {
|
||||||
|
// Didn't find any commit timestamps larger than the request
|
||||||
|
Ok(LsnForTimestamp::Future(max_lsn))
|
||||||
}
|
}
|
||||||
(false, true) => {
|
(false, true) => {
|
||||||
// Didn't find any commit timestamps smaller than the request
|
// Didn't find any commit timestamps smaller than the request
|
||||||
Ok(LsnForTimestamp::Past(min_lsn))
|
Ok(LsnForTimestamp::Past(max_lsn))
|
||||||
}
|
}
|
||||||
(true, false) => {
|
(true, true) => {
|
||||||
// Only found commits with timestamps smaller than the request.
|
// low is the LSN of the first commit record *after* the search_timestamp,
|
||||||
// It's still a valid case for branch creation, return it.
|
// Back off by one to get to the point just before the commit.
|
||||||
// And `update_gc_info()` ignores LSN for a `LsnForTimestamp::Future`
|
//
|
||||||
// case, anyway.
|
// FIXME: it would be better to get the LSN of the previous commit.
|
||||||
Ok(LsnForTimestamp::Future(commit_lsn))
|
// Otherwise, if you restore to the returned LSN, the database will
|
||||||
|
// include physical changes from later commits that will be marked
|
||||||
|
// as aborted, and will need to be vacuumed away.
|
||||||
|
Ok(LsnForTimestamp::Present(Lsn((low - 1) * 8)))
|
||||||
}
|
}
|
||||||
(true, true) => Ok(LsnForTimestamp::Present(commit_lsn)),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -661,11 +605,11 @@ impl Timeline {
|
|||||||
/// Get a KeySpace that covers all the Keys that are in use at the given LSN.
|
/// Get a KeySpace that covers all the Keys that are in use at the given LSN.
|
||||||
/// Anything that's not listed maybe removed from the underlying storage (from
|
/// Anything that's not listed maybe removed from the underlying storage (from
|
||||||
/// that LSN forwards).
|
/// that LSN forwards).
|
||||||
pub(crate) async fn collect_keyspace(
|
pub async fn collect_keyspace(
|
||||||
&self,
|
&self,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<KeySpace, CollectKeySpaceError> {
|
) -> anyhow::Result<KeySpace> {
|
||||||
// Iterate through key ranges, greedily packing them into partitions
|
// Iterate through key ranges, greedily packing them into partitions
|
||||||
let mut result = KeySpaceAccum::new();
|
let mut result = KeySpaceAccum::new();
|
||||||
|
|
||||||
@@ -674,7 +618,7 @@ impl Timeline {
|
|||||||
|
|
||||||
// Fetch list of database dirs and iterate them
|
// Fetch list of database dirs and iterate them
|
||||||
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
|
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
|
||||||
let dbdir = DbDirectory::des(&buf)?;
|
let dbdir = DbDirectory::des(&buf).context("deserialization failure")?;
|
||||||
|
|
||||||
let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
|
let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
|
||||||
dbs.sort_unstable();
|
dbs.sort_unstable();
|
||||||
@@ -707,7 +651,7 @@ impl Timeline {
|
|||||||
let slrudir_key = slru_dir_to_key(kind);
|
let slrudir_key = slru_dir_to_key(kind);
|
||||||
result.add_key(slrudir_key);
|
result.add_key(slrudir_key);
|
||||||
let buf = self.get(slrudir_key, lsn, ctx).await?;
|
let buf = self.get(slrudir_key, lsn, ctx).await?;
|
||||||
let dir = SlruSegmentDirectory::des(&buf)?;
|
let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?;
|
||||||
let mut segments: Vec<u32> = dir.segments.iter().cloned().collect();
|
let mut segments: Vec<u32> = dir.segments.iter().cloned().collect();
|
||||||
segments.sort_unstable();
|
segments.sort_unstable();
|
||||||
for segno in segments {
|
for segno in segments {
|
||||||
@@ -725,7 +669,7 @@ impl Timeline {
|
|||||||
// Then pg_twophase
|
// Then pg_twophase
|
||||||
result.add_key(TWOPHASEDIR_KEY);
|
result.add_key(TWOPHASEDIR_KEY);
|
||||||
let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
|
let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
|
||||||
let twophase_dir = TwoPhaseDirectory::des(&buf)?;
|
let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?;
|
||||||
let mut xids: Vec<TransactionId> = twophase_dir.xids.iter().cloned().collect();
|
let mut xids: Vec<TransactionId> = twophase_dir.xids.iter().cloned().collect();
|
||||||
xids.sort_unstable();
|
xids.sort_unstable();
|
||||||
for xid in xids {
|
for xid in xids {
|
||||||
|
|||||||
@@ -1,11 +1,106 @@
|
|||||||
use crate::walrecord::NeonWalRecord;
|
use crate::walrecord::NeonWalRecord;
|
||||||
use anyhow::Result;
|
use anyhow::{bail, Result};
|
||||||
|
use byteorder::{ByteOrder, BE};
|
||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::fmt;
|
||||||
use std::ops::{AddAssign, Range};
|
use std::ops::{AddAssign, Range};
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
||||||
pub use pageserver_api::key::{Key, KEY_SIZE};
|
/// Key used in the Repository kv-store.
|
||||||
|
///
|
||||||
|
/// The Repository treats this as an opaque struct, but see the code in pgdatadir_mapping.rs
|
||||||
|
/// for what we actually store in these fields.
|
||||||
|
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
|
||||||
|
pub struct Key {
|
||||||
|
pub field1: u8,
|
||||||
|
pub field2: u32,
|
||||||
|
pub field3: u32,
|
||||||
|
pub field4: u32,
|
||||||
|
pub field5: u8,
|
||||||
|
pub field6: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub const KEY_SIZE: usize = 18;
|
||||||
|
|
||||||
|
impl Key {
|
||||||
|
/// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
|
||||||
|
/// As long as Neon does not support tablespace (because of lack of access to local file system),
|
||||||
|
/// we can assume that only some predefined namespace OIDs are used which can fit in u16
|
||||||
|
pub fn to_i128(&self) -> i128 {
|
||||||
|
assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
|
||||||
|
(((self.field1 & 0xf) as i128) << 120)
|
||||||
|
| (((self.field2 & 0xFFFF) as i128) << 104)
|
||||||
|
| ((self.field3 as i128) << 72)
|
||||||
|
| ((self.field4 as i128) << 40)
|
||||||
|
| ((self.field5 as i128) << 32)
|
||||||
|
| self.field6 as i128
|
||||||
|
}
|
||||||
|
|
||||||
|
pub const fn from_i128(x: i128) -> Self {
|
||||||
|
Key {
|
||||||
|
field1: ((x >> 120) & 0xf) as u8,
|
||||||
|
field2: ((x >> 104) & 0xFFFF) as u32,
|
||||||
|
field3: (x >> 72) as u32,
|
||||||
|
field4: (x >> 40) as u32,
|
||||||
|
field5: (x >> 32) as u8,
|
||||||
|
field6: x as u32,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn next(&self) -> Key {
|
||||||
|
self.add(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn add(&self, x: u32) -> Key {
|
||||||
|
let mut key = *self;
|
||||||
|
|
||||||
|
let r = key.field6.overflowing_add(x);
|
||||||
|
key.field6 = r.0;
|
||||||
|
if r.1 {
|
||||||
|
let r = key.field5.overflowing_add(1);
|
||||||
|
key.field5 = r.0;
|
||||||
|
if r.1 {
|
||||||
|
let r = key.field4.overflowing_add(1);
|
||||||
|
key.field4 = r.0;
|
||||||
|
if r.1 {
|
||||||
|
let r = key.field3.overflowing_add(1);
|
||||||
|
key.field3 = r.0;
|
||||||
|
if r.1 {
|
||||||
|
let r = key.field2.overflowing_add(1);
|
||||||
|
key.field2 = r.0;
|
||||||
|
if r.1 {
|
||||||
|
let r = key.field1.overflowing_add(1);
|
||||||
|
key.field1 = r.0;
|
||||||
|
assert!(!r.1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
key
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn from_slice(b: &[u8]) -> Self {
|
||||||
|
Key {
|
||||||
|
field1: b[0],
|
||||||
|
field2: u32::from_be_bytes(b[1..5].try_into().unwrap()),
|
||||||
|
field3: u32::from_be_bytes(b[5..9].try_into().unwrap()),
|
||||||
|
field4: u32::from_be_bytes(b[9..13].try_into().unwrap()),
|
||||||
|
field5: b[13],
|
||||||
|
field6: u32::from_be_bytes(b[14..18].try_into().unwrap()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
|
||||||
|
buf[0] = self.field1;
|
||||||
|
BE::write_u32(&mut buf[1..5], self.field2);
|
||||||
|
BE::write_u32(&mut buf[5..9], self.field3);
|
||||||
|
BE::write_u32(&mut buf[9..13], self.field4);
|
||||||
|
buf[13] = self.field5;
|
||||||
|
BE::write_u32(&mut buf[14..18], self.field6);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn key_range_size(key_range: &Range<Key>) -> u32 {
|
pub fn key_range_size(key_range: &Range<Key>) -> u32 {
|
||||||
let start = key_range.start;
|
let start = key_range.start;
|
||||||
@@ -34,9 +129,51 @@ pub fn singleton_range(key: Key) -> Range<Key> {
|
|||||||
key..key.next()
|
key..key.next()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl fmt::Display for Key {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
|
write!(
|
||||||
|
f,
|
||||||
|
"{:02X}{:08X}{:08X}{:08X}{:02X}{:08X}",
|
||||||
|
self.field1, self.field2, self.field3, self.field4, self.field5, self.field6
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Key {
|
||||||
|
pub const MIN: Key = Key {
|
||||||
|
field1: u8::MIN,
|
||||||
|
field2: u32::MIN,
|
||||||
|
field3: u32::MIN,
|
||||||
|
field4: u32::MIN,
|
||||||
|
field5: u8::MIN,
|
||||||
|
field6: u32::MIN,
|
||||||
|
};
|
||||||
|
pub const MAX: Key = Key {
|
||||||
|
field1: u8::MAX,
|
||||||
|
field2: u32::MAX,
|
||||||
|
field3: u32::MAX,
|
||||||
|
field4: u32::MAX,
|
||||||
|
field5: u8::MAX,
|
||||||
|
field6: u32::MAX,
|
||||||
|
};
|
||||||
|
|
||||||
|
pub fn from_hex(s: &str) -> Result<Self> {
|
||||||
|
if s.len() != 36 {
|
||||||
|
bail!("parse error");
|
||||||
|
}
|
||||||
|
Ok(Key {
|
||||||
|
field1: u8::from_str_radix(&s[0..2], 16)?,
|
||||||
|
field2: u32::from_str_radix(&s[2..10], 16)?,
|
||||||
|
field3: u32::from_str_radix(&s[10..18], 16)?,
|
||||||
|
field4: u32::from_str_radix(&s[18..26], 16)?,
|
||||||
|
field5: u8::from_str_radix(&s[26..28], 16)?,
|
||||||
|
field6: u32::from_str_radix(&s[28..36], 16)?,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// A 'value' stored for a one Key.
|
/// A 'value' stored for a one Key.
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
#[cfg_attr(test, derive(PartialEq))]
|
|
||||||
pub enum Value {
|
pub enum Value {
|
||||||
/// An Image value contains a full copy of the value
|
/// An Image value contains a full copy of the value
|
||||||
Image(Bytes),
|
Image(Bytes),
|
||||||
@@ -60,70 +197,6 @@ impl Value {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod test {
|
|
||||||
use super::*;
|
|
||||||
|
|
||||||
use bytes::Bytes;
|
|
||||||
use utils::bin_ser::BeSer;
|
|
||||||
|
|
||||||
macro_rules! roundtrip {
|
|
||||||
($orig:expr, $expected:expr) => {{
|
|
||||||
let orig: Value = $orig;
|
|
||||||
|
|
||||||
let actual = Value::ser(&orig).unwrap();
|
|
||||||
let expected: &[u8] = &$expected;
|
|
||||||
|
|
||||||
assert_eq!(utils::Hex(&actual), utils::Hex(expected));
|
|
||||||
|
|
||||||
let deser = Value::des(&actual).unwrap();
|
|
||||||
|
|
||||||
assert_eq!(orig, deser);
|
|
||||||
}};
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn image_roundtrip() {
|
|
||||||
let image = Bytes::from_static(b"foobar");
|
|
||||||
let image = Value::Image(image);
|
|
||||||
|
|
||||||
#[rustfmt::skip]
|
|
||||||
let expected = [
|
|
||||||
// top level discriminator of 4 bytes
|
|
||||||
0x00, 0x00, 0x00, 0x00,
|
|
||||||
// 8 byte length
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06,
|
|
||||||
// foobar
|
|
||||||
0x66, 0x6f, 0x6f, 0x62, 0x61, 0x72
|
|
||||||
];
|
|
||||||
|
|
||||||
roundtrip!(image, expected);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn walrecord_postgres_roundtrip() {
|
|
||||||
let rec = NeonWalRecord::Postgres {
|
|
||||||
will_init: true,
|
|
||||||
rec: Bytes::from_static(b"foobar"),
|
|
||||||
};
|
|
||||||
let rec = Value::WalRecord(rec);
|
|
||||||
|
|
||||||
#[rustfmt::skip]
|
|
||||||
let expected = [
|
|
||||||
// flattened discriminator of total 8 bytes
|
|
||||||
0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00,
|
|
||||||
// will_init
|
|
||||||
0x01,
|
|
||||||
// 8 byte length
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06,
|
|
||||||
// foobar
|
|
||||||
0x66, 0x6f, 0x6f, 0x62, 0x61, 0x72
|
|
||||||
];
|
|
||||||
|
|
||||||
roundtrip!(rec, expected);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
///
|
///
|
||||||
/// Result of performing GC
|
/// Result of performing GC
|
||||||
///
|
///
|
||||||
@@ -138,14 +211,6 @@ pub struct GcResult {
|
|||||||
|
|
||||||
#[serde(serialize_with = "serialize_duration_as_millis")]
|
#[serde(serialize_with = "serialize_duration_as_millis")]
|
||||||
pub elapsed: Duration,
|
pub elapsed: Duration,
|
||||||
|
|
||||||
/// The layers which were garbage collected.
|
|
||||||
///
|
|
||||||
/// Used in `/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc` to wait for the layers to be
|
|
||||||
/// dropped in tests.
|
|
||||||
#[cfg(feature = "testing")]
|
|
||||||
#[serde(skip)]
|
|
||||||
pub(crate) doomed_layers: Vec<crate::tenant::storage_layer::Layer>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds
|
// helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds
|
||||||
@@ -166,11 +231,5 @@ impl AddAssign for GcResult {
|
|||||||
self.layers_removed += other.layers_removed;
|
self.layers_removed += other.layers_removed;
|
||||||
|
|
||||||
self.elapsed += other.elapsed;
|
self.elapsed += other.elapsed;
|
||||||
|
|
||||||
#[cfg(feature = "testing")]
|
|
||||||
{
|
|
||||||
let mut other = other;
|
|
||||||
self.doomed_layers.append(&mut other.doomed_layers);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -8,12 +8,9 @@
|
|||||||
//! We cannot use global or default config instead, because wrong settings
|
//! We cannot use global or default config instead, because wrong settings
|
||||||
//! may lead to a data loss.
|
//! may lead to a data loss.
|
||||||
//!
|
//!
|
||||||
use anyhow::bail;
|
use anyhow::Context;
|
||||||
use pageserver_api::models;
|
use pageserver_api::models;
|
||||||
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
|
|
||||||
use serde::de::IntoDeserializer;
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use serde_json::Value;
|
|
||||||
use std::num::NonZeroU64;
|
use std::num::NonZeroU64;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use utils::generation::Generation;
|
use utils::generation::Generation;
|
||||||
@@ -91,14 +88,6 @@ pub(crate) struct LocationConf {
|
|||||||
/// The location-specific part of the configuration, describes the operating
|
/// The location-specific part of the configuration, describes the operating
|
||||||
/// mode of this pageserver for this tenant.
|
/// mode of this pageserver for this tenant.
|
||||||
pub(crate) mode: LocationMode,
|
pub(crate) mode: LocationMode,
|
||||||
|
|
||||||
/// The detailed shard identity. This structure is already scoped within
|
|
||||||
/// a TenantShardId, but we need the full ShardIdentity to enable calculating
|
|
||||||
/// key->shard mappings.
|
|
||||||
#[serde(default = "ShardIdentity::unsharded")]
|
|
||||||
#[serde(skip_serializing_if = "ShardIdentity::is_unsharded")]
|
|
||||||
pub(crate) shard: ShardIdentity,
|
|
||||||
|
|
||||||
/// The pan-cluster tenant configuration, the same on all locations
|
/// The pan-cluster tenant configuration, the same on all locations
|
||||||
pub(crate) tenant_conf: TenantConfOpt,
|
pub(crate) tenant_conf: TenantConfOpt,
|
||||||
}
|
}
|
||||||
@@ -171,8 +160,6 @@ impl LocationConf {
|
|||||||
generation,
|
generation,
|
||||||
attach_mode: AttachmentMode::Single,
|
attach_mode: AttachmentMode::Single,
|
||||||
}),
|
}),
|
||||||
// Legacy configuration loads are always from tenants created before sharding existed.
|
|
||||||
shard: ShardIdentity::unsharded(),
|
|
||||||
tenant_conf,
|
tenant_conf,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -200,7 +187,6 @@ impl LocationConf {
|
|||||||
|
|
||||||
fn get_generation(conf: &'_ models::LocationConfig) -> Result<Generation, anyhow::Error> {
|
fn get_generation(conf: &'_ models::LocationConfig) -> Result<Generation, anyhow::Error> {
|
||||||
conf.generation
|
conf.generation
|
||||||
.map(Generation::new)
|
|
||||||
.ok_or_else(|| anyhow::anyhow!("Generation must be set when attaching"))
|
.ok_or_else(|| anyhow::anyhow!("Generation must be set when attaching"))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -240,21 +226,7 @@ impl LocationConf {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let shard = if conf.shard_count == 0 {
|
Ok(Self { mode, tenant_conf })
|
||||||
ShardIdentity::unsharded()
|
|
||||||
} else {
|
|
||||||
ShardIdentity::new(
|
|
||||||
ShardNumber(conf.shard_number),
|
|
||||||
ShardCount(conf.shard_count),
|
|
||||||
ShardStripeSize(conf.shard_stripe_size),
|
|
||||||
)?
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(Self {
|
|
||||||
shard,
|
|
||||||
mode,
|
|
||||||
tenant_conf,
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -269,7 +241,6 @@ impl Default for LocationConf {
|
|||||||
attach_mode: AttachmentMode::Single,
|
attach_mode: AttachmentMode::Single,
|
||||||
}),
|
}),
|
||||||
tenant_conf: TenantConfOpt::default(),
|
tenant_conf: TenantConfOpt::default(),
|
||||||
shard: ShardIdentity::unsharded(),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -523,49 +494,105 @@ impl Default for TenantConf {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Helper function to standardize the error messages we produce on bad durations
|
||||||
|
//
|
||||||
|
// Intended to be used with anyhow's `with_context`, e.g.:
|
||||||
|
//
|
||||||
|
// let value = result.with_context(bad_duration("name", &value))?;
|
||||||
|
//
|
||||||
|
fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn() -> String {
|
||||||
|
move || format!("Cannot parse `{field_name}` duration {value:?}")
|
||||||
|
}
|
||||||
|
|
||||||
impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
|
impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
|
||||||
type Error = anyhow::Error;
|
type Error = anyhow::Error;
|
||||||
|
|
||||||
fn try_from(request_data: &'_ models::TenantConfig) -> Result<Self, Self::Error> {
|
fn try_from(request_data: &'_ models::TenantConfig) -> Result<Self, Self::Error> {
|
||||||
// Convert the request_data to a JSON Value
|
let mut tenant_conf = TenantConfOpt::default();
|
||||||
let json_value: Value = serde_json::to_value(request_data)?;
|
|
||||||
|
|
||||||
// Create a Deserializer from the JSON Value
|
if let Some(gc_period) = &request_data.gc_period {
|
||||||
let deserializer = json_value.into_deserializer();
|
tenant_conf.gc_period = Some(
|
||||||
|
humantime::parse_duration(gc_period)
|
||||||
|
.with_context(bad_duration("gc_period", gc_period))?,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
tenant_conf.gc_horizon = request_data.gc_horizon;
|
||||||
|
tenant_conf.image_creation_threshold = request_data.image_creation_threshold;
|
||||||
|
|
||||||
// Use serde_path_to_error to deserialize the JSON Value into TenantConfOpt
|
if let Some(pitr_interval) = &request_data.pitr_interval {
|
||||||
let tenant_conf: TenantConfOpt = serde_path_to_error::deserialize(deserializer)?;
|
tenant_conf.pitr_interval = Some(
|
||||||
|
humantime::parse_duration(pitr_interval)
|
||||||
|
.with_context(bad_duration("pitr_interval", pitr_interval))?,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(walreceiver_connect_timeout) = &request_data.walreceiver_connect_timeout {
|
||||||
|
tenant_conf.walreceiver_connect_timeout = Some(
|
||||||
|
humantime::parse_duration(walreceiver_connect_timeout).with_context(
|
||||||
|
bad_duration("walreceiver_connect_timeout", walreceiver_connect_timeout),
|
||||||
|
)?,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
if let Some(lagging_wal_timeout) = &request_data.lagging_wal_timeout {
|
||||||
|
tenant_conf.lagging_wal_timeout = Some(
|
||||||
|
humantime::parse_duration(lagging_wal_timeout)
|
||||||
|
.with_context(bad_duration("lagging_wal_timeout", lagging_wal_timeout))?,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
|
||||||
|
tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
|
||||||
|
}
|
||||||
|
if let Some(trace_read_requests) = request_data.trace_read_requests {
|
||||||
|
tenant_conf.trace_read_requests = Some(trace_read_requests);
|
||||||
|
}
|
||||||
|
|
||||||
|
tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
|
||||||
|
if let Some(checkpoint_timeout) = &request_data.checkpoint_timeout {
|
||||||
|
tenant_conf.checkpoint_timeout = Some(
|
||||||
|
humantime::parse_duration(checkpoint_timeout)
|
||||||
|
.with_context(bad_duration("checkpoint_timeout", checkpoint_timeout))?,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
tenant_conf.compaction_target_size = request_data.compaction_target_size;
|
||||||
|
tenant_conf.compaction_threshold = request_data.compaction_threshold;
|
||||||
|
|
||||||
|
if let Some(compaction_period) = &request_data.compaction_period {
|
||||||
|
tenant_conf.compaction_period = Some(
|
||||||
|
humantime::parse_duration(compaction_period)
|
||||||
|
.with_context(bad_duration("compaction_period", compaction_period))?,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(eviction_policy) = &request_data.eviction_policy {
|
||||||
|
tenant_conf.eviction_policy = Some(
|
||||||
|
serde::Deserialize::deserialize(eviction_policy)
|
||||||
|
.context("parse field `eviction_policy`")?,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
tenant_conf.min_resident_size_override = request_data.min_resident_size_override;
|
||||||
|
|
||||||
|
if let Some(evictions_low_residence_duration_metric_threshold) =
|
||||||
|
&request_data.evictions_low_residence_duration_metric_threshold
|
||||||
|
{
|
||||||
|
tenant_conf.evictions_low_residence_duration_metric_threshold = Some(
|
||||||
|
humantime::parse_duration(evictions_low_residence_duration_metric_threshold)
|
||||||
|
.with_context(bad_duration(
|
||||||
|
"evictions_low_residence_duration_metric_threshold",
|
||||||
|
evictions_low_residence_duration_metric_threshold,
|
||||||
|
))?,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
tenant_conf.gc_feedback = request_data.gc_feedback;
|
||||||
|
|
||||||
Ok(tenant_conf)
|
Ok(tenant_conf)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TryFrom<toml_edit::Item> for TenantConfOpt {
|
|
||||||
type Error = anyhow::Error;
|
|
||||||
|
|
||||||
fn try_from(item: toml_edit::Item) -> Result<Self, Self::Error> {
|
|
||||||
match item {
|
|
||||||
toml_edit::Item::Value(value) => {
|
|
||||||
let d = value.into_deserializer();
|
|
||||||
return serde_path_to_error::deserialize(d)
|
|
||||||
.map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
|
|
||||||
}
|
|
||||||
toml_edit::Item::Table(table) => {
|
|
||||||
let deserializer = toml_edit::de::Deserializer::new(table.into());
|
|
||||||
return serde_path_to_error::deserialize(deserializer)
|
|
||||||
.map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
|
|
||||||
}
|
|
||||||
_ => {
|
|
||||||
bail!("expected non-inline table but found {item}")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
use models::TenantConfig;
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn de_serializing_pageserver_config_omits_empty_values() {
|
fn de_serializing_pageserver_config_omits_empty_values() {
|
||||||
@@ -582,38 +609,4 @@ mod tests {
|
|||||||
assert_eq!(json_form, "{\"gc_horizon\":42}");
|
assert_eq!(json_form, "{\"gc_horizon\":42}");
|
||||||
assert_eq!(small_conf, serde_json::from_str(&json_form).unwrap());
|
assert_eq!(small_conf, serde_json::from_str(&json_form).unwrap());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_try_from_models_tenant_config_err() {
|
|
||||||
let tenant_config = models::TenantConfig {
|
|
||||||
lagging_wal_timeout: Some("5a".to_string()),
|
|
||||||
..TenantConfig::default()
|
|
||||||
};
|
|
||||||
|
|
||||||
let tenant_conf_opt = TenantConfOpt::try_from(&tenant_config);
|
|
||||||
|
|
||||||
assert!(
|
|
||||||
tenant_conf_opt.is_err(),
|
|
||||||
"Suceeded to convert TenantConfig to TenantConfOpt"
|
|
||||||
);
|
|
||||||
|
|
||||||
let expected_error_str =
|
|
||||||
"lagging_wal_timeout: invalid value: string \"5a\", expected a duration";
|
|
||||||
assert_eq!(tenant_conf_opt.unwrap_err().to_string(), expected_error_str);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_try_from_models_tenant_config_success() {
|
|
||||||
let tenant_config = models::TenantConfig {
|
|
||||||
lagging_wal_timeout: Some("5s".to_string()),
|
|
||||||
..TenantConfig::default()
|
|
||||||
};
|
|
||||||
|
|
||||||
let tenant_conf_opt = TenantConfOpt::try_from(&tenant_config).unwrap();
|
|
||||||
|
|
||||||
assert_eq!(
|
|
||||||
tenant_conf_opt.lagging_wal_timeout,
|
|
||||||
Some(Duration::from_secs(5))
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,19 +2,21 @@ use std::sync::Arc;
|
|||||||
|
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use camino::{Utf8Path, Utf8PathBuf};
|
use camino::{Utf8Path, Utf8PathBuf};
|
||||||
use pageserver_api::{models::TenantState, shard::TenantShardId};
|
use pageserver_api::models::TenantState;
|
||||||
use remote_storage::{GenericRemoteStorage, RemotePath};
|
use remote_storage::{GenericRemoteStorage, RemotePath};
|
||||||
use tokio::sync::OwnedMutexGuard;
|
use tokio::sync::OwnedMutexGuard;
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
use tracing::{error, instrument, Instrument, Span};
|
use tracing::{error, instrument, warn, Instrument, Span};
|
||||||
|
|
||||||
use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId};
|
use utils::{
|
||||||
|
backoff, completion, crashsafe, fs_ext,
|
||||||
|
id::{TenantId, TimelineId},
|
||||||
|
};
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
config::PageServerConf,
|
config::PageServerConf,
|
||||||
context::RequestContext,
|
context::RequestContext,
|
||||||
task_mgr::{self, TaskKind},
|
task_mgr::{self, TaskKind},
|
||||||
tenant::mgr::{TenantSlot, TenantsMapRemoveResult},
|
|
||||||
InitializationOrder,
|
InitializationOrder,
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -57,10 +59,10 @@ type DeletionGuard = tokio::sync::OwnedMutexGuard<DeleteTenantFlow>;
|
|||||||
|
|
||||||
fn remote_tenant_delete_mark_path(
|
fn remote_tenant_delete_mark_path(
|
||||||
conf: &PageServerConf,
|
conf: &PageServerConf,
|
||||||
tenant_shard_id: &TenantShardId,
|
tenant_id: &TenantId,
|
||||||
) -> anyhow::Result<RemotePath> {
|
) -> anyhow::Result<RemotePath> {
|
||||||
let tenant_remote_path = conf
|
let tenant_remote_path = conf
|
||||||
.tenant_path(tenant_shard_id)
|
.tenant_path(tenant_id)
|
||||||
.strip_prefix(&conf.workdir)
|
.strip_prefix(&conf.workdir)
|
||||||
.context("Failed to strip workdir prefix")
|
.context("Failed to strip workdir prefix")
|
||||||
.and_then(RemotePath::new)
|
.and_then(RemotePath::new)
|
||||||
@@ -71,9 +73,9 @@ fn remote_tenant_delete_mark_path(
|
|||||||
async fn create_remote_delete_mark(
|
async fn create_remote_delete_mark(
|
||||||
conf: &PageServerConf,
|
conf: &PageServerConf,
|
||||||
remote_storage: &GenericRemoteStorage,
|
remote_storage: &GenericRemoteStorage,
|
||||||
tenant_shard_id: &TenantShardId,
|
tenant_id: &TenantId,
|
||||||
) -> Result<(), DeleteTenantError> {
|
) -> Result<(), DeleteTenantError> {
|
||||||
let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
|
let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id)?;
|
||||||
|
|
||||||
let data: &[u8] = &[];
|
let data: &[u8] = &[];
|
||||||
backoff::retry(
|
backoff::retry(
|
||||||
@@ -97,9 +99,9 @@ async fn create_remote_delete_mark(
|
|||||||
|
|
||||||
async fn create_local_delete_mark(
|
async fn create_local_delete_mark(
|
||||||
conf: &PageServerConf,
|
conf: &PageServerConf,
|
||||||
tenant_shard_id: &TenantShardId,
|
tenant_id: &TenantId,
|
||||||
) -> Result<(), DeleteTenantError> {
|
) -> Result<(), DeleteTenantError> {
|
||||||
let marker_path = conf.tenant_deleted_mark_file_path(tenant_shard_id);
|
let marker_path = conf.tenant_deleted_mark_file_path(tenant_id);
|
||||||
|
|
||||||
// Note: we're ok to replace existing file.
|
// Note: we're ok to replace existing file.
|
||||||
let _ = std::fs::OpenOptions::new()
|
let _ = std::fs::OpenOptions::new()
|
||||||
@@ -168,10 +170,10 @@ async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), Del
|
|||||||
async fn remove_tenant_remote_delete_mark(
|
async fn remove_tenant_remote_delete_mark(
|
||||||
conf: &PageServerConf,
|
conf: &PageServerConf,
|
||||||
remote_storage: Option<&GenericRemoteStorage>,
|
remote_storage: Option<&GenericRemoteStorage>,
|
||||||
tenant_shard_id: &TenantShardId,
|
tenant_id: &TenantId,
|
||||||
) -> Result<(), DeleteTenantError> {
|
) -> Result<(), DeleteTenantError> {
|
||||||
if let Some(remote_storage) = remote_storage {
|
if let Some(remote_storage) = remote_storage {
|
||||||
let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
|
let path = remote_tenant_delete_mark_path(conf, tenant_id)?;
|
||||||
backoff::retry(
|
backoff::retry(
|
||||||
|| async { remote_storage.delete(&path).await },
|
|| async { remote_storage.delete(&path).await },
|
||||||
|_e| false,
|
|_e| false,
|
||||||
@@ -190,7 +192,7 @@ async fn remove_tenant_remote_delete_mark(
|
|||||||
// Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir
|
// Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir
|
||||||
async fn cleanup_remaining_fs_traces(
|
async fn cleanup_remaining_fs_traces(
|
||||||
conf: &PageServerConf,
|
conf: &PageServerConf,
|
||||||
tenant_shard_id: &TenantShardId,
|
tenant_id: &TenantId,
|
||||||
) -> Result<(), DeleteTenantError> {
|
) -> Result<(), DeleteTenantError> {
|
||||||
let rm = |p: Utf8PathBuf, is_dir: bool| async move {
|
let rm = |p: Utf8PathBuf, is_dir: bool| async move {
|
||||||
if is_dir {
|
if is_dir {
|
||||||
@@ -202,8 +204,8 @@ async fn cleanup_remaining_fs_traces(
|
|||||||
.with_context(|| format!("failed to delete {p}"))
|
.with_context(|| format!("failed to delete {p}"))
|
||||||
};
|
};
|
||||||
|
|
||||||
rm(conf.tenant_config_path(tenant_shard_id), false).await?;
|
rm(conf.tenant_config_path(tenant_id), false).await?;
|
||||||
rm(conf.tenant_location_config_path(tenant_shard_id), false).await?;
|
rm(conf.tenant_location_config_path(tenant_id), false).await?;
|
||||||
|
|
||||||
fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
|
fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
|
||||||
Err(anyhow::anyhow!(
|
Err(anyhow::anyhow!(
|
||||||
@@ -211,7 +213,7 @@ async fn cleanup_remaining_fs_traces(
|
|||||||
))?
|
))?
|
||||||
});
|
});
|
||||||
|
|
||||||
rm(conf.timelines_path(tenant_shard_id), true).await?;
|
rm(conf.timelines_path(tenant_id), true).await?;
|
||||||
|
|
||||||
fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
|
fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
|
||||||
Err(anyhow::anyhow!(
|
Err(anyhow::anyhow!(
|
||||||
@@ -225,14 +227,14 @@ async fn cleanup_remaining_fs_traces(
|
|||||||
// to be reordered later and thus missed if a crash occurs.
|
// to be reordered later and thus missed if a crash occurs.
|
||||||
// Note that we dont need to sync after mark file is removed
|
// Note that we dont need to sync after mark file is removed
|
||||||
// because we can tolerate the case when mark file reappears on startup.
|
// because we can tolerate the case when mark file reappears on startup.
|
||||||
let tenant_path = &conf.tenant_path(tenant_shard_id);
|
let tenant_path = &conf.tenant_path(tenant_id);
|
||||||
if tenant_path.exists() {
|
if tenant_path.exists() {
|
||||||
crashsafe::fsync_async(&conf.tenant_path(tenant_shard_id))
|
crashsafe::fsync_async(&conf.tenant_path(tenant_id))
|
||||||
.await
|
.await
|
||||||
.context("fsync_pre_mark_remove")?;
|
.context("fsync_pre_mark_remove")?;
|
||||||
}
|
}
|
||||||
|
|
||||||
rm(conf.tenant_deleted_mark_file_path(tenant_shard_id), false).await?;
|
rm(conf.tenant_deleted_mark_file_path(tenant_id), false).await?;
|
||||||
|
|
||||||
fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
|
fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
|
||||||
Err(anyhow::anyhow!(
|
Err(anyhow::anyhow!(
|
||||||
@@ -240,7 +242,7 @@ async fn cleanup_remaining_fs_traces(
|
|||||||
))?
|
))?
|
||||||
});
|
});
|
||||||
|
|
||||||
rm(conf.tenant_path(tenant_shard_id), true).await?;
|
rm(conf.tenant_path(tenant_id), true).await?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -285,8 +287,6 @@ impl DeleteTenantFlow {
|
|||||||
) -> Result<(), DeleteTenantError> {
|
) -> Result<(), DeleteTenantError> {
|
||||||
span::debug_assert_current_span_has_tenant_id();
|
span::debug_assert_current_span_has_tenant_id();
|
||||||
|
|
||||||
pausable_failpoint!("tenant-delete-before-run");
|
|
||||||
|
|
||||||
let mut guard = Self::prepare(&tenant).await?;
|
let mut guard = Self::prepare(&tenant).await?;
|
||||||
|
|
||||||
if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
|
if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
|
||||||
@@ -321,7 +321,7 @@ impl DeleteTenantFlow {
|
|||||||
// Though sounds scary, different mark name?
|
// Though sounds scary, different mark name?
|
||||||
// Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
|
// Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
|
||||||
if let Some(remote_storage) = &remote_storage {
|
if let Some(remote_storage) = &remote_storage {
|
||||||
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id)
|
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_id)
|
||||||
.await
|
.await
|
||||||
.context("remote_mark")?
|
.context("remote_mark")?
|
||||||
}
|
}
|
||||||
@@ -332,7 +332,7 @@ impl DeleteTenantFlow {
|
|||||||
))?
|
))?
|
||||||
});
|
});
|
||||||
|
|
||||||
create_local_delete_mark(conf, &tenant.tenant_shard_id)
|
create_local_delete_mark(conf, &tenant.tenant_id)
|
||||||
.await
|
.await
|
||||||
.context("local delete mark")?;
|
.context("local delete mark")?;
|
||||||
|
|
||||||
@@ -374,11 +374,9 @@ impl DeleteTenantFlow {
|
|||||||
return Ok(acquire(tenant));
|
return Ok(acquire(tenant));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let tenant_id = tenant.tenant_id;
|
||||||
// Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
|
// Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
|
||||||
if conf
|
if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
|
||||||
.tenant_deleted_mark_file_path(&tenant.tenant_shard_id)
|
|
||||||
.exists()
|
|
||||||
{
|
|
||||||
Ok(acquire(tenant))
|
Ok(acquire(tenant))
|
||||||
} else {
|
} else {
|
||||||
Ok(None)
|
Ok(None)
|
||||||
@@ -461,12 +459,12 @@ impl DeleteTenantFlow {
|
|||||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||||
tenant: Arc<Tenant>,
|
tenant: Arc<Tenant>,
|
||||||
) {
|
) {
|
||||||
let tenant_shard_id = tenant.tenant_shard_id;
|
let tenant_id = tenant.tenant_id;
|
||||||
|
|
||||||
task_mgr::spawn(
|
task_mgr::spawn(
|
||||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||||
TaskKind::TimelineDeletionWorker,
|
TaskKind::TimelineDeletionWorker,
|
||||||
Some(tenant_shard_id.tenant_id),
|
Some(tenant_id),
|
||||||
None,
|
None,
|
||||||
"tenant_delete",
|
"tenant_delete",
|
||||||
false,
|
false,
|
||||||
@@ -480,7 +478,7 @@ impl DeleteTenantFlow {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
.instrument({
|
.instrument({
|
||||||
let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug());
|
let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_id);
|
||||||
span.follows_from(Span::current());
|
span.follows_from(Span::current());
|
||||||
span
|
span
|
||||||
}),
|
}),
|
||||||
@@ -518,7 +516,7 @@ impl DeleteTenantFlow {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let timelines_path = conf.timelines_path(&tenant.tenant_shard_id);
|
let timelines_path = conf.timelines_path(&tenant.tenant_id);
|
||||||
// May not exist if we fail in cleanup_remaining_fs_traces after removing it
|
// May not exist if we fail in cleanup_remaining_fs_traces after removing it
|
||||||
if timelines_path.exists() {
|
if timelines_path.exists() {
|
||||||
// sanity check to guard against layout changes
|
// sanity check to guard against layout changes
|
||||||
@@ -527,8 +525,7 @@ impl DeleteTenantFlow {
|
|||||||
.context("timelines dir not empty")?;
|
.context("timelines dir not empty")?;
|
||||||
}
|
}
|
||||||
|
|
||||||
remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_shard_id)
|
remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_id).await?;
|
||||||
.await?;
|
|
||||||
|
|
||||||
fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
|
fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
|
||||||
Err(anyhow::anyhow!(
|
Err(anyhow::anyhow!(
|
||||||
@@ -536,73 +533,21 @@ impl DeleteTenantFlow {
|
|||||||
))?
|
))?
|
||||||
});
|
});
|
||||||
|
|
||||||
cleanup_remaining_fs_traces(conf, &tenant.tenant_shard_id)
|
cleanup_remaining_fs_traces(conf, &tenant.tenant_id)
|
||||||
.await
|
.await
|
||||||
.context("cleanup_remaining_fs_traces")?;
|
.context("cleanup_remaining_fs_traces")?;
|
||||||
|
|
||||||
{
|
{
|
||||||
pausable_failpoint!("tenant-delete-before-map-remove");
|
let mut locked = tenants.write().unwrap();
|
||||||
|
if locked.remove(&tenant.tenant_id).is_none() {
|
||||||
|
warn!("Tenant got removed from tenants map during deletion");
|
||||||
|
};
|
||||||
|
|
||||||
// This block is simply removing the TenantSlot for this tenant. It requires a loop because
|
// FIXME: we should not be modifying this from outside of mgr.rs.
|
||||||
// we might conflict with a TenantSlot::InProgress marker and need to wait for it.
|
// This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
|
||||||
//
|
crate::metrics::TENANT_MANAGER
|
||||||
// This complexity will go away when we simplify how deletion works:
|
.tenant_slots
|
||||||
// https://github.com/neondatabase/neon/issues/5080
|
.set(locked.len() as u64);
|
||||||
loop {
|
|
||||||
// Under the TenantMap lock, try to remove the tenant. We usually succeed, but if
|
|
||||||
// we encounter an InProgress marker, yield the barrier it contains and wait on it.
|
|
||||||
let barrier = {
|
|
||||||
let mut locked = tenants.write().unwrap();
|
|
||||||
let removed = locked.remove(&tenant.tenant_shard_id.tenant_id);
|
|
||||||
|
|
||||||
// FIXME: we should not be modifying this from outside of mgr.rs.
|
|
||||||
// This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
|
|
||||||
crate::metrics::TENANT_MANAGER
|
|
||||||
.tenant_slots
|
|
||||||
.set(locked.len() as u64);
|
|
||||||
|
|
||||||
match removed {
|
|
||||||
TenantsMapRemoveResult::Occupied(TenantSlot::Attached(tenant)) => {
|
|
||||||
match tenant.current_state() {
|
|
||||||
TenantState::Stopping { .. } | TenantState::Broken { .. } => {
|
|
||||||
// Expected: we put the tenant into stopping state before we start deleting it
|
|
||||||
}
|
|
||||||
state => {
|
|
||||||
// Unexpected state
|
|
||||||
tracing::warn!(
|
|
||||||
"Tenant in unexpected state {state} after deletion"
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
TenantsMapRemoveResult::Occupied(TenantSlot::Secondary) => {
|
|
||||||
// This is unexpected: this secondary tenants should not have been created, and we
|
|
||||||
// are not in a position to shut it down from here.
|
|
||||||
tracing::warn!("Tenant transitioned to secondary mode while deleting!");
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
TenantsMapRemoveResult::Occupied(TenantSlot::InProgress(_)) => {
|
|
||||||
unreachable!("TenantsMap::remove handles InProgress separately, should never return it here");
|
|
||||||
}
|
|
||||||
TenantsMapRemoveResult::Vacant => {
|
|
||||||
tracing::warn!(
|
|
||||||
"Tenant removed from TenantsMap before deletion completed"
|
|
||||||
);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
TenantsMapRemoveResult::InProgress(barrier) => {
|
|
||||||
// An InProgress entry was found, we must wait on its barrier
|
|
||||||
barrier
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
tracing::info!(
|
|
||||||
"Waiting for competing operation to complete before deleting state for tenant"
|
|
||||||
);
|
|
||||||
barrier.wait().await;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
*guard = Self::Finished;
|
*guard = Self::Finished;
|
||||||
|
|||||||
@@ -7,19 +7,18 @@ use crate::page_cache::{self, PAGE_SZ};
|
|||||||
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
|
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
|
||||||
use crate::virtual_file::VirtualFile;
|
use crate::virtual_file::VirtualFile;
|
||||||
use camino::Utf8PathBuf;
|
use camino::Utf8PathBuf;
|
||||||
use pageserver_api::shard::TenantShardId;
|
|
||||||
use std::cmp::min;
|
use std::cmp::min;
|
||||||
use std::fs::OpenOptions;
|
use std::fs::OpenOptions;
|
||||||
use std::io::{self, ErrorKind};
|
use std::io::{self, ErrorKind};
|
||||||
use std::ops::DerefMut;
|
use std::ops::DerefMut;
|
||||||
use std::sync::atomic::AtomicU64;
|
use std::sync::atomic::AtomicU64;
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
use utils::id::TimelineId;
|
use utils::id::{TenantId, TimelineId};
|
||||||
|
|
||||||
pub struct EphemeralFile {
|
pub struct EphemeralFile {
|
||||||
page_cache_file_id: page_cache::FileId,
|
page_cache_file_id: page_cache::FileId,
|
||||||
|
|
||||||
_tenant_shard_id: TenantShardId,
|
_tenant_id: TenantId,
|
||||||
_timeline_id: TimelineId,
|
_timeline_id: TimelineId,
|
||||||
file: VirtualFile,
|
file: VirtualFile,
|
||||||
len: u64,
|
len: u64,
|
||||||
@@ -32,7 +31,7 @@ pub struct EphemeralFile {
|
|||||||
impl EphemeralFile {
|
impl EphemeralFile {
|
||||||
pub async fn create(
|
pub async fn create(
|
||||||
conf: &PageServerConf,
|
conf: &PageServerConf,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_id: TenantId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
) -> Result<EphemeralFile, io::Error> {
|
) -> Result<EphemeralFile, io::Error> {
|
||||||
static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
|
static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
|
||||||
@@ -40,7 +39,7 @@ impl EphemeralFile {
|
|||||||
NEXT_FILENAME.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
NEXT_FILENAME.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
||||||
|
|
||||||
let filename = conf
|
let filename = conf
|
||||||
.timeline_path(&tenant_shard_id, &timeline_id)
|
.timeline_path(&tenant_id, &timeline_id)
|
||||||
.join(Utf8PathBuf::from(format!(
|
.join(Utf8PathBuf::from(format!(
|
||||||
"ephemeral-{filename_disambiguator}"
|
"ephemeral-{filename_disambiguator}"
|
||||||
)));
|
)));
|
||||||
@@ -53,7 +52,7 @@ impl EphemeralFile {
|
|||||||
|
|
||||||
Ok(EphemeralFile {
|
Ok(EphemeralFile {
|
||||||
page_cache_file_id: page_cache::next_file_id(),
|
page_cache_file_id: page_cache::next_file_id(),
|
||||||
_tenant_shard_id: tenant_shard_id,
|
_tenant_id: tenant_id,
|
||||||
_timeline_id: timeline_id,
|
_timeline_id: timeline_id,
|
||||||
file,
|
file,
|
||||||
len: 0,
|
len: 0,
|
||||||
@@ -283,7 +282,7 @@ mod tests {
|
|||||||
) -> Result<
|
) -> Result<
|
||||||
(
|
(
|
||||||
&'static PageServerConf,
|
&'static PageServerConf,
|
||||||
TenantShardId,
|
TenantId,
|
||||||
TimelineId,
|
TimelineId,
|
||||||
RequestContext,
|
RequestContext,
|
||||||
),
|
),
|
||||||
@@ -296,13 +295,13 @@ mod tests {
|
|||||||
// OK in a test.
|
// OK in a test.
|
||||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||||
|
|
||||||
let tenant_shard_id = TenantShardId::from_str("11000000000000000000000000000000").unwrap();
|
let tenant_id = TenantId::from_str("11000000000000000000000000000000").unwrap();
|
||||||
let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap();
|
let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap();
|
||||||
fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id))?;
|
fs::create_dir_all(conf.timeline_path(&tenant_id, &timeline_id))?;
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||||
|
|
||||||
Ok((conf, tenant_shard_id, timeline_id, ctx))
|
Ok((conf, tenant_id, timeline_id, ctx))
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
|
|||||||
@@ -11,12 +11,15 @@
|
|||||||
use std::io::{self};
|
use std::io::{self};
|
||||||
|
|
||||||
use anyhow::{ensure, Context};
|
use anyhow::{ensure, Context};
|
||||||
use pageserver_api::shard::TenantShardId;
|
|
||||||
use serde::{de::Error, Deserialize, Serialize, Serializer};
|
use serde::{de::Error, Deserialize, Serialize, Serializer};
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
use utils::bin_ser::SerializeError;
|
use utils::bin_ser::SerializeError;
|
||||||
use utils::crashsafe::path_with_suffix_extension;
|
use utils::crashsafe::path_with_suffix_extension;
|
||||||
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};
|
use utils::{
|
||||||
|
bin_ser::BeSer,
|
||||||
|
id::{TenantId, TimelineId},
|
||||||
|
lsn::Lsn,
|
||||||
|
};
|
||||||
|
|
||||||
use crate::config::PageServerConf;
|
use crate::config::PageServerConf;
|
||||||
use crate::virtual_file::VirtualFile;
|
use crate::virtual_file::VirtualFile;
|
||||||
@@ -269,14 +272,14 @@ impl Serialize for TimelineMetadata {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Save timeline metadata to file
|
/// Save timeline metadata to file
|
||||||
#[tracing::instrument(skip_all, fields(%tenant_id=tenant_shard_id.tenant_id, %shard_id=tenant_shard_id.shard_slug(), %timeline_id))]
|
#[tracing::instrument(skip_all, fields(%tenant_id, %timeline_id))]
|
||||||
pub async fn save_metadata(
|
pub async fn save_metadata(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
tenant_shard_id: &TenantShardId,
|
tenant_id: &TenantId,
|
||||||
timeline_id: &TimelineId,
|
timeline_id: &TimelineId,
|
||||||
data: &TimelineMetadata,
|
data: &TimelineMetadata,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let path = conf.metadata_path(tenant_shard_id, timeline_id);
|
let path = conf.metadata_path(tenant_id, timeline_id);
|
||||||
let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
|
let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
|
||||||
let metadata_bytes = data.to_bytes().context("serialize metadata")?;
|
let metadata_bytes = data.to_bytes().context("serialize metadata")?;
|
||||||
VirtualFile::crashsafe_overwrite(&path, &temp_path, &metadata_bytes)
|
VirtualFile::crashsafe_overwrite(&path, &temp_path, &metadata_bytes)
|
||||||
@@ -296,10 +299,10 @@ pub enum LoadMetadataError {
|
|||||||
|
|
||||||
pub fn load_metadata(
|
pub fn load_metadata(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
tenant_shard_id: &TenantShardId,
|
tenant_id: &TenantId,
|
||||||
timeline_id: &TimelineId,
|
timeline_id: &TimelineId,
|
||||||
) -> Result<TimelineMetadata, LoadMetadataError> {
|
) -> Result<TimelineMetadata, LoadMetadataError> {
|
||||||
let metadata_path = conf.metadata_path(tenant_shard_id, timeline_id);
|
let metadata_path = conf.metadata_path(tenant_id, timeline_id);
|
||||||
let metadata_bytes = std::fs::read(metadata_path)?;
|
let metadata_bytes = std::fs::read(metadata_path)?;
|
||||||
|
|
||||||
Ok(TimelineMetadata::from_bytes(&metadata_bytes)?)
|
Ok(TimelineMetadata::from_bytes(&metadata_bytes)?)
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -188,11 +188,8 @@ use anyhow::Context;
|
|||||||
use camino::Utf8Path;
|
use camino::Utf8Path;
|
||||||
use chrono::{NaiveDateTime, Utc};
|
use chrono::{NaiveDateTime, Utc};
|
||||||
|
|
||||||
pub(crate) use download::download_initdb_tar_zst;
|
|
||||||
use pageserver_api::shard::{ShardIndex, TenantShardId};
|
|
||||||
use scopeguard::ScopeGuard;
|
use scopeguard::ScopeGuard;
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
pub(crate) use upload::upload_initdb_dir;
|
|
||||||
use utils::backoff::{
|
use utils::backoff::{
|
||||||
self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
|
self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
|
||||||
};
|
};
|
||||||
@@ -252,8 +249,6 @@ pub(crate) const FAILED_REMOTE_OP_RETRIES: u32 = 10;
|
|||||||
// retries. Uploads and deletions are retried forever, though.
|
// retries. Uploads and deletions are retried forever, though.
|
||||||
pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
|
pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
|
||||||
|
|
||||||
pub(crate) const INITDB_PATH: &str = "initdb.tar.zst";
|
|
||||||
|
|
||||||
pub enum MaybeDeletedIndexPart {
|
pub enum MaybeDeletedIndexPart {
|
||||||
IndexPart(IndexPart),
|
IndexPart(IndexPart),
|
||||||
Deleted(IndexPart),
|
Deleted(IndexPart),
|
||||||
@@ -302,7 +297,7 @@ pub struct RemoteTimelineClient {
|
|||||||
|
|
||||||
runtime: tokio::runtime::Handle,
|
runtime: tokio::runtime::Handle,
|
||||||
|
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_id: TenantId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
generation: Generation,
|
generation: Generation,
|
||||||
|
|
||||||
@@ -326,7 +321,7 @@ impl RemoteTimelineClient {
|
|||||||
remote_storage: GenericRemoteStorage,
|
remote_storage: GenericRemoteStorage,
|
||||||
deletion_queue_client: DeletionQueueClient,
|
deletion_queue_client: DeletionQueueClient,
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_id: TenantId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
generation: Generation,
|
generation: Generation,
|
||||||
) -> RemoteTimelineClient {
|
) -> RemoteTimelineClient {
|
||||||
@@ -338,16 +333,13 @@ impl RemoteTimelineClient {
|
|||||||
} else {
|
} else {
|
||||||
BACKGROUND_RUNTIME.handle().clone()
|
BACKGROUND_RUNTIME.handle().clone()
|
||||||
},
|
},
|
||||||
tenant_shard_id,
|
tenant_id,
|
||||||
timeline_id,
|
timeline_id,
|
||||||
generation,
|
generation,
|
||||||
storage_impl: remote_storage,
|
storage_impl: remote_storage,
|
||||||
deletion_queue_client,
|
deletion_queue_client,
|
||||||
upload_queue: Mutex::new(UploadQueue::Uninitialized),
|
upload_queue: Mutex::new(UploadQueue::Uninitialized),
|
||||||
metrics: Arc::new(RemoteTimelineClientMetrics::new(
|
metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
|
||||||
&tenant_shard_id,
|
|
||||||
&timeline_id,
|
|
||||||
)),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -468,13 +460,13 @@ impl RemoteTimelineClient {
|
|||||||
|
|
||||||
let index_part = download::download_index_part(
|
let index_part = download::download_index_part(
|
||||||
&self.storage_impl,
|
&self.storage_impl,
|
||||||
&self.tenant_shard_id,
|
&self.tenant_id,
|
||||||
&self.timeline_id,
|
&self.timeline_id,
|
||||||
self.generation,
|
self.generation,
|
||||||
cancel,
|
cancel,
|
||||||
)
|
)
|
||||||
.measure_remote_op(
|
.measure_remote_op(
|
||||||
self.tenant_shard_id.tenant_id,
|
self.tenant_id,
|
||||||
self.timeline_id,
|
self.timeline_id,
|
||||||
RemoteOpFileKind::Index,
|
RemoteOpFileKind::Index,
|
||||||
RemoteOpKind::Download,
|
RemoteOpKind::Download,
|
||||||
@@ -510,13 +502,13 @@ impl RemoteTimelineClient {
|
|||||||
download::download_layer_file(
|
download::download_layer_file(
|
||||||
self.conf,
|
self.conf,
|
||||||
&self.storage_impl,
|
&self.storage_impl,
|
||||||
self.tenant_shard_id,
|
self.tenant_id,
|
||||||
self.timeline_id,
|
self.timeline_id,
|
||||||
layer_file_name,
|
layer_file_name,
|
||||||
layer_metadata,
|
layer_metadata,
|
||||||
)
|
)
|
||||||
.measure_remote_op(
|
.measure_remote_op(
|
||||||
self.tenant_shard_id.tenant_id,
|
self.tenant_id,
|
||||||
self.timeline_id,
|
self.timeline_id,
|
||||||
RemoteOpFileKind::Layer,
|
RemoteOpFileKind::Layer,
|
||||||
RemoteOpKind::Download,
|
RemoteOpKind::Download,
|
||||||
@@ -662,10 +654,10 @@ impl RemoteTimelineClient {
|
|||||||
let mut guard = self.upload_queue.lock().unwrap();
|
let mut guard = self.upload_queue.lock().unwrap();
|
||||||
let upload_queue = guard.initialized_mut()?;
|
let upload_queue = guard.initialized_mut()?;
|
||||||
|
|
||||||
let with_metadata =
|
let with_generations =
|
||||||
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned());
|
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned());
|
||||||
|
|
||||||
self.schedule_deletion_of_unlinked0(upload_queue, with_metadata);
|
self.schedule_deletion_of_unlinked0(upload_queue, with_generations);
|
||||||
|
|
||||||
// Launch the tasks immediately, if possible
|
// Launch the tasks immediately, if possible
|
||||||
self.launch_queued_tasks(upload_queue);
|
self.launch_queued_tasks(upload_queue);
|
||||||
@@ -700,7 +692,7 @@ impl RemoteTimelineClient {
|
|||||||
self: &Arc<Self>,
|
self: &Arc<Self>,
|
||||||
upload_queue: &mut UploadQueueInitialized,
|
upload_queue: &mut UploadQueueInitialized,
|
||||||
names: I,
|
names: I,
|
||||||
) -> Vec<(LayerFileName, LayerFileMetadata)>
|
) -> Vec<(LayerFileName, Generation)>
|
||||||
where
|
where
|
||||||
I: IntoIterator<Item = LayerFileName>,
|
I: IntoIterator<Item = LayerFileName>,
|
||||||
{
|
{
|
||||||
@@ -708,17 +700,16 @@ impl RemoteTimelineClient {
|
|||||||
// so we don't need update it. Just serialize it.
|
// so we don't need update it. Just serialize it.
|
||||||
let metadata = upload_queue.latest_metadata.clone();
|
let metadata = upload_queue.latest_metadata.clone();
|
||||||
|
|
||||||
// Decorate our list of names with each name's metadata, dropping
|
// Decorate our list of names with each name's generation, dropping
|
||||||
// names that are unexpectedly missing from our metadata. This metadata
|
// names that are unexpectedly missing from our metadata.
|
||||||
// is later used when physically deleting layers, to construct key paths.
|
let with_generations: Vec<_> = names
|
||||||
let with_metadata: Vec<_> = names
|
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.filter_map(|name| {
|
.filter_map(|name| {
|
||||||
let meta = upload_queue.latest_files.remove(&name);
|
let meta = upload_queue.latest_files.remove(&name);
|
||||||
|
|
||||||
if let Some(meta) = meta {
|
if let Some(meta) = meta {
|
||||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
|
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
|
||||||
Some((name, meta))
|
Some((name, meta.generation))
|
||||||
} else {
|
} else {
|
||||||
// This can only happen if we forgot to to schedule the file upload
|
// This can only happen if we forgot to to schedule the file upload
|
||||||
// before scheduling the delete. Log it because it is a rare/strange
|
// before scheduling the delete. Log it because it is a rare/strange
|
||||||
@@ -731,10 +722,9 @@ impl RemoteTimelineClient {
|
|||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
#[cfg(feature = "testing")]
|
#[cfg(feature = "testing")]
|
||||||
for (name, metadata) in &with_metadata {
|
for (name, gen) in &with_generations {
|
||||||
let gen = metadata.generation;
|
if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), *gen) {
|
||||||
if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), gen) {
|
if &unexpected == gen {
|
||||||
if unexpected == gen {
|
|
||||||
tracing::error!("{name} was unlinked twice with same generation");
|
tracing::error!("{name} was unlinked twice with same generation");
|
||||||
} else {
|
} else {
|
||||||
tracing::error!("{name} was unlinked twice with different generations {gen:?} and {unexpected:?}");
|
tracing::error!("{name} was unlinked twice with different generations {gen:?} and {unexpected:?}");
|
||||||
@@ -749,14 +739,14 @@ impl RemoteTimelineClient {
|
|||||||
self.schedule_index_upload(upload_queue, metadata);
|
self.schedule_index_upload(upload_queue, metadata);
|
||||||
}
|
}
|
||||||
|
|
||||||
with_metadata
|
with_generations
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Schedules deletion for layer files which have previously been unlinked from the
|
/// Schedules deletion for layer files which have previously been unlinked from the
|
||||||
/// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`].
|
/// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`].
|
||||||
pub(crate) fn schedule_deletion_of_unlinked(
|
pub(crate) fn schedule_deletion_of_unlinked(
|
||||||
self: &Arc<Self>,
|
self: &Arc<Self>,
|
||||||
layers: Vec<(LayerFileName, LayerFileMetadata)>,
|
layers: Vec<(LayerFileName, Generation)>,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let mut guard = self.upload_queue.lock().unwrap();
|
let mut guard = self.upload_queue.lock().unwrap();
|
||||||
let upload_queue = guard.initialized_mut()?;
|
let upload_queue = guard.initialized_mut()?;
|
||||||
@@ -769,22 +759,16 @@ impl RemoteTimelineClient {
|
|||||||
fn schedule_deletion_of_unlinked0(
|
fn schedule_deletion_of_unlinked0(
|
||||||
self: &Arc<Self>,
|
self: &Arc<Self>,
|
||||||
upload_queue: &mut UploadQueueInitialized,
|
upload_queue: &mut UploadQueueInitialized,
|
||||||
with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
|
with_generations: Vec<(LayerFileName, Generation)>,
|
||||||
) {
|
) {
|
||||||
for (name, meta) in &with_metadata {
|
for (name, gen) in &with_generations {
|
||||||
info!(
|
info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
|
||||||
"scheduling deletion of layer {}{} (shard {})",
|
|
||||||
name,
|
|
||||||
meta.generation.get_suffix(),
|
|
||||||
meta.shard
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(feature = "testing")]
|
#[cfg(feature = "testing")]
|
||||||
for (name, meta) in &with_metadata {
|
for (name, gen) in &with_generations {
|
||||||
let gen = meta.generation;
|
|
||||||
match upload_queue.dangling_files.remove(name) {
|
match upload_queue.dangling_files.remove(name) {
|
||||||
Some(same) if same == gen => { /* expected */ }
|
Some(same) if &same == gen => { /* expected */ }
|
||||||
Some(other) => {
|
Some(other) => {
|
||||||
tracing::error!("{name} was unlinked with {other:?} but deleted with {gen:?}");
|
tracing::error!("{name} was unlinked with {other:?} but deleted with {gen:?}");
|
||||||
}
|
}
|
||||||
@@ -796,7 +780,7 @@ impl RemoteTimelineClient {
|
|||||||
|
|
||||||
// schedule the actual deletions
|
// schedule the actual deletions
|
||||||
let op = UploadOp::Delete(Delete {
|
let op = UploadOp::Delete(Delete {
|
||||||
layers: with_metadata,
|
layers: with_generations,
|
||||||
});
|
});
|
||||||
self.calls_unfinished_metric_begin(&op);
|
self.calls_unfinished_metric_begin(&op);
|
||||||
upload_queue.queued_operations.push_back(op);
|
upload_queue.queued_operations.push_back(op);
|
||||||
@@ -825,29 +809,23 @@ impl RemoteTimelineClient {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
///
|
||||||
/// Wait for all previously scheduled uploads/deletions to complete
|
/// Wait for all previously scheduled uploads/deletions to complete
|
||||||
pub(crate) async fn wait_completion(self: &Arc<Self>) -> anyhow::Result<()> {
|
///
|
||||||
|
pub async fn wait_completion(self: &Arc<Self>) -> anyhow::Result<()> {
|
||||||
let mut receiver = {
|
let mut receiver = {
|
||||||
let mut guard = self.upload_queue.lock().unwrap();
|
let mut guard = self.upload_queue.lock().unwrap();
|
||||||
let upload_queue = guard.initialized_mut()?;
|
let upload_queue = guard.initialized_mut()?;
|
||||||
self.schedule_barrier0(upload_queue)
|
self.schedule_barrier(upload_queue)
|
||||||
};
|
};
|
||||||
|
|
||||||
if receiver.changed().await.is_err() {
|
if receiver.changed().await.is_err() {
|
||||||
anyhow::bail!("wait_completion aborted because upload queue was stopped");
|
anyhow::bail!("wait_completion aborted because upload queue was stopped");
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn schedule_barrier(self: &Arc<Self>) -> anyhow::Result<()> {
|
fn schedule_barrier(
|
||||||
let mut guard = self.upload_queue.lock().unwrap();
|
|
||||||
let upload_queue = guard.initialized_mut()?;
|
|
||||||
self.schedule_barrier0(upload_queue);
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn schedule_barrier0(
|
|
||||||
self: &Arc<Self>,
|
self: &Arc<Self>,
|
||||||
upload_queue: &mut UploadQueueInitialized,
|
upload_queue: &mut UploadQueueInitialized,
|
||||||
) -> tokio::sync::watch::Receiver<()> {
|
) -> tokio::sync::watch::Receiver<()> {
|
||||||
@@ -863,56 +841,6 @@ impl RemoteTimelineClient {
|
|||||||
receiver
|
receiver
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Wait for all previously scheduled operations to complete, and then stop.
|
|
||||||
///
|
|
||||||
/// Not cancellation safe
|
|
||||||
pub(crate) async fn shutdown(self: &Arc<Self>) -> Result<(), StopError> {
|
|
||||||
// On cancellation the queue is left in ackward state of refusing new operations but
|
|
||||||
// proper stop is yet to be called. On cancel the original or some later task must call
|
|
||||||
// `stop` or `shutdown`.
|
|
||||||
let sg = scopeguard::guard((), |_| {
|
|
||||||
tracing::error!("RemoteTimelineClient::shutdown was cancelled; this should not happen, do not make this into an allowed_error")
|
|
||||||
});
|
|
||||||
|
|
||||||
let fut = {
|
|
||||||
let mut guard = self.upload_queue.lock().unwrap();
|
|
||||||
let upload_queue = match &mut *guard {
|
|
||||||
UploadQueue::Stopped(_) => return Ok(()),
|
|
||||||
UploadQueue::Uninitialized => return Err(StopError::QueueUninitialized),
|
|
||||||
UploadQueue::Initialized(ref mut init) => init,
|
|
||||||
};
|
|
||||||
|
|
||||||
// if the queue is already stuck due to a shutdown operation which was cancelled, then
|
|
||||||
// just don't add more of these as they would never complete.
|
|
||||||
//
|
|
||||||
// TODO: if launch_queued_tasks were to be refactored to accept a &mut UploadQueue
|
|
||||||
// in every place we would not have to jump through this hoop, and this method could be
|
|
||||||
// made cancellable.
|
|
||||||
if !upload_queue.shutting_down {
|
|
||||||
upload_queue.shutting_down = true;
|
|
||||||
upload_queue.queued_operations.push_back(UploadOp::Shutdown);
|
|
||||||
// this operation is not counted similar to Barrier
|
|
||||||
|
|
||||||
self.launch_queued_tasks(upload_queue);
|
|
||||||
}
|
|
||||||
|
|
||||||
upload_queue.shutdown_ready.clone().acquire_owned()
|
|
||||||
};
|
|
||||||
|
|
||||||
let res = fut.await;
|
|
||||||
|
|
||||||
scopeguard::ScopeGuard::into_inner(sg);
|
|
||||||
|
|
||||||
match res {
|
|
||||||
Ok(_permit) => unreachable!("shutdown_ready should not have been added permits"),
|
|
||||||
Err(_closed) => {
|
|
||||||
// expected
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
self.stop()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Set the deleted_at field in the remote index file.
|
/// Set the deleted_at field in the remote index file.
|
||||||
///
|
///
|
||||||
/// This fails if the upload queue has not been `stop()`ed.
|
/// This fails if the upload queue has not been `stop()`ed.
|
||||||
@@ -964,7 +892,7 @@ impl RemoteTimelineClient {
|
|||||||
|| {
|
|| {
|
||||||
upload::upload_index_part(
|
upload::upload_index_part(
|
||||||
&self.storage_impl,
|
&self.storage_impl,
|
||||||
&self.tenant_shard_id,
|
&self.tenant_id,
|
||||||
&self.timeline_id,
|
&self.timeline_id,
|
||||||
self.generation,
|
self.generation,
|
||||||
&index_part_with_deleted_at,
|
&index_part_with_deleted_at,
|
||||||
@@ -1022,9 +950,8 @@ impl RemoteTimelineClient {
|
|||||||
.drain()
|
.drain()
|
||||||
.map(|(file_name, meta)| {
|
.map(|(file_name, meta)| {
|
||||||
remote_layer_path(
|
remote_layer_path(
|
||||||
&self.tenant_shard_id.tenant_id,
|
&self.tenant_id,
|
||||||
&self.timeline_id,
|
&self.timeline_id,
|
||||||
meta.shard,
|
|
||||||
&file_name,
|
&file_name,
|
||||||
meta.generation,
|
meta.generation,
|
||||||
)
|
)
|
||||||
@@ -1037,7 +964,7 @@ impl RemoteTimelineClient {
|
|||||||
|
|
||||||
// Do not delete index part yet, it is needed for possible retry. If we remove it first
|
// Do not delete index part yet, it is needed for possible retry. If we remove it first
|
||||||
// and retry will arrive to different pageserver there wont be any traces of it on remote storage
|
// and retry will arrive to different pageserver there wont be any traces of it on remote storage
|
||||||
let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id);
|
let timeline_storage_path = remote_timeline_path(&self.tenant_id, &self.timeline_id);
|
||||||
|
|
||||||
// Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
|
// Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
|
||||||
// taking the burden of listing all the layers that we already know we should delete.
|
// taking the burden of listing all the layers that we already know we should delete.
|
||||||
@@ -1073,22 +1000,12 @@ impl RemoteTimelineClient {
|
|||||||
.unwrap_or(
|
.unwrap_or(
|
||||||
// No generation-suffixed indices, assume we are dealing with
|
// No generation-suffixed indices, assume we are dealing with
|
||||||
// a legacy index.
|
// a legacy index.
|
||||||
remote_index_path(&self.tenant_shard_id, &self.timeline_id, Generation::none()),
|
remote_index_path(&self.tenant_id, &self.timeline_id, Generation::none()),
|
||||||
);
|
);
|
||||||
|
|
||||||
let remaining_layers: Vec<RemotePath> = remaining
|
let remaining_layers: Vec<RemotePath> = remaining
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.filter(|p| {
|
.filter(|p| p!= &latest_index)
|
||||||
if p == &latest_index {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if let Some(name) = p.object_name() {
|
|
||||||
if name == INITDB_PATH {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
true
|
|
||||||
})
|
|
||||||
.inspect(|path| {
|
.inspect(|path| {
|
||||||
if let Some(name) = path.object_name() {
|
if let Some(name) = path.object_name() {
|
||||||
info!(%name, "deleting a file not referenced from index_part.json");
|
info!(%name, "deleting a file not referenced from index_part.json");
|
||||||
@@ -1154,9 +1071,7 @@ impl RemoteTimelineClient {
|
|||||||
upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
|
upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
|
||||||
}
|
}
|
||||||
|
|
||||||
UploadOp::Barrier(_) | UploadOp::Shutdown => {
|
UploadOp::Barrier(_) => upload_queue.inprogress_tasks.is_empty(),
|
||||||
upload_queue.inprogress_tasks.is_empty()
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// If we cannot launch this task, don't look any further.
|
// If we cannot launch this task, don't look any further.
|
||||||
@@ -1169,13 +1084,6 @@ impl RemoteTimelineClient {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if let UploadOp::Shutdown = next_op {
|
|
||||||
// leave the op in the queue but do not start more tasks; it will be dropped when
|
|
||||||
// the stop is called.
|
|
||||||
upload_queue.shutdown_ready.close();
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
// We can launch this task. Remove it from the queue first.
|
// We can launch this task. Remove it from the queue first.
|
||||||
let next_op = upload_queue.queued_operations.pop_front().unwrap();
|
let next_op = upload_queue.queued_operations.pop_front().unwrap();
|
||||||
|
|
||||||
@@ -1196,7 +1104,6 @@ impl RemoteTimelineClient {
|
|||||||
sender.send_replace(());
|
sender.send_replace(());
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
UploadOp::Shutdown => unreachable!("shutdown is intentionally never popped off"),
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// Assign unique ID to this task
|
// Assign unique ID to this task
|
||||||
@@ -1215,12 +1122,12 @@ impl RemoteTimelineClient {
|
|||||||
|
|
||||||
// Spawn task to perform the task
|
// Spawn task to perform the task
|
||||||
let self_rc = Arc::clone(self);
|
let self_rc = Arc::clone(self);
|
||||||
let tenant_shard_id = self.tenant_shard_id;
|
let tenant_id = self.tenant_id;
|
||||||
let timeline_id = self.timeline_id;
|
let timeline_id = self.timeline_id;
|
||||||
task_mgr::spawn(
|
task_mgr::spawn(
|
||||||
&self.runtime,
|
&self.runtime,
|
||||||
TaskKind::RemoteUploadTask,
|
TaskKind::RemoteUploadTask,
|
||||||
Some(self.tenant_shard_id.tenant_id),
|
Some(self.tenant_id),
|
||||||
Some(self.timeline_id),
|
Some(self.timeline_id),
|
||||||
"remote upload",
|
"remote upload",
|
||||||
false,
|
false,
|
||||||
@@ -1228,7 +1135,7 @@ impl RemoteTimelineClient {
|
|||||||
self_rc.perform_upload_task(task).await;
|
self_rc.perform_upload_task(task).await;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
.instrument(info_span!(parent: None, "remote_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id, %upload_task_id)),
|
.instrument(info_span!(parent: None, "remote_upload", %tenant_id, %timeline_id, %upload_task_id)),
|
||||||
);
|
);
|
||||||
|
|
||||||
// Loop back to process next task
|
// Loop back to process next task
|
||||||
@@ -1280,7 +1187,7 @@ impl RemoteTimelineClient {
|
|||||||
self.generation,
|
self.generation,
|
||||||
)
|
)
|
||||||
.measure_remote_op(
|
.measure_remote_op(
|
||||||
self.tenant_shard_id.tenant_id,
|
self.tenant_id,
|
||||||
self.timeline_id,
|
self.timeline_id,
|
||||||
RemoteOpFileKind::Layer,
|
RemoteOpFileKind::Layer,
|
||||||
RemoteOpKind::Upload,
|
RemoteOpKind::Upload,
|
||||||
@@ -1300,13 +1207,13 @@ impl RemoteTimelineClient {
|
|||||||
|
|
||||||
let res = upload::upload_index_part(
|
let res = upload::upload_index_part(
|
||||||
&self.storage_impl,
|
&self.storage_impl,
|
||||||
&self.tenant_shard_id,
|
&self.tenant_id,
|
||||||
&self.timeline_id,
|
&self.timeline_id,
|
||||||
self.generation,
|
self.generation,
|
||||||
index_part,
|
index_part,
|
||||||
)
|
)
|
||||||
.measure_remote_op(
|
.measure_remote_op(
|
||||||
self.tenant_shard_id.tenant_id,
|
self.tenant_id,
|
||||||
self.timeline_id,
|
self.timeline_id,
|
||||||
RemoteOpFileKind::Index,
|
RemoteOpFileKind::Index,
|
||||||
RemoteOpKind::Upload,
|
RemoteOpKind::Upload,
|
||||||
@@ -1322,22 +1229,20 @@ impl RemoteTimelineClient {
|
|||||||
}
|
}
|
||||||
res
|
res
|
||||||
}
|
}
|
||||||
UploadOp::Delete(delete) => {
|
UploadOp::Delete(delete) => self
|
||||||
pausable_failpoint!("before-delete-layer-pausable");
|
.deletion_queue_client
|
||||||
self.deletion_queue_client
|
.push_layers(
|
||||||
.push_layers(
|
self.tenant_id,
|
||||||
self.tenant_shard_id,
|
self.timeline_id,
|
||||||
self.timeline_id,
|
self.generation,
|
||||||
self.generation,
|
delete.layers.clone(),
|
||||||
delete.layers.clone(),
|
)
|
||||||
)
|
.await
|
||||||
.await
|
.map_err(|e| anyhow::anyhow!(e)),
|
||||||
.map_err(|e| anyhow::anyhow!(e))
|
UploadOp::Barrier(_) => {
|
||||||
}
|
|
||||||
unexpected @ UploadOp::Barrier(_) | unexpected @ UploadOp::Shutdown => {
|
|
||||||
// unreachable. Barrier operations are handled synchronously in
|
// unreachable. Barrier operations are handled synchronously in
|
||||||
// launch_queued_tasks
|
// launch_queued_tasks
|
||||||
warn!("unexpected {unexpected:?} operation in perform_upload_task");
|
warn!("unexpected Barrier operation in perform_upload_task");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@@ -1431,7 +1336,7 @@ impl RemoteTimelineClient {
|
|||||||
upload_queue.num_inprogress_deletions -= 1;
|
upload_queue.num_inprogress_deletions -= 1;
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
UploadOp::Barrier(..) | UploadOp::Shutdown => unreachable!(),
|
UploadOp::Barrier(_) => unreachable!(),
|
||||||
};
|
};
|
||||||
|
|
||||||
// Launch any queued tasks that were unblocked by this one.
|
// Launch any queued tasks that were unblocked by this one.
|
||||||
@@ -1445,7 +1350,7 @@ impl RemoteTimelineClient {
|
|||||||
// data safety guarantees (see docs/rfcs/025-generation-numbers.md)
|
// data safety guarantees (see docs/rfcs/025-generation-numbers.md)
|
||||||
self.deletion_queue_client
|
self.deletion_queue_client
|
||||||
.update_remote_consistent_lsn(
|
.update_remote_consistent_lsn(
|
||||||
self.tenant_shard_id,
|
self.tenant_id,
|
||||||
self.timeline_id,
|
self.timeline_id,
|
||||||
self.generation,
|
self.generation,
|
||||||
lsn,
|
lsn,
|
||||||
@@ -1486,7 +1391,7 @@ impl RemoteTimelineClient {
|
|||||||
reason: "should we track deletes? positive or negative sign?",
|
reason: "should we track deletes? positive or negative sign?",
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
UploadOp::Barrier(..) | UploadOp::Shutdown => {
|
UploadOp::Barrier(_) => {
|
||||||
// we do not account these
|
// we do not account these
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
@@ -1512,13 +1417,10 @@ impl RemoteTimelineClient {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Close the upload queue for new operations and cancel queued operations.
|
/// Close the upload queue for new operations and cancel queued operations.
|
||||||
///
|
|
||||||
/// Use [`RemoteTimelineClient::shutdown`] for graceful stop.
|
|
||||||
///
|
|
||||||
/// In-progress operations will still be running after this function returns.
|
/// In-progress operations will still be running after this function returns.
|
||||||
/// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
|
/// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
|
||||||
/// to wait for them to complete, after calling this function.
|
/// to wait for them to complete, after calling this function.
|
||||||
pub(crate) fn stop(&self) -> Result<(), StopError> {
|
pub fn stop(&self) -> Result<(), StopError> {
|
||||||
// Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
|
// Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
|
||||||
// into stopped state, thereby dropping all off the queued *ops* which haven't become *tasks* yet.
|
// into stopped state, thereby dropping all off the queued *ops* which haven't become *tasks* yet.
|
||||||
// The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business.
|
// The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business.
|
||||||
@@ -1556,8 +1458,6 @@ impl RemoteTimelineClient {
|
|||||||
queued_operations: VecDeque::default(),
|
queued_operations: VecDeque::default(),
|
||||||
#[cfg(feature = "testing")]
|
#[cfg(feature = "testing")]
|
||||||
dangling_files: HashMap::default(),
|
dangling_files: HashMap::default(),
|
||||||
shutting_down: false,
|
|
||||||
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
|
|
||||||
};
|
};
|
||||||
|
|
||||||
let upload_queue = std::mem::replace(
|
let upload_queue = std::mem::replace(
|
||||||
@@ -1603,32 +1503,24 @@ impl RemoteTimelineClient {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
|
pub fn remote_timelines_path(tenant_id: &TenantId) -> RemotePath {
|
||||||
let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}");
|
let path = format!("tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}");
|
||||||
RemotePath::from_string(&path).expect("Failed to construct path")
|
RemotePath::from_string(&path).expect("Failed to construct path")
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn remote_timeline_path(
|
pub fn remote_timeline_path(tenant_id: &TenantId, timeline_id: &TimelineId) -> RemotePath {
|
||||||
tenant_shard_id: &TenantShardId,
|
remote_timelines_path(tenant_id).join(Utf8Path::new(&timeline_id.to_string()))
|
||||||
timeline_id: &TimelineId,
|
|
||||||
) -> RemotePath {
|
|
||||||
remote_timelines_path(tenant_shard_id).join(Utf8Path::new(&timeline_id.to_string()))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Note that the shard component of a remote layer path is _not_ always the same
|
|
||||||
/// as in the TenantShardId of the caller: tenants may reference layers from a different
|
|
||||||
/// ShardIndex. Use the ShardIndex from the layer's metadata.
|
|
||||||
pub fn remote_layer_path(
|
pub fn remote_layer_path(
|
||||||
tenant_id: &TenantId,
|
tenant_id: &TenantId,
|
||||||
timeline_id: &TimelineId,
|
timeline_id: &TimelineId,
|
||||||
shard: ShardIndex,
|
|
||||||
layer_file_name: &LayerFileName,
|
layer_file_name: &LayerFileName,
|
||||||
generation: Generation,
|
generation: Generation,
|
||||||
) -> RemotePath {
|
) -> RemotePath {
|
||||||
// Generation-aware key format
|
// Generation-aware key format
|
||||||
let path = format!(
|
let path = format!(
|
||||||
"tenants/{tenant_id}{0}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{1}{2}",
|
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
|
||||||
shard.get_suffix(),
|
|
||||||
layer_file_name.file_name(),
|
layer_file_name.file_name(),
|
||||||
generation.get_suffix()
|
generation.get_suffix()
|
||||||
);
|
);
|
||||||
@@ -1636,20 +1528,13 @@ pub fn remote_layer_path(
|
|||||||
RemotePath::from_string(&path).expect("Failed to construct path")
|
RemotePath::from_string(&path).expect("Failed to construct path")
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn remote_initdb_archive_path(tenant_id: &TenantId, timeline_id: &TimelineId) -> RemotePath {
|
|
||||||
RemotePath::from_string(&format!(
|
|
||||||
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{INITDB_PATH}"
|
|
||||||
))
|
|
||||||
.expect("Failed to construct path")
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn remote_index_path(
|
pub fn remote_index_path(
|
||||||
tenant_shard_id: &TenantShardId,
|
tenant_id: &TenantId,
|
||||||
timeline_id: &TimelineId,
|
timeline_id: &TimelineId,
|
||||||
generation: Generation,
|
generation: Generation,
|
||||||
) -> RemotePath {
|
) -> RemotePath {
|
||||||
RemotePath::from_string(&format!(
|
RemotePath::from_string(&format!(
|
||||||
"tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
|
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
|
||||||
IndexPart::FILE_NAME,
|
IndexPart::FILE_NAME,
|
||||||
generation.get_suffix()
|
generation.get_suffix()
|
||||||
))
|
))
|
||||||
@@ -1791,14 +1676,14 @@ mod tests {
|
|||||||
Arc::new(RemoteTimelineClient {
|
Arc::new(RemoteTimelineClient {
|
||||||
conf: self.harness.conf,
|
conf: self.harness.conf,
|
||||||
runtime: tokio::runtime::Handle::current(),
|
runtime: tokio::runtime::Handle::current(),
|
||||||
tenant_shard_id: self.harness.tenant_shard_id,
|
tenant_id: self.harness.tenant_id,
|
||||||
timeline_id: TIMELINE_ID,
|
timeline_id: TIMELINE_ID,
|
||||||
generation,
|
generation,
|
||||||
storage_impl: self.harness.remote_storage.clone(),
|
storage_impl: self.harness.remote_storage.clone(),
|
||||||
deletion_queue_client: self.harness.deletion_queue.new_client(),
|
deletion_queue_client: self.harness.deletion_queue.new_client(),
|
||||||
upload_queue: Mutex::new(UploadQueue::Uninitialized),
|
upload_queue: Mutex::new(UploadQueue::Uninitialized),
|
||||||
metrics: Arc::new(RemoteTimelineClientMetrics::new(
|
metrics: Arc::new(RemoteTimelineClientMetrics::new(
|
||||||
&self.harness.tenant_shard_id,
|
&self.harness.tenant_id,
|
||||||
&TIMELINE_ID,
|
&TIMELINE_ID,
|
||||||
)),
|
)),
|
||||||
})
|
})
|
||||||
@@ -1874,7 +1759,6 @@ mod tests {
|
|||||||
println!("remote_timeline_dir: {remote_timeline_dir}");
|
println!("remote_timeline_dir: {remote_timeline_dir}");
|
||||||
|
|
||||||
let generation = harness.generation;
|
let generation = harness.generation;
|
||||||
let shard = harness.shard;
|
|
||||||
|
|
||||||
// Create a couple of dummy files, schedule upload for them
|
// Create a couple of dummy files, schedule upload for them
|
||||||
|
|
||||||
@@ -1891,7 +1775,7 @@ mod tests {
|
|||||||
harness.conf,
|
harness.conf,
|
||||||
&timeline,
|
&timeline,
|
||||||
name,
|
name,
|
||||||
LayerFileMetadata::new(contents.len() as u64, generation, shard),
|
LayerFileMetadata::new(contents.len() as u64, generation),
|
||||||
)
|
)
|
||||||
}).collect::<Vec<_>>();
|
}).collect::<Vec<_>>();
|
||||||
|
|
||||||
@@ -2040,7 +1924,7 @@ mod tests {
|
|||||||
harness.conf,
|
harness.conf,
|
||||||
&timeline,
|
&timeline,
|
||||||
layer_file_name_1.clone(),
|
layer_file_name_1.clone(),
|
||||||
LayerFileMetadata::new(content_1.len() as u64, harness.generation, harness.shard),
|
LayerFileMetadata::new(content_1.len() as u64, harness.generation),
|
||||||
);
|
);
|
||||||
|
|
||||||
#[derive(Debug, PartialEq, Clone, Copy)]
|
#[derive(Debug, PartialEq, Clone, Copy)]
|
||||||
@@ -2126,12 +2010,7 @@ mod tests {
|
|||||||
std::fs::create_dir_all(remote_timeline_dir).expect("creating test dir should work");
|
std::fs::create_dir_all(remote_timeline_dir).expect("creating test dir should work");
|
||||||
|
|
||||||
let index_path = test_state.harness.remote_fs_dir.join(
|
let index_path = test_state.harness.remote_fs_dir.join(
|
||||||
remote_index_path(
|
remote_index_path(&test_state.harness.tenant_id, &TIMELINE_ID, generation).get_path(),
|
||||||
&test_state.harness.tenant_shard_id,
|
|
||||||
&TIMELINE_ID,
|
|
||||||
generation,
|
|
||||||
)
|
|
||||||
.get_path(),
|
|
||||||
);
|
);
|
||||||
eprintln!("Writing {index_path}");
|
eprintln!("Writing {index_path}");
|
||||||
std::fs::write(&index_path, index_part_bytes).unwrap();
|
std::fs::write(&index_path, index_part_bytes).unwrap();
|
||||||
|
|||||||
@@ -8,12 +8,10 @@ use std::future::Future;
|
|||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
||||||
use anyhow::{anyhow, Context};
|
use anyhow::{anyhow, Context};
|
||||||
use camino::{Utf8Path, Utf8PathBuf};
|
use camino::Utf8Path;
|
||||||
use pageserver_api::shard::TenantShardId;
|
use tokio::fs;
|
||||||
use tokio::fs::{self, File, OpenOptions};
|
use tokio::io::AsyncWriteExt;
|
||||||
use tokio::io::{AsyncSeekExt, AsyncWriteExt};
|
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
use tracing::warn;
|
|
||||||
use utils::{backoff, crashsafe};
|
use utils::{backoff, crashsafe};
|
||||||
|
|
||||||
use crate::config::PageServerConf;
|
use crate::config::PageServerConf;
|
||||||
@@ -21,15 +19,14 @@ use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_
|
|||||||
use crate::tenant::storage_layer::LayerFileName;
|
use crate::tenant::storage_layer::LayerFileName;
|
||||||
use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||||
use crate::tenant::Generation;
|
use crate::tenant::Generation;
|
||||||
use crate::TEMP_FILE_SUFFIX;
|
|
||||||
use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
|
use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
|
||||||
use utils::crashsafe::path_with_suffix_extension;
|
use utils::crashsafe::path_with_suffix_extension;
|
||||||
use utils::id::TimelineId;
|
use utils::id::{TenantId, TimelineId};
|
||||||
|
|
||||||
use super::index::{IndexPart, LayerFileMetadata};
|
use super::index::{IndexPart, LayerFileMetadata};
|
||||||
use super::{
|
use super::{
|
||||||
parse_remote_index_path, remote_index_path, remote_initdb_archive_path,
|
parse_remote_index_path, remote_index_path, FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||||
FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
|
FAILED_REMOTE_OP_RETRIES,
|
||||||
};
|
};
|
||||||
|
|
||||||
static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
|
static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
|
||||||
@@ -42,7 +39,7 @@ static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
|
|||||||
pub async fn download_layer_file<'a>(
|
pub async fn download_layer_file<'a>(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
storage: &'a GenericRemoteStorage,
|
storage: &'a GenericRemoteStorage,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_id: TenantId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
layer_file_name: &'a LayerFileName,
|
layer_file_name: &'a LayerFileName,
|
||||||
layer_metadata: &'a LayerFileMetadata,
|
layer_metadata: &'a LayerFileMetadata,
|
||||||
@@ -50,13 +47,12 @@ pub async fn download_layer_file<'a>(
|
|||||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
|
|
||||||
let local_path = conf
|
let local_path = conf
|
||||||
.timeline_path(&tenant_shard_id, &timeline_id)
|
.timeline_path(&tenant_id, &timeline_id)
|
||||||
.join(layer_file_name.file_name());
|
.join(layer_file_name.file_name());
|
||||||
|
|
||||||
let remote_path = remote_layer_path(
|
let remote_path = remote_layer_path(
|
||||||
&tenant_shard_id.tenant_id,
|
&tenant_id,
|
||||||
&timeline_id,
|
&timeline_id,
|
||||||
layer_metadata.shard,
|
|
||||||
layer_file_name,
|
layer_file_name,
|
||||||
layer_metadata.generation,
|
layer_metadata.generation,
|
||||||
);
|
);
|
||||||
@@ -173,10 +169,10 @@ pub fn is_temp_download_file(path: &Utf8Path) -> bool {
|
|||||||
/// List timelines of given tenant in remote storage
|
/// List timelines of given tenant in remote storage
|
||||||
pub async fn list_remote_timelines(
|
pub async fn list_remote_timelines(
|
||||||
storage: &GenericRemoteStorage,
|
storage: &GenericRemoteStorage,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_id: TenantId,
|
||||||
cancel: CancellationToken,
|
cancel: CancellationToken,
|
||||||
) -> anyhow::Result<(HashSet<TimelineId>, HashSet<String>)> {
|
) -> anyhow::Result<(HashSet<TimelineId>, HashSet<String>)> {
|
||||||
let remote_path = remote_timelines_path(&tenant_shard_id);
|
let remote_path = remote_timelines_path(&tenant_id);
|
||||||
|
|
||||||
fail::fail_point!("storage-sync-list-remote-timelines", |_| {
|
fail::fail_point!("storage-sync-list-remote-timelines", |_| {
|
||||||
anyhow::bail!("storage-sync-list-remote-timelines");
|
anyhow::bail!("storage-sync-list-remote-timelines");
|
||||||
@@ -184,7 +180,7 @@ pub async fn list_remote_timelines(
|
|||||||
|
|
||||||
let listing = download_retry_forever(
|
let listing = download_retry_forever(
|
||||||
|| storage.list(Some(&remote_path), ListingMode::WithDelimiter),
|
|| storage.list(Some(&remote_path), ListingMode::WithDelimiter),
|
||||||
&format!("list timelines for {tenant_shard_id}"),
|
&format!("list timelines for {tenant_id}"),
|
||||||
cancel,
|
cancel,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
@@ -194,7 +190,7 @@ pub async fn list_remote_timelines(
|
|||||||
|
|
||||||
for timeline_remote_storage_key in listing.prefixes {
|
for timeline_remote_storage_key in listing.prefixes {
|
||||||
let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
|
let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
|
||||||
anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_shard_id}")
|
anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
match object_name.parse::<TimelineId>() {
|
match object_name.parse::<TimelineId>() {
|
||||||
@@ -215,12 +211,12 @@ pub async fn list_remote_timelines(
|
|||||||
|
|
||||||
async fn do_download_index_part(
|
async fn do_download_index_part(
|
||||||
storage: &GenericRemoteStorage,
|
storage: &GenericRemoteStorage,
|
||||||
tenant_shard_id: &TenantShardId,
|
tenant_id: &TenantId,
|
||||||
timeline_id: &TimelineId,
|
timeline_id: &TimelineId,
|
||||||
index_generation: Generation,
|
index_generation: Generation,
|
||||||
cancel: CancellationToken,
|
cancel: CancellationToken,
|
||||||
) -> Result<IndexPart, DownloadError> {
|
) -> Result<IndexPart, DownloadError> {
|
||||||
let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
|
let remote_path = remote_index_path(tenant_id, timeline_id, index_generation);
|
||||||
|
|
||||||
let index_part_bytes = download_retry_forever(
|
let index_part_bytes = download_retry_forever(
|
||||||
|| async {
|
|| async {
|
||||||
@@ -256,7 +252,7 @@ async fn do_download_index_part(
|
|||||||
#[tracing::instrument(skip_all, fields(generation=?my_generation))]
|
#[tracing::instrument(skip_all, fields(generation=?my_generation))]
|
||||||
pub(super) async fn download_index_part(
|
pub(super) async fn download_index_part(
|
||||||
storage: &GenericRemoteStorage,
|
storage: &GenericRemoteStorage,
|
||||||
tenant_shard_id: &TenantShardId,
|
tenant_id: &TenantId,
|
||||||
timeline_id: &TimelineId,
|
timeline_id: &TimelineId,
|
||||||
my_generation: Generation,
|
my_generation: Generation,
|
||||||
cancel: CancellationToken,
|
cancel: CancellationToken,
|
||||||
@@ -265,14 +261,8 @@ pub(super) async fn download_index_part(
|
|||||||
|
|
||||||
if my_generation.is_none() {
|
if my_generation.is_none() {
|
||||||
// Operating without generations: just fetch the generation-less path
|
// Operating without generations: just fetch the generation-less path
|
||||||
return do_download_index_part(
|
return do_download_index_part(storage, tenant_id, timeline_id, my_generation, cancel)
|
||||||
storage,
|
.await;
|
||||||
tenant_shard_id,
|
|
||||||
timeline_id,
|
|
||||||
my_generation,
|
|
||||||
cancel,
|
|
||||||
)
|
|
||||||
.await;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Stale case: If we were intentionally attached in a stale generation, there may already be a remote
|
// Stale case: If we were intentionally attached in a stale generation, there may already be a remote
|
||||||
@@ -281,7 +271,7 @@ pub(super) async fn download_index_part(
|
|||||||
// This is an optimization to avoid doing the listing for the general case below.
|
// This is an optimization to avoid doing the listing for the general case below.
|
||||||
let res = do_download_index_part(
|
let res = do_download_index_part(
|
||||||
storage,
|
storage,
|
||||||
tenant_shard_id,
|
tenant_id,
|
||||||
timeline_id,
|
timeline_id,
|
||||||
my_generation,
|
my_generation,
|
||||||
cancel.clone(),
|
cancel.clone(),
|
||||||
@@ -308,7 +298,7 @@ pub(super) async fn download_index_part(
|
|||||||
// This is an optimization to avoid doing the listing for the general case below.
|
// This is an optimization to avoid doing the listing for the general case below.
|
||||||
let res = do_download_index_part(
|
let res = do_download_index_part(
|
||||||
storage,
|
storage,
|
||||||
tenant_shard_id,
|
tenant_id,
|
||||||
timeline_id,
|
timeline_id,
|
||||||
my_generation.previous(),
|
my_generation.previous(),
|
||||||
cancel.clone(),
|
cancel.clone(),
|
||||||
@@ -330,9 +320,8 @@ pub(super) async fn download_index_part(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// General case/fallback: if there is no index at my_generation or prev_generation, then list all index_part.json
|
// General case/fallback: if there is no index at my_generation or prev_generation, then list all index_part.json
|
||||||
// objects, and select the highest one with a generation <= my_generation. Constructing the prefix is equivalent
|
// objects, and select the highest one with a generation <= my_generation.
|
||||||
// to constructing a full index path with no generation, because the generation is a suffix.
|
let index_prefix = remote_index_path(tenant_id, timeline_id, Generation::none());
|
||||||
let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
|
|
||||||
let indices = backoff::retry(
|
let indices = backoff::retry(
|
||||||
|| async { storage.list_files(Some(&index_prefix)).await },
|
|| async { storage.list_files(Some(&index_prefix)).await },
|
||||||
|_| false,
|
|_| false,
|
||||||
@@ -358,87 +347,18 @@ pub(super) async fn download_index_part(
|
|||||||
match max_previous_generation {
|
match max_previous_generation {
|
||||||
Some(g) => {
|
Some(g) => {
|
||||||
tracing::debug!("Found index_part in generation {g:?}");
|
tracing::debug!("Found index_part in generation {g:?}");
|
||||||
do_download_index_part(storage, tenant_shard_id, timeline_id, g, cancel).await
|
do_download_index_part(storage, tenant_id, timeline_id, g, cancel).await
|
||||||
}
|
}
|
||||||
None => {
|
None => {
|
||||||
// Migration from legacy pre-generation state: we have a generation but no prior
|
// Migration from legacy pre-generation state: we have a generation but no prior
|
||||||
// attached pageservers did. Try to load from a no-generation path.
|
// attached pageservers did. Try to load from a no-generation path.
|
||||||
tracing::info!("No index_part.json* found");
|
tracing::info!("No index_part.json* found");
|
||||||
do_download_index_part(
|
do_download_index_part(storage, tenant_id, timeline_id, Generation::none(), cancel)
|
||||||
storage,
|
.await
|
||||||
tenant_shard_id,
|
|
||||||
timeline_id,
|
|
||||||
Generation::none(),
|
|
||||||
cancel,
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) async fn download_initdb_tar_zst(
|
|
||||||
conf: &'static PageServerConf,
|
|
||||||
storage: &GenericRemoteStorage,
|
|
||||||
tenant_shard_id: &TenantShardId,
|
|
||||||
timeline_id: &TimelineId,
|
|
||||||
) -> Result<(Utf8PathBuf, File), DownloadError> {
|
|
||||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
|
||||||
|
|
||||||
let remote_path = remote_initdb_archive_path(&tenant_shard_id.tenant_id, timeline_id);
|
|
||||||
|
|
||||||
let timeline_path = conf.timelines_path(tenant_shard_id);
|
|
||||||
|
|
||||||
if !timeline_path.exists() {
|
|
||||||
tokio::fs::create_dir_all(&timeline_path)
|
|
||||||
.await
|
|
||||||
.with_context(|| format!("timeline dir creation {timeline_path}"))
|
|
||||||
.map_err(DownloadError::Other)?;
|
|
||||||
}
|
|
||||||
let temp_path = timeline_path.join(format!("{INITDB_PATH}-{timeline_id}.{TEMP_FILE_SUFFIX}"));
|
|
||||||
|
|
||||||
let file = download_retry(
|
|
||||||
|| async {
|
|
||||||
let mut file = OpenOptions::new()
|
|
||||||
.create(true)
|
|
||||||
.truncate(true)
|
|
||||||
.read(true)
|
|
||||||
.write(true)
|
|
||||||
.open(&temp_path)
|
|
||||||
.await
|
|
||||||
.with_context(|| format!("tempfile creation {temp_path}"))
|
|
||||||
.map_err(DownloadError::Other)?;
|
|
||||||
|
|
||||||
let mut download = storage.download(&remote_path).await?;
|
|
||||||
|
|
||||||
tokio::io::copy(&mut download.download_stream, &mut file)
|
|
||||||
.await
|
|
||||||
.with_context(|| format!("download initdb.tar.zst at {remote_path:?}"))
|
|
||||||
.map_err(DownloadError::Other)?;
|
|
||||||
|
|
||||||
file.seek(std::io::SeekFrom::Start(0))
|
|
||||||
.await
|
|
||||||
.with_context(|| format!("rewinding initdb.tar.zst at: {remote_path:?}"))
|
|
||||||
.map_err(DownloadError::Other)?;
|
|
||||||
|
|
||||||
Ok(file)
|
|
||||||
},
|
|
||||||
&format!("download {remote_path}"),
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
.map_err(|e| {
|
|
||||||
if temp_path.exists() {
|
|
||||||
// Do a best-effort attempt at deleting the temporary file upon encountering an error.
|
|
||||||
// We don't have async here nor do we want to pile on any extra errors.
|
|
||||||
if let Err(e) = std::fs::remove_file(&temp_path) {
|
|
||||||
warn!("error deleting temporary file {temp_path}: {e}");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
e
|
|
||||||
})?;
|
|
||||||
|
|
||||||
Ok((temp_path, file))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Helper function to handle retries for a download operation.
|
/// Helper function to handle retries for a download operation.
|
||||||
///
|
///
|
||||||
/// Remote operations can fail due to rate limits (IAM, S3), spurious network
|
/// Remote operations can fail due to rate limits (IAM, S3), spurious network
|
||||||
|
|||||||
@@ -12,7 +12,6 @@ use crate::tenant::metadata::TimelineMetadata;
|
|||||||
use crate::tenant::storage_layer::LayerFileName;
|
use crate::tenant::storage_layer::LayerFileName;
|
||||||
use crate::tenant::upload_queue::UploadQueueInitialized;
|
use crate::tenant::upload_queue::UploadQueueInitialized;
|
||||||
use crate::tenant::Generation;
|
use crate::tenant::Generation;
|
||||||
use pageserver_api::shard::ShardIndex;
|
|
||||||
|
|
||||||
use utils::lsn::Lsn;
|
use utils::lsn::Lsn;
|
||||||
|
|
||||||
@@ -26,8 +25,6 @@ pub struct LayerFileMetadata {
|
|||||||
file_size: u64,
|
file_size: u64,
|
||||||
|
|
||||||
pub(crate) generation: Generation,
|
pub(crate) generation: Generation,
|
||||||
|
|
||||||
pub(crate) shard: ShardIndex,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
|
impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
|
||||||
@@ -35,17 +32,15 @@ impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
|
|||||||
LayerFileMetadata {
|
LayerFileMetadata {
|
||||||
file_size: other.file_size,
|
file_size: other.file_size,
|
||||||
generation: other.generation,
|
generation: other.generation,
|
||||||
shard: other.shard,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl LayerFileMetadata {
|
impl LayerFileMetadata {
|
||||||
pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
|
pub fn new(file_size: u64, generation: Generation) -> Self {
|
||||||
LayerFileMetadata {
|
LayerFileMetadata {
|
||||||
file_size,
|
file_size,
|
||||||
generation,
|
generation,
|
||||||
shard,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -133,14 +128,6 @@ impl IndexPart {
|
|||||||
pub fn get_disk_consistent_lsn(&self) -> Lsn {
|
pub fn get_disk_consistent_lsn(&self) -> Lsn {
|
||||||
self.disk_consistent_lsn
|
self.disk_consistent_lsn
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn from_s3_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
|
|
||||||
serde_json::from_slice::<IndexPart>(bytes)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn to_s3_bytes(&self) -> serde_json::Result<Vec<u8>> {
|
|
||||||
serde_json::to_vec(self)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TryFrom<&UploadQueueInitialized> for IndexPart {
|
impl TryFrom<&UploadQueueInitialized> for IndexPart {
|
||||||
@@ -166,10 +153,6 @@ pub struct IndexLayerMetadata {
|
|||||||
#[serde(default = "Generation::none")]
|
#[serde(default = "Generation::none")]
|
||||||
#[serde(skip_serializing_if = "Generation::is_none")]
|
#[serde(skip_serializing_if = "Generation::is_none")]
|
||||||
pub generation: Generation,
|
pub generation: Generation,
|
||||||
|
|
||||||
#[serde(default = "ShardIndex::unsharded")]
|
|
||||||
#[serde(skip_serializing_if = "ShardIndex::is_unsharded")]
|
|
||||||
pub shard: ShardIndex,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<LayerFileMetadata> for IndexLayerMetadata {
|
impl From<LayerFileMetadata> for IndexLayerMetadata {
|
||||||
@@ -177,7 +160,6 @@ impl From<LayerFileMetadata> for IndexLayerMetadata {
|
|||||||
IndexLayerMetadata {
|
IndexLayerMetadata {
|
||||||
file_size: other.file_size,
|
file_size: other.file_size,
|
||||||
generation: other.generation,
|
generation: other.generation,
|
||||||
shard: other.shard,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -205,15 +187,13 @@ mod tests {
|
|||||||
layer_metadata: HashMap::from([
|
layer_metadata: HashMap::from([
|
||||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||||
file_size: 25600000,
|
file_size: 25600000,
|
||||||
generation: Generation::none(),
|
generation: Generation::none()
|
||||||
shard: ShardIndex::unsharded()
|
|
||||||
}),
|
}),
|
||||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||||
// serde_json should always parse this but this might be a double with jq for
|
// serde_json should always parse this but this might be a double with jq for
|
||||||
// example.
|
// example.
|
||||||
file_size: 9007199254741001,
|
file_size: 9007199254741001,
|
||||||
generation: Generation::none(),
|
generation: Generation::none()
|
||||||
shard: ShardIndex::unsharded()
|
|
||||||
})
|
})
|
||||||
]),
|
]),
|
||||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||||
@@ -221,7 +201,7 @@ mod tests {
|
|||||||
deleted_at: None,
|
deleted_at: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
||||||
assert_eq!(part, expected);
|
assert_eq!(part, expected);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -245,15 +225,13 @@ mod tests {
|
|||||||
layer_metadata: HashMap::from([
|
layer_metadata: HashMap::from([
|
||||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||||
file_size: 25600000,
|
file_size: 25600000,
|
||||||
generation: Generation::none(),
|
generation: Generation::none()
|
||||||
shard: ShardIndex::unsharded()
|
|
||||||
}),
|
}),
|
||||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||||
// serde_json should always parse this but this might be a double with jq for
|
// serde_json should always parse this but this might be a double with jq for
|
||||||
// example.
|
// example.
|
||||||
file_size: 9007199254741001,
|
file_size: 9007199254741001,
|
||||||
generation: Generation::none(),
|
generation: Generation::none()
|
||||||
shard: ShardIndex::unsharded()
|
|
||||||
})
|
})
|
||||||
]),
|
]),
|
||||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||||
@@ -261,7 +239,7 @@ mod tests {
|
|||||||
deleted_at: None,
|
deleted_at: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
||||||
assert_eq!(part, expected);
|
assert_eq!(part, expected);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -286,15 +264,13 @@ mod tests {
|
|||||||
layer_metadata: HashMap::from([
|
layer_metadata: HashMap::from([
|
||||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||||
file_size: 25600000,
|
file_size: 25600000,
|
||||||
generation: Generation::none(),
|
generation: Generation::none()
|
||||||
shard: ShardIndex::unsharded()
|
|
||||||
}),
|
}),
|
||||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||||
// serde_json should always parse this but this might be a double with jq for
|
// serde_json should always parse this but this might be a double with jq for
|
||||||
// example.
|
// example.
|
||||||
file_size: 9007199254741001,
|
file_size: 9007199254741001,
|
||||||
generation: Generation::none(),
|
generation: Generation::none()
|
||||||
shard: ShardIndex::unsharded()
|
|
||||||
})
|
})
|
||||||
]),
|
]),
|
||||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||||
@@ -303,7 +279,7 @@ mod tests {
|
|||||||
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
|
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
|
||||||
};
|
};
|
||||||
|
|
||||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
||||||
assert_eq!(part, expected);
|
assert_eq!(part, expected);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -347,7 +323,7 @@ mod tests {
|
|||||||
deleted_at: None,
|
deleted_at: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap();
|
let empty_layers_parsed = serde_json::from_str::<IndexPart>(empty_layers_json).unwrap();
|
||||||
|
|
||||||
assert_eq!(empty_layers_parsed, expected);
|
assert_eq!(empty_layers_parsed, expected);
|
||||||
}
|
}
|
||||||
@@ -370,24 +346,22 @@ mod tests {
|
|||||||
layer_metadata: HashMap::from([
|
layer_metadata: HashMap::from([
|
||||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||||
file_size: 25600000,
|
file_size: 25600000,
|
||||||
generation: Generation::none(),
|
generation: Generation::none()
|
||||||
shard: ShardIndex::unsharded()
|
|
||||||
}),
|
}),
|
||||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||||
// serde_json should always parse this but this might be a double with jq for
|
// serde_json should always parse this but this might be a double with jq for
|
||||||
// example.
|
// example.
|
||||||
file_size: 9007199254741001,
|
file_size: 9007199254741001,
|
||||||
generation: Generation::none(),
|
generation: Generation::none()
|
||||||
shard: ShardIndex::unsharded()
|
|
||||||
})
|
})
|
||||||
]),
|
]),
|
||||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||||
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
|
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
|
||||||
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
|
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
|
||||||
};
|
};
|
||||||
|
|
||||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
||||||
assert_eq!(part, expected);
|
assert_eq!(part, expected);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,19 +1,15 @@
|
|||||||
//! Helper functions to upload files to remote storage with a RemoteStorage
|
//! Helper functions to upload files to remote storage with a RemoteStorage
|
||||||
|
|
||||||
use anyhow::{bail, Context};
|
use anyhow::{bail, Context};
|
||||||
use bytes::Bytes;
|
|
||||||
use camino::Utf8Path;
|
use camino::Utf8Path;
|
||||||
use fail::fail_point;
|
use fail::fail_point;
|
||||||
use pageserver_api::shard::TenantShardId;
|
|
||||||
use std::io::ErrorKind;
|
use std::io::ErrorKind;
|
||||||
use tokio::fs;
|
use tokio::fs;
|
||||||
|
|
||||||
use super::Generation;
|
use super::Generation;
|
||||||
use crate::{
|
use crate::{
|
||||||
config::PageServerConf,
|
config::PageServerConf,
|
||||||
tenant::remote_timeline_client::{
|
tenant::remote_timeline_client::{index::IndexPart, remote_index_path, remote_path},
|
||||||
index::IndexPart, remote_index_path, remote_initdb_archive_path, remote_path,
|
|
||||||
},
|
|
||||||
};
|
};
|
||||||
use remote_storage::GenericRemoteStorage;
|
use remote_storage::GenericRemoteStorage;
|
||||||
use utils::id::{TenantId, TimelineId};
|
use utils::id::{TenantId, TimelineId};
|
||||||
@@ -25,7 +21,7 @@ use tracing::info;
|
|||||||
/// Serializes and uploads the given index part data to the remote storage.
|
/// Serializes and uploads the given index part data to the remote storage.
|
||||||
pub(super) async fn upload_index_part<'a>(
|
pub(super) async fn upload_index_part<'a>(
|
||||||
storage: &'a GenericRemoteStorage,
|
storage: &'a GenericRemoteStorage,
|
||||||
tenant_shard_id: &TenantShardId,
|
tenant_id: &TenantId,
|
||||||
timeline_id: &TimelineId,
|
timeline_id: &TimelineId,
|
||||||
generation: Generation,
|
generation: Generation,
|
||||||
index_part: &'a IndexPart,
|
index_part: &'a IndexPart,
|
||||||
@@ -37,17 +33,16 @@ pub(super) async fn upload_index_part<'a>(
|
|||||||
});
|
});
|
||||||
pausable_failpoint!("before-upload-index-pausable");
|
pausable_failpoint!("before-upload-index-pausable");
|
||||||
|
|
||||||
let index_part_bytes = index_part
|
let index_part_bytes =
|
||||||
.to_s3_bytes()
|
serde_json::to_vec(&index_part).context("serialize index part file into bytes")?;
|
||||||
.context("serialize index part file into bytes")?;
|
|
||||||
let index_part_size = index_part_bytes.len();
|
let index_part_size = index_part_bytes.len();
|
||||||
let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes));
|
let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes));
|
||||||
|
|
||||||
let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation);
|
let remote_path = remote_index_path(tenant_id, timeline_id, generation);
|
||||||
storage
|
storage
|
||||||
.upload_storage_object(Box::new(index_part_bytes), index_part_size, &remote_path)
|
.upload_storage_object(Box::new(index_part_bytes), index_part_size, &remote_path)
|
||||||
.await
|
.await
|
||||||
.with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
|
.with_context(|| format!("upload index part for '{tenant_id} / {timeline_id}'"))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Attempts to upload given layer files.
|
/// Attempts to upload given layer files.
|
||||||
@@ -108,22 +103,3 @@ pub(super) async fn upload_timeline_layer<'a>(
|
|||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Uploads the given `initdb` data to the remote storage.
|
|
||||||
pub(crate) async fn upload_initdb_dir(
|
|
||||||
storage: &GenericRemoteStorage,
|
|
||||||
tenant_id: &TenantId,
|
|
||||||
timeline_id: &TimelineId,
|
|
||||||
initdb_dir: Bytes,
|
|
||||||
) -> anyhow::Result<()> {
|
|
||||||
tracing::trace!("uploading initdb dir");
|
|
||||||
|
|
||||||
let size = initdb_dir.len();
|
|
||||||
let bytes = tokio::io::BufReader::new(std::io::Cursor::new(initdb_dir));
|
|
||||||
|
|
||||||
let remote_path = remote_initdb_archive_path(tenant_id, timeline_id);
|
|
||||||
storage
|
|
||||||
.upload_storage_object(bytes, size, &remote_path)
|
|
||||||
.await
|
|
||||||
.with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -6,7 +6,6 @@ use std::sync::Arc;
|
|||||||
use anyhow::{bail, Context};
|
use anyhow::{bail, Context};
|
||||||
use tokio::sync::oneshot::error::RecvError;
|
use tokio::sync::oneshot::error::RecvError;
|
||||||
use tokio::sync::Semaphore;
|
use tokio::sync::Semaphore;
|
||||||
use tokio_util::sync::CancellationToken;
|
|
||||||
|
|
||||||
use crate::context::RequestContext;
|
use crate::context::RequestContext;
|
||||||
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
|
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
|
||||||
@@ -114,12 +113,11 @@ pub(super) async fn gather_inputs(
|
|||||||
max_retention_period: Option<u64>,
|
max_retention_period: Option<u64>,
|
||||||
logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
|
logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
|
||||||
cause: LogicalSizeCalculationCause,
|
cause: LogicalSizeCalculationCause,
|
||||||
cancel: &CancellationToken,
|
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> anyhow::Result<ModelInputs> {
|
) -> anyhow::Result<ModelInputs> {
|
||||||
// refresh is needed to update gc related pitr_cutoff and horizon_cutoff
|
// refresh is needed to update gc related pitr_cutoff and horizon_cutoff
|
||||||
tenant
|
tenant
|
||||||
.refresh_gc_info(cancel, ctx)
|
.refresh_gc_info(ctx)
|
||||||
.await
|
.await
|
||||||
.context("Failed to refresh gc_info before gathering inputs")?;
|
.context("Failed to refresh gc_info before gathering inputs")?;
|
||||||
|
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
pub mod delta_layer;
|
pub mod delta_layer;
|
||||||
mod filename;
|
mod filename;
|
||||||
pub mod image_layer;
|
mod image_layer;
|
||||||
mod inmemory_layer;
|
mod inmemory_layer;
|
||||||
mod layer;
|
mod layer;
|
||||||
mod layer_desc;
|
mod layer_desc;
|
||||||
@@ -24,7 +24,10 @@ use tracing::warn;
|
|||||||
use utils::history_buffer::HistoryBufferWithDropCounter;
|
use utils::history_buffer::HistoryBufferWithDropCounter;
|
||||||
use utils::rate_limit::RateLimit;
|
use utils::rate_limit::RateLimit;
|
||||||
|
|
||||||
use utils::{id::TimelineId, lsn::Lsn};
|
use utils::{
|
||||||
|
id::{TenantId, TimelineId},
|
||||||
|
lsn::Lsn,
|
||||||
|
};
|
||||||
|
|
||||||
pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
|
pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
|
||||||
pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
|
pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
|
||||||
@@ -301,14 +304,12 @@ pub trait AsLayerDesc {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub mod tests {
|
pub mod tests {
|
||||||
use pageserver_api::shard::TenantShardId;
|
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
impl From<DeltaFileName> for PersistentLayerDesc {
|
impl From<DeltaFileName> for PersistentLayerDesc {
|
||||||
fn from(value: DeltaFileName) -> Self {
|
fn from(value: DeltaFileName) -> Self {
|
||||||
PersistentLayerDesc::new_delta(
|
PersistentLayerDesc::new_delta(
|
||||||
TenantShardId::from([0; 18]),
|
TenantId::from_array([0; 16]),
|
||||||
TimelineId::from_array([0; 16]),
|
TimelineId::from_array([0; 16]),
|
||||||
value.key_range,
|
value.key_range,
|
||||||
value.lsn_range,
|
value.lsn_range,
|
||||||
@@ -320,7 +321,7 @@ pub mod tests {
|
|||||||
impl From<ImageFileName> for PersistentLayerDesc {
|
impl From<ImageFileName> for PersistentLayerDesc {
|
||||||
fn from(value: ImageFileName) -> Self {
|
fn from(value: ImageFileName) -> Self {
|
||||||
PersistentLayerDesc::new_img(
|
PersistentLayerDesc::new_img(
|
||||||
TenantShardId::from([0; 18]),
|
TenantId::from_array([0; 16]),
|
||||||
TimelineId::from_array([0; 16]),
|
TimelineId::from_array([0; 16]),
|
||||||
value.key_range,
|
value.key_range,
|
||||||
value.lsn,
|
value.lsn,
|
||||||
|
|||||||
@@ -42,7 +42,6 @@ use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
|||||||
use anyhow::{bail, ensure, Context, Result};
|
use anyhow::{bail, ensure, Context, Result};
|
||||||
use camino::{Utf8Path, Utf8PathBuf};
|
use camino::{Utf8Path, Utf8PathBuf};
|
||||||
use pageserver_api::models::LayerAccessKind;
|
use pageserver_api::models::LayerAccessKind;
|
||||||
use pageserver_api::shard::TenantShardId;
|
|
||||||
use rand::{distributions::Alphanumeric, Rng};
|
use rand::{distributions::Alphanumeric, Rng};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
@@ -70,13 +69,13 @@ use super::{AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer};
|
|||||||
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
|
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
|
||||||
pub struct Summary {
|
pub struct Summary {
|
||||||
/// Magic value to identify this as a neon delta file. Always DELTA_FILE_MAGIC.
|
/// Magic value to identify this as a neon delta file. Always DELTA_FILE_MAGIC.
|
||||||
pub magic: u16,
|
magic: u16,
|
||||||
pub format_version: u16,
|
format_version: u16,
|
||||||
|
|
||||||
pub tenant_id: TenantId,
|
tenant_id: TenantId,
|
||||||
pub timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
pub key_range: Range<Key>,
|
key_range: Range<Key>,
|
||||||
pub lsn_range: Range<Lsn>,
|
lsn_range: Range<Lsn>,
|
||||||
|
|
||||||
/// Block number where the 'index' part of the file begins.
|
/// Block number where the 'index' part of the file begins.
|
||||||
pub index_start_blk: u32,
|
pub index_start_blk: u32,
|
||||||
@@ -87,7 +86,7 @@ pub struct Summary {
|
|||||||
impl From<&DeltaLayer> for Summary {
|
impl From<&DeltaLayer> for Summary {
|
||||||
fn from(layer: &DeltaLayer) -> Self {
|
fn from(layer: &DeltaLayer) -> Self {
|
||||||
Self::expected(
|
Self::expected(
|
||||||
layer.desc.tenant_shard_id.tenant_id,
|
layer.desc.tenant_id,
|
||||||
layer.desc.timeline_id,
|
layer.desc.timeline_id,
|
||||||
layer.desc.key_range.clone(),
|
layer.desc.key_range.clone(),
|
||||||
layer.desc.lsn_range.clone(),
|
layer.desc.lsn_range.clone(),
|
||||||
@@ -249,7 +248,7 @@ impl DeltaLayer {
|
|||||||
|
|
||||||
fn temp_path_for(
|
fn temp_path_for(
|
||||||
conf: &PageServerConf,
|
conf: &PageServerConf,
|
||||||
tenant_shard_id: &TenantShardId,
|
tenant_id: &TenantId,
|
||||||
timeline_id: &TimelineId,
|
timeline_id: &TimelineId,
|
||||||
key_start: Key,
|
key_start: Key,
|
||||||
lsn_range: &Range<Lsn>,
|
lsn_range: &Range<Lsn>,
|
||||||
@@ -260,15 +259,14 @@ impl DeltaLayer {
|
|||||||
.map(char::from)
|
.map(char::from)
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
conf.timeline_path(tenant_shard_id, timeline_id)
|
conf.timeline_path(tenant_id, timeline_id).join(format!(
|
||||||
.join(format!(
|
"{}-XXX__{:016X}-{:016X}.{}.{}",
|
||||||
"{}-XXX__{:016X}-{:016X}.{}.{}",
|
key_start,
|
||||||
key_start,
|
u64::from(lsn_range.start),
|
||||||
u64::from(lsn_range.start),
|
u64::from(lsn_range.end),
|
||||||
u64::from(lsn_range.end),
|
rand_string,
|
||||||
rand_string,
|
TEMP_FILE_SUFFIX,
|
||||||
TEMP_FILE_SUFFIX,
|
))
|
||||||
))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
///
|
///
|
||||||
@@ -291,9 +289,7 @@ impl DeltaLayer {
|
|||||||
async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
|
async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
|
||||||
let path = self.path();
|
let path = self.path();
|
||||||
|
|
||||||
let loaded = DeltaLayerInner::load(&path, None, ctx)
|
let loaded = DeltaLayerInner::load(&path, None, ctx).await?;
|
||||||
.await
|
|
||||||
.and_then(|res| res)?;
|
|
||||||
|
|
||||||
// not production code
|
// not production code
|
||||||
let actual_filename = path.file_name().unwrap().to_owned();
|
let actual_filename = path.file_name().unwrap().to_owned();
|
||||||
@@ -320,14 +316,10 @@ impl DeltaLayer {
|
|||||||
.metadata()
|
.metadata()
|
||||||
.context("get file metadata to determine size")?;
|
.context("get file metadata to determine size")?;
|
||||||
|
|
||||||
// TODO(sharding): we must get the TenantShardId from the path instead of reading the Summary.
|
|
||||||
// we should also validate the path against the Summary, as both should contain the same tenant, timeline, key, lsn.
|
|
||||||
let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id);
|
|
||||||
|
|
||||||
Ok(DeltaLayer {
|
Ok(DeltaLayer {
|
||||||
path: path.to_path_buf(),
|
path: path.to_path_buf(),
|
||||||
desc: PersistentLayerDesc::new_delta(
|
desc: PersistentLayerDesc::new_delta(
|
||||||
tenant_shard_id,
|
summary.tenant_id,
|
||||||
summary.timeline_id,
|
summary.timeline_id,
|
||||||
summary.key_range,
|
summary.key_range,
|
||||||
summary.lsn_range,
|
summary.lsn_range,
|
||||||
@@ -359,7 +351,7 @@ struct DeltaLayerWriterInner {
|
|||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
pub path: Utf8PathBuf,
|
pub path: Utf8PathBuf,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_id: TenantId,
|
||||||
|
|
||||||
key_start: Key,
|
key_start: Key,
|
||||||
lsn_range: Range<Lsn>,
|
lsn_range: Range<Lsn>,
|
||||||
@@ -376,7 +368,7 @@ impl DeltaLayerWriterInner {
|
|||||||
async fn new(
|
async fn new(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_id: TenantId,
|
||||||
key_start: Key,
|
key_start: Key,
|
||||||
lsn_range: Range<Lsn>,
|
lsn_range: Range<Lsn>,
|
||||||
) -> anyhow::Result<Self> {
|
) -> anyhow::Result<Self> {
|
||||||
@@ -386,8 +378,7 @@ impl DeltaLayerWriterInner {
|
|||||||
//
|
//
|
||||||
// Note: This overwrites any existing file. There shouldn't be any.
|
// Note: This overwrites any existing file. There shouldn't be any.
|
||||||
// FIXME: throw an error instead?
|
// FIXME: throw an error instead?
|
||||||
let path =
|
let path = DeltaLayer::temp_path_for(conf, &tenant_id, &timeline_id, key_start, &lsn_range);
|
||||||
DeltaLayer::temp_path_for(conf, &tenant_shard_id, &timeline_id, key_start, &lsn_range);
|
|
||||||
|
|
||||||
let mut file = VirtualFile::create(&path).await?;
|
let mut file = VirtualFile::create(&path).await?;
|
||||||
// make room for the header block
|
// make room for the header block
|
||||||
@@ -402,7 +393,7 @@ impl DeltaLayerWriterInner {
|
|||||||
conf,
|
conf,
|
||||||
path,
|
path,
|
||||||
timeline_id,
|
timeline_id,
|
||||||
tenant_shard_id,
|
tenant_id,
|
||||||
key_start,
|
key_start,
|
||||||
lsn_range,
|
lsn_range,
|
||||||
tree: tree_builder,
|
tree: tree_builder,
|
||||||
@@ -464,7 +455,7 @@ impl DeltaLayerWriterInner {
|
|||||||
let summary = Summary {
|
let summary = Summary {
|
||||||
magic: DELTA_FILE_MAGIC,
|
magic: DELTA_FILE_MAGIC,
|
||||||
format_version: STORAGE_FORMAT_VERSION,
|
format_version: STORAGE_FORMAT_VERSION,
|
||||||
tenant_id: self.tenant_shard_id.tenant_id,
|
tenant_id: self.tenant_id,
|
||||||
timeline_id: self.timeline_id,
|
timeline_id: self.timeline_id,
|
||||||
key_range: self.key_start..key_end,
|
key_range: self.key_start..key_end,
|
||||||
lsn_range: self.lsn_range.clone(),
|
lsn_range: self.lsn_range.clone(),
|
||||||
@@ -505,7 +496,7 @@ impl DeltaLayerWriterInner {
|
|||||||
// set inner.file here. The first read will have to re-open it.
|
// set inner.file here. The first read will have to re-open it.
|
||||||
|
|
||||||
let desc = PersistentLayerDesc::new_delta(
|
let desc = PersistentLayerDesc::new_delta(
|
||||||
self.tenant_shard_id,
|
self.tenant_id,
|
||||||
self.timeline_id,
|
self.timeline_id,
|
||||||
self.key_start..key_end,
|
self.key_start..key_end,
|
||||||
self.lsn_range.clone(),
|
self.lsn_range.clone(),
|
||||||
@@ -556,20 +547,14 @@ impl DeltaLayerWriter {
|
|||||||
pub async fn new(
|
pub async fn new(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_id: TenantId,
|
||||||
key_start: Key,
|
key_start: Key,
|
||||||
lsn_range: Range<Lsn>,
|
lsn_range: Range<Lsn>,
|
||||||
) -> anyhow::Result<Self> {
|
) -> anyhow::Result<Self> {
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
inner: Some(
|
inner: Some(
|
||||||
DeltaLayerWriterInner::new(
|
DeltaLayerWriterInner::new(conf, timeline_id, tenant_id, key_start, lsn_range)
|
||||||
conf,
|
.await?,
|
||||||
timeline_id,
|
|
||||||
tenant_shard_id,
|
|
||||||
key_start,
|
|
||||||
lsn_range,
|
|
||||||
)
|
|
||||||
.await?,
|
|
||||||
),
|
),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -624,84 +609,19 @@ impl Drop for DeltaLayerWriter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(thiserror::Error, Debug)]
|
|
||||||
pub enum RewriteSummaryError {
|
|
||||||
#[error("magic mismatch")]
|
|
||||||
MagicMismatch,
|
|
||||||
#[error(transparent)]
|
|
||||||
Other(#[from] anyhow::Error),
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<std::io::Error> for RewriteSummaryError {
|
|
||||||
fn from(e: std::io::Error) -> Self {
|
|
||||||
Self::Other(anyhow::anyhow!(e))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl DeltaLayer {
|
|
||||||
pub async fn rewrite_summary<F>(
|
|
||||||
path: &Utf8Path,
|
|
||||||
rewrite: F,
|
|
||||||
ctx: &RequestContext,
|
|
||||||
) -> Result<(), RewriteSummaryError>
|
|
||||||
where
|
|
||||||
F: Fn(Summary) -> Summary,
|
|
||||||
{
|
|
||||||
let file = VirtualFile::open_with_options(
|
|
||||||
path,
|
|
||||||
&*std::fs::OpenOptions::new().read(true).write(true),
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
.with_context(|| format!("Failed to open file '{}'", path))?;
|
|
||||||
let file = FileBlockReader::new(file);
|
|
||||||
let summary_blk = file.read_blk(0, ctx).await?;
|
|
||||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?;
|
|
||||||
let mut file = file.file;
|
|
||||||
if actual_summary.magic != DELTA_FILE_MAGIC {
|
|
||||||
return Err(RewriteSummaryError::MagicMismatch);
|
|
||||||
}
|
|
||||||
|
|
||||||
let new_summary = rewrite(actual_summary);
|
|
||||||
|
|
||||||
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
|
|
||||||
Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
|
|
||||||
if buf.spilled() {
|
|
||||||
// The code in DeltaLayerWriterInner just warn!()s for this.
|
|
||||||
// It should probably error out as well.
|
|
||||||
return Err(RewriteSummaryError::Other(anyhow::anyhow!(
|
|
||||||
"Used more than one page size for summary buffer: {}",
|
|
||||||
buf.len()
|
|
||||||
)));
|
|
||||||
}
|
|
||||||
file.seek(SeekFrom::Start(0)).await?;
|
|
||||||
file.write_all(&buf).await?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl DeltaLayerInner {
|
impl DeltaLayerInner {
|
||||||
/// Returns nested result following Result<Result<_, OpErr>, Critical>:
|
|
||||||
/// - inner has the success or transient failure
|
|
||||||
/// - outer has the permanent failure
|
|
||||||
pub(super) async fn load(
|
pub(super) async fn load(
|
||||||
path: &Utf8Path,
|
path: &Utf8Path,
|
||||||
summary: Option<Summary>,
|
summary: Option<Summary>,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
|
) -> anyhow::Result<Self> {
|
||||||
let file = match VirtualFile::open(path).await {
|
let file = VirtualFile::open(path)
|
||||||
Ok(file) => file,
|
.await
|
||||||
Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
|
.with_context(|| format!("Failed to open file '{path}'"))?;
|
||||||
};
|
|
||||||
let file = FileBlockReader::new(file);
|
let file = FileBlockReader::new(file);
|
||||||
|
|
||||||
let summary_blk = match file.read_blk(0, ctx).await {
|
let summary_blk = file.read_blk(0, ctx).await?;
|
||||||
Ok(blk) => blk,
|
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||||
Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
|
|
||||||
};
|
|
||||||
|
|
||||||
// TODO: this should be an assertion instead; see ImageLayerInner::load
|
|
||||||
let actual_summary =
|
|
||||||
Summary::des_prefix(summary_blk.as_ref()).context("deserialize first block")?;
|
|
||||||
|
|
||||||
if let Some(mut expected_summary) = summary {
|
if let Some(mut expected_summary) = summary {
|
||||||
// production code path
|
// production code path
|
||||||
@@ -716,11 +636,11 @@ impl DeltaLayerInner {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(Ok(DeltaLayerInner {
|
Ok(DeltaLayerInner {
|
||||||
file,
|
file,
|
||||||
index_start_blk: actual_summary.index_start_blk,
|
index_start_blk: actual_summary.index_start_blk,
|
||||||
index_root_blk: actual_summary.index_root_blk,
|
index_root_blk: actual_summary.index_root_blk,
|
||||||
}))
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(super) async fn get_value_reconstruct_data(
|
pub(super) async fn get_value_reconstruct_data(
|
||||||
|
|||||||
@@ -41,7 +41,6 @@ use bytes::Bytes;
|
|||||||
use camino::{Utf8Path, Utf8PathBuf};
|
use camino::{Utf8Path, Utf8PathBuf};
|
||||||
use hex;
|
use hex;
|
||||||
use pageserver_api::models::LayerAccessKind;
|
use pageserver_api::models::LayerAccessKind;
|
||||||
use pageserver_api::shard::TenantShardId;
|
|
||||||
use rand::{distributions::Alphanumeric, Rng};
|
use rand::{distributions::Alphanumeric, Rng};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
@@ -68,27 +67,27 @@ use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer};
|
|||||||
/// the 'index' starts at the block indicated by 'index_start_blk'
|
/// the 'index' starts at the block indicated by 'index_start_blk'
|
||||||
///
|
///
|
||||||
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
|
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
|
||||||
pub struct Summary {
|
pub(super) struct Summary {
|
||||||
/// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC.
|
/// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC.
|
||||||
pub magic: u16,
|
magic: u16,
|
||||||
pub format_version: u16,
|
format_version: u16,
|
||||||
|
|
||||||
pub tenant_id: TenantId,
|
tenant_id: TenantId,
|
||||||
pub timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
pub key_range: Range<Key>,
|
key_range: Range<Key>,
|
||||||
pub lsn: Lsn,
|
lsn: Lsn,
|
||||||
|
|
||||||
/// Block number where the 'index' part of the file begins.
|
/// Block number where the 'index' part of the file begins.
|
||||||
pub index_start_blk: u32,
|
index_start_blk: u32,
|
||||||
/// Block within the 'index', where the B-tree root page is stored
|
/// Block within the 'index', where the B-tree root page is stored
|
||||||
pub index_root_blk: u32,
|
index_root_blk: u32,
|
||||||
// the 'values' part starts after the summary header, on block 1.
|
// the 'values' part starts after the summary header, on block 1.
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<&ImageLayer> for Summary {
|
impl From<&ImageLayer> for Summary {
|
||||||
fn from(layer: &ImageLayer) -> Self {
|
fn from(layer: &ImageLayer) -> Self {
|
||||||
Self::expected(
|
Self::expected(
|
||||||
layer.desc.tenant_shard_id.tenant_id,
|
layer.desc.tenant_id,
|
||||||
layer.desc.timeline_id,
|
layer.desc.timeline_id,
|
||||||
layer.desc.key_range.clone(),
|
layer.desc.key_range.clone(),
|
||||||
layer.lsn,
|
layer.lsn,
|
||||||
@@ -218,7 +217,7 @@ impl ImageLayer {
|
|||||||
fn temp_path_for(
|
fn temp_path_for(
|
||||||
conf: &PageServerConf,
|
conf: &PageServerConf,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_id: TenantId,
|
||||||
fname: &ImageFileName,
|
fname: &ImageFileName,
|
||||||
) -> Utf8PathBuf {
|
) -> Utf8PathBuf {
|
||||||
let rand_string: String = rand::thread_rng()
|
let rand_string: String = rand::thread_rng()
|
||||||
@@ -227,7 +226,7 @@ impl ImageLayer {
|
|||||||
.map(char::from)
|
.map(char::from)
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
conf.timeline_path(&tenant_shard_id, &timeline_id)
|
conf.timeline_path(&tenant_id, &timeline_id)
|
||||||
.join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}"))
|
.join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}"))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -250,9 +249,7 @@ impl ImageLayer {
|
|||||||
async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
|
async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
|
||||||
let path = self.path();
|
let path = self.path();
|
||||||
|
|
||||||
let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx)
|
let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx).await?;
|
||||||
.await
|
|
||||||
.and_then(|res| res)?;
|
|
||||||
|
|
||||||
// not production code
|
// not production code
|
||||||
let actual_filename = path.file_name().unwrap().to_owned();
|
let actual_filename = path.file_name().unwrap().to_owned();
|
||||||
@@ -277,15 +274,10 @@ impl ImageLayer {
|
|||||||
let metadata = file
|
let metadata = file
|
||||||
.metadata()
|
.metadata()
|
||||||
.context("get file metadata to determine size")?;
|
.context("get file metadata to determine size")?;
|
||||||
|
|
||||||
// TODO(sharding): we should get TenantShardId from path.
|
|
||||||
// OR, not at all: any layer we load from disk should also get reconciled with remote IndexPart.
|
|
||||||
let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id);
|
|
||||||
|
|
||||||
Ok(ImageLayer {
|
Ok(ImageLayer {
|
||||||
path: path.to_path_buf(),
|
path: path.to_path_buf(),
|
||||||
desc: PersistentLayerDesc::new_img(
|
desc: PersistentLayerDesc::new_img(
|
||||||
tenant_shard_id,
|
summary.tenant_id,
|
||||||
summary.timeline_id,
|
summary.timeline_id,
|
||||||
summary.key_range,
|
summary.key_range,
|
||||||
summary.lsn,
|
summary.lsn,
|
||||||
@@ -302,87 +294,19 @@ impl ImageLayer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(thiserror::Error, Debug)]
|
|
||||||
pub enum RewriteSummaryError {
|
|
||||||
#[error("magic mismatch")]
|
|
||||||
MagicMismatch,
|
|
||||||
#[error(transparent)]
|
|
||||||
Other(#[from] anyhow::Error),
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<std::io::Error> for RewriteSummaryError {
|
|
||||||
fn from(e: std::io::Error) -> Self {
|
|
||||||
Self::Other(anyhow::anyhow!(e))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ImageLayer {
|
|
||||||
pub async fn rewrite_summary<F>(
|
|
||||||
path: &Utf8Path,
|
|
||||||
rewrite: F,
|
|
||||||
ctx: &RequestContext,
|
|
||||||
) -> Result<(), RewriteSummaryError>
|
|
||||||
where
|
|
||||||
F: Fn(Summary) -> Summary,
|
|
||||||
{
|
|
||||||
let file = VirtualFile::open_with_options(
|
|
||||||
path,
|
|
||||||
&*std::fs::OpenOptions::new().read(true).write(true),
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
.with_context(|| format!("Failed to open file '{}'", path))?;
|
|
||||||
let file = FileBlockReader::new(file);
|
|
||||||
let summary_blk = file.read_blk(0, ctx).await?;
|
|
||||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?;
|
|
||||||
let mut file = file.file;
|
|
||||||
if actual_summary.magic != IMAGE_FILE_MAGIC {
|
|
||||||
return Err(RewriteSummaryError::MagicMismatch);
|
|
||||||
}
|
|
||||||
|
|
||||||
let new_summary = rewrite(actual_summary);
|
|
||||||
|
|
||||||
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
|
|
||||||
Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
|
|
||||||
if buf.spilled() {
|
|
||||||
// The code in ImageLayerWriterInner just warn!()s for this.
|
|
||||||
// It should probably error out as well.
|
|
||||||
return Err(RewriteSummaryError::Other(anyhow::anyhow!(
|
|
||||||
"Used more than one page size for summary buffer: {}",
|
|
||||||
buf.len()
|
|
||||||
)));
|
|
||||||
}
|
|
||||||
file.seek(SeekFrom::Start(0)).await?;
|
|
||||||
file.write_all(&buf).await?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ImageLayerInner {
|
impl ImageLayerInner {
|
||||||
/// Returns nested result following Result<Result<_, OpErr>, Critical>:
|
|
||||||
/// - inner has the success or transient failure
|
|
||||||
/// - outer has the permanent failure
|
|
||||||
pub(super) async fn load(
|
pub(super) async fn load(
|
||||||
path: &Utf8Path,
|
path: &Utf8Path,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
summary: Option<Summary>,
|
summary: Option<Summary>,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
|
) -> anyhow::Result<Self> {
|
||||||
let file = match VirtualFile::open(path).await {
|
let file = VirtualFile::open(path)
|
||||||
Ok(file) => file,
|
.await
|
||||||
Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
|
.with_context(|| format!("Failed to open file '{}'", path))?;
|
||||||
};
|
|
||||||
let file = FileBlockReader::new(file);
|
let file = FileBlockReader::new(file);
|
||||||
let summary_blk = match file.read_blk(0, ctx).await {
|
let summary_blk = file.read_blk(0, ctx).await?;
|
||||||
Ok(blk) => blk,
|
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||||
Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
|
|
||||||
};
|
|
||||||
|
|
||||||
// length is the only way how this could fail, so it's not actually likely at all unless
|
|
||||||
// read_blk returns wrong sized block.
|
|
||||||
//
|
|
||||||
// TODO: confirm and make this into assertion
|
|
||||||
let actual_summary =
|
|
||||||
Summary::des_prefix(summary_blk.as_ref()).context("deserialize first block")?;
|
|
||||||
|
|
||||||
if let Some(mut expected_summary) = summary {
|
if let Some(mut expected_summary) = summary {
|
||||||
// production code path
|
// production code path
|
||||||
@@ -398,12 +322,12 @@ impl ImageLayerInner {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(Ok(ImageLayerInner {
|
Ok(ImageLayerInner {
|
||||||
index_start_blk: actual_summary.index_start_blk,
|
index_start_blk: actual_summary.index_start_blk,
|
||||||
index_root_blk: actual_summary.index_root_blk,
|
index_root_blk: actual_summary.index_root_blk,
|
||||||
lsn,
|
lsn,
|
||||||
file,
|
file,
|
||||||
}))
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(super) async fn get_value_reconstruct_data(
|
pub(super) async fn get_value_reconstruct_data(
|
||||||
@@ -461,7 +385,7 @@ struct ImageLayerWriterInner {
|
|||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
path: Utf8PathBuf,
|
path: Utf8PathBuf,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_id: TenantId,
|
||||||
key_range: Range<Key>,
|
key_range: Range<Key>,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
|
|
||||||
@@ -476,7 +400,7 @@ impl ImageLayerWriterInner {
|
|||||||
async fn new(
|
async fn new(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_id: TenantId,
|
||||||
key_range: &Range<Key>,
|
key_range: &Range<Key>,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
) -> anyhow::Result<Self> {
|
) -> anyhow::Result<Self> {
|
||||||
@@ -485,7 +409,7 @@ impl ImageLayerWriterInner {
|
|||||||
let path = ImageLayer::temp_path_for(
|
let path = ImageLayer::temp_path_for(
|
||||||
conf,
|
conf,
|
||||||
timeline_id,
|
timeline_id,
|
||||||
tenant_shard_id,
|
tenant_id,
|
||||||
&ImageFileName {
|
&ImageFileName {
|
||||||
key_range: key_range.clone(),
|
key_range: key_range.clone(),
|
||||||
lsn,
|
lsn,
|
||||||
@@ -509,7 +433,7 @@ impl ImageLayerWriterInner {
|
|||||||
conf,
|
conf,
|
||||||
path,
|
path,
|
||||||
timeline_id,
|
timeline_id,
|
||||||
tenant_shard_id,
|
tenant_id,
|
||||||
key_range: key_range.clone(),
|
key_range: key_range.clone(),
|
||||||
lsn,
|
lsn,
|
||||||
tree: tree_builder,
|
tree: tree_builder,
|
||||||
@@ -556,7 +480,7 @@ impl ImageLayerWriterInner {
|
|||||||
let summary = Summary {
|
let summary = Summary {
|
||||||
magic: IMAGE_FILE_MAGIC,
|
magic: IMAGE_FILE_MAGIC,
|
||||||
format_version: STORAGE_FORMAT_VERSION,
|
format_version: STORAGE_FORMAT_VERSION,
|
||||||
tenant_id: self.tenant_shard_id.tenant_id,
|
tenant_id: self.tenant_id,
|
||||||
timeline_id: self.timeline_id,
|
timeline_id: self.timeline_id,
|
||||||
key_range: self.key_range.clone(),
|
key_range: self.key_range.clone(),
|
||||||
lsn: self.lsn,
|
lsn: self.lsn,
|
||||||
@@ -582,7 +506,7 @@ impl ImageLayerWriterInner {
|
|||||||
.context("get metadata to determine file size")?;
|
.context("get metadata to determine file size")?;
|
||||||
|
|
||||||
let desc = PersistentLayerDesc::new_img(
|
let desc = PersistentLayerDesc::new_img(
|
||||||
self.tenant_shard_id,
|
self.tenant_id,
|
||||||
self.timeline_id,
|
self.timeline_id,
|
||||||
self.key_range.clone(),
|
self.key_range.clone(),
|
||||||
self.lsn,
|
self.lsn,
|
||||||
@@ -638,14 +562,13 @@ impl ImageLayerWriter {
|
|||||||
pub async fn new(
|
pub async fn new(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_id: TenantId,
|
||||||
key_range: &Range<Key>,
|
key_range: &Range<Key>,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
) -> anyhow::Result<ImageLayerWriter> {
|
) -> anyhow::Result<ImageLayerWriter> {
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
inner: Some(
|
inner: Some(
|
||||||
ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn)
|
ImageLayerWriterInner::new(conf, timeline_id, tenant_id, key_range, lsn).await?,
|
||||||
.await?,
|
|
||||||
),
|
),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -14,11 +14,15 @@ use crate::tenant::Timeline;
|
|||||||
use crate::walrecord;
|
use crate::walrecord;
|
||||||
use anyhow::{ensure, Result};
|
use anyhow::{ensure, Result};
|
||||||
use pageserver_api::models::InMemoryLayerInfo;
|
use pageserver_api::models::InMemoryLayerInfo;
|
||||||
use pageserver_api::shard::TenantShardId;
|
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::sync::{Arc, OnceLock};
|
use std::sync::{Arc, OnceLock};
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
|
use utils::{
|
||||||
|
bin_ser::BeSer,
|
||||||
|
id::{TenantId, TimelineId},
|
||||||
|
lsn::Lsn,
|
||||||
|
vec_map::VecMap,
|
||||||
|
};
|
||||||
// avoid binding to Write (conflicts with std::io::Write)
|
// avoid binding to Write (conflicts with std::io::Write)
|
||||||
// while being able to use std::fmt::Write's methods
|
// while being able to use std::fmt::Write's methods
|
||||||
use std::fmt::Write as _;
|
use std::fmt::Write as _;
|
||||||
@@ -29,7 +33,7 @@ use super::{DeltaLayerWriter, ResidentLayer};
|
|||||||
|
|
||||||
pub struct InMemoryLayer {
|
pub struct InMemoryLayer {
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_id: TenantId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
|
|
||||||
/// This layer contains all the changes from 'start_lsn'. The
|
/// This layer contains all the changes from 'start_lsn'. The
|
||||||
@@ -222,17 +226,17 @@ impl InMemoryLayer {
|
|||||||
pub async fn create(
|
pub async fn create(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_id: TenantId,
|
||||||
start_lsn: Lsn,
|
start_lsn: Lsn,
|
||||||
) -> Result<InMemoryLayer> {
|
) -> Result<InMemoryLayer> {
|
||||||
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
|
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
|
||||||
|
|
||||||
let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?;
|
let file = EphemeralFile::create(conf, tenant_id, timeline_id).await?;
|
||||||
|
|
||||||
Ok(InMemoryLayer {
|
Ok(InMemoryLayer {
|
||||||
conf,
|
conf,
|
||||||
timeline_id,
|
timeline_id,
|
||||||
tenant_shard_id,
|
tenant_id,
|
||||||
start_lsn,
|
start_lsn,
|
||||||
end_lsn: OnceLock::new(),
|
end_lsn: OnceLock::new(),
|
||||||
inner: RwLock::new(InMemoryLayerInner {
|
inner: RwLock::new(InMemoryLayerInner {
|
||||||
@@ -331,7 +335,7 @@ impl InMemoryLayer {
|
|||||||
let mut delta_layer_writer = DeltaLayerWriter::new(
|
let mut delta_layer_writer = DeltaLayerWriter::new(
|
||||||
self.conf,
|
self.conf,
|
||||||
self.timeline_id,
|
self.timeline_id,
|
||||||
self.tenant_shard_id,
|
self.tenant_id,
|
||||||
Key::MIN,
|
Key::MIN,
|
||||||
self.start_lsn..end_lsn,
|
self.start_lsn..end_lsn,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ use camino::{Utf8Path, Utf8PathBuf};
|
|||||||
use pageserver_api::models::{
|
use pageserver_api::models::{
|
||||||
HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
|
HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
|
||||||
};
|
};
|
||||||
use pageserver_api::shard::ShardIndex;
|
|
||||||
use std::ops::Range;
|
use std::ops::Range;
|
||||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||||
use std::sync::{Arc, Weak};
|
use std::sync::{Arc, Weak};
|
||||||
@@ -82,7 +81,7 @@ impl Layer {
|
|||||||
metadata: LayerFileMetadata,
|
metadata: LayerFileMetadata,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
let desc = PersistentLayerDesc::from_filename(
|
let desc = PersistentLayerDesc::from_filename(
|
||||||
timeline.tenant_shard_id,
|
timeline.tenant_id,
|
||||||
timeline.timeline_id,
|
timeline.timeline_id,
|
||||||
file_name,
|
file_name,
|
||||||
metadata.file_size(),
|
metadata.file_size(),
|
||||||
@@ -97,7 +96,6 @@ impl Layer {
|
|||||||
desc,
|
desc,
|
||||||
None,
|
None,
|
||||||
metadata.generation,
|
metadata.generation,
|
||||||
metadata.shard,
|
|
||||||
)));
|
)));
|
||||||
|
|
||||||
debug_assert!(owner.0.needs_download_blocking().unwrap().is_some());
|
debug_assert!(owner.0.needs_download_blocking().unwrap().is_some());
|
||||||
@@ -113,7 +111,7 @@ impl Layer {
|
|||||||
metadata: LayerFileMetadata,
|
metadata: LayerFileMetadata,
|
||||||
) -> ResidentLayer {
|
) -> ResidentLayer {
|
||||||
let desc = PersistentLayerDesc::from_filename(
|
let desc = PersistentLayerDesc::from_filename(
|
||||||
timeline.tenant_shard_id,
|
timeline.tenant_id,
|
||||||
timeline.timeline_id,
|
timeline.timeline_id,
|
||||||
file_name,
|
file_name,
|
||||||
metadata.file_size(),
|
metadata.file_size(),
|
||||||
@@ -138,7 +136,6 @@ impl Layer {
|
|||||||
desc,
|
desc,
|
||||||
Some(inner),
|
Some(inner),
|
||||||
metadata.generation,
|
metadata.generation,
|
||||||
metadata.shard,
|
|
||||||
)
|
)
|
||||||
}));
|
}));
|
||||||
|
|
||||||
@@ -182,7 +179,6 @@ impl Layer {
|
|||||||
desc,
|
desc,
|
||||||
Some(inner),
|
Some(inner),
|
||||||
timeline.generation,
|
timeline.generation,
|
||||||
timeline.get_shard_index(),
|
|
||||||
)
|
)
|
||||||
}));
|
}));
|
||||||
|
|
||||||
@@ -255,7 +251,6 @@ impl Layer {
|
|||||||
|
|
||||||
layer
|
layer
|
||||||
.get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
|
.get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
|
||||||
.instrument(tracing::info_span!("get_value_reconstruct_data", layer=%self))
|
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -326,24 +321,6 @@ impl Layer {
|
|||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Waits until this layer has been dropped (and if needed, local garbage collection and remote
|
|
||||||
/// deletion scheduling has completed).
|
|
||||||
///
|
|
||||||
/// Does not start garbage collection, use [`Self::garbage_collect_on_drop`] for that
|
|
||||||
/// separatedly.
|
|
||||||
#[cfg(feature = "testing")]
|
|
||||||
pub(crate) fn wait_drop(&self) -> impl std::future::Future<Output = ()> + 'static {
|
|
||||||
let mut rx = self.0.status.subscribe();
|
|
||||||
|
|
||||||
async move {
|
|
||||||
loop {
|
|
||||||
if let Err(tokio::sync::broadcast::error::RecvError::Closed) = rx.recv().await {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted.
|
/// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted.
|
||||||
@@ -448,15 +425,6 @@ struct LayerInner {
|
|||||||
/// For loaded layers (resident or evicted) this comes from [`LayerFileMetadata::generation`],
|
/// For loaded layers (resident or evicted) this comes from [`LayerFileMetadata::generation`],
|
||||||
/// for created layers from [`Timeline::generation`].
|
/// for created layers from [`Timeline::generation`].
|
||||||
generation: Generation,
|
generation: Generation,
|
||||||
|
|
||||||
/// The shard of this Layer.
|
|
||||||
///
|
|
||||||
/// For layers created in this process, this will always be the [`ShardIndex`] of the
|
|
||||||
/// current `ShardIdentity`` (TODO: add link once it's introduced).
|
|
||||||
///
|
|
||||||
/// For loaded layers, this may be some other value if the tenant has undergone
|
|
||||||
/// a shard split since the layer was originally written.
|
|
||||||
shard: ShardIndex,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl std::fmt::Display for LayerInner {
|
impl std::fmt::Display for LayerInner {
|
||||||
@@ -486,21 +454,17 @@ impl Drop for LayerInner {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
let span = tracing::info_span!(parent: None, "layer_gc", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id);
|
let span = tracing::info_span!(parent: None, "layer_gc", tenant_id = %self.layer_desc().tenant_id, timeline_id = %self.layer_desc().timeline_id);
|
||||||
|
|
||||||
let path = std::mem::take(&mut self.path);
|
let path = std::mem::take(&mut self.path);
|
||||||
let file_name = self.layer_desc().filename();
|
let file_name = self.layer_desc().filename();
|
||||||
|
let gen = self.generation;
|
||||||
let file_size = self.layer_desc().file_size;
|
let file_size = self.layer_desc().file_size;
|
||||||
let timeline = self.timeline.clone();
|
let timeline = self.timeline.clone();
|
||||||
let meta = self.metadata();
|
|
||||||
let status = self.status.clone();
|
|
||||||
|
|
||||||
crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
|
crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
|
||||||
let _g = span.entered();
|
let _g = span.entered();
|
||||||
|
|
||||||
// carry this until we are finished for [`Layer::wait_drop`] support
|
|
||||||
let _status = status;
|
|
||||||
|
|
||||||
let removed = match std::fs::remove_file(path) {
|
let removed = match std::fs::remove_file(path) {
|
||||||
Ok(()) => true,
|
Ok(()) => true,
|
||||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
|
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
|
||||||
@@ -524,7 +488,7 @@ impl Drop for LayerInner {
|
|||||||
timeline.metrics.resident_physical_size_sub(file_size);
|
timeline.metrics.resident_physical_size_sub(file_size);
|
||||||
}
|
}
|
||||||
if let Some(remote_client) = timeline.remote_client.as_ref() {
|
if let Some(remote_client) = timeline.remote_client.as_ref() {
|
||||||
let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
|
let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, gen)]);
|
||||||
|
|
||||||
if let Err(e) = res {
|
if let Err(e) = res {
|
||||||
// test_timeline_deletion_with_files_stuck_in_upload_queue is good at
|
// test_timeline_deletion_with_files_stuck_in_upload_queue is good at
|
||||||
@@ -558,10 +522,9 @@ impl LayerInner {
|
|||||||
desc: PersistentLayerDesc,
|
desc: PersistentLayerDesc,
|
||||||
downloaded: Option<Arc<DownloadedLayer>>,
|
downloaded: Option<Arc<DownloadedLayer>>,
|
||||||
generation: Generation,
|
generation: Generation,
|
||||||
shard: ShardIndex,
|
|
||||||
) -> Self {
|
) -> Self {
|
||||||
let path = conf
|
let path = conf
|
||||||
.timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id)
|
.timeline_path(&timeline.tenant_id, &timeline.timeline_id)
|
||||||
.join(desc.filename().to_string());
|
.join(desc.filename().to_string());
|
||||||
|
|
||||||
let (inner, version) = if let Some(inner) = downloaded {
|
let (inner, version) = if let Some(inner) = downloaded {
|
||||||
@@ -586,7 +549,6 @@ impl LayerInner {
|
|||||||
status: tokio::sync::broadcast::channel(1).0,
|
status: tokio::sync::broadcast::channel(1).0,
|
||||||
consecutive_failures: AtomicUsize::new(0),
|
consecutive_failures: AtomicUsize::new(0),
|
||||||
generation,
|
generation,
|
||||||
shard,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -832,7 +794,7 @@ impl LayerInner {
|
|||||||
crate::task_mgr::spawn(
|
crate::task_mgr::spawn(
|
||||||
&tokio::runtime::Handle::current(),
|
&tokio::runtime::Handle::current(),
|
||||||
crate::task_mgr::TaskKind::RemoteDownloadTask,
|
crate::task_mgr::TaskKind::RemoteDownloadTask,
|
||||||
Some(self.desc.tenant_shard_id.tenant_id),
|
Some(self.desc.tenant_id),
|
||||||
Some(self.desc.timeline_id),
|
Some(self.desc.timeline_id),
|
||||||
&task_name,
|
&task_name,
|
||||||
false,
|
false,
|
||||||
@@ -905,9 +867,6 @@ impl LayerInner {
|
|||||||
}
|
}
|
||||||
Ok((Err(e), _permit)) => {
|
Ok((Err(e), _permit)) => {
|
||||||
// FIXME: this should be with the spawned task and be cancellation sensitive
|
// FIXME: this should be with the spawned task and be cancellation sensitive
|
||||||
//
|
|
||||||
// while we should not need this, this backoff has turned out to be useful with
|
|
||||||
// a bug of unexpectedly deleted remote layer file (#5787).
|
|
||||||
let consecutive_failures =
|
let consecutive_failures =
|
||||||
self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
|
self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
|
||||||
tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
|
tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
|
||||||
@@ -997,7 +956,7 @@ impl LayerInner {
|
|||||||
if gc {
|
if gc {
|
||||||
// do nothing now, only in LayerInner::drop
|
// do nothing now, only in LayerInner::drop
|
||||||
} else if can_evict && evict {
|
} else if can_evict && evict {
|
||||||
let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version);
|
let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_id, timeline_id = %self.desc.timeline_id, layer=%self, %version);
|
||||||
|
|
||||||
// downgrade for queueing, in case there's a tear down already ongoing we should not
|
// downgrade for queueing, in case there's a tear down already ongoing we should not
|
||||||
// hold it alive.
|
// hold it alive.
|
||||||
@@ -1114,7 +1073,7 @@ impl LayerInner {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn metadata(&self) -> LayerFileMetadata {
|
fn metadata(&self) -> LayerFileMetadata {
|
||||||
LayerFileMetadata::new(self.desc.file_size, self.generation, self.shard)
|
LayerFileMetadata::new(self.desc.file_size, self.generation)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1229,50 +1188,39 @@ impl DownloadedLayer {
|
|||||||
|
|
||||||
let res = if owner.desc.is_delta {
|
let res = if owner.desc.is_delta {
|
||||||
let summary = Some(delta_layer::Summary::expected(
|
let summary = Some(delta_layer::Summary::expected(
|
||||||
owner.desc.tenant_shard_id.tenant_id,
|
owner.desc.tenant_id,
|
||||||
owner.desc.timeline_id,
|
owner.desc.timeline_id,
|
||||||
owner.desc.key_range.clone(),
|
owner.desc.key_range.clone(),
|
||||||
owner.desc.lsn_range.clone(),
|
owner.desc.lsn_range.clone(),
|
||||||
));
|
));
|
||||||
delta_layer::DeltaLayerInner::load(&owner.path, summary, ctx)
|
delta_layer::DeltaLayerInner::load(&owner.path, summary, ctx)
|
||||||
.await
|
.await
|
||||||
.map(|res| res.map(LayerKind::Delta))
|
.map(LayerKind::Delta)
|
||||||
} else {
|
} else {
|
||||||
let lsn = owner.desc.image_layer_lsn();
|
let lsn = owner.desc.image_layer_lsn();
|
||||||
let summary = Some(image_layer::Summary::expected(
|
let summary = Some(image_layer::Summary::expected(
|
||||||
owner.desc.tenant_shard_id.tenant_id,
|
owner.desc.tenant_id,
|
||||||
owner.desc.timeline_id,
|
owner.desc.timeline_id,
|
||||||
owner.desc.key_range.clone(),
|
owner.desc.key_range.clone(),
|
||||||
lsn,
|
lsn,
|
||||||
));
|
));
|
||||||
image_layer::ImageLayerInner::load(&owner.path, lsn, summary, ctx)
|
image_layer::ImageLayerInner::load(&owner.path, lsn, summary, ctx)
|
||||||
.await
|
.await
|
||||||
.map(|res| res.map(LayerKind::Image))
|
.map(LayerKind::Image)
|
||||||
};
|
|
||||||
|
|
||||||
match res {
|
|
||||||
Ok(Ok(layer)) => Ok(Ok(layer)),
|
|
||||||
Ok(Err(transient)) => Err(transient),
|
|
||||||
Err(permanent) => {
|
|
||||||
LAYER_IMPL_METRICS.inc_permanent_loading_failures();
|
|
||||||
// TODO(#5815): we are not logging all errors, so temporarily log them **once**
|
|
||||||
// here as well
|
|
||||||
let permanent = permanent.context("load layer");
|
|
||||||
tracing::error!("layer loading failed permanently: {permanent:#}");
|
|
||||||
Ok(Err(permanent))
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
// this will be a permanent failure
|
||||||
|
.context("load layer");
|
||||||
|
|
||||||
|
if res.is_err() {
|
||||||
|
LAYER_IMPL_METRICS.inc_permanent_loading_failures();
|
||||||
|
}
|
||||||
|
res
|
||||||
};
|
};
|
||||||
self.kind
|
self.kind.get_or_init(init).await.as_ref().map_err(|e| {
|
||||||
.get_or_try_init(init)
|
// errors are not clonabled, cannot but stringify
|
||||||
// return transient errors using `?`
|
// test_broken_timeline matches this string
|
||||||
.await?
|
anyhow::anyhow!("layer loading failed: {e:#}")
|
||||||
.as_ref()
|
})
|
||||||
.map_err(|e| {
|
|
||||||
// errors are not clonabled, cannot but stringify
|
|
||||||
// test_broken_timeline matches this string
|
|
||||||
anyhow::anyhow!("layer loading failed: {e:#}")
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn get_value_reconstruct_data(
|
async fn get_value_reconstruct_data(
|
||||||
@@ -1343,7 +1291,6 @@ impl ResidentLayer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Loads all keys stored in the layer. Returns key, lsn and value size.
|
/// Loads all keys stored in the layer. Returns key, lsn and value size.
|
||||||
#[tracing::instrument(skip_all, fields(layer=%self))]
|
|
||||||
pub(crate) async fn load_keys<'a>(
|
pub(crate) async fn load_keys<'a>(
|
||||||
&'a self,
|
&'a self,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
@@ -1438,7 +1385,6 @@ impl Default for LayerImplMetrics {
|
|||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
// reminder: this will be pageserver_layer_gcs_count_total with "_total" suffix
|
|
||||||
let gcs = metrics::register_int_counter_vec!(
|
let gcs = metrics::register_int_counter_vec!(
|
||||||
"pageserver_layer_gcs_count",
|
"pageserver_layer_gcs_count",
|
||||||
"Garbage collections started and completed in the Layer implementation",
|
"Garbage collections started and completed in the Layer implementation",
|
||||||
|
|||||||
@@ -1,7 +1,9 @@
|
|||||||
use core::fmt::Display;
|
use core::fmt::Display;
|
||||||
use pageserver_api::shard::TenantShardId;
|
|
||||||
use std::ops::Range;
|
use std::ops::Range;
|
||||||
use utils::{id::TimelineId, lsn::Lsn};
|
use utils::{
|
||||||
|
id::{TenantId, TimelineId},
|
||||||
|
lsn::Lsn,
|
||||||
|
};
|
||||||
|
|
||||||
use crate::repository::Key;
|
use crate::repository::Key;
|
||||||
|
|
||||||
@@ -9,15 +11,12 @@ use super::{DeltaFileName, ImageFileName, LayerFileName};
|
|||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
use utils::id::TenantId;
|
|
||||||
|
|
||||||
/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
|
/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
|
||||||
/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
|
/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
|
||||||
/// a unified way to generate layer information like file name.
|
/// a unified way to generate layer information like file name.
|
||||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
|
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
|
||||||
pub struct PersistentLayerDesc {
|
pub struct PersistentLayerDesc {
|
||||||
pub tenant_shard_id: TenantShardId,
|
pub tenant_id: TenantId,
|
||||||
pub timeline_id: TimelineId,
|
pub timeline_id: TimelineId,
|
||||||
/// Range of keys that this layer covers
|
/// Range of keys that this layer covers
|
||||||
pub key_range: Range<Key>,
|
pub key_range: Range<Key>,
|
||||||
@@ -57,7 +56,7 @@ impl PersistentLayerDesc {
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
pub fn new_test(key_range: Range<Key>) -> Self {
|
pub fn new_test(key_range: Range<Key>) -> Self {
|
||||||
Self {
|
Self {
|
||||||
tenant_shard_id: TenantShardId::unsharded(TenantId::generate()),
|
tenant_id: TenantId::generate(),
|
||||||
timeline_id: TimelineId::generate(),
|
timeline_id: TimelineId::generate(),
|
||||||
key_range,
|
key_range,
|
||||||
lsn_range: Lsn(0)..Lsn(1),
|
lsn_range: Lsn(0)..Lsn(1),
|
||||||
@@ -67,14 +66,14 @@ impl PersistentLayerDesc {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn new_img(
|
pub fn new_img(
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_id: TenantId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
key_range: Range<Key>,
|
key_range: Range<Key>,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
file_size: u64,
|
file_size: u64,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
Self {
|
Self {
|
||||||
tenant_shard_id,
|
tenant_id,
|
||||||
timeline_id,
|
timeline_id,
|
||||||
key_range,
|
key_range,
|
||||||
lsn_range: Self::image_layer_lsn_range(lsn),
|
lsn_range: Self::image_layer_lsn_range(lsn),
|
||||||
@@ -84,14 +83,14 @@ impl PersistentLayerDesc {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn new_delta(
|
pub fn new_delta(
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_id: TenantId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
key_range: Range<Key>,
|
key_range: Range<Key>,
|
||||||
lsn_range: Range<Lsn>,
|
lsn_range: Range<Lsn>,
|
||||||
file_size: u64,
|
file_size: u64,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
Self {
|
Self {
|
||||||
tenant_shard_id,
|
tenant_id,
|
||||||
timeline_id,
|
timeline_id,
|
||||||
key_range,
|
key_range,
|
||||||
lsn_range,
|
lsn_range,
|
||||||
@@ -101,22 +100,18 @@ impl PersistentLayerDesc {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn from_filename(
|
pub fn from_filename(
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_id: TenantId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
filename: LayerFileName,
|
filename: LayerFileName,
|
||||||
file_size: u64,
|
file_size: u64,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
match filename {
|
match filename {
|
||||||
LayerFileName::Image(i) => {
|
LayerFileName::Image(i) => {
|
||||||
Self::new_img(tenant_shard_id, timeline_id, i.key_range, i.lsn, file_size)
|
Self::new_img(tenant_id, timeline_id, i.key_range, i.lsn, file_size)
|
||||||
|
}
|
||||||
|
LayerFileName::Delta(d) => {
|
||||||
|
Self::new_delta(tenant_id, timeline_id, d.key_range, d.lsn_range, file_size)
|
||||||
}
|
}
|
||||||
LayerFileName::Delta(d) => Self::new_delta(
|
|
||||||
tenant_shard_id,
|
|
||||||
timeline_id,
|
|
||||||
d.key_range,
|
|
||||||
d.lsn_range,
|
|
||||||
file_size,
|
|
||||||
),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -177,6 +172,10 @@ impl PersistentLayerDesc {
|
|||||||
self.timeline_id
|
self.timeline_id
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn get_tenant_id(&self) -> TenantId {
|
||||||
|
self.tenant_id
|
||||||
|
}
|
||||||
|
|
||||||
/// Does this layer only contain some data for the key-range (incremental),
|
/// Does this layer only contain some data for the key-range (incremental),
|
||||||
/// or does it contain a version of every page? This is important to know
|
/// or does it contain a version of every page? This is important to know
|
||||||
/// for garbage collecting old layers: an incremental layer depends on
|
/// for garbage collecting old layers: an incremental layer depends on
|
||||||
@@ -193,7 +192,7 @@ impl PersistentLayerDesc {
|
|||||||
if self.is_delta {
|
if self.is_delta {
|
||||||
println!(
|
println!(
|
||||||
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} is_incremental {} size {} ----",
|
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} is_incremental {} size {} ----",
|
||||||
self.tenant_shard_id,
|
self.tenant_id,
|
||||||
self.timeline_id,
|
self.timeline_id,
|
||||||
self.key_range.start,
|
self.key_range.start,
|
||||||
self.key_range.end,
|
self.key_range.end,
|
||||||
@@ -205,7 +204,7 @@ impl PersistentLayerDesc {
|
|||||||
} else {
|
} else {
|
||||||
println!(
|
println!(
|
||||||
"----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
|
"----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
|
||||||
self.tenant_shard_id,
|
self.tenant_id,
|
||||||
self.timeline_id,
|
self.timeline_id,
|
||||||
self.key_range.start,
|
self.key_range.start,
|
||||||
self.key_range.end,
|
self.key_range.end,
|
||||||
|
|||||||
@@ -86,7 +86,7 @@ pub fn start_background_loops(
|
|||||||
tenant: &Arc<Tenant>,
|
tenant: &Arc<Tenant>,
|
||||||
background_jobs_can_start: Option<&completion::Barrier>,
|
background_jobs_can_start: Option<&completion::Barrier>,
|
||||||
) {
|
) {
|
||||||
let tenant_id = tenant.tenant_shard_id.tenant_id;
|
let tenant_id = tenant.tenant_id;
|
||||||
task_mgr::spawn(
|
task_mgr::spawn(
|
||||||
BACKGROUND_RUNTIME.handle(),
|
BACKGROUND_RUNTIME.handle(),
|
||||||
TaskKind::Compaction,
|
TaskKind::Compaction,
|
||||||
@@ -180,16 +180,16 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
|||||||
// Run compaction
|
// Run compaction
|
||||||
if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await {
|
if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await {
|
||||||
let wait_duration = backoff::exponential_backoff_duration_seconds(
|
let wait_duration = backoff::exponential_backoff_duration_seconds(
|
||||||
error_run_count + 1,
|
error_run_count,
|
||||||
1.0,
|
1.0,
|
||||||
MAX_BACKOFF_SECS,
|
MAX_BACKOFF_SECS,
|
||||||
);
|
);
|
||||||
error_run_count += 1;
|
error_run_count += 1;
|
||||||
let wait_duration = Duration::from_secs_f64(wait_duration);
|
|
||||||
error!(
|
error!(
|
||||||
"Compaction failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
|
"Compaction failed {error_run_count} times, retrying in {:?}: {e:?}",
|
||||||
|
wait_duration
|
||||||
);
|
);
|
||||||
wait_duration
|
Duration::from_secs_f64(wait_duration)
|
||||||
} else {
|
} else {
|
||||||
error_run_count = 0;
|
error_run_count = 0;
|
||||||
period
|
period
|
||||||
@@ -198,10 +198,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
|||||||
|
|
||||||
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);
|
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);
|
||||||
|
|
||||||
// Perhaps we did no work and the walredo process has been idle for some time:
|
|
||||||
// give it a chance to shut down to avoid leaving walredo process running indefinitely.
|
|
||||||
tenant.walredo_mgr.maybe_quiesce(period * 10);
|
|
||||||
|
|
||||||
// Sleep
|
// Sleep
|
||||||
if tokio::time::timeout(sleep_duration, cancel.cancelled())
|
if tokio::time::timeout(sleep_duration, cancel.cancelled())
|
||||||
.await
|
.await
|
||||||
@@ -261,20 +257,20 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
|||||||
} else {
|
} else {
|
||||||
// Run gc
|
// Run gc
|
||||||
let res = tenant
|
let res = tenant
|
||||||
.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx)
|
.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx)
|
||||||
.await;
|
.await;
|
||||||
if let Err(e) = res {
|
if let Err(e) = res {
|
||||||
let wait_duration = backoff::exponential_backoff_duration_seconds(
|
let wait_duration = backoff::exponential_backoff_duration_seconds(
|
||||||
error_run_count + 1,
|
error_run_count,
|
||||||
1.0,
|
1.0,
|
||||||
MAX_BACKOFF_SECS,
|
MAX_BACKOFF_SECS,
|
||||||
);
|
);
|
||||||
error_run_count += 1;
|
error_run_count += 1;
|
||||||
let wait_duration = Duration::from_secs_f64(wait_duration);
|
|
||||||
error!(
|
error!(
|
||||||
"Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
|
"Gc failed {error_run_count} times, retrying in {:?}: {e:?}",
|
||||||
|
wait_duration
|
||||||
);
|
);
|
||||||
wait_duration
|
Duration::from_secs_f64(wait_duration)
|
||||||
} else {
|
} else {
|
||||||
error_run_count = 0;
|
error_run_count = 0;
|
||||||
period
|
period
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ pub mod delete;
|
|||||||
mod eviction_task;
|
mod eviction_task;
|
||||||
mod init;
|
mod init;
|
||||||
pub mod layer_manager;
|
pub mod layer_manager;
|
||||||
pub(crate) mod logical_size;
|
mod logical_size;
|
||||||
pub mod span;
|
pub mod span;
|
||||||
pub mod uninit;
|
pub mod uninit;
|
||||||
mod walreceiver;
|
mod walreceiver;
|
||||||
@@ -10,15 +10,10 @@ mod walreceiver;
|
|||||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
use camino::{Utf8Path, Utf8PathBuf};
|
use camino::{Utf8Path, Utf8PathBuf};
|
||||||
use enumset::EnumSet;
|
|
||||||
use fail::fail_point;
|
use fail::fail_point;
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use pageserver_api::{
|
use pageserver_api::models::{
|
||||||
models::{
|
DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, LayerMapInfo, TimelineState,
|
||||||
DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, LayerMapInfo,
|
|
||||||
TimelineState,
|
|
||||||
},
|
|
||||||
shard::TenantShardId,
|
|
||||||
};
|
};
|
||||||
use serde_with::serde_as;
|
use serde_with::serde_as;
|
||||||
use storage_broker::BrokerClientChannel;
|
use storage_broker::BrokerClientChannel;
|
||||||
@@ -66,7 +61,6 @@ use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
|
|||||||
use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError};
|
use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError};
|
||||||
use crate::tenant::config::{EvictionPolicy, TenantConfOpt};
|
use crate::tenant::config::{EvictionPolicy, TenantConfOpt};
|
||||||
use pageserver_api::reltag::RelTag;
|
use pageserver_api::reltag::RelTag;
|
||||||
use pageserver_api::shard::ShardIndex;
|
|
||||||
|
|
||||||
use postgres_connection::PgConnectionConfig;
|
use postgres_connection::PgConnectionConfig;
|
||||||
use postgres_ffi::to_pg_timestamp;
|
use postgres_ffi::to_pg_timestamp;
|
||||||
@@ -101,12 +95,7 @@ use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenant
|
|||||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||||
pub(super) enum FlushLoopState {
|
pub(super) enum FlushLoopState {
|
||||||
NotStarted,
|
NotStarted,
|
||||||
Running {
|
Running,
|
||||||
#[cfg(test)]
|
|
||||||
expect_initdb_optimization: bool,
|
|
||||||
#[cfg(test)]
|
|
||||||
initdb_optimization_count: usize,
|
|
||||||
},
|
|
||||||
Exited,
|
Exited,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -153,7 +142,7 @@ pub struct Timeline {
|
|||||||
|
|
||||||
myself: Weak<Self>,
|
myself: Weak<Self>,
|
||||||
|
|
||||||
pub(crate) tenant_shard_id: TenantShardId,
|
pub tenant_id: TenantId,
|
||||||
pub timeline_id: TimelineId,
|
pub timeline_id: TimelineId,
|
||||||
|
|
||||||
/// The generation of the tenant that instantiated us: this is used for safety when writing remote objects.
|
/// The generation of the tenant that instantiated us: this is used for safety when writing remote objects.
|
||||||
@@ -255,6 +244,14 @@ pub struct Timeline {
|
|||||||
/// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
|
/// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
|
||||||
layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>,
|
layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>,
|
||||||
|
|
||||||
|
/// Layer removal lock.
|
||||||
|
/// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
|
||||||
|
/// This lock is acquired in [`Timeline::gc`] and [`Timeline::compact`].
|
||||||
|
/// This is an `Arc<Mutex>` lock because we need an owned
|
||||||
|
/// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`).
|
||||||
|
/// Note that [`DeleteTimelineFlow`] uses `delete_progress` field.
|
||||||
|
pub(super) layer_removal_cs: Arc<tokio::sync::Mutex<()>>,
|
||||||
|
|
||||||
// Needed to ensure that we can't create a branch at a point that was already garbage collected
|
// Needed to ensure that we can't create a branch at a point that was already garbage collected
|
||||||
pub latest_gc_cutoff_lsn: Rcu<Lsn>,
|
pub latest_gc_cutoff_lsn: Rcu<Lsn>,
|
||||||
|
|
||||||
@@ -315,24 +312,6 @@ pub struct Timeline {
|
|||||||
/// Cancellation token scoped to this timeline: anything doing long-running work relating
|
/// Cancellation token scoped to this timeline: anything doing long-running work relating
|
||||||
/// to the timeline should drop out when this token fires.
|
/// to the timeline should drop out when this token fires.
|
||||||
pub(crate) cancel: CancellationToken,
|
pub(crate) cancel: CancellationToken,
|
||||||
|
|
||||||
/// Make sure we only have one running compaction at a time in tests.
|
|
||||||
///
|
|
||||||
/// Must only be taken in two places:
|
|
||||||
/// - [`Timeline::compact`] (this file)
|
|
||||||
/// - [`delete::delete_local_layer_files`]
|
|
||||||
///
|
|
||||||
/// Timeline deletion will acquire both compaction and gc locks in whatever order.
|
|
||||||
compaction_lock: tokio::sync::Mutex<()>,
|
|
||||||
|
|
||||||
/// Make sure we only have one running gc at a time.
|
|
||||||
///
|
|
||||||
/// Must only be taken in two places:
|
|
||||||
/// - [`Timeline::gc`] (this file)
|
|
||||||
/// - [`delete::delete_local_layer_files`]
|
|
||||||
///
|
|
||||||
/// Timeline deletion will acquire both compaction and gc locks in whatever order.
|
|
||||||
gc_lock: tokio::sync::Mutex<()>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct WalReceiverInfo {
|
pub struct WalReceiverInfo {
|
||||||
@@ -453,11 +432,6 @@ pub enum LogicalSizeCalculationCause {
|
|||||||
TenantSizeHandler,
|
TenantSizeHandler,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(enumset::EnumSetType)]
|
|
||||||
pub(crate) enum CompactFlags {
|
|
||||||
ForceRepartition,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Public interface functions
|
/// Public interface functions
|
||||||
impl Timeline {
|
impl Timeline {
|
||||||
/// Get the LSN where this branch was created
|
/// Get the LSN where this branch was created
|
||||||
@@ -705,7 +679,7 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Flush to disk all data that was written with the put_* functions
|
/// Flush to disk all data that was written with the put_* functions
|
||||||
#[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
|
#[instrument(skip(self), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id))]
|
||||||
pub async fn freeze_and_flush(&self) -> anyhow::Result<()> {
|
pub async fn freeze_and_flush(&self) -> anyhow::Result<()> {
|
||||||
self.freeze_inmem_layer(false).await;
|
self.freeze_inmem_layer(false).await;
|
||||||
self.flush_frozen_layers_and_wait().await
|
self.flush_frozen_layers_and_wait().await
|
||||||
@@ -715,11 +689,8 @@ impl Timeline {
|
|||||||
pub(crate) async fn compact(
|
pub(crate) async fn compact(
|
||||||
self: &Arc<Self>,
|
self: &Arc<Self>,
|
||||||
cancel: &CancellationToken,
|
cancel: &CancellationToken,
|
||||||
flags: EnumSet<CompactFlags>,
|
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<(), CompactionError> {
|
) -> Result<(), CompactionError> {
|
||||||
let _g = self.compaction_lock.lock().await;
|
|
||||||
|
|
||||||
// this wait probably never needs any "long time spent" logging, because we already nag if
|
// this wait probably never needs any "long time spent" logging, because we already nag if
|
||||||
// compaction task goes over it's period (20s) which is quite often in production.
|
// compaction task goes over it's period (20s) which is quite often in production.
|
||||||
let _permit = match super::tasks::concurrent_background_tasks_rate_limit(
|
let _permit = match super::tasks::concurrent_background_tasks_rate_limit(
|
||||||
@@ -774,7 +745,7 @@ impl Timeline {
|
|||||||
// Below are functions compact_level0() and create_image_layers()
|
// Below are functions compact_level0() and create_image_layers()
|
||||||
// but they are a bit ad hoc and don't quite work like it's explained
|
// but they are a bit ad hoc and don't quite work like it's explained
|
||||||
// above. Rewrite it.
|
// above. Rewrite it.
|
||||||
|
let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
|
||||||
// Is the timeline being deleted?
|
// Is the timeline being deleted?
|
||||||
if self.is_stopping() {
|
if self.is_stopping() {
|
||||||
trace!("Dropping out of compaction on timeline shutdown");
|
trace!("Dropping out of compaction on timeline shutdown");
|
||||||
@@ -790,7 +761,6 @@ impl Timeline {
|
|||||||
.repartition(
|
.repartition(
|
||||||
self.get_last_record_lsn(),
|
self.get_last_record_lsn(),
|
||||||
self.get_compaction_target_size(),
|
self.get_compaction_target_size(),
|
||||||
flags,
|
|
||||||
ctx,
|
ctx,
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
@@ -815,7 +785,8 @@ impl Timeline {
|
|||||||
|
|
||||||
// 3. Compact
|
// 3. Compact
|
||||||
let timer = self.metrics.compact_time_histo.start_timer();
|
let timer = self.metrics.compact_time_histo.start_timer();
|
||||||
self.compact_level0(target_file_size, ctx).await?;
|
self.compact_level0(layer_removal_cs.clone(), target_file_size, ctx)
|
||||||
|
.await?;
|
||||||
timer.stop_and_record();
|
timer.stop_and_record();
|
||||||
|
|
||||||
if let Some(remote_client) = &self.remote_client {
|
if let Some(remote_client) = &self.remote_client {
|
||||||
@@ -855,38 +826,23 @@ impl Timeline {
|
|||||||
/// the initial size calculation has not been run (gets triggered on the first size access).
|
/// the initial size calculation has not been run (gets triggered on the first size access).
|
||||||
///
|
///
|
||||||
/// return size and boolean flag that shows if the size is exact
|
/// return size and boolean flag that shows if the size is exact
|
||||||
pub(crate) fn get_current_logical_size(
|
pub fn get_current_logical_size(
|
||||||
self: &Arc<Self>,
|
self: &Arc<Self>,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> logical_size::CurrentLogicalSize {
|
) -> anyhow::Result<(u64, bool)> {
|
||||||
let current_size = self.current_logical_size.current_size();
|
let current_size = self.current_logical_size.current_size()?;
|
||||||
debug!("Current size: {current_size:?}");
|
debug!("Current size: {current_size:?}");
|
||||||
|
|
||||||
|
let mut is_exact = true;
|
||||||
|
let size = current_size.size();
|
||||||
if let (CurrentLogicalSize::Approximate(_), Some(initial_part_end)) =
|
if let (CurrentLogicalSize::Approximate(_), Some(initial_part_end)) =
|
||||||
(current_size, self.current_logical_size.initial_part_end)
|
(current_size, self.current_logical_size.initial_part_end)
|
||||||
{
|
{
|
||||||
|
is_exact = false;
|
||||||
self.try_spawn_size_init_task(initial_part_end, ctx);
|
self.try_spawn_size_init_task(initial_part_end, ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
if let CurrentLogicalSize::Approximate(_) = ¤t_size {
|
Ok((size, is_exact))
|
||||||
if ctx.task_kind() == TaskKind::WalReceiverConnectionHandler {
|
|
||||||
let first = self
|
|
||||||
.current_logical_size
|
|
||||||
.did_return_approximate_to_walreceiver
|
|
||||||
.compare_exchange(
|
|
||||||
false,
|
|
||||||
true,
|
|
||||||
AtomicOrdering::Relaxed,
|
|
||||||
AtomicOrdering::Relaxed,
|
|
||||||
)
|
|
||||||
.is_ok();
|
|
||||||
if first {
|
|
||||||
crate::metrics::initial_logical_size::TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE.inc();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
current_size
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Check if more than 'checkpoint_distance' of WAL has been accumulated in
|
/// Check if more than 'checkpoint_distance' of WAL has been accumulated in
|
||||||
@@ -956,7 +912,7 @@ impl Timeline {
|
|||||||
tracing::debug!("Waiting for WalReceiverManager...");
|
tracing::debug!("Waiting for WalReceiverManager...");
|
||||||
task_mgr::shutdown_tasks(
|
task_mgr::shutdown_tasks(
|
||||||
Some(TaskKind::WalReceiverManager),
|
Some(TaskKind::WalReceiverManager),
|
||||||
Some(self.tenant_shard_id.tenant_id),
|
Some(self.tenant_id),
|
||||||
Some(self.timeline_id),
|
Some(self.timeline_id),
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
@@ -976,7 +932,7 @@ impl Timeline {
|
|||||||
// what is problematic is the shutting down of RemoteTimelineClient, because
|
// what is problematic is the shutting down of RemoteTimelineClient, because
|
||||||
// obviously it does not make sense to stop while we wait for it, but what
|
// obviously it does not make sense to stop while we wait for it, but what
|
||||||
// about corner cases like s3 suddenly hanging up?
|
// about corner cases like s3 suddenly hanging up?
|
||||||
if let Err(e) = client.shutdown().await {
|
if let Err(e) = client.wait_completion().await {
|
||||||
// Non-fatal. Shutdown is infallible. Failures to flush just mean that
|
// Non-fatal. Shutdown is infallible. Failures to flush just mean that
|
||||||
// we have some extra WAL replay to do next time the timeline starts.
|
// we have some extra WAL replay to do next time the timeline starts.
|
||||||
warn!("failed to flush to remote storage: {e:#}");
|
warn!("failed to flush to remote storage: {e:#}");
|
||||||
@@ -1007,7 +963,7 @@ impl Timeline {
|
|||||||
// Shut down the layer flush task before the remote client, as one depends on the other
|
// Shut down the layer flush task before the remote client, as one depends on the other
|
||||||
task_mgr::shutdown_tasks(
|
task_mgr::shutdown_tasks(
|
||||||
Some(TaskKind::LayerFlushTask),
|
Some(TaskKind::LayerFlushTask),
|
||||||
Some(self.tenant_shard_id.tenant_id),
|
Some(self.tenant_id),
|
||||||
Some(self.timeline_id),
|
Some(self.timeline_id),
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
@@ -1025,12 +981,7 @@ impl Timeline {
|
|||||||
|
|
||||||
tracing::debug!("Waiting for tasks...");
|
tracing::debug!("Waiting for tasks...");
|
||||||
|
|
||||||
task_mgr::shutdown_tasks(
|
task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(self.timeline_id)).await;
|
||||||
None,
|
|
||||||
Some(self.tenant_shard_id.tenant_id),
|
|
||||||
Some(self.timeline_id),
|
|
||||||
)
|
|
||||||
.await;
|
|
||||||
|
|
||||||
// Finally wait until any gate-holders are complete
|
// Finally wait until any gate-holders are complete
|
||||||
self.gate.close().await;
|
self.gate.close().await;
|
||||||
@@ -1149,7 +1100,7 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
|
#[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
|
||||||
pub async fn download_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
|
pub async fn download_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
|
||||||
let Some(layer) = self.find_layer(layer_file_name).await else {
|
let Some(layer) = self.find_layer(layer_file_name).await else {
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
@@ -1236,6 +1187,16 @@ impl Timeline {
|
|||||||
remote_client: &Arc<RemoteTimelineClient>,
|
remote_client: &Arc<RemoteTimelineClient>,
|
||||||
layers_to_evict: &[Layer],
|
layers_to_evict: &[Layer],
|
||||||
) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
|
) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
|
||||||
|
// ensure that the layers have finished uploading
|
||||||
|
// (don't hold the layer_removal_cs while we do it, we're not removing anything yet)
|
||||||
|
remote_client
|
||||||
|
.wait_completion()
|
||||||
|
.await
|
||||||
|
.context("wait for layer upload ops to complete")?;
|
||||||
|
|
||||||
|
// now lock out layer removal (compaction, gc, timeline deletion)
|
||||||
|
let _layer_removal_guard = self.layer_removal_cs.lock().await;
|
||||||
|
|
||||||
{
|
{
|
||||||
// to avoid racing with detach and delete_timeline
|
// to avoid racing with detach and delete_timeline
|
||||||
let state = self.current_state();
|
let state = self.current_state();
|
||||||
@@ -1354,11 +1315,7 @@ impl Timeline {
|
|||||||
&self.tenant_conf.read().unwrap().tenant_conf,
|
&self.tenant_conf.read().unwrap().tenant_conf,
|
||||||
&self.conf.default_tenant_conf,
|
&self.conf.default_tenant_conf,
|
||||||
);
|
);
|
||||||
|
let tenant_id_str = self.tenant_id.to_string();
|
||||||
// TODO(sharding): make evictions state shard aware
|
|
||||||
// (https://github.com/neondatabase/neon/issues/5953)
|
|
||||||
let tenant_id_str = self.tenant_shard_id.tenant_id.to_string();
|
|
||||||
|
|
||||||
let timeline_id_str = self.timeline_id.to_string();
|
let timeline_id_str = self.timeline_id.to_string();
|
||||||
self.metrics
|
self.metrics
|
||||||
.evictions_with_low_residence_duration
|
.evictions_with_low_residence_duration
|
||||||
@@ -1378,7 +1335,7 @@ impl Timeline {
|
|||||||
metadata: &TimelineMetadata,
|
metadata: &TimelineMetadata,
|
||||||
ancestor: Option<Arc<Timeline>>,
|
ancestor: Option<Arc<Timeline>>,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_id: TenantId,
|
||||||
generation: Generation,
|
generation: Generation,
|
||||||
walredo_mgr: Arc<super::WalRedoManager>,
|
walredo_mgr: Arc<super::WalRedoManager>,
|
||||||
resources: TimelineResources,
|
resources: TimelineResources,
|
||||||
@@ -1409,7 +1366,7 @@ impl Timeline {
|
|||||||
tenant_conf,
|
tenant_conf,
|
||||||
myself: myself.clone(),
|
myself: myself.clone(),
|
||||||
timeline_id,
|
timeline_id,
|
||||||
tenant_shard_id,
|
tenant_id,
|
||||||
generation,
|
generation,
|
||||||
pg_version,
|
pg_version,
|
||||||
layers: Arc::new(tokio::sync::RwLock::new(LayerManager::create())),
|
layers: Arc::new(tokio::sync::RwLock::new(LayerManager::create())),
|
||||||
@@ -1436,7 +1393,7 @@ impl Timeline {
|
|||||||
ancestor_lsn: metadata.ancestor_lsn(),
|
ancestor_lsn: metadata.ancestor_lsn(),
|
||||||
|
|
||||||
metrics: TimelineMetrics::new(
|
metrics: TimelineMetrics::new(
|
||||||
&tenant_shard_id.tenant_id,
|
&tenant_id,
|
||||||
&timeline_id,
|
&timeline_id,
|
||||||
crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
|
crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
|
||||||
"mtime",
|
"mtime",
|
||||||
@@ -1450,6 +1407,7 @@ impl Timeline {
|
|||||||
layer_flush_done_tx,
|
layer_flush_done_tx,
|
||||||
|
|
||||||
write_lock: tokio::sync::Mutex::new(()),
|
write_lock: tokio::sync::Mutex::new(()),
|
||||||
|
layer_removal_cs: Default::default(),
|
||||||
|
|
||||||
gc_info: std::sync::RwLock::new(GcInfo {
|
gc_info: std::sync::RwLock::new(GcInfo {
|
||||||
retain_lsns: Vec::new(),
|
retain_lsns: Vec::new(),
|
||||||
@@ -1487,10 +1445,7 @@ impl Timeline {
|
|||||||
initial_logical_size_can_start,
|
initial_logical_size_can_start,
|
||||||
initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
|
initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
|
||||||
cancel,
|
cancel,
|
||||||
gate: Gate::new(format!("Timeline<{tenant_shard_id}/{timeline_id}>")),
|
gate: Gate::new(format!("Timeline<{tenant_id}/{timeline_id}>")),
|
||||||
|
|
||||||
compaction_lock: tokio::sync::Mutex::default(),
|
|
||||||
gc_lock: tokio::sync::Mutex::default(),
|
|
||||||
};
|
};
|
||||||
result.repartition_threshold =
|
result.repartition_threshold =
|
||||||
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
|
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
|
||||||
@@ -1503,24 +1458,20 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub(super) fn maybe_spawn_flush_loop(self: &Arc<Self>) {
|
pub(super) fn maybe_spawn_flush_loop(self: &Arc<Self>) {
|
||||||
let Ok(guard) = self.gate.enter() else {
|
|
||||||
info!("cannot start flush loop when the timeline gate has already been closed");
|
|
||||||
return;
|
|
||||||
};
|
|
||||||
let mut flush_loop_state = self.flush_loop_state.lock().unwrap();
|
let mut flush_loop_state = self.flush_loop_state.lock().unwrap();
|
||||||
match *flush_loop_state {
|
match *flush_loop_state {
|
||||||
FlushLoopState::NotStarted => (),
|
FlushLoopState::NotStarted => (),
|
||||||
FlushLoopState::Running { .. } => {
|
FlushLoopState::Running => {
|
||||||
info!(
|
info!(
|
||||||
"skipping attempt to start flush_loop twice {}/{}",
|
"skipping attempt to start flush_loop twice {}/{}",
|
||||||
self.tenant_shard_id, self.timeline_id
|
self.tenant_id, self.timeline_id
|
||||||
);
|
);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
FlushLoopState::Exited => {
|
FlushLoopState::Exited => {
|
||||||
warn!(
|
warn!(
|
||||||
"ignoring attempt to restart exited flush_loop {}/{}",
|
"ignoring attempt to restart exited flush_loop {}/{}",
|
||||||
self.tenant_shard_id, self.timeline_id
|
self.tenant_id, self.timeline_id
|
||||||
);
|
);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@@ -1530,29 +1481,23 @@ impl Timeline {
|
|||||||
let self_clone = Arc::clone(self);
|
let self_clone = Arc::clone(self);
|
||||||
|
|
||||||
debug!("spawning flush loop");
|
debug!("spawning flush loop");
|
||||||
*flush_loop_state = FlushLoopState::Running {
|
*flush_loop_state = FlushLoopState::Running;
|
||||||
#[cfg(test)]
|
|
||||||
expect_initdb_optimization: false,
|
|
||||||
#[cfg(test)]
|
|
||||||
initdb_optimization_count: 0,
|
|
||||||
};
|
|
||||||
task_mgr::spawn(
|
task_mgr::spawn(
|
||||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||||
task_mgr::TaskKind::LayerFlushTask,
|
task_mgr::TaskKind::LayerFlushTask,
|
||||||
Some(self.tenant_shard_id.tenant_id),
|
Some(self.tenant_id),
|
||||||
Some(self.timeline_id),
|
Some(self.timeline_id),
|
||||||
"layer flush task",
|
"layer flush task",
|
||||||
false,
|
false,
|
||||||
async move {
|
async move {
|
||||||
let _guard = guard;
|
|
||||||
let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error);
|
let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error);
|
||||||
self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await;
|
self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await;
|
||||||
let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap();
|
let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap();
|
||||||
assert!(matches!(*flush_loop_state, FlushLoopState::Running{ ..}));
|
assert!(matches!(*flush_loop_state, FlushLoopState::Running));
|
||||||
*flush_loop_state = FlushLoopState::Exited;
|
*flush_loop_state = FlushLoopState::Exited;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
.instrument(info_span!(parent: None, "layer flush task", tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))
|
.instrument(info_span!(parent: None, "layer flush task", tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1567,7 +1512,7 @@ impl Timeline {
|
|||||||
) {
|
) {
|
||||||
info!(
|
info!(
|
||||||
"launching WAL receiver for timeline {} of tenant {}",
|
"launching WAL receiver for timeline {} of tenant {}",
|
||||||
self.timeline_id, self.tenant_shard_id
|
self.timeline_id, self.tenant_id
|
||||||
);
|
);
|
||||||
|
|
||||||
let tenant_conf_guard = self.tenant_conf.read().unwrap();
|
let tenant_conf_guard = self.tenant_conf.read().unwrap();
|
||||||
@@ -1628,15 +1573,12 @@ impl Timeline {
|
|||||||
|
|
||||||
// Scan timeline directory and create ImageFileName and DeltaFilename
|
// Scan timeline directory and create ImageFileName and DeltaFilename
|
||||||
// structs representing all files on disk
|
// structs representing all files on disk
|
||||||
let timeline_path = self
|
let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
|
||||||
.conf
|
|
||||||
.timeline_path(&self.tenant_shard_id, &self.timeline_id);
|
|
||||||
let conf = self.conf;
|
let conf = self.conf;
|
||||||
let span = tracing::Span::current();
|
let span = tracing::Span::current();
|
||||||
|
|
||||||
// Copy to move into the task we're about to spawn
|
// Copy to move into the task we're about to spawn
|
||||||
let generation = self.generation;
|
let generation = self.generation;
|
||||||
let shard = self.get_shard_index();
|
|
||||||
let this = self.myself.upgrade().expect("&self method holds the arc");
|
let this = self.myself.upgrade().expect("&self method holds the arc");
|
||||||
|
|
||||||
let (loaded_layers, needs_cleanup, total_physical_size) = tokio::task::spawn_blocking({
|
let (loaded_layers, needs_cleanup, total_physical_size) = tokio::task::spawn_blocking({
|
||||||
@@ -1685,7 +1627,6 @@ impl Timeline {
|
|||||||
index_part.as_ref(),
|
index_part.as_ref(),
|
||||||
disk_consistent_lsn,
|
disk_consistent_lsn,
|
||||||
generation,
|
generation,
|
||||||
shard,
|
|
||||||
);
|
);
|
||||||
|
|
||||||
let mut loaded_layers = Vec::new();
|
let mut loaded_layers = Vec::new();
|
||||||
@@ -1760,30 +1701,6 @@ impl Timeline {
|
|||||||
if let Some(rtc) = self.remote_client.as_ref() {
|
if let Some(rtc) = self.remote_client.as_ref() {
|
||||||
rtc.schedule_layer_file_deletion(&needs_cleanup)?;
|
rtc.schedule_layer_file_deletion(&needs_cleanup)?;
|
||||||
rtc.schedule_index_upload_for_file_changes()?;
|
rtc.schedule_index_upload_for_file_changes()?;
|
||||||
// This barrier orders above DELETEs before any later operations.
|
|
||||||
// This is critical because code executing after the barrier might
|
|
||||||
// create again objects with the same key that we just scheduled for deletion.
|
|
||||||
// For example, if we just scheduled deletion of an image layer "from the future",
|
|
||||||
// later compaction might run again and re-create the same image layer.
|
|
||||||
// "from the future" here means an image layer whose LSN is > IndexPart::disk_consistent_lsn.
|
|
||||||
// "same" here means same key range and LSN.
|
|
||||||
//
|
|
||||||
// Without a barrier between above DELETEs and the re-creation's PUTs,
|
|
||||||
// the upload queue may execute the PUT first, then the DELETE.
|
|
||||||
// In our example, we will end up with an IndexPart referencing a non-existent object.
|
|
||||||
//
|
|
||||||
// 1. a future image layer is created and uploaded
|
|
||||||
// 2. ps restart
|
|
||||||
// 3. the future layer from (1) is deleted during load layer map
|
|
||||||
// 4. image layer is re-created and uploaded
|
|
||||||
// 5. deletion queue would like to delete (1) but actually deletes (4)
|
|
||||||
// 6. delete by name works as expected, but it now deletes the wrong (later) version
|
|
||||||
//
|
|
||||||
// See https://github.com/neondatabase/neon/issues/5878
|
|
||||||
//
|
|
||||||
// NB: generation numbers naturally protect against this because they disambiguate
|
|
||||||
// (1) and (4)
|
|
||||||
rtc.schedule_barrier()?;
|
|
||||||
// Tenant::create_timeline will wait for these uploads to happen before returning, or
|
// Tenant::create_timeline will wait for these uploads to happen before returning, or
|
||||||
// on retry.
|
// on retry.
|
||||||
}
|
}
|
||||||
@@ -1827,7 +1744,6 @@ impl Timeline {
|
|||||||
"spawning logical size computation from context of task kind {:?}",
|
"spawning logical size computation from context of task kind {:?}",
|
||||||
ctx.task_kind()
|
ctx.task_kind()
|
||||||
);
|
);
|
||||||
let causing_task_kind = ctx.task_kind();
|
|
||||||
// We need to start the computation task.
|
// We need to start the computation task.
|
||||||
// It gets a separate context since it will outlive the request that called this function.
|
// It gets a separate context since it will outlive the request that called this function.
|
||||||
let self_clone = Arc::clone(self);
|
let self_clone = Arc::clone(self);
|
||||||
@@ -1838,7 +1754,7 @@ impl Timeline {
|
|||||||
task_mgr::spawn(
|
task_mgr::spawn(
|
||||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||||
task_mgr::TaskKind::InitialLogicalSizeCalculation,
|
task_mgr::TaskKind::InitialLogicalSizeCalculation,
|
||||||
Some(self.tenant_shard_id.tenant_id),
|
Some(self.tenant_id),
|
||||||
Some(self.timeline_id),
|
Some(self.timeline_id),
|
||||||
"initial size calculation",
|
"initial size calculation",
|
||||||
false,
|
false,
|
||||||
@@ -1855,8 +1771,6 @@ impl Timeline {
|
|||||||
_ = completion::Barrier::maybe_wait(self_clone.initial_logical_size_can_start.clone()) => {}
|
_ = completion::Barrier::maybe_wait(self_clone.initial_logical_size_can_start.clone()) => {}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// hold off background tasks from starting until all timelines get to try at least
|
// hold off background tasks from starting until all timelines get to try at least
|
||||||
// once initial logical size calculation; though retry will rarely be useful.
|
// once initial logical size calculation; though retry will rarely be useful.
|
||||||
// holding off is done because heavier tasks execute blockingly on the same
|
// holding off is done because heavier tasks execute blockingly on the same
|
||||||
@@ -1864,12 +1778,7 @@ impl Timeline {
|
|||||||
//
|
//
|
||||||
// dropping this at every outcome is probably better than trying to cling on to it,
|
// dropping this at every outcome is probably better than trying to cling on to it,
|
||||||
// delay will be terminated by a timeout regardless.
|
// delay will be terminated by a timeout regardless.
|
||||||
let completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() };
|
let _completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() };
|
||||||
|
|
||||||
let metrics_guard = match &completion {
|
|
||||||
Some(_) => crate::metrics::initial_logical_size::START_CALCULATION.first(Some(causing_task_kind)),
|
|
||||||
None => crate::metrics::initial_logical_size::START_CALCULATION.retry(Some(causing_task_kind)),
|
|
||||||
};
|
|
||||||
|
|
||||||
let calculated_size = match self_clone
|
let calculated_size = match self_clone
|
||||||
.logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx)
|
.logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx)
|
||||||
@@ -1914,11 +1823,11 @@ impl Timeline {
|
|||||||
match self_clone
|
match self_clone
|
||||||
.current_logical_size
|
.current_logical_size
|
||||||
.initial_logical_size
|
.initial_logical_size
|
||||||
.set((calculated_size, metrics_guard.calculation_result_saved()))
|
.set(calculated_size)
|
||||||
{
|
{
|
||||||
Ok(()) => (),
|
Ok(()) => (),
|
||||||
Err(_what_we_just_attempted_to_set) => {
|
Err(_what_we_just_attempted_to_set) => {
|
||||||
let (existing_size, _) = self_clone
|
let existing_size = self_clone
|
||||||
.current_logical_size
|
.current_logical_size
|
||||||
.initial_logical_size
|
.initial_logical_size
|
||||||
.get()
|
.get()
|
||||||
@@ -1955,7 +1864,7 @@ impl Timeline {
|
|||||||
task_mgr::spawn(
|
task_mgr::spawn(
|
||||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||||
task_mgr::TaskKind::OndemandLogicalSizeCalculation,
|
task_mgr::TaskKind::OndemandLogicalSizeCalculation,
|
||||||
Some(self.tenant_shard_id.tenant_id),
|
Some(self.tenant_id),
|
||||||
Some(self.timeline_id),
|
Some(self.timeline_id),
|
||||||
"ondemand logical size calculation",
|
"ondemand logical size calculation",
|
||||||
false,
|
false,
|
||||||
@@ -2031,7 +1940,7 @@ impl Timeline {
|
|||||||
fail::fail_point!("timeline-calculate-logical-size-check-dir-exists", |_| {
|
fail::fail_point!("timeline-calculate-logical-size-check-dir-exists", |_| {
|
||||||
if !self
|
if !self
|
||||||
.conf
|
.conf
|
||||||
.metadata_path(&self.tenant_shard_id, &self.timeline_id)
|
.metadata_path(&self.tenant_id, &self.timeline_id)
|
||||||
.exists()
|
.exists()
|
||||||
{
|
{
|
||||||
error!("timeline-calculate-logical-size-pre metadata file does not exist")
|
error!("timeline-calculate-logical-size-pre metadata file does not exist")
|
||||||
@@ -2072,14 +1981,16 @@ impl Timeline {
|
|||||||
// one value while current_logical_size is set to the
|
// one value while current_logical_size is set to the
|
||||||
// other.
|
// other.
|
||||||
match logical_size.current_size() {
|
match logical_size.current_size() {
|
||||||
CurrentLogicalSize::Exact(ref new_current_size) => self
|
Ok(CurrentLogicalSize::Exact(new_current_size)) => self
|
||||||
.metrics
|
.metrics
|
||||||
.current_logical_size_gauge
|
.current_logical_size_gauge
|
||||||
.set(new_current_size.into()),
|
.set(new_current_size),
|
||||||
CurrentLogicalSize::Approximate(_) => {
|
Ok(CurrentLogicalSize::Approximate(_)) => {
|
||||||
// don't update the gauge yet, this allows us not to update the gauge back and
|
// don't update the gauge yet, this allows us not to update the gauge back and
|
||||||
// forth between the initial size calculation task.
|
// forth between the initial size calculation task.
|
||||||
}
|
}
|
||||||
|
// this is overflow
|
||||||
|
Err(e) => error!("Failed to compute current logical size for metrics update: {e:?}"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2382,13 +2293,7 @@ impl Timeline {
|
|||||||
// FIXME: It's pointless to check the cache for things that are not 8kB pages.
|
// FIXME: It's pointless to check the cache for things that are not 8kB pages.
|
||||||
// We should look at the key to determine if it's a cacheable object
|
// We should look at the key to determine if it's a cacheable object
|
||||||
let (lsn, read_guard) = cache
|
let (lsn, read_guard) = cache
|
||||||
.lookup_materialized_page(
|
.lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn, ctx)
|
||||||
self.tenant_shard_id.tenant_id,
|
|
||||||
self.timeline_id,
|
|
||||||
key,
|
|
||||||
lsn,
|
|
||||||
ctx,
|
|
||||||
)
|
|
||||||
.await?;
|
.await?;
|
||||||
let img = Bytes::from(read_guard.to_vec());
|
let img = Bytes::from(read_guard.to_vec());
|
||||||
Some((lsn, img))
|
Some((lsn, img))
|
||||||
@@ -2416,7 +2321,7 @@ impl Timeline {
|
|||||||
self.get_last_record_lsn(),
|
self.get_last_record_lsn(),
|
||||||
self.conf,
|
self.conf,
|
||||||
self.timeline_id,
|
self.timeline_id,
|
||||||
self.tenant_shard_id,
|
self.tenant_id,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
Ok(layer)
|
Ok(layer)
|
||||||
@@ -2535,7 +2440,7 @@ impl Timeline {
|
|||||||
let mut my_flush_request = 0;
|
let mut my_flush_request = 0;
|
||||||
|
|
||||||
let flush_loop_state = { *self.flush_loop_state.lock().unwrap() };
|
let flush_loop_state = { *self.flush_loop_state.lock().unwrap() };
|
||||||
if !matches!(flush_loop_state, FlushLoopState::Running { .. }) {
|
if !matches!(flush_loop_state, FlushLoopState::Running) {
|
||||||
anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}")
|
anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}")
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2582,78 +2487,14 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Flush one frozen in-memory layer to disk, as a new delta layer.
|
/// Flush one frozen in-memory layer to disk, as a new delta layer.
|
||||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id, layer=%frozen_layer))]
|
#[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer))]
|
||||||
async fn flush_frozen_layer(
|
async fn flush_frozen_layer(
|
||||||
self: &Arc<Self>,
|
self: &Arc<Self>,
|
||||||
frozen_layer: Arc<InMemoryLayer>,
|
frozen_layer: Arc<InMemoryLayer>,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<(), FlushLayerError> {
|
) -> Result<(), FlushLayerError> {
|
||||||
// As a special case, when we have just imported an image into the repository,
|
|
||||||
// instead of writing out a L0 delta layer, we directly write out image layer
|
|
||||||
// files instead. This is possible as long as *all* the data imported into the
|
|
||||||
// repository have the same LSN.
|
|
||||||
let lsn_range = frozen_layer.get_lsn_range();
|
let lsn_range = frozen_layer.get_lsn_range();
|
||||||
let (layers_to_upload, delta_layer_to_add) =
|
let layer = self.create_delta_layer(&frozen_layer, ctx).await?;
|
||||||
if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) {
|
|
||||||
#[cfg(test)]
|
|
||||||
match &mut *self.flush_loop_state.lock().unwrap() {
|
|
||||||
FlushLoopState::NotStarted | FlushLoopState::Exited => {
|
|
||||||
panic!("flush loop not running")
|
|
||||||
}
|
|
||||||
FlushLoopState::Running {
|
|
||||||
initdb_optimization_count,
|
|
||||||
..
|
|
||||||
} => {
|
|
||||||
*initdb_optimization_count += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Note: The 'ctx' in use here has DownloadBehavior::Error. We should not
|
|
||||||
// require downloading anything during initial import.
|
|
||||||
let (partitioning, _lsn) = self
|
|
||||||
.repartition(
|
|
||||||
self.initdb_lsn,
|
|
||||||
self.get_compaction_target_size(),
|
|
||||||
EnumSet::empty(),
|
|
||||||
ctx,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
if self.cancel.is_cancelled() {
|
|
||||||
return Err(FlushLayerError::Cancelled);
|
|
||||||
}
|
|
||||||
|
|
||||||
// For image layers, we add them immediately into the layer map.
|
|
||||||
(
|
|
||||||
self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx)
|
|
||||||
.await?,
|
|
||||||
None,
|
|
||||||
)
|
|
||||||
} else {
|
|
||||||
#[cfg(test)]
|
|
||||||
match &mut *self.flush_loop_state.lock().unwrap() {
|
|
||||||
FlushLoopState::NotStarted | FlushLoopState::Exited => {
|
|
||||||
panic!("flush loop not running")
|
|
||||||
}
|
|
||||||
FlushLoopState::Running {
|
|
||||||
expect_initdb_optimization,
|
|
||||||
..
|
|
||||||
} => {
|
|
||||||
assert!(!*expect_initdb_optimization, "expected initdb optimization");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Normal case, write out a L0 delta layer file.
|
|
||||||
// `create_delta_layer` will not modify the layer map.
|
|
||||||
// We will remove frozen layer and add delta layer in one atomic operation later.
|
|
||||||
let layer = self.create_delta_layer(&frozen_layer, ctx).await?;
|
|
||||||
(
|
|
||||||
// FIXME: even though we have a single image and single delta layer assumption
|
|
||||||
// we push them to vec
|
|
||||||
vec![layer.clone()],
|
|
||||||
Some(layer),
|
|
||||||
)
|
|
||||||
};
|
|
||||||
|
|
||||||
pausable_failpoint!("flush-layer-cancel-after-writing-layer-out-pausable");
|
|
||||||
|
|
||||||
if self.cancel.is_cancelled() {
|
if self.cancel.is_cancelled() {
|
||||||
return Err(FlushLayerError::Cancelled);
|
return Err(FlushLayerError::Cancelled);
|
||||||
@@ -2672,18 +2513,17 @@ impl Timeline {
|
|||||||
return Err(FlushLayerError::Cancelled);
|
return Err(FlushLayerError::Cancelled);
|
||||||
}
|
}
|
||||||
|
|
||||||
guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics);
|
guard.finish_flush_l0_layer(&layer, &frozen_layer, &self.metrics);
|
||||||
|
|
||||||
if disk_consistent_lsn != old_disk_consistent_lsn {
|
if disk_consistent_lsn != old_disk_consistent_lsn {
|
||||||
assert!(disk_consistent_lsn > old_disk_consistent_lsn);
|
assert!(disk_consistent_lsn > old_disk_consistent_lsn);
|
||||||
self.disk_consistent_lsn.store(disk_consistent_lsn);
|
self.disk_consistent_lsn.store(disk_consistent_lsn);
|
||||||
|
|
||||||
// Schedule remote uploads that will reflect our new disk_consistent_lsn
|
// Schedule remote uploads that will reflect our new disk_consistent_lsn
|
||||||
Some(self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?)
|
Some(self.schedule_uploads(disk_consistent_lsn, [layer])?)
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
// release lock on 'layers'
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
|
// FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
|
||||||
@@ -2703,14 +2543,9 @@ impl Timeline {
|
|||||||
|
|
||||||
// If we updated our disk_consistent_lsn, persist the updated metadata to local disk.
|
// If we updated our disk_consistent_lsn, persist the updated metadata to local disk.
|
||||||
if let Some(metadata) = metadata {
|
if let Some(metadata) = metadata {
|
||||||
save_metadata(
|
save_metadata(self.conf, &self.tenant_id, &self.timeline_id, &metadata)
|
||||||
self.conf,
|
.await
|
||||||
&self.tenant_shard_id,
|
.context("save_metadata")?;
|
||||||
&self.timeline_id,
|
|
||||||
&metadata,
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
.context("save_metadata")?;
|
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -2774,14 +2609,9 @@ impl Timeline {
|
|||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let metadata = self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?;
|
let metadata = self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?;
|
||||||
|
|
||||||
save_metadata(
|
save_metadata(self.conf, &self.tenant_id, &self.timeline_id, &metadata)
|
||||||
self.conf,
|
.await
|
||||||
&self.tenant_shard_id,
|
.context("save_metadata")?;
|
||||||
&self.timeline_id,
|
|
||||||
&metadata,
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
.context("save_metadata")?;
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -2829,7 +2659,7 @@ impl Timeline {
|
|||||||
par_fsync::par_fsync(&[new_delta_path]).context("fsync of delta layer")?;
|
par_fsync::par_fsync(&[new_delta_path]).context("fsync of delta layer")?;
|
||||||
par_fsync::par_fsync(&[self_clone
|
par_fsync::par_fsync(&[self_clone
|
||||||
.conf
|
.conf
|
||||||
.timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id)])
|
.timeline_path(&self_clone.tenant_id, &self_clone.timeline_id)])
|
||||||
.context("fsync of timeline dir")?;
|
.context("fsync of timeline dir")?;
|
||||||
|
|
||||||
anyhow::Ok(new_delta)
|
anyhow::Ok(new_delta)
|
||||||
@@ -2846,16 +2676,12 @@ impl Timeline {
|
|||||||
&self,
|
&self,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
partition_size: u64,
|
partition_size: u64,
|
||||||
flags: EnumSet<CompactFlags>,
|
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> anyhow::Result<(KeyPartitioning, Lsn)> {
|
) -> anyhow::Result<(KeyPartitioning, Lsn)> {
|
||||||
{
|
{
|
||||||
let partitioning_guard = self.partitioning.lock().unwrap();
|
let partitioning_guard = self.partitioning.lock().unwrap();
|
||||||
let distance = lsn.0 - partitioning_guard.1 .0;
|
let distance = lsn.0 - partitioning_guard.1 .0;
|
||||||
if partitioning_guard.1 != Lsn(0)
|
if partitioning_guard.1 != Lsn(0) && distance <= self.repartition_threshold {
|
||||||
&& distance <= self.repartition_threshold
|
|
||||||
&& !flags.contains(CompactFlags::ForceRepartition)
|
|
||||||
{
|
|
||||||
debug!(
|
debug!(
|
||||||
distance,
|
distance,
|
||||||
threshold = self.repartition_threshold,
|
threshold = self.repartition_threshold,
|
||||||
@@ -2985,7 +2811,7 @@ impl Timeline {
|
|||||||
let mut image_layer_writer = ImageLayerWriter::new(
|
let mut image_layer_writer = ImageLayerWriter::new(
|
||||||
self.conf,
|
self.conf,
|
||||||
self.timeline_id,
|
self.timeline_id,
|
||||||
self.tenant_shard_id,
|
self.tenant_id,
|
||||||
&img_range,
|
&img_range,
|
||||||
lsn,
|
lsn,
|
||||||
)
|
)
|
||||||
@@ -3058,11 +2884,9 @@ impl Timeline {
|
|||||||
.await
|
.await
|
||||||
.context("fsync of newly created layer files")?;
|
.context("fsync of newly created layer files")?;
|
||||||
|
|
||||||
par_fsync::par_fsync_async(&[self
|
par_fsync::par_fsync_async(&[self.conf.timeline_path(&self.tenant_id, &self.timeline_id)])
|
||||||
.conf
|
.await
|
||||||
.timeline_path(&self.tenant_shard_id, &self.timeline_id)])
|
.context("fsync of timeline dir")?;
|
||||||
.await
|
|
||||||
.context("fsync of timeline dir")?;
|
|
||||||
|
|
||||||
let mut guard = self.layers.write().await;
|
let mut guard = self.layers.write().await;
|
||||||
|
|
||||||
@@ -3212,8 +3036,13 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
|
|||||||
|
|
||||||
impl Timeline {
|
impl Timeline {
|
||||||
/// Level0 files first phase of compaction, explained in the [`Self::compact`] comment.
|
/// Level0 files first phase of compaction, explained in the [`Self::compact`] comment.
|
||||||
|
///
|
||||||
|
/// This method takes the `_layer_removal_cs` guard to highlight it required downloads are
|
||||||
|
/// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the
|
||||||
|
/// start of level0 files compaction, the on-demand download should be revisited as well.
|
||||||
async fn compact_level0_phase1(
|
async fn compact_level0_phase1(
|
||||||
self: &Arc<Self>,
|
self: &Arc<Self>,
|
||||||
|
_layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||||
guard: tokio::sync::OwnedRwLockReadGuard<LayerManager>,
|
guard: tokio::sync::OwnedRwLockReadGuard<LayerManager>,
|
||||||
mut stats: CompactLevel0Phase1StatsBuilder,
|
mut stats: CompactLevel0Phase1StatsBuilder,
|
||||||
target_file_size: u64,
|
target_file_size: u64,
|
||||||
@@ -3300,6 +3129,8 @@ impl Timeline {
|
|||||||
let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end;
|
let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end;
|
||||||
let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len());
|
let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len());
|
||||||
|
|
||||||
|
// FIXME: downloading while holding layer_removal_cs is not great, but we will remove that
|
||||||
|
// soon
|
||||||
deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?);
|
deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?);
|
||||||
for l in level0_deltas_iter {
|
for l in level0_deltas_iter {
|
||||||
let lsn_range = &l.layer_desc().lsn_range;
|
let lsn_range = &l.layer_desc().lsn_range;
|
||||||
@@ -3548,7 +3379,7 @@ impl Timeline {
|
|||||||
DeltaLayerWriter::new(
|
DeltaLayerWriter::new(
|
||||||
self.conf,
|
self.conf,
|
||||||
self.timeline_id,
|
self.timeline_id,
|
||||||
self.tenant_shard_id,
|
self.tenant_id,
|
||||||
key,
|
key,
|
||||||
if dup_end_lsn.is_valid() {
|
if dup_end_lsn.is_valid() {
|
||||||
// this is a layer containing slice of values of the same key
|
// this is a layer containing slice of values of the same key
|
||||||
@@ -3598,24 +3429,21 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// FIXME: the writer already fsyncs all data, only rename needs to be fsynced here
|
// FIXME: the writer already fsyncs all data, only rename needs to be fsynced here
|
||||||
let layer_paths: Vec<Utf8PathBuf> = new_layers
|
let mut layer_paths: Vec<Utf8PathBuf> = new_layers
|
||||||
.iter()
|
.iter()
|
||||||
.map(|l| l.local_path().to_owned())
|
.map(|l| l.local_path().to_owned())
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
// Fsync all the layer files and directory using multiple threads to
|
// Fsync all the layer files and directory using multiple threads to
|
||||||
// minimize latency.
|
// minimize latency.
|
||||||
par_fsync::par_fsync_async(&layer_paths)
|
//
|
||||||
.await
|
// FIXME: spawn_blocking above for this
|
||||||
.context("fsync all new layers")?;
|
par_fsync::par_fsync(&layer_paths).context("fsync all new layers")?;
|
||||||
|
|
||||||
let timeline_dir = self
|
par_fsync::par_fsync(&[self.conf.timeline_path(&self.tenant_id, &self.timeline_id)])
|
||||||
.conf
|
|
||||||
.timeline_path(&self.tenant_shard_id, &self.timeline_id);
|
|
||||||
|
|
||||||
par_fsync::par_fsync_async(&[timeline_dir])
|
|
||||||
.await
|
|
||||||
.context("fsync of timeline dir")?;
|
.context("fsync of timeline dir")?;
|
||||||
|
|
||||||
|
layer_paths.pop().unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now();
|
stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now();
|
||||||
@@ -3651,6 +3479,7 @@ impl Timeline {
|
|||||||
///
|
///
|
||||||
async fn compact_level0(
|
async fn compact_level0(
|
||||||
self: &Arc<Self>,
|
self: &Arc<Self>,
|
||||||
|
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||||
target_file_size: u64,
|
target_file_size: u64,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<(), CompactionError> {
|
) -> Result<(), CompactionError> {
|
||||||
@@ -3662,7 +3491,7 @@ impl Timeline {
|
|||||||
let ctx = ctx.attached_child();
|
let ctx = ctx.attached_child();
|
||||||
let mut stats = CompactLevel0Phase1StatsBuilder {
|
let mut stats = CompactLevel0Phase1StatsBuilder {
|
||||||
version: Some(2),
|
version: Some(2),
|
||||||
tenant_id: Some(self.tenant_shard_id.tenant_id),
|
tenant_id: Some(self.tenant_id),
|
||||||
timeline_id: Some(self.timeline_id),
|
timeline_id: Some(self.timeline_id),
|
||||||
..Default::default()
|
..Default::default()
|
||||||
};
|
};
|
||||||
@@ -3672,9 +3501,16 @@ impl Timeline {
|
|||||||
let now = tokio::time::Instant::now();
|
let now = tokio::time::Instant::now();
|
||||||
stats.read_lock_acquisition_micros =
|
stats.read_lock_acquisition_micros =
|
||||||
DurationRecorder::Recorded(RecordedDuration(now - begin), now);
|
DurationRecorder::Recorded(RecordedDuration(now - begin), now);
|
||||||
self.compact_level0_phase1(phase1_layers_locked, stats, target_file_size, &ctx)
|
let layer_removal_cs = layer_removal_cs.clone();
|
||||||
.instrument(phase1_span)
|
self.compact_level0_phase1(
|
||||||
.await?
|
layer_removal_cs,
|
||||||
|
phase1_layers_locked,
|
||||||
|
stats,
|
||||||
|
target_file_size,
|
||||||
|
&ctx,
|
||||||
|
)
|
||||||
|
.instrument(phase1_span)
|
||||||
|
.await?
|
||||||
};
|
};
|
||||||
|
|
||||||
if new_layers.is_empty() && deltas_to_compact.is_empty() {
|
if new_layers.is_empty() && deltas_to_compact.is_empty() {
|
||||||
@@ -3682,6 +3518,17 @@ impl Timeline {
|
|||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Before deleting any layers, we need to wait for their upload ops to finish.
|
||||||
|
// See remote_timeline_client module level comment on consistency.
|
||||||
|
// Do it here because we don't want to hold self.layers.write() while waiting.
|
||||||
|
if let Some(remote_client) = &self.remote_client {
|
||||||
|
debug!("waiting for upload ops to complete");
|
||||||
|
remote_client
|
||||||
|
.wait_completion()
|
||||||
|
.await
|
||||||
|
.context("wait for layer upload ops to complete")?;
|
||||||
|
}
|
||||||
|
|
||||||
let mut guard = self.layers.write().await;
|
let mut guard = self.layers.write().await;
|
||||||
|
|
||||||
let mut duplicated_layers = HashSet::new();
|
let mut duplicated_layers = HashSet::new();
|
||||||
@@ -3713,7 +3560,12 @@ impl Timeline {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// deletion will happen later, the layer file manager calls garbage_collect_on_drop
|
// deletion will happen later, the layer file manager calls garbage_collect_on_drop
|
||||||
guard.finish_compact_l0(&remove_layers, &insert_layers, &self.metrics);
|
guard.finish_compact_l0(
|
||||||
|
&layer_removal_cs,
|
||||||
|
&remove_layers,
|
||||||
|
&insert_layers,
|
||||||
|
&self.metrics,
|
||||||
|
);
|
||||||
|
|
||||||
if let Some(remote_client) = self.remote_client.as_ref() {
|
if let Some(remote_client) = self.remote_client.as_ref() {
|
||||||
remote_client.schedule_compaction_update(&remove_layers, &new_layers)?;
|
remote_client.schedule_compaction_update(&remove_layers, &new_layers)?;
|
||||||
@@ -3764,7 +3616,6 @@ impl Timeline {
|
|||||||
retain_lsns: Vec<Lsn>,
|
retain_lsns: Vec<Lsn>,
|
||||||
cutoff_horizon: Lsn,
|
cutoff_horizon: Lsn,
|
||||||
pitr: Duration,
|
pitr: Duration,
|
||||||
cancel: &CancellationToken,
|
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
// First, calculate pitr_cutoff_timestamp and then convert it to LSN.
|
// First, calculate pitr_cutoff_timestamp and then convert it to LSN.
|
||||||
@@ -3778,10 +3629,7 @@ impl Timeline {
|
|||||||
if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) {
|
if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) {
|
||||||
let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp);
|
let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp);
|
||||||
|
|
||||||
match self
|
match self.find_lsn_for_timestamp(pitr_timestamp, ctx).await? {
|
||||||
.find_lsn_for_timestamp(pitr_timestamp, cancel, ctx)
|
|
||||||
.await?
|
|
||||||
{
|
|
||||||
LsnForTimestamp::Present(lsn) => lsn,
|
LsnForTimestamp::Present(lsn) => lsn,
|
||||||
LsnForTimestamp::Future(lsn) => {
|
LsnForTimestamp::Future(lsn) => {
|
||||||
// The timestamp is in the future. That sounds impossible,
|
// The timestamp is in the future. That sounds impossible,
|
||||||
@@ -3824,17 +3672,19 @@ impl Timeline {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
///
|
||||||
/// Garbage collect layer files on a timeline that are no longer needed.
|
/// Garbage collect layer files on a timeline that are no longer needed.
|
||||||
///
|
///
|
||||||
/// Currently, we don't make any attempt at removing unneeded page versions
|
/// Currently, we don't make any attempt at removing unneeded page versions
|
||||||
/// within a layer file. We can only remove the whole file if it's fully
|
/// within a layer file. We can only remove the whole file if it's fully
|
||||||
/// obsolete.
|
/// obsolete.
|
||||||
|
///
|
||||||
pub(super) async fn gc(&self) -> anyhow::Result<GcResult> {
|
pub(super) async fn gc(&self) -> anyhow::Result<GcResult> {
|
||||||
let _g = self.gc_lock.lock().await;
|
|
||||||
let timer = self.metrics.garbage_collect_histo.start_timer();
|
let timer = self.metrics.garbage_collect_histo.start_timer();
|
||||||
|
|
||||||
fail_point!("before-timeline-gc");
|
fail_point!("before-timeline-gc");
|
||||||
|
|
||||||
|
let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
|
||||||
// Is the timeline being deleted?
|
// Is the timeline being deleted?
|
||||||
if self.is_stopping() {
|
if self.is_stopping() {
|
||||||
anyhow::bail!("timeline is Stopping");
|
anyhow::bail!("timeline is Stopping");
|
||||||
@@ -3852,7 +3702,13 @@ impl Timeline {
|
|||||||
let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);
|
let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);
|
||||||
|
|
||||||
let res = self
|
let res = self
|
||||||
.gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff)
|
.gc_timeline(
|
||||||
|
layer_removal_cs.clone(),
|
||||||
|
horizon_cutoff,
|
||||||
|
pitr_cutoff,
|
||||||
|
retain_lsns,
|
||||||
|
new_gc_cutoff,
|
||||||
|
)
|
||||||
.instrument(
|
.instrument(
|
||||||
info_span!("gc_timeline", timeline_id = %self.timeline_id, cutoff = %new_gc_cutoff),
|
info_span!("gc_timeline", timeline_id = %self.timeline_id, cutoff = %new_gc_cutoff),
|
||||||
)
|
)
|
||||||
@@ -3866,6 +3722,7 @@ impl Timeline {
|
|||||||
|
|
||||||
async fn gc_timeline(
|
async fn gc_timeline(
|
||||||
&self,
|
&self,
|
||||||
|
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||||
horizon_cutoff: Lsn,
|
horizon_cutoff: Lsn,
|
||||||
pitr_cutoff: Lsn,
|
pitr_cutoff: Lsn,
|
||||||
retain_lsns: Vec<Lsn>,
|
retain_lsns: Vec<Lsn>,
|
||||||
@@ -3903,6 +3760,17 @@ impl Timeline {
|
|||||||
|
|
||||||
debug!("retain_lsns: {:?}", retain_lsns);
|
debug!("retain_lsns: {:?}", retain_lsns);
|
||||||
|
|
||||||
|
// Before deleting any layers, we need to wait for their upload ops to finish.
|
||||||
|
// See storage_sync module level comment on consistency.
|
||||||
|
// Do it here because we don't want to hold self.layers.write() while waiting.
|
||||||
|
if let Some(remote_client) = &self.remote_client {
|
||||||
|
debug!("waiting for upload ops to complete");
|
||||||
|
remote_client
|
||||||
|
.wait_completion()
|
||||||
|
.await
|
||||||
|
.context("wait for layer upload ops to complete")?;
|
||||||
|
}
|
||||||
|
|
||||||
let mut layers_to_remove = Vec::new();
|
let mut layers_to_remove = Vec::new();
|
||||||
let mut wanted_image_layers = KeySpaceRandomAccum::default();
|
let mut wanted_image_layers = KeySpaceRandomAccum::default();
|
||||||
|
|
||||||
@@ -4018,11 +3886,6 @@ impl Timeline {
|
|||||||
//
|
//
|
||||||
// This does not in fact have any effect as we no longer consider local metadata unless
|
// This does not in fact have any effect as we no longer consider local metadata unless
|
||||||
// running without remote storage.
|
// running without remote storage.
|
||||||
//
|
|
||||||
// This also unconditionally schedules an index_part.json update, even though we will
|
|
||||||
// be doing one a bit later with the unlinked gc'd layers.
|
|
||||||
//
|
|
||||||
// TODO: remove when implementing <https://github.com/neondatabase/neon/issues/4099>.
|
|
||||||
self.update_metadata_file(self.disk_consistent_lsn.load(), None)
|
self.update_metadata_file(self.disk_consistent_lsn.load(), None)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
@@ -4037,16 +3900,11 @@ impl Timeline {
|
|||||||
remote_client.schedule_gc_update(&gc_layers)?;
|
remote_client.schedule_gc_update(&gc_layers)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
guard.finish_gc_timeline(&gc_layers);
|
guard.finish_gc_timeline(&layer_removal_cs, gc_layers);
|
||||||
|
|
||||||
if result.layers_removed != 0 {
|
if result.layers_removed != 0 {
|
||||||
fail_point!("after-timeline-gc-removed-layers");
|
fail_point!("after-timeline-gc-removed-layers");
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(feature = "testing")]
|
|
||||||
{
|
|
||||||
result.doomed_layers = gc_layers;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
info!(
|
info!(
|
||||||
@@ -4058,7 +3916,9 @@ impl Timeline {
|
|||||||
Ok(result)
|
Ok(result)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
///
|
||||||
/// Reconstruct a value, using the given base image and WAL records in 'data'.
|
/// Reconstruct a value, using the given base image and WAL records in 'data'.
|
||||||
|
///
|
||||||
async fn reconstruct_value(
|
async fn reconstruct_value(
|
||||||
&self,
|
&self,
|
||||||
key: Key,
|
key: Key,
|
||||||
@@ -4123,7 +3983,7 @@ impl Timeline {
|
|||||||
let cache = page_cache::get();
|
let cache = page_cache::get();
|
||||||
if let Err(e) = cache
|
if let Err(e) = cache
|
||||||
.memorize_materialized_page(
|
.memorize_materialized_page(
|
||||||
self.tenant_shard_id.tenant_id,
|
self.tenant_id,
|
||||||
self.timeline_id,
|
self.timeline_id,
|
||||||
key,
|
key,
|
||||||
last_rec_lsn,
|
last_rec_lsn,
|
||||||
@@ -4167,7 +4027,7 @@ impl Timeline {
|
|||||||
let task_id = task_mgr::spawn(
|
let task_id = task_mgr::spawn(
|
||||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||||
task_mgr::TaskKind::DownloadAllRemoteLayers,
|
task_mgr::TaskKind::DownloadAllRemoteLayers,
|
||||||
Some(self.tenant_shard_id.tenant_id),
|
Some(self.tenant_id),
|
||||||
Some(self.timeline_id),
|
Some(self.timeline_id),
|
||||||
"download all remote layers task",
|
"download all remote layers task",
|
||||||
false,
|
false,
|
||||||
@@ -4189,7 +4049,7 @@ impl Timeline {
|
|||||||
};
|
};
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
.instrument(info_span!(parent: None, "download_all_remote_layers", tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))
|
.instrument(info_span!(parent: None, "download_all_remote_layers", tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))
|
||||||
);
|
);
|
||||||
|
|
||||||
let initial_info = DownloadRemoteLayersTaskInfo {
|
let initial_info = DownloadRemoteLayersTaskInfo {
|
||||||
@@ -4388,13 +4248,6 @@ impl Timeline {
|
|||||||
resident_layers,
|
resident_layers,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn get_shard_index(&self) -> ShardIndex {
|
|
||||||
ShardIndex {
|
|
||||||
shard_number: self.tenant_shard_id.shard_number,
|
|
||||||
shard_count: self.tenant_shard_id.shard_count,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type TraversalPathItem = (
|
type TraversalPathItem = (
|
||||||
|
|||||||
@@ -4,10 +4,13 @@ use std::{
|
|||||||
};
|
};
|
||||||
|
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use pageserver_api::{models::TimelineState, shard::TenantShardId};
|
use pageserver_api::models::TimelineState;
|
||||||
use tokio::sync::OwnedMutexGuard;
|
use tokio::sync::OwnedMutexGuard;
|
||||||
use tracing::{debug, error, info, instrument, warn, Instrument, Span};
|
use tracing::{debug, error, info, instrument, warn, Instrument, Span};
|
||||||
use utils::{crashsafe, fs_ext, id::TimelineId};
|
use utils::{
|
||||||
|
crashsafe, fs_ext,
|
||||||
|
id::{TenantId, TimelineId},
|
||||||
|
};
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
config::PageServerConf,
|
config::PageServerConf,
|
||||||
@@ -44,7 +47,7 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
|
|||||||
// Shut down the layer flush task before the remote client, as one depends on the other
|
// Shut down the layer flush task before the remote client, as one depends on the other
|
||||||
task_mgr::shutdown_tasks(
|
task_mgr::shutdown_tasks(
|
||||||
Some(TaskKind::LayerFlushTask),
|
Some(TaskKind::LayerFlushTask),
|
||||||
Some(timeline.tenant_shard_id.tenant_id),
|
Some(timeline.tenant_id),
|
||||||
Some(timeline.timeline_id),
|
Some(timeline.timeline_id),
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
@@ -70,12 +73,7 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
|
|||||||
// NB: This and other delete_timeline calls do not run as a task_mgr task,
|
// NB: This and other delete_timeline calls do not run as a task_mgr task,
|
||||||
// so, they are not affected by this shutdown_tasks() call.
|
// so, they are not affected by this shutdown_tasks() call.
|
||||||
info!("waiting for timeline tasks to shutdown");
|
info!("waiting for timeline tasks to shutdown");
|
||||||
task_mgr::shutdown_tasks(
|
task_mgr::shutdown_tasks(None, Some(timeline.tenant_id), Some(timeline.timeline_id)).await;
|
||||||
None,
|
|
||||||
Some(timeline.tenant_shard_id.tenant_id),
|
|
||||||
Some(timeline.timeline_id),
|
|
||||||
)
|
|
||||||
.await;
|
|
||||||
|
|
||||||
fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
|
fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
|
||||||
Err(anyhow::anyhow!(
|
Err(anyhow::anyhow!(
|
||||||
@@ -112,11 +110,40 @@ async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTi
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Grab the compaction and gc locks, and actually perform the deletion.
|
// We delete local files first, so if pageserver restarts after local files deletion then remote deletion is not continued.
|
||||||
|
// This can be solved with inversion of these steps. But even if these steps are inverted then, when index_part.json
|
||||||
|
// gets deleted there is no way to distinguish between "this timeline is good, we just didn't upload it to remote"
|
||||||
|
// and "this timeline is deleted we should continue with removal of local state". So to avoid the ambiguity we use a mark file.
|
||||||
|
// After index part is deleted, presence of this mark file identifies that it was a deletion intention.
|
||||||
|
// So we can just remove the mark file.
|
||||||
|
async fn create_delete_mark(
|
||||||
|
conf: &PageServerConf,
|
||||||
|
tenant_id: TenantId,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
) -> Result<(), DeleteTimelineError> {
|
||||||
|
fail::fail_point!("timeline-delete-before-delete-mark", |_| {
|
||||||
|
Err(anyhow::anyhow!(
|
||||||
|
"failpoint: timeline-delete-before-delete-mark"
|
||||||
|
))?
|
||||||
|
});
|
||||||
|
let marker_path = conf.timeline_delete_mark_file_path(tenant_id, timeline_id);
|
||||||
|
|
||||||
|
// Note: we're ok to replace existing file.
|
||||||
|
let _ = std::fs::OpenOptions::new()
|
||||||
|
.write(true)
|
||||||
|
.create(true)
|
||||||
|
.open(&marker_path)
|
||||||
|
.with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
|
||||||
|
|
||||||
|
crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Grab the layer_removal_cs lock, and actually perform the deletion.
|
||||||
///
|
///
|
||||||
/// The locks prevent GC or compaction from running at the same time. The background tasks do not
|
/// This lock prevents GC or compaction from running at the same time.
|
||||||
/// register themselves with the timeline they're operating on, so they might still be running even
|
/// The GC task doesn't register itself with the timeline it's operating on,
|
||||||
/// though we called `shutdown_tasks`.
|
/// so it might still be running even though we called `shutdown_tasks`.
|
||||||
///
|
///
|
||||||
/// Note that there are still other race conditions between
|
/// Note that there are still other race conditions between
|
||||||
/// GC, compaction and timeline deletion. See
|
/// GC, compaction and timeline deletion. See
|
||||||
@@ -124,24 +151,19 @@ async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTi
|
|||||||
///
|
///
|
||||||
/// No timeout here, GC & Compaction should be responsive to the
|
/// No timeout here, GC & Compaction should be responsive to the
|
||||||
/// `TimelineState::Stopping` change.
|
/// `TimelineState::Stopping` change.
|
||||||
// pub(super): documentation link
|
async fn delete_local_layer_files(
|
||||||
pub(super) async fn delete_local_layer_files(
|
|
||||||
conf: &PageServerConf,
|
conf: &PageServerConf,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_id: TenantId,
|
||||||
timeline: &Timeline,
|
timeline: &Timeline,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let guards = async { tokio::join!(timeline.gc_lock.lock(), timeline.compaction_lock.lock()) };
|
info!("waiting for layer_removal_cs.lock()");
|
||||||
let guards = crate::timed(
|
let layer_removal_guard = timeline.layer_removal_cs.lock().await;
|
||||||
guards,
|
info!("got layer_removal_cs.lock(), deleting layer files");
|
||||||
"acquire gc and compaction locks",
|
|
||||||
std::time::Duration::from_secs(5),
|
|
||||||
)
|
|
||||||
.await;
|
|
||||||
|
|
||||||
// NB: storage_sync upload tasks that reference these layers have been cancelled
|
// NB: storage_sync upload tasks that reference these layers have been cancelled
|
||||||
// by the caller.
|
// by the caller.
|
||||||
|
|
||||||
let local_timeline_directory = conf.timeline_path(&tenant_shard_id, &timeline.timeline_id);
|
let local_timeline_directory = conf.timeline_path(&tenant_id, &timeline.timeline_id);
|
||||||
|
|
||||||
fail::fail_point!("timeline-delete-before-rm", |_| {
|
fail::fail_point!("timeline-delete-before-rm", |_| {
|
||||||
Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
|
Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
|
||||||
@@ -157,8 +179,8 @@ pub(super) async fn delete_local_layer_files(
|
|||||||
// because of a previous failure/cancellation at/after
|
// because of a previous failure/cancellation at/after
|
||||||
// failpoint timeline-delete-after-rm.
|
// failpoint timeline-delete-after-rm.
|
||||||
//
|
//
|
||||||
// ErrorKind::NotFound can also happen if we race with tenant detach, because,
|
// It can also happen if we race with tenant detach, because,
|
||||||
// no locks are shared.
|
// it doesn't grab the layer_removal_cs lock.
|
||||||
//
|
//
|
||||||
// For now, log and continue.
|
// For now, log and continue.
|
||||||
// warn! level is technically not appropriate for the
|
// warn! level is technically not appropriate for the
|
||||||
@@ -177,7 +199,7 @@ pub(super) async fn delete_local_layer_files(
|
|||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
let metadata_path = conf.metadata_path(&tenant_shard_id, &timeline.timeline_id);
|
let metadata_path = conf.metadata_path(&tenant_id, &timeline.timeline_id);
|
||||||
|
|
||||||
for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) {
|
for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) {
|
||||||
#[cfg(feature = "testing")]
|
#[cfg(feature = "testing")]
|
||||||
@@ -226,8 +248,8 @@ pub(super) async fn delete_local_layer_files(
|
|||||||
.with_context(|| format!("Failed to remove: {}", entry.path().display()))?;
|
.with_context(|| format!("Failed to remove: {}", entry.path().display()))?;
|
||||||
}
|
}
|
||||||
|
|
||||||
info!("finished deleting layer files, releasing locks");
|
info!("finished deleting layer files, releasing layer_removal_cs.lock()");
|
||||||
drop(guards);
|
drop(layer_removal_guard);
|
||||||
|
|
||||||
fail::fail_point!("timeline-delete-after-rm", |_| {
|
fail::fail_point!("timeline-delete-after-rm", |_| {
|
||||||
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
|
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
|
||||||
@@ -252,11 +274,11 @@ async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<(
|
|||||||
// (nothing can fail after its deletion)
|
// (nothing can fail after its deletion)
|
||||||
async fn cleanup_remaining_timeline_fs_traces(
|
async fn cleanup_remaining_timeline_fs_traces(
|
||||||
conf: &PageServerConf,
|
conf: &PageServerConf,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_id: TenantId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
// Remove local metadata
|
// Remove local metadata
|
||||||
tokio::fs::remove_file(conf.metadata_path(&tenant_shard_id, &timeline_id))
|
tokio::fs::remove_file(conf.metadata_path(&tenant_id, &timeline_id))
|
||||||
.await
|
.await
|
||||||
.or_else(fs_ext::ignore_not_found)
|
.or_else(fs_ext::ignore_not_found)
|
||||||
.context("remove metadata")?;
|
.context("remove metadata")?;
|
||||||
@@ -268,7 +290,7 @@ async fn cleanup_remaining_timeline_fs_traces(
|
|||||||
});
|
});
|
||||||
|
|
||||||
// Remove timeline dir
|
// Remove timeline dir
|
||||||
tokio::fs::remove_dir(conf.timeline_path(&tenant_shard_id, &timeline_id))
|
tokio::fs::remove_dir(conf.timeline_path(&tenant_id, &timeline_id))
|
||||||
.await
|
.await
|
||||||
.or_else(fs_ext::ignore_not_found)
|
.or_else(fs_ext::ignore_not_found)
|
||||||
.context("timeline dir")?;
|
.context("timeline dir")?;
|
||||||
@@ -283,15 +305,13 @@ async fn cleanup_remaining_timeline_fs_traces(
|
|||||||
// to be reordered later and thus missed if a crash occurs.
|
// to be reordered later and thus missed if a crash occurs.
|
||||||
// Note that we don't need to sync after mark file is removed
|
// Note that we don't need to sync after mark file is removed
|
||||||
// because we can tolerate the case when mark file reappears on startup.
|
// because we can tolerate the case when mark file reappears on startup.
|
||||||
let timeline_path = conf.timelines_path(&tenant_shard_id);
|
let timeline_path = conf.timelines_path(&tenant_id);
|
||||||
crashsafe::fsync_async(timeline_path)
|
crashsafe::fsync_async(timeline_path)
|
||||||
.await
|
.await
|
||||||
.context("fsync_pre_mark_remove")?;
|
.context("fsync_pre_mark_remove")?;
|
||||||
|
|
||||||
// Remove delete mark
|
// Remove delete mark
|
||||||
// TODO: once we are confident that no more exist in the field, remove this
|
tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
|
||||||
// line. It cleans up a legacy marker file that might in rare cases be present.
|
|
||||||
tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_shard_id, timeline_id))
|
|
||||||
.await
|
.await
|
||||||
.or_else(fs_ext::ignore_not_found)
|
.or_else(fs_ext::ignore_not_found)
|
||||||
.context("remove delete mark")
|
.context("remove delete mark")
|
||||||
@@ -357,7 +377,7 @@ impl DeleteTimelineFlow {
|
|||||||
// NB: If this fails half-way through, and is retried, the retry will go through
|
// NB: If this fails half-way through, and is retried, the retry will go through
|
||||||
// all the same steps again. Make sure the code here is idempotent, and don't
|
// all the same steps again. Make sure the code here is idempotent, and don't
|
||||||
// error out if some of the shutdown tasks have already been completed!
|
// error out if some of the shutdown tasks have already been completed!
|
||||||
#[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_shard_id.tenant_id, shard_id=%tenant.tenant_shard_id.shard_slug()))]
|
#[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_id))]
|
||||||
pub async fn run(
|
pub async fn run(
|
||||||
tenant: &Arc<Tenant>,
|
tenant: &Arc<Tenant>,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
@@ -371,6 +391,8 @@ impl DeleteTimelineFlow {
|
|||||||
|
|
||||||
set_deleted_in_remote_index(&timeline).await?;
|
set_deleted_in_remote_index(&timeline).await?;
|
||||||
|
|
||||||
|
create_delete_mark(tenant.conf, timeline.tenant_id, timeline.timeline_id).await?;
|
||||||
|
|
||||||
fail::fail_point!("timeline-delete-before-schedule", |_| {
|
fail::fail_point!("timeline-delete-before-schedule", |_| {
|
||||||
Err(anyhow::anyhow!(
|
Err(anyhow::anyhow!(
|
||||||
"failpoint: timeline-delete-before-schedule"
|
"failpoint: timeline-delete-before-schedule"
|
||||||
@@ -442,6 +464,10 @@ impl DeleteTimelineFlow {
|
|||||||
|
|
||||||
guard.mark_in_progress()?;
|
guard.mark_in_progress()?;
|
||||||
|
|
||||||
|
// Note that delete mark can be missing on resume
|
||||||
|
// because we create delete mark after we set deleted_at in the index part.
|
||||||
|
create_delete_mark(tenant.conf, tenant.tenant_id, timeline_id).await?;
|
||||||
|
|
||||||
Self::schedule_background(guard, tenant.conf, tenant, timeline);
|
Self::schedule_background(guard, tenant.conf, tenant, timeline);
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -453,8 +479,7 @@ impl DeleteTimelineFlow {
|
|||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let r =
|
let r =
|
||||||
cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_shard_id, timeline_id)
|
cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await;
|
||||||
.await;
|
|
||||||
info!("Done");
|
info!("Done");
|
||||||
r
|
r
|
||||||
}
|
}
|
||||||
@@ -525,13 +550,13 @@ impl DeleteTimelineFlow {
|
|||||||
tenant: Arc<Tenant>,
|
tenant: Arc<Tenant>,
|
||||||
timeline: Arc<Timeline>,
|
timeline: Arc<Timeline>,
|
||||||
) {
|
) {
|
||||||
let tenant_shard_id = timeline.tenant_shard_id;
|
let tenant_id = timeline.tenant_id;
|
||||||
let timeline_id = timeline.timeline_id;
|
let timeline_id = timeline.timeline_id;
|
||||||
|
|
||||||
task_mgr::spawn(
|
task_mgr::spawn(
|
||||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||||
TaskKind::TimelineDeletionWorker,
|
TaskKind::TimelineDeletionWorker,
|
||||||
Some(tenant_shard_id.tenant_id),
|
Some(tenant_id),
|
||||||
Some(timeline_id),
|
Some(timeline_id),
|
||||||
"timeline_delete",
|
"timeline_delete",
|
||||||
false,
|
false,
|
||||||
@@ -544,7 +569,7 @@ impl DeleteTimelineFlow {
|
|||||||
}
|
}
|
||||||
.instrument({
|
.instrument({
|
||||||
let span =
|
let span =
|
||||||
tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id);
|
tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id);
|
||||||
span.follows_from(Span::current());
|
span.follows_from(Span::current());
|
||||||
span
|
span
|
||||||
}),
|
}),
|
||||||
@@ -557,14 +582,13 @@ impl DeleteTimelineFlow {
|
|||||||
tenant: &Tenant,
|
tenant: &Tenant,
|
||||||
timeline: &Timeline,
|
timeline: &Timeline,
|
||||||
) -> Result<(), DeleteTimelineError> {
|
) -> Result<(), DeleteTimelineError> {
|
||||||
delete_local_layer_files(conf, tenant.tenant_shard_id, timeline).await?;
|
delete_local_layer_files(conf, tenant.tenant_id, timeline).await?;
|
||||||
|
|
||||||
delete_remote_layers_and_index(timeline).await?;
|
delete_remote_layers_and_index(timeline).await?;
|
||||||
|
|
||||||
pausable_failpoint!("in_progress_delete");
|
pausable_failpoint!("in_progress_delete");
|
||||||
|
|
||||||
cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_shard_id, timeline.timeline_id)
|
cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_id, timeline.timeline_id).await?;
|
||||||
.await?;
|
|
||||||
|
|
||||||
remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;
|
remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;
|
||||||
|
|
||||||
|
|||||||
@@ -26,7 +26,6 @@ use tracing::{debug, error, info, info_span, instrument, warn, Instrument};
|
|||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
context::{DownloadBehavior, RequestContext},
|
context::{DownloadBehavior, RequestContext},
|
||||||
pgdatadir_mapping::CollectKeySpaceError,
|
|
||||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||||
tenant::{
|
tenant::{
|
||||||
config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
|
config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
|
||||||
@@ -60,12 +59,9 @@ impl Timeline {
|
|||||||
task_mgr::spawn(
|
task_mgr::spawn(
|
||||||
BACKGROUND_RUNTIME.handle(),
|
BACKGROUND_RUNTIME.handle(),
|
||||||
TaskKind::Eviction,
|
TaskKind::Eviction,
|
||||||
Some(self.tenant_shard_id.tenant_id),
|
Some(self.tenant_id),
|
||||||
Some(self.timeline_id),
|
Some(self.timeline_id),
|
||||||
&format!(
|
&format!("layer eviction for {}/{}", self.tenant_id, self.timeline_id),
|
||||||
"layer eviction for {}/{}",
|
|
||||||
self.tenant_shard_id, self.timeline_id
|
|
||||||
),
|
|
||||||
false,
|
false,
|
||||||
async move {
|
async move {
|
||||||
let cancel = task_mgr::shutdown_token();
|
let cancel = task_mgr::shutdown_token();
|
||||||
@@ -80,7 +76,7 @@ impl Timeline {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
|
#[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
|
||||||
async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
|
async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
|
||||||
use crate::tenant::tasks::random_init_delay;
|
use crate::tenant::tasks::random_init_delay;
|
||||||
{
|
{
|
||||||
@@ -299,6 +295,7 @@ impl Timeline {
|
|||||||
stats.evicted += 1;
|
stats.evicted += 1;
|
||||||
}
|
}
|
||||||
Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
|
Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
|
||||||
|
// compaction/gc removed the file while we were waiting on layer_removal_cs
|
||||||
stats.not_evictable += 1;
|
stats.not_evictable += 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -343,7 +340,7 @@ impl Timeline {
|
|||||||
// Make one of the tenant's timelines draw the short straw and run the calculation.
|
// Make one of the tenant's timelines draw the short straw and run the calculation.
|
||||||
// The others wait until the calculation is done so that they take into account the
|
// The others wait until the calculation is done so that they take into account the
|
||||||
// imitated accesses that the winner made.
|
// imitated accesses that the winner made.
|
||||||
let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id.tenant_id, true) {
|
let tenant = match crate::tenant::mgr::get_tenant(self.tenant_id, true) {
|
||||||
Ok(t) => t,
|
Ok(t) => t,
|
||||||
Err(_) => {
|
Err(_) => {
|
||||||
return ControlFlow::Break(());
|
return ControlFlow::Break(());
|
||||||
@@ -353,7 +350,7 @@ impl Timeline {
|
|||||||
match state.last_layer_access_imitation {
|
match state.last_layer_access_imitation {
|
||||||
Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
|
Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
|
||||||
_ => {
|
_ => {
|
||||||
self.imitate_synthetic_size_calculation_worker(&tenant, cancel, ctx)
|
self.imitate_synthetic_size_calculation_worker(&tenant, ctx, cancel)
|
||||||
.await;
|
.await;
|
||||||
state.last_layer_access_imitation = Some(tokio::time::Instant::now());
|
state.last_layer_access_imitation = Some(tokio::time::Instant::now());
|
||||||
}
|
}
|
||||||
@@ -400,16 +397,9 @@ impl Timeline {
|
|||||||
if size.is_err() {
|
if size.is_err() {
|
||||||
// ignore, see above comment
|
// ignore, see above comment
|
||||||
} else {
|
} else {
|
||||||
match e {
|
warn!(
|
||||||
CollectKeySpaceError::Cancelled => {
|
"failed to collect keyspace but succeeded in calculating logical size: {e:#}"
|
||||||
// Shutting down, ignore
|
);
|
||||||
}
|
|
||||||
err => {
|
|
||||||
warn!(
|
|
||||||
"failed to collect keyspace but succeeded in calculating logical size: {err:#}"
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -419,8 +409,8 @@ impl Timeline {
|
|||||||
async fn imitate_synthetic_size_calculation_worker(
|
async fn imitate_synthetic_size_calculation_worker(
|
||||||
&self,
|
&self,
|
||||||
tenant: &Arc<Tenant>,
|
tenant: &Arc<Tenant>,
|
||||||
cancel: &CancellationToken,
|
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
|
cancel: &CancellationToken,
|
||||||
) {
|
) {
|
||||||
if self.conf.metric_collection_endpoint.is_none() {
|
if self.conf.metric_collection_endpoint.is_none() {
|
||||||
// We don't start the consumption metrics task if this is not set in the config.
|
// We don't start the consumption metrics task if this is not set in the config.
|
||||||
@@ -459,7 +449,6 @@ impl Timeline {
|
|||||||
None,
|
None,
|
||||||
&mut throwaway_cache,
|
&mut throwaway_cache,
|
||||||
LogicalSizeCalculationCause::EvictionTaskImitation,
|
LogicalSizeCalculationCause::EvictionTaskImitation,
|
||||||
cancel,
|
|
||||||
ctx,
|
ctx,
|
||||||
)
|
)
|
||||||
.instrument(info_span!("gather_inputs"));
|
.instrument(info_span!("gather_inputs"));
|
||||||
|
|||||||
@@ -13,7 +13,6 @@ use crate::{
|
|||||||
};
|
};
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use camino::Utf8Path;
|
use camino::Utf8Path;
|
||||||
use pageserver_api::shard::ShardIndex;
|
|
||||||
use std::{collections::HashMap, str::FromStr};
|
use std::{collections::HashMap, str::FromStr};
|
||||||
use utils::lsn::Lsn;
|
use utils::lsn::Lsn;
|
||||||
|
|
||||||
@@ -108,7 +107,6 @@ pub(super) fn reconcile(
|
|||||||
index_part: Option<&IndexPart>,
|
index_part: Option<&IndexPart>,
|
||||||
disk_consistent_lsn: Lsn,
|
disk_consistent_lsn: Lsn,
|
||||||
generation: Generation,
|
generation: Generation,
|
||||||
shard: ShardIndex,
|
|
||||||
) -> Vec<(LayerFileName, Result<Decision, DismissedLayer>)> {
|
) -> Vec<(LayerFileName, Result<Decision, DismissedLayer>)> {
|
||||||
use Decision::*;
|
use Decision::*;
|
||||||
|
|
||||||
@@ -120,13 +118,10 @@ pub(super) fn reconcile(
|
|||||||
.map(|(name, file_size)| {
|
.map(|(name, file_size)| {
|
||||||
(
|
(
|
||||||
name,
|
name,
|
||||||
// The generation and shard here will be corrected to match IndexPart in the merge below, unless
|
// The generation here will be corrected to match IndexPart in the merge below, unless
|
||||||
// it is not in IndexPart, in which case using our current generation makes sense
|
// it is not in IndexPart, in which case using our current generation makes sense
|
||||||
// because it will be uploaded in this generation.
|
// because it will be uploaded in this generation.
|
||||||
(
|
(Some(LayerFileMetadata::new(file_size, generation)), None),
|
||||||
Some(LayerFileMetadata::new(file_size, generation, shard)),
|
|
||||||
None,
|
|
||||||
),
|
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
.collect::<Collected>();
|
.collect::<Collected>();
|
||||||
|
|||||||
@@ -1,9 +1,8 @@
|
|||||||
use anyhow::{bail, ensure, Context, Result};
|
use anyhow::{bail, ensure, Context, Result};
|
||||||
use pageserver_api::shard::TenantShardId;
|
|
||||||
use std::{collections::HashMap, sync::Arc};
|
use std::{collections::HashMap, sync::Arc};
|
||||||
use tracing::trace;
|
use tracing::trace;
|
||||||
use utils::{
|
use utils::{
|
||||||
id::TimelineId,
|
id::{TenantId, TimelineId},
|
||||||
lsn::{AtomicLsn, Lsn},
|
lsn::{AtomicLsn, Lsn},
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -74,7 +73,7 @@ impl LayerManager {
|
|||||||
last_record_lsn: Lsn,
|
last_record_lsn: Lsn,
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_id: TenantId,
|
||||||
) -> Result<Arc<InMemoryLayer>> {
|
) -> Result<Arc<InMemoryLayer>> {
|
||||||
ensure!(lsn.is_aligned());
|
ensure!(lsn.is_aligned());
|
||||||
|
|
||||||
@@ -110,8 +109,7 @@ impl LayerManager {
|
|||||||
lsn
|
lsn
|
||||||
);
|
);
|
||||||
|
|
||||||
let new_layer =
|
let new_layer = InMemoryLayer::create(conf, timeline_id, tenant_id, start_lsn).await?;
|
||||||
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn).await?;
|
|
||||||
let layer = Arc::new(new_layer);
|
let layer = Arc::new(new_layer);
|
||||||
|
|
||||||
self.layer_map.open_layer = Some(layer.clone());
|
self.layer_map.open_layer = Some(layer.clone());
|
||||||
@@ -166,7 +164,7 @@ impl LayerManager {
|
|||||||
/// Flush a frozen layer and add the written delta layer to the layer map.
|
/// Flush a frozen layer and add the written delta layer to the layer map.
|
||||||
pub(crate) fn finish_flush_l0_layer(
|
pub(crate) fn finish_flush_l0_layer(
|
||||||
&mut self,
|
&mut self,
|
||||||
delta_layer: Option<&ResidentLayer>,
|
delta_layer: &ResidentLayer,
|
||||||
frozen_layer_for_check: &Arc<InMemoryLayer>,
|
frozen_layer_for_check: &Arc<InMemoryLayer>,
|
||||||
metrics: &TimelineMetrics,
|
metrics: &TimelineMetrics,
|
||||||
) {
|
) {
|
||||||
@@ -181,17 +179,20 @@ impl LayerManager {
|
|||||||
// layer to disk at the same time, that would not work.
|
// layer to disk at the same time, that would not work.
|
||||||
assert_eq!(Arc::as_ptr(&inmem), Arc::as_ptr(frozen_layer_for_check));
|
assert_eq!(Arc::as_ptr(&inmem), Arc::as_ptr(frozen_layer_for_check));
|
||||||
|
|
||||||
if let Some(l) = delta_layer {
|
let mut updates = self.layer_map.batch_update();
|
||||||
let mut updates = self.layer_map.batch_update();
|
Self::insert_historic_layer(
|
||||||
Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
|
delta_layer.as_ref().clone(),
|
||||||
metrics.record_new_file_metrics(l.layer_desc().file_size);
|
&mut updates,
|
||||||
updates.flush();
|
&mut self.layer_fmgr,
|
||||||
}
|
);
|
||||||
|
metrics.record_new_file_metrics(delta_layer.layer_desc().file_size);
|
||||||
|
updates.flush();
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Called when compaction is completed.
|
/// Called when compaction is completed.
|
||||||
pub(crate) fn finish_compact_l0(
|
pub(crate) fn finish_compact_l0(
|
||||||
&mut self,
|
&mut self,
|
||||||
|
layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||||
compact_from: &[Layer],
|
compact_from: &[Layer],
|
||||||
compact_to: &[ResidentLayer],
|
compact_to: &[ResidentLayer],
|
||||||
metrics: &TimelineMetrics,
|
metrics: &TimelineMetrics,
|
||||||
@@ -202,16 +203,25 @@ impl LayerManager {
|
|||||||
metrics.record_new_file_metrics(l.layer_desc().file_size);
|
metrics.record_new_file_metrics(l.layer_desc().file_size);
|
||||||
}
|
}
|
||||||
for l in compact_from {
|
for l in compact_from {
|
||||||
Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr);
|
Self::delete_historic_layer(layer_removal_cs, l, &mut updates, &mut self.layer_fmgr);
|
||||||
}
|
}
|
||||||
updates.flush();
|
updates.flush();
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Called when garbage collect has selected the layers to be removed.
|
/// Called when garbage collect the timeline. Returns a guard that will apply the updates to the layer map.
|
||||||
pub(crate) fn finish_gc_timeline(&mut self, gc_layers: &[Layer]) {
|
pub(crate) fn finish_gc_timeline(
|
||||||
|
&mut self,
|
||||||
|
layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||||
|
gc_layers: Vec<Layer>,
|
||||||
|
) {
|
||||||
let mut updates = self.layer_map.batch_update();
|
let mut updates = self.layer_map.batch_update();
|
||||||
for doomed_layer in gc_layers {
|
for doomed_layer in gc_layers {
|
||||||
Self::delete_historic_layer(doomed_layer, &mut updates, &mut self.layer_fmgr);
|
Self::delete_historic_layer(
|
||||||
|
layer_removal_cs,
|
||||||
|
&doomed_layer,
|
||||||
|
&mut updates,
|
||||||
|
&mut self.layer_fmgr,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
updates.flush()
|
updates.flush()
|
||||||
}
|
}
|
||||||
@@ -230,6 +240,7 @@ impl LayerManager {
|
|||||||
/// Remote storage is not affected by this operation.
|
/// Remote storage is not affected by this operation.
|
||||||
fn delete_historic_layer(
|
fn delete_historic_layer(
|
||||||
// we cannot remove layers otherwise, since gc and compaction will race
|
// we cannot remove layers otherwise, since gc and compaction will race
|
||||||
|
_layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||||
layer: &Layer,
|
layer: &Layer,
|
||||||
updates: &mut BatchedUpdates<'_>,
|
updates: &mut BatchedUpdates<'_>,
|
||||||
mapping: &mut LayerFileManager<Layer>,
|
mapping: &mut LayerFileManager<Layer>,
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ use once_cell::sync::OnceCell;
|
|||||||
use tokio::sync::Semaphore;
|
use tokio::sync::Semaphore;
|
||||||
use utils::lsn::Lsn;
|
use utils::lsn::Lsn;
|
||||||
|
|
||||||
use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering};
|
use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
/// Internal structure to hold all data needed for logical size calculation.
|
/// Internal structure to hold all data needed for logical size calculation.
|
||||||
@@ -23,10 +23,7 @@ pub(super) struct LogicalSize {
|
|||||||
///
|
///
|
||||||
/// NOTE: size at a given LSN is constant, but after a restart we will calculate
|
/// NOTE: size at a given LSN is constant, but after a restart we will calculate
|
||||||
/// the initial size at a different LSN.
|
/// the initial size at a different LSN.
|
||||||
pub initial_logical_size: OnceCell<(
|
pub initial_logical_size: OnceCell<u64>,
|
||||||
u64,
|
|
||||||
crate::metrics::initial_logical_size::FinishedCalculationGuard,
|
|
||||||
)>,
|
|
||||||
|
|
||||||
/// Semaphore to track ongoing calculation of `initial_logical_size`.
|
/// Semaphore to track ongoing calculation of `initial_logical_size`.
|
||||||
pub initial_size_computation: Arc<tokio::sync::Semaphore>,
|
pub initial_size_computation: Arc<tokio::sync::Semaphore>,
|
||||||
@@ -55,57 +52,25 @@ pub(super) struct LogicalSize {
|
|||||||
/// see `current_logical_size_gauge`. Use the `update_current_logical_size`
|
/// see `current_logical_size_gauge`. Use the `update_current_logical_size`
|
||||||
/// to modify this, it will also keep the prometheus metric in sync.
|
/// to modify this, it will also keep the prometheus metric in sync.
|
||||||
pub size_added_after_initial: AtomicI64,
|
pub size_added_after_initial: AtomicI64,
|
||||||
|
|
||||||
/// For [`crate::metrics::initial_logical_size::TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE`].
|
|
||||||
pub(super) did_return_approximate_to_walreceiver: AtomicBool,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Normalized current size, that the data in pageserver occupies.
|
/// Normalized current size, that the data in pageserver occupies.
|
||||||
#[derive(Debug, Clone, Copy)]
|
#[derive(Debug, Clone, Copy)]
|
||||||
pub(crate) enum CurrentLogicalSize {
|
pub(super) enum CurrentLogicalSize {
|
||||||
/// The size is not yet calculated to the end, this is an intermediate result,
|
/// The size is not yet calculated to the end, this is an intermediate result,
|
||||||
/// constructed from walreceiver increments and normalized: logical data could delete some objects, hence be negative,
|
/// constructed from walreceiver increments and normalized: logical data could delete some objects, hence be negative,
|
||||||
/// yet total logical size cannot be below 0.
|
/// yet total logical size cannot be below 0.
|
||||||
Approximate(Approximate),
|
Approximate(u64),
|
||||||
// Fully calculated logical size, only other future walreceiver increments are changing it, and those changes are
|
// Fully calculated logical size, only other future walreceiver increments are changing it, and those changes are
|
||||||
// available for observation without any calculations.
|
// available for observation without any calculations.
|
||||||
Exact(Exact),
|
Exact(u64),
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Copy, Clone)]
|
|
||||||
pub(crate) enum Accuracy {
|
|
||||||
Approximate,
|
|
||||||
Exact,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy)]
|
|
||||||
pub(crate) struct Approximate(u64);
|
|
||||||
#[derive(Debug, Clone, Copy)]
|
|
||||||
pub(crate) struct Exact(u64);
|
|
||||||
|
|
||||||
impl From<&Approximate> for u64 {
|
|
||||||
fn from(value: &Approximate) -> Self {
|
|
||||||
value.0
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<&Exact> for u64 {
|
|
||||||
fn from(val: &Exact) -> Self {
|
|
||||||
val.0
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl CurrentLogicalSize {
|
impl CurrentLogicalSize {
|
||||||
pub(crate) fn size_dont_care_about_accuracy(&self) -> u64 {
|
pub(super) fn size(&self) -> u64 {
|
||||||
match self {
|
*match self {
|
||||||
Self::Approximate(size) => size.into(),
|
Self::Approximate(size) => size,
|
||||||
Self::Exact(size) => size.into(),
|
Self::Exact(size) => size,
|
||||||
}
|
|
||||||
}
|
|
||||||
pub(crate) fn accuracy(&self) -> Accuracy {
|
|
||||||
match self {
|
|
||||||
Self::Approximate(_) => Accuracy::Approximate,
|
|
||||||
Self::Exact(_) => Accuracy::Exact,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -113,16 +78,11 @@ impl CurrentLogicalSize {
|
|||||||
impl LogicalSize {
|
impl LogicalSize {
|
||||||
pub(super) fn empty_initial() -> Self {
|
pub(super) fn empty_initial() -> Self {
|
||||||
Self {
|
Self {
|
||||||
initial_logical_size: OnceCell::with_value((0, {
|
initial_logical_size: OnceCell::with_value(0),
|
||||||
crate::metrics::initial_logical_size::START_CALCULATION
|
|
||||||
.first(None)
|
|
||||||
.calculation_result_saved()
|
|
||||||
})),
|
|
||||||
// initial_logical_size already computed, so, don't admit any calculations
|
// initial_logical_size already computed, so, don't admit any calculations
|
||||||
initial_size_computation: Arc::new(Semaphore::new(0)),
|
initial_size_computation: Arc::new(Semaphore::new(0)),
|
||||||
initial_part_end: None,
|
initial_part_end: None,
|
||||||
size_added_after_initial: AtomicI64::new(0),
|
size_added_after_initial: AtomicI64::new(0),
|
||||||
did_return_approximate_to_walreceiver: AtomicBool::new(false),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -132,24 +92,22 @@ impl LogicalSize {
|
|||||||
initial_size_computation: Arc::new(Semaphore::new(1)),
|
initial_size_computation: Arc::new(Semaphore::new(1)),
|
||||||
initial_part_end: Some(compute_to),
|
initial_part_end: Some(compute_to),
|
||||||
size_added_after_initial: AtomicI64::new(0),
|
size_added_after_initial: AtomicI64::new(0),
|
||||||
did_return_approximate_to_walreceiver: AtomicBool::new(false),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(super) fn current_size(&self) -> CurrentLogicalSize {
|
pub(super) fn current_size(&self) -> anyhow::Result<CurrentLogicalSize> {
|
||||||
let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire);
|
let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire);
|
||||||
// ^^^ keep this type explicit so that the casts in this function break if
|
// ^^^ keep this type explicit so that the casts in this function break if
|
||||||
// we change the type.
|
// we change the type.
|
||||||
match self.initial_logical_size.get() {
|
match self.initial_logical_size.get() {
|
||||||
Some((initial_size, _)) => {
|
Some(initial_size) => {
|
||||||
CurrentLogicalSize::Exact(Exact(initial_size.checked_add_signed(size_increment)
|
initial_size.checked_add_signed(size_increment)
|
||||||
.with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
|
.with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
|
||||||
.unwrap()))
|
.map(CurrentLogicalSize::Exact)
|
||||||
}
|
}
|
||||||
None => {
|
None => {
|
||||||
|
|
||||||
let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0);
|
let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0);
|
||||||
CurrentLogicalSize::Approximate(Approximate(non_negative_size_increment))
|
Ok(CurrentLogicalSize::Approximate(non_negative_size_increment))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -163,7 +121,7 @@ impl LogicalSize {
|
|||||||
/// available for re-use. This doesn't contain the incremental part.
|
/// available for re-use. This doesn't contain the incremental part.
|
||||||
pub(super) fn initialized_size(&self, lsn: Lsn) -> Option<u64> {
|
pub(super) fn initialized_size(&self, lsn: Lsn) -> Option<u64> {
|
||||||
match self.initial_part_end {
|
match self.initial_part_end {
|
||||||
Some(v) if v == lsn => self.initial_logical_size.get().map(|(s, _)| *s),
|
Some(v) if v == lsn => self.initial_logical_size.get().copied(),
|
||||||
_ => None,
|
_ => None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -43,52 +43,37 @@ impl<'t> UninitializedTimeline<'t> {
|
|||||||
/// The caller is responsible for activating the timeline (function `.activate()`).
|
/// The caller is responsible for activating the timeline (function `.activate()`).
|
||||||
pub(crate) fn finish_creation(mut self) -> anyhow::Result<Arc<Timeline>> {
|
pub(crate) fn finish_creation(mut self) -> anyhow::Result<Arc<Timeline>> {
|
||||||
let timeline_id = self.timeline_id;
|
let timeline_id = self.timeline_id;
|
||||||
let tenant_shard_id = self.owning_tenant.tenant_shard_id;
|
let tenant_id = self.owning_tenant.tenant_id;
|
||||||
|
|
||||||
if self.raw_timeline.is_none() {
|
let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| {
|
||||||
return Err(anyhow::anyhow!(
|
format!("No timeline for initalization found for {tenant_id}/{timeline_id}")
|
||||||
"No timeline for initialization found for {tenant_shard_id}/{timeline_id}"
|
})?;
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check that the caller initialized disk_consistent_lsn
|
// Check that the caller initialized disk_consistent_lsn
|
||||||
let new_disk_consistent_lsn = self
|
let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn();
|
||||||
.raw_timeline
|
|
||||||
.as_ref()
|
|
||||||
.expect("checked above")
|
|
||||||
.0
|
|
||||||
.get_disk_consistent_lsn();
|
|
||||||
|
|
||||||
anyhow::ensure!(
|
anyhow::ensure!(
|
||||||
new_disk_consistent_lsn.is_valid(),
|
new_disk_consistent_lsn.is_valid(),
|
||||||
"new timeline {tenant_shard_id}/{timeline_id} has invalid disk_consistent_lsn"
|
"new timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
|
||||||
);
|
);
|
||||||
|
|
||||||
let mut timelines = self.owning_tenant.timelines.lock().unwrap();
|
let mut timelines = self.owning_tenant.timelines.lock().unwrap();
|
||||||
match timelines.entry(timeline_id) {
|
match timelines.entry(timeline_id) {
|
||||||
Entry::Occupied(_) => anyhow::bail!(
|
Entry::Occupied(_) => anyhow::bail!(
|
||||||
"Found freshly initialized timeline {tenant_shard_id}/{timeline_id} in the tenant map"
|
"Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map"
|
||||||
),
|
),
|
||||||
Entry::Vacant(v) => {
|
Entry::Vacant(v) => {
|
||||||
// after taking here should be no fallible operations, because the drop guard will not
|
|
||||||
// cleanup after and would block for example the tenant deletion
|
|
||||||
let (new_timeline, uninit_mark) =
|
|
||||||
self.raw_timeline.take().expect("already checked");
|
|
||||||
|
|
||||||
// this is the mutual exclusion between different retries to create the timeline;
|
|
||||||
// this should be an assertion.
|
|
||||||
uninit_mark.remove_uninit_mark().with_context(|| {
|
uninit_mark.remove_uninit_mark().with_context(|| {
|
||||||
format!(
|
format!(
|
||||||
"Failed to remove uninit mark file for timeline {tenant_shard_id}/{timeline_id}"
|
"Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
|
||||||
)
|
)
|
||||||
})?;
|
})?;
|
||||||
v.insert(Arc::clone(&new_timeline));
|
v.insert(Arc::clone(&new_timeline));
|
||||||
|
|
||||||
new_timeline.maybe_spawn_flush_loop();
|
new_timeline.maybe_spawn_flush_loop();
|
||||||
|
|
||||||
Ok(new_timeline)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Ok(new_timeline)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Prepares timeline data by loading it from the basebackup archive.
|
/// Prepares timeline data by loading it from the basebackup archive.
|
||||||
@@ -134,7 +119,7 @@ impl<'t> UninitializedTimeline<'t> {
|
|||||||
.with_context(|| {
|
.with_context(|| {
|
||||||
format!(
|
format!(
|
||||||
"No raw timeline {}/{} found",
|
"No raw timeline {}/{} found",
|
||||||
self.owning_tenant.tenant_shard_id, self.timeline_id
|
self.owning_tenant.tenant_id, self.timeline_id
|
||||||
)
|
)
|
||||||
})?
|
})?
|
||||||
.0)
|
.0)
|
||||||
@@ -144,7 +129,7 @@ impl<'t> UninitializedTimeline<'t> {
|
|||||||
impl Drop for UninitializedTimeline<'_> {
|
impl Drop for UninitializedTimeline<'_> {
|
||||||
fn drop(&mut self) {
|
fn drop(&mut self) {
|
||||||
if let Some((_, uninit_mark)) = self.raw_timeline.take() {
|
if let Some((_, uninit_mark)) = self.raw_timeline.take() {
|
||||||
let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered();
|
let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_id, timeline_id = %self.timeline_id).entered();
|
||||||
error!("Timeline got dropped without initializing, cleaning its files");
|
error!("Timeline got dropped without initializing, cleaning its files");
|
||||||
cleanup_timeline_directory(uninit_mark);
|
cleanup_timeline_directory(uninit_mark);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -71,7 +71,7 @@ impl WalReceiver {
|
|||||||
mut broker_client: BrokerClientChannel,
|
mut broker_client: BrokerClientChannel,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
let tenant_id = timeline.tenant_shard_id.tenant_id;
|
let tenant_id = timeline.tenant_id;
|
||||||
let timeline_id = timeline.timeline_id;
|
let timeline_id = timeline.timeline_id;
|
||||||
let walreceiver_ctx =
|
let walreceiver_ctx =
|
||||||
ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);
|
ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);
|
||||||
|
|||||||
@@ -75,7 +75,7 @@ pub(super) async fn connection_manager_loop_step(
|
|||||||
}
|
}
|
||||||
|
|
||||||
let id = TenantTimelineId {
|
let id = TenantTimelineId {
|
||||||
tenant_id: connection_manager_state.timeline.tenant_shard_id.tenant_id,
|
tenant_id: connection_manager_state.timeline.tenant_id,
|
||||||
timeline_id: connection_manager_state.timeline.timeline_id,
|
timeline_id: connection_manager_state.timeline.timeline_id,
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -388,7 +388,7 @@ struct BrokerSkTimeline {
|
|||||||
impl ConnectionManagerState {
|
impl ConnectionManagerState {
|
||||||
pub(super) fn new(timeline: Arc<Timeline>, conf: WalReceiverConf) -> Self {
|
pub(super) fn new(timeline: Arc<Timeline>, conf: WalReceiverConf) -> Self {
|
||||||
let id = TenantTimelineId {
|
let id = TenantTimelineId {
|
||||||
tenant_id: timeline.tenant_shard_id.tenant_id,
|
tenant_id: timeline.tenant_id,
|
||||||
timeline_id: timeline.timeline_id,
|
timeline_id: timeline.timeline_id,
|
||||||
};
|
};
|
||||||
Self {
|
Self {
|
||||||
|
|||||||
@@ -163,7 +163,7 @@ pub(super) async fn handle_walreceiver_connection(
|
|||||||
task_mgr::spawn(
|
task_mgr::spawn(
|
||||||
WALRECEIVER_RUNTIME.handle(),
|
WALRECEIVER_RUNTIME.handle(),
|
||||||
TaskKind::WalReceiverConnectionPoller,
|
TaskKind::WalReceiverConnectionPoller,
|
||||||
Some(timeline.tenant_shard_id.tenant_id),
|
Some(timeline.tenant_id),
|
||||||
Some(timeline.timeline_id),
|
Some(timeline.timeline_id),
|
||||||
"walreceiver connection",
|
"walreceiver connection",
|
||||||
false,
|
false,
|
||||||
@@ -396,12 +396,11 @@ pub(super) async fn handle_walreceiver_connection(
|
|||||||
|
|
||||||
// Send the replication feedback message.
|
// Send the replication feedback message.
|
||||||
// Regular standby_status_update fields are put into this message.
|
// Regular standby_status_update fields are put into this message.
|
||||||
let current_timeline_size = timeline
|
let (timeline_logical_size, _) = timeline
|
||||||
.get_current_logical_size(&ctx)
|
.get_current_logical_size(&ctx)
|
||||||
// FIXME: https://github.com/neondatabase/neon/issues/5963
|
.context("Status update creation failed to get current logical size")?;
|
||||||
.size_dont_care_about_accuracy();
|
|
||||||
let status_update = PageserverFeedback {
|
let status_update = PageserverFeedback {
|
||||||
current_timeline_size,
|
current_timeline_size: timeline_logical_size,
|
||||||
last_received_lsn,
|
last_received_lsn,
|
||||||
disk_consistent_lsn,
|
disk_consistent_lsn,
|
||||||
remote_consistent_lsn,
|
remote_consistent_lsn,
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
use super::storage_layer::LayerFileName;
|
use super::storage_layer::LayerFileName;
|
||||||
use super::storage_layer::ResidentLayer;
|
use super::storage_layer::ResidentLayer;
|
||||||
|
use super::Generation;
|
||||||
use crate::tenant::metadata::TimelineMetadata;
|
use crate::tenant::metadata::TimelineMetadata;
|
||||||
use crate::tenant::remote_timeline_client::index::IndexPart;
|
use crate::tenant::remote_timeline_client::index::IndexPart;
|
||||||
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||||
@@ -14,9 +15,6 @@ use utils::lsn::AtomicLsn;
|
|||||||
use std::sync::atomic::AtomicU32;
|
use std::sync::atomic::AtomicU32;
|
||||||
use utils::lsn::Lsn;
|
use utils::lsn::Lsn;
|
||||||
|
|
||||||
#[cfg(feature = "testing")]
|
|
||||||
use utils::generation::Generation;
|
|
||||||
|
|
||||||
// clippy warns that Uninitialized is much smaller than Initialized, which wastes
|
// clippy warns that Uninitialized is much smaller than Initialized, which wastes
|
||||||
// memory for Uninitialized variants. Doesn't matter in practice, there are not
|
// memory for Uninitialized variants. Doesn't matter in practice, there are not
|
||||||
// that many upload queues in a running pageserver, and most of them are initialized
|
// that many upload queues in a running pageserver, and most of them are initialized
|
||||||
@@ -90,14 +88,6 @@ pub(crate) struct UploadQueueInitialized {
|
|||||||
/// bug causing leaks, then it's better to not leave this enabled for production builds.
|
/// bug causing leaks, then it's better to not leave this enabled for production builds.
|
||||||
#[cfg(feature = "testing")]
|
#[cfg(feature = "testing")]
|
||||||
pub(crate) dangling_files: HashMap<LayerFileName, Generation>,
|
pub(crate) dangling_files: HashMap<LayerFileName, Generation>,
|
||||||
|
|
||||||
/// Set to true when we have inserted the `UploadOp::Shutdown` into the `inprogress_tasks`.
|
|
||||||
pub(crate) shutting_down: bool,
|
|
||||||
|
|
||||||
/// Permitless semaphore on which any number of `RemoteTimelineClient::shutdown` futures can
|
|
||||||
/// wait on until one of them stops the queue. The semaphore is closed when
|
|
||||||
/// `RemoteTimelineClient::launch_queued_tasks` encounters `UploadOp::Shutdown`.
|
|
||||||
pub(crate) shutdown_ready: Arc<tokio::sync::Semaphore>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl UploadQueueInitialized {
|
impl UploadQueueInitialized {
|
||||||
@@ -156,8 +146,6 @@ impl UploadQueue {
|
|||||||
queued_operations: VecDeque::new(),
|
queued_operations: VecDeque::new(),
|
||||||
#[cfg(feature = "testing")]
|
#[cfg(feature = "testing")]
|
||||||
dangling_files: HashMap::new(),
|
dangling_files: HashMap::new(),
|
||||||
shutting_down: false,
|
|
||||||
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
|
|
||||||
};
|
};
|
||||||
|
|
||||||
*self = UploadQueue::Initialized(state);
|
*self = UploadQueue::Initialized(state);
|
||||||
@@ -205,8 +193,6 @@ impl UploadQueue {
|
|||||||
queued_operations: VecDeque::new(),
|
queued_operations: VecDeque::new(),
|
||||||
#[cfg(feature = "testing")]
|
#[cfg(feature = "testing")]
|
||||||
dangling_files: HashMap::new(),
|
dangling_files: HashMap::new(),
|
||||||
shutting_down: false,
|
|
||||||
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
|
|
||||||
};
|
};
|
||||||
|
|
||||||
*self = UploadQueue::Initialized(state);
|
*self = UploadQueue::Initialized(state);
|
||||||
@@ -218,13 +204,7 @@ impl UploadQueue {
|
|||||||
UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
|
UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
|
||||||
anyhow::bail!("queue is in state {}", self.as_str())
|
anyhow::bail!("queue is in state {}", self.as_str())
|
||||||
}
|
}
|
||||||
UploadQueue::Initialized(x) => {
|
UploadQueue::Initialized(x) => Ok(x),
|
||||||
if !x.shutting_down {
|
|
||||||
Ok(x)
|
|
||||||
} else {
|
|
||||||
anyhow::bail!("queue is shutting down")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -252,7 +232,7 @@ pub(crate) struct UploadTask {
|
|||||||
/// for timeline deletion, which skips this queue and goes directly to DeletionQueue.
|
/// for timeline deletion, which skips this queue and goes directly to DeletionQueue.
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub(crate) struct Delete {
|
pub(crate) struct Delete {
|
||||||
pub(crate) layers: Vec<(LayerFileName, LayerFileMetadata)>,
|
pub(crate) layers: Vec<(LayerFileName, Generation)>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
@@ -268,10 +248,6 @@ pub(crate) enum UploadOp {
|
|||||||
|
|
||||||
/// Barrier. When the barrier operation is reached,
|
/// Barrier. When the barrier operation is reached,
|
||||||
Barrier(tokio::sync::watch::Sender<()>),
|
Barrier(tokio::sync::watch::Sender<()>),
|
||||||
|
|
||||||
/// Shutdown; upon encountering this operation no new operations will be spawned, otherwise
|
|
||||||
/// this is the same as a Barrier.
|
|
||||||
Shutdown,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl std::fmt::Display for UploadOp {
|
impl std::fmt::Display for UploadOp {
|
||||||
@@ -293,7 +269,6 @@ impl std::fmt::Display for UploadOp {
|
|||||||
write!(f, "Delete({} layers)", delete.layers.len())
|
write!(f, "Delete({} layers)", delete.layers.len())
|
||||||
}
|
}
|
||||||
UploadOp::Barrier(_) => write!(f, "Barrier"),
|
UploadOp::Barrier(_) => write!(f, "Barrier"),
|
||||||
UploadOp::Shutdown => write!(f, "Shutdown"),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -98,257 +98,260 @@ impl<'a> WalIngest<'a> {
|
|||||||
self.checkpoint_modified = true;
|
self.checkpoint_modified = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
match decoded.xl_rmid {
|
// Heap AM records need some special handling, because they modify VM pages
|
||||||
pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => {
|
// without registering them with the standard mechanism.
|
||||||
// Heap AM records need some special handling, because they modify VM pages
|
if decoded.xl_rmid == pg_constants::RM_HEAP_ID
|
||||||
// without registering them with the standard mechanism.
|
|| decoded.xl_rmid == pg_constants::RM_HEAP2_ID
|
||||||
self.ingest_heapam_record(&mut buf, modification, decoded, ctx)
|
{
|
||||||
.await?;
|
self.ingest_heapam_record(&mut buf, modification, decoded, ctx)
|
||||||
}
|
.await?;
|
||||||
pg_constants::RM_NEON_ID => {
|
}
|
||||||
self.ingest_neonrmgr_record(&mut buf, modification, decoded, ctx)
|
if decoded.xl_rmid == pg_constants::RM_NEON_ID {
|
||||||
.await?;
|
self.ingest_neonrmgr_record(&mut buf, modification, decoded, ctx)
|
||||||
}
|
.await?;
|
||||||
// Handle other special record types
|
}
|
||||||
pg_constants::RM_SMGR_ID => {
|
// Handle other special record types
|
||||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
if decoded.xl_rmid == pg_constants::RM_SMGR_ID
|
||||||
|
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||||
if info == pg_constants::XLOG_SMGR_CREATE {
|
== pg_constants::XLOG_SMGR_CREATE
|
||||||
let create = XlSmgrCreate::decode(&mut buf);
|
{
|
||||||
self.ingest_xlog_smgr_create(modification, &create, ctx)
|
let create = XlSmgrCreate::decode(&mut buf);
|
||||||
.await?;
|
self.ingest_xlog_smgr_create(modification, &create, ctx)
|
||||||
} else if info == pg_constants::XLOG_SMGR_TRUNCATE {
|
.await?;
|
||||||
let truncate = XlSmgrTruncate::decode(&mut buf);
|
} else if decoded.xl_rmid == pg_constants::RM_SMGR_ID
|
||||||
self.ingest_xlog_smgr_truncate(modification, &truncate, ctx)
|
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||||
.await?;
|
== pg_constants::XLOG_SMGR_TRUNCATE
|
||||||
}
|
{
|
||||||
}
|
let truncate = XlSmgrTruncate::decode(&mut buf);
|
||||||
pg_constants::RM_DBASE_ID => {
|
self.ingest_xlog_smgr_truncate(modification, &truncate, ctx)
|
||||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
.await?;
|
||||||
debug!(%info, pg_version=%self.timeline.pg_version, "handle RM_DBASE_ID");
|
} else if decoded.xl_rmid == pg_constants::RM_DBASE_ID {
|
||||||
|
debug!(
|
||||||
if self.timeline.pg_version == 14 {
|
"handle RM_DBASE_ID for Postgres version {:?}",
|
||||||
if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE {
|
self.timeline.pg_version
|
||||||
let createdb = XlCreateDatabase::decode(&mut buf);
|
);
|
||||||
debug!("XLOG_DBASE_CREATE v14");
|
if self.timeline.pg_version == 14 {
|
||||||
|
if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||||
self.ingest_xlog_dbase_create(modification, &createdb, ctx)
|
== postgres_ffi::v14::bindings::XLOG_DBASE_CREATE
|
||||||
.await?;
|
|
||||||
} else if info == postgres_ffi::v14::bindings::XLOG_DBASE_DROP {
|
|
||||||
let dropdb = XlDropDatabase::decode(&mut buf);
|
|
||||||
for tablespace_id in dropdb.tablespace_ids {
|
|
||||||
trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
|
|
||||||
modification
|
|
||||||
.drop_dbdir(tablespace_id, dropdb.db_id, ctx)
|
|
||||||
.await?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if self.timeline.pg_version == 15 {
|
|
||||||
if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG {
|
|
||||||
debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
|
|
||||||
} else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY {
|
|
||||||
// The XLOG record was renamed between v14 and v15,
|
|
||||||
// but the record format is the same.
|
|
||||||
// So we can reuse XlCreateDatabase here.
|
|
||||||
debug!("XLOG_DBASE_CREATE_FILE_COPY");
|
|
||||||
let createdb = XlCreateDatabase::decode(&mut buf);
|
|
||||||
self.ingest_xlog_dbase_create(modification, &createdb, ctx)
|
|
||||||
.await?;
|
|
||||||
} else if info == postgres_ffi::v15::bindings::XLOG_DBASE_DROP {
|
|
||||||
let dropdb = XlDropDatabase::decode(&mut buf);
|
|
||||||
for tablespace_id in dropdb.tablespace_ids {
|
|
||||||
trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
|
|
||||||
modification
|
|
||||||
.drop_dbdir(tablespace_id, dropdb.db_id, ctx)
|
|
||||||
.await?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if self.timeline.pg_version == 16 {
|
|
||||||
if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG {
|
|
||||||
debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
|
|
||||||
} else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY {
|
|
||||||
// The XLOG record was renamed between v14 and v15,
|
|
||||||
// but the record format is the same.
|
|
||||||
// So we can reuse XlCreateDatabase here.
|
|
||||||
debug!("XLOG_DBASE_CREATE_FILE_COPY");
|
|
||||||
let createdb = XlCreateDatabase::decode(&mut buf);
|
|
||||||
self.ingest_xlog_dbase_create(modification, &createdb, ctx)
|
|
||||||
.await?;
|
|
||||||
} else if info == postgres_ffi::v16::bindings::XLOG_DBASE_DROP {
|
|
||||||
let dropdb = XlDropDatabase::decode(&mut buf);
|
|
||||||
for tablespace_id in dropdb.tablespace_ids {
|
|
||||||
trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
|
|
||||||
modification
|
|
||||||
.drop_dbdir(tablespace_id, dropdb.db_id, ctx)
|
|
||||||
.await?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
pg_constants::RM_TBLSPC_ID => {
|
|
||||||
trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet");
|
|
||||||
}
|
|
||||||
pg_constants::RM_CLOG_ID => {
|
|
||||||
let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK;
|
|
||||||
|
|
||||||
if info == pg_constants::CLOG_ZEROPAGE {
|
|
||||||
let pageno = buf.get_u32_le();
|
|
||||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
|
||||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
|
||||||
self.put_slru_page_image(
|
|
||||||
modification,
|
|
||||||
SlruKind::Clog,
|
|
||||||
segno,
|
|
||||||
rpageno,
|
|
||||||
ZERO_PAGE.clone(),
|
|
||||||
ctx,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
} else {
|
|
||||||
assert!(info == pg_constants::CLOG_TRUNCATE);
|
|
||||||
let xlrec = XlClogTruncate::decode(&mut buf);
|
|
||||||
self.ingest_clog_truncate_record(modification, &xlrec, ctx)
|
|
||||||
.await?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
pg_constants::RM_XACT_ID => {
|
|
||||||
let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
|
|
||||||
|
|
||||||
if info == pg_constants::XLOG_XACT_COMMIT || info == pg_constants::XLOG_XACT_ABORT {
|
|
||||||
let parsed_xact =
|
|
||||||
XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
|
|
||||||
self.ingest_xact_record(
|
|
||||||
modification,
|
|
||||||
&parsed_xact,
|
|
||||||
info == pg_constants::XLOG_XACT_COMMIT,
|
|
||||||
ctx,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
} else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED
|
|
||||||
|| info == pg_constants::XLOG_XACT_ABORT_PREPARED
|
|
||||||
{
|
{
|
||||||
let parsed_xact =
|
let createdb = XlCreateDatabase::decode(&mut buf);
|
||||||
XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
|
debug!("XLOG_DBASE_CREATE v14");
|
||||||
self.ingest_xact_record(
|
|
||||||
modification,
|
|
||||||
&parsed_xact,
|
|
||||||
info == pg_constants::XLOG_XACT_COMMIT_PREPARED,
|
|
||||||
ctx,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
// Remove twophase file. see RemoveTwoPhaseFile() in postgres code
|
|
||||||
trace!(
|
|
||||||
"Drop twophaseFile for xid {} parsed_xact.xid {} here at {}",
|
|
||||||
decoded.xl_xid,
|
|
||||||
parsed_xact.xid,
|
|
||||||
lsn,
|
|
||||||
);
|
|
||||||
modification
|
|
||||||
.drop_twophase_file(parsed_xact.xid, ctx)
|
|
||||||
.await?;
|
|
||||||
} else if info == pg_constants::XLOG_XACT_PREPARE {
|
|
||||||
modification
|
|
||||||
.put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]), ctx)
|
|
||||||
.await?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
pg_constants::RM_MULTIXACT_ID => {
|
|
||||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
|
||||||
|
|
||||||
if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE {
|
self.ingest_xlog_dbase_create(modification, &createdb, ctx)
|
||||||
let pageno = buf.get_u32_le();
|
|
||||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
|
||||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
|
||||||
self.put_slru_page_image(
|
|
||||||
modification,
|
|
||||||
SlruKind::MultiXactOffsets,
|
|
||||||
segno,
|
|
||||||
rpageno,
|
|
||||||
ZERO_PAGE.clone(),
|
|
||||||
ctx,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
} else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE {
|
|
||||||
let pageno = buf.get_u32_le();
|
|
||||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
|
||||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
|
||||||
self.put_slru_page_image(
|
|
||||||
modification,
|
|
||||||
SlruKind::MultiXactMembers,
|
|
||||||
segno,
|
|
||||||
rpageno,
|
|
||||||
ZERO_PAGE.clone(),
|
|
||||||
ctx,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
|
|
||||||
let xlrec = XlMultiXactCreate::decode(&mut buf);
|
|
||||||
self.ingest_multixact_create_record(modification, &xlrec)?;
|
|
||||||
} else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
|
|
||||||
let xlrec = XlMultiXactTruncate::decode(&mut buf);
|
|
||||||
self.ingest_multixact_truncate_record(modification, &xlrec, ctx)
|
|
||||||
.await?;
|
.await?;
|
||||||
}
|
} else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||||
}
|
== postgres_ffi::v14::bindings::XLOG_DBASE_DROP
|
||||||
pg_constants::RM_RELMAP_ID => {
|
|
||||||
let xlrec = XlRelmapUpdate::decode(&mut buf);
|
|
||||||
self.ingest_relmap_page(modification, &xlrec, decoded, ctx)
|
|
||||||
.await?;
|
|
||||||
}
|
|
||||||
pg_constants::RM_XLOG_ID => {
|
|
||||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
|
||||||
|
|
||||||
if info == pg_constants::XLOG_NEXTOID {
|
|
||||||
let next_oid = buf.get_u32_le();
|
|
||||||
if self.checkpoint.nextOid != next_oid {
|
|
||||||
self.checkpoint.nextOid = next_oid;
|
|
||||||
self.checkpoint_modified = true;
|
|
||||||
}
|
|
||||||
} else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
|
|
||||||
|| info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
|
|
||||||
{
|
{
|
||||||
let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT];
|
let dropdb = XlDropDatabase::decode(&mut buf);
|
||||||
buf.copy_to_slice(&mut checkpoint_bytes);
|
for tablespace_id in dropdb.tablespace_ids {
|
||||||
let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
|
trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
|
||||||
trace!(
|
modification
|
||||||
"xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
|
.drop_dbdir(tablespace_id, dropdb.db_id, ctx)
|
||||||
xlog_checkpoint.oldestXid,
|
.await?;
|
||||||
self.checkpoint.oldestXid
|
}
|
||||||
);
|
}
|
||||||
if (self
|
} else if self.timeline.pg_version == 15 {
|
||||||
.checkpoint
|
if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||||
.oldestXid
|
== postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG
|
||||||
.wrapping_sub(xlog_checkpoint.oldestXid) as i32)
|
{
|
||||||
< 0
|
debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
|
||||||
{
|
} else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||||
self.checkpoint.oldestXid = xlog_checkpoint.oldestXid;
|
== postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY
|
||||||
self.checkpoint_modified = true;
|
{
|
||||||
|
// The XLOG record was renamed between v14 and v15,
|
||||||
|
// but the record format is the same.
|
||||||
|
// So we can reuse XlCreateDatabase here.
|
||||||
|
debug!("XLOG_DBASE_CREATE_FILE_COPY");
|
||||||
|
let createdb = XlCreateDatabase::decode(&mut buf);
|
||||||
|
self.ingest_xlog_dbase_create(modification, &createdb, ctx)
|
||||||
|
.await?;
|
||||||
|
} else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||||
|
== postgres_ffi::v15::bindings::XLOG_DBASE_DROP
|
||||||
|
{
|
||||||
|
let dropdb = XlDropDatabase::decode(&mut buf);
|
||||||
|
for tablespace_id in dropdb.tablespace_ids {
|
||||||
|
trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
|
||||||
|
modification
|
||||||
|
.drop_dbdir(tablespace_id, dropdb.db_id, ctx)
|
||||||
|
.await?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if self.timeline.pg_version == 16 {
|
||||||
|
if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||||
|
== postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG
|
||||||
|
{
|
||||||
|
debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
|
||||||
|
} else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||||
|
== postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY
|
||||||
|
{
|
||||||
|
// The XLOG record was renamed between v14 and v15,
|
||||||
|
// but the record format is the same.
|
||||||
|
// So we can reuse XlCreateDatabase here.
|
||||||
|
debug!("XLOG_DBASE_CREATE_FILE_COPY");
|
||||||
|
let createdb = XlCreateDatabase::decode(&mut buf);
|
||||||
|
self.ingest_xlog_dbase_create(modification, &createdb, ctx)
|
||||||
|
.await?;
|
||||||
|
} else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||||
|
== postgres_ffi::v16::bindings::XLOG_DBASE_DROP
|
||||||
|
{
|
||||||
|
let dropdb = XlDropDatabase::decode(&mut buf);
|
||||||
|
for tablespace_id in dropdb.tablespace_ids {
|
||||||
|
trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
|
||||||
|
modification
|
||||||
|
.drop_dbdir(tablespace_id, dropdb.db_id, ctx)
|
||||||
|
.await?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pg_constants::RM_LOGICALMSG_ID => {
|
} else if decoded.xl_rmid == pg_constants::RM_TBLSPC_ID {
|
||||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet");
|
||||||
|
} else if decoded.xl_rmid == pg_constants::RM_CLOG_ID {
|
||||||
|
let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK;
|
||||||
|
if info == pg_constants::CLOG_ZEROPAGE {
|
||||||
|
let pageno = buf.get_u32_le();
|
||||||
|
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||||
|
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||||
|
self.put_slru_page_image(
|
||||||
|
modification,
|
||||||
|
SlruKind::Clog,
|
||||||
|
segno,
|
||||||
|
rpageno,
|
||||||
|
ZERO_PAGE.clone(),
|
||||||
|
ctx,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
} else {
|
||||||
|
assert!(info == pg_constants::CLOG_TRUNCATE);
|
||||||
|
let xlrec = XlClogTruncate::decode(&mut buf);
|
||||||
|
self.ingest_clog_truncate_record(modification, &xlrec, ctx)
|
||||||
|
.await?;
|
||||||
|
}
|
||||||
|
} else if decoded.xl_rmid == pg_constants::RM_XACT_ID {
|
||||||
|
let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||||
|
if info == pg_constants::XLOG_XACT_COMMIT || info == pg_constants::XLOG_XACT_ABORT {
|
||||||
|
let parsed_xact =
|
||||||
|
XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
|
||||||
|
self.ingest_xact_record(
|
||||||
|
modification,
|
||||||
|
&parsed_xact,
|
||||||
|
info == pg_constants::XLOG_XACT_COMMIT,
|
||||||
|
ctx,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
} else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED
|
||||||
|
|| info == pg_constants::XLOG_XACT_ABORT_PREPARED
|
||||||
|
{
|
||||||
|
let parsed_xact =
|
||||||
|
XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
|
||||||
|
self.ingest_xact_record(
|
||||||
|
modification,
|
||||||
|
&parsed_xact,
|
||||||
|
info == pg_constants::XLOG_XACT_COMMIT_PREPARED,
|
||||||
|
ctx,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
// Remove twophase file. see RemoveTwoPhaseFile() in postgres code
|
||||||
|
trace!(
|
||||||
|
"Drop twophaseFile for xid {} parsed_xact.xid {} here at {}",
|
||||||
|
decoded.xl_xid,
|
||||||
|
parsed_xact.xid,
|
||||||
|
lsn,
|
||||||
|
);
|
||||||
|
modification
|
||||||
|
.drop_twophase_file(parsed_xact.xid, ctx)
|
||||||
|
.await?;
|
||||||
|
} else if info == pg_constants::XLOG_XACT_PREPARE {
|
||||||
|
modification
|
||||||
|
.put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]), ctx)
|
||||||
|
.await?;
|
||||||
|
}
|
||||||
|
} else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID {
|
||||||
|
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||||
|
|
||||||
if info == pg_constants::XLOG_LOGICAL_MESSAGE {
|
if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE {
|
||||||
let xlrec = XlLogicalMessage::decode(&mut buf);
|
let pageno = buf.get_u32_le();
|
||||||
let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?;
|
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||||
let message = &buf[xlrec.prefix_size..xlrec.prefix_size + xlrec.message_size];
|
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||||
if prefix == "neon-test" {
|
self.put_slru_page_image(
|
||||||
// This is a convenient way to make the WAL ingestion pause at
|
modification,
|
||||||
// particular point in the WAL. For more fine-grained control,
|
SlruKind::MultiXactOffsets,
|
||||||
// we could peek into the message and only pause if it contains
|
segno,
|
||||||
// a particular string, for example, but this is enough for now.
|
rpageno,
|
||||||
crate::failpoint_support::sleep_millis_async!(
|
ZERO_PAGE.clone(),
|
||||||
"wal-ingest-logical-message-sleep"
|
ctx,
|
||||||
);
|
)
|
||||||
} else if let Some(path) = prefix.strip_prefix("neon-file:") {
|
.await?;
|
||||||
modification.put_file(path, message, ctx).await?;
|
} else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE {
|
||||||
}
|
let pageno = buf.get_u32_le();
|
||||||
|
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||||
|
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||||
|
self.put_slru_page_image(
|
||||||
|
modification,
|
||||||
|
SlruKind::MultiXactMembers,
|
||||||
|
segno,
|
||||||
|
rpageno,
|
||||||
|
ZERO_PAGE.clone(),
|
||||||
|
ctx,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
|
||||||
|
let xlrec = XlMultiXactCreate::decode(&mut buf);
|
||||||
|
self.ingest_multixact_create_record(modification, &xlrec)?;
|
||||||
|
} else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
|
||||||
|
let xlrec = XlMultiXactTruncate::decode(&mut buf);
|
||||||
|
self.ingest_multixact_truncate_record(modification, &xlrec, ctx)
|
||||||
|
.await?;
|
||||||
|
}
|
||||||
|
} else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID {
|
||||||
|
let xlrec = XlRelmapUpdate::decode(&mut buf);
|
||||||
|
self.ingest_relmap_page(modification, &xlrec, decoded, ctx)
|
||||||
|
.await?;
|
||||||
|
} else if decoded.xl_rmid == pg_constants::RM_XLOG_ID {
|
||||||
|
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||||
|
if info == pg_constants::XLOG_NEXTOID {
|
||||||
|
let next_oid = buf.get_u32_le();
|
||||||
|
if self.checkpoint.nextOid != next_oid {
|
||||||
|
self.checkpoint.nextOid = next_oid;
|
||||||
|
self.checkpoint_modified = true;
|
||||||
|
}
|
||||||
|
} else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
|
||||||
|
|| info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
|
||||||
|
{
|
||||||
|
let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT];
|
||||||
|
buf.copy_to_slice(&mut checkpoint_bytes);
|
||||||
|
let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
|
||||||
|
trace!(
|
||||||
|
"xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
|
||||||
|
xlog_checkpoint.oldestXid,
|
||||||
|
self.checkpoint.oldestXid
|
||||||
|
);
|
||||||
|
if (self
|
||||||
|
.checkpoint
|
||||||
|
.oldestXid
|
||||||
|
.wrapping_sub(xlog_checkpoint.oldestXid) as i32)
|
||||||
|
< 0
|
||||||
|
{
|
||||||
|
self.checkpoint.oldestXid = xlog_checkpoint.oldestXid;
|
||||||
|
self.checkpoint_modified = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
_x => {
|
} else if decoded.xl_rmid == pg_constants::RM_LOGICALMSG_ID {
|
||||||
// TODO: should probably log & fail here instead of blindly
|
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||||
// doing something without understanding the protocol
|
if info == pg_constants::XLOG_LOGICAL_MESSAGE {
|
||||||
|
let xlrec = XlLogicalMessage::decode(&mut buf);
|
||||||
|
let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?;
|
||||||
|
let message = &buf[xlrec.prefix_size..xlrec.prefix_size + xlrec.message_size];
|
||||||
|
if prefix == "neon-test" {
|
||||||
|
// This is a convenient way to make the WAL ingestion pause at
|
||||||
|
// particular point in the WAL. For more fine-grained control,
|
||||||
|
// we could peek into the message and only pause if it contains
|
||||||
|
// a particular string, for example, but this is enough for now.
|
||||||
|
crate::failpoint_support::sleep_millis_async!(
|
||||||
|
"wal-ingest-logical-message-sleep"
|
||||||
|
);
|
||||||
|
} else if let Some(path) = prefix.strip_prefix("neon-file:") {
|
||||||
|
modification.put_file(path, message, ctx).await?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1437,16 +1440,7 @@ impl<'a> WalIngest<'a> {
|
|||||||
// record.
|
// record.
|
||||||
// TODO: would be nice if to be more explicit about it
|
// TODO: would be nice if to be more explicit about it
|
||||||
let last_lsn = modification.lsn;
|
let last_lsn = modification.lsn;
|
||||||
|
let old_nblocks = if !self
|
||||||
// Get current size and put rel creation if rel doesn't exist
|
|
||||||
//
|
|
||||||
// NOTE: we check the cache first even though get_rel_exists and get_rel_size would
|
|
||||||
// check the cache too. This is because eagerly checking the cache results in
|
|
||||||
// less work overall and 10% better performance. It's more work on cache miss
|
|
||||||
// but cache miss is rare.
|
|
||||||
let old_nblocks = if let Some(nblocks) = self.timeline.get_cached_rel_size(&rel, last_lsn) {
|
|
||||||
nblocks
|
|
||||||
} else if !self
|
|
||||||
.timeline
|
.timeline
|
||||||
.get_rel_exists(rel, last_lsn, true, ctx)
|
.get_rel_exists(rel, last_lsn, true, ctx)
|
||||||
.await?
|
.await?
|
||||||
@@ -2085,88 +2079,4 @@ mod tests {
|
|||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Replay a wal segment file taken directly from safekeepers.
|
|
||||||
///
|
|
||||||
/// This test is useful for benchmarking since it allows us to profile only
|
|
||||||
/// the walingest code in a single-threaded executor, and iterate more quickly
|
|
||||||
/// without waiting for unrelated steps.
|
|
||||||
#[tokio::test]
|
|
||||||
async fn test_ingest_real_wal() {
|
|
||||||
use crate::tenant::harness::*;
|
|
||||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
|
||||||
use postgres_ffi::WAL_SEGMENT_SIZE;
|
|
||||||
|
|
||||||
// Define test data path and constants.
|
|
||||||
//
|
|
||||||
// Steps to reconstruct the data, if needed:
|
|
||||||
// 1. Run the pgbench python test
|
|
||||||
// 2. Take the first wal segment file from safekeeper
|
|
||||||
// 3. Compress it using `zstd --long input_file`
|
|
||||||
// 4. Copy initdb.tar.zst from local_fs_remote_storage
|
|
||||||
// 5. Grep sk logs for "restart decoder" to get startpoint
|
|
||||||
// 6. Run just the decoder from this test to get the endpoint.
|
|
||||||
// It's the last LSN the decoder will output.
|
|
||||||
let pg_version = 15; // The test data was generated by pg15
|
|
||||||
let path = "test_data/sk_wal_segment_from_pgbench";
|
|
||||||
let wal_segment_path = format!("{path}/000000010000000000000001.zst");
|
|
||||||
let startpoint = Lsn::from_hex("14AEC08").unwrap();
|
|
||||||
let endpoint = Lsn::from_hex("1FFFF98").unwrap();
|
|
||||||
|
|
||||||
// Bootstrap a real timeline. We can't use create_test_timeline because
|
|
||||||
// it doesn't create a real checkpoint, and Walingest::new tries to parse
|
|
||||||
// the garbage data.
|
|
||||||
//
|
|
||||||
// TODO use the initdb.tar.zst file stored with the test data to avoid
|
|
||||||
// problems with inconsistent initdb results after pg minor version bumps.
|
|
||||||
let (tenant, ctx) = TenantHarness::create("test_ingest_real_wal")
|
|
||||||
.unwrap()
|
|
||||||
.load()
|
|
||||||
.await;
|
|
||||||
let tline = tenant
|
|
||||||
.bootstrap_timeline(TIMELINE_ID, pg_version, None, &ctx)
|
|
||||||
.await
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
// We fully read and decompress this into memory before decoding
|
|
||||||
// to get a more accurate perf profile of the decoder.
|
|
||||||
let bytes = {
|
|
||||||
use async_compression::tokio::bufread::ZstdDecoder;
|
|
||||||
let file = tokio::fs::File::open(wal_segment_path).await.unwrap();
|
|
||||||
let reader = tokio::io::BufReader::new(file);
|
|
||||||
let decoder = ZstdDecoder::new(reader);
|
|
||||||
let mut reader = tokio::io::BufReader::new(decoder);
|
|
||||||
let mut buffer = Vec::new();
|
|
||||||
tokio::io::copy_buf(&mut reader, &mut buffer).await.unwrap();
|
|
||||||
buffer
|
|
||||||
};
|
|
||||||
|
|
||||||
// TODO start a profiler too
|
|
||||||
let started_at = std::time::Instant::now();
|
|
||||||
|
|
||||||
// Initialize walingest
|
|
||||||
let xlogoff: usize = startpoint.segment_offset(WAL_SEGMENT_SIZE);
|
|
||||||
let mut decoder = WalStreamDecoder::new(startpoint, pg_version);
|
|
||||||
let mut walingest = WalIngest::new(tline.as_ref(), startpoint, &ctx)
|
|
||||||
.await
|
|
||||||
.unwrap();
|
|
||||||
let mut modification = tline.begin_modification(endpoint);
|
|
||||||
let mut decoded = DecodedWALRecord::default();
|
|
||||||
println!("decoding {} bytes", bytes.len() - xlogoff);
|
|
||||||
|
|
||||||
// Decode and ingest wal. We process the wal in chunks because
|
|
||||||
// that's what happens when we get bytes from safekeepers.
|
|
||||||
for chunk in bytes[xlogoff..].chunks(50) {
|
|
||||||
decoder.feed_bytes(chunk);
|
|
||||||
while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() {
|
|
||||||
walingest
|
|
||||||
.ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
|
|
||||||
.await
|
|
||||||
.unwrap();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let duration = started_at.elapsed();
|
|
||||||
println!("done in {:?}", duration);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -41,14 +41,9 @@ use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};
|
|||||||
#[cfg(feature = "testing")]
|
#[cfg(feature = "testing")]
|
||||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||||
|
|
||||||
#[cfg(feature = "testing")]
|
|
||||||
use pageserver_api::shard::TenantShardId;
|
|
||||||
|
|
||||||
use crate::config::PageServerConf;
|
use crate::config::PageServerConf;
|
||||||
use crate::metrics::{
|
use crate::metrics::{
|
||||||
WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS,
|
WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
|
||||||
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM,
|
|
||||||
WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
|
|
||||||
};
|
};
|
||||||
use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
|
use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
|
||||||
use crate::repository::Key;
|
use crate::repository::Key;
|
||||||
@@ -95,7 +90,6 @@ struct ProcessOutput {
|
|||||||
pub struct PostgresRedoManager {
|
pub struct PostgresRedoManager {
|
||||||
tenant_id: TenantId,
|
tenant_id: TenantId,
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
last_redo_at: std::sync::Mutex<Option<Instant>>,
|
|
||||||
redo_process: RwLock<Option<Arc<WalRedoProcess>>>,
|
redo_process: RwLock<Option<Arc<WalRedoProcess>>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -192,26 +186,10 @@ impl PostgresRedoManager {
|
|||||||
PostgresRedoManager {
|
PostgresRedoManager {
|
||||||
tenant_id,
|
tenant_id,
|
||||||
conf,
|
conf,
|
||||||
last_redo_at: std::sync::Mutex::default(),
|
|
||||||
redo_process: RwLock::new(None),
|
redo_process: RwLock::new(None),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This type doesn't have its own background task to check for idleness: we
|
|
||||||
/// rely on our owner calling this function periodically in its own housekeeping
|
|
||||||
/// loops.
|
|
||||||
pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
|
|
||||||
if let Ok(g) = self.last_redo_at.try_lock() {
|
|
||||||
if let Some(last_redo_at) = *g {
|
|
||||||
if last_redo_at.elapsed() >= idle_timeout {
|
|
||||||
drop(g);
|
|
||||||
let mut guard = self.redo_process.write().unwrap();
|
|
||||||
*guard = None;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
///
|
///
|
||||||
/// Process one request for WAL redo using wal-redo postgres
|
/// Process one request for WAL redo using wal-redo postgres
|
||||||
///
|
///
|
||||||
@@ -226,8 +204,6 @@ impl PostgresRedoManager {
|
|||||||
wal_redo_timeout: Duration,
|
wal_redo_timeout: Duration,
|
||||||
pg_version: u32,
|
pg_version: u32,
|
||||||
) -> anyhow::Result<Bytes> {
|
) -> anyhow::Result<Bytes> {
|
||||||
*(self.last_redo_at.lock().unwrap()) = Some(Instant::now());
|
|
||||||
|
|
||||||
let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
|
let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
|
||||||
const MAX_RETRY_ATTEMPTS: u32 = 1;
|
const MAX_RETRY_ATTEMPTS: u32 = 1;
|
||||||
let mut n_attempts = 0u32;
|
let mut n_attempts = 0u32;
|
||||||
@@ -242,13 +218,10 @@ impl PostgresRedoManager {
|
|||||||
let mut proc_guard = self.redo_process.write().unwrap();
|
let mut proc_guard = self.redo_process.write().unwrap();
|
||||||
match &*proc_guard {
|
match &*proc_guard {
|
||||||
None => {
|
None => {
|
||||||
let timer =
|
|
||||||
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.start_timer();
|
|
||||||
let proc = Arc::new(
|
let proc = Arc::new(
|
||||||
WalRedoProcess::launch(self.conf, self.tenant_id, pg_version)
|
WalRedoProcess::launch(self.conf, self.tenant_id, pg_version)
|
||||||
.context("launch walredo process")?,
|
.context("launch walredo process")?,
|
||||||
);
|
);
|
||||||
timer.observe_duration();
|
|
||||||
*proc_guard = Some(Arc::clone(&proc));
|
*proc_guard = Some(Arc::clone(&proc));
|
||||||
proc
|
proc
|
||||||
}
|
}
|
||||||
@@ -374,13 +347,12 @@ impl PostgresRedoManager {
|
|||||||
self.apply_record_neon(key, &mut page, *record_lsn, record)?;
|
self.apply_record_neon(key, &mut page, *record_lsn, record)?;
|
||||||
}
|
}
|
||||||
// Success!
|
// Success!
|
||||||
let duration = start_time.elapsed();
|
let end_time = Instant::now();
|
||||||
// FIXME: using the same metric here creates a bimodal distribution by default, and because
|
let duration = end_time.duration_since(start_time);
|
||||||
// there could be multiple batch sizes this would be N+1 modal.
|
|
||||||
WAL_REDO_TIME.observe(duration.as_secs_f64());
|
WAL_REDO_TIME.observe(duration.as_secs_f64());
|
||||||
|
|
||||||
debug!(
|
debug!(
|
||||||
"neon applied {} WAL records in {} us to reconstruct page image at LSN {}",
|
"neon applied {} WAL records in {} ms to reconstruct page image at LSN {}",
|
||||||
records.len(),
|
records.len(),
|
||||||
duration.as_micros(),
|
duration.as_micros(),
|
||||||
lsn
|
lsn
|
||||||
@@ -690,10 +662,10 @@ impl WalRedoProcess {
|
|||||||
.close_fds()
|
.close_fds()
|
||||||
.spawn_no_leak_child(tenant_id)
|
.spawn_no_leak_child(tenant_id)
|
||||||
.context("spawn process")?;
|
.context("spawn process")?;
|
||||||
WAL_REDO_PROCESS_COUNTERS.started.inc();
|
|
||||||
let mut child = scopeguard::guard(child, |child| {
|
let mut child = scopeguard::guard(child, |child| {
|
||||||
error!("killing wal-redo-postgres process due to a problem during launch");
|
error!("killing wal-redo-postgres process due to a problem during launch");
|
||||||
child.kill_and_wait(WalRedoKillCause::Startup);
|
child.kill_and_wait();
|
||||||
});
|
});
|
||||||
|
|
||||||
let stdin = child.stdin.take().unwrap();
|
let stdin = child.stdin.take().unwrap();
|
||||||
@@ -998,11 +970,7 @@ impl WalRedoProcess {
|
|||||||
// these files will be collected to an allure report
|
// these files will be collected to an allure report
|
||||||
let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
|
let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
|
||||||
|
|
||||||
// TODO(sharding): update this call when WalRedoProcess gets a TenantShardId.
|
let path = self.conf.tenant_path(&self.tenant_id).join(&filename);
|
||||||
let path = self
|
|
||||||
.conf
|
|
||||||
.tenant_path(&TenantShardId::unsharded(self.tenant_id))
|
|
||||||
.join(&filename);
|
|
||||||
|
|
||||||
let res = std::fs::OpenOptions::new()
|
let res = std::fs::OpenOptions::new()
|
||||||
.write(true)
|
.write(true)
|
||||||
@@ -1028,7 +996,7 @@ impl Drop for WalRedoProcess {
|
|||||||
self.child
|
self.child
|
||||||
.take()
|
.take()
|
||||||
.expect("we only do this once")
|
.expect("we only do this once")
|
||||||
.kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
|
.kill_and_wait();
|
||||||
self.stderr_logger_cancel.cancel();
|
self.stderr_logger_cancel.cancel();
|
||||||
// no way to wait for stderr_logger_task from Drop because that is async only
|
// no way to wait for stderr_logger_task from Drop because that is async only
|
||||||
}
|
}
|
||||||
@@ -1064,19 +1032,16 @@ impl NoLeakChild {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn kill_and_wait(mut self, cause: WalRedoKillCause) {
|
fn kill_and_wait(mut self) {
|
||||||
let child = match self.child.take() {
|
let child = match self.child.take() {
|
||||||
Some(child) => child,
|
Some(child) => child,
|
||||||
None => return,
|
None => return,
|
||||||
};
|
};
|
||||||
Self::kill_and_wait_impl(child, cause);
|
Self::kill_and_wait_impl(child);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[instrument(skip_all, fields(pid=child.id(), ?cause))]
|
#[instrument(skip_all, fields(pid=child.id()))]
|
||||||
fn kill_and_wait_impl(mut child: Child, cause: WalRedoKillCause) {
|
fn kill_and_wait_impl(mut child: Child) {
|
||||||
scopeguard::defer! {
|
|
||||||
WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc();
|
|
||||||
}
|
|
||||||
let res = child.kill();
|
let res = child.kill();
|
||||||
if let Err(e) = res {
|
if let Err(e) = res {
|
||||||
// This branch is very unlikely because:
|
// This branch is very unlikely because:
|
||||||
@@ -1121,7 +1086,7 @@ impl Drop for NoLeakChild {
|
|||||||
// This thread here is going to outlive of our dropper.
|
// This thread here is going to outlive of our dropper.
|
||||||
let span = tracing::info_span!("walredo", %tenant_id);
|
let span = tracing::info_span!("walredo", %tenant_id);
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop);
|
Self::kill_and_wait_impl(child);
|
||||||
})
|
})
|
||||||
.await
|
.await
|
||||||
});
|
});
|
||||||
@@ -1193,7 +1158,7 @@ mod tests {
|
|||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn short_v14_redo() {
|
async fn short_v14_redo() {
|
||||||
let expected = std::fs::read("test_data/short_v14_redo.page").unwrap();
|
let expected = std::fs::read("fixtures/short_v14_redo.page").unwrap();
|
||||||
|
|
||||||
let h = RedoHarness::new().unwrap();
|
let h = RedoHarness::new().unwrap();
|
||||||
|
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
@@ -20,7 +20,7 @@ SHLIB_LINK_INTERNAL = $(libpq)
|
|||||||
SHLIB_LINK = -lcurl
|
SHLIB_LINK = -lcurl
|
||||||
|
|
||||||
EXTENSION = neon
|
EXTENSION = neon
|
||||||
DATA = neon--1.0.sql neon--1.0--1.1.sql
|
DATA = neon--1.0.sql
|
||||||
PGFILEDESC = "neon - cloud storage for PostgreSQL"
|
PGFILEDESC = "neon - cloud storage for PostgreSQL"
|
||||||
|
|
||||||
EXTRA_CLEAN = \
|
EXTRA_CLEAN = \
|
||||||
|
|||||||
@@ -1,20 +0,0 @@
|
|||||||
neon extension consists of several parts:
|
|
||||||
|
|
||||||
### shared preload library `neon.so`
|
|
||||||
|
|
||||||
- implements storage manager API and network communications with remote page server.
|
|
||||||
|
|
||||||
- walproposer: implements broadcast protocol between postgres and WAL safekeepers.
|
|
||||||
|
|
||||||
- control plane connector: Captures updates to roles/databases using ProcessUtility_hook and sends them to the control plane.
|
|
||||||
|
|
||||||
- remote extension server: Request compute_ctl to download extension files.
|
|
||||||
|
|
||||||
- file_cache: Local file cache is used to temporarily store relation pages in the local file system for better performance.
|
|
||||||
|
|
||||||
- relsize_cache: Relation size cache for better neon performance.
|
|
||||||
|
|
||||||
### SQL functions in `neon--*.sql`
|
|
||||||
|
|
||||||
Utility functions that expose neon-specific information to users and to metrics collection.
|
|
||||||
This extension is created in all databases in the cluster by default.
|
|
||||||
@@ -475,12 +475,6 @@ NeonXactCallback(XactEvent event, void *arg)
|
|||||||
Assert(CurrentDdlTable == &RootTable);
|
Assert(CurrentDdlTable == &RootTable);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool
|
|
||||||
RoleIsNeonSuperuser(const char *role_name)
|
|
||||||
{
|
|
||||||
return strcmp(role_name, "neon_superuser") == 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
HandleCreateDb(CreatedbStmt *stmt)
|
HandleCreateDb(CreatedbStmt *stmt)
|
||||||
{
|
{
|
||||||
@@ -507,16 +501,9 @@ HandleCreateDb(CreatedbStmt *stmt)
|
|||||||
|
|
||||||
entry->type = Op_Set;
|
entry->type = Op_Set;
|
||||||
if (downer && downer->arg)
|
if (downer && downer->arg)
|
||||||
{
|
entry->owner = get_role_oid(defGetString(downer), false);
|
||||||
const char *owner_name = defGetString(downer);
|
|
||||||
if (RoleIsNeonSuperuser(owner_name))
|
|
||||||
elog(ERROR, "can't create a database with owner neon_superuser");
|
|
||||||
entry->owner = get_role_oid(owner_name, false);
|
|
||||||
}
|
|
||||||
else
|
else
|
||||||
{
|
|
||||||
entry->owner = GetUserId();
|
entry->owner = GetUserId();
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
@@ -535,10 +522,8 @@ HandleAlterOwner(AlterOwnerStmt *stmt)
|
|||||||
|
|
||||||
if (!found)
|
if (!found)
|
||||||
memset(entry->old_name, 0, sizeof(entry->old_name));
|
memset(entry->old_name, 0, sizeof(entry->old_name));
|
||||||
const char *new_owner = get_rolespec_name(stmt->newowner);
|
|
||||||
if (RoleIsNeonSuperuser(new_owner))
|
entry->owner = get_role_oid(get_rolespec_name(stmt->newowner), false);
|
||||||
elog(ERROR, "can't alter owner to neon_superuser");
|
|
||||||
entry->owner = get_role_oid(new_owner, false);
|
|
||||||
entry->type = Op_Set;
|
entry->type = Op_Set;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -632,9 +617,6 @@ HandleAlterRole(AlterRoleStmt *stmt)
|
|||||||
InitRoleTableIfNeeded();
|
InitRoleTableIfNeeded();
|
||||||
DefElem *dpass = NULL;
|
DefElem *dpass = NULL;
|
||||||
ListCell *option;
|
ListCell *option;
|
||||||
const char *role_name = stmt->role->rolename;
|
|
||||||
if (RoleIsNeonSuperuser(role_name))
|
|
||||||
elog(ERROR, "can't ALTER neon_superuser");
|
|
||||||
|
|
||||||
foreach(option, stmt->options)
|
foreach(option, stmt->options)
|
||||||
{
|
{
|
||||||
@@ -649,7 +631,7 @@ HandleAlterRole(AlterRoleStmt *stmt)
|
|||||||
bool found = false;
|
bool found = false;
|
||||||
RoleEntry *entry = hash_search(
|
RoleEntry *entry = hash_search(
|
||||||
CurrentDdlTable->role_table,
|
CurrentDdlTable->role_table,
|
||||||
role_name,
|
stmt->role->rolename,
|
||||||
HASH_ENTER,
|
HASH_ENTER,
|
||||||
&found);
|
&found);
|
||||||
|
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user